"""Create and score appearance features.

Functions:
find

    The following functions are useful for training and interacting
    with the underlying classifier.
make_vector
make_vector_with_context
score
score_with_context
score_vector
describe_vector
can_handle
can_handle_context

    The following functions implement base functionality.
is_all_caps
upper_only_at_end
has_unusual_mixed_case

ends_with_roman
has_greek
has_dash

has_a_digit
digits_only_at_beginning
digits_only_in_middle
digits_only_at_end
digits_only_on_outside
has_unusual_mixed_numbers

looks_like_family
ends_with_in

has_one_letter
has_two_letters
has_three_to_five_letters
has_six_or_more_letters

"""
import re

from Extracto import strcompn
from Extracto.genename import support

# Matches an optional single lower case prefix letter, then a run of two
# or more capital letters, then an optional plural "s".  Examples:
#   mEH   microsomal epoxide hydrolase
#   sEH   soluble EH
#   cMRP  canalicular MRP
# NOTE: the pattern is applied with match() and has no end anchor, so
# only the start of the word is checked; characters that follow the
# capitals are ignored.
_ALL_CAPS_RE = re.compile(r"[a-z]?[A-Z]{2,}s?")
def is_all_caps(word):
    """is_all_caps(word) -> boolean

    Return whether the beginning of word matches _ALL_CAPS_RE.

    """
    found = _ALL_CAPS_RE.match(word)
    return found is not None

def has_unusual_mixed_case(word):
    """has_unusual_mixed_case(word) -> boolean

    Return 1 if strcompn.has_unusual_mixed_case flags the word and the
    word is not just an upper case abbreviation followed by a lower
    case 's'.  Return 0 otherwise.

    """
    from Extracto import ctype
    if not strcompn.has_unusual_mixed_case(word):
        return 0
    # An abbreviation with a lower case 's' at the end is not unusual
    # mixed case.
    # GSTs, EHs, PSTs
    if len(word) >= 2 and ctype.isupper(word[:-1]) and word[-1] == 's':
        return 0
    # The word passed the strcompn check and is not a plural
    # abbreviation, so it really is unusual mixed case.  (This used to
    # return 0, which made the feature constant.)
    return 1

# The word has upper case letters, and all of them come after the last
# lower case letter.  At least one lower case letter must precede them.
_UPPER_ONLY_AT_END_RE = re.compile(r"^[^A-Z]*[a-z][^A-Z]*([^a-z]*[A-Z])+$")
def upper_only_at_end(word):
    """upper_only_at_end(word) -> boolean"""
    found = _UPPER_ONLY_AT_END_RE.match(word)
    return not (found is None)

def ends_with_roman(word):
    """ends_with_roman(word) -> boolean

    Return 1 if the word, compared case-insensitively, ends with one of
    strcompn.ROMAN_NUMERALS, is not itself that numeral, and has at
    least 2 characters left in front of the numeral once the characters
    in strcompn.NOT_LETTER_SET are removed.  Return 0 otherwise.

    """
    # Words shorter than 3 characters are rejected outright.
    if len(word) < 3:
        return 0
    lowered = word.lower()
    for numeral in strcompn.ROMAN_NUMERALS:
        suffix = numeral.lower()
        # Skip numerals that are the whole word or not a suffix of it.
        if lowered == suffix or not lowered.endswith(suffix):
            continue
        stem = lowered[:-len(suffix)]
        letters = strcompn.setremove(stem, strcompn.NOT_LETTER_SET)
        if len(letters) >= 2:
            return 1
    return 0

def has_a_digit(word):
    """has_a_digit(word) -> boolean

    Return whether TT.setfind locates a member of TT.number_set in
    word (i.e. it reports a non-negative index).

    """
    import mx.TextTools as TT
    position = TT.setfind(word, TT.number_set)
    return position >= 0

# Digits at beginning, at least one alphabetical character after them
# and no digits after that.
_DIGITS_ONLY_AT_BEGINNING = re.compile(r"^([^A-Za-z]*\d)+\D*[A-Za-z]\D*$")
def digits_only_at_beginning(word):
    """digits_only_at_beginning(word) -> boolean"""
    found = _DIGITS_ONLY_AT_BEGINNING.match(word)
    return found is not None

# Digits only in the middle: at least one alphabetical character before
# the digits and at least one after them.
_DIGITS_ONLY_IN_MIDDLE = re.compile(
    r"^\D*[A-Za-z]\D*([^A-Za-z]*\d)+\D*[A-Za-z]\D*$")
def digits_only_in_middle(word):
    """digits_only_in_middle(word) -> boolean"""
    found = _DIGITS_ONLY_IN_MIDDLE.match(word)
    return not (found is None)

# Two or more letters, one digit, then optionally one letter which may
# itself be followed by one more digit.  e.g. CYP2, CYP2D, CYP2D6
_LOOKS_LIKE_FAMILY_RE = re.compile(r"[A-Za-z]{2,}\d([A-Za-z]\d?)?$")
def looks_like_family(word):
    """looks_like_family(word) -> boolean"""
    found = _LOOKS_LIKE_FAMILY_RE.match(word)
    return found is not None

# One of strcompn.GREEK_LETTERS (captured as group 1), with a word
# boundary or a character other than a lower case letter on each side.
GREEK_RE = re.compile(
    r"(?:\b|[^a-z])(%s)(?:\b|[^a-z])" % '|'.join(strcompn.GREEK_LETTERS))
def has_greek(word):
    """has_greek(word) -> boolean

    Return 1 if GREEK_RE finds a greek letter inside the word and the
    rest of the word still holds at least 2 characters once those in
    strcompn.NOT_LETTER_SET are removed.  Return 0 otherwise.
    e.g. PPARgamma2, beta2AR

    """
    # Words shorter than 3 characters are rejected outright.
    if len(word) < 3:
        return 0
    hit = GREEK_RE.search(word)
    if hit is None:
        return 0
    start, end = hit.span(1)
    # The greek letter must not be the entire word.
    if end - start == len(word):
        return 0
    # Whatever surrounds the greek letter needs at least 2 letters.
    remainder = strcompn.setremove(
        word[:start] + word[end:], strcompn.NOT_LETTER_SET)
    if len(remainder) >= 2:
        return 1
    return 0

def has_dash(word):
    """has_dash(word) -> boolean

    Return whether a "-" remains in the word after it has been
    stripped with strcompn.SPACE_AND_PUNC_SET.

    """
    import mx.TextTools as TT
    from Extracto import strcompn
    # Strip first so that (presumably) a dash sitting on the outer edge
    # of the word is not counted -- confirm against SPACE_AND_PUNC_SET.
    trimmed = TT.setstrip(word, strcompn.SPACE_AND_PUNC_SET)
    dash_index = TT.find(trimmed, "-")
    return dash_index >= 0
    
# Digits at end, preceded by at least one alphabetical character and by
# no other digits.
_DIGITS_ONLY_AT_END = re.compile(r"^\D*[A-Za-z]\D*(\d[^A-Za-z]*)+$")
def digits_only_at_end(word):
    """digits_only_at_end(word) -> boolean"""
    found = _DIGITS_ONLY_AT_END.match(word)
    return not (found is None)

# Digits at both the beginning and the end, with at least one
# alphabetical character (and no digits) in between.
_DIGITS_ONLY_ON_OUTSIDE_RE = re.compile(
    r"^([^A-Za-z]*\d)+\D*[A-Za-z]\D*(\d[^A-Za-z]*)+$")
def digits_only_on_outside(word):
    """digits_only_on_outside(word) -> boolean"""
    found = _DIGITS_ONLY_ON_OUTSIDE_RE.match(word)
    return found is not None

def has_unusual_mixed_numbers(word):
    """has_unusual_mixed_numbers(word) -> boolean

    Thin wrapper that returns the result of
    strcompn.has_unusual_mixed_numbers unchanged.

    """
    verdict = strcompn.has_unusual_mixed_numbers(word)
    return verdict

def ends_with_in(word):
    """ends_with_in(word) -> boolean

    Return whether the word is at least 4 characters long and ends
    with a lower case "in".

    """
    if len(word) < 4:
        return False
    return word.endswith("in")

def has_six_or_more_letters(word):
    """has_six_or_more_letters(word) -> boolean

    Count the characters left after removing those in
    strcompn.NOT_LETTER_SET and return whether there are 6 or more.

    """
    letters_only = strcompn.setremove(word, strcompn.NOT_LETTER_SET)
    return len(letters_only) >= 6

def has_one_letter(word):
    """has_one_letter(word) -> boolean

    Count the characters left after removing those in
    strcompn.NOT_LETTER_SET and return whether there is exactly 1.

    """
    letters_only = strcompn.setremove(word, strcompn.NOT_LETTER_SET)
    return len(letters_only) == 1

def has_two_letters(word):
    """has_two_letters(word) -> boolean

    Count the characters left after removing those in
    strcompn.NOT_LETTER_SET and return whether there are exactly 2.

    """
    letters_only = strcompn.setremove(word, strcompn.NOT_LETTER_SET)
    return len(letters_only) == 2

def has_three_to_five_letters(word):
    """has_three_to_five_letters(word) -> boolean

    Count the characters left after removing those in
    strcompn.NOT_LETTER_SET and return whether there are 3, 4 or 5.

    """
    letters_only = strcompn.setremove(word, strcompn.NOT_LETTER_SET)
    return 3 <= len(letters_only) <= 5

from Extracto.genename import Feature
class AppearanceFeature(Feature.Feature):
    """Feature built from the appearance predicates in this module.

    Each function in self._FEATURES is applied to the retokenized word;
    the results, in list order, make up the feature vector.

    """
    def __init__(self):
        Feature.Feature.__init__(self, "appearance.classifier")
        # The order of this list fixes the position of every value in
        # the vector built by _make_vector_with_context and of every
        # name returned by _describe_vector.  Do not reorder it.
        self._FEATURES = [
            # Looks at the cases of the words.
            is_all_caps,
            upper_only_at_end,
            has_unusual_mixed_case,

            # Looks at the numbers.
            looks_like_family,
            has_greek,
            has_dash,

            # Looks at the semantics of the word.
            #ends_with_in,
            ends_with_roman,

            # Other.
            has_one_letter,
            has_two_letters,
            has_three_to_five_letters,
            has_six_or_more_letters,

            # Obsolete.  (NOTE: the two uncommented entries below are
            # still active features.)
            #has_a_digit,
            digits_only_at_beginning,
            #digits_only_in_middle,
            digits_only_at_end,
            #digits_only_on_outside,
            #has_unusual_mixed_numbers,
            ]

        from Extracto import memoize
        from Extracto import Cache
        # Replace make_vector_with_context on this instance with a
        # memoized version backed by a SizedDictCache(10000).
        # support.context2word is passed as args2key, so presumably the
        # cache is keyed on the word -- confirm against memoize.
        self.make_vector_with_context = memoize.memoize(
            self.make_vector_with_context, cache=Cache.SizedDictCache(10000),
            args2key=support.context2word)

    def _make_vector_with_context(self, document, sentence_range, word_range):
        """Return the list of feature values for the word in context.

        The word is looked up with support.context2word and passed
        through _retokenize before the feature functions see it.

        """
        word = support.context2word(document, sentence_range, word_range)
        rword = _retokenize(word)
        vector = [f(rword) for f in self._FEATURES]
        return vector

    def _describe_vector(self):
        """Return the feature function names, in vector order."""
        # __name__ holds the same value as the Python-2-only func_name
        # attribute and also exists on Python 3.
        return [x.__name__ for x in self._FEATURES]

    def _can_handle_context(self, document, sentence_range, word_range):
        """Return 0 if the word ends with "in" (any case), else 1."""
        word = support.context2word(document, sentence_range, word_range)
        if word.lower().endswith("in"):
            return 0
        return 1

def _retokenize(word):
    """Return the word in a normalized form.

    Greek letter names (strcompn.GREEK_LETTERS) found at the beginning
    or end of the word are stripped off, repeatedly, as long as
    strcompn.is_boundary reports a boundary between the greek letter
    and the rest of the word.  When a leading greek letter is removed,
    everything up to and including it is dropped; when a trailing one
    is removed, everything from it onwards is dropped.  If no greek
    letter occurs anywhere in the word, the word is returned unchanged.

    """
    import mx.TextTools as TT
    from Extracto import strcompn

    greek_letters = []
    # First, do a quick scan to see which greek letters are present in
    # the word.  If any are found, do a more thorough analysis.
    # The scan is case-insensitive (it works on the lower-cased word).
    lword = word.lower()
    for letter in strcompn.GREEK_LETTERS:
        if TT.find(lword, letter) >= 0:
            greek_letters.append(letter)
    if not greek_letters:
        return word

    # Remove greek letters at the beginning or end of the word.
    non_alpha_set = TT.invset(TT.alpha)
    while 1:
        # Strip punctuation and numbers from the end of the word.  Do
        # this inside the while loop because word is changed from
        # iteration to iteration.
        word_alphaonly = TT.setstrip(word, non_alpha_set)
        lword_alphaonly = word_alphaonly.lower()
        for letter in greek_letters:
            if len(word_alphaonly) <= len(letter):
                # Ignore words that are shorter than, or equal to the
                # greek letter.
                continue

            l = len(word_alphaonly)
            if lword_alphaonly.startswith(letter) and \
               strcompn.is_boundary(
                word_alphaonly, len(letter)-1, len(letter)):
                # The word begins with a greek letter, and there's a
                # word boundary between the greek letter and the next
                # character.
                # Cut at the first occurrence of the letter in the
                # unstripped word, then rescan from the top of the
                # while loop.
                i = word.lower().index(letter)
                word = word[i+len(letter):]
                break
            elif lword_alphaonly.endswith(letter) and \
                 strcompn.is_boundary(
                word_alphaonly, l-len(letter)-1, l-len(letter)):
                # The word ends with a greek letter, with a word
                # boundary.
                # Cut at the last occurrence of the letter in the
                # unstripped word, then rescan from the top of the
                # while loop.
                i = word.lower().rindex(letter)
                word = word[:i]
                break
        else:
            # I have iterated through all the greek letters and could
            # not find any at the beginning or end of the word.
            break
    return word

# Create the feature instance and hand it to
# support.bind_methods_to_module together with this module's name.
# Presumably this publishes the instance's methods as module-level
# functions: find() relies on can_handle_context and score_with_context
# being available as module globals, and neither is defined in this file.
support.bind_methods_to_module(__name__, AppearanceFeature())

def find(document):
    """Return list of (start, end, score).

    Walks the contexts produced by support.doc2context (with
    fancy_tokenizer=1), skips those that can_handle_context rejects,
    and reports word_range[0], word_range[1] and math.exp() of the
    value returned by score_with_context for the rest.

    """
    import math
    results = []
    contexts = support.doc2context(document, fancy_tokenizer=1)
    for _, sent_range, w_range in contexts:
        if can_handle_context(document, sent_range, w_range):
            value = score_with_context(document, sent_range, w_range)
            start, end = w_range[0], w_range[1]
            results.append((start, end, math.exp(value)))
    return results