"""Create and score morphology features.

Functions:
find

    The following functions are useful for training and interacting
    with the underlying classifier.
make_vector
make_vector_with_context
score
score_with_context
score_vector
describe_vector
can_handle
can_handle_context

    The following functions implement base functionality.
stem_greekX
stem_Xgreek
stem_Xroman
stem_proX
stem_apoholoX
stem_Xupper
stem_Xnum
stem_Xfamily
stem_lXl
    
score_greekX
score_Xgreek
score_Xroman
score_proX
score_apoholoX
score_Xupper
score_Xnum
score_Xfamily
score_lXl

"""
import re

from Extracto import memoize
from Extracto.genename import support

# Naming Conventions: These function names are named for the patterns
# detected.  X stands for the input word.  Thus, "proX" looks for
# the input word prefixed with "pro".


def stem_greekX(word):
    """stem_greekX(word) -> stem (word without greek prefix) or None"""
    # Strip every greek-letter name from the front of the word:
    # betaE11 -> E11
    # beta2AR -> AR
    # alphabetaTCR -> TCR
    # beta -> <None>
    # beta2 -> <None>
    # betaI -> <None>
    import mx.TextTools as TT
    from Extracto import strcompn

    # Regex matching a string made entirely of greek-letter names, each
    # optionally followed by digits.
    alternatives = "|".join(strcompn.GREEK_LETTERS)
    greek_re = re.compile(r"^((%s)\d*)*$" % alternatives, re.IGNORECASE)

    # Collect the boundary positions in the word, then scan them from
    # the rightmost one down, taking the longest all-greek prefix.
    positions = [p for p in range(1, len(word))
                 if strcompn.is_boundary(word, p-1, p)]
    cut = None
    for p in reversed(positions):
        if greek_re.match(word[:p]):
            cut = p
            break
    if cut is None:
        return None    # No greek prefix found.

    # The stem is everything after the boundary; drop any leftover
    # punctuation/digits between the greek prefix and the stem.
    stem = word[cut:]
    stem = TT.setstrip(stem, strcompn.NOT_LETTER_SET, 0, len(stem), -1)
    if len(stem) < 2:
        return None
    return stem

def stem_Xgreek(word):
    """stem_Xgreek(word) -> stem (word without greek suffix) or None"""
    # Remove all the greek letters that appear at the end of the word.
    # (The old `import mx.TextTools as TT` was unused here and has been
    # removed; this function only needs strcompn and re.)
    from Extracto import strcompn

    # Make a regular expression that matches one or more greek-letter
    # names run together.
    s = "|".join(strcompn.GREEK_LETTERS)
    greek_letters_re = re.compile(r"^(%s)+$" % s, re.IGNORECASE)

    # Find the boundaries in the word.  Then, check to see if the
    # suffixes of the boundaries are greek letters.  Scanning
    # left-to-right means the longest greek suffix wins.
    boundaries = [i for i in range(1, len(word))
                  if strcompn.is_boundary(word, i-1, i)]
    for i in boundaries:
        s = word[i:]
        # Skip suffixes strcompn flags as unusual mixed case.
        if strcompn.has_unusual_mixed_case(s):
            continue
        if greek_letters_re.match(s):
            break
    else:
        return None    # Nothing found.
    # The stem is everything before the boundary; require >= 2 chars.
    stem = word[:i]
    if len(stem) < 2:
        return None
    return stem

def stem_Xroman(word):
    """stem_Xroman(word) -> stem (word without roman-numeral suffix) or None"""
    import mx.TextTools as TT
    from Extracto import strcompn

    lowered = word.lower()
    # Roman numerals (in the word's original casing) that match the end
    # of the word, longest first.  Equal-length matches are identical
    # slices of the word, so their relative order is irrelevant.
    candidates = sorted(
        [word[-len(n):] for n in strcompn.ROMAN_NUMERALS
         if lowered.endswith(n.lower())],
        key=len, reverse=True)

    for numeral in candidates:
        stem = word[:-len(numeral)]

        # Require at least 2 letters in the stem.
        if len(TT.setstrip(stem, strcompn.NOT_LETTER_SET)) < 2:
            continue

        # Accept if the numeral begins at a word boundary...
        if strcompn.is_boundary(word, len(stem)-1, len(stem)):
            return stem

        # ...or is a multi-letter, all-upper-case numeral.
        if len(numeral) > 1 and numeral.isupper():
            return stem

    return None

# Matches "pro" followed by a stem of two or more letters.
PROX_RE = re.compile(r"^pro([a-z]{2,})$", re.IGNORECASE)
def stem_proX(word):
    """Return word with a leading "pro" removed, or None if no match."""
    match = PROX_RE.match(word)
    return match and match.group(1)

# Matches "apo" or "holo" followed by a stem of two or more letters.
APOHOLOX_RE = re.compile(r"^(?:apo|holo)([a-z]{2,})$", re.IGNORECASE)
def stem_apoholoX(word):
    """Return word with a leading "apo"/"holo" removed, or None."""
    match = APOHOLOX_RE.match(word)
    if not match:
        return None
    return match.group(1)

# A stem of 2+ lower-case letters, optionally preceded by one capital,
# followed by a single trailing upper-case letter (e.g. RasA -> Ras).
# NOTE: the previous pattern had a second alternative ([a-z]{3,}) that
# was unreachable -- anything it matched was already matched by the
# first branch with its optional capital empty -- and only group(1) was
# ever returned.  The pattern below is equivalent with the dead branch
# removed.
XUPPER_RE = re.compile(r"^([A-Z]?[a-z]{2,})[A-Z]$")
def stem_Xupper(word):
    """Return word without its single trailing upper-case letter, or None."""
    m = XUPPER_RE.match(word)
    if m is None:
        return None
    return m.group(1)
    
# Two or more letters followed by 1-3 digits.
XNUM_RE = re.compile(r"^([a-z]{2,})\d{1,3}$", re.IGNORECASE)
def stem_Xnum(word):
    """Return word without its 1-3 trailing digits, or None if no match."""
    match = XNUM_RE.match(word)
    return match.group(1) if match is not None else None

# Check whether the gene name exists without the family/subfamily part:
# SULT1A2 -> SULT
# CYP2D -> CYP
# Pattern: 2+ letters, 1-2 digits, a letter, then 0-2 digits.  Like
# XNUM, but requires the extra letter after the digits.
XFAMILY_RE = re.compile(r"^([A-Z]{2,})\d{1,2}[A-Z]\d{0,2}$", re.IGNORECASE)
def stem_Xfamily(word):
    """Return the family stem of word (e.g. "SULT1A2" -> "SULT"), or None."""
    match = XFAMILY_RE.match(word)
    if not match:
        return None
    return match.group(1)

def stem_lXl(word):
    """stem_lXl(word) -> stem or None.

    Detect a single lower-case letter attached to the start of the word
    next to an upper-case letter (cJun -> Jun, cFOS -> FOS), or a
    single lower-case letter attached to the end of the word at a
    boundary (Gal80p -> Gal80).
    """
    # (Removed the redundant local `import re` -- the module already
    # imports re at the top level.)
    import mx.TextTools as TT
    from Extracto import strcompn

    if len(word) < 4:
        return None
    prefix, suffix = word[:2], word[-2:]
    # Letters remaining in each candidate stem: prefix_letters is the
    # letters of the word minus its first char, suffix_letters minus
    # its last char.
    prefix_letters = re.sub(r"[^%s]" % TT.alpha, "", word[1:])
    suffix_letters = re.sub(r"[^%s]" % TT.alpha, "", word[:-1])

    # Assume that the prefix and suffix are valid.  Then, rule them
    # out if they are invalid.
    valid_prefix = valid_suffix = 1

    # e.g. cJun, cFOS, cMYC
    if not prefix.isalpha():    # make sure all letters
        valid_prefix = 0
    elif not (prefix[0].islower() and prefix[1].isupper()):
        valid_prefix = 0
    elif len(prefix_letters) < 3:   # stem must keep >= 3 letters
        valid_prefix = 0

    # e.g. Gal80p
    if not suffix[1].isalpha():
        valid_suffix = 0
    elif not strcompn.is_boundary(suffix, 0, 1):
        valid_suffix = 0
    elif len(suffix_letters) < 3:   # stem must keep >= 3 letters
        valid_suffix = 0
    elif suffix[1] == 's':     # Don't want 's' at end, for plural words.
        valid_suffix = 0

    # If both apply, stripping the prefix letter takes precedence.
    if valid_prefix:
        return word[1:]
    if valid_suffix:
        return word[:-1]
    return None
    
def _load_scorefile(filename):
    """Read a whitespace-delimited "word score" file into a dict.

    Returns {word: float(score)}.  Results are memoized per filename.
    """
    from Extracto import datafile
    table = {}
    for line in datafile.open(filename).readlines():
        # Each line is "<word> <score>"; split() drops the newline too.
        token, value = line.split()
        table[token] = float(value)
    return table
_load_scorefile = memoize.memoize(_load_scorefile)

def _read_score(filename, word):
    """Return the score for word from the given score file, 0 if absent."""
    return _load_scorefile(filename).get(word, 0)

def score_greekX(word):
    """Return the trained score of word for the greekX pattern."""
    scorefile = "morphology.greekX.scores"
    return _read_score(scorefile, word)

def score_Xgreek(word):
    """Return the trained score of word for the Xgreek pattern."""
    scorefile = "morphology.Xgreek.scores"
    return _read_score(scorefile, word)

def score_Xroman(word):
    """Return the trained score of word for the Xroman pattern."""
    scorefile = "morphology.Xroman.scores"
    return _read_score(scorefile, word)

def score_proX(word):
    """Return the trained score of word for the proX pattern."""
    scorefile = "morphology.proX.scores"
    return _read_score(scorefile, word)

def score_apoholoX(word):
    """Return the trained score of word for the apoholoX pattern."""
    scorefile = "morphology.apoholoX.scores"
    return _read_score(scorefile, word)

def score_Xupper(word):
    """Return the trained score of word for the Xupper pattern."""
    scorefile = "morphology.Xupper.scores"
    return _read_score(scorefile, word)

def score_Xnum(word):
    """Return the trained score of word for the Xnum pattern."""
    scorefile = "morphology.Xnum.scores"
    return _read_score(scorefile, word)

def score_Xfamily(word):
    """Return the trained score of word for the Xfamily pattern."""
    scorefile = "morphology.Xfamily.scores"
    return _read_score(scorefile, word)

def score_lXl(word):
    """Return the trained score of word for the lXl pattern."""
    scorefile = "morphology.lXl.scores"
    return _read_score(scorefile, word)
    
def _retokenize(word):
    """Return the word in a normalized form."""
    # Normalize the results of the tokenizer.

    import string
    import mx.TextTools as TT

    # Remove the '-' in the words.
    # EXAMPLES: IL-1 -> IL1, caspase-9 -> caspase9
    word = TT.replace(word, "-", "")

    # Remove punctuation at the end of the word.
    # EXAMPLES: CD3+ -> CD3
    punct_set = TT.set(string.punctuation)
    word = TT.setstrip(word, punct_set, 0, len(word), 1)

    # Remove the spaces in the tokens.
    # EXAMPLES: Kpn I -> KpnI
    word = TT.replace(word, " ", "")

    return word

from Extracto.genename import Feature
class MorphologyFeature(Feature.Feature):
    """Feature that scores a word by its morphological patterns.

    The feature vector is one entry per score_* function, applied to
    the retokenized word, then log10-transformed and shifted so every
    entry is >= 0.
    """
    def __init__(self):
        Feature.Feature.__init__(self, "morphology.classifier")
        # Scoring functions applied to the word, in vector order.
        self._FEATURES = [
            score_greekX,
            score_Xgreek,
            score_Xroman,

            #score_proX,   # Obsolete.
            score_apoholoX,
            score_Xupper,
            score_Xnum,
            score_Xfamily,
            score_lXl,
            ]

    def _make_vector_with_context(self, document, sentence_range, word_range):
        """Return the transformed score vector for the word at word_range."""
        import math

        word = support.context2word(document, sentence_range, word_range)
        rword = _retokenize(word)
        # Clamp each raw score at 1e-3 before log10, then shift by +3
        # so the transformed values are all >= 0.
        return [math.log10(max(1E-3, f(rword))) + 3
                for f in self._FEATURES]

    def _describe_vector(self):
        """Return the names of the feature functions, in vector order."""
        # __name__ is equivalent to the Python-2-only func_name alias
        # and also works under Python 3.
        return [x.__name__ for x in self._FEATURES]

    def _can_handle_context(self, document, sentence_range, word_range):
        """Return 1 if the word is purely alphanumeric and has a letter."""
        import mx.TextTools as TT
        non_alnum_set = TT.invset(TT.alphanumeric)
        word = support.context2word(document, sentence_range, word_range)
        # Make sure the word contains no punctuation or spaces.
        if TT.setfind(word, non_alnum_set) >= 0:
            return 0
        # Make sure the word contains a letter.
        if TT.setfind(word, TT.alpha_set) < 0:
            return 0
        return 1

# Expose the feature instance's bound methods as module-level functions
# (presumably the make_vector/score/can_handle family listed in the
# module docstring -- exact names depend on bind_methods_to_module).
# `find` below relies on can_handle_context and score_with_context
# being bound here.
support.bind_methods_to_module(__name__, MorphologyFeature())

def find(document):
    """Return list of (start, end, score)."""
    import math

    results = []
    contexts = support.doc2context(document, fancy_tokenizer=1)
    for _, stmt_range, tok_range in contexts:
        if can_handle_context(document, stmt_range, tok_range):
            # Scores are log-space; exponentiate before returning.
            logscore = score_with_context(document, stmt_range, tok_range)
            results.append((tok_range[0], tok_range[1], math.exp(logscore)))
    return results
