"""This module finds gene symbols using abbreviation matching.

Functions:
find      Find the occurrences of genes.

"""
# Passed by find() as the second argument to docfns.extract_abbrevs().
# NOTE(review): presumably the minimum score an abbreviation/long-form
# pair must reach to be reported -- confirm against docfns.
ABBREV_SCORE_CUTOFF = 0.03

def _as_expression(s):
    """Return regular expression source that matches s as a whole term.

    The term (an abbreviation or a long form) is split into
    whitespace-separated tokens.  Each token is escaped so it matches
    literally, and the tokens are joined with ``\\s+`` so that any amount
    or kind of whitespace between them is accepted.

    s        The text of the term.

    Returns the pattern as a string; the caller compiles it and chooses
    the flags.

    """
    import re

    # split() with no argument breaks on every run of whitespace (spaces,
    # tabs, newlines) and drops leading/trailing runs, so the tokens never
    # contain whitespace and no separate collapsing pass is needed before
    # escaping.
    tokens = [re.escape(token) for token in s.split()]
    body = r"\s+".join(tokens)

    # Anchor with lookarounds rather than \b.  For a term that begins and
    # ends with a word character the two are equivalent.  For a term such
    # as "Ca2+", \b after the "+" would demand that a word character
    # follow, so the term would be missed in "Ca2+ ions" and wrongly
    # found in "Ca2+x".
    return r"(?<!\w)%s(?!\w)" % body

def find(document, gene_ranges):
    """Find further gene occurrences by following abbreviation links.

    Every abbreviation/long-form pair reported for the document links the
    two spellings: when the text of a known gene range equals one
    spelling, every occurrence of the other spelling is reported with the
    same score.  Newly found ranges are examined in turn, so links are
    followed transitively.

    document      The document to search.  str(document) is the text the
                  range offsets refer to; the object itself is handed to
                  docfns.extract_abbrevs().
    gene_ranges   Sequence of (start, end, score) ranges already known to
                  be genes.  It is not modified.

    Returns a list of (start, end, score) tuples, in the order they were
    discovered, none of which equals an entry of gene_ranges or an
    earlier result.

    """
    import re
    from collections import deque
    import mx.TextTools as TT
    from Extracto import refns
    from Extracto import docfns

    str_document = str(document)

    def normalize(text):
        # One normalization for both dictionary keys and lookups.
        return TT.collapse(text).lower()

    def occurrences(term):
        # Ranges of every case-insensitive occurrence of term.
        pattern = re.compile(_as_expression(term), re.IGNORECASE)
        return refns.findall_ranges(pattern, str_document)

    # Get a list of all the abbreviations for this document.  Each entry
    # is unpacked as (long form, abbreviation, third item); the third
    # item is not used here.
    abbreviations = docfns.extract_abbrevs(document, ABBREV_SCORE_CUTOFF)

    # Map each spelling to the places where its counterpart occurs.
    # Keys are normalized exactly like the looked-up text below;
    # otherwise a spelling containing a line break or a doubled space
    # could never be matched.
    synonyms = {}   # normalized word -> list of ranges of its synonym
    for long_form, abbrev, _ in abbreviations:
        # Don't use if the abbreviation is too short.  Too many false
        # positives.
        if len(abbrev) <= 1:
            continue
        synonyms.setdefault(normalize(abbrev), []).extend(
            occurrences(long_form))
        synonyms.setdefault(normalize(long_form), []).extend(
            occurrences(abbrev))

    # Work through the known ranges, queueing every new range so that it
    # is searched for synonyms as well.  The set gives a constant-time
    # duplicate test in place of scanning the growing list.
    seen = set(map(tuple, gene_ranges))
    queue = deque(gene_ranges)
    found = []
    while queue:
        start, end, score = queue.popleft()
        nword = normalize(str_document[start:end])

        # If word has a synonym, then each of the synonyms also has
        # this score.
        for s, e in synonyms.get(nword, ()):
            candidate = (s, e, score)
            if candidate in seen:
                continue
            seen.add(candidate)
            found.append(candidate)
            queue.append(candidate)

    # Only the newly discovered ranges are returned, not the originals.
    return found
