"""

Functions:
find      Find the occurrences of gene names ending in -ase.

"""
import re
from Extracto import memoize

def _get_nongenes_that_end_with_ase():
    """Load the table of words ending in "ase" that are not genes.

    Reads the "ase.nongene" data file through Extracto.datafile.  Each
    line has its trailing whitespace stripped and is passed through
    comments.remove; lines that end up empty are skipped.  Every
    remaining word is lowercased and stored along with its plural form
    (the word followed by "s").

    Returns a dictionary keyed by word.  Every value is 1; only the
    keys are meaningful.
    """
    from Extracto import datafile
    from Extracto import comments

    handle = datafile.open("ase.nongene")
    try:
        lines = handle.readlines()
    finally:
        # Release the file right away instead of waiting for the
        # handle to be garbage collected.
        handle.close()

    nongenes = {}
    for line in lines:
        line = comments.remove(line.rstrip())
        if not line:
            continue
        word = line.lower()
        nongenes[word] = 1
        nongenes["%ss" % word] = 1
    return nongenes
# Rebind the name to the wrapper built by memoize.memoize -- presumably
# so the data file is read only once; confirm against Extracto.memoize.
_get_nongenes_that_end_with_ase = memoize.memoize(
    _get_nongenes_that_end_with_ase)

# Allowing -ases at the end includes families of proteins.
# _ASE_RE = re.compile(r"[a-zA-Z]+ases?\b", re.IGNORECASE)
# The active pattern accepts only the singular form: a run of letters
# ending in "ase" at a word boundary, matched case-insensitively.
_ASE_RE = re.compile(r"[a-zA-Z]+ase\b", re.IGNORECASE)
def _find_ase_genes(sentence):
    """Return a list of (start, end) ranges of words ending in "ase".

    sentence is converted with str() before it is searched, so the
    ranges index into that string form.  A match is dropped when its
    lowercased text is a key of the table returned by
    _get_nongenes_that_end_with_ase().
    """
    from Extracto import refns

    nongenes = _get_nongenes_that_end_with_ase()
    sentence = str(sentence)
    ranges = []
    # refns.re_findall supplies objects exposing start() and end().
    for m in refns.re_findall(_ASE_RE, sentence):
        m_start, m_end = m.start(), m.end()
        word = sentence[m_start:m_end]
        # Membership test instead of dict.has_key(), which no longer
        # exists in Python 3; "in" behaves the same on Python 2.
        if word.lower() not in nongenes:
            ranges.append((m_start, m_end))
    return ranges

def find(document):
    """Return a list of (start, end) ranges of "-ase" gene names.

    The document is broken up by support.doc2statements, which yields
    (statement, offset) pairs.  Each statement is scanned with
    _find_ase_genes, and the statement's offset is added to both ends
    of every range found (presumably making the ranges relative to the
    whole document -- confirm against support.doc2statements).
    """
    from Extracto.genename import support

    found = []
    for statement, offset in support.doc2statements(document):
        for start, end in _find_ase_genes(statement):
            found.append((start + offset, end + offset))
    return found
