"""

Functions:
find      Find the occurrences of genes.

"""
import re


# Some patterns:
# cyp2D6
# cyp2D6 ... 2D6
# cytochrome p450 1A2
# cytochrome p4501A2
# cytochrome P-450
# cytochrome P-4502E1
# cytochrome P-45IIEI
# p450 IID1
# P-450IIEI
# CYPs
# http://drnelson.utmem.edu/P450.family.list.html
# A family/subfamily number, written either in Arabic digits or as a Roman
# numeral from I to X.  Regex alternation is leftmost-first and everything
# that follows the number in FAMILY_STR is optional, so every numeral must
# be listed BEFORE any numeral that is a prefix of it; otherwise "IIE1"
# would stop after the first "I" and "VIII" after the "V".
CYP_DIGIT = r"(\d+|VIII|VII|VI|IV|IX|III|II|I|V|X)"
# A subfamily letter.  The letters I and O are not accepted.
CYP_SUBFAMILY = r"[ABCDEFGHJKLMNPQRSTUVWXYZ]"

# A family is 2D6, 1C1, etc...
# The match is exposed as the named group 'family'.  The (?!is) lookahead
# rejects a family that would start with "is" -- presumably so that the
# word "is" in "P450 is ..." is not taken for a family name.
FAMILY_STR = r"(?!is)(?P<family>%s(%s%s?)?)" % (CYP_DIGIT, CYP_SUBFAMILY, CYP_DIGIT)
# "p450", "p-450", "p45" or "p-45", optionally followed by a family.
P450_STR = r"(p-?450?(\s*%s)?)" % (FAMILY_STR)

# "cyp" immediately followed by a family, e.g. cyp2D6.
CYP1_RE = re.compile(r"cyp%s" % FAMILY_STR, re.IGNORECASE)
# "cytochrome" followed by a P450 expression, e.g. cytochrome p450 1A2.
CYP2_RE = re.compile(r"cytochrome\s+%s" % P450_STR, re.IGNORECASE)
# A bare P450 expression, e.g. P-450IIEI.
CYP3_RE = re.compile(P450_STR, re.IGNORECASE)
# The literal plural "CYPs" (case-sensitive, no 'family' group).
CYP4_RE = re.compile(r"CYPs")

def _find_cyt_and_families(statement):
    """Locate cytochrome P450 mentions within one statement.

    statement is converted with str() before searching, so every offset
    returned is an index into str(statement).

    Returns a tuple (gene ranges, family ranges):
    gene ranges     The (start, end) spans of all matches of CYP1_RE,
                    CYP2_RE, CYP3_RE and CYP4_RE, after being passed
                    through rangefns.munge (presumably merging
                    overlapping spans -- confirm in Extracto.rangefns).
    family ranges   A list of (start, end) spans of the non-empty
                    'family' group of those matches, in match order.

    """
    from Extracto import rangefns
    from Extracto import refns

    statement = str(statement)
    # refns.re_findall is used here as yielding a list of match objects.
    reobjs = refns.re_findall(CYP1_RE, statement) + \
             refns.re_findall(CYP2_RE, statement) + \
             refns.re_findall(CYP3_RE, statement) + \
             refns.re_findall(CYP4_RE, statement)
    ranges, families = [], []
    for m in reobjs:
        ranges.append((m.start(), m.end()))
        # Not every pattern defines a 'family' group (CYP4_RE does not).
        # Use the "in" operator rather than dict.has_key(), which no
        # longer exists in Python 3.
        if 'family' in m.groupdict():
            f_start, f_end = m.start('family'), m.end('family')
            # A group that did not take part in the match reports
            # (-1, -1); only keep spans that actually cover text.
            if f_end > f_start:
                families.append((f_start, f_end))
    return rangefns.munge(ranges), families

def find(document):
    """Return list of (start, end).

    Each statement produced by support.doc2statements(document) is
    scanned for cytochrome P450 mentions; the spans found are shifted by
    the statement's offset.  Every family name seen in any statement is
    then searched for, as a whole word and ignoring case, throughout
    str(document), and those spans are added too.  The combined list is
    passed through rangefns.munge before being returned.

    """
    from Bio import listfns
    from Extracto import rangefns
    from Extracto import refns
    from Extracto.genename import support

    str_document = str(document)

    # Pass 1: collect the gene spans (in document coordinates) and the
    # upper-cased names of the families mentioned, statement by statement.
    gene_ranges = []
    family_names = []
    for statement, offset in support.doc2statements(document):
        cyt_ranges, family_ranges = _find_cyt_and_families(statement)
        for start, end in cyt_ranges:
            gene_ranges.append((start+offset, end+offset))
        for start, end in family_ranges:
            family_names.append(statement[start:end])
    family_names = listfns.items([name.upper() for name in family_names])

    # Pass 2: once a family has been seen in a document, all other
    # instances of that family are also taken to be a cytochrome P450.
    for family in family_names:
        family_re = re.compile(r"\b%s\b" % family, re.IGNORECASE)
        gene_ranges.extend(
            [(m.start(), m.end())
             for m in refns.re_findall(family_re, str_document)])

    return rangefns.munge(gene_ranges)
