"""This module provides functions to search for entries from a lexicon
in a block of text.

Functions:
find      Find lexicon entries in a document.

"""
from Extracto import memoize
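
# A minimal usage sketch (hypothetical document text and phrases, made up
# for illustration):
#
#     hits = find("Interleukin-2 activates T cells.", ["interleukin-2"],
#                 ignore_case=1, all_boundaries_equal=1)
#     # hits would contain tuples like ("interleukin-2", 0, 13, None);
#     # the score slot is None for direct (non-abbreviation) matches.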

def _normalize(string, ignore_case, all_boundaries_equal):
    """Normalize a string for matching: optionally lowercase it and
    optionally collapse each run of non-word characters to a single
    space."""
    if ignore_case:
        string = string.lower()
    if all_boundaries_equal:
        import re
        string = re.sub(r"\W+", " ", string)
    return string
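
# Illustrative behavior of _normalize (a sketch, not a doctest):
#     _normalize("T-Cell  Receptor", 1, 1)  ->  "t cell receptor"
#     _normalize("T-Cell  Receptor", 0, 0)  ->  "T-Cell  Receptor"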
        
def _make_phrase_trie(phrases, ignore_case, all_boundaries_equal):
    """Return a trie where the keys are the normalized phrases and the
    values are the original phrases."""
    from Bio import trie
    t = trie.trie()
    for phrase in phrases:
        nphrase = _normalize(phrase, ignore_case, all_boundaries_equal)
        t[nphrase] = phrase
    return t
_make_phrase_trie = memoize.memoize(
    _make_phrase_trie, args2key=lambda p, i, a: (tuple(p), i, a))
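
# The memoized wrapper keys its cache on (tuple(phrases), ignore_case,
# all_boundaries_equal), so repeated searches against the same lexicon
# reuse one trie rather than rebuilding it for every document.  For
# example, _make_phrase_trie(["IL-2", "T cell"], 1, 1) maps the
# normalized keys "il 2" and "t cell" back to "IL-2" and "T cell".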

def _get_synonyms(document, phrases, ignore_case, all_boundaries_equal):
    """Return a dictionary where the keys are the synonyms (from
    abbreviations) and the values are a tuple of (phrase, score).

    """
    from Extracto import docfns

    lexicon = _make_phrase_trie(phrases, ignore_case, all_boundaries_equal)

    # Look at all the abbreviations and prefixes found in the
    # document.  If either an abbrev or prefix is in the lexicon and
    # the other is not, add it to my dictionary.
    analogs = {}  # abbreviation or long form -> phrase, score
    
    abbrevs = docfns.extract_abbrevs(document, ignore_nonmstring=1)
    for prefix, abbrev, score in abbrevs:
        prefix = _normalize(prefix, ignore_case, all_boundaries_equal)
        abbrev = _normalize(abbrev, ignore_case, all_boundaries_equal)
        
        # If neither or both of the prefix and abbreviation are in
        # the lexicon, then ignore this pair.
        if lexicon.has_key(prefix) == lexicon.has_key(abbrev):
            continue
        # I'm assuming here that there won't be two different
        # abbreviations for the same phrase; if there are, a later
        # pair overwrites an earlier one.
        if lexicon.has_key(abbrev):  # abbrev in lexicon, prefix is not
            analogs[prefix] = (lexicon[abbrev], score)
        else:
            analogs[abbrev] = (lexicon[prefix], score)
    return analogs
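
# For example, if the document defines "interleukin-2 (IL-2)" and only
# "interleukin-2" is in the lexicon, the dictionary returned above would
# map the normalized abbreviation to the lexicon phrase, roughly:
#     {"il 2": ("interleukin-2", <score from extract_abbrevs>)}
# (hypothetical values; the score scale is whatever extract_abbrevs uses).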

def _align_nstr2str(norm_string, string):
    # Normalization only lowercases and collapses runs of non-word
    # characters, so len(norm_string) <= len(string), and every
    # alphanumeric character of norm_string appears, in order, in string.
    from Extracto import ctype
    from Extracto import tokenfns

    isalnum = ctype.isalnum

    # I don't care about the case in the alignment.
    norm_string, string = norm_string.lower(), string.lower()
    nindex2index = [None] * len(norm_string)
    index = 0
    for nindex in range(len(norm_string)):
        if not isalnum(norm_string[nindex]):
            nindex2index[nindex] = index
        else:
            # Check the bound before indexing; otherwise this would
            # raise IndexError instead of the intended AssertionError.
            while index < len(string) and norm_string[nindex] != string[index]:
                index += 1
            if index >= len(string):
                raise AssertionError, "ran out of characters"
            nindex2index[nindex] = index
            index += 1
    nindex2index.append(len(string))
    return nindex2index
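
# A worked sketch: with norm_string = "t cell" and string = "t--cell",
# the walk above yields nindex2index = [0, 1, 3, 4, 5, 6, 7] -- the 'c'
# at normalized index 2 maps to original index 3, and the appended final
# entry (len(string) == 7) lets exclusive end offsets map back as well.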

def find(document, phrases, ignore_case=0, all_boundaries_equal=0,
         include_abbreviations=0):
    """Find occurrences of phrases in a document.  Return a list of
    (phrase, start, end, score (for abbreviations)).

    """
    from Bio import triefind
    
    str_document = str(document)
    nstr_document = _normalize(str_document, ignore_case, all_boundaries_equal)

    # The normalization might have messed up the indexing.  Figure out
    # how to convert normalized indexes to indexes on the original
    # string.
    nindex2index = _align_nstr2str(nstr_document, str_document)

    lexicon = _make_phrase_trie(phrases, ignore_case, all_boundaries_equal)
    # Make a list of the lexicon entries found in the document.
    # entries_found is a list of (phrase, start, end, score or None).
    entries_found = []
    for nentry, nstart, nend in triefind.find_words(nstr_document, lexicon):
        entries_found.append(
            (lexicon[nentry], nindex2index[nstart], nindex2index[nend], None))

    if not include_abbreviations:
        return entries_found

    # Dictionary of normalized synonym -> (phrase, score).
    synonyms = _get_synonyms(document, phrases,
                             ignore_case, all_boundaries_equal)
    # Do the search for synonyms in the lexicon.
    synlexicon = _make_phrase_trie(
        synonyms.keys(), ignore_case, all_boundaries_equal)
    for nsynonym, nstart, nend in \
        triefind.find_words(nstr_document, synlexicon):
        phrase, score = synonyms[nsynonym]
        entries_found.append(
            (phrase, nindex2index[nstart], nindex2index[nend], score))
    return entries_found
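
# Sketch of a combined result (hypothetical inputs; the score comes from
# extract_abbrevs, so the value shown is illustrative):
#     find("Interleukin-2 (IL-2) activates T cells.", ["interleukin-2"],
#          ignore_case=1, all_boundaries_equal=1, include_abbreviations=1)
# might return
#     [("interleukin-2", 0, 13, None),      # direct lexicon match
#      ("interleukin-2", 15, 19, <score>)]  # via the "IL-2" synonym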
