"""Create and score context features.

Functions:
find

    The following functions are useful for training and interacting
    with the underlying classifier.
make_vector
make_vector_with_context
score
score_with_context
score_vector
describe_vector
can_handle
can_handle_context

    The following functions implement base functionality.
get_common_neighbors
get_indicative_neighbors  List of bigrams that differ significantly between gene and non-gene words.

get_neighbor_frequencies

"""
from Extracto import config
from Extracto import memoize
from Extracto import Cache
from Extracto.genename import support

class _gene_vs_noun_iterator:
    def __init__(self, filename, score_cutoff=None):
        from Extracto import datafile
        self.handle = open(datafile.find(filename))
        self.score_cutoff = score_cutoff
    def __getitem__(self, index):
        while 1:
            x = self.handle.readline()
            if not x:
                raise IndexError
            cols = x.rstrip().split()
            where, word, gene_c, gene_p, nongene_c, nongene_p, chisq, p = cols
            gene_c, gene_p = int(gene_c), float(gene_p)
            nongene_c, nongene_p = int(nongene_c), float(nongene_p)
            chisq, p = float(chisq), float(p)

            if self.score_cutoff is None or p <= self.score_cutoff:
                break
        if where == 'first':
            w1, w2 = None, word
        else:
            w1, w2 = word, None
        return w1, w2, gene_c, gene_p, nongene_c, nongene_p, chisq, p

def get_common_neighbors():
    """Return the list of (word1, word2) bigram templates common to both
    gene and non-gene words.  One element of each tuple is None and is
    meant to be replaced by the word being scored."""
    return [(row[0], row[1])
            for row in _gene_vs_noun_iterator("gene_vs_noun.significant.common")]
# Cache the parsed list; the data file never changes during a run.
get_common_neighbors = memoize.memoize(get_common_neighbors)


def get_indicative_neighbors(score_cutoff=1E-6):
    """get_indicative_neighbors(score_cutoff) -> list of tuples (word1, word2)

    Return a list of tuples of neighbors of words that are significantly
    different for gene and non-gene words.  One of word1 and word2
    will be None and should be replaced by the gene or non-gene word.

    """
    rows = _gene_vs_noun_iterator("gene_vs_noun.significant", score_cutoff)
    return [(row[0], row[1]) for row in rows]
# Cache per score_cutoff; the underlying data file is static.
get_indicative_neighbors = memoize.memoize(get_indicative_neighbors)

def _get_neighbor_frequencies_hashable(neighbors, word):
    """Return a list of corpus counts, one per bigram template in neighbors.

    neighbors must be hashable (a tuple) so the memoize wrappers below
    can key on it.  The None slot of each (word1, word2) template is
    filled in with word before looking up the bigram frequency.
    """
    from Extracto import Medline
    counts = []
    for first, second in neighbors:
        # Substitute the word into whichever slot is None.
        if first is None:
            pair = (word, second)
        else:
            pair = (first, word)
        counts.append(Medline.bigram_frequency(pair[0], pair[1]))
    return counts

def get_neighbor_frequencies(neighbors, word):
    """Return bigram counts for word against each neighbor template.

    Thin wrapper that converts neighbors to a tuple so the memoized
    implementation can use it as a hashable cache key.
    """
    key = tuple(neighbors)
    return _get_neighbor_frequencies_hashable(key, word)

# get_neighbor_frequencies requires a lot of database accesses.  Thus,
# cache the results to a database.  Also make an in-memory cache to
# speed up short-term access.
# Wrapping order matters: the in-memory SizedDictCache is applied last
# (outermost), so a memory hit never touches the DB-backed cache.
_get_neighbor_frequencies_hashable = memoize.memoize(
    _get_neighbor_frequencies_hashable,
    cache=Cache.DBCache(config.db.jchangdb, "cache_context_nf"))
_get_neighbor_frequencies_hashable = memoize.memoize(
    _get_neighbor_frequencies_hashable, cache=Cache.SizedDictCache(10000))



from Extracto.genename import Feature
# Default significance (p-value) cutoff used by ContextFeature when no
# score_cutoff is supplied; matches get_indicative_neighbors' default.
DEFAULT_SCORE_CUTOFF = 1e-6
class ContextFeature(Feature.Feature):
    """Feature that scores a word by the normalized frequencies of its
    significant bigram neighbors in the corpus."""

    def __init__(self, score_cutoff=None):
        Feature.Feature.__init__(self, "context.classifier")
        # NOTE(review): _score_cutoff is stored but never consulted;
        # _get_neighbors always uses the common-neighbor list.  Confirm
        # whether a cutoff-based neighbor list was intended.
        self._score_cutoff = score_cutoff or DEFAULT_SCORE_CUTOFF

    def _get_neighbors(self):
        # Bigram templates shared by gene and non-gene words.
        return get_common_neighbors()

    def _make_vector_with_context(self, document, sentence_range, word_range):
        from Numeric import sum
        import operator  # kept from original; not used below
        target = support.context2word(document, sentence_range, word_range)
        counts = get_neighbor_frequencies(self._get_neighbors(), target)

        # Normalize counts to fractions of the total so vectors for
        # common and rare words are comparable.
        total = float(sum(counts))
        if not total:
            return counts
        return [c/total for c in counts]

    def _describe_vector(self):
        # One human-readable label per vector slot; "UNK" marks the slot
        # the scored word fills in the bigram.
        return ["UNK-%s" % w2 if w1 is None else "%s-UNK" % w1
                for w1, w2 in self._get_neighbors()]

    def _can_handle_context(self, document, sentence_range, word_range):
        import mx.TextTools as TT
        from Numeric import sum
        candidate = support.context2word(document, sentence_range, word_range)
        # Reject words containing punctuation or whitespace.
        if TT.setfind(candidate, TT.invset(TT.alphanumeric)) >= 0:
            return 0
        # Reject words with no alphabetic character.
        if TT.setfind(candidate, TT.alpha_set) < 0:
            return 0
        # The word must have at least one observed neighbor bigram.
        # Calling make_vector here would result in infinite recursion.
        span = 0, len(candidate)
        vector = self._make_vector_with_context(candidate, span, span)
        if sum(vector) == 0:
            return 0
        return 1

# Expose the ContextFeature instance's methods (make_vector, score,
# can_handle_context, etc.) as module-level functions of this module.
support.bind_methods_to_module(__name__, ContextFeature())


def find(document):
    """Return list of (start, end, score).

    start/end are word-range offsets; score is math.exp of the
    classifier's score for that context.
    """
    import math
    results = []
    for item, statement_range, word_range in support.doc2context(document):
        if can_handle_context(document, statement_range, word_range):
            log_score = score_with_context(
                document, statement_range, word_range)
            start, end = word_range
            results.append((start, end, math.exp(log_score)))
    return results
