"""This module scores possible gene names that end with -in.

Functions:
find

    The following functions implement base functionality.
make_vector_with_context
score
score_with_context
can_handle_context

"""
from Extracto import config
from Extracto import memoize
from Extracto import Cache
from Extracto.genename import support

def _calc_score(word):
    """Classify word and return the first of the classifier's two scores.

    The word is classified against the "mesh_proteins.clusters" training
    data located through Extracto.datafile.  PnpClassifier.classify hands
    back a 3-item result whose last item is a pair of scores; only the
    first score of that pair is returned.
    """
    # Imported inside the function rather than at module level --
    # presumably to keep importing this module cheap; confirm before moving.
    from Extracto import PnpClassifier
    from Extracto import datafile
    clusters_path = datafile.find("mesh_proteins.clusters")
    # Only the first score is needed; the echoed word, the prediction and
    # the second score are discarded.  (NOTE(review): the pair looks like
    # (P(class 1), P(class 0)) -- verify against PnpClassifier.)
    _word, _prediction, (first_score, _second_score) = PnpClassifier.classify(
        word, clusters_path)
    return first_score

# Two memoization layers are stacked on the raw scorer.  The first wrapper
# uses a Cache.DBCache built from config.db.jchangdb and the name
# "cache_endswithin_score"; the second, outermost wrapper uses a
# Cache.SizedDictCache(200000).  Presumably the dict cache answers repeated
# lookups in-process before the database-backed one is consulted -- confirm
# against the memoize and Cache modules.
_calc_score = memoize.memoize(
    _calc_score,
    cache=Cache.DBCache(config.db.jchangdb, "cache_endswithin_score"))
_calc_score = memoize.memoize(
    _calc_score,
    cache=Cache.SizedDictCache(200000))

def make_vector_with_context(document, sentence_range, word_range):
    """Return a one-element feature vector: [score of the word in context].

    The word is extracted from the document with support.context2word and
    scored with score().
    """
    return [score(support.context2word(document, sentence_range, word_range))]

def score(word):
    """Return a probability that the word looks like a gene/protein word.

    Words that can_handle_context rejects score 0 without the classifier
    being consulted.
    """
    # can_handle_context takes (document, sentence_range, word_range), so the
    # bare word is passed as the document with both ranges covering all of it.
    # NOTE(review): this relies on support.context2word accepting a plain
    # word as a document -- confirm.
    whole_word = (0, len(word))
    if can_handle_context(word, whole_word, whole_word):
        # Lower-cased before scoring, so the memoized scorer is keyed on the
        # lower-case form of the word.
        return _calc_score(word.lower())
    return 0

def score_with_context(document, sentence_range, word_range):
    """Return score() for the word at the given position in the document."""
    candidate = support.context2word(document, sentence_range, word_range)
    return score(candidate)

def can_handle_context(document, sentence_range, word_range):
    """Return 1 if the word in this context ends with "in", otherwise 0.

    The comparison is made on the lower-cased word, so it ignores case.
    """
    candidate = support.context2word(document, sentence_range, word_range)
    if candidate.lower().endswith("in"):
        return 1
    return 0

def find(document):
    """Return list of (start, end, score).

    Walks every context produced by support.doc2context(document) and keeps
    those accepted by can_handle_context.  start and end are the two
    elements of the context's word range; score is score() of that word.
    """
    hits = []
    # The first element of each context is not needed here.
    for _unused, sentence_range, word_range in support.doc2context(document):
        if can_handle_context(document, sentence_range, word_range):
            candidate = support.context2word(
                document, sentence_range, word_range)
            word_score = score(candidate)
            hits.append((word_range[0], word_range[1], word_score))
    return hits
