"""

Functions:
list_names  Get a list of species names.
find        Find species names in a string.

"""
from Extracto import memoize

def list_names():
    """Return a sorted list of species names.

    The result contains every official name found by _load_speclist,
    followed by the synonyms that were judged to be related to their
    official name.  Each name is passed through parentheses.remove and
    stripped of surrounding whitespace, then the collection is run
    through listfns.items (which should drop duplicates) and sorted.
    The function is memoized, so the work is only done once.
    """
    from Bio import listfns
    from Extracto import parentheses

    # The species list has many kinds of synonyms:
    # UNRELATED:    Abies alba                   European silver fir
    # DIFF. GENUS:  Bordetella bronchiseptica    Alcaligenes bronchisepticus
    # DIFF. GENUS:  Brevundimonas vesicularis    Pseudomonas vesicularis
    # VARIANT:      Botrytis cinerea             Botryotinia fuckeliana
    #
    # Ideally only the unrelated ones would be thrown out, but the
    # variants are hard to detect (it might be possible with an
    # alignment).  For now, only the "different genus" kind is kept:
    # a synonym is used only if it has at least one word in common
    # with the official name, ignoring case.
    officials = []
    related_synonyms = []
    for record in _load_speclist():
        # Records are (official, common, synonym).  The common name
        # is not used.
        official = record[0]
        synonym = record[2]
        officials.append(official)

        official_words = official.lower().split()
        shared_words = [word for word in synonym.lower().split()
                        if word in official_words]
        # An empty synonym has no words, so it is never kept.
        if shared_words and synonym:
            related_synonyms.append(synonym)

    # Official names first, then the surviving synonyms.
    cleaned = []
    for raw_name in officials + related_synonyms:
        cleaned.append(parentheses.remove(raw_name).strip())

    names = listfns.items(cleaned)
    names.sort()
    return names
list_names = memoize.memoize(list_names)

def _load_speclist():
    """Return tuples of (official name, common name, synonym) from
    SWISS-PROT's speclist.txt file.

    The file is opened through Extracto's datafile module and parsed
    with the Martel format from Bio.expressions.swissprot.speclist.
    One tuple is produced for each 'record' element; a field that does
    not appear in a record is the empty string.  The file handle is
    closed before returning, even if parsing fails.
    """
    from Bio.expressions.swissprot import speclist
    import Martel
    from Extracto import datafile
    from xml.sax import handler

    class _SpeciesNameExtractor(handler.ContentHandler):
        """SAX handler that collects the name fields of each record."""
        def __init__(self):
            handler.ContentHandler.__init__(self)
            self._context = []  # stack of the currently open element names
            self.records = []  # list of (official_name, common_name, synonym)
        def startElement(self, name, attrs):
            self._context.append(name)
            if name == 'record':
                # Start of a new record: reset the accumulated fields.
                self._official_name = ''
                self._common_name = ''
                self._synonym = ''
        def endElement(self, name):
            # The pop must not live inside the assert statement:
            # asserts are removed when Python runs with -O, and then
            # the stack would never be unwound, so characters() would
            # credit text to the wrong element.
            closed = self._context.pop()
            assert closed == name, "tags not nested properly"
            if name == 'record':
                x = self._official_name, self._common_name, self._synonym
                self.records.append(x)
        def characters(self, content):
            if not self._context:
                return
            # Text may arrive in several pieces, so append it to the
            # field named by the innermost open element.
            name = self._context[-1]
            if name == 'official_name':
                self._official_name += content
            elif name == 'common_name':
                self._common_name += content
            elif name == 'synonym':
                self._synonym += content

    extractor = _SpeciesNameExtractor()
    # Only generate events for the elements the extractor looks at.
    speclist_format = Martel.select_names(
        speclist.format, ['record', 'official_name', 'common_name', 'synonym'])
    parser = speclist_format.make_parser(debug_level=0)
    parser.setContentHandler(extractor)
    parser.setErrorHandler(handler.ErrorHandler())

    handle = datafile.open("speclist.txt")
    try:
        parser.parseFile(handle)
    finally:
        # Don't leak the file handle, whether or not parsing succeeded.
        handle.close()
    return extractor.records

def _make_variants(name):
    """Make a list of variants of the name that might appear in text."""
    # For a typical genus/species name look for:
    # Saccharomyces cerevisiae
    # - S. cerevisiae
    # - S cerevisiae
    # - Saccharomyces
    # Do not try to create variants for weird looking names:
    # - Yaba monkey tumor virus
    
    words = name.split()
    if len(words) != 2:
        return []
    variants = []
    genus, species = words
    if len(genus) > 1:
        variants.append("%s %s" % (genus[0], species))
        variants.append("%s. %s" % (genus[0], species))
    # I tried to create variants using just the genus:
    # - Saccharomyces
    # However, this yields too many false positives because many genus
    # names are common words:
    # - Beta trigyna
    # - Cancer magister
    # - Common rhea
    # - Helix aspersa
    # - Human rotavirus
    # - Rabbit rotavirus
    # (Maybe the human and rabbit ones should be left in?)
    # Creating a list of stopwords of common genus variants that
    # should be ignored is a lot of work.  I took a list of all the
    # genus, printed out the ones that appear in a dictionary, and
    # there are still 1061 of them (most are good genus names).  Maybe
    # I need a smaller dictionary.
    # variants.append(genus)
    
    return variants

def _get_trie_of_names_and_variants():
    """Return a trie whose keys are species names and their variants.

    The keys are the names from list_names plus the variants that
    _make_variants creates for each of them.  All keys are in lower
    case, for doing case insensitive searches.  Every key is stored
    with the value 1.  The function is memoized, so the trie is only
    built once.
    """
    from Bio import listfns
    from Bio import trie

    # Collect the species names, followed by all of their variants.
    species_names = list_names()
    candidates = list(species_names)
    for species_name in species_names:
        candidates.extend(_make_variants(species_name))
    candidates = listfns.items(candidates)

    # Store each name in lower case, so that a lower cased document
    # can be searched case insensitively.
    names_trie = trie.trie()
    for candidate in candidates:
        names_trie[candidate.lower()] = 1
    return names_trie
_get_trie_of_names_and_variants = memoize.memoize(
    _get_trie_of_names_and_variants)
    

def find(document):
    """Return list of (start, end) for the species names in document.

    The document is converted to a string and lower cased, then
    searched with triefind.find_words against the trie of species
    names and variants, so the search is case insensitive.  The second
    and third item of each hit are used as the (start, end) range, and
    the ranges are passed through rangefns.munge before being returned
    (presumably to combine overlapping ranges; see that module).
    """
    from Bio import triefind
    from Extracto import rangefns

    text = str(document)
    names_trie = _get_trie_of_names_and_variants()
    hits = triefind.find_words(text.lower(), names_trie)

    spans = []
    for hit in hits:
        spans.append((hit[1], hit[2]))
    return rangefns.munge(spans)
