"""preprocess.py

This module provides a facade to functions useful for preprocessing
text.

Functions:
find_tokens        Identify the locations of tokens in the string.
join_numbers       Find numbers and join them.  Changes the tokens.
find_sentences     Add tags marking sentence boundaries, based on heuristics.
find_abbrevs       Find abbreviations.

    The following functions call third-party applications:
porter_stem        Find the stems of tokens using Porter's stemmer.
abbSys_abbrev      Find abbreviations using abbSys.
rbt_tag            Do POS tagging using RBT.
sundance_parse     Parse the sentences using Sundance.

Dependencies:
   FUNCTION          DEPENDS ON
find_tokens          nothing
join_numbers         TOKEN
find_sentences       nothing
find_abbrevs         SENTENCE
porter_stem          TOKEN
abbSys_abbrev        SENTENCE
rbt_tag              TOKEN, SENTENCE
sundance_parse       TOKEN, SENTENCE

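Example (a minimal sketch; assumes the Extracto package is installed and
that raw_text is an ordinary Python string):

    marked = find_tokens(raw_text, clean=1)   # add TOKEN markups
    marked = find_sentences(marked)           # add SENTENCE markups
    marked = join_numbers(marked)             # merge numeric tokens
    marked = rbt_tag(marked)                  # needs TOKEN and SENTENCE
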
"""
from markup_consts import *



def find_tokens(string, clean=0, name=TOKEN):
    """find_tokens(string[, clean][, name]) -> string"""
    from Extracto import mstring
    from Extracto import tokenizer
    from Extracto import tokenfns
    
    if not isinstance(string, mstring.mstring):
        string = mstring.mstring(string)
    if clean:
        # Get rid of extraneous whitespace.
        string = string.strip()
        string = string.collapse()
    tokens = tokenizer.tokenize_str(str(string))
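    # Map each token back to its character offsets in the string so that
    # every token can be recorded as a markup spanning those characters.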
    offsets = tokenfns.find_offsets(tokens, string)
    for i in range(len(offsets)):
        s, e = offsets[i], offsets[i]+len(tokens[i])
        string.add_markup(name, str(i), s, e)
    return string

def join_numbers(string, token_name=TOKEN):
    """join_numbers(string[, token_name]) -> string"""
    from Extracto import rangefns
    from Extracto import number
    
    string = string[:]
    # Find where the numbers are.
    ranges = number.find(string)
    # Merge the number ranges with the existing token ranges, so that
    # tokens making up a single number are joined into one token.
    token_markups = filter(lambda x,t=token_name: x[0]==t, string._markups)
    tranges = map(lambda x: x[-2:], token_markups)
    ranges = rangefns.munge(ranges + tranges)

    # Now generate the final list of markups.  This consists of all
    # the non-token markups that were there before, plus new ones.
    markups = filter(lambda x,t=token_name: x[0]!=t, string._markups)
    for i in range(len(ranges)):
        s, e = ranges[i]
        markups.append((token_name, str(i), s, e))

    string._markups = markups
    return string

def find_sentences(string, name=SENTENCE, first_sentence=0):
    """find_sentences(string[, name]) -> string"""
    from Extracto import sentence
    string = string[:]
    ranges = sentence.find(string)
    for i in range(len(ranges)):
        s, e = ranges[i]
        string.add_markup(name, str(i+first_sentence), s, e)
    return string

def find_abbrevs(string, has_many_sentences=1,
                 name=ABBREV, sentence_name=SENTENCE):
    """find_abbrevs(string[, has_many_sentences][, name][, sentence_name]) -> mstring

    Find the abbreviations in the string.  If has_many_sentences is
    true (the default), the string is first split into sentences using
    the sentence_name markups.

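    Example (a sketch; assumes `s` is an mstring that already carries
    SENTENCE markups, e.g. from find_sentences):

        s = find_abbrevs(s)

    Each abbreviation is added as a markup whose value has the form
    "prefix|abbreviation|score".
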
    """
    from Bio import listfns
    from Extracto import abbreviation
    
    string = string[:]
    if has_many_sentences:
        sents = string.extract(sentence_name)
    else:
        sents = [string]
        
    abbrevs = []
    for s in sents:
        abbs = abbreviation.find(s)
        abbrevs.extend(abbs)
    # Get rid of duplicates.
    abbrevs = listfns.items(abbrevs)
    # Add the abbreviations as markups.
    for prefix, abb, score in abbrevs:
        value = "%s|%s|%s" % (prefix, abb, score)
        string.add_markup(name, value)
    return string

def porter_stem(string, name=PORTER, token_name=TOKEN):
    """porter_stem(string[, name][, token_name]) -> string"""
    from Bio import listfns
    from Extracto import ctype
    from Extracto import tokenfns
    from Extracto import stem
    
    string = string[:]
    tokens = string.extract(token_name)
    tokens = map(str, tokens)
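    # token2char maps each token index to the character offset where that
    # token starts.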
    token2char = tokenfns.count_offsets(tokens)
    # Find the tokens that look like words.
    indexes = listfns.indexesof(tokens, ctype.isalnum)
    for i in indexes:
        # Stem the words.
        value = stem.porter(tokens[i])
        s, e = token2char[i], token2char[i]+len(tokens[i])
        string.add_markup(name, value, s, e)
    return string

def abbSys_abbrev(string, has_many_sentences=1,
                  name=ABBSYS, sentence_name=SENTENCE):
    """abbSys_abbrev(string[, has_many_sentences][, name][, sentence_name]) -> mstring

    Find the abbreviations in the string using abbSys.  If
    has_many_sentences is true (the default), the string is first split
    into sentences using the sentence_name markups.  This may improve
    performance, because abbSys will not be confused by text that spans
    sentence boundaries.

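    Example (a sketch; assumes `s` already carries SENTENCE markups and
    that abbSys is installed):

        s = abbSys_abbrev(s)

    Each abbreviation is added as a markup whose value has the form
    "longform|abbreviation".
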
    """
    from Bio import listfns
    from Extracto import abbSys
    string = string[:]
    if has_many_sentences:
        sents = string.extract(sentence_name)
    else:
        sents = [string]
        
    abbrevs = []
    for s in sents:
        abbs = abbSys.find(s)
        abbrevs.extend(abbs)
    # Get rid of duplicates.
    abbrevs = listfns.items(abbrevs)
    # Add the abbreviations as markups.
    for longform, abb in abbrevs:
        value = "%s|%s" % (longform, abb)
        string.add_markup(name, value)
    return string
    
def rbt_tag(string, has_many_sentences=1,
            name=RBT, token_name=TOKEN, sentence_name=SENTENCE, params=None):
    """rbt_tag(string[, has_many_sentences][, name][, token_name][, sentence_name][, params]) -> mstring

    Do POS tagging on a string with RBT.  If has_many_sentences is true
    (the default), the string is first split into sentences using the
    sentence_name markups.  params is an optional RBT.TaggerParams
    object to pass to the tagger.

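    Example (a sketch; assumes `s` already carries TOKEN and SENTENCE
    markups, e.g. from find_tokens and find_sentences, and that RBT is
    installed):

        s = rbt_tag(s)

    Each token's part-of-speech tag is added as a markup spanning that
    token.
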
    """
    from Extracto import mstring
    from Extracto import tokenfns
    from Extracto import RBT as _RBT
    
    string = string[:]

    # If the string is empty, don't do anything with it.
    if not str(string).strip():
        return string
    
    # RBT wants each sentence to be on a single line, so collapse the
    # whitespace to make sure that happens.
    cleaned = string.collapse()
    # Split the tokens into sentences.
    if has_many_sentences:
        sents = cleaned.extract(sentence_name)
        assert len(sents), "I could not find sentences with name %s" % \
               sentence_name
        sep = mstring.mstring("\n")
        sep.add_markup(token_name, "", 0, len(sep))
        cleaned = sep.join(sents)
    tokens = cleaned.extract(token_name)
    assert len(tokens), "I could not find tokens with name %s" % token_name
    tokens = map(str, tokens)
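    # tag_tokens returns (tag, token index) pairs; the tags are matched back
    # onto character offsets in the original string below.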
    tags = _RBT.tag_tokens(tokens, params=params)
    
    # Get rid of all the "\n" characters that I put in.
    for i in range(len(tokens)):
        if tokens[i] == "\n":
            tokens[i] = ''
    # Now match the tags back onto the original string.
    offsets = tokenfns.find_offsets(tokens, string)
    for tag, index in tags:
        s, e = offsets[index], offsets[index]+len(tokens[index])
        string.add_markup(name, tag, s, e)
    return string

def sundance_parse(string,
                   has_many_sentences=1,
                   root_name=SUNDANCE_ROOT,
                   tag_source_name=SUNDANCE_TAG_SOURCE,
                   part_of_speech_name=SUNDANCE_PART_OF_SPEECH,
                   markup_name=SUNDANCE_MARKUP,
                   token_name=TOKEN, sentence_name=SENTENCE,
                   params=None
                   ):
    """sundance_parse(string[, has_many_sentences][, root_name]
    [, tag_source_name][, part_of_speech_name][, markup_name]
    [, token_name][, sentence_name][, params]) -> string

    Parse the string's tokens with Sundance.  If has_many_sentences is
    true (the default), the sentence boundaries are identified using
    sentence_name and each sentence is parsed separately.  params is an
    optional sundance.NLPParams object to pass to the parser.

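    Example (a sketch; assumes `s` already carries TOKEN and SENTENCE
    markups and that Sundance is installed):

        s = sundance_parse(s)

    Root, tag-source, part-of-speech and other parse markups are added
    for each sentence.
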
    """
    from Extracto import tokenfns
    from Extracto import sundance
    
    # Define a function to join a list of strings together with sep.
    # This is useful for munging together multiple parts of speech.
    def list2str(x, sep=','):
        if not x:
            return x
        return sep.join(x)
    string = string[:]
    if has_many_sentences:
        sents = string.extract(sentence_name)
    else:
        sents = [string]
    sents_offsets = tokenfns.find_offsets(sents, string)
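    # sents_offsets holds the character offset of each sentence within the
    # full string; token offsets below are shifted by these so the markups
    # land in string coordinates.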
    
    # Now parse each sentence.
    for sent, sent_start in zip(sents, sents_offsets):
        tokens = sent.extract(token_name)
        token2char = tokenfns.count_offsets(tokens)
        # seg_tokens returns four lists: roots, tag sources, and parts of
        # speech as (data, token index) pairs, plus markups as
        # (name, start token, end token) triples.
        x = sundance.seg_tokens(tokens, params=params)
        r, ts, pos, markups = x
        for data, index in r:
            s, e = token2char[index], token2char[index]+len(tokens[index])
            s, e = s+sent_start, e+sent_start
            string.add_markup(root_name, data, s, e)
        for data, index in ts:
            s, e = token2char[index], token2char[index]+len(tokens[index])
            s, e = s+sent_start, e+sent_start
            string.add_markup(tag_source_name, data, s, e)
        for data, index in pos:
            s, e = token2char[index], token2char[index]+len(tokens[index])
            s, e = s+sent_start, e+sent_start
            data = list2str(data)
            string.add_markup(part_of_speech_name, data, s, e)
        for name, s, e in markups:
            s, e = token2char[s], token2char[e]
            s, e = s+sent_start, e+sent_start
            string.add_markup(markup_name, name, s, e)
    return string
