"""This module recognizes series of noun phrases.

Series of noun phrases come in the patterns:
- A and B
- A, B, C, (and|or|but not) D
- A, B, C (and|or|but not) D

Functions:
find    Find series of noun phrases.

"""
import string

import mx.TextTools as TT

from Bio import listfns
from Bio import stringfns

def _find_sequence_of_tags(tags, allowed_tags, start=0):
    # Return (start, end) index into tags, or None if not found.
    # Finds a consecutive run of tags, all of which are listed in
    # allowed_tags.
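    # For example (hypothetical inputs; any mapping with the tags as
    # keys works for allowed_tags):
    #     _find_sequence_of_tags(['VB', 'NN', 'NN', 'IN'], {'NN': 1})
    #     -> (1, 3)
    #     _find_sequence_of_tags(['VB', 'IN'], {'NN': 1})
    #     -> None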
    i = start
    
    # Find the first tag in allowed_tags.
    while i < len(tags) and tags[i] not in allowed_tags:
        i += 1
    if i >= len(tags):
        return None
    start_index = i

    # Find the end of the allowed_tags.
    while i < len(tags) and tags[i] in allowed_tags:
        i += 1
    end_index = i

    return start_index, end_index


NP_TAGS = [
    'NN', 'NNP', 'NNPS', 'NNS',
    'JJ', 'JJR', 'JJS', 'JJSS',
    'DT',
    'CC',
    'RB',               # adverb (e.g. "not")
    'VBG', 'VBN',       # gerund or present participle, past participle
    'PRP',              # personal pronoun
    ',', ':', '(', ')', 'SYM',
    'CD',               # cardinal number
    '.',                # token-internal periods, e.g. "9.5"
    ]
NP_TAGS = listfns.asdict(NP_TAGS)
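# listfns.asdict converts the list into a dictionary keyed by tag (the
# values are arbitrary), so the "tag in NP_TAGS" membership tests below
# are constant-time dictionary lookups instead of list scans.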

def _find_consecutive_nouns(tags):
    # Returns list of (start, end).
    # This returns all of the maximal runs of NP tags in the sentence.
    # Some of them are series, and others aren't.
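    # For example (hypothetical tag list; ',' is an NP tag, 'VB' and
    # 'IN' are not):
    #     _find_consecutive_nouns(['VB', 'NN', ',', 'NN', 'IN', 'NN'])
    #     -> [(1, 4), (5, 6)]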
    start = 0
    sequences = []
    while start <= len(tags):
        x = _find_sequence_of_tags(tags, NP_TAGS, start=start)
        if not x:
            break
        s, e = x
        sequences.append(x)
        start = e+1   # tags[e] is already known not to match, so skip it
    return sequences

def _clean_consecutive_nouns(series, sentence, tokens, token_indexes, tags):
    # Return the cleaned-up (start, end), or None if nothing is left.
    # series is (start, end) index into tokens
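    # For example (hypothetical inputs; only the tags matter here):
    #     _clean_consecutive_nouns((0, 4), sentence, tokens, token_indexes,
    #                              ['CC', 'NN', 'NN', ','])
    #     -> (1, 3)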
    start, end = series

    # Strip the "CC", ",", ".", and ":" tags from the ends.
    while start < len(tags) and tags[start] in ["CC", ",", ":", "."]:
        start += 1
    if start >= len(tags):
        return None
    while end > 0 and tags[end-1] in ["CC", ",", ":", "."]:
        end -= 1
    if end <= 0:
        return None

    return start, end

def _is_np_series(series, sentence, tokens, token_indexes, tags):
    # series is (start, end) index into tokens
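    # For example, a span tagged ['DT', 'NN', 'CC', 'NN'] ("the cat and
    # dog") passes every check below, while ['CD', 'CC', 'CD'] ("1 and
    # 2") fails the noun check.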
    start, end = series

    sent_start = token_indexes[start]
    if end >= len(token_indexes):
        sent_end = len(sentence)
    else:
        sent_end = token_indexes[end]

    # If there are no alphabetical characters, then this is not an NP
    # series.
    if TT.setfind(str(sentence)[sent_start:sent_end], TT.alpha_set) < 0:
        return 0

    # If there are no nouns or adjectives, then this is not an NP
    # series.
    for tag in tags[start:end]:
        if tag.startswith("NN") or tag.startswith("JJ"):
            break
    else:
        # The loop never found a noun or adjective tag.
        return 0

    # If there is no conjunction, then this is not an NP series.
    for tag in tags[start:end]:
        if tag == 'CC':
            break
    else:
        # The loop never found a conjunction.
        return 0
    
    return 1

def _find_np_series(sentence):
    # return list of (start, end)
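    # The offsets are character indexes into str(sentence), derived
    # from the RBT part-of-speech markups that Extracto attaches.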
    from Extracto import tokenfns
    from markup_consts import RBT
    
    tokens = sentence.extract(RBT)
    if not tokens:
        raise ValueError("could not find any RBT tags on the sentence")
    markups = [x.markups(name=RBT)[0] for x in tokens]
    tags = [x[1] for x in markups]
    token_indexes = tokenfns.find_offsets(tokens, sentence)

    series = _find_consecutive_nouns(tags)
    args = sentence, tokens, token_indexes, tags
    series = [_clean_consecutive_nouns(x, *args) for x in series]
    series = [x for x in series if x is not None]   # drop the Nones
    series = [x for x in series if _is_np_series(x, *args)]

    # The series are indexes into the tag list.  Convert them to
    # character indexes into the sentence.
    sent_series = []
    for s, e in series:
        si = token_indexes[s]
        if e >= len(token_indexes):
            ei = len(str(sentence))
        else:
            ei = token_indexes[e]
        sent_series.append((si, ei))

    return sent_series

def find(sentence):
    """find(sentence) -> list of (start, end)

    Return the ranges of series of noun phrases (e.g. "A, B, and C").
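
    Hypothetical example (assumes sentence is an Extracto mstring
    carrying RBT part-of-speech markups):

        >>> ranges = find(sentence)
        >>> [str(sentence)[s:e] for s, e in ranges]
        ['neuropeptide Y (NPY), and substance P (SP)']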

    """
    import re
    from Extracto import mstring
    from Extracto import parentheses

    if not isinstance(sentence, mstring.mstring):
        raise ValueError, "sentence must be a mstring object"
    str_sentence = str(sentence)

    abbrev_re = re.compile(r"\s*\(\w+\)")   # matches e.g. " (NPY)"

    ranges = []
    for statement in parentheses.separate(sentence, preserve_spacing=1):
        if not str(statement).strip():
            continue
        series = _find_np_series(statement)

        # Do some cleaning up with the ranges.
        for i in range(len(series)):
            s, e = series[i]
            
            # Since the sentence was split into statements, the indexes
            # might have been thrown off.  Back the end index up past
            # any trailing whitespace, to the last non-whitespace
            # character.
            e = stringfns.rfind_anychar(
                str(statement), string.whitespace, index=e-1, negate=1)
            e += 1

            # Sometimes a series will end with an abbreviation.  Be
            # sure to include the abbreviation in the noun phrase.
            # neuropeptide Y (NPY), and substance P (SP)
            m = abbrev_re.match(str_sentence[e:])
            if m:
                e += m.end()
            
            series[i] = s, e
        ranges.extend(series)
    ranges.sort()
    return ranges
