"""sentence.py

Functions:
find     Find the (start, end) positions of the sentences in a string.
split    Split a string into a list of sentences.
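
Example (illustrative; the exact splits depend on Extracto's
tokenizer):

    >>> import sentence
    >>> sentence.split("E. coli grows fast. It divides quickly.")
    ['E. coli grows fast.', 'It divides quickly.']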

"""
SENTENCE_BOUNDARIES = "?!."   # Chars that can indicate sentence boundaries.

def _is_boundary(tokens, index, length):
    """_is_boundary(tokens, index, length) -> boolean

    Return a boolean value indicating whether the token at
    tokens[index] is a sentence boundary.  length is the current
    length of the sentence.
    
    """
    import string
    from Extracto import ctype
    
    ### General rules

    # RULE: If I'm at the end of the list, then this is a sentence
    # boundary.
    # COMMENTS: There's no more text; articles always end at the end
    # of a sentence.
    if index == len(tokens)-1:
        return 1
    
    # RULE: Only '.', '?', '!', and '"' can be at the end of a sentence.
    # COMMENTS: We don't split clauses joined by ';'.
    if len(tokens[index]) != 1 or tokens[index] not in '?.!"':
        return 0
    
    # RULE: If this sentence boundary character is followed by another
    # boundary character, then it's not a sentence boundary.
    # COMMENTS: For repeated boundaries, the first one should never be
    # the end of a sentence, e.g. '???', '!!', '...'.  With the
    # whitespace rule, this is not strictly necessary, but I'll keep
    # it because it models a language phenomenon, and the whitespace
    # rule might change.
    if len(tokens[index+1]) == 1 and tokens[index+1] in SENTENCE_BOUNDARIES:
        return 0

    # RULE: A sentence boundary character followed by "(ABSTRACT" is
    # always a sentence boundary.
    # COMMENTS: This handles the "(ABSTRACT TRUNCATED AT 250 WORDS)"
    # string appended by PubMed.  They do not always leave a space
    # after the last period.  The len(tokens)-11 guard leaves room
    # for the rest of the truncation message.
    if index <= len(tokens)-11 and \
       tokens[index+1] == '(' and tokens[index+2] == 'ABSTRACT':
        return 1

    # RULE: Sentence boundaries must be followed by whitespace.
    # COMMENTS: This rules out periods inside numbers and URLs,
    # e.g. '14.2' and 'www.ncbi.nlm.nih.gov'.
    if not ctype.isspace(tokens[index+1]):
        return 0

    ### Rules to handle question marks and exclamation points

    # RULE: Question marks and exclamation points mark the end of the
    # sentence, as long as they're not followed by quotes.
    # COMMENTS: In scientific literature, I can't think of any cases
    # where this fails.  I should be on the lookout for negative
    # examples.
    if tokens[index] == '!' or tokens[index] == '?':
        # The whitespace rule above already returned 0 for a
        # following '"', so this check is defensive.
        return not (index+1 < len(tokens) and tokens[index+1] == '"')


    ### Rules to handle quotation marks

    # RULE: Quotation marks are sentence boundaries if they follow a
    # real sentence boundary character.
    # COMMENTS: Sentences can end with ."
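    # Illustrative example: in '... is essential."', the '"' (not
    # the '.') is the token that ends the sentence.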
    if tokens[index] == '"':
        return index > 0 and \
               len(tokens[index-1]) == 1 and \
               tokens[index-1] in SENTENCE_BOUNDARIES

    ### Rules to handle periods

    # I now know I'm at a period followed by whitespace.

    # RULE: If this period completes an 'e.g.' (the four tokens
    # 'e', '.', 'g', '.'), then I'm not at a sentence boundary.
    if index >= 3 and tokens[index-3] == "e" and tokens[index-2] == "." \
       and tokens[index-1] == "g":
        return 0

    # RULE: Find the next non-whitespace token.  If there are no
    # more tokens, then I'm at the end of the sentence.
    next_token = index + 1
    while next_token < len(tokens) and ctype.isspace(tokens[next_token]):
        next_token += 1
    if next_token >= len(tokens):
        # No more tokens, I'm at the end of the list.  See previous rule.
        return 1

    # RULE: If the next non-whitespace token is a ')', then I'm not at
    # a sentence boundary.
    if tokens[next_token] == ')':
        return 0
    
    # RULE: If the next non-whitespace token starts with a capital
    # letter, then I'm at a sentence boundary.
    if tokens[next_token][0].isupper():
        return 1

    # RULE: If the next token starts with '[omp]-' followed by a
    # capital letter, then I'm at a sentence boundary.
    # COMMENTS: Chemical names like p-Nitrophenol are capitalized
    # after the o-, m-, p- abbreviation.  It looks like if the 'meta'
    # is actually written out, it's capitalized, e.g. PMID 10939229.
    if next_token <= len(tokens) - 3:
        if tokens[next_token] in ['o', 'm', 'p'] and \
           tokens[next_token+1] == '-' and \
           tokens[next_token+2][0].isupper():
            return 1

    # RULE: Ignore non-whitespace tokens without alphabetic characters.
    # COMMENTS: The first alphabetic character after a period should
    # be capitalized.  This handles cases like 5'-Deletions.
    # Find the first alphabetic character.
    t = next_token
    while t < len(tokens):
        # Find the index of the first letter.
        i = ctype.strcspn(tokens[t], string.letters)
        # If there are no letters in this token, skip it.
        if i == len(tokens[t]):
            t = t + 1
            continue
        # If the letter is capitalized, then this is a sentence boundary.
        if tokens[t][i].isupper():
            return 1
        # Letter is lower case, so don't call this a sentence boundary.
        break

    # Otherwise, I'm not at a sentence boundary.

    # This correctly handles:
    # - token-internal periods, e.g. 14.2, N.I.H.-approved, e.g.
    # - multiple end of sentence boundaries, e.g. ???, etc...
    # - abbreviations in sentences
    #   hCG (300 ng i.p.)                                            0010927634
    #   hCG (300 ng i.p. )
    # - sentence ends with ."
    # - species names, e.g. E. coli
    # - E.C. numbers
    #   EC 1.6.2.4                                                   0010927020
    # - next sentence starts with a number
    #   observed. 5'-Deletion                                        0010931833
    #   controls. 4-IM                                               0010917203
    # - next sentence starts with a chemical name
    #   e.g. p-Nitrophenol
    # - next sentence starts with a punctuation mark,
    #   e.g. parenthesis, quotation
    # - next sentence starts with a number token.
    # - PubMed added message:
    #   method.(ABSTRACT TRUNCATED AT 250 WORDS)

    # This doesn't handle (and should):
    # - next sentence starts with a non-capitalized word,
    #   e.g. hCG
    # - abbreviations in names
    #   Mol. Brain Res.                                              0010906488


    # XXX allow exceptions, hCG, cDNA, mRNA

    # This doesn't handle, but may not need to:
    # - numbering
    #   1. XXXX    2. XXXX                                           0010923861

    # special cases in PMID 10898934

    return 0

def find(string):
    """find(string) -> list of (start, end) marking the sentences"""
    from Extracto import tokenfns
    from Extracto import tokenizer
    
    string = str(string)
    
    tokens = tokenizer.tokenize_str(string)
    token2char = tokenfns.count_offsets(tokens)
    # Iterate through all the tokens, passing each one to the
    # disambiguation function.
    indexes = []
    start = 0     # start of the sentence
    for i in range(len(tokens)):
        if _is_boundary(tokens, i, i-start):
            indexes.append(token2char[i+1])
            start = i+1

    # Now pull out the sentences from each boundary.
    start = 0
    sentences = []
    for i in indexes:
        s = string[start:i]
        s = s.strip()
        if s:
            sentences.append(s)
        start = i

    offsets = tokenfns.find_offsets(sentences, string)
    ranges = [(o, o+len(s)) for s, o in zip(sentences, offsets)]
    return ranges

def split(string):
    """split(string) -> list of sentences"""
    ranges = find(string)
    sentences = []
    for s, e in ranges:
        sentences.append(string[s:e].strip())
    return sentences
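

if __name__ == '__main__':
    # Minimal smoke test.  Illustrative only: the actual splits
    # depend on Extracto's tokenizer, which this module assumes is
    # importable.
    sample = "The effect was significant. E. coli cultures, " \
             "e.g. strain K-12, grew well. Did expression increase? Yes."
    for s in split(sample):
        print s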

