"""phraselet.py

A phraselet is a group of words that are not broken up by punctuation.

Functions:
find     Find the phraselets in a string.

"""
from Bio import listfns

import tokenfns
import ctype


# Punctuation characters that may mark the edge of a phraselet.
# _is_boundary looks characters up in this object by key, so it must be
# dict-like.  Presumably listfns.asdict maps each character of the string
# to a true value -- confirm against Bio.listfns.
BOUNDARY_CHARS = listfns.asdict(',;:(){}[]"!?')

def _is_boundary(string, index):
    """_is_boundary(string, index) -> boolean

    Return whether string[index] is a boundary for a phraselet.

    string is the text being scanned and index is the position of the
    character to test.  The result is 1 for a boundary and 0 otherwise.
    The last character of the string is always reported as a boundary,
    whatever it is.

    """
    ### General rules

    # The start of the string counts as a space on the left.
    left_is_space = (not index) or ctype.isspace(string[index-1])
    # The end of the string counts as a space on the right.  Note that a
    # following punctuation character is also accepted here, so runs of
    # punctuation such as '",' are treated like punctuation next to a space.
    right_is_space = (index == len(string)-1) or \
                     ctype.isspace(string[index+1]) or \
                     ctype.ispunct(string[index+1])

    # RULE: A comma, semicolon, etc. that has a space on at least one
    # side is a phraselet boundary.
    # COMMENTS: Some punctuation characters aren't boundaries: a period
    # might be an abbreviation, < could be less than, $ could be money,
    # etc.  Those characters are deliberately left out of BOUNDARY_CHARS.
    # Membership is tested with "in" rather than dict.has_key(), which
    # no longer exists in Python 3.
    if string[index] in BOUNDARY_CHARS and \
       (left_is_space or right_is_space):
        return 1

    # RULE: The end of the string is a phraselet boundary.
    if index == len(string)-1:
        return 1

    return 0

def find(string):
    """find(string) -> list of (start, end) marking the phraselets

    string is converted with str() and scanned for phraselet boundaries
    using _is_boundary.  The text between consecutive boundaries is
    stripped of surrounding whitespace, and empty pieces are discarded.
    Each remaining phraselet is located in the string with
    tokenfns.find_offsets and reported as a (start, end) pair, where end
    is start plus the length of the phraselet.

    The final character of the string belongs to the last phraselet
    unless it is punctuation.  For example, "a, b" yields the phraselets
    "a" and "b".

    """
    string = str(string)
    last = len(string) - 1

    # Walk the string once, cutting out a phraselet at each boundary.
    start = 0
    phraselets = []
    for i in range(len(string)):
        if not _is_boundary(string, i):
            continue
        # A boundary character is normally excluded from the phraselet.
        # _is_boundary also flags the last index simply because it is the
        # end of the string; in that case the character is ordinary text
        # and must be kept, otherwise the last phraselet loses its final
        # character.  Trailing punctuation is still dropped, as before.
        # (Assumes ctype.ispunct returns a true value for punctuation
        # characters, as its use in _is_boundary suggests.)
        end = i
        if i == last and not ctype.ispunct(string[i]):
            end = i + 1
        s = string[start:end].strip()
        if s:
            phraselets.append(s)
        start = i + 1

    offsets = tokenfns.find_offsets(phraselets, string)
    return [(o, o + len(s)) for s, o in zip(phraselets, offsets)]