"""parentheses.py

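Functions:
is_parenthetical  Test whether a delimited span is a parenthetical statement.
find_all          Find every delimited span in a string.
find              Find the parenthetical statements in a string.
remove            Remove the parenthetical statements from a string.
separate          Separate a sentence into parenthetical statements.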

"""
def is_parenthetical(string, range):
    """is_parenthetical(string, range) -> boolean

    Return whether the delimited span at range, a (start, end) pair of
    indexes into string, is a real parenthetical statement such as
    "... (...) ...", as opposed to parentheses that are part of a
    token, such as the gene name NAD(P)H.

    The result is 1 for a parenthetical statement and 0 otherwise.

    """
    from Extracto import ctype

    text = str(string)
    start, end = range
    at_end = end >= len(text)

    # RULE: It is a parenthetical statement if there is a space (or the
    # string boundary) before the parentheses, and what follows is a
    # space, the string boundary, or a syntactical boundary (period,
    # comma, etc).
    # EXAMPLE: This is an example (a good one) of a paren.
    opens_after_gap = not start or ctype.isspace(text[start-1])
    if opens_after_gap and (at_end or
                            ctype.isspace(text[end]) or
                            text[end] in ".,;:!?"):
        return 1

    # RULE: It is not a parenthetical statement if there is an
    # alphanumeric character directly before the parentheses...
    if start and ctype.isalnum(text[start-1]):
        return 0

    # RULE: ...or if an alphanumeric character or a dash comes directly
    # after them.
    # EXAMPLE: [125I]-ET
    if not at_end and (ctype.isalnum(text[end]) or text[end] == '-'):
        return 0

    # Otherwise, by default, it is a parenthetical statement.
    return 1

def find_all(string, *args):
    """find_all(string[, delimiters]) -> list of (start, end)

    Find all occurrences of strings that appear within a set of delimiters.

    Each delimiter is a two-character string, '<open><close>'.  If no
    delimiters are given, "()" and "[]" are used.  The returned ranges
    include the delimiters themselves.  Ranges are grouped by
    delimiter, in the order the delimiters were given, and within one
    delimiter nested ranges are reported innermost first; the list is
    not sorted by position.

    Raises a ValueError if a delimiter is not exactly two characters
    long.

    """
    import re

    if args:
        delimiters = args
        for delimiter in delimiters:
            if len(delimiter) != 2:
                # Call form of raise works on both Python 2 and 3.
                raise ValueError("delimiter must be '<open><close>' chars")
    else:
        delimiters = ["()", "[]"]
    reobjs = [re.compile(_delimiter2pattern(x[0], x[1])) for x in delimiters]

    # Work on a plain-string copy so the caller's object is untouched
    # and the conversion happens only once.
    text = str(string)
    parens = []
    for reobj in reobjs:
        while 1:
            r = reobj.search(text)
            if not r:
                break
            start, end = r.span()
            parens.append((start, end))
            # Need to be able to find parentheses within parentheses
            # too.  Thus, when I find a match, mask its delimiters so
            # that the enclosing pair can match on a later pass.  The
            # replacement is the same length, so indexes stay valid.
            text = text[:start] + "-" + text[start+1:end-1] + \
                   "-" + text[end:]
    return parens

def find(string, *args):
    """find(string[, delimiters]) -> list of (start, end)

    Find the parenthetical statements that appear in a string.
    Delimited spans that sit inside a token, e.g. NAD(P)H or
    [125I]-ET, are left out of the result.

    """
    # Keep only the delimited spans that pass the parenthetical test.
    return [span for span in find_all(string, *args)
            if is_parenthetical(string, span)]

def remove(string, *args):
    """remove(string[, delimiters]) -> string

    Return string with its parenthetical statements, delimiters
    included, cut out.  The optional delimiters are passed on to find.

    """
    return _remove_spans(string, find(string, *args))

def _remove_spans(string, spans):
    """_remove_spans(string, spans) -> string"""
    if not spans:
        return string
    spans = spans[:]
    # Make sure the spans don't overlap.
    spans.sort()
    i = 0
    while i < len(spans)-1:
        start, end = spans[i]
        nstart, nend = spans[i+1]
        if nstart < end:
            spans[i] = start, max(end, nend)
            del spans[i+1]
        else:
            i += 1
    # Now delete them in reverse order so that indexes aren't messed
    # up.
    spans.reverse()
    for start, end in spans:
        string = string[:start] + string[end:]
    return string

def separate(sentence, preserve_spacing=0):
    """separate(sentence[, preserve_spacing]) -> statements

    Separate a sentence into separate parenthetical statements.  For
    example, the sentence:
      The MAPK (mitogen-activated protein kinase) pathway ...
    has two statements "The MAPK pathway ..." and "mitogen-activated
    protein kinase".

    The result is a list of statements.  The first one is the whole
    sentence with its parenthetical statements taken out; the others
    follow in order of where they start in the sentence.

    If preserve_spacing is a true value, then I will make sure the
    spacing of the statements matches that of the original sentence.
    Thus, the string for each statement is the same length as the
    original and the statements are directly aligned onto the original
    sentence.

    """
    import mx.TextTools as TT
    from Extracto import textfns
    from Extracto import rangefns
    from Extracto.strcompn import NONSPACE

    # For simplicity, make the whole sentence a parenthetical
    # statement.  It encloses every other range, so it has to come
    # first.  A plain sort gets this wrong when a parenthetical
    # statement starts at index 0: (0, 5) sorts before (0, len), so
    # the whole sentence would be removed from the parenthetical
    # instead of the other way around.  Place it explicitly.
    len_sentence = len(sentence)
    whole = (0, len_sentence)
    inner_ranges = [r for r in find(sentence) if r != whole]
    inner_ranges.sort()
    paren_ranges = [whole] + inner_ranges

    # Now pull out all the parenthetical statements from the sentence.
    # For each statement, remove all the statements that occur within
    # me.  With the ordering above, all the internal ones come after
    # me in the list.
    statements = []
    for i in range(len(paren_ranges)):
        # Get my ranges
        statement_ranges = [paren_ranges[i]]

        # Remove all the internal parenthetical statements.
        # NOTE(review): assumes rangefns.remove(ranges, s, e) returns
        # the ranges with [s, e) cut out and ignores ranges that do
        # not overlap -- confirm against Extracto.rangefns.
        for s, e in paren_ranges[i+1:]:
            statement_ranges = rangefns.remove(statement_ranges, s, e)

        # Now reconstruct the sentence from these ranges, padding the
        # gaps with spaces so that every character keeps its original
        # position.  Starting from sentence[:0] keeps the type of the
        # sentence object.
        statement_ranges.sort()
        statement = sentence[:0]
        last_end = 0
        for s, e in statement_ranges:
            statement = statement + ' '*(s-last_end)
            statement = statement + sentence[s:e]
            last_end = e
        statement = statement + ' '*(len_sentence-last_end)

        # Strip off the parentheses at the ends.  This makes things
        # easier, so when I extend the gene names, I don't have to
        # worry about how to handle parentheses.
        # NOTE(review): presumably shrink_to returns the range that
        # is left after trimming the whitespace at both ends -- confirm
        # against Extracto.textfns.
        s, e = textfns.shrink_to(statement, NONSPACE, 0, len(statement))
        if s < len(statement) and e > 0:
            if (statement[s] == '(' and statement[e-1] == ')') or \
               (statement[s] == '[' and statement[e-1] == ']'):
                # Blank the delimiters out rather than deleting them,
                # so the length of the statement does not change.
                statement = statement[:s] + " " + statement[s+1:]
                statement = statement[:e-1] + " " + statement[e:]

        if not preserve_spacing:
            # Prefer the statement's own collapse method, if it has
            # one, over the mx.TextTools function.
            if hasattr(statement, 'collapse'):
                statement = statement.collapse().strip()
            else:
                statement = TT.collapse(statement).strip()

        statements.append(statement)
    return statements


def _delimiter2pattern(open, close):
    return r"\%s[^\%s\%s]+\%s" % (open, open, close, close)
