"""Miscellaneous functions that work with text.

Functions:
splitwords     Split a string based on whitespace or punctuation.
find_lenient   Find a string, being lenient on whitespace/punctuation mismatches.
extend_to      Extend a range to a boundary, defined by a set of characters.
shrink_to      Shrink the range to a boundary.

"""
import string, re

from Bio import stringfns

# Characters that count as part of a word: letters and digits.
# splitwords() hands this to stringfns.splitany() with negate=1, i.e. it
# splits on the characters that are NOT listed here.
WORD_CHARS = string.letters + string.digits

def splitwords(s):
    """splitwords(s) -> list of words

    Break s apart at every character that is not a letter or a digit
    (see WORD_CHARS) and return the non-empty pieces, in order.

    """
    # negate=1: the separators are the characters NOT in WORD_CHARS.
    pieces = stringfns.splitany(s, sep=WORD_CHARS, negate=1)
    # Discard any empty pieces so that only real words are returned.
    return [piece for piece in pieces if piece]

def find_lenient(string, what,
                 ignore=string.whitespace+string.punctuation, start=0):
    """find_lenient(string, what[, ignore][, start]) -> (start, end) or None

    Find the first occurrence of what in string, allowing mismatches
    on any character in ignore.  By default, ignores mismatches on
    whitespace and punctuation.

    what is broken into words at the characters in ignore; a match is
    those words, in order, separated by zero or more ignore characters.
    Both the words and the ignore characters are matched literally, so
    regular expression metacharacters in either have no special
    meaning.

    The search begins at index start.  The returned (start, end) pair
    is expressed in indexes of the full string, with end exclusive.
    Returns None if there is no match.

    """
    if ignore:
        # Escape every character so that '^', ']', '-' and '\' are
        # ordinary members of the character class instead of changing
        # its meaning.
        ignore_class = "[%s]" % re.escape(ignore)
        words = re.split(ignore_class + "+", what)
        gap = ignore_class + "*"
    else:
        # Nothing is ignorable ("[]" would not even be a valid
        # pattern): look for what exactly as given.
        words = [what]
        gap = ""
    # Drop the empty pieces produced by leading/trailing ignorable
    # characters, and make the remaining words match literally.
    words = [re.escape(word) for word in words if word]
    m = re.search(gap.join(words), string[start:])
    if m is None:
        return None
    # The search ran on string[start:], so shift the span back.
    s, e = m.span(0)
    return s+start, e+start

##def find_nopunc(string, what, start=0):
##    """find_nopunc(string, what[, start]) -> (start, end) or None

##    Find the first occurrence of what in string, being lenient on
##    punctuation mismatches.

##    """
##    words = splitwords(what)
##    m = re.search(r"\W+".join(words), string[start:])
##    if m is None:
##        return None
##    s, e = m.span(0)
##    return s+start, e+start

def extend_to(string, boundary, start, end, mode=0):
    """extend_to(string, boundary, start, end[, mode]) -> start, end

    Extend the range at start, end to the boundaries of the word.
    boundary is the set of characters that mark a boundary.  mode
    indicates which side to extend (<0 left boundary, >0 right
    boundary, =0 both boundaries.)

    string is converted with str() before searching.  If no boundary
    character is found on a side, that side is extended all the way to
    the corresponding end of the string (0 on the left, len(string) on
    the right).

    Raises ValueError if start is not less than end.

    """
    string = str(string)
    if start >= end:
        raise ValueError("start should be less than end")
    if mode <= 0:
        # Left side: look for a boundary character using the reverse
        # search, starting from index start.
        i = stringfns.rfind_anychar(string, boundary, index=start)
        if i == -1:
            # No boundary found; the range begins at the start of string.
            start = 0
        else:
            # Begin just after the boundary character.
            start = i+1
    if mode >= 0:
        # Right side: look for a boundary character using the forward
        # search, starting from the last index inside the range.
        i = stringfns.find_anychar(string, boundary, index=end-1)
        if i == -1:
            # No boundary found; the range runs to the end of string.
            end = len(string)
        else:
            # end is exclusive, so it stops at the boundary character.
            end = i
    return start, end

def shrink_to(string, boundary, start, end, mode=0):
    """shrink_to(string, boundary, start, end[, mode]) -> start, end

    Shrink the range at start, end to the boundaries of the word.
    boundary is the set of characters that mark a boundary.  mode
    indicates which side to shrink (<0 left boundary, >0 right
    boundary, =0 both boundaries.)

    string is converted with str() before searching.  If no boundary
    character is found on a side, start becomes len(string) (left
    side) or end becomes 0 (right side).  The result is not checked
    again, so the returned start may be greater than or equal to the
    returned end.

    Raises ValueError if start is not less than end.

    """
    string = str(string)
    if start >= end:
        raise ValueError("start should be less than end")
    if mode <= 0:
        # Left side: look for a boundary character using the forward
        # search, starting from index start.
        i = stringfns.find_anychar(string, boundary, index=start)
        if i == -1:
            # No boundary found; the left edge collapses to the end.
            start = len(string)
        else:
            # The range now begins at the boundary character itself.
            start = i
    if mode >= 0:
        # Right side: look for a boundary character using the reverse
        # search, starting from the last index inside the range.
        i = stringfns.rfind_anychar(string, boundary, index=end-1)
        if i == -1:
            # No boundary found; the right edge collapses to the start.
            end = 0
        else:
            # end is exclusive, so the boundary character is included.
            end = i+1
    return start, end
    
