"""parentheses.py Functions: find Find the parentheses in a string. remove Remove the parentheses from a string. separate Separate a sentence into parenthetical statements. """ def is_parenthetical(string, range): """is_parenthetical(string, range) -> boolean Return between parenthetical statements, ... (...) ..., or parentheses within things like gene names, NAD(P)H. """ from Extracto import ctype string = str(string) start, end = range # RULE: It is a parenthetical statement if there are spaces (or # boundaries) before and after the parentheses. # EXAMPLE: This is an example (a good one) of a paren. if (not start or ctype.isspace(string[start-1])) and \ (end >= len(string) or ctype.isspace(string[end])): return 1 # RULE: It is a parenthetical statement if there is a space or # boundary before the parentheses, and there is a synactical # boundary (period, comma, etc) after the parenthesis. if (not start or ctype.isspace(string[start-1])) and \ (end >= len(string) or string[end] in ".,;:!?"): return 1 # RULE: It is not a parenthetical statement if there are # alphanumeric characters around the parentheses. if (start and ctype.isalnum(string[start-1])) or \ (end < len(string) and ctype.isalnum(string[end])): return 0 # RULE: It is not a parenthetical statement if there is a dash # after the parentheses. # EXAMPLE: [125I]-ET if end < len(string) and string[end] == '-': return 0 # Otherwise, by default, it is a parenthetical statement. return 1 def find_all(string, *args): """find_all(string[, delimiters]) -> list of (start, end) Find all occurrences of strings that appear within a set of delimiters. """ import re if args: delimiters = args lengths = map(len, delimiters) if min(lengths) != 2 or max(lengths) != 2: raise ValueError, "delimiter must be '' chars" else: delimiters = ["()", "[]"] reobjs = [_delimiter2pattern(x[0], x[1]) for x in delimiters] reobjs = [re.compile(x) for x in reobjs] parens = [] for reobj in reobjs: while 1: r = reobj.search(str(string)) if not r: break start, end = r.span() parens.append((start, end)) # Need to be able to find parentheses within parentheses # too. Thus, when I find a match, remove it so that it # doesn't interfere with other matches. string = string[:start] + "-" + string[start+1:end-1] + \ "-" + string[end:] return parens def find(string, *args): """find(string[, delimiters]) -> list of (start, end) Find the parenthetical statements that appear in a string. Ignores token-internal parentheses, e.g. NAD(P)H, [125I]-ET. """ ranges = find_all(string, *args) i = 0 while i < len(ranges): if not is_parenthetical(string, ranges[i]): del ranges[i] else: i += 1 return ranges def remove(string, *args): """remove(string) -> string""" spans = find(string, *args) return _remove_spans(string, spans) def _remove_spans(string, spans): """_remove_spans(string, spans) -> string""" if not spans: return string spans = spans[:] # Make sure the spans don't overlap. spans.sort() i = 0 while i < len(spans)-1: start, end = spans[i] nstart, nend = spans[i+1] if nstart < end: spans[i] = start, max(end, nend) del spans[i+1] else: i += 1 # Now delete them in reverse order so that indexes aren't messed # up. spans.reverse() for start, end in spans: string = string[:start] + string[end:] return string def separate(sentence, preserve_spacing=0): """separate(sentence[, preserve_spacing]) -> statements Separate a sentence into separate parenthetical statements. For example, the sentence: The MAPK (mitogen-activated protein kinase) pathway ... has two statements "The MAPK pathway ..." and "mitogen-activated protein kinase". If preserve_spacing is a true value, then I will make sure the spacing of the statements matches that of the original sentence. Thus, the string for each statement is the same length as the original and the statements are directly aligned onto the original sentence. """ import mx.TextTools as TT from Extracto import textfns from Extracto import rangefns from Extracto.strcompn import NONSPACE paren_ranges = find(sentence) # For simplicity, make the whole sentence a parenthetical # statement. len_sentence = len(sentence) if (0, len_sentence) not in paren_ranges: paren_ranges.append((0, len_sentence)) # Now pull out all the parenthetical statements from the sentence. # For each statement, remove all the statements that occur within # me. If they're sorted in order, all the internal ones will come # after me in the list. paren_ranges.sort() statements = [] for i in range(len(paren_ranges)): # Get my ranges statement_ranges = [paren_ranges[i]] # Remove all the internal parenthetical statements. for j in range(i+1, len(paren_ranges)): s, e = paren_ranges[j] statement_ranges = rangefns.remove(statement_ranges, s, e) # Now reconstruct the sentence from these ranges. statement_ranges.sort() statement = sentence[:0] last_end = 0 for s, e in statement_ranges: statement = statement + ' '*(s-last_end) statement = statement + sentence[s:e] last_end = e statement = statement + ' '*(len_sentence-last_end) # Strip off the parentheses at the ends. This makes things # easier, so when I extend the gene names, I don't have to # worry about how to handle parentheses. s, e = textfns.shrink_to(statement, NONSPACE, 0, len(statement)) if s < len(statement) and e > 0: if (statement[s] == '(' and statement[e-1] == ')') or \ (statement[s] == '[' and statement[e-1] == ']'): statement = statement[:s] + " " + statement[s+1:] statement = statement[:e-1] + " " + statement[e:] if not preserve_spacing: if hasattr(statement, 'collapse'): statement = statement.collapse().strip() else: statement = TT.collapse(statement).strip() statements.append(statement) return statements def _delimiter2pattern(open, close): return r"\%s[^\%s\%s]+\%s" % (open, open, close, close)