"""Create and score morphology features.

Functions:
find

    The following functions are useful for training and interacting
    with the underlying classifier.
make_vector
make_vector_with_context
score
score_with_context
score_vector
describe_vector
can_handle
can_handle_context

    The following functions implement base functionality.
stem_greekX
stem_Xgreek
stem_Xroman
stem_proX
stem_apoholoX
stem_Xupper
stem_Xnum
stem_Xfamily
stem_lXl
    
score_greekX
score_Xgreek
score_Xroman
score_proX
score_apoholoX
score_Xupper
score_Xnum
score_Xfamily
score_lXl

"""
import re

from Extracto import memoize
from Extracto.genename import support

# Naming Conventions: These function names are named for the patterns
# detected.  X stands for the input word.  Thus, "proX" looks for
# the input word prefixed with "pro".


def stem_greekX(word):
    """stem_greekX(word) -> stem (word without greek prefix) or None"""
    # Strip every greek-letter name from the front of the word:
    # betaE11 -> E11
    # beta2AR -> AR
    # alphabetaTCR -> TCR
    # beta -> <None>
    # beta2 -> <None>
    # betaI -> <None>
    import mx.TextTools as TT
    from Extracto import strcompn

    # Regex matching a string made entirely of greek-letter names, each
    # optionally followed by digits.
    alternatives = "|".join(strcompn.GREEK_LETTERS)
    greek_re = re.compile(r"^((%s)\d*)*$" % alternatives, re.IGNORECASE)

    # Collect the boundary positions in the word, then scan them from
    # the rightmost one down, taking the longest all-greek prefix.
    positions = [p for p in range(1, len(word))
                 if strcompn.is_boundary(word, p-1, p)]
    cut = None
    for p in reversed(positions):
        if greek_re.match(word[:p]):
            cut = p
            break
    if cut is None:
        return None    # No greek prefix found.

    # The stem is everything after the boundary; drop any leftover
    # punctuation/digits between the greek prefix and the stem.
    stem = word[cut:]
    stem = TT.setstrip(stem, strcompn.NOT_LETTER_SET, 0, len(stem), -1)
    if len(stem) < 2:
        return None
    return stem

def stem_Xgreek(word):
    """stem_Xgreek(word) -> stem (word without greek suffix) or None"""
    # Remove all the greek letters that appear at the end of the word.
    # (The old `import mx.TextTools as TT` was unused here and has been
    # removed; this function only needs strcompn and re.)
    from Extracto import strcompn

    # Make a regular expression that matches one or more greek-letter
    # names run together.
    s = "|".join(strcompn.GREEK_LETTERS)
    greek_letters_re = re.compile(r"^(%s)+$" % s, re.IGNORECASE)

    # Find the boundaries in the word.  Then, check to see if the
    # suffixes of the boundaries are greek letters.  Scanning
    # left-to-right means the longest greek suffix wins.
    boundaries = [i for i in range(1, len(word))
                  if strcompn.is_boundary(word, i-1, i)]
    for i in boundaries:
        s = word[i:]
        # Skip suffixes strcompn flags as unusual mixed case.
        if strcompn.has_unusual_mixed_case(s):
            continue
        if greek_letters_re.match(s):
            break
    else:
        return None    # Nothing found.
    # The stem is everything before the boundary; require >= 2 chars.
    stem = word[:i]
    if len(stem) < 2:
        return None
    return stem

def stem_Xroman(word):
    """stem_Xroman(word) -> stem (word without roman-numeral suffix) or None"""
    import mx.TextTools as TT
    from Extracto import strcompn

    lowered = word.lower()
    # Roman numerals (in the word's original casing) that match the end
    # of the word, longest first.  Equal-length matches are identical
    # slices of the word, so their relative order is irrelevant.
    candidates = sorted(
        [word[-len(n):] for n in strcompn.ROMAN_NUMERALS
         if lowered.endswith(n.lower())],
        key=len, reverse=True)

    for numeral in candidates:
        stem = word[:-len(numeral)]

        # Require at least 2 letters in the stem.
        if len(TT.setstrip(stem, strcompn.NOT_LETTER_SET)) < 2:
            continue

        # Accept if the numeral begins at a word boundary...
        if strcompn.is_boundary(word, len(stem)-1, len(stem)):
            return stem

        # ...or is a multi-letter, all-upper-case numeral.
        if len(numeral) > 1 and numeral.isupper():
            return stem

    return None

# Matches "pro" followed by a stem of two or more letters.
PROX_RE = re.compile(r"^pro([a-z]{2,})$", re.IGNORECASE)
def stem_proX(word):
    """Return word with a leading "pro" removed, or None if no match."""
    match = PROX_RE.match(word)
    return match and match.group(1)

# Matches "apo" or "holo" followed by a stem of two or more letters.
APOHOLOX_RE = re.compile(r"^(?:apo|holo)([a-z]{2,})$", re.IGNORECASE)
def stem_apoholoX(word):
    """Return word with a leading "apo"/"holo" removed, or None."""
    match = APOHOLOX_RE.match(word)
    if not match:
        return None
    return match.group(1)

# A stem of 2+ lower-case letters, optionally preceded by one capital,
# followed by a single trailing upper-case letter (e.g. RasA -> Ras).
# NOTE: the previous pattern had a second alternative ([a-z]{3,}) that
# was unreachable -- anything it matched was already matched by the
# first branch with its optional capital empty -- and only group(1) was
# ever returned.  The pattern below is equivalent with the dead branch
# removed.
XUPPER_RE = re.compile(r"^([A-Z]?[a-z]{2,})[A-Z]$")
def stem_Xupper(word):
    """Return word without its single trailing upper-case letter, or None."""
    m = XUPPER_RE.match(word)
    if m is None:
        return None
    return m.group(1)
    
# Two or more letters followed by 1-3 digits.
XNUM_RE = re.compile(r"^([a-z]{2,})\d{1,3}$", re.IGNORECASE)
def stem_Xnum(word):
    """Return word without its 1-3 trailing digits, or None if no match."""
    match = XNUM_RE.match(word)
    return match.group(1) if match is not None else None

# Check whether the gene name exists without the family/subfamily part:
# SULT1A2 -> SULT
# CYP2D -> CYP
# Pattern: 2+ letters, 1-2 digits, a letter, then 0-2 digits.  Like
# XNUM, but requires the extra letter after the digits.
XFAMILY_RE = re.compile(r"^([A-Z]{2,})\d{1,2}[A-Z]\d{0,2}$", re.IGNORECASE)
def stem_Xfamily(word):
    """Return the family stem of word (e.g. "SULT1A2" -> "SULT"), or None."""
    match = XFAMILY_RE.match(word)
    if not match:
        return None
    return match.group(1)

def stem_lXl(word):
    """stem_lXl(word) -> stem or None.

    Detect a single lower-case letter attached to the start of the word
    next to an upper-case letter (cJun -> Jun, cFOS -> FOS), or a
    single lower-case letter attached to the end of the word at a
    boundary (Gal80p -> Gal80).
    """
    # (Removed the redundant local `import re` -- the module already
    # imports re at the top level.)
    import mx.TextTools as TT
    from Extracto import strcompn

    if len(word) < 4:
        return None
    prefix, suffix = word[:2], word[-2:]
    # Letters remaining in each candidate stem: prefix_letters is the
    # letters of the word minus its first char, suffix_letters minus
    # its last char.
    prefix_letters = re.sub(r"[^%s]" % TT.alpha, "", word[1:])
    suffix_letters = re.sub(r"[^%s]" % TT.alpha, "", word[:-1])

    # Assume that the prefix and suffix are valid.  Then, rule them
    # out if they are invalid.
    valid_prefix = valid_suffix = 1

    # e.g. cJun, cFOS, cMYC
    if not prefix.isalpha():    # make sure all letters
        valid_prefix = 0
    elif not (prefix[0].islower() and prefix[1].isupper()):
        valid_prefix = 0
    elif len(prefix_letters) < 3:   # stem must keep >= 3 letters
        valid_prefix = 0

    # e.g. Gal80p
    if not suffix[1].isalpha():
        valid_suffix = 0
    elif not strcompn.is_boundary(suffix, 0, 1):
        valid_suffix = 0
    elif len(suffix_letters) < 3:   # stem must keep >= 3 letters
        valid_suffix = 0
    elif suffix[1] == 's':     # Don't want 's' at end, for plural words.
        valid_suffix = 0

    # If both apply, stripping the prefix letter takes precedence.
    if valid_prefix:
        return word[1:]
    if valid_suffix:
        return word[:-1]
    return None
    
def _load_scorefile(filename):
    """Read a whitespace-delimited "word score" file into a dict.

    Returns {word: float(score)}.  Results are memoized per filename.
    """
    from Extracto import datafile
    table = {}
    for line in datafile.open(filename).readlines():
        # Each line is "<word> <score>"; split() drops the newline too.
        token, value = line.split()
        table[token] = float(value)
    return table
_load_scorefile = memoize.memoize(_load_scorefile)

def _read_score(filename, word):
    """Return the score for word from the given score file, 0 if absent."""
    return _load_scorefile(filename).get(word, 0)

def score_greekX(word):
    """Return the trained score of word for the greekX pattern."""
    scorefile = "morphology.greekX.scores"
    return _read_score(scorefile, word)

def score_Xgreek(word):
    """Return the trained score of word for the Xgreek pattern."""
    scorefile = "morphology.Xgreek.scores"
    return _read_score(scorefile, word)

def score_Xroman(word):
    """Return the trained score of word for the Xroman pattern."""
    scorefile = "morphology.Xroman.scores"
    return _read_score(scorefile, word)

def score_proX(word):
    """Return the trained score of word for the proX pattern."""
    scorefile = "morphology.proX.scores"
    return _read_score(scorefile, word)

def score_apoholoX(word):
    """Return the trained score of word for the apoholoX pattern."""
    scorefile = "morphology.apoholoX.scores"
    return _read_score(scorefile, word)

def score_Xupper(word):
    """Return the trained score of word for the Xupper pattern."""
    scorefile = "morphology.Xupper.scores"
    return _read_score(scorefile, word)

def score_Xnum(word):
    """Return the trained score of word for the Xnum pattern."""
    scorefile = "morphology.Xnum.scores"
    return _read_score(scorefile, word)

def score_Xfamily(word):
    """Return the trained score of word for the Xfamily pattern."""
    scorefile = "morphology.Xfamily.scores"
    return _read_score(scorefile, word)

def score_lXl(word):
    """Return the trained score of word for the lXl pattern."""
    scorefile = "morphology.lXl.scores"
    return _read_score(scorefile, word)
    
def _retokenize(word):
    """Return the word in a normalized form."""
    # Normalize the results of the tokenizer.

    import string
    import mx.TextTools as TT

    # Remove the '-' in the words.
    # EXAMPLES: IL-1 -> IL1, caspase-9 -> caspase9
    word = TT.replace(word, "-", "")

    # Remove punctuation at the end of the word.
    # EXAMPLES: CD3+ -> CD3
    punct_set = TT.set(string.punctuation)
    word = TT.setstrip(word, punct_set, 0, len(word), 1)

    # Remove the spaces in the tokens.
    # EXAMPLES: Kpn I -> KpnI
    word = TT.replace(word, " ", "")

    return word

from Extracto.genename import Feature
class MorphologyFeature(Feature.Feature):
    """Feature that scores a word by its morphological patterns.

    The feature vector is one entry per score_* function, applied to
    the retokenized word, then log10-transformed and shifted so every
    entry is >= 0.
    """
    def __init__(self):
        Feature.Feature.__init__(self, "morphology.classifier")
        # Scoring functions applied to the word, in vector order.
        self._FEATURES = [
            score_greekX,
            score_Xgreek,
            score_Xroman,

            #score_proX,   # Obsolete.
            score_apoholoX,
            score_Xupper,
            score_Xnum,
            score_Xfamily,
            score_lXl,
            ]

    def _make_vector_with_context(self, document, sentence_range, word_range):
        """Return the transformed score vector for the word at word_range."""
        import math

        word = support.context2word(document, sentence_range, word_range)
        rword = _retokenize(word)
        # Clamp each raw score at 1e-3 before log10, then shift by +3
        # so the transformed values are all >= 0.
        return [math.log10(max(1E-3, f(rword))) + 3
                for f in self._FEATURES]

    def _describe_vector(self):
        """Return the names of the feature functions, in vector order."""
        # __name__ is equivalent to the Python-2-only func_name alias
        # and also works under Python 3.
        return [x.__name__ for x in self._FEATURES]

    def _can_handle_context(self, document, sentence_range, word_range):
        """Return 1 if the word is purely alphanumeric and has a letter."""
        import mx.TextTools as TT
        non_alnum_set = TT.invset(TT.alphanumeric)
        word = support.context2word(document, sentence_range, word_range)
        # Make sure the word contains no punctuation or spaces.
        if TT.setfind(word, non_alnum_set) >= 0:
            return 0
        # Make sure the word contains a letter.
        if TT.setfind(word, TT.alpha_set) < 0:
            return 0
        return 1

# Expose the feature instance's bound methods as module-level functions
# (presumably the make_vector/score/can_handle family listed in the
# module docstring -- exact names depend on bind_methods_to_module).
# `find` below relies on can_handle_context and score_with_context
# being bound here.
support.bind_methods_to_module(__name__, MorphologyFeature())

def find(document):
    """Return list of (start, end, score)."""
    import math

    results = []
    contexts = support.doc2context(document, fancy_tokenizer=1)
    for _, stmt_range, tok_range in contexts:
        if can_handle_context(document, stmt_range, tok_range):
            # Scores are log-space; exponentiate before returning.
            logscore = score_with_context(document, stmt_range, tok_range)
            results.append((tok_range[0], tok_range[1], math.exp(logscore)))
    return results
