"""Miscellaneous functions for identifying gene names. Functions: is_nonupper Whether the word does not have upper case letters. is_nonlower Whether the word does not have lower case letters. is_all_caps Whether the word is all CAPS. is_capitalized Whether the word in Capitalized. has_unusual_mixed_case Whether the word is mixed case. is_boundary Whether there is a word boundary between two indexes. Variables: SPACE NONSPACE SPACE_AND_PUNC UPPER_AND_NUMBER """ import string import mx.TextTools as TT from Extracto import datafile SPACE = string.whitespace SPACE_SET = TT.set(SPACE) NONSPACE = string.letters + string.digits + string.punctuation NONSPACE_SET = TT.set(NONSPACE) SPACE_AND_PUNC = string.whitespace + string.punctuation SPACE_AND_PUNC_SET = TT.set(SPACE_AND_PUNC) UPPER_AND_NUMBER = TT.A2Z+TT.number UPPER_AND_NUMBER_SET = TT.set(UPPER_AND_NUMBER) NOT_LETTER_SET = TT.invset(string.letters) LETTER_OR_NUMBER = string.letters+string.digits NOT_NUMBER_OR_LETTER_SET = TT.invset(string.digits+string.letters) GREEK_LETTERS = [x.rstrip() for x in datafile.open("greek_letters").readlines()] ROMAN_NUMERALS = [x.rstrip() for x in datafile.open("roman_numerals").readlines()] del x def setremove(word, set): """setremove(word, set) -> remove all the letters in the set""" return ''.join(TT.setsplit(word, set)) def is_nonupper(word): """is_nonupper(word) -> boolean The semantics of this is a little different from .islower. islower requires there to be no upper case letters and at least one lower case one. is_nonupper just requires there to be no upper case letters. """ return TT.setfind(word, TT.A2Z_set) == -1 def is_nonlower(word): """is_nonlower(word) -> boolean The semantics of this is a little different from .isupper. isupper requires there to be no lower case letters and at least one upper case one. is_nonlower just requires there to be no lower case letters. """ return TT.setfind(word, TT.a2z_set) == -1 def is_all_caps(word): """is_all_caps(word) -> boolean Return whether all the letters in the word (must have at least 1 letter) is in upper case. """ return word.isupper() def is_capitalized(word): """is_capitalized(word) -> boolean Return whether the word is capitalized, where the first letter is capitalized and the rest are lower case. Requires at least one letter. """ # Make sure there's at least 1 letter. i = TT.setfind(word, TT.alpha_set) if i < 0: return 0 # Now make sure that's the only letter that's upper case. return word[i].isupper() and is_nonupper(word[i+1:]) def has_unusual_mixed_case(word): """has_unusual_mixed_case(word) -> boolean""" # I only want to look at the letter. word = setremove(word, NOT_LETTER_SET) # Make sure there is at least 1 letter. if not word: return 0 # If all the letters are lower case (no upper case), then it's not unusual. if TT.setfind(word, TT.A2Z_set) < 0: return 0 # If all the letters are upper case (no lower case), then it's not unusual. if TT.setfind(word, TT.a2z_set) < 0: return 0 # If there's an upper case letter that's not the first one, then # this is unusual. return TT.setfind(word, TT.A2Z_set, 1) >= 1 def has_unusual_mixed_numbers(word): """has_unusual_mixed_numbers(word) -> boolean Return whether there are numbers mixed with letters (not at the beginning or end of the string). """ word = setremove(word, NOT_NUMBER_OR_LETTER_SET) # Make sure there are numbers and letters in this word. if not word: return 0 # Remove the numbers from the ends of the word. stripped = TT.setstrip(word, TT.number_set) # Check whether there's a number in the middle somewhere. return TT.setfind(stripped, TT.number_set) >= 0 def is_boundary(string, index0, index1): """Return whether there is a word boundary between index0 and index1. A word boundary is marked by non-alpha characters, or alphabetical characters with different cases.""" if index1 < index0: index0, index1 = index1, index0 # If either if the indexes is at the end of the string, this is a # word boundary. if index0 < 0 or index1 >= len(string): return 1 l0, l1 = string[index0], string[index1] if l0.isalpha() ^ l1.isalpha(): # If one is a character return 1 if l0.islower() != l1.islower(): return 1 return 0