"""Miscellaneous functions for identifying gene names.

Functions:
is_nonupper             Whether the word does not have upper case letters.
is_nonlower             Whether the word does not have lower case letters.
is_all_caps             Whether the word is all CAPS.
is_capitalized          Whether the word is Capitalized.
has_unusual_mixed_case  Whether the word is mixed case.
is_boundary             Whether there is a word boundary between two indexes.

Variables:
SPACE
NONSPACE
SPACE_AND_PUNC
UPPER_AND_NUMBER

"""
import string
import mx.TextTools as TT
from Extracto import datafile

# Character classes, kept as plain strings.
SPACE = string.whitespace
NONSPACE = string.letters + string.digits + string.punctuation
SPACE_AND_PUNC = SPACE + string.punctuation
UPPER_AND_NUMBER = TT.A2Z + TT.number
LETTER_OR_NUMBER = string.letters + string.digits

# The same classes as mx.TextTools character sets, which is the form
# the TT.set* functions used in this module take.
SPACE_SET = TT.set(SPACE)
NONSPACE_SET = TT.set(NONSPACE)
SPACE_AND_PUNC_SET = TT.set(SPACE_AND_PUNC)
UPPER_AND_NUMBER_SET = TT.set(UPPER_AND_NUMBER)

# Complement sets: every character that is NOT in the given class.
NOT_LETTER_SET = TT.invset(string.letters)
NOT_NUMBER_OR_LETTER_SET = TT.invset(LETTER_OR_NUMBER)

def _read_datafile_lines(name):
    """_read_datafile_lines(name) -> list of strings

    Open the data file called name with datafile.open and return its
    lines with trailing whitespace stripped.  The handle is closed
    before returning instead of being left open until it is garbage
    collected.

    """
    handle = datafile.open(name)
    try:
        return [line.rstrip() for line in handle.readlines()]
    finally:
        # NOTE(review): assumes the object returned by datafile.open
        # provides close() like a regular file object -- confirm.
        handle.close()

# The loop variable lives inside the helper, so nothing needs to be
# deleted from the module namespace afterwards, even when a data file
# turns out to be empty.
GREEK_LETTERS = _read_datafile_lines("greek_letters")
ROMAN_NUMERALS = _read_datafile_lines("roman_numerals")

def setremove(word, set):
    """setremove(word, set) -> string

    Return word with all the characters that are in set removed.

    """
    # Split the word on the characters in the set, then glue the
    # remaining pieces back together.
    kept_pieces = TT.setsplit(word, set)
    return ''.join(kept_pieces)

def is_nonupper(word):
    """is_nonupper(word) -> boolean

    Return whether the word is free of upper case letters.  This is
    looser than <string>.islower, which additionally demands at least
    one lower case letter; here a word with no letters at all also
    passes.

    """
    # setfind reports -1 when no character of the set occurs in word.
    first_upper = TT.setfind(word, TT.A2Z_set)
    return first_upper == -1

def is_nonlower(word):
    """is_nonlower(word) -> boolean

    Return whether the word is free of lower case letters.  This is
    looser than <string>.isupper, which additionally demands at least
    one upper case letter; here a word with no letters at all also
    passes.

    """
    # setfind reports -1 when no character of the set occurs in word.
    first_lower = TT.setfind(word, TT.a2z_set)
    return first_lower == -1

def is_all_caps(word):
    """is_all_caps(word) -> boolean

    Return whether every letter in the word is upper case.  The word
    must contain at least one letter for this to be true.

    """
    # The isupper string method already has exactly these semantics.
    all_upper = word.isupper()
    return all_upper

def is_capitalized(word):
    """is_capitalized(word) -> boolean

    Return whether the word is capitalized: its first letter is upper
    case and nothing after that letter is upper case.  A word without
    any letters is not capitalized.

    """
    # Locate the first letter; characters before it are ignored.
    first_letter = TT.setfind(word, TT.alpha_set)
    if first_letter >= 0:
        if not word[first_letter].isupper():
            return False
        # The leading letter is upper case, so the remainder must not
        # contain any further upper case letters.
        return is_nonupper(word[first_letter + 1:])
    # No letters at all.
    return 0

def has_unusual_mixed_case(word):
    """has_unusual_mixed_case(word) -> boolean

    Return whether the letters of the word mix upper and lower case
    with an upper case letter somewhere after the first letter.
    Non-letter characters are ignored.

    """
    # Only the letters matter for this test.
    letters = setremove(word, NOT_LETTER_SET)
    if not letters:
        return 0
    # A word that is entirely lower case or entirely upper case is not
    # unusual, so both cases have to be present.
    has_upper = TT.setfind(letters, TT.A2Z_set) >= 0
    has_lower = TT.setfind(letters, TT.a2z_set) >= 0
    if not (has_upper and has_lower):
        return 0
    # Unusual means an upper case letter at any position past the
    # first letter.
    later_upper = TT.setfind(letters, TT.A2Z_set, 1)
    return later_upper >= 1

def has_unusual_mixed_numbers(word):
    """has_unusual_mixed_numbers(word) -> boolean

    Return whether the word has digits embedded between its letters.
    Digits at the very beginning or end of the word do not count, and
    characters that are neither letters nor digits are ignored.

    """
    # Keep only the letters and digits.
    alphanumeric = setremove(word, NOT_NUMBER_OR_LETTER_SET)
    if alphanumeric:
        # Drop the leading and trailing digits; any digit that is still
        # there must be sitting in the middle of the word.
        core = TT.setstrip(alphanumeric, TT.number_set)
        return TT.setfind(core, TT.number_set) >= 0
    # Nothing left to examine.
    return 0

def is_boundary(string, index0, index1):
    """is_boundary(string, index0, index1) -> 1 or 0

    Return whether there is a word boundary between the characters at
    index0 and index1 (given in either order).  There is a boundary
    when either index falls off the string, when exactly one of the
    two characters is alphabetical, or when one character is lower
    case and the other is not.

    """
    first, second = min(index0, index1), max(index0, index1)
    # Running off either end of the string counts as a boundary.
    if first < 0 or second >= len(string):
        return 1
    left, right = string[first], string[second]
    # Exactly one of the two characters is a letter.
    one_sided_alpha = left.isalpha() != right.isalpha()
    # The two characters disagree on being lower case.
    case_differs = left.islower() != right.islower()
    if one_sided_alpha or case_differs:
        return 1
    return 0

