"""number.py

This module provides code for finding numbers in text.

Functions:
find    Find all occurrences of numbers.

"""
import re
import string
import mx.TextTools as TT
import tokenfns

# Number patterns, taken from the Perl FAQ entry shown by
# "perldoc -q determine".  Each one is a regular expression fragment.
_WHOLE_NUMBER = r"\d+"
_INTEGER = r"-?\d+"
_PM_INTEGER = r"[+-]?\d+"
_REAL = r"-?\d+\.?\d*"
_DECIMAL = r"-?(?:\d+(?:\.\d*)?|\.\d+)"
_FLOAT = r"(?:[+-]?)(?=\d|\.\d)\d*(?:\.\d*)?(?:[Ee](?:[+-]?\d+))?"

# A single capturing group with all of the patterns as alternatives.
# The regex engine tries them left to right, so the most general
# pattern (_FLOAT) is listed first.
_NUMBER_RE = re.compile("(%s)" % "|".join((
    _FLOAT, _DECIMAL, _REAL, _PM_INTEGER, _INTEGER, _WHOLE_NUMBER)))

# The only characters that may be part of a number: digits, signs,
# the exponent markers and the decimal point.
_MEANINGFUL_CHARS = "".join((string.digits, "-+", "Ee", "."))
def find(string):
    """find(string) -> list of (start, end)

    Look for numbers in a string.  Return a list of tuples (start,
    end) that indicate ranges in the string that contain a single
    number.  The ranges are half-open offsets into str(string) and
    are listed in the order the numbers occur.  If the initial digit
    check finds nothing, an empty list is returned.

    """
    string = str(string)
    # Check to see whether there are any digits here first.
    if TT.setfind(string, TT.number_set) == -1:
        return []

    # First, apply some rules to help disambiguate the numbers.  Every
    # rule replaces text with text of exactly the same length, so the
    # offsets found in the cleaned-up string are valid for the input.

    # RULE: A sign cannot be to the right of a letter (except E or e)
    # or number.  If it is, remove it.
    # NOTE: the exception covers every E or e, not only exponent
    # markers, so the sign in text such as "Type-2" is kept.
    string = re.sub(r"(\w)(?<![Ee])[-+]", r"\1 ", string)

    # RULE: "+/-" is not a sign.  Remove it.
    string = re.sub(re.escape("+/-"), r"   ", string)

    # RULE: E.C. numbers are not numbers, e.g. 3.8.13.6 . Remove the
    # periods from them.
    string = re.sub(r"(\d+)\.(\d+)\.(\d+)\.(\d+)", r"\1 \2 \3 \4", string)

    # RULE: Only digits, signs, E, e, and . belong in numbers.  Remove
    # all others.
    string = re.sub("[^%s]" % re.escape(_MEANINGFUL_CHARS), " ", string)

    # Take the (start, end) of each match straight from the regex
    # engine rather than searching the string again for the matched
    # text.
    return [match.span() for match in _NUMBER_RE.finditer(string)]
