"""RBT.py

This is a Python wrapper for Brill's rule-based part-of-speech
tagger.

Classes:
TaggerParams  Encapsulates the parameters needed by tagger.

Functions:
tag           Tag a string.
tag_tokens    Tag a list of tokens.
parse_tagger  Parse the output from tagger.
tagger        Run tagger.

"""
# Need to figure out how to return the output from RBT.

from xml.sax import handler
from Extracto import memoize

def _make_tagger_parser():
    from formats import tagger_format
    return tagger_format.format.make_parser()
_make_tagger_parser = memoize.memoize(_make_tagger_parser)


class TaggerParams:
    """Holds parameters for the tagger program. 

    Members:
    tagger_path
    lexicon
    bigrams
    lexicalrulefile
    contextualrulefile
    wordlist
    splitnumber
    intermedfile
    start_state_tagger_only
    final_state_tagger_only

    """
    def __init__(self, rbt_path=None):
        """TaggerParams([rbt_path]) -> instance

        Create an instance of TaggerParams.  rbt_path is an optional
        path to the top of the RULE_BASED_TAGGER directory.  If it is
        not given, I will try to locate the installation with
        Extracto.apppath.  Either way, reasonable defaults are filled
        in for tagger_path, lexicon, bigrams, lexicalrulefile, and
        contextualrulefile; they can be overridden afterwards.

        """
        import os
        from Extracto import apppath
        
        self._rootpath = rbt_path
        if not rbt_path:
            rbt_path = apppath.find("RULE_BASED_TAGGER_V1.14")
        rbt_path = os.path.join(os.getcwd(), rbt_path)  # use full path
        self.tagger_path = os.path.join(rbt_path, "Bin_and_Data", "tagger")
        (self.lexicon, self.bigrams, self.lexicalrulefile,
         self.contextualrulefile) = _default_params_for_tagger(rbt_path)
        self.wordlist = None
        self.splitnumber = None
        self.intermedfile = None
        self.start_state_tagger_only = None
        self.final_state_tagger_only = None
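
    # Example (a hedged sketch): the defaults chosen above can be
    # overridden after construction; the paths below are illustrative
    # placeholders:
    #
    #     params = TaggerParams("/path/to/RULE_BASED_TAGGER_V1.14")
    #     params.lexicon = "/path/to/my/own/LEXICON"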

class _TaglineExtractor(handler.ContentHandler):
    # Extract taglines from RBT output.
    def __init__(self):
        self.name = None
        self.taglines = []
    def startElement(self, name, attrs):
        self.name = name
    def endElement(self, name):
        self.name = None
    def characters(self, content):
        if self.name == 'tagline':
            self.taglines.append(content)

def _default_params_for_tagger(rbt_path):
    """_default_params_for_tagger(rbt_path) ->
    lexicon, bigrams, lexicalrulefile, contextualrulefile

    Make a list of the default variables necessary to run tagger.

    """
    import os
    
    join = os.path.join     # for convenience
    dat = "Bin_and_Data"
    lexicon = join(rbt_path, dat, "LEXICON")
    bigrams = join(rbt_path, dat, "BIGRAMS")
    lexicalrulefile = join(rbt_path, dat, "LEXICALRULEFILE")
    contextualrulefile = join(rbt_path, dat, "CONTEXTUALRULEFILE")
    return lexicon, bigrams, lexicalrulefile, contextualrulefile

def _args_for_tagger_fn(corpus_to_tag, params):
    """_args_for_tagger_fn(corpus_to_tag, params) -> args, keywds

    Prepare the arguments to be passed into the tagger function.
    corpus_to_tag is the filename of the corpus that should be tagged.
    params is a TaggerParams object.

    """
    args = (params.tagger_path, params.lexicon, corpus_to_tag, params.bigrams,
            params.lexicalrulefile, params.contextualrulefile)
    keywds = {
        "wordlist" : params.wordlist,
        "splitnumber" : params.splitnumber,
        "intermedfile" : params.intermedfile,
        "start_state_tagger_only" : params.start_state_tagger_only,
        "final_state_tagger_only" : params.final_state_tagger_only
        }
    for key, value in keywds.items():
        if value is None:
            del keywds[key]
    return args, keywds
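
# Example (a hedged sketch, assuming TaggerParams() can locate the RBT
# installation): with a default TaggerParams every optional keyword is
# None, so only the six required positional arguments survive:
#
#     args, keywds = _args_for_tagger_fn("/tmp/corpus.txt", TaggerParams())
#     # args   -> (tagger_path, lexicon, "/tmp/corpus.txt", bigrams,
#     #            lexicalrulefile, contextualrulefile)
#     # keywds -> {}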

def _format_tokens(tokens):
    """_format_tokens(tokens) -> str

    Format tokens into Brill's format.

    """
    from Extracto import ctype
    
    tokens = tokens[:]
    # tokens should be represented as a string, with a whitespace
    # separating each token.  Do this while respecting the newlines as
    # sentence boundaries.  This may result in extra spaces inserted,
    # e.g. between text and punctuation.
    
    # First, delete all the spaces, excluding newlines.
    i = 0
    while i < len(tokens):
        if tokens[i] != '\n' and ctype.isspace(tokens[i]):
            del tokens[i]
        else:
            i = i + 1
    # Now, insert a space between every pair of adjacent tokens when
    # neither of them is a newline.
    i = 0
    while i < len(tokens)-1:
        if tokens[i] != '\n' and tokens[i+1] != '\n':
            tokens.insert(i+1, " ")
            i = i + 2
        else:
            i = i + 1
    return ''.join(tokens) + '\n'
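
# Example (a sketch of the intended behavior): newlines mark sentence
# boundaries and all other tokens end up separated by single spaces:
#
#     _format_tokens(['The', 'cat', 'sat', '.', '\n', 'It', 'purred', '.'])
#     # -> 'The cat sat .\nIt purred .\n'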
        
def _split_token(token):
    """_split_token(token) -> word, tag

    Split a token from RBT (e.g. world/NN) into the word and tag.

    """
    i = token.rfind('/')
    if i < 0:
        raise SyntaxError, "RBT token %s missing '/'" % token
    return token[:i], token[i+1:]
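
# Example: the split happens at the last '/', so a word that itself
# contains a slash still comes apart correctly:
#
#     _split_token('world/NN')   # -> ('world', 'NN')
#     _split_token('1/2/CD')     # -> ('1/2', 'CD')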

def tag(string, params=None, parser=None):
    """tag(string[, params]) -> list of tag, start, end

    Do POS tagging on a string.  Returns a list of the tags and where
    they apply to the string.  params should be a TaggerParams object.

    """
    from Extracto import tokenizer
    from Extracto import tokenfns

    tokens = tokenizer.tokenize_str(string)
    token2char = tokenfns.count_offsets(tokens)
    # Do the tagging.
    x = tag_tokens(tokens, params, parser=parser)
    # Now map the tagging indexes back onto the string.
    results = []
    for tag, index in x:
        s, e = token2char[index], token2char[index]+len(tokens[index])
        results.append((tag, s, e))
    return results

def tag_tokens(tokens, params=None, parser=None):
    """tag_tokens(tokens[, params]) -> list of tag, index

    Assign POS tags.  tokens is a list of strings.  If there are
    multiple sentences in tokens, they should be separated by
    newlines.  params should be a TaggerParams object.

    """
    import os
    import tempfile
    from Bio import listfns
    from Extracto import ctype
    
    tokens = map(str, tokens)
    if not params:
        params = TaggerParams()
    # Write the tokens to a temporary file and call the tagger
    # program.
    filename = tempfile.mktemp()
    outfile = open(filename, 'w')
    outfile.write(_format_tokens(tokens))
    outfile.close()
    try:
        args, keywds = _args_for_tagger_fn(filename, params)
        handle = tagger(*args, **keywds)
        # For debugging, look at the results.
        #open("rbt.in", 'w').write(_format_tokens(tokens))
        #print open(filename).read()
        #open('rbt.out', 'w').write(handle.read())
        #handle = open('rbt.out')
        # Now parse the results into a list of lines.
        taglines = parse_tagger(handle, parser=parser)
    finally:
        # Delete the temp file when RBT has finished.
        os.unlink(filename)

    # Now I mark up the tokens with the tags.  This is tricky because
    # I have to make sure the RBT results and tokens are aligned.
    # They have the same number of non-whitespace tokens, so I should
    # skip over all the whitespace in tokens.
    taglines = [x.rstrip() for x in taglines]
    results = ' '.join(taglines).split(' ')    # munge taglines into 1 list
    indexes = listfns.indexesof(tokens, ctype.isspace, opposite=1)
    if len(indexes) != len(results):
        raise AssertionError, "tokens %d and results %d not aligned" % (
            len(indexes), len(results))
    tags = []
    for ti, result in zip(indexes, results):
        w, t = _split_token(result)
        tags.append((t, ti))
    return tags
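
# Example usage (a hedged sketch; the tags shown are only illustrative
# and depend on the lexicon and rule files actually used):
#
#     tokens = ['The', 'cat', 'sat', '.', '\n', 'It', 'purred', '.']
#     tag_tokens(tokens)
#     # -> [('DT', 0), ('NN', 1), ('VBD', 2), ('.', 3),
#     #     ('PRP', 5), ('VBD', 6), ('.', 7)]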

def parse_tagger(handle, parser=None):
    """parse_tagger(handle[, parser]) -> list of strings

    Parse the results from running tagger.  handle is a file-like
    object holding the output.  parser is an optional SAX-type
    parser used to parse the results.  It should send out "tagline"
    messages for each line of tagged results.
    
    """
    if parser is None:
        parser = _make_tagger_parser()
    extractor = _TaglineExtractor()
    parser.setContentHandler(extractor)
    parser.setErrorHandler(handler.ErrorHandler())
    parser.parseFile(handle)

    # Check to make sure the parsed results looks reasonable.
    if len(extractor.taglines) < 1:
        raise SyntaxError, "I couldn't find any taglines in the output"
    return extractor.taglines
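    # Note: each tagline is one tagged sentence in Brill's word/TAG
    # format, e.g. something like "The/DT cat/NN sat/VBD ./." (the
    # exact tags depend on the lexicon and rule files).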

def tagger(tagger_path, lexicon, corpus_to_tag, bigrams, lexicalrulefile,
           contextualrulefile, wordlist=None, splitnumber=None,
           intermedfile=None,
           start_state_tagger_only=None, final_state_tagger_only=None):
    """tagger(tagger_path, lexicon, corpus_to_tag, bigrams, lexicalrulefile,
    contextualrulefile[, wordlist][, splitnumber][, intermedfile]
    [, start_state_tagger_only][, final_state_tagger_only]) -> handle

    Call the 'tagger' program and return a handle to the results.
    Please consult the documentation for 'tagger' for a description of
    the parameters.
    
    """
    import os
    
    # Check to make sure all these files actually exist.
    files = [tagger_path, lexicon, corpus_to_tag, bigrams,
             lexicalrulefile, contextualrulefile, wordlist]
    for file in files:
        if file is None:
            continue
        if not os.path.exists(file):
            raise ValueError, "I cannot find the file %s" % file

    args = [lexicon, corpus_to_tag, bigrams,
            lexicalrulefile, contextualrulefile]
    if wordlist is not None:
        args.extend(["-w", wordlist])
    if splitnumber is not None:
        args.extend(["-s", str(splitnumber)])
    if intermedfile is not None:
        args.extend(["-i", intermedfile])
    if start_state_tagger_only:
        args.append("-S")
    if final_state_tagger_only:
        args.append("-F")

    # The tagger has 2 requirements:
    # - I need to be in the same directory as the 'tagger' program.
    # - It needs to be able to run start-state-tagger and
    #   final-state-tagger.  I'll have to make sure '.' is in my path.
    pathstr = os.environ["PATH"]
    path = pathstr.split(':')
    cwd = os.getcwd()
    rbt_dir, x = os.path.split(tagger_path)
    # Initialize path_changed before the try block so the finally clause
    # can always restore PATH, even if os.chdir fails.
    path_changed = 0
    try:
        os.chdir(rbt_dir)
        if '.' not in path and rbt_dir not in path:
            path.insert(0, '.')
            os.environ["PATH"] = ':'.join(path)
            path_changed = 1
        w, r = os.popen4([tagger_path] + args)
        w.close()
    finally:
        if path_changed:
            os.environ["PATH"] = pathstr
        os.chdir(cwd)
    return r
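
# Example (a hedged sketch; the paths are illustrative placeholders for
# a local RULE_BASED_TAGGER_V1.14 installation):
#
#     root = "/usr/local/RULE_BASED_TAGGER_V1.14/Bin_and_Data"
#     handle = tagger(root + "/tagger", root + "/LEXICON",
#                     "/tmp/corpus.txt", root + "/BIGRAMS",
#                     root + "/LEXICALRULEFILE", root + "/CONTEXTUALRULEFILE")
#     taglines = parse_tagger(handle)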
