#!/usr/bin/env python
"""Command-line script to scan documents for words in a lexicon."""
import sys
import os
import getopt
import fcntl
import time
import operator

from Bio.Tools import MultiProc
from Bio import triefind

from Extracto import Tokens
from Extracto import preprocess
from Extracto import preprocessfns
# NOTE: the locations of markup_consts, tokenizer and phrase are assumed here.
from Extracto import markup_consts
from Extracto import tokenizer
from Extracto import phrase

USAGE = """%s [-h] [-i] [-x] [-j nprocs] lexicon inpath_or_file

DESCRIPTION
This program scans either a document or a directory of documents for
words in a lexicon.  If the documents have been preprocessed with
abbreviations, I'll also look for the abbreviations of the words in
the lexicon.

OPTIONS
lexicon           Text file that contains one word per line.
inpath_or_file    Either a path to documents, where each document is its
                  own file, or a single filename.
-h                Print out this message.
-i                Ignore case.  Uses case by default.
-x                Input documents are in XML format.  By default, assumes
                  raw text.
-j nprocs         Number of processors to use.  Default is 1.
""" % sys.argv[0]

INPUT_IS_XML = 0
NPROCS = 1
IGNORE_CASE = 0


def writel(s, handle=None):
    # Write under an exclusive lock so output from multiple processes
    # does not interleave.
    if handle is None:
        handle = sys.stdout
    fcntl.lockf(handle.fileno(), fcntl.LOCK_EX)
    try:
        handle.write(s)
        handle.flush()
    finally:
        fcntl.lockf(handle.fileno(), fcntl.LOCK_UN)


def extract_abbrevs(document):
    """extract_abbrevs(document) -> list of (prefix, abbrev, score)"""
    abbrevs = []
    markups = document.markups(name=markup_consts.ABBREV)
    for x, value, x, x in markups:
        prefix, abbrev, score = value.split("|")
        score = float(score)
        abbrevs.append((prefix, abbrev, score))
    return abbrevs


def find_exact(lexicon, word):
    # Return the exact match of word to an entry in the lexicon, or None.
    for fword, s, e in triefind.find(word, lexicon):
        # Make sure I found a complete match and not a substring.
        if s == 0 and e == len(word):
            return fword
    return None


def _scan_one_file(file, lexicon):
    if INPUT_IS_XML:
        from Extracto import xml_format
        tokens = xml_format.load(open(file))
        data = tokens.as_string()
        abbrevs = extract_abbrevs(tokens)
    else:
        data = open(file).read()
        abbrevs = []

    # Make a list of the entries in the lexicon that I found.
    # entries_found is a list of
    # (entry, file offset, sent. num, sent. offset, length, score or None)
    entries_found = []
    for entry, start, end in lexicon.findall(data):
        # Fill in None values for sentence number and offset for now.
        # I'll calculate what they are later.
        entries_found.append((entry, start, None, None, end-start, None))

    # Now look for possible abbreviations.  Look at all the
    # abbreviations and prefixes found in the document.  If either an
    # abbrev or prefix is in the lexicon and the other is not, add it
    # to my list of things to search.
    analogs = {}   # word -> (word in lexicon, score)
    for prefix, abbrev, score in abbrevs:
        lex_abbrev = find_exact(lexicon, abbrev)
        lex_prefix = find_exact(lexicon, prefix)
        # If either none or both are in the lexicon, then ignore.
        if operator.truth(lex_abbrev) == operator.truth(lex_prefix):
            continue
        # I'm making an assumption here that there won't be 2 different
        # abbreviations.
        if lex_abbrev:   # abbrev is in the lexicon, prefix is not
            analogs[prefix.lower()] = (lex_abbrev, score)
        else:
            analogs[abbrev.lower()] = (lex_prefix, score)
    # Do the search for the abbreviations in the lexicon.
    finder = phrase.compile(analogs.keys(), ignore_case=1)
    for abbrev_or_prefix, start, end in finder.findall(data):
        analog, score = analogs[abbrev_or_prefix.lower()]
        entries_found.append((analog, start, None, None, end-start, score))

    # If nothing was found, just quit here without further processing.
    if not entries_found:
        return

    # Now figure out the sentence numbers and offsets.
    if not INPUT_IS_XML:
        # Do the processing to find the sentences.
        tokens = Tokens.Tokens(tokenizer.tokenize_str(data))
        tokens = preprocess.join_numbers(tokens)
        tokens = preprocess.find_sentences(tokens)
    # Split the tokens into sentences and find their boundaries.
    sentences = preprocessfns.split_sentences(tokens)
    indexes = []
    index = 0
    for i in range(len(sentences)):
        index += len(''.join(sentences[i]))
        indexes.append(index)
    for i in range(len(entries_found)):
        entry, foff, sentnum, soff, length, score = entries_found[i]
        for j, index in zip(range(len(indexes)), indexes):
            if foff < index:
                sentnum = j
                if j == 0:
                    soff = foff
                else:
                    soff = foff - indexes[j-1]
                break
        else:
            raise AssertionError, "I could not find the sentence for %d" % foff
        # Now strip the sentence and fix the sentence offset.
        s = sentences[sentnum].as_string()
        soff -= s.find(s.lstrip())
        entries_found[i] = entry, foff, sentnum, soff, length, score

    # file is the fullpath.  Print just the tail part of the path.
    head, tail = os.path.split(file)
    for entry, foff, sentnum, soff, length, score in entries_found:
        writel("%s|%s|%d|%d|%d|%d|%s\n" % (
            tail, entry, foff, sentnum, soff, length, score))


def scan_files(files, lexicon):
    def do_some(start, skip, files, lexicon):
        for i in range(start, len(files), skip):
            try:
                _scan_one_file(files[i], lexicon)
            except (KeyboardInterrupt, SystemExit):
                raise
            except Exception, x:
                writel("ERROR '%s' in file %s\n" % (str(x), files[i]))
                raise
    MultiProc.run(NPROCS, do_some, (files, lexicon))


def _find_all_files(path_or_file):
    if os.path.isdir(path_or_file):
        files = []
        def add_files(arg, dirname, names, files=files):
            for name in names:
                full = os.path.join(dirname, name)
                files.append(full)
        os.path.walk(path_or_file, add_files, None)
    else:
        files = [path_or_file]
    return files


if __name__ == '__main__':
    if len(sys.argv) == 1:           # If they didn't specify any arguments,
        print >>sys.stderr, USAGE    # print the instructions and quit.
        sys.exit(0)
    try:
        optlist, args = getopt.getopt(sys.argv[1:], "hixj:")
    except getopt.error, x:
        print >>sys.stderr, x
        sys.exit(0)
    if len(args) != 2:               # If they gave extraneous arguments,
        print >>sys.stderr, USAGE    # print the instructions and quit.
        sys.exit(0)
    lexicon_file, inpath_or_file = args

    outpath = None
    preprocs = []
    for opt, arg in optlist:
        if opt == '-h':
            print USAGE
            sys.exit(0)
        elif opt == '-i':
            IGNORE_CASE = 1
        elif opt == '-x':
            INPUT_IS_XML = 1
        elif opt == '-j':
            NPROCS = int(arg)
            if NPROCS < 1 or NPROCS > 100:
                raise ValueError, "nprocs should be >= 1 and <= 100"

    # Do some checking to make sure the input is reasonable.
    if not os.path.exists(inpath_or_file):
        print >>sys.stderr, "I can't find the path or file %s" % inpath_or_file
        sys.exit(0)
    if not os.path.exists(lexicon_file):
        print >>sys.stderr, "I can't find the lexicon %s" % lexicon_file
        sys.exit(0)

    # Print out some diagnostic information.
    writel("# %s\n" % (sys.argv[0]))
    writel("# Scanning: %s\n" % inpath_or_file)
    writel("# Using lexicon: %s\n" % lexicon_file)
    timestr = time.strftime("%A, %d %B %Y %I:%M:%S%p")
    if os.environ.has_key("USER"):
        writel("# Run on %s by %s\n" % (timestr, os.environ["USER"]))
    else:
        writel("# Run on %s\n" % timestr)
    writel("# Output format is:\n")
    writel("# FILENAME|LEXICON ENTRY|FILE OFFSET|SENT. NUMBER|SENT. OFFSET|LENGTH|SCORE\n")
    writel("# The SENTENCE OFFSET is the offset from the beginning of the\n")
    writel("# sentence with leading whitespace stripped.\n")
    # XXX problems if the lexicon has a "|" character in it...

    # Load the lexicon.
    from Extracto import datafile
    from Bio import trie
    lines = open(lexicon_file).readlines()
    phrases = datafile.clean(lines)
    lexicon = trie.trie()
    for word in phrases:   # 'word', not 'phrase', to avoid shadowing the phrase module
        lexicon[word.lower()] = 1

    # Now search the files for the lexicon entries.
    files = _find_all_files(inpath_or_file)
    scan_files(files, lexicon)
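
# Example invocation (illustrative only; the script and file names below are
# hypothetical and just show the expected arguments, see USAGE above):
#
#     python scan_lexicon.py -i -j 4 gene_names.txt abstracts/
#
# Each match is written to stdout as one pipe-delimited line of the form:
#
#     FILENAME|LEXICON ENTRY|FILE OFFSET|SENT. NUMBER|SENT. OFFSET|LENGTH|SCORE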