#!/usr/bin/env python
import os
import sys
import string
import getopt

from Bio.Tools import stringfns
from Bio.Tools import listfns
from Bio.Tools import MultiProc
from Extracto import stem
from Extracto import sentence, phraselet, parentheses

USAGE = \
"""ngrams [-h] [-n N] [-i] [-d] [-r] [-s algorithm] [-j num_procs]
    [-l filename] inpath_or_file

Count the number of times words appear in a list of files.

OPTIONS
    inpath_or_file  Either a path to documents, where each document is its
                    own file, or a filename.
    -l filename     Only process the files listed in this filename.
    -h              Print this usage message.
    -n N            The N in N-gram.  Should be between 1 and 10.  1 by
                    default.
    -i              Ignore case.  Uses case by default.
    -d              Document frequency.  Count the number of documents each
                    word appears in, rather than the total frequency of the
                    words.
    -r              Raw text.  Do not respect sentence or phraselet
                    boundaries.
    -s algorithm    Use a stemming algorithm.  This can only be 'porter'.
                    No stemming by default.
    -j num_procs    The number of processes to run concurrently.  1 by
                    default.

"""

IGNORE_CASE = 0          # Whether to ignore case when collecting words.
STEMMER = None           # Stemming function to apply to each word, or None.
PROCESSES = 1            # The number of processes to run concurrently.
DOCUMENT_FREQUENCY = 0   # Whether to compute the document frequency.
RAW_TEXT = 0             # Treat the text as raw, ignoring sentence and
                         # phraselet boundaries.
N = 1                    # The N in N-gram.


def to_ngrams(n, words):
    """to_ngrams(n, words) -> list of n-grams

    Return each run of n consecutive words as a tuple.  For example,
    to_ngrams(2, ['a', 'b', 'c']) -> [('a', 'b'), ('b', 'c')].

    """
    ngrams = []
    for i in range(len(words)-(n-1)):
        ngrams.append(tuple(words[i:i+n]))
    return ngrams

def count_ngrams(filename):
    """count_ngrams(filename) -> dict of ngram -> count"""
    text = open(filename).read()
    if RAW_TEXT:
        strings = [text]
    else:
        strings = find_phraselets(text)
    if IGNORE_CASE:
        strings = [s.lower() for s in strings]
    ngrams = []
    for s in strings:
        # Split on whitespace and punctuation.
        words = stringfns.splitany(s, string.whitespace + string.punctuation)
        if STEMMER:
            words = map(STEMMER, words)
        words = filter(len, words)   # Splitting can leave empty strings.
        ngrams.extend(to_ngrams(N, words))
    return listfns.count(ngrams)

def _count_ngrams_from_files_h(start, skip, filenames):
    # Helper for count_ngrams_from_files.  Each process counts every
    # skip'th file, beginning at index start, and returns its own counts.
    counts = {}
    for i in range(start, len(filenames), skip):
        file = filenames[i]
        nc = count_ngrams(file)
        for ngram in nc.keys():
            if DOCUMENT_FREQUENCY:
                # Count the number of documents the ngram appears in.
                counts[ngram] = counts.get(ngram, 0) + 1
            else:
                counts[ngram] = counts.get(ngram, 0) + nc[ngram]
    return counts

def count_ngrams_from_files(filenames):
    """count_ngrams_from_files(filenames) -> counts

    filenames is a list of files.  counts is a dictionary from
    ngram -> count.

    """
    all_counts = {}
    results = MultiProc.run(
        PROCESSES, _count_ngrams_from_files_h, fn_args=(filenames,))
    # Merge the per-process counts into one dictionary.
    for counts in results:
        for ngram in counts.keys():
            all_counts[ngram] = all_counts.get(ngram, 0) + counts[ngram]
    return all_counts

def find_phraselets(text):
    """find_phraselets(text) -> list of phraselet strings"""
    # First, find all the sentences.
    sentences = []
    for s, e in sentence.find(text):
        sentences.append(text[s:e])
    # Now pull all the parenthetical statements out of the sentences.
    phrases = []
    for sent in sentences:
        ranges = parentheses.find(sent)
        for s, e in ranges:
            phrases.append(sent[s:e])
        phrases.append(parentheses._remove_spans(sent, ranges))
    # For each of the phrases, recognize the phraselets.
    phraselets = []
    for phrase in phrases:
        for s, e in phraselet.find(phrase):
            phraselets.append(phrase[s:e])
    return phraselets

def _find_all_files(path_or_file):
    # Return a list of all files under path_or_file if it is a directory,
    # or [path_or_file] if it is a single file.
    if os.path.isdir(path_or_file):
        files = []
        def add_files(arg, dirname, names, files=files):
            for name in names:
                full = os.path.join(dirname, name)
                files.append(full)
        os.path.walk(path_or_file, add_files, None)
    else:
        files = [path_or_file]
    return files


if __name__ == '__main__':
    try:
        optlist, args = getopt.getopt(sys.argv[1:], "hn:ids:rj:l:")
    except getopt.error, x:
        print USAGE
        print >>sys.stderr, x
        sys.exit(-1)
    if len(args) != 1:                 # If they gave extraneous arguments,
        print >>sys.stderr, USAGE      # print the instructions and quit.
        sys.exit(-1)
    inpath_or_file, = args

    only_file = None
    for opt, arg in optlist:
        if opt == '-h':
            print USAGE
            sys.exit(0)
        elif opt == '-n':
            N = int(arg)
            if N < 1 or N > 10:
                print >>sys.stderr, 'Invalid argument "%d" for N.' % N
                sys.exit(-1)
        elif opt == '-i':
            IGNORE_CASE = 1
        elif opt == '-l':
            only_file = arg
        elif opt == '-r':
            RAW_TEXT = 1
        elif opt == '-s':
            if arg == "porter":
                STEMMER = stem.porter
            else:
                print >>sys.stderr, 'Invalid argument "%s" for stemmer.' % arg
                sys.exit(-1)
        elif opt == '-j':
            PROCESSES = int(arg)
            if PROCESSES < 1 or PROCESSES > 500:
                print >>sys.stderr, 'Invalid argument "%d" for processes.' % \
                      PROCESSES
                sys.exit(-1)
        elif opt == '-d':
            DOCUMENT_FREQUENCY = 1

    # If I'm only counting single words, sentence and phraselet boundaries
    # don't matter, so skip the work of finding them.
    if N == 1:
        RAW_TEXT = 1

    if not os.path.exists(inpath_or_file):
        print >>sys.stderr, "I could not find %s" % inpath_or_file
        sys.exit(-1)

    if only_file:
        # With -l, inpath_or_file must be a directory containing the
        # files listed in only_file.
        if not os.path.isdir(inpath_or_file):
            print >>sys.stderr, "%s is not a directory" % inpath_or_file
            sys.exit(-1)
        files = [x.rstrip() for x in open(only_file)]
        filenames = []
        for file in files:
            filenames.append(os.path.join(inpath_or_file, file))
    else:
        filenames = _find_all_files(inpath_or_file)
    if not filenames:
        print USAGE
        sys.exit(-1)

    counts = count_ngrams_from_files(filenames)

    # Finally, print out the counts of the ngrams.
    items = counts.items()
    # I want to sort by decreasing count and increasing alphabet.
    items.sort(lambda x, y: cmp((y[1], x[0]), (x[1], y[0])))
    for ngram, count in items:
        print "%s %d" % (' '.join(ngram), count)
        sys.stdout.flush()
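# Example invocation, a sketch of typical use with a hypothetical corpus
# path (./corpus is not part of this script): count case-insensitive
# bigrams over every document under ./corpus, stemming with Porter and
# splitting the work across four processes:
#
#     ngrams -n 2 -i -s porter -j 4 ./corpus
#
# Each output line is the n-gram's words joined by spaces, followed by its
# count, sorted by decreasing count and then alphabetically.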