#!/usr/bin/env python
import os
import sys
import string
import getopt

from Bio.Tools import stringfns
from Bio.Tools import listfns
from Bio.Tools import MultiProc
from Extracto import stem
from Extracto import sentence, phraselet, parentheses

USAGE = \
"""ngrams [-h] [-n N] [-i] [-d] [-r] [-s algorithm] [-j num_procs]
    [-l filename] inpath_or_file

Count the number of times words appear in a list of files.

OPTIONS
    inpath_or_file  Either a path to documents, where each document is its
                    own file, or a filename.
    -l filename     Only process the files listed in this filename.
    -h              Print this usage message.
    -n N            The N in N-gram.  Should be between 1 and 10.  1 by
                    default.
    -i              Ignore case.  Uses case by default.
    -d              Document frequency.  Count the number of documents each
                    word appears in, rather than the total frequency of the
                    words.
    -r              Raw text.  Do not respect sentence or phraselet
                    boundaries.
    -s algorithm    Use a stemming algorithm.  This can only be 'porter'.
                    No stemming by default.
    -j num_procs    The number of processes to run concurrently.  1 by
                    default.

"""

IGNORE_CASE = 0          # Whether to ignore case when collecting words.
STEMMER = None           # Stemming function to apply to each word, or None.
PROCESSES = 1            # The number of processes to run concurrently.
DOCUMENT_FREQUENCY = 0   # Whether to compute the document frequency.
RAW_TEXT = 0             # Treat the text as raw, ignoring sentence and
                         # phraselet boundaries.
N = 1                    # The N in N-gram.


def to_ngrams(n, words):
    """to_ngrams(n, words) -> list of n-grams

    Return each run of n consecutive words as a tuple.  For example,
    to_ngrams(2, ['a', 'b', 'c']) -> [('a', 'b'), ('b', 'c')].

    """
    ngrams = []
    for i in range(len(words)-(n-1)):
        ngrams.append(tuple(words[i:i+n]))
    return ngrams

def count_ngrams(filename):
    """count_ngrams(filename) -> dict of ngram -> count"""
    text = open(filename).read()
    if RAW_TEXT:
        strings = [text]
    else:
        strings = find_phraselets(text)
    if IGNORE_CASE:
        strings = [s.lower() for s in strings]
    ngrams = []
    for s in strings:
        # Split on whitespace and punctuation.
        words = stringfns.splitany(s, string.whitespace + string.punctuation)
        if STEMMER:
            words = map(STEMMER, words)
        words = filter(len, words)   # Splitting can leave empty strings.
        ngrams.extend(to_ngrams(N, words))
    return listfns.count(ngrams)

def _count_ngrams_from_files_h(start, skip, filenames):
    # Helper for count_ngrams_from_files.  Each process counts every
    # skip'th file, beginning at index start, and returns its own counts.
    counts = {}
    for i in range(start, len(filenames), skip):
        file = filenames[i]
        nc = count_ngrams(file)
        for ngram in nc.keys():
            if DOCUMENT_FREQUENCY:
                # Count the number of documents the ngram appears in.
                counts[ngram] = counts.get(ngram, 0) + 1
            else:
                counts[ngram] = counts.get(ngram, 0) + nc[ngram]
    return counts

def count_ngrams_from_files(filenames):
    """count_ngrams_from_files(filenames) -> counts

    filenames is a list of files.  counts is a dictionary from
    ngram -> count.

    """
    all_counts = {}
    results = MultiProc.run(
        PROCESSES, _count_ngrams_from_files_h, fn_args=(filenames,))
    # Merge the per-process counts into one dictionary.
    for counts in results:
        for ngram in counts.keys():
            all_counts[ngram] = all_counts.get(ngram, 0) + counts[ngram]
    return all_counts

def find_phraselets(text):
    """find_phraselets(text) -> list of phraselet strings"""
    # First, find all the sentences.
    sentences = []
    for s, e in sentence.find(text):
        sentences.append(text[s:e])
    # Now pull all the parenthetical statements out of the sentences.
    phrases = []
    for sent in sentences:
        ranges = parentheses.find(sent)
        for s, e in ranges:
            phrases.append(sent[s:e])
        phrases.append(parentheses._remove_spans(sent, ranges))
    # For each of the phrases, recognize the phraselets.
    phraselets = []
    for phrase in phrases:
        for s, e in phraselet.find(phrase):
            phraselets.append(phrase[s:e])
    return phraselets

def _find_all_files(path_or_file):
    # Return a list of all files under path_or_file if it is a directory,
    # or [path_or_file] if it is a single file.
    if os.path.isdir(path_or_file):
        files = []
        def add_files(arg, dirname, names, files=files):
            for name in names:
                full = os.path.join(dirname, name)
                files.append(full)
        os.path.walk(path_or_file, add_files, None)
    else:
        files = [path_or_file]
    return files


if __name__ == '__main__':
    try:
        optlist, args = getopt.getopt(sys.argv[1:], "hn:ids:rj:l:")
    except getopt.error, x:
        print USAGE
        print >>sys.stderr, x
        sys.exit(-1)
    if len(args) != 1:                 # If they gave extraneous arguments,
        print >>sys.stderr, USAGE      # print the instructions and quit.
        sys.exit(-1)
    inpath_or_file, = args

    only_file = None
    for opt, arg in optlist:
        if opt == '-h':
            print USAGE
            sys.exit(0)
        elif opt == '-n':
            N = int(arg)
            if N < 1 or N > 10:
                print >>sys.stderr, 'Invalid argument "%d" for N.' % N
                sys.exit(-1)
        elif opt == '-i':
            IGNORE_CASE = 1
        elif opt == '-l':
            only_file = arg
        elif opt == '-r':
            RAW_TEXT = 1
        elif opt == '-s':
            if arg == "porter":
                STEMMER = stem.porter
            else:
                print >>sys.stderr, 'Invalid argument "%s" for stemmer.' % arg
                sys.exit(-1)
        elif opt == '-j':
            PROCESSES = int(arg)
            if PROCESSES < 1 or PROCESSES > 500:
                print >>sys.stderr, 'Invalid argument "%d" for processes.' % \
                      PROCESSES
                sys.exit(-1)
        elif opt == '-d':
            DOCUMENT_FREQUENCY = 1

    # If I'm only counting single words, sentence and phraselet boundaries
    # don't matter, so skip the work of finding them.
    if N == 1:
        RAW_TEXT = 1

    if not os.path.exists(inpath_or_file):
        print >>sys.stderr, "I could not find %s" % inpath_or_file
        sys.exit(-1)

    if only_file:
        # With -l, inpath_or_file must be a directory containing the
        # files listed in only_file.
        if not os.path.isdir(inpath_or_file):
            print >>sys.stderr, "%s is not a directory" % inpath_or_file
            sys.exit(-1)
        files = [x.rstrip() for x in open(only_file)]
        filenames = []
        for file in files:
            filenames.append(os.path.join(inpath_or_file, file))
    else:
        filenames = _find_all_files(inpath_or_file)
    if not filenames:
        print USAGE
        sys.exit(-1)

    counts = count_ngrams_from_files(filenames)

    # Finally, print out the counts of the ngrams.
    items = counts.items()
    # I want to sort by decreasing count and increasing alphabet.
    items.sort(lambda x, y: cmp((y[1], x[0]), (x[1], y[0])))
    for ngram, count in items:
        print "%s %d" % (' '.join(ngram), count)
        sys.stdout.flush()
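# Example invocation, a sketch of typical use with a hypothetical corpus
# path (./corpus is not part of this script): count case-insensitive
# bigrams over every document under ./corpus, stemming with Porter and
# splitting the work across four processes:
#
#     ngrams -n 2 -i -s porter -j 4 ./corpus
#
# Each output line is the n-gram's words joined by spaces, followed by its
# count, sorted by decreasing count and then alphabetically.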