#!/usr/bin/env python
"""Command-line script to scan documents for words in a lexicon."""
import sys
import os
import getopt
import fcntl
import time
import operator

from Bio.Tools import MultiProc
from Bio import triefind

from Extracto import Tokens
from Extracto import preprocess
from Extracto import preprocessfns
# NOTE: the locations of markup_consts, tokenizer and phrase are assumed here.
from Extracto import markup_consts
from Extracto import tokenizer
from Extracto import phrase

USAGE = """%s [-h] [-i] [-x] [-j nprocs] lexicon inpath_or_file

DESCRIPTION
This program scans either a document or a directory of documents for
words in a lexicon.  If the documents have been preprocessed with
abbreviations, I'll also look for the abbreviations of the words in
the lexicon.

OPTIONS
lexicon           Text file that contains one word per line.
inpath_or_file    Either a path to documents, where each document is its
                  own file, or a single filename.
-h                Print out this message.
-i                Ignore case.  Uses case by default.
-x                Input documents are in XML format.  By default, assumes
                  raw text.
-j nprocs         Number of processors to use.  Default is 1.
""" % sys.argv[0]

INPUT_IS_XML = 0
NPROCS = 1
IGNORE_CASE = 0


def writel(s, handle=None):
    # Write under an exclusive lock so output from multiple processes
    # does not interleave.
    if handle is None:
        handle = sys.stdout
    fcntl.lockf(handle.fileno(), fcntl.LOCK_EX)
    try:
        handle.write(s)
        handle.flush()
    finally:
        fcntl.lockf(handle.fileno(), fcntl.LOCK_UN)


def extract_abbrevs(document):
    """extract_abbrevs(document) -> list of (prefix, abbrev, score)"""
    abbrevs = []
    markups = document.markups(name=markup_consts.ABBREV)
    for x, value, x, x in markups:
        prefix, abbrev, score = value.split("|")
        score = float(score)
        abbrevs.append((prefix, abbrev, score))
    return abbrevs


def find_exact(lexicon, word):
    # Return the exact match of word to an entry in the lexicon, or None.
    for fword, s, e in triefind.find(word, lexicon):
        # Make sure I found a complete match and not a substring.
        if s == 0 and e == len(word):
            return fword
    return None


def _scan_one_file(file, lexicon):
    if INPUT_IS_XML:
        from Extracto import xml_format
        tokens = xml_format.load(open(file))
        data = tokens.as_string()
        abbrevs = extract_abbrevs(tokens)
    else:
        data = open(file).read()
        abbrevs = []

    # Make a list of the entries in the lexicon that I found.
    # entries_found is a list of
    # (entry, file offset, sent. num, sent. offset, length, score or None)
    entries_found = []
    for entry, start, end in lexicon.findall(data):
        # Fill in None values for sentence number and offset for now.
        # I'll calculate what they are later.
        entries_found.append((entry, start, None, None, end-start, None))

    # Now look for possible abbreviations.  Look at all the
    # abbreviations and prefixes found in the document.  If either an
    # abbrev or prefix is in the lexicon and the other is not, add it
    # to my list of things to search.
    analogs = {}   # word -> (word in lexicon, score)
    for prefix, abbrev, score in abbrevs:
        lex_abbrev = find_exact(lexicon, abbrev)
        lex_prefix = find_exact(lexicon, prefix)
        # If either none or both are in the lexicon, then ignore.
        if operator.truth(lex_abbrev) == operator.truth(lex_prefix):
            continue
        # I'm making an assumption here that there won't be 2 different
        # abbreviations.
        if lex_abbrev:   # abbrev is in the lexicon, prefix is not
            analogs[prefix.lower()] = (lex_abbrev, score)
        else:
            analogs[abbrev.lower()] = (lex_prefix, score)
    # Do the search for the abbreviations in the lexicon.
    finder = phrase.compile(analogs.keys(), ignore_case=1)
    for abbrev_or_prefix, start, end in finder.findall(data):
        analog, score = analogs[abbrev_or_prefix.lower()]
        entries_found.append((analog, start, None, None, end-start, score))

    # If nothing was found, just quit here without further processing.
    if not entries_found:
        return

    # Now figure out the sentence numbers and offsets.
    if not INPUT_IS_XML:
        # Do the processing to find the sentences.
        tokens = Tokens.Tokens(tokenizer.tokenize_str(data))
        tokens = preprocess.join_numbers(tokens)
        tokens = preprocess.find_sentences(tokens)
    # Split the tokens into sentences and find their boundaries.
    sentences = preprocessfns.split_sentences(tokens)
    indexes = []
    index = 0
    for i in range(len(sentences)):
        index += len(''.join(sentences[i]))
        indexes.append(index)
    for i in range(len(entries_found)):
        entry, foff, sentnum, soff, length, score = entries_found[i]
        for j, index in zip(range(len(indexes)), indexes):
            if foff < index:
                sentnum = j
                if j == 0:
                    soff = foff
                else:
                    soff = foff - indexes[j-1]
                break
        else:
            raise AssertionError, "I could not find the sentence for %d" % foff
        # Now strip the sentence and fix the sentence offset.
        s = sentences[sentnum].as_string()
        soff -= s.find(s.lstrip())
        entries_found[i] = entry, foff, sentnum, soff, length, score

    # file is the fullpath.  Print just the tail part of the path.
    head, tail = os.path.split(file)
    for entry, foff, sentnum, soff, length, score in entries_found:
        writel("%s|%s|%d|%d|%d|%d|%s\n" % (
            tail, entry, foff, sentnum, soff, length, score))


def scan_files(files, lexicon):
    def do_some(start, skip, files, lexicon):
        for i in range(start, len(files), skip):
            try:
                _scan_one_file(files[i], lexicon)
            except (KeyboardInterrupt, SystemExit):
                raise
            except Exception, x:
                writel("ERROR '%s' in file %s\n" % (str(x), files[i]))
                raise
    MultiProc.run(NPROCS, do_some, (files, lexicon))


def _find_all_files(path_or_file):
    if os.path.isdir(path_or_file):
        files = []
        def add_files(arg, dirname, names, files=files):
            for name in names:
                full = os.path.join(dirname, name)
                files.append(full)
        os.path.walk(path_or_file, add_files, None)
    else:
        files = [path_or_file]
    return files


if __name__ == '__main__':
    if len(sys.argv) == 1:           # If they didn't specify any arguments,
        print >>sys.stderr, USAGE    # print the instructions and quit.
        sys.exit(0)
    try:
        optlist, args = getopt.getopt(sys.argv[1:], "hixj:")
    except getopt.error, x:
        print >>sys.stderr, x
        sys.exit(0)
    if len(args) != 2:               # If they gave extraneous arguments,
        print >>sys.stderr, USAGE    # print the instructions and quit.
        sys.exit(0)
    lexicon_file, inpath_or_file = args

    outpath = None
    preprocs = []
    for opt, arg in optlist:
        if opt == '-h':
            print USAGE
            sys.exit(0)
        elif opt == '-i':
            IGNORE_CASE = 1
        elif opt == '-x':
            INPUT_IS_XML = 1
        elif opt == '-j':
            NPROCS = int(arg)
            if NPROCS < 1 or NPROCS > 100:
                raise ValueError, "nprocs should be >= 1 and <= 100"

    # Do some checking to make sure the input is reasonable.
    if not os.path.exists(inpath_or_file):
        print >>sys.stderr, "I can't find the path or file %s" % inpath_or_file
        sys.exit(0)
    if not os.path.exists(lexicon_file):
        print >>sys.stderr, "I can't find the lexicon %s" % lexicon_file
        sys.exit(0)

    # Print out some diagnostic information.
    writel("# %s\n" % (sys.argv[0]))
    writel("# Scanning: %s\n" % inpath_or_file)
    writel("# Using lexicon: %s\n" % lexicon_file)
    timestr = time.strftime("%A, %d %B %Y %I:%M:%S%p")
    if os.environ.has_key("USER"):
        writel("# Run on %s by %s\n" % (timestr, os.environ["USER"]))
    else:
        writel("# Run on %s\n" % timestr)
    writel("# Output format is:\n")
    writel("# FILENAME|LEXICON ENTRY|FILE OFFSET|SENT. NUMBER|SENT. OFFSET|LENGTH|SCORE\n")
    writel("# The SENTENCE OFFSET is the offset from the beginning of the\n")
    writel("# sentence with leading whitespace stripped.\n")
    # XXX problems if the lexicon has a "|" character in it...

    # Load the lexicon.
    from Extracto import datafile
    from Bio import trie
    lines = open(lexicon_file).readlines()
    phrases = datafile.clean(lines)
    lexicon = trie.trie()
    for word in phrases:   # 'word', not 'phrase', to avoid shadowing the phrase module
        lexicon[word.lower()] = 1

    # Now search the files for the lexicon entries.
    files = _find_all_files(inpath_or_file)
    scan_files(files, lexicon)
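
# Example invocation (illustrative only; the script and file names below are
# hypothetical and just show the expected arguments, see USAGE above):
#
#     python scan_lexicon.py -i -j 4 gene_names.txt abstracts/
#
# Each match is written to stdout as one pipe-delimited line of the form:
#
#     FILENAME|LEXICON ENTRY|FILE OFFSET|SENT. NUMBER|SENT. OFFSET|LENGTH|SCORE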