#!/usr/bin/env python
"""Command line script to preprocess documents.

Given a directory of documents (or a single document), tokenize each
one and run the requested preprocessors over it.  In directory mode the
results are written to an output directory in the Extracto binary
format; in single file mode the result is written to standard output.

Run with -h for the list of options.

"""

import sys
import os
import getopt
import fcntl
import time
from operator import truth

from Bio.Tools import listfns
from Bio.Tools import MultiProc
from Extracto import preprocess
# XML input is no longer supported, so xml_format is not imported.
#from Extracto import xml_format
from Extracto import bin_format
from Extracto.markup_consts import *

USAGE = """%s [-h] [-v] [-x] [-n] [-f] [-j nprocs] [-a] [-m] [-s] [-t] [-p]
    [-b] [-r] [-d] [-l filename] [-o outpath] inpath_or_file

DESCRIPTION

This program takes a directory of documents and preprocesses them.

OPTIONS

inpath_or_file  Either a path to documents, where each document is its
                own file, or a filename.
-o outpath      Where the preprocessed files should be stored.
                Required if inpath given.
-l filename     Only process the files listed in this filename.

-h              Print out this message.
-v              Print verbose output.
-x              Input documents are XML format.  By default, assumes
                raw text.
-n              No clobber.  Do not overwrite done files.
-f              Fast check.  If a file exists, assume it's good.
-j nprocs       Number of processors to use.  Default is 1.

-a              Do all preprocessing (e.g. -m -s -t -p -b -r -d)
-m              Join the numbers.
-s              Identify the sentences using a heuristic.
-t              Find abbreviations.
-p              Do Porter stemming.
-b              Find abbreviations using abbSys.
-r              Do POS tagging using RBT.
-d              Do parsing with Sundance.
""" % sys.argv[0]


# Each preprocessor exposes the same two methods:
#     is_done(ms) -> true if the step does not need to be run on ms
#     do(ms)      -> the result of running the step on ms

class NumberPreprocessor:
    """Join numbers.  Never reports itself as done, so it always runs."""
    def is_done(self, ms):
        return 0
    def do(self, ms):
        return preprocess.join_numbers(ms)


class SentencePreprocessor:
    """Find sentences.  Done if ms already has SENTENCE markups."""
    def is_done(self, ms):
        return truth(ms.markups(name=SENTENCE))
    def do(self, ms):
        return preprocess.find_sentences(ms)


class ClausePreprocessor:
    """Find clauses.  Always reports itself as done, so do() is never
    called by the processing loop in _preprocess_one_file."""
    def is_done(self, ms):
        return 1
    def do(self, ms):
        return preprocess.find_clauses(ms)


class AbbreviationPreprocessor:
    """Find abbreviations.  Done if ms already has ABBREV markups."""
    def is_done(self, ms):
        return truth(ms.markups(name=ABBREV))
    def do(self, ms):
        return preprocess.find_abbrevs(ms)


class PorterPreprocessor:
    """Porter stemming.  Done if ms already has PORTER markups."""
    def is_done(self, ms):
        return truth(ms.markups(name=PORTER))
    def do(self, ms):
        return preprocess.porter_stem(ms)


class AbbSysPreprocessor:
    """abbSys abbreviations.  Done if ms already has ABBSYS markups."""
    def is_done(self, ms):
        return truth(ms.markups(name=ABBSYS))
    def do(self, ms):
        return preprocess.abbSys_abbrev(ms)


class RBTPreprocessor:
    """RBT tagging.  Done if ms already has RBT markups."""
    def is_done(self, ms):
        return truth(ms.markups(name=RBT))
    def do(self, ms):
        return preprocess.rbt_tag(ms)


class SundancePreprocessor:
    """Sundance parsing.  Done if ms already has SUNDANCE_TAG_SOURCE
    markups."""
    def is_done(self, ms):
        return truth(ms.markups(name=SUNDANCE_TAG_SOURCE))
    def do(self, ms):
        return preprocess.sundance_parse(ms)


# option -> options that must also be enabled when it is requested.
DEPENDENCIES = {
    "-a" : ['-m', '-s', '-t', '-p', '-b', '-r', '-d'],
    "-m" : [],
    "-s" : ["-m"],
    "-c" : ["-s", "-r"],
    "-t" : ["-s"],
    "-p" : [],
    "-b" : ["-s"],
    "-r" : ["-s"],
    "-d" : ["-s"]
    }

ARG2PREPROCESSOR = {
    # argument -> (sort order, preprocessor)
    # "-a" has no preprocessor of its own; it only pulls in the others
    # through DEPENDENCIES.
    "-a" : (0, None),
    "-m" : (10, NumberPreprocessor()),
    "-s" : (30, SentencePreprocessor()),
    "-c" : (50, ClausePreprocessor()),
    "-t" : (40, AbbreviationPreprocessor()),
    "-p" : (20, PorterPreprocessor()),
    "-b" : (40, AbbSysPreprocessor()),
    "-r" : (40, RBTPreprocessor()),
    "-d" : (40, SundancePreprocessor())
    }

time_format = "%H:%M:%S"
date_format = "%m/%d/%Y"
full_format = "%s %s" % (date_format, time_format)


def now(format=full_format):
    """now(format=full_format) -> string

    Return the current local time as a string.  format is a
    time.strftime format describing the time formatting.

    """
    time_tup = time.localtime(time.time())
    return time.strftime(format, time_tup)


def writel(s, handle=None):
    """writel(s, handle=None)

    Write s to handle (sys.stdout by default) and flush it, holding an
    exclusive fcntl lock on the handle for the duration.  The lock is
    presumably there so that messages from parallel worker processes do
    not interleave.

    """
    if handle is None:
        handle = sys.stdout
    fcntl.lockf(handle.fileno(), fcntl.LOCK_EX)
    try:
        handle.write(s)
        handle.flush()
    finally:
        # Always release the lock, even if the write fails.
        fcntl.lockf(handle.fileno(), fcntl.LOCK_UN)


def printl(s):
    """printl(s)

    Write s to standard output as one locked line, prefixed with a
    timestamp.

    """
    t = now()
    writel("%s %s\n" % (t, s))


# Settings.  These are overwritten from the command line options when
# the module is run as a script.
NOCLOBBER = 0
FASTCHECK = 0
NPROCS = 1
INPUT_IS_XML = 0
VERBOSE = 0


def _is_file_ok(filename):
    """_is_file_ok(filename) -> boolean

    Return whether filename exists and holds a loadable preprocessed
    document.  If FASTCHECK is set, existence alone is enough.  A
    KeyError raised while loading is propagated; any other exception is
    reported and the file is treated as broken.

    """
    if not os.path.exists(filename):
        return 0
    if FASTCHECK:
        return 1
    try:
        #xml_format.load(open(filename))
        handle = open(filename)
        try:
            bin_format.load(handle)
        finally:
            handle.close()
    except KeyError:
        raise
    except Exception:
        printl("%s broken (%s)" % (filename, sys.exc_info()[1]))
        return 0
    return 1


def _preprocess_one_file(infile, preprocessors, update_fn=None):
    """_preprocess_one_file(infile, preprocessors[, update_fn]) -> string or None

    Tokenize infile, run every preprocessor that is not already done
    over it, and return the result serialized by bin_format.save_str.
    Return None if infile does not exist.  update_fn, if given, is
    called with progress messages.

    Raises NotImplementedError if INPUT_IS_XML is set.

    """
    if not os.path.exists(infile):
        if update_fn is not None:
            update_fn("%s does not exist" % infile)
        return None

    # xml_format is no longer imported, so fail with a clear message
    # rather than with a NameError.
    if INPUT_IS_XML:
        raise NotImplementedError("XML format no longer supported")

    # Load the string.
    handle = open(infile)
    try:
        text = handle.read()
    finally:
        handle.close()
    ms = preprocess.find_tokens(text, clean=1)

    tail = os.path.split(infile)[1]

    # Run through all the preprocessors.
    for preproc in preprocessors:
        if preproc.is_done(ms):
            continue
        if update_fn is not None and VERBOSE:
            update_fn("Processing %s (%s)" %
                      (tail, preproc.__class__.__name__))
        ms = preproc.do(ms)
    if update_fn is not None:
        update_fn("%s Done" % tail)
    #return xml_format.save_str(ms, pretty=1)
    return bin_format.save_str(ms)


def _preprocess_one(id, inpath, outpath, preprocessors):
    """_preprocess_one(id, inpath, outpath, preprocessors)

    Preprocess the document named id in inpath and save the result
    under the same name in outpath.  If NOCLOBBER is set and a good
    output file already exists, do nothing.

    """
    infile = os.path.join(inpath, id)
    outfile = os.path.join(outpath, id)

    if NOCLOBBER and _is_file_ok(outfile):
        return

    try:
        s = _preprocess_one_file(infile, preprocessors, printl)
    except:
        # Report which document failed, then let the error propagate.
        printl("ERROR processing %s" % id)
        raise
    if s is not None:
        handle = open(outfile, 'w')
        try:
            handle.write(s)
        finally:
            handle.close()


def _preprocess_some(start, skip, ids, inpath, outpath, preprocessors):
    """_preprocess_some(start, skip, ids, inpath, outpath, preprocessors)

    Preprocess every skip'th document in ids, beginning with index
    start.

    """
    for i in range(start, len(ids), skip):
        _preprocess_one(ids[i], inpath, outpath, preprocessors)


def _preprocess(inpath, outpath, only_file, preprocessors):
    """_preprocess(inpath, outpath, only_file, preprocessors)

    Preprocess the documents in inpath and save the results in outpath,
    creating outpath if necessary.  If only_file is given, it names a
    file that lists, one per line, the documents to process; otherwise
    every entry in inpath is processed.

    """
    # Prepare the directories for processing.
    if only_file:
        handle = open(only_file)
        try:
            lines = handle.readlines()
        finally:
            handle.close()
        ids = [x.rstrip() for x in lines]
        # Skip blank lines.  An empty id would refer to inpath itself.
        ids = [x for x in ids if x]
        printl("Found %d files in %s" % (len(ids), only_file))
    else:
        ids = os.listdir(inpath)
        printl("Found %d files in %s" % (len(ids), inpath))

    if not os.path.exists(outpath):
        printl("%s doesn't exist, creating" % outpath)
        os.mkdir(outpath)

    # If FASTCHECK, then we can do the checking all within this
    # thread.  Otherwise, the checking will take too long and we'll
    # have to do it separately.  Doing it in the parent thread allows
    # us to more efficiently spread the work.
    if NOCLOBBER and FASTCHECK:
        ids = [x for x in ids
               if not os.path.exists(os.path.join(outpath, x))]

    printl("Starting preprocessing")
    if NPROCS > 1:
        printl("Creating %d processes" % NPROCS)
    # NOTE(review): this relies on MultiProc.run calling
    # _preprocess_some(process_number, NPROCS, *fn_args) -- confirm
    # against Bio.Tools.MultiProc.
    MultiProc.run(NPROCS, _preprocess_some,
                  fn_args=(ids, inpath, outpath, preprocessors))
    printl("Done")


if __name__ == '__main__':
    # Printing the instructions on request (-h, or no arguments at all)
    # exits with status 0.  Every error below exits with status 1 so
    # that callers can detect the failure.
    if len(sys.argv) == 1:
        # If they didn't specify any arguments,
        # print the instructions and quit.
        sys.stderr.write("%s\n" % USAGE)
        sys.exit(0)

    try:
        optlist, args = getopt.getopt(sys.argv[1:], "hvxnfj:amstpbrdo:l:")
    except getopt.error:
        sys.stderr.write("%s\n" % sys.exc_info()[1])
        sys.exit(1)
    if len(args) != 1:
        # If they gave extraneous arguments,
        # print the instructions and quit.
        sys.stderr.write("%s\n" % USAGE)
        sys.exit(1)
    inpath_or_file, = args

    outpath = None
    preprocs = []
    only_file = None
    for opt, arg in optlist:
        if opt == '-h':
            sys.stdout.write("%s\n" % USAGE)
            sys.exit(0)
        elif opt == '-v':
            VERBOSE = 1
        elif opt == '-x':
            raise NotImplementedError("XML format no longer supported")
        elif opt == '-n':
            NOCLOBBER = 1
        elif opt == '-f':
            FASTCHECK = 1
        elif opt == '-j':
            NPROCS = int(arg)
            if NPROCS < 1 or NPROCS > 100:
                raise ValueError("nprocs should be >= 1 and <= 100")
        elif opt == '-o':
            outpath = arg
        elif opt == '-l':
            only_file = arg
        # NOTE(review): '-c' is accepted here but is missing from the
        # getopt string above, so it cannot currently be selected.
        elif opt in ['-a', '-m', '-s', '-c', '-t', '-p', '-b', '-r', '-d']:
            preprocs.append(opt)

    # If no options specified, then preprocess everything.
    if not preprocs:
        preprocs.append("-a")

    # Do some checking to make sure the input is reasonable.
    if inpath_or_file == outpath:
        sys.stderr.write("outpath can't be the same as inpath\n")
        sys.exit(1)
    if os.path.isdir(inpath_or_file):
        if outpath is None:
            sys.stderr.write("Please specify an outpath\n")
            sys.exit(1)
        if os.path.exists(outpath) and not os.path.isdir(outpath):
            sys.stderr.write("outpath %s is not a directory\n" % outpath)
            sys.exit(1)
    if only_file and not os.path.exists(only_file):
        sys.stderr.write("I could not find the file %s\n" % only_file)
        sys.exit(1)

    # Add the dependencies.  Keep on adding until all of them are
    # added.
    preprocs = listfns.items(preprocs)
    preprocs.sort()
    old_preprocs = []
    while old_preprocs != preprocs:
        old_preprocs = preprocs[:]
        for p in old_preprocs:
            preprocs.extend(DEPENDENCIES[p])
        preprocs = listfns.items(preprocs)
        preprocs.sort()

    # Now sort the preprocessors in the correct order.  The option
    # string is used to break ties between equal sort orders, so the
    # preprocessor objects themselves are never compared and the order
    # is the same on every run.
    ordered = [(ARG2PREPROCESSOR[p][0], p, ARG2PREPROCESSOR[p][1])
               for p in preprocs]
    ordered.sort()
    # Drop the options (i.e. "-a") that have no preprocessor.
    preprocs = [x[2] for x in ordered if x[2] is not None]

    if os.path.isdir(inpath_or_file):
        _preprocess(inpath_or_file, outpath, only_file, preprocs)
    elif os.path.isfile(inpath_or_file):
        # Single file mode: the result goes to standard output.
        result = _preprocess_one_file(inpath_or_file, preprocs)
        sys.stdout.write("%s\n" % (result,))
    else:
        sys.stderr.write("I can't find '%s'\n" % inpath_or_file)
        sys.exit(1)