#!/usr/bin/env python
"""Write one plain-text index file per NLM Medline XML input file.

Run as a script; see USAGE below for the command line.  The indexing
itself is delegated to Bio.Medline.NLMMedlineXML.index_many; this file
selects the input files, collects the per-record data that index_many
reports, and writes the index files.
"""

import os
import sys
import time
import fcntl
import getopt
from xml.sax import handler

from Bio.Tools import MultiProc
from Bio.Tools import listfns
from Bio.Medline import NLMMedlineXML

USAGE = """%s [-h] [-n] [-j nprocs] infile_or_path outpath

DESCRIPTION

This program indexes the XML-format files saves the results in outpath.
infile_or_path could be a single file to index or a path that contains
files to index.

    -h           Print out this message.
    -n           noclobber.  Do not index a file if it already exists.
    -j nprocs    Run with more than one process.

""" % sys.argv[0]

# Module-level settings read by index_files().  main() overrides them
# from the -j and -n command-line options.
NPROCS = 1
NOCLOBBER = 0


def writel(s, handle=None):
    """writel(s[, handle])

    Write the string s to handle (sys.stdout by default) and flush it,
    holding an exclusive fcntl lock on the handle's file descriptor for
    the duration of the write.  The lock is released even if the write
    raises.

    """
    # NOTE(review): the lock is presumably here to keep messages from
    # concurrent worker processes from interleaving -- confirm.
    if handle is None:
        handle = sys.stdout
    fcntl.lockf(handle.fileno(), fcntl.LOCK_EX)
    try:
        handle.write(s)
        handle.flush()
    finally:
        fcntl.lockf(handle.fileno(), fcntl.LOCK_UN)


def make_outfile_name(infile, outpath):
    """make_outfile_name(infile, outpath) -> string

    Return the name of the index file for infile: the base name of
    infile with its last extension (normally ".xml") removed, placed
    inside outpath.

    """
    head = os.path.splitext(os.path.basename(infile))[0]
    return os.path.join(outpath, head)


class _IndexCollector:
    """Callback object handed to NLMMedlineXML.index_many.

    It is called as collector(infile, event, data), where event is one
    of the strings 'START', 'RECORD' or 'END'.  The data of each
    'RECORD' event is saved per input file and, on 'END', written to
    that file's index file in outpath.

    """
    def __init__(self, outpath):
        self.outpath = outpath
        self.indexes = {}    # infile -> list of data

    def __call__(self, infile, event, data):
        """Handle one event for infile.

        Raises AssertionError for an event name other than 'START',
        'RECORD' or 'END'.

        """
        if event == 'START':
            self.indexes[infile] = []
        elif event == 'RECORD':
            self.indexes[infile].append(data)
        elif event == 'END':
            self._write_index(infile)
        else:
            raise AssertionError("Unknown event %s" % event)

    def _write_index(self, infile):
        """Write the records collected for infile and forget them."""
        outfile = make_outfile_name(infile, self.outpath)
        # Each saved item is unpacked as (pmid, medline_id, start, end);
        # start and end are formatted as integers.
        # NOTE(review): presumably the extent of the record within
        # infile -- verify against NLMMedlineXML.
        lines = ["PMID %s MedlineID %s START %d END %d\n" %
                 (pmid, medline_id, start, end)
                 for pmid, medline_id, start, end in self.indexes[infile]]
        # Close the file explicitly instead of relying on the garbage
        # collector to do it.
        out = open(outfile, 'w')
        try:
            out.writelines(lines)
        finally:
            out.close()
        timestr = time.strftime("%m/%d/%Y %H:%M:%S")
        writel("%s I wrote %d records to %s\n" %
               (timestr, len(lines), outfile))
        # Clear the indexes so I don't just accumulate memory.
        del self.indexes[infile]


def index_files(files, outpath):
    """index_files(files, outpath)

    Index each file in the list files, writing one index file per input
    into outpath.  If the module-level NOCLOBBER is true, inputs whose
    index file already exists are skipped.  The module-level NPROCS is
    passed to NLMMedlineXML.index_many as nprocs.

    """
    if NOCLOBBER:
        # I only want to do the ones that haven't been done.
        files = [infile for infile in files
                 if not os.path.exists(make_outfile_name(infile, outpath))]

    index_fn = _IndexCollector(outpath)
    NLMMedlineXML.index_many(files, index_fn, nprocs=NPROCS)


def print_file(handle, start=0):
    """print_file(handle[, start])

    Print out a file to sys.stdout, beginning at offset start and
    flushing after every line.

    """
    handle.seek(start)
    while 1:
        line = handle.readline()
        if not line:
            break
        sys.stdout.write(line)
        sys.stdout.flush()


def _find_all_files(path_or_file):
    """_find_all_files(path_or_file) -> list of paths

    If path_or_file is a directory, return the joined path of every
    name that os.path.walk reports beneath it.  Otherwise return a
    one-element list holding path_or_file itself.

    """
    if not os.path.isdir(path_or_file):
        return [path_or_file]

    files = []
    # The list is bound as a default argument so the callback can reach
    # it without relying on nested scopes.
    def add_files(arg, dirname, names, files=files):
        for name in names:
            files.append(os.path.join(dirname, name))
    # os.path.walk only exists in Python 2, which is what this script
    # targets.
    os.path.walk(path_or_file, add_files, None)
    return files


def main():
    """Parse the command line (sys.argv) and run the indexer.

    Exit status: 0 on success, after -h, or when run with no arguments
    at all; 2 for an invalid command line; 1 if the input path does not
    exist.  Raises ValueError if nprocs is out of range or if outpath
    exists but is not a directory.

    """
    global NPROCS, NOCLOBBER

    if len(sys.argv) == 1:
        # If they didn't specify any arguments, print the instructions
        # and quit.
        sys.stderr.write(USAGE + "\n")
        sys.exit(0)

    try:
        optlist, args = getopt.getopt(sys.argv[1:], "hnj:")
    except getopt.error:
        # sys.exc_info() rather than "except ..., x" so the statement
        # parses under any Python version.
        sys.stderr.write("%s\n" % (sys.exc_info()[1],))
        sys.exit(2)

    # Handle the options before counting the positional arguments, so
    # that "-h" on its own is honoured.
    for opt, arg in optlist:
        if opt == '-h':
            sys.stdout.write(USAGE + "\n")
            sys.exit(0)
        elif opt == '-n':
            NOCLOBBER = 1
        elif opt == '-j':
            NPROCS = int(arg)
            if NPROCS < 1 or NPROCS > 100:
                raise ValueError("nprocs should be >= 1 and <= 100")

    if len(args) != 2:
        # If they gave the wrong number of arguments, print the
        # instructions and quit.
        sys.stderr.write(USAGE + "\n")
        sys.exit(2)
    inpath_or_file, outpath = args

    # Do some checking to make sure the input is reasonable.
    if not os.path.exists(inpath_or_file):
        sys.stderr.write("I can't find the path or file %s\n" %
                         inpath_or_file)
        sys.exit(1)
    if not os.path.exists(outpath):
        os.mkdir(outpath)
    if not os.path.isdir(outpath):
        raise ValueError("%s doesn't look like a path" % outpath)

    # Print out some diagnostic information.
    writel("%s\n" % (sys.argv[0]))
    writel("Indexing %s\n" % inpath_or_file)
    writel("Using %d processors\n" % NPROCS)
    timestr = time.strftime("%A, %d %B %Y %I:%M:%S%p")
    user = os.environ.get("USER")
    if user is not None:
        writel("Run on %s by %s\n" % (timestr, user))
    else:
        writel("Run on %s\n" % timestr)

    # I only want the .xml files.
    files = [name for name in _find_all_files(inpath_or_file)
             if name.endswith(".xml")]
    files.sort()

    index_files(files, outpath)


if __name__ == '__main__':
    main()