#!/usr/bin/env python
# This script downloads articles from PubMed in MEDLARS format.
# See the USAGE variable for more information.

import os
import sys
import time
import getopt
import fcntl

from Bio import Medline
from Bio import PubMed
from Bio import MultiProc


def writel(s, handle=None):
    """writel(s[, handle])

    Write the string s to handle (sys.stdout by default) and flush it.
    An exclusive fcntl lock is held on the handle's file descriptor
    while writing and is always released afterwards, even if the write
    raises.

    """
    if handle is None:
        handle = sys.stdout
    fd = handle.fileno()
    fcntl.lockf(fd, fcntl.LOCK_EX)
    try:
        handle.write(s)
        handle.flush()
    finally:
        fcntl.lockf(fd, fcntl.LOCK_UN)


def writet(s, handle=None):
    """writet(s[, handle])

    Like writel, but prefix s with the current local time
    (HH:MM:SS MM/DD/YYYY).  Strings that contain only whitespace are
    written without a timestamp.

    """
    if s.strip():
        s = "%s %s" % (time.strftime("%H:%M:%S %m/%d/%Y"), s)
    writel(s, handle=handle)


def is_valid_record(record):
    """is_valid_record(record) -> 1 or 0

    Return 1 if the string record can be parsed by
    Medline.RecordParser, or 0 if the parser raises a SyntaxError.

    """
    try:
        Medline.RecordParser().parse_str(record)
    except SyntaxError:
        return 0
    return 1


def _read_text(filename):
    # Return the whole contents of filename, making sure the file is
    # closed again even if the read fails.
    handle = open(filename)
    try:
        return handle.read()
    finally:
        handle.close()


class RecordHandler:
    """Callbacks that receive the records downloaded from PubMed.

    Methods:
    print_record   Write a valid record to stdout.
    save_record    Save a valid record to a file in outpath.
    broken_record  Report an id that could not be downloaded.

    """
    def __init__(self, outpath=None):
        """RecordHandler([outpath])

        outpath is the directory used by save_record.  It is only
        needed if save_record is going to be called.

        """
        # These two lists are initialized here but not updated by any
        # method of this class.
        self.broken_ids = []
        self.successful = []
        self.outpath = outpath

    def print_record(self, id, record):
        """Write record to stdout, or report id as broken if the
        record does not parse."""
        if not is_valid_record(record):
            self.broken_record(id)
            return
        writel("%s\n" % record)

    def save_record(self, id, record):
        """Save record as the file <outpath>/<id>, or report id as
        broken if the record does not parse."""
        if not is_valid_record(record):
            self.broken_record(id)
            return
        filename = os.path.join(self.outpath, id)
        writet("Saving record %s as %s\n" % (id, filename))
        # Close the file explicitly rather than relying on garbage
        # collection to flush the record to disk.
        handle = open(filename, 'w')
        try:
            handle.write(record)
        finally:
            handle.close()

    def broken_record(self, id):
        """Write an error message about id to stderr."""
        writel("ERROR: Could not download ID: %s\n" % id, sys.stderr)


USAGE = \
"""download_pubmed [-h] [-o outpath] [-f] [-d delay] [-j num procs] file_or_ids...

Download articles from PubMed in MEDLARS format.  Takes a list of
either PubMed ID's or MEDLINE UID's.  The ids can either be given on
the command line or in files that list one id per line.

OPTIONS

-h     Print this usage message.

-o outpath
       By default, I print the results to the screen.  However, if an
       outpath is specified, then I will save the articles into that
       directory, one articles to a file.  The name of the file is the
       id of the record.

-f     fast check.  if the outpath is specified, and it looks like the
       file already exists, I normally do a full parse to see whether
       i should redownload it.  If fastcheck is specified, I will only
       check to see if the file exists.

-d delay
       The number of seconds to wait between queries.  By default,
       waits about 10 minutes between batches up to 500 records.

-j num procs
       The number of processes to run concurrently.  Only speeds up
       background tasks.  Does not affect rate of queries to PubMed.
       1 by default.

"""

# Set to a true value by the -f option.  When set, _check_ids accepts
# any file that already exists without parsing it.
FAST_CHECK = 0


def _check_ids(start, skip, ids, outpath):
    # Return a list of the ids that are broken, i.e. that have no file
    # in outpath or (unless FAST_CHECK is set) whose file doesn't parse.
    # Only ids[start], ids[start+skip], ids[start+2*skip], ... are
    # examined.  Presumably MultiProc.run gives each process its own
    # start so that the ids are divided among them -- verify against
    # Bio.MultiProc.

    # Get the names of the files that already exist in the outpath.
    existing_files = set(os.listdir(outpath))

    broken = []
    for i in range(start, len(ids), skip):
        id = ids[i]
        if id not in existing_files:
            broken.append(id)
        elif FAST_CHECK:
            # accept any file that exists.
            writet("%s exists, skipping\n" % id)
        elif not is_valid_record(_read_text(os.path.join(outpath, id))):
            broken.append(id)
            writet("%s exists but doesn't parse, will redownload\n" % id)
        else:
            writet("%s exists and parses, skipping\n" % id)
    return broken


def download(ids, outpath, handler, delay, processes):
    """download(ids, outpath, handler, delay, processes)

    Download the records for ids from PubMed.  If outpath is given,
    the ids that already have a good file there are skipped and the
    rest are passed to handler.save_record.  Otherwise, every record
    is passed to handler.print_record.  Failures are passed to
    handler.broken_record.  delay is handed to PubMed.download_many
    and processes to MultiProc.run.

    """
    # If their path exists, see if they already have the file.  If they
    # do, and they parse well, then don't download it.
    if outpath:
        results = MultiProc.run(processes, _check_ids,
                                fn_args=(ids, outpath))
        broken = []
        for part in results:
            broken.extend(part)
        ids = broken
    if not ids:
        return

    if outpath:
        callback = handler.save_record
    else:
        callback = handler.print_record
    PubMed.download_many(ids, callback, handler.broken_record, delay=delay)


def read_file(filename):
    """read_file(filename) -> ids

    Read ids from filename, one per line.  Trailing whitespace is
    removed and blank lines are skipped.

    """
    handle = open(filename)
    try:
        lines = handle.readlines()
    finally:
        handle.close()
    stripped = [x.rstrip() for x in lines]
    # A blank line would otherwise turn into an empty id.
    return [x for x in stripped if x]


def _die(message):
    # Print message to stderr and exit with a failure status.
    sys.stderr.write("%s\n" % message)
    sys.exit(-1)


def main(argv=None):
    """main([argv])

    Run the script on the list of command line arguments argv
    (sys.argv[1:] by default).  See USAGE for the arguments accepted.
    Exits through sys.exit when the usage message is requested or the
    arguments are invalid.

    """
    global FAST_CHECK

    if argv is None:
        argv = sys.argv[1:]

    try:
        optlist, args = getopt.getopt(argv, "ho:fd:j:")
    except getopt.error:
        # sys.exc_info is used instead of "except ..., x" so that this
        # file parses under both old and new versions of Python.
        sys.stdout.write(USAGE + "\n")
        _die(sys.exc_info()[1])
    if not args:
        sys.stdout.write(USAGE + "\n")
        sys.exit(0)
    file_or_ids = args

    outpath = None
    delay = 600
    processes = 1
    for opt, arg in optlist:
        if opt == '-h':
            sys.stdout.write(USAGE + "\n")
            sys.exit(0)
        elif opt == '-o':
            outpath = arg
        elif opt == '-f':
            FAST_CHECK = 1
        elif opt == '-d':
            try:
                delay = float(arg)
            except ValueError:
                # Not a number.  Force the range check below to fail.
                delay = -1
            if delay < 0 or delay > 10000:
                _die("Delay must be a positive number < 10000")
        elif opt == '-j':
            try:
                processes = int(arg)
            except ValueError:
                # Not an integer.  Force the range check below to fail.
                processes = 0
            if processes < 1 or processes > 1000:
                _die("Processes must be a pos. number < 1000")

    # Check to make sure the outpath is an existing directory.  A plain
    # file is rejected here, because os.listdir would fail on it later.
    if outpath and not os.path.isdir(outpath):
        writel("%r doesn't seem to be a directory.\n" % outpath, sys.stderr)
        sys.exit(-1)

    handler = RecordHandler(outpath)

    # An argument that names an existing file is read as a list of
    # ids.  Anything else is taken to be an id itself.
    ids = []
    for file_or_id in file_or_ids:
        if os.path.exists(file_or_id):
            ids.extend(read_file(file_or_id))
        else:
            ids.append(file_or_id)

    download(ids, outpath, handler, delay, processes)


if __name__ == '__main__':
    main()