#!/usr/bin/env python
"""Reformat MEDLARS-format PubMed records as BibTeX @ARTICLE entries.

Records are looked up by id, either in a local cache directory or through
Bio.PubMed, and written to standard output.  See USAGE for the command line.

The code sticks to constructs that behave the same on Python 2 and 3.
"""

import os
import sys
import getopt
import select  # only referenced by the disabled STDIN loop in main()

from Bio import Medline
from Bio import PubMed
from Bio import listfns  # only referenced by the disabled check in convert_records()

USAGE = \
"""medlars2bibtex [-h] [-c cachepath] [file_or_id...]

Reformat MEDLARS format PubMed articles into bibtex.  Takes a list of
filenames or ids to reformat.  The files must contain PubMed UID's or
PMID's, one per line.

OPTIONS

-h  Print this usage message.

-c cachepath
    This should point to a directory where MEDLARS-formatted articles
    are stored.  The filename of each article should be its ID.

"""

# Marker appended to some titles; removed by _clean_title().
_IN_PROCESS_SUFFIX = '[In Process Citation]'


class RecordRetriever:
    """Dictionary-like lookup of records by id.

    With cachepath=None, lookups are delegated to PubMed.Dictionary.
    Otherwise each id is read from the file cachepath/<id>.
    """

    def __init__(self, cachepath=None, parser=None):
        """Remember the cache directory and the optional record parser.

        cachepath -- directory holding one file per id, or None to go
                     through PubMed.Dictionary.
        parser    -- object with a parse(handle) method, or None to get
                     the raw text of cached files.
        """
        self.cachepath = cachepath
        self.parser = parser
        if cachepath is None:
            self.PUBMED = PubMed.Dictionary(parser=parser)

    def __getitem__(self, id):
        """Return the record for id.

        Raises KeyError if a cache directory is in use and the file for
        id cannot be opened.
        """
        if self.cachepath is None:
            return self.PUBMED[id]
        # Use the instance attribute, not a module-level global, so the
        # class also works when this file is imported.
        filename = os.path.join(self.cachepath, id)
        try:
            handle = open(filename)
        except IOError:
            raise KeyError("Couldn't find id %s as %s" % (id, filename))
        # Close the file once it has been parsed or read.
        try:
            if self.parser is not None:
                return self.parser.parse(handle)
            return handle.read()
        finally:
            handle.close()


def escape_for_latex(s):
    """Return s with each '-' doubled and each brace backslash-escaped."""
    s = s.replace('-', '--')
    s = s.replace('{', '\\{')
    s = s.replace('}', '\\}')
    return s


class BibtexArticle:
    """Plain container for the fields of one BibTeX @ARTICLE entry.

    Every field is a string and starts out empty.
    """

    def __init__(self):
        self.id = ''
        self.author = ''
        self.title = ''
        self.journal = ''
        self.year = ''
        self.month = ''
        self.volume = ''
        self.pages = ''
        self.issue = ''
        self.abstract = ''
        self.pmid = ''


def write_bibtex_article(article, outhandle=sys.stdout):
    """Write article (a BibtexArticle) to outhandle as an @ARTICLE entry.

    Empty fields are skipped.  ABSTRACT and TITLE are delimited with
    braces, every other field with double quotes.  An empty ANNOTE field
    is always appended.
    """
    fields = [
        ("AUTHOR", article.author),
        ("TITLE", article.title),
        ("JOURNAL", article.journal),
        ("YEAR", article.year),
        ("MONTH", article.month),
        ("VOLUME", article.volume),
        ("NUMBER", article.issue),
        ("PAGES", article.pages),
        ("PMID", article.pmid),
        ("ABSTRACT", article.abstract),
        ]
    use_braces = ("ABSTRACT", "TITLE")

    outhandle.write('@ARTICLE{%s,\n' % article.id)
    for name, value in fields:
        if not value:
            continue
        # NOTE(review): every field goes through escape_for_latex, so a
        # hyphen inside a title or author name is doubled as well --
        # confirm that this is intended.
        value = escape_for_latex(value)
        if name in use_braces:
            outhandle.write('  %s={%s},\n' % (name, value))
        else:
            outhandle.write('  %s="%s",\n' % (name, value))
    outhandle.write('  ANNOTE="",\n')   # Add an empty note field.
    outhandle.write("}\n")


def _format_author(author):
    """Reorder one author string for BibTeX.

    "Smith JA" becomes "JA Smith"; "Smith JA Jr" becomes "Smith, Jr, JA"
    (BibTeX's "Last, Jr, First" form).  A blank string yields ''.
    """
    parts = author.split()
    if len(parts) >= 3 and parts[-1] == 'Jr':
        return "%s, Jr, %s" % (' '.join(parts[:-2]), parts[-2])
    # Move the last token to the front; a single-token name is returned
    # unchanged rather than with a trailing space.
    return ' '.join(parts[-1:] + parts[:-1])


def _strip_trailing_period(text):
    """Return text without a single trailing '.', if it has one."""
    if text[-1:] == '.':
        return text[:-1]
    return text


def _clean_title(title):
    """Return title on one line without decorations.

    Removes, in this order: a trailing period, a trailing
    "[In Process Citation]" marker, enclosing [...] brackets, and a
    trailing period uncovered by the previous steps.  The marker is
    removed before the brackets so that "[Foo] [In Process Citation]"
    is not mistaken for one bracketed title.  Slices are used instead of
    indexing so an empty title is returned as ''.
    """
    title = title.replace("\n", " ")
    title = _strip_trailing_period(title)
    if title.endswith(_IN_PROCESS_SUFFIX):
        title = title[:-len(_IN_PROCESS_SUFFIX)].rstrip()
    if title[:1] == '[' and title[-1:] == ']':
        title = title[1:-1]
    return _strip_trailing_period(title)


def _expand_page_range(pages):
    """Expand an abbreviated page range: "153-64" becomes "153-164".

    A string without '-' is returned unchanged.
    """
    # XXX Need to handle pages "12-24, 25-137" PMID 10575648
    i = pages.find('-')
    if i < 0:
        # find() returns -1 when there is no range to expand.
        return pages
    first, last = pages[:i], pages[i+1:]
    if last and len(first) > len(last):
        # Borrow the missing leading digits from the first page.
        last = first[:-len(last)] + last
    return "%s-%s" % (first, last)


def medline2bibtex(rec):
    """medline2bibtex(rec) -> BibtexArticle

    Build a BibtexArticle from the attributes of rec that are read below
    (authors, title, title_abbreviation, publication_date, volume_issue,
    issue_part_supplement, pagination, pubmed_id, abstract).  The entry
    id is the lower-cased first word of the first author followed by the
    last two characters of the year, or "unknown" plus the year when no
    author is available.
    """
    # need better parsing of date field (XXXX Jan 01)
    bib = BibtexArticle()

    # Convert the author field, skipping blank author entries.
    newauthors = []
    for author in rec.authors:
        newauthor = _format_author(author)
        if newauthor:
            newauthors.append(newauthor)
    bib.author = ' and '.join(newauthors)

    # Convert the title field.
    bib.title = _clean_title(rec.title)

    # Convert the journal field.
    bib.journal = rec.title_abbreviation

    # Convert the year and month fields; both stay empty if the
    # publication date is blank.
    cols = rec.publication_date.split()
    if cols:
        bib.year = cols[0]
    if len(cols) >= 2:
        bib.month = cols[1]

    # Convert the volume field.
    bib.volume = rec.volume_issue

    # Convert the issue field.
    bib.issue = rec.issue_part_supplement

    # Convert the pages.
    bib.pages = _expand_page_range(rec.pagination)

    # Convert the PubMed ID, dropping leading zeros.
    bib.pmid = rec.pubmed_id.lstrip('0')

    # Convert the abstract, collapsing all runs of whitespace.
    bib.abstract = ' '.join(rec.abstract.split())

    # Try and make a reasonable ID.
    author = "unknown"
    if rec.authors:
        words = rec.authors[0].split()
        if words:
            author = words[0].lower()
    year = bib.year[-2:]
    bib.id = "%s%s" % (author, year)

    return bib


def convert_article(rec, outhandle=sys.stdout):
    """Convert rec to BibTeX and write it, plus a blank line, to outhandle."""
    bib = medline2bibtex(rec)
    write_bibtex_article(bib, outhandle)
    outhandle.write("\n")


def convert_records(ids, medline):
    """Look up each id in medline and write it to stdout as BibTeX.

    ids     -- sequence of record ids.
    medline -- object indexable by id (e.g. a RecordRetriever).
    """
    for record_id in ids:
        rec = medline[record_id]
        # Disabled publication-type check, kept for reference:
        ## allowed_pubtypes = ['JOURNAL ARTICLE', 'EDITORIAL', 'OVERALL']
        ## allowed_pubtypes = listfns.asdict(allowed_pubtypes)
        ## pubtypes = [x.upper() for x in rec.publication_types]
        ## # Make sure the record has a known publication type.
        ## for t in pubtypes:
        ##     if allowed_pubtypes.has_key(t):
        ##         break
        ## else:
        ##     print >>sys.stderr, "I don't know how to handle type %s (%s)" % \
        ##           (rec.publication_types, record_id)
        ##     sys.exit(0)
        convert_article(rec)


def read_file(filename):
    """read_file(filename) -> ids

    Return the lines of filename with trailing whitespace removed.
    Blank lines are skipped, since an empty string is not a usable id.
    """
    handle = open(filename)
    try:
        lines = handle.readlines()
    finally:
        handle.close()
    stripped = [x.rstrip() for x in lines]
    return [x for x in stripped if x]


def main():
    """Command-line entry point: parse options, collect ids, convert them."""
    try:
        optlist, args = getopt.getopt(sys.argv[1:], "hc:")
    except getopt.error:
        # sys.exc_info() is used instead of "except ... as" so this also
        # parses on old Python 2 interpreters.
        err = sys.exc_info()[1]
        print(USAGE)
        sys.stderr.write("%s\n" % err)
        sys.exit(2)   # A usage error must not report success.
    if not args:
        print(USAGE)
        sys.exit(0)
    file_or_ids = args

    cachepath = None
    for opt, arg in optlist:
        if opt == '-h':
            print(USAGE)
            sys.exit(0)
        elif opt == '-c':
            cachepath = arg

    # Check to make sure the cachepath path exists and is a directory.
    # Stop here if not: every lookup would fail with a KeyError anyway.
    if cachepath and not os.path.isdir(cachepath):
        sys.stderr.write(
            "'%s' doesn't seem to be a directory.\n" % cachepath)
        sys.exit(1)

    medline = RecordRetriever(cachepath, Medline.RecordParser())

    # An argument naming an existing file is read as a list of ids;
    # anything else is taken to be an id itself.
    ids = []
    for file_or_id in file_or_ids:
        if os.path.exists(file_or_id):
            ids.extend(read_file(file_or_id))
        else:
            ids.append(file_or_id)
    convert_records(ids, medline)

    # Disabled STDIN reader, kept for reference:
    ## # Read STDIN for ID's.
    ## while 1:
    ##     # See if STDIN is ready for reading.
    ##     try:
    ##         inh, outh, errh = select.select([sys.stdin], [], [], 1.0)
    ##     except KeyboardInterrupt:
    ##         print USAGE
    ##         sys.exit(0)
    ##     if not inh:
    ##         # If STDIN isn't ready, then convert whatever I have now.
    ##         if ids:
    ##             ids = []
    ##     else:
    ##         # Read an ID from STDIN.
    ##         line = sys.stdin.readline()
    ##         if not line:
    ##             break
    ##         ids.append(line.rstrip())
    ## # Now download anything that hasn't been downloaded yet.
    ## if ids:
    ##     convert_records(ids, medline)


if __name__ == '__main__':
    main()