#!/usr/bin/env python

import os
import sys
import getopt
import select

from Bio import Medline
from Bio import PubMed
from Bio import listfns

# Usage/help text.  The script prints it for -h, when option parsing
# fails, and when no file or id arguments are given.
USAGE = \
"""medlars2bibtex [-h] [-c cachepath] [file_or_id...]

Reformat MEDLARS format PubMed articles into bibtex.  Takes a list of
filenames or ids to reformat.  The files must contain PubMed UID's or
PMID's, one per line.


OPTIONS

-h              Print this usage message.

-c cachepath    This should point to a directory where MEDLARS-formatted
                articles are stored.  The filename of each article should
                be its ID.

"""

class RecordRetriever:
    """Dictionary-like lookup of article records by ID.

    Without a cachepath, lookups are delegated to a PubMed.Dictionary
    created with the given parser.  With a cachepath, the record for an
    ID is read from the file <cachepath>/<id>.
    """
    def __init__(self, cachepath=None, parser=None):
        """Create a retriever.

        cachepath -- directory holding one file per article, named by
                     its ID, or None to look records up through
                     PubMed.Dictionary.
        parser    -- optional object with a parse(handle) method; when
                     None, cached files are returned as raw text.
        """
        self.cachepath = cachepath
        self.parser = parser
        if cachepath is None:
            self.PUBMED = PubMed.Dictionary(parser=parser)

    def __getitem__(self, id):
        """Return the record stored under id.

        Raises KeyError if a cachepath is in use and the file for id
        cannot be opened.
        """
        if self.cachepath is None:
            return self.PUBMED[id]
        # Use the instance's own cachepath; relying on a module-level
        # variable of the same name only works when run as a script.
        filename = os.path.join(self.cachepath, id)
        try:
            handle = open(filename)
        except IOError:
            raise KeyError("Couldn't find id %s as %s" % (id, filename))
        # Always release the file, whether parsing succeeds or fails.
        try:
            if self.parser is not None:
                return self.parser.parse(handle)
            return handle.read()
        finally:
            handle.close()

def escape_for_latex(s):
    """escape_for_latex(s) -> string

    Return s with every '-' doubled to '--' and every '{' and '}'
    preceded by a backslash.
    """
    s = s.replace('-', '--')
    # The backslashes are written as '\\' because '\{' and '\}' are not
    # valid string escape sequences.
    s = s.replace('{', '\\{')
    s = s.replace('}', '\\}')
    return s

class BibtexArticle:
    """Simple holder for the fields of one bibtex article entry.

    Every field is created as an empty string.
    """
    def __init__(self):
        for field in ('id', 'author', 'title', 'journal', 'year', 'month',
                      'volume', 'pages', 'issue', 'abstract', 'pmid'):
            setattr(self, field, '')

def write_bibtex_article(article, outhandle=sys.stdout):
    """Write article to outhandle as one bibtex @ARTICLE entry.

    article   -- object with the attributes id, author, title, journal,
                 year, month, volume, issue, pages, pmid and abstract
                 (for example a BibtexArticle).
    outhandle -- object with a write() method; defaults to sys.stdout.

    Fields whose value is empty are left out.  Every value is passed
    through escape_for_latex before being written.  An empty ANNOTE
    field is always appended.
    """
    fields = [
        ("AUTHOR", article.author),
        ("TITLE", article.title),
        ("JOURNAL", article.journal),
        ("YEAR", article.year),
        ("MONTH", article.month),
        ("VOLUME", article.volume),
        ("NUMBER", article.issue),
        ("PAGES", article.pages),
        ("PMID", article.pmid),
        ("ABSTRACT", article.abstract),
        ]
    # These fields are delimited with {...}; all others with "...".
    use_braces = ("ABSTRACT", "TITLE")

    outhandle.write('@ARTICLE{%s,\n' % article.id)
    for name, value in fields:
        if not value:
            continue
        value = escape_for_latex(value)
        if name in use_braces:
            outhandle.write('  %s={%s},\n' % (name, value))
        else:
            outhandle.write('  %s="%s",\n' % (name, value))
    outhandle.write('  ANNOTE="",\n')  # Add an empty note field.
    outhandle.write("}\n")

def _bibtex_author(author):
    """Convert one 'Last Initials [Jr]' author string to bibtex form."""
    parts = author.split()
    if len(parts) >= 3 and parts[-1] == 'Jr':
        # bibtex expects "Last, Jr, First" when a suffix is present.
        return "%s, Jr, %s" % (' '.join(parts[:-2]), parts[-2])
    # "First Last"; strip() drops the stray space for one-word names.
    return ("%s %s" % (parts[-1], ' '.join(parts[:-1]))).strip()


def _bibtex_title(title):
    """Strip newlines, enclosing [...], end period and citation marker."""
    marker = '[In Process Citation]'
    title = title.replace("\n", " ")
    # Slices instead of indexing so an empty title does not raise.
    if title[:1] == '[' and title[-1:] == ']':  # Strip [...]
        title = title[1:-1]
    if title[-1:] == '.':                       # Strip end period
        title = title[:-1]
    if title[-len(marker):] == marker:          # Strip the marker
        title = title[:-len(marker)].rstrip()
    return title


def _bibtex_pages(pages):
    """Expand an abbreviated last page, e.g. '153-64' -> '153-164'."""
    i = pages.find('-')
    if i >= 0:
        first, last = pages[:i], pages[i+1:]
        if 0 < len(last) < len(first):
            # Borrow the missing leading digits from the first page.
            last = first[:len(first) - len(last)] + last
            pages = "%s-%s" % (first, last)
    return pages


def medline2bibtex(rec):
    """medline2bibtex(rec) -> BibtexArticle

    rec must provide the attributes authors (list of strings), title,
    title_abbreviation, publication_date, volume_issue,
    issue_part_supplement, pagination, pubmed_id and abstract.

    The entry id is the lower-cased first word of the first author
    ("unknown" if there is none) followed by the last two characters
    of the year.
    """
    # need better parsing of date field (XXXX Jan 01)
    bib = BibtexArticle()

    # Convert the author field, ignoring blank author entries.
    authors = [a for a in rec.authors if a.split()]
    bib.author = ' and '.join([_bibtex_author(a) for a in authors])

    # Convert the title field.
    bib.title = _bibtex_title(rec.title)

    # Convert the journal field.
    bib.journal = rec.title_abbreviation

    # Convert the year and month fields.  An empty date leaves both
    # fields at their empty defaults.
    cols = rec.publication_date.split()
    if cols:
        bib.year = cols[0]
    if len(cols) >= 2:
        bib.month = cols[1]

    # Convert the volume field.
    bib.volume = rec.volume_issue

    # Convert the issue field.
    bib.issue = rec.issue_part_supplement

    # Convert the pages.
    # XXX Need to handle pages "12-24, 25-137" PMID 10575648
    bib.pages = _bibtex_pages(rec.pagination)

    # Convert the PubMed ID, dropping leading zeros.
    bib.pmid = rec.pubmed_id.lstrip('0')

    # Convert the abstract, collapsing runs of whitespace.
    bib.abstract = ' '.join(rec.abstract.split())

    # Try and make a reasonable ID.
    if authors:
        author = authors[0].split()[0].lower()
    else:
        author = "unknown"
    year = bib.year[-2:]
    bib.id = "%s%s" % (author, year)

    return bib

def convert_article(rec, outhandle=sys.stdout):
    """Write rec to outhandle as a bibtex entry followed by a blank line.

    rec is converted with medline2bibtex and written with
    write_bibtex_article.
    """
    write_bibtex_article(medline2bibtex(rec), outhandle)
    outhandle.write("\n")
    
def convert_records(ids, medline):
    """Convert each record named in ids with convert_article.

    ids     -- iterable of record IDs.
    medline -- object indexed by ID (for example a RecordRetriever)
               that returns the record to convert.

    Output goes to convert_article's default handle.
    """
    # NOTE: a publication-type check used to run here.  It accepted only
    # 'JOURNAL ARTICLE', 'EDITORIAL' and 'OVERALL' and exited on any
    # other type.  It is disabled, so every record is converted.
    for record_id in ids:
        convert_article(medline[record_id])

def read_file(filename):
    """read_file(filename) -> ids

    Read one ID per line from filename.  Surrounding whitespace is
    removed from each line and blank lines are skipped.
    """
    handle = open(filename)
    # Close the file even if reading fails.
    try:
        lines = handle.readlines()
    finally:
        handle.close()
    stripped = [line.strip() for line in lines]
    return [x for x in stripped if x]

# Script entry point: parse the options, collect the IDs named on the
# command line (directly or through files) and convert them.
if __name__ == '__main__':
    try:
        optlist, args = getopt.getopt(sys.argv[1:], "hc:")
    except getopt.error:
        # sys.exc_info() fetches the exception without version-specific
        # "except ..., x" / "except ... as x" syntax.
        sys.stdout.write(USAGE + "\n")
        sys.stderr.write("%s\n" % (sys.exc_info()[1],))
        sys.exit(1)    # Bad options are an error, not a success.

    if not args:
        sys.stdout.write(USAGE + "\n")
        sys.exit(0)

    file_or_ids = args
    cachepath = None
    for opt, arg in optlist:
        if opt == '-h':
            sys.stdout.write(USAGE + "\n")
            sys.exit(0)
        elif opt == '-c':
            cachepath = arg

    # Make sure the cachepath is a directory.  Stop here if it is not:
    # otherwise every lookup below would fail.
    if cachepath and not os.path.isdir(cachepath):
        sys.stderr.write(
            "'%s' doesn't seem to be a directory.\n" % cachepath)
        sys.exit(1)

    medline = RecordRetriever(cachepath, Medline.RecordParser())

    ids = []
    for file_or_id in file_or_ids:
        # Only regular files are read as ID lists; anything else
        # (including a directory of that name) is taken as an ID.
        if os.path.isfile(file_or_id):
            ids.extend(read_file(file_or_id))
        else:
            ids.append(file_or_id)
    convert_records(ids, medline)
            
##    # Read STDIN for ID's.
##    while 1:
##        # See if STDIN is ready for reading.
##        try:
##            inh, outh, errh = select.select([sys.stdin], [], [], 1.0)
##        except KeyboardInterrupt:
##            print USAGE
##            sys.exit(0)
##        if not inh:
##            # If STDIN isn't ready, then convert whatever I have now.
##            if ids:
##                ids = []
##        else:
##            # Read an ID from STDIN.
##            line = sys.stdin.readline()
##            if not line:
##                break
##            ids.append(line.rstrip())
##    # Now download anything that hasn't been downloaded yet.
##    if ids:
##        convert_records(ids, medline)