#!/usr/bin/env python
# (c) 2006 Eric Willgohs for Stanford University

import sys
import os
import glob
import getopt
import xml.dom.minidom
import shelve

from subprocess import *#IGNORE W0401

try:
    import Bio.PDB
except:
    sys.stderr.write( "This tool requires BioPython and RNAVIEW; installation instructions for which\n" )
    sys.stderr.write( "may be found (at the time of writing) here: http://www.biopython.org\n" )
    sys.stderr.write( "and here: http://ndbserver.rutgers.edu/services/download/index.html\n" )

def getVersion():
    return 0.861
def getMinNeededVersion():
    return 0.86
def getMinUpdatableVersion():
    return 0.0
def getReservedKeys():
    return ['DB_version', 'DB_reservedKeys']
    
def main():
    global rnaDB
    print "rnaDB.py version %5.3f"%getVersion()
    processOptions()
    
    if actionType == -1: # update
        doUpdate()
        return
    elif (rnaDBName in os.listdir(homeDir)):
        rnaDB = shelve.open(rnaDBName)
        if rnaDB.has_key('DB_version'):
            dbVer = rnaDB['DB_version']
        else:
            dbVer = 0.0
        if dbVer < getMinUpdatableVersion():
            sys.stderr.write( "%s format version %4.2f is incompatible with rnaDB.py version %5.3f,\n"%(rnaDBName, dbVer, getVersion() ))
            sys.stderr.write( "and cannot be updated; please save %s under a different name and rebuild.\n"%rnaDBName)
            sys.exit(2)
        elif dbVer < getMinNeededVersion():
            sys.stderr.write( "%s format version %4.2f is an outdated format for rnaDB.py version %5.3f.\n"%(rnaDBName, dbVer, getVersion() ))
            sys.stderr.write( "Please save a copy and then update format by running python rnaDB.py --update\n")
            sys.exit(2)
        else:
            print "database %s format version %5.3f"%(rnaDBName, rnaDB['DB_version'])    
        
    if actionType == 0: # build
        doBuild()
    elif actionType ==1: # extract
        doExtract()
    elif actionType == 2: #inventory
        doInventory()
    elif actionType == 3: #printHelices
        doPrintHelices()
    elif actionType == 4: #audit
        doAudit()
    return # end of main()

def usage():
    sys.stderr.write( "command format:  python rnaDB.py [options] <identifier1> [<identifier2>...]\n" )
    sys.stderr.write( "<options> MUST include either --audit, --build, --extract,  --inventory or --printHelices\n")
    sys.stderr.write( "<identifier> : for --build, a pdb filename, or with the -l option a name of a textfile containing a list of pdb files,\n")
    sys.stderr.write(              "\t\t\t or a wildcard-glob for either.\n" )
    sys.stderr.write( "<identifier> : for --extract, a pdb ID, or pdbID:chains such as 1A34:BC, or with the -l option a name \n")
    sys.stderr.write(              "\t\t\t of a textfile containing a list of such, or a wildcard-glob for such textfiles.\n" )
    sys.stderr.write( "options:\n" )
    sys.stderr.write( "\t--audit: checks existing entries for helix patterns known to need special attention\n" )
    sys.stderr.write( "\t--build: new pdb entries will be added to the rnaDB\n" )
    sys.stderr.write( "\t--extract:  helical (default) or non-helical RNA portions of specified pdbIDs and chains\n" )
    sys.stderr.write(       "\t\t\t  will be extracted to separate pdb files\n" )
    sys.stderr.write( "\t--db=<rna_db_name>: specifies rna_db file name, default \"rna_db.dat\"\n" )
    sys.stderr.write( "\t-h, --help: display this help message.  This tool requires BioPython and RNAVIEW; installation instructions\n" )
    sys.stderr.write( "\t\t\t for which are found here: http://www.biopython.org/docs/install/Installation.html\n" )
    sys.stderr.write( "\t\t\t and here: http://ndbserver.rutgers.edu/services/download/index.html\n" )
    sys.stderr.write( "\t-l : command line identifiers are textfiles containing a whitespace separated list of \n" )
    sys.stderr.write( "\t\t\t identifiers to be processed\n" )
    sys.stderr.write( "\t--tmp=<dir_name>: specifies directory for temporary files, default is \"rnaDB_tmp\"\n" )
    sys.stderr.write( "\t--dest=<dir_name>: specifies directory for extracted pdb files, default is \"rna_pdbs\"\n" )
    sys.stderr.write( "\t--helices: deprecated, replaced with intra_helices, below, for clarity.\n" )
    sys.stderr.write( "\t--intra_helices: only works with --extract, specifies single-stranded (intra-molecular) helical portions are to be extracted\n" )
    sys.stderr.write( "\t--all_helices (default): only works with --extract, specifies all helical portions are to be extracted\n" )
    sys.stderr.write( "\t--inter_helices: only works with --extract, specifies only intermolecular helical portions are to be extracted\n" )
    sys.stderr.write( "\t--invert: only works with --extract, specifies extraction of the portions not meeting helical and/or other requirements\n" )
    sys.stderr.write( "\t--noDL: disables downloading of pdb files from pdb.org\n" )
    sys.stderr.write( "\t--printHelices: prints helix descriptors for helices in specified entries\n" )
    sys.stderr.write( "\t--basePairs: adds inidividual basePair details to --printHelices output\n" )
    sys.stderr.write( "\t--savexml: saves xml files created during build process\n" )
    sys.stderr.write( "\n" )
    return

def processOptions():
    global fromLists, ident_list, homeDir, tmpDir, destDir, rnaDBName, cutoff
    global getSingle, getDouble, invert, local, showBasePairs, actionType, savexml
    
    fromLists = 0
    ident_list = []
    homeDir = os.getcwd()
    tmpDir = "rnaDB_tmp"
    destDir = "rna_pdbs"
    rnaDBName = "rna_db.dat"
    getSingle = 1
    getDouble = 1
    invert = 0
    local = 0
    showBasePairs = 0
    savexml = 0
    
    try:
        opts, args = getopt.getopt(sys.argv[1:], "lh", ["audit", "help", "db=", "tmp=", "build", "extract", 
                                                        "inventory", "helices", "invert", "dest=", "all_helices", 
                                                        "inter_helices", "intra_helices", "noDL", "printHelices", 
                                                        "basePairs", "update", "savexml"])
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    
    optlist = [o for o,a in opts]
    if "--update" in optlist:
        actionType = -1
    elif "--build" in optlist:
        actionType = 0
    elif "--extract" in optlist:
        actionType = 1
    elif "--inventory" in optlist:
        actionType = 2
    elif "--printHelices" in optlist:
        actionType = 3
    elif "--audit" in optlist:
        actionType = 4
    else:
        usage()
        sys.exit(2)
        
    for o, a in opts:
        if o in ["-h", "--help"]:
            usage()
            sys.exit()
        elif o == "-l":
            fromLists = 1
        elif o == "-p":
           cutoff = float(a)
        elif o == "--db=":
            rnaDBName = a
        elif o == "--tmp=":
            tmpDir = a
        elif o == "--dest=":
            destDir = a
        elif o == "--intra_helices":
            getDouble = 0
        elif o == "--inter_helices":
            getSingle = 0
        elif o == "--invert":
            invert = 1
        elif o == "--helices":
            sys.stderr.write( "The '--helices' option is deprecated, please use '--intra_helices' instead.\n" )
            getDouble = 0
        elif o == "--noDL":
            local = 1
        elif o == "--basePairs":
            showBasePairs = 1
        elif o == "--savexml":
            savexml = 1

    if fromLists:
        for f in args:
            for file in glob.glob(f):
                source_fh = open(file)
                lines = source_fh.readlines()
                thisList = [f.strip() for f in lines]
                ident_list.extend(thisList)
                source_fh.close()
    else: #not fromLists
        for f in args:
            templist = glob.glob(f)
            f = f.split(".")[0]
            if ((templist==[])and (((len(f)==4)and(local==0))or(actionType==1))):
                templist = [f]
            ident_list.extend(templist)
    print "expanded identifier list is %s"%ident_list
    return

def doUpdate():
    global rnaDB, ident_list
    if (rnaDBName not in os.listdir(homeDir)):
        sys.stderr.write("RNA DB %s does not exist.\n"%rnaDBName)
        sys.exit(2)
        
    rnaDB = shelve.open(rnaDBName)
    if rnaDB.has_key('DB_version'):
        dbVer = rnaDB['DB_version']
    else:
        dbVer = 0.0
    print "Beginning update of %s from format version %5.3f to %5.3f"%(rnaDBName, dbVer, getVersion())
    if dbVer < getMinUpdatableVersion():
        sys.stderr.write( "%s version %4.2f is incompatible with rnaDB.py version %5.3f,\n"%(rnaDBName, dbVer, getVersion() ))
        sys.stderr.write( "and cannot be updated; please save %s under a different name and rebuild.\n"%rnaDBName)
        sys.exit(2)
    elif dbVer < 0.86:
        rnaDB['DB_reservedKeys'] = getReservedKeys()
        rnaDB.sync()
        ident_list = getAllDBIdents()
        for ident in ident_list:
            if ident==ident.upper():
                continue
            elif (ident.upper() not in ident_list):
                entry = rnaDB[ident]
                entry.pdbID = ident.upper()
                rnaDB[entry.pdbID] = entry
            del rnaDB[ident]
            rnaDB.sync()
            continue
        rnaDB['DB_version'] = 0.86
    rnaDB.close()      
    print "Update complete; running the audit command now is recommended."
    return

def doBuild():
    global rnaDB, ident_list
    if not (tmpDir in os.listdir(homeDir)):
        newTempDir=1
        os.mkdir(tmpDir)
    
    if (rnaDBName in os.listdir(homeDir)):
        print "RNA DB %s already exists; new entries will be added to it or update existing entries."%rnaDBName
    else:
        print "RNA DB %s will be created."%rnaDBName
        rnaDB = shelve.open(rnaDBName)
        rnaDB['DB_version'] = getVersion()
        rnaDB['DB_reservedKeys'] = getReservedKeys()
        rnaDB.sync()

    for file in ident_list:
        print "Processing file %s.  "%file
        newEntry = RNA_DB_Entry(pdbFile=file, tempDir=tmpDir, local=local, savexml=savexml)
        if len(newEntry.rnaChains)>0:
            rnaDB[newEntry.pdbID]=newEntry
            rnaDB.sync()
        print "ident %s finished.\n"%file
    rnaDB.close()
    if os.listdir(tmpDir)==[]:
        os.rmdir(tmpDir)
    print "Build finished.\n"
    return    

def doExtract():
    global rnaDB, ident_list
    if not (rnaDBName in os.listdir(homeDir)):
        sys.stderr.write("RNA DB %s does not exist.\n"%rnaDBName)
        sys.exit(2)
    if not (destDir in os.listdir(homeDir)):
        os.mkdir(destDir)
    print "PDB extractions will be saved to folder %s\n"%destDir
        
    allKeys= getAllDBIdents()
    for ident in ident_list:
        identParts = ident.split(":")
        pdbID = identParts[0].split(".")[0]
        print "Processing identifier %s.  "%pdbID
        if (pdbID in allKeys):
            entry = rnaDB[pdbID]
        elif (pdbID.upper() in allKeys):
            entry = rnaDB[pdbID.upper()]
        elif (pdbID.lower() in allKeys):
            entry = rnaDB[pdbID.lower()]
        else:
            sys.stderr.write("**** RNA DB %s does not have an entry for %s.\n"%(rnaDBName,pdbID))
            continue
        target_helices = []
        if getSingle:
            target_helices.extend(entry.intraMolHelices)
        if getDouble:
            target_helices.extend(entry.interMolHelices)
        if (entry.pdbFile not in os.listdir(os.getcwd())):
                 if (local):
                     sys.stderr.write("**** pdb file %s does not exist in directory %s.\n"%(entry.pdbFile,os.getcwd()))
                 else:
                     try:
                         dlPDB(entry.pdbID)
                     except:
                         sys.stderr.write("**** unable to download pdb file %s to directory %s.\n"%(entry.pdbFile,os.getcwd()))
                         continue
                     entry.pdbFile=entry.pdbID+".pdb"
                     rnaDB[entry.pdbID]=entry
                     rnaDB.sync()
        if len(target_helices) == 0:
            sys.stderr.write("**** %s has no target helices.****\n"%ident)
            continue     
        else:
            s=RNA_DB_Entry.parser.get_structure('unused_ref', entry.pdbFile)
            RNA_DB_Entry.io.set_structure(s)
            firstModel = s.get_list()[0]
            name_split = list(os.path.splitext(entry.pdbFile))
            if len(identParts)==1:
                theseChains="any"
            else:
                if identParts[1]=="__first":
                    theseChains = [entry.naChains[target_helices[0][0][0]-1]]
                else:
                    theseChains=list(identParts[1])
                name_split[0]=name_split[0]+"_chain"+"".join(theseChains)
            os.chdir(destDir)
            if invert:
                name_split[0]=name_split[0]+"_inverted"
                RNA_DB_Entry.io.save(name_split[0]+name_split[1], invertSelect(entry, target_helices,firstModel,theseChains))
            else:
                name_split[0]=name_split[0]+"_helices"
                RNA_DB_Entry.io.save(name_split[0]+name_split[1], helixSelect(entry, target_helices,firstModel,theseChains))
            os.chdir("..")
    rnaDB.close()
    print "Extraction Done."
    return
    
def doInventory():
    global rnaDB, ident_list
    if not (rnaDBName in os.listdir(homeDir)):
        sys.stderr.write("RNA DB %s does not exist.\n"%rnaDBName)
        sys.exit(2)
    
    allKeys= getAllDBIdents()
    if (ident_list == []):
        ident_list = allKeys
    for ident in ident_list:
        identParts = ident.split(":")
        pdbID = identParts[0].split(".")[0]
        print "Processing identifier %s.  "%pdbID
        if (pdbID in allKeys):
            entry = rnaDB[pdbID]
        elif (pdbID.upper() in allKeys):
            entry = rnaDB[pdbID.upper()]
        elif (pdbID.lower() in allKeys):
            entry = rnaDB[pdbID.lower()]
        else:
            sys.stderr.write("**** RNA DB %s does not have an entry for %s.\n"%(rnaDBName,pdbID))
            continue
        ssChains={}
        dsChains={}
        for helix in entry.intraMolHelices:
            tempIndex= helix[0][0]-1
            if ((tempIndex>=0)and (tempIndex<len(entry.naChains))):
                 ssChains[entry.naChains[tempIndex]]=1
            else:
                sys.stderr.write("**** pdb ID %s has anomalous helix reference to na chain %i.\n"%(pdbID,tempIndex))
        for helix in entry.interMolHelices:
            tempIndex= helix[0][0]-1
            if ((tempIndex>=0)and (tempIndex<len(entry.naChains))):
                dsChains[entry.naChains[tempIndex]]=1
            else:
                sys.stderr.write("**** pdb ID %s has anomalous helix reference to na chain %i.\n"%(pdbID,tempIndex))
            tempIndex= helix[1][0]-1
            if ((tempIndex>=0)and (tempIndex<len(entry.naChains))):
                dsChains[entry.naChains[tempIndex]]=1
            else:
                sys.stderr.write("**** pdb ID %s has anomalous helix reference to na chain %i.\n"%(pdbID,tempIndex))
        print "%s has %i single-stranded RNA helices in chains %s and %i intermolecular RNA helices in chains %s."%(pdbID,len(entry.intraMolHelices), "".join(ssChains.keys()),len(entry.interMolHelices), "".join(dsChains.keys()))
    rnaDB.close()
    print "Inventory Done."
    return

def doPrintHelices():
    global rnaDB, ident_list
    if not (rnaDBName in os.listdir(homeDir)):
        sys.stderr.write("RNA DB %s does not exist.\n"%rnaDBName)
        sys.exit(2)
                 
    allKeys= getAllDBIdents()
    if (ident_list == []):
        ident_list = allKeys
                 
    if (showBasePairs):
         print "printing helix descriptors followed by listing of base pairs in the helix, formatted as:"
         print '"pdbID  5\'chainID  5\'resSeq  3\'chainID  3\'resSeq  length" (helix descriptor)'
         print '"pdbID  5\'chainID  5\'resSeq1  3\'chainID  3\'resSeq1 " (basePair 1 descriptor)'
         print '"pdbID  5\'chainID  5\'resSeq2  3\'chainID  3\'resSeq2 " (basePair 2 descriptor)'
         print '<etc.>'
         print '============================================'
    else:
         print "printing helix descriptors, formatted as:"
         print '"pdbID  5\'chainID  5\'resSeq  3\'chainID  3\'resSeq  length" (helix descriptor)'
         print '<etc.>'
         print '============================================'
    
    for ident in ident_list:
        identParts = ident.split(":")
        pdbID = identParts[0].split(".")[0]
        print "Processing identifier %s.  "%pdbID
        if (pdbID in allKeys):
            entry = rnaDB[pdbID]
        elif (pdbID.upper() in allKeys):
            entry = rnaDB[pdbID.upper()]
        elif (pdbID.lower() in allKeys):
            entry = rnaDB[pdbID.lower()]
        else:
            sys.stderr.write("**** RNA DB %s does not have an entry for %s.\n"%(rnaDBName,pdbID))
            continue
        target_helices = []
        if getSingle:
            target_helices.extend(entry.intraMolHelices)
        if getDouble:
            target_helices.extend(entry.interMolHelices)
        if len(target_helices) == 0:
            sys.stderr.write("**** %s has no target helices.****\n"%ident)
            continue     
        else:
            for helix in target_helices:
                ((mol5,pos5),(mol3,pos3), length) = helix
                mol5ID = entry.naChains[mol5-1] 
                mol3ID = entry.naChains[mol3-1] 
                try:
                    #print "raw helix in %s: ((%s, %i), (%s, %i), %i)"%(entry.pdbID, mol5, pos5, mol3, pos3, length) 
                    print "%s %s %i %s %i %i"%(entry.pdbID, mol5ID, entry.nTables[mol5ID][pos5-1], mol3ID, entry.nTables[mol3ID][pos3-1], length)
                except:
                    sys.stderr.write( "helix lookup problem in %s: ((%s, %i), (%s, %i), %i)\n"%(entry.pdbID, mol5, pos5, mol3, pos3, length) )
                    continue
                if (showBasePairs):
                    for bpIndex in range( length ):
                        try:
                            print "%s %s %i %s %i"%( entry.pdbID, mol5ID, entry.nTables[mol5ID][pos5-1+bpIndex], mol3ID, entry.nTables[mol3ID][pos3-1-bpIndex] )
                        except:
                            sys.stderr.write( "basepair lookup problem in %s: basepair %i of helix ((%s, %i), (%s, %i), %i)\n"%( entry.pdbID, bpIndex, mol5, pos5, mol3, pos3, length ) )
                            continue
    rnaDB.close()
    print "printHelices Done."
    return

def doAudit():
    global rnaDB, ident_list
    if not (rnaDBName in os.listdir(homeDir)):
        sys.stderr.write("RNA DB %s does not exist.\n"%rnaDBName)
        sys.exit(2)
                 
    faultyEntries=[]
    allKeys= getAllDBIdents()
    print "auditing helix descriptors"
    if (ident_list == []):
        ident_list = allKeys
    for ident in ident_list:
        identParts = ident.split(":")
        pdbID = identParts[0].split(".")[0]
        #print "Processing identifier %s.  "%pdbID
        if (pdbID in allKeys):
            entry = rnaDB[pdbID]
        elif (pdbID.upper() in allKeys):
            entry = rnaDB[pdbID.upper()]
        elif (pdbID.lower() in allKeys):
            entry = rnaDB[pdbID.lower()]
        else:
            sys.stderr.write("**** RNA DB %s does not have an entry for %s.\n"%(rnaDBName,pdbID))
            continue
        target_helices = []
        if getSingle:
            target_helices.extend(entry.intraMolHelices)
        if getDouble:
            target_helices.extend(entry.interMolHelices)
        if len(target_helices) == 0:
            continue     
        else:
            for helix in target_helices:
                try:
                    debug = 0
                    ((mol5,pos5),(mol3,pos3), length) = helix
                    mol5ID = entry.naChains[mol5-1] 
                    mol3ID = entry.naChains[mol3-1] 
                    mol5Begin = entry.nTables[mol5ID][pos5-1]
                    debug = 1
                    mol5End   = entry.nTables[mol5ID][pos5-1+length-1]
                    debug = 2
                    mol5Dist = mol5End - mol5Begin + 1
                    debug = 3
                    mol3Begin = entry.nTables[mol3ID][pos3-1-length+1]
                    debug = 4
                    mol3End = entry.nTables[mol3ID][pos3-1]
                    debug = 5
                    mol3Dist = mol3End - mol3Begin+1
                    debug = 6
                    if ((mol5Dist!=length) or (mol3Dist!=length)):
                        seq5 = ["%s"%e for e in entry.nTables[mol5ID][pos5-1: pos5-1+length]]
                        debug = 7
                        seq3 = ["%s"%e for e in entry.nTables[mol3ID][pos3-1: pos3-1-length:-1]]
                        debug = 8
                        sys.stderr.write( "discontiguous helix  in %s: ((%s, %i), (%s, %i), %i)\n"%(entry.pdbID, mol5, pos5, mol3, pos3, length) )
                        sys.stderr.write( "\ttranslated as %s %s %i %s %i %i\n"%(entry.pdbID, mol5ID, entry.nTables[mol5ID][pos5-1], mol3ID, entry.nTables[mol3ID][pos3-1], length))
                        sys.stderr.write( "\t5' gap %i resSeq sequence %s\n"%((mol5Dist-length)," ".join(seq5)))
                        sys.stderr.write( "\t3' gap %i resSeq sequence %s\n"%((mol3Dist-length)," ".join(seq3)))
                except:
                    if entry.pdbID not in faultyEntries:
                        faultyEntries.append(entry.pdbID)
                    sys.stderr.write( "****Please rebuild entry %s due to helix lookup problem: ((%s, %i), (%s, %i), %i)\n"%(entry.pdbID, mol5, pos5, mol3, pos3, length) )

    rnaDB.close()
    if faultyEntries != []:
        print "It is highly recommended that you rebuild the following entries: %s"%faultyEntries
    print "Audit Done."
    return

def getAllDBIdents():
    return [key for key in rnaDB.keys() if key not in rnaDB['DB_reservedKeys']]

class RNA_DB_Entry:
    parser = Bio.PDB.PDBParser()
    io= Bio.PDB.PDBIO()
    rcsb_pdb = Bio.config.DBRegistry.CGIDB(
                                         name="rcsb_pdb",
                                         cgi="http://www.pdb.org/pdb/downloadFile.do",
                                         key= "structureId",
                                         params=[("fileFormat","pdb"),("compression","NO")])
    Bio.db.register(rcsb_pdb)
    ourPDB = Bio.db["rcsb_pdb"]
    
    def __init__(self, pdbFile="", pdbID="", tempDir="rnaDB_temp", local=0, savexml = 0):
        self.pdbFile=pdbFile
        self.pdbID=pdbID.upper()
        self.models=0
        self.chains=[]
        self.rnaChains=[]
        self.dnaChains=[]
        self.naChains =[]
        self.intraMolHelices=[]
        self.interMolHelices=[]
        self.sequences={}
        self.nTables={}
        self.family=""
        if pdbFile <> "":
            self.pdbID = os.path.split(pdbFile)[1].split('.')[0].upper()
            self.parsePDB(tempDir=tempDir, local=local, savexml = savexml)
        
    def parsePDB(self, tempDir="rnaDB_temp", local=0, savexml = 0):
        cutoff = 0.80
        localFiles = os.listdir(os.getcwd())
        if ((local==0)and (self.pdbFile not in localFiles)):
            self.pdbFile = self.pdbID+".pdb"
            if (self.pdbFile not in localFiles):
                sys.stderr.write("**** pdb file %s not found locally; attempting to download.\n"%self.pdbFile)
                try:
                    dlPDB(self.pdbID)
                except:
                    sys.stderr.write("**** unable to download pdb %s to directory %s.\n"%(self.pdbID,os.getcwd()))
                    return
        file = self.pdbFile
        pathSplit = os.path.split(file)
        name_split = list(os.path.splitext(pathSplit[1]))
        
        try:
            s=RNA_DB_Entry.parser.get_structure('unused_ref', file)
        except:
            sys.stderr.write("**** unable to parse pdb %s.\n"%(self.pdbID))
            return;
        self.models = len(s.get_list())
        firstModel = s.get_list()[0]
        RNA_DB_Entry.io.set_structure(s)

        if len(s.get_list()) > 1: #more than one model
            name_split[0]=name_split[0]+".model_1"
        thisFile = name_split[0]+".tmp"+name_split[1]
                
        chains = firstModel.get_list()
        self.chains = [ch.get_id() for ch in chains]
        for chain in chains:
            stats = resStats(chain)
            chID = chain.get_id()
            if isRnaviewNA(chain):
                self.naChains.append(chID)
                self.sequences[chID]=""
                if ((stats[0]<2)and (stats[1]>=2)):  
                    self.dnaChains.append(chID)
                else: 
                    self.rnaChains.append(chID)

        if not self.naChains:
            sys.stderr.write("%s has no usable nucleic acid chains.\n"%self.pdbID)
        else:
            os.chdir(tempDir)
            RNA_DB_Entry.io.save(thisFile, mySelect(firstModel,"any"))
            p2 = os.system("rnaview -p %s > tmp.out > tmp.err"%thisFile)
            xmlFile=thisFile+".xml"
    
            if not (xmlFile in os.listdir(os.getcwd())):
                sys.stderr.write( "rnaview has not produced xml for %s; likely cause is absence of basepairs\n"%self.pdbID )
            else:
                doc=xml.dom.minidom.parse(xmlFile)
                numSHelices=0
                numDHelices=0
                ends=['base-id-5p','base-id-3p']
                for node in doc.getElementsByTagName('molecule')+doc.getElementsByTagName('interactions'):
                    if node.localName=="molecule":
                        molID = int(node.getAttribute("id"))
                        try:
                            chID = self.naChains[molID-1]
                        except:
                            sys.stderr.write("error looking up mol # %i in na Chains %s"%(molID, self.naChains))
                        if (chID in self.rnaChains) or (chID in self.dnaChains):
                            print "getting sequence for chain %s of %s"%(chID,self.pdbID)
                            self.sequences[chID],self.nTables[chID]=getSequenceInfo(node, pdbID=self.pdbID)
                        if not (chID in self.rnaChains):
                            continue #to next molecule or interaction node
                    for helix in node.getElementsByTagName('helix'):
                        bases = [helix.getElementsByTagName(e)[0].getElementsByTagName('base-id')[0] for e in ends]
                        pos5,pos3 = map(int,[GetText(b.getElementsByTagName('position')[0]) for b in bases])
                        length=int(GetText(helix.getElementsByTagName('length')[0]))
                        if node.localName=="molecule":
                            numSHelices+=1
                            self.intraMolHelices.append(((molID,pos5),(molID,pos3), length))
                        else:
                            mol5,mol3 = map(int,[b.getElementsByTagName('molecule-id')[0].getAttribute("ref") for b in bases])
                            numDHelices+=1
                            self.interMolHelices.append(((mol5,pos5),(mol3,pos3), length))
                print "%s [%s] has %i single stranded NA helices and %i intermolecular NA helices."%(file,self.pdbID,numSHelices,numDHelices)
            
            removeList = [thisFile, thisFile+".ps", thisFile+".out", "tmp.out", "tmp.err", "base_pair_statistics.out"]
            if (not(savexml)):
                removeList.add(xmlFile)
            localFiles = os.listdir(os.getcwd())
            for oldFile in removeList:
                if oldFile in localFiles:
                    os.remove(oldFile)
            os.chdir("..")
#end def Class rnaDB_Entry
        
def GetText(firstnode):
    text = ""
    nodelist = [firstnode]
    for node in nodelist:
        if node.nodeType == node.TEXT_NODE:
            text += node.data
        else:
            nodelist.extend(node.childNodes)
    return text.strip()

def getSequenceInfo(node, pdbID="unknown"):
    sequences = node.getElementsByTagName('sequence')
    if len(sequences)<>1:
        sys.stderr.write("molecule %i of file %s unexpectedly has %i sequence entries.\n"%(node.getAttribute("id"),pdbID,len(sequences)))

    #get the sequence & any modifications
    sd=sequences[0].getElementsByTagName('seq-data')[0]
    thisSeq = list([n.data.replace(' ','').replace('\n','') for n in sd.childNodes if n.nodeType==n.TEXT_NODE][0])
    seqMods = sequences[0].getElementsByTagName('modification')
    for mod in seqMods:
        modPos=int(GetText(mod.getElementsByTagName('position')[0]))
        thisSeq[modPos-1]=GetText(mod.getElementsByTagName('modified-type')[0])

    #verify the numbering range used by RNAMLView is as expected 
    Nrange = sequences[0].getElementsByTagName('numbering-system')[0].getElementsByTagName('numbering-range')[0]
    if (GetText(Nrange.getElementsByTagName('start')[0])<>"1") or int(GetText(Nrange.getElementsByTagName('end')[0]))<>len(thisSeq):
        sys.stderr.write("Unexpected numbering range for molecule %i of file %s.\n"%(node.getAttribute("id"),pdbID))
        
    #get the numbering table for translating into pdb residue ID's
    nTable = sequences[0].getElementsByTagName('numbering-table')[0]
    nTableList = map(int, list([n.data.split() for n in nTable.childNodes if n.nodeType==n.TEXT_NODE][0]))
    if len(nTableList)<>len(thisSeq):
        sys.stderr.write("Unexpected numbering table for molecule %i of file %s.\n"%(node.getAttribute("id"),pdbID))
    return (thisSeq,nTableList)

def isRNA(stats, cutoff):
    # probably don't need to allow so many hetatms
    # probably should also check for res.get_resname() being in list of 
    # regular and modified rna types
    (rnaCount, dnaCount, pepCount, hetCount) = stats
    chLen=rnaCount + dnaCount + pepCount + hetCount
#    if ((rnaCount >= 2) and (rnaCount > hetCount) and (rnaCount > (float(chLen)-hetCount)*cutoff)):
    if (rnaCount >= 2):
        return 1
    else:
        return 0

def isRnaviewNA(ch):
#===============================================================================
# this routine performs the same analysis as does rnaview in fpair_sub.c
#===============================================================================
    naCount = 0
    for res in ch.get_list():
            if ((res.has_id('N1'))and(res.has_id('C2'))and(res.has_id('C6'))):
                N1 = res['N1']
                C2 = res['C2']
                C6 = res['C6']
                if (((N1-C2)<2.0)and((N1-C6)<2.0)and((C2-C6)<3.0)):
                     naCount += 1
    if (naCount>1):
        return True
    else:
        return False
                     
def resStats(ch):

    rnaCount = 0
    dnaCount = 0
    pepCount = 0
    hetCount = 0
    for res in ch.get_list():
            atomList = [a.get_name() for a in res.get_list()]
            if "O2*" in atomList:
                rnaCount += 1
            elif "CA" in atomList:
                pepCount += 1
            elif "O2" in atomList: 
                dnaCount += 1
            else:
                hetCount += 1

    print "chain %s, %i total residues: %i rna's, %i dna's, %i peptides, %i hetatms."\
            %(ch.get_id(), len(ch), rnaCount, dnaCount, pepCount, hetCount)
    return (rnaCount, dnaCount, pepCount, hetCount)

class mySelect(Bio.PDB.Select):
    
    def __init__(self, mod, ch):
        self.myModel= mod
        self.myChain= ch
        
    def accept_model(self, model):
        if (self.myModel=="any"):
            return 1
        elif (self.myModel == model):
            return 1
        else:
            return 0
        
    def accept_chain(self, chain):
        if (self.myChain=="any"):
            return 1
        elif chain == self.myChain:
            return 1
        else:
            return 0
        
class helixSelect(Bio.PDB.Select):
    
    def __init__(self, entry, helices, mod="any", chList="any"):
        self.myEntry= entry
        self.myModel= mod
        self.myChains= chList
        self.myHelices= helices
        
    def accept_model(self, model):
        if (self.myModel=="any"):
            return 1
        elif (self.myModel == model):
            return 1
        else:
            return 0
        
    def accept_chain(self, chain):
        if (self.myChains=="any"):
            return 1
        elif chain.get_id() in self.myChains:
            return 1
        else:
            return 0

    def accept_residue(self, res):
        for helix in self.myHelices:
            try:
                debugPoint=1
                resMol = res.get_parent().get_id()
                debugPoint=2
                ((mol5,pos5),(mol3,pos3), length) = helix
                debugPoint=3
                mol5ID= self.myEntry.naChains[mol5-1]
                debugPoint=4
                mol3ID= self.myEntry.naChains[mol3-1]
                debugPoint=5
            except:
                sys.stderr.write("helix lookup problem\n")
                sys.stderr.write("helix ((mol5,pos5),(mol3,pos3), length): ((%i,%i),(%i,%i),%i)\n"%(mol5,pos5,mol3,pos3,length))
                sys.stderr.write("na chains: %s\n"%self.myEntry.nTables.keys())
                sys.stderr.write("resMol, mol5ID, mol3ID: %s, %s, %s\n"%(resMol,mol5ID,mol3ID))
                sys.stderr.write("debugPoint: %i\n"%debugPoint)
                return 0
            if ((resMol!=mol5ID) and (resMol!=mol3ID)):
                continue
            
            try:
                debugPoint=1
                thisNTable = self.myEntry.nTables[resMol]
                debugPoint=2
                thisResSeq = res.get_id()[1]
                debugPoint=3
                if thisResSeq in thisNTable:
                    resPos = thisNTable.index( thisResSeq )+1
                    debugPoint=4
                    if resMol == mol5ID:
                        if (pos5 <= resPos) and ( pos5+length > resPos):
                            return 1
                    if resMol == mol3ID:
                        if (pos3 >= resPos) and ( pos3-length < resPos):
                            return 1
                else: 
                # this is known to occur for residues with too few atom entries in the pdb file to satisfy 
                # rnaview, and for hetatm groups assigned an rna chain in the pdb file, but which
                # rnaview does not consider to be rna residues (such as a sulfate group)
                    sys.stdout.write( "*** the following residue will be ignored under extraction: chain %s residue %s\n"%(resMol,res) )
                    return 0
                     
            except:
                sys.stderr.write( "residue lookup problem\n" )
                sys.stderr.write( "debugPoint: %i\n"%debugPoint )
                sys.stderr.write( "helix ((mol5,pos5),(mol3,pos3), length): ((%i,%i),(%i,%i),%i)\n"%( mol5, pos5, mol3, pos3, length ) )
                sys.stderr.write( "res mol ID: %s\n"%repr(resMol) )
                sys.stderr.write( "nTable keys: %s\n"%self.myEntry.nTables.keys() )
                sys.stderr.write( "nTable: %s\n"%thisNTable )
                sys.stderr.write( "res Seq: %s\n"%thisResSeq )
                return 0
        #all helices done, no match found
        return 0

    def getResPos(self, molID, res):
        return resPos 

class invertSelect(helixSelect):

    def accept_residue(self, res):
        return not helixSelect.accept_residue(self,res)

def dlPDB(pdbID):
             
    thisPDB=RNA_DB_Entry.ourPDB[pdbID]
    pdblines=thisPDB.readlines()
    tempfile = open(pdbID+".pdb", 'w')
    tempfile.writelines(pdblines)
    tempfile.close()
                     
#===============================================================================
#Main
"""

"""
if __name__ == "__main__":
    main()
    
