#!/usr/bin/env python # (c) 2006 Eric Willgohs for Stanford University import sys import os import glob import getopt import xml.dom.minidom import shelve from subprocess import *#IGNORE W0401 try: import Bio.PDB except: sys.stderr.write( "This tool requires BioPython and RNAVIEW; installation instructions for which\n" ) sys.stderr.write( "may be found (at the time of writing) here: http://www.biopython.org\n" ) sys.stderr.write( "and here: http://ndbserver.rutgers.edu/services/download/index.html\n" ) def getVersion(): return 0.861 def getMinNeededVersion(): return 0.86 def getMinUpdatableVersion(): return 0.0 def getReservedKeys(): return ['DB_version', 'DB_reservedKeys'] def main(): global rnaDB print "rnaDB.py version %5.3f"%getVersion() processOptions() if actionType == -1: # update doUpdate() return elif (rnaDBName in os.listdir(homeDir)): rnaDB = shelve.open(rnaDBName) if rnaDB.has_key('DB_version'): dbVer = rnaDB['DB_version'] else: dbVer = 0.0 if dbVer < getMinUpdatableVersion(): sys.stderr.write( "%s format version %4.2f is incompatible with rnaDB.py version %5.3f,\n"%(rnaDBName, dbVer, getVersion() )) sys.stderr.write( "and cannot be updated; please save %s under a different name and rebuild.\n"%rnaDBName) sys.exit(2) elif dbVer < getMinNeededVersion(): sys.stderr.write( "%s format version %4.2f is an outdated format for rnaDB.py version %5.3f.\n"%(rnaDBName, dbVer, getVersion() )) sys.stderr.write( "Please save a copy and then update format by running python rnaDB.py --update\n") sys.exit(2) else: print "database %s format version %5.3f"%(rnaDBName, rnaDB['DB_version']) if actionType == 0: # build doBuild() elif actionType ==1: # extract doExtract() elif actionType == 2: #inventory doInventory() elif actionType == 3: #printHelices doPrintHelices() elif actionType == 4: #audit doAudit() return # end of main() def usage(): sys.stderr.write( "command format: python rnaDB.py [options] [...]\n" ) sys.stderr.write( " MUST include either --audit, --build, --extract, --inventory or --printHelices\n") sys.stderr.write( " : for --build, a pdb filename, or with the -l option a name of a textfile containing a list of pdb files,\n") sys.stderr.write( "\t\t\t or a wildcard-glob for either.\n" ) sys.stderr.write( " : for --extract, a pdb ID, or pdbID:chains such as 1A34:BC, or with the -l option a name \n") sys.stderr.write( "\t\t\t of a textfile containing a list of such, or a wildcard-glob for such textfiles.\n" ) sys.stderr.write( "options:\n" ) sys.stderr.write( "\t--audit: checks existing entries for helix patterns known to need special attention\n" ) sys.stderr.write( "\t--build: new pdb entries will be added to the rnaDB\n" ) sys.stderr.write( "\t--extract: helical (default) or non-helical RNA portions of specified pdbIDs and chains\n" ) sys.stderr.write( "\t\t\t will be extracted to separate pdb files\n" ) sys.stderr.write( "\t--db=: specifies rna_db file name, default \"rna_db.dat\"\n" ) sys.stderr.write( "\t-h, --help: display this help message. This tool requires BioPython and RNAVIEW; installation instructions\n" ) sys.stderr.write( "\t\t\t for which are found here: http://www.biopython.org/docs/install/Installation.html\n" ) sys.stderr.write( "\t\t\t and here: http://ndbserver.rutgers.edu/services/download/index.html\n" ) sys.stderr.write( "\t-l : command line identifiers are textfiles containing a whitespace separated list of \n" ) sys.stderr.write( "\t\t\t identifiers to be processed\n" ) sys.stderr.write( "\t--tmp=: specifies directory for temporary files, default is \"rnaDB_tmp\"\n" ) sys.stderr.write( "\t--dest=: specifies directory for extracted pdb files, default is \"rna_pdbs\"\n" ) sys.stderr.write( "\t--helices: deprecated, replaced with intra_helices, below, for clarity.\n" ) sys.stderr.write( "\t--intra_helices: only works with --extract, specifies single-stranded (intra-molecular) helical portions are to be extracted\n" ) sys.stderr.write( "\t--all_helices (default): only works with --extract, specifies all helical portions are to be extracted\n" ) sys.stderr.write( "\t--inter_helices: only works with --extract, specifies only intermolecular helical portions are to be extracted\n" ) sys.stderr.write( "\t--invert: only works with --extract, specifies extraction of the portions not meeting helical and/or other requirements\n" ) sys.stderr.write( "\t--noDL: disables downloading of pdb files from pdb.org\n" ) sys.stderr.write( "\t--printHelices: prints helix descriptors for helices in specified entries\n" ) sys.stderr.write( "\t--basePairs: adds inidividual basePair details to --printHelices output\n" ) sys.stderr.write( "\t--savexml: saves xml files created during build process\n" ) sys.stderr.write( "\n" ) return def processOptions(): global fromLists, ident_list, homeDir, tmpDir, destDir, rnaDBName, cutoff global getSingle, getDouble, invert, local, showBasePairs, actionType, savexml fromLists = 0 ident_list = [] homeDir = os.getcwd() tmpDir = "rnaDB_tmp" destDir = "rna_pdbs" rnaDBName = "rna_db.dat" getSingle = 1 getDouble = 1 invert = 0 local = 0 showBasePairs = 0 savexml = 0 try: opts, args = getopt.getopt(sys.argv[1:], "lh", ["audit", "help", "db=", "tmp=", "build", "extract", "inventory", "helices", "invert", "dest=", "all_helices", "inter_helices", "intra_helices", "noDL", "printHelices", "basePairs", "update", "savexml"]) except getopt.GetoptError: usage() sys.exit(2) optlist = [o for o,a in opts] if "--update" in optlist: actionType = -1 elif "--build" in optlist: actionType = 0 elif "--extract" in optlist: actionType = 1 elif "--inventory" in optlist: actionType = 2 elif "--printHelices" in optlist: actionType = 3 elif "--audit" in optlist: actionType = 4 else: usage() sys.exit(2) for o, a in opts: if o in ["-h", "--help"]: usage() sys.exit() elif o == "-l": fromLists = 1 elif o == "-p": cutoff = float(a) elif o == "--db=": rnaDBName = a elif o == "--tmp=": tmpDir = a elif o == "--dest=": destDir = a elif o == "--intra_helices": getDouble = 0 elif o == "--inter_helices": getSingle = 0 elif o == "--invert": invert = 1 elif o == "--helices": sys.stderr.write( "The '--helices' option is deprecated, please use '--intra_helices' instead.\n" ) getDouble = 0 elif o == "--noDL": local = 1 elif o == "--basePairs": showBasePairs = 1 elif o == "--savexml": savexml = 1 if fromLists: for f in args: for file in glob.glob(f): source_fh = open(file) lines = source_fh.readlines() thisList = [f.strip() for f in lines] ident_list.extend(thisList) source_fh.close() else: #not fromLists for f in args: templist = glob.glob(f) f = f.split(".")[0] if ((templist==[])and (((len(f)==4)and(local==0))or(actionType==1))): templist = [f] ident_list.extend(templist) print "expanded identifier list is %s"%ident_list return def doUpdate(): global rnaDB, ident_list if (rnaDBName not in os.listdir(homeDir)): sys.stderr.write("RNA DB %s does not exist.\n"%rnaDBName) sys.exit(2) rnaDB = shelve.open(rnaDBName) if rnaDB.has_key('DB_version'): dbVer = rnaDB['DB_version'] else: dbVer = 0.0 print "Beginning update of %s from format version %5.3f to %5.3f"%(rnaDBName, dbVer, getVersion()) if dbVer < getMinUpdatableVersion(): sys.stderr.write( "%s version %4.2f is incompatible with rnaDB.py version %5.3f,\n"%(rnaDBName, dbVer, getVersion() )) sys.stderr.write( "and cannot be updated; please save %s under a different name and rebuild.\n"%rnaDBName) sys.exit(2) elif dbVer < 0.86: rnaDB['DB_reservedKeys'] = getReservedKeys() rnaDB.sync() ident_list = getAllDBIdents() for ident in ident_list: if ident==ident.upper(): continue elif (ident.upper() not in ident_list): entry = rnaDB[ident] entry.pdbID = ident.upper() rnaDB[entry.pdbID] = entry del rnaDB[ident] rnaDB.sync() continue rnaDB['DB_version'] = 0.86 rnaDB.close() print "Update complete; running the audit command now is recommended." return def doBuild(): global rnaDB, ident_list if not (tmpDir in os.listdir(homeDir)): newTempDir=1 os.mkdir(tmpDir) if (rnaDBName in os.listdir(homeDir)): print "RNA DB %s already exists; new entries will be added to it or update existing entries."%rnaDBName else: print "RNA DB %s will be created."%rnaDBName rnaDB = shelve.open(rnaDBName) rnaDB['DB_version'] = getVersion() rnaDB['DB_reservedKeys'] = getReservedKeys() rnaDB.sync() for file in ident_list: print "Processing file %s. "%file newEntry = RNA_DB_Entry(pdbFile=file, tempDir=tmpDir, local=local, savexml=savexml) if len(newEntry.rnaChains)>0: rnaDB[newEntry.pdbID]=newEntry rnaDB.sync() print "ident %s finished.\n"%file rnaDB.close() if os.listdir(tmpDir)==[]: os.rmdir(tmpDir) print "Build finished.\n" return def doExtract(): global rnaDB, ident_list if not (rnaDBName in os.listdir(homeDir)): sys.stderr.write("RNA DB %s does not exist.\n"%rnaDBName) sys.exit(2) if not (destDir in os.listdir(homeDir)): os.mkdir(destDir) print "PDB extractions will be saved to folder %s\n"%destDir allKeys= getAllDBIdents() for ident in ident_list: identParts = ident.split(":") pdbID = identParts[0].split(".")[0] print "Processing identifier %s. "%pdbID if (pdbID in allKeys): entry = rnaDB[pdbID] elif (pdbID.upper() in allKeys): entry = rnaDB[pdbID.upper()] elif (pdbID.lower() in allKeys): entry = rnaDB[pdbID.lower()] else: sys.stderr.write("**** RNA DB %s does not have an entry for %s.\n"%(rnaDBName,pdbID)) continue target_helices = [] if getSingle: target_helices.extend(entry.intraMolHelices) if getDouble: target_helices.extend(entry.interMolHelices) if (entry.pdbFile not in os.listdir(os.getcwd())): if (local): sys.stderr.write("**** pdb file %s does not exist in directory %s.\n"%(entry.pdbFile,os.getcwd())) else: try: dlPDB(entry.pdbID) except: sys.stderr.write("**** unable to download pdb file %s to directory %s.\n"%(entry.pdbFile,os.getcwd())) continue entry.pdbFile=entry.pdbID+".pdb" rnaDB[entry.pdbID]=entry rnaDB.sync() if len(target_helices) == 0: sys.stderr.write("**** %s has no target helices.****\n"%ident) continue else: s=RNA_DB_Entry.parser.get_structure('unused_ref', entry.pdbFile) RNA_DB_Entry.io.set_structure(s) firstModel = s.get_list()[0] name_split = list(os.path.splitext(entry.pdbFile)) if len(identParts)==1: theseChains="any" else: if identParts[1]=="__first": theseChains = [entry.naChains[target_helices[0][0][0]-1]] else: theseChains=list(identParts[1]) name_split[0]=name_split[0]+"_chain"+"".join(theseChains) os.chdir(destDir) if invert: name_split[0]=name_split[0]+"_inverted" RNA_DB_Entry.io.save(name_split[0]+name_split[1], invertSelect(entry, target_helices,firstModel,theseChains)) else: name_split[0]=name_split[0]+"_helices" RNA_DB_Entry.io.save(name_split[0]+name_split[1], helixSelect(entry, target_helices,firstModel,theseChains)) os.chdir("..") rnaDB.close() print "Extraction Done." return def doInventory(): global rnaDB, ident_list if not (rnaDBName in os.listdir(homeDir)): sys.stderr.write("RNA DB %s does not exist.\n"%rnaDBName) sys.exit(2) allKeys= getAllDBIdents() if (ident_list == []): ident_list = allKeys for ident in ident_list: identParts = ident.split(":") pdbID = identParts[0].split(".")[0] print "Processing identifier %s. "%pdbID if (pdbID in allKeys): entry = rnaDB[pdbID] elif (pdbID.upper() in allKeys): entry = rnaDB[pdbID.upper()] elif (pdbID.lower() in allKeys): entry = rnaDB[pdbID.lower()] else: sys.stderr.write("**** RNA DB %s does not have an entry for %s.\n"%(rnaDBName,pdbID)) continue ssChains={} dsChains={} for helix in entry.intraMolHelices: tempIndex= helix[0][0]-1 if ((tempIndex>=0)and (tempIndex=0)and (tempIndex=0)and (tempIndex' print '============================================' else: print "printing helix descriptors, formatted as:" print '"pdbID 5\'chainID 5\'resSeq 3\'chainID 3\'resSeq length" (helix descriptor)' print '' print '============================================' for ident in ident_list: identParts = ident.split(":") pdbID = identParts[0].split(".")[0] print "Processing identifier %s. "%pdbID if (pdbID in allKeys): entry = rnaDB[pdbID] elif (pdbID.upper() in allKeys): entry = rnaDB[pdbID.upper()] elif (pdbID.lower() in allKeys): entry = rnaDB[pdbID.lower()] else: sys.stderr.write("**** RNA DB %s does not have an entry for %s.\n"%(rnaDBName,pdbID)) continue target_helices = [] if getSingle: target_helices.extend(entry.intraMolHelices) if getDouble: target_helices.extend(entry.interMolHelices) if len(target_helices) == 0: sys.stderr.write("**** %s has no target helices.****\n"%ident) continue else: for helix in target_helices: ((mol5,pos5),(mol3,pos3), length) = helix mol5ID = entry.naChains[mol5-1] mol3ID = entry.naChains[mol3-1] try: #print "raw helix in %s: ((%s, %i), (%s, %i), %i)"%(entry.pdbID, mol5, pos5, mol3, pos3, length) print "%s %s %i %s %i %i"%(entry.pdbID, mol5ID, entry.nTables[mol5ID][pos5-1], mol3ID, entry.nTables[mol3ID][pos3-1], length) except: sys.stderr.write( "helix lookup problem in %s: ((%s, %i), (%s, %i), %i)\n"%(entry.pdbID, mol5, pos5, mol3, pos3, length) ) continue if (showBasePairs): for bpIndex in range( length ): try: print "%s %s %i %s %i"%( entry.pdbID, mol5ID, entry.nTables[mol5ID][pos5-1+bpIndex], mol3ID, entry.nTables[mol3ID][pos3-1-bpIndex] ) except: sys.stderr.write( "basepair lookup problem in %s: basepair %i of helix ((%s, %i), (%s, %i), %i)\n"%( entry.pdbID, bpIndex, mol5, pos5, mol3, pos3, length ) ) continue rnaDB.close() print "printHelices Done." return def doAudit(): global rnaDB, ident_list if not (rnaDBName in os.listdir(homeDir)): sys.stderr.write("RNA DB %s does not exist.\n"%rnaDBName) sys.exit(2) faultyEntries=[] allKeys= getAllDBIdents() print "auditing helix descriptors" if (ident_list == []): ident_list = allKeys for ident in ident_list: identParts = ident.split(":") pdbID = identParts[0].split(".")[0] #print "Processing identifier %s. "%pdbID if (pdbID in allKeys): entry = rnaDB[pdbID] elif (pdbID.upper() in allKeys): entry = rnaDB[pdbID.upper()] elif (pdbID.lower() in allKeys): entry = rnaDB[pdbID.lower()] else: sys.stderr.write("**** RNA DB %s does not have an entry for %s.\n"%(rnaDBName,pdbID)) continue target_helices = [] if getSingle: target_helices.extend(entry.intraMolHelices) if getDouble: target_helices.extend(entry.interMolHelices) if len(target_helices) == 0: continue else: for helix in target_helices: try: debug = 0 ((mol5,pos5),(mol3,pos3), length) = helix mol5ID = entry.naChains[mol5-1] mol3ID = entry.naChains[mol3-1] mol5Begin = entry.nTables[mol5ID][pos5-1] debug = 1 mol5End = entry.nTables[mol5ID][pos5-1+length-1] debug = 2 mol5Dist = mol5End - mol5Begin + 1 debug = 3 mol3Begin = entry.nTables[mol3ID][pos3-1-length+1] debug = 4 mol3End = entry.nTables[mol3ID][pos3-1] debug = 5 mol3Dist = mol3End - mol3Begin+1 debug = 6 if ((mol5Dist!=length) or (mol3Dist!=length)): seq5 = ["%s"%e for e in entry.nTables[mol5ID][pos5-1: pos5-1+length]] debug = 7 seq3 = ["%s"%e for e in entry.nTables[mol3ID][pos3-1: pos3-1-length:-1]] debug = 8 sys.stderr.write( "discontiguous helix in %s: ((%s, %i), (%s, %i), %i)\n"%(entry.pdbID, mol5, pos5, mol3, pos3, length) ) sys.stderr.write( "\ttranslated as %s %s %i %s %i %i\n"%(entry.pdbID, mol5ID, entry.nTables[mol5ID][pos5-1], mol3ID, entry.nTables[mol3ID][pos3-1], length)) sys.stderr.write( "\t5' gap %i resSeq sequence %s\n"%((mol5Dist-length)," ".join(seq5))) sys.stderr.write( "\t3' gap %i resSeq sequence %s\n"%((mol3Dist-length)," ".join(seq3))) except: if entry.pdbID not in faultyEntries: faultyEntries.append(entry.pdbID) sys.stderr.write( "****Please rebuild entry %s due to helix lookup problem: ((%s, %i), (%s, %i), %i)\n"%(entry.pdbID, mol5, pos5, mol3, pos3, length) ) rnaDB.close() if faultyEntries != []: print "It is highly recommended that you rebuild the following entries: %s"%faultyEntries print "Audit Done." return def getAllDBIdents(): return [key for key in rnaDB.keys() if key not in rnaDB['DB_reservedKeys']] class RNA_DB_Entry: parser = Bio.PDB.PDBParser() io= Bio.PDB.PDBIO() rcsb_pdb = Bio.config.DBRegistry.CGIDB( name="rcsb_pdb", cgi="http://www.pdb.org/pdb/downloadFile.do", key= "structureId", params=[("fileFormat","pdb"),("compression","NO")]) Bio.db.register(rcsb_pdb) ourPDB = Bio.db["rcsb_pdb"] def __init__(self, pdbFile="", pdbID="", tempDir="rnaDB_temp", local=0, savexml = 0): self.pdbFile=pdbFile self.pdbID=pdbID.upper() self.models=0 self.chains=[] self.rnaChains=[] self.dnaChains=[] self.naChains =[] self.intraMolHelices=[] self.interMolHelices=[] self.sequences={} self.nTables={} self.family="" if pdbFile <> "": self.pdbID = os.path.split(pdbFile)[1].split('.')[0].upper() self.parsePDB(tempDir=tempDir, local=local, savexml = savexml) def parsePDB(self, tempDir="rnaDB_temp", local=0, savexml = 0): cutoff = 0.80 localFiles = os.listdir(os.getcwd()) if ((local==0)and (self.pdbFile not in localFiles)): self.pdbFile = self.pdbID+".pdb" if (self.pdbFile not in localFiles): sys.stderr.write("**** pdb file %s not found locally; attempting to download.\n"%self.pdbFile) try: dlPDB(self.pdbID) except: sys.stderr.write("**** unable to download pdb %s to directory %s.\n"%(self.pdbID,os.getcwd())) return file = self.pdbFile pathSplit = os.path.split(file) name_split = list(os.path.splitext(pathSplit[1])) try: s=RNA_DB_Entry.parser.get_structure('unused_ref', file) except: sys.stderr.write("**** unable to parse pdb %s.\n"%(self.pdbID)) return; self.models = len(s.get_list()) firstModel = s.get_list()[0] RNA_DB_Entry.io.set_structure(s) if len(s.get_list()) > 1: #more than one model name_split[0]=name_split[0]+".model_1" thisFile = name_split[0]+".tmp"+name_split[1] chains = firstModel.get_list() self.chains = [ch.get_id() for ch in chains] for chain in chains: stats = resStats(chain) chID = chain.get_id() if isRnaviewNA(chain): self.naChains.append(chID) self.sequences[chID]="" if ((stats[0]<2)and (stats[1]>=2)): self.dnaChains.append(chID) else: self.rnaChains.append(chID) if not self.naChains: sys.stderr.write("%s has no usable nucleic acid chains.\n"%self.pdbID) else: os.chdir(tempDir) RNA_DB_Entry.io.save(thisFile, mySelect(firstModel,"any")) p2 = os.system("rnaview -p %s > tmp.out > tmp.err"%thisFile) xmlFile=thisFile+".xml" if not (xmlFile in os.listdir(os.getcwd())): sys.stderr.write( "rnaview has not produced xml for %s; likely cause is absence of basepairs\n"%self.pdbID ) else: doc=xml.dom.minidom.parse(xmlFile) numSHelices=0 numDHelices=0 ends=['base-id-5p','base-id-3p'] for node in doc.getElementsByTagName('molecule')+doc.getElementsByTagName('interactions'): if node.localName=="molecule": molID = int(node.getAttribute("id")) try: chID = self.naChains[molID-1] except: sys.stderr.write("error looking up mol # %i in na Chains %s"%(molID, self.naChains)) if (chID in self.rnaChains) or (chID in self.dnaChains): print "getting sequence for chain %s of %s"%(chID,self.pdbID) self.sequences[chID],self.nTables[chID]=getSequenceInfo(node, pdbID=self.pdbID) if not (chID in self.rnaChains): continue #to next molecule or interaction node for helix in node.getElementsByTagName('helix'): bases = [helix.getElementsByTagName(e)[0].getElementsByTagName('base-id')[0] for e in ends] pos5,pos3 = map(int,[GetText(b.getElementsByTagName('position')[0]) for b in bases]) length=int(GetText(helix.getElementsByTagName('length')[0])) if node.localName=="molecule": numSHelices+=1 self.intraMolHelices.append(((molID,pos5),(molID,pos3), length)) else: mol5,mol3 = map(int,[b.getElementsByTagName('molecule-id')[0].getAttribute("ref") for b in bases]) numDHelices+=1 self.interMolHelices.append(((mol5,pos5),(mol3,pos3), length)) print "%s [%s] has %i single stranded NA helices and %i intermolecular NA helices."%(file,self.pdbID,numSHelices,numDHelices) removeList = [thisFile, thisFile+".ps", thisFile+".out", "tmp.out", "tmp.err", "base_pair_statistics.out"] if (not(savexml)): removeList.add(xmlFile) localFiles = os.listdir(os.getcwd()) for oldFile in removeList: if oldFile in localFiles: os.remove(oldFile) os.chdir("..") #end def Class rnaDB_Entry def GetText(firstnode): text = "" nodelist = [firstnode] for node in nodelist: if node.nodeType == node.TEXT_NODE: text += node.data else: nodelist.extend(node.childNodes) return text.strip() def getSequenceInfo(node, pdbID="unknown"): sequences = node.getElementsByTagName('sequence') if len(sequences)<>1: sys.stderr.write("molecule %i of file %s unexpectedly has %i sequence entries.\n"%(node.getAttribute("id"),pdbID,len(sequences))) #get the sequence & any modifications sd=sequences[0].getElementsByTagName('seq-data')[0] thisSeq = list([n.data.replace(' ','').replace('\n','') for n in sd.childNodes if n.nodeType==n.TEXT_NODE][0]) seqMods = sequences[0].getElementsByTagName('modification') for mod in seqMods: modPos=int(GetText(mod.getElementsByTagName('position')[0])) thisSeq[modPos-1]=GetText(mod.getElementsByTagName('modified-type')[0]) #verify the numbering range used by RNAMLView is as expected Nrange = sequences[0].getElementsByTagName('numbering-system')[0].getElementsByTagName('numbering-range')[0] if (GetText(Nrange.getElementsByTagName('start')[0])<>"1") or int(GetText(Nrange.getElementsByTagName('end')[0]))<>len(thisSeq): sys.stderr.write("Unexpected numbering range for molecule %i of file %s.\n"%(node.getAttribute("id"),pdbID)) #get the numbering table for translating into pdb residue ID's nTable = sequences[0].getElementsByTagName('numbering-table')[0] nTableList = map(int, list([n.data.split() for n in nTable.childNodes if n.nodeType==n.TEXT_NODE][0])) if len(nTableList)<>len(thisSeq): sys.stderr.write("Unexpected numbering table for molecule %i of file %s.\n"%(node.getAttribute("id"),pdbID)) return (thisSeq,nTableList) def isRNA(stats, cutoff): # probably don't need to allow so many hetatms # probably should also check for res.get_resname() being in list of # regular and modified rna types (rnaCount, dnaCount, pepCount, hetCount) = stats chLen=rnaCount + dnaCount + pepCount + hetCount # if ((rnaCount >= 2) and (rnaCount > hetCount) and (rnaCount > (float(chLen)-hetCount)*cutoff)): if (rnaCount >= 2): return 1 else: return 0 def isRnaviewNA(ch): #=============================================================================== # this routine performs the same analysis as does rnaview in fpair_sub.c #=============================================================================== naCount = 0 for res in ch.get_list(): if ((res.has_id('N1'))and(res.has_id('C2'))and(res.has_id('C6'))): N1 = res['N1'] C2 = res['C2'] C6 = res['C6'] if (((N1-C2)<2.0)and((N1-C6)<2.0)and((C2-C6)<3.0)): naCount += 1 if (naCount>1): return True else: return False def resStats(ch): rnaCount = 0 dnaCount = 0 pepCount = 0 hetCount = 0 for res in ch.get_list(): atomList = [a.get_name() for a in res.get_list()] if "O2*" in atomList: rnaCount += 1 elif "CA" in atomList: pepCount += 1 elif "O2" in atomList: dnaCount += 1 else: hetCount += 1 print "chain %s, %i total residues: %i rna's, %i dna's, %i peptides, %i hetatms."\ %(ch.get_id(), len(ch), rnaCount, dnaCount, pepCount, hetCount) return (rnaCount, dnaCount, pepCount, hetCount) class mySelect(Bio.PDB.Select): def __init__(self, mod, ch): self.myModel= mod self.myChain= ch def accept_model(self, model): if (self.myModel=="any"): return 1 elif (self.myModel == model): return 1 else: return 0 def accept_chain(self, chain): if (self.myChain=="any"): return 1 elif chain == self.myChain: return 1 else: return 0 class helixSelect(Bio.PDB.Select): def __init__(self, entry, helices, mod="any", chList="any"): self.myEntry= entry self.myModel= mod self.myChains= chList self.myHelices= helices def accept_model(self, model): if (self.myModel=="any"): return 1 elif (self.myModel == model): return 1 else: return 0 def accept_chain(self, chain): if (self.myChains=="any"): return 1 elif chain.get_id() in self.myChains: return 1 else: return 0 def accept_residue(self, res): for helix in self.myHelices: try: debugPoint=1 resMol = res.get_parent().get_id() debugPoint=2 ((mol5,pos5),(mol3,pos3), length) = helix debugPoint=3 mol5ID= self.myEntry.naChains[mol5-1] debugPoint=4 mol3ID= self.myEntry.naChains[mol3-1] debugPoint=5 except: sys.stderr.write("helix lookup problem\n") sys.stderr.write("helix ((mol5,pos5),(mol3,pos3), length): ((%i,%i),(%i,%i),%i)\n"%(mol5,pos5,mol3,pos3,length)) sys.stderr.write("na chains: %s\n"%self.myEntry.nTables.keys()) sys.stderr.write("resMol, mol5ID, mol3ID: %s, %s, %s\n"%(resMol,mol5ID,mol3ID)) sys.stderr.write("debugPoint: %i\n"%debugPoint) return 0 if ((resMol!=mol5ID) and (resMol!=mol3ID)): continue try: debugPoint=1 thisNTable = self.myEntry.nTables[resMol] debugPoint=2 thisResSeq = res.get_id()[1] debugPoint=3 if thisResSeq in thisNTable: resPos = thisNTable.index( thisResSeq )+1 debugPoint=4 if resMol == mol5ID: if (pos5 <= resPos) and ( pos5+length > resPos): return 1 if resMol == mol3ID: if (pos3 >= resPos) and ( pos3-length < resPos): return 1 else: # this is known to occur for residues with too few atom entries in the pdb file to satisfy # rnaview, and for hetatm groups assigned an rna chain in the pdb file, but which # rnaview does not consider to be rna residues (such as a sulfate group) sys.stdout.write( "*** the following residue will be ignored under extraction: chain %s residue %s\n"%(resMol,res) ) return 0 except: sys.stderr.write( "residue lookup problem\n" ) sys.stderr.write( "debugPoint: %i\n"%debugPoint ) sys.stderr.write( "helix ((mol5,pos5),(mol3,pos3), length): ((%i,%i),(%i,%i),%i)\n"%( mol5, pos5, mol3, pos3, length ) ) sys.stderr.write( "res mol ID: %s\n"%repr(resMol) ) sys.stderr.write( "nTable keys: %s\n"%self.myEntry.nTables.keys() ) sys.stderr.write( "nTable: %s\n"%thisNTable ) sys.stderr.write( "res Seq: %s\n"%thisResSeq ) return 0 #all helices done, no match found return 0 def getResPos(self, molID, res): return resPos class invertSelect(helixSelect): def accept_residue(self, res): return not helixSelect.accept_residue(self,res) def dlPDB(pdbID): thisPDB=RNA_DB_Entry.ourPDB[pdbID] pdblines=thisPDB.readlines() tempfile = open(pdbID+".pdb", 'w') tempfile.writelines(pdblines) tempfile.close() #=============================================================================== #Main """ """ if __name__ == "__main__": main()