#!/usr/bin/perl
# pdb2fasta (C) T J R Cutts 1998
# pulled from
# http://iubio.bio.indiana.edu/R58699-62479-/news/bionet/software/9806.newsm
#
# Changed to use ATOM records instead of SEQRES records, because SEQRES
# often does not correspond 1-1 to the ATOM records.  -jd
#
# This script extracts the ATOM records from a PDB file, and writes out
# separate FASTA format sequence files, one for each strand in the PDB
# file.  The title line of the FASTA file is constructed from the COMPND
# records of the PDB file, and the strand letter.
#
# Usage: pdb2fasta <basename>
#   Reads  <basename>.pdb
#   Writes <basename>.tfa          for a strand with a blank chain ID
#          <basename>_<chain>.tfa  for every other strand

use strict;
use warnings;
use Cwd qw(getcwd);

# Residue name (whitespace removed, upper-cased) => one-letter code.
my %code_for = (
    # Nucleotides (ribo- and deoxyribo- names).
    'A'   => 'A', 'C'   => 'C', 'G'   => 'G', 'T'   => 'T', 'U'   => 'U',
    'DA'  => 'A', 'DC'  => 'C', 'DG'  => 'G', 'DT'  => 'T', 'DU'  => 'U',
    # Amino acids.
    'ALA' => 'A', 'ASX' => 'B', 'CYS' => 'C', 'ASP' => 'D', 'GLU' => 'E',
    'PHE' => 'F', 'GLY' => 'G', 'HIS' => 'H', 'ILE' => 'I', 'LYS' => 'K',
    'LEU' => 'L', 'MET' => 'M', 'ASN' => 'N', 'PRO' => 'P', 'GLN' => 'Q',
    'ARG' => 'R', 'SER' => 'S', 'THR' => 'T', 'VAL' => 'V', 'TRP' => 'W',
    'XXX' => 'X', 'TYR' => 'Y', 'GLX' => 'Z',
    # Special markers kept from the original table.
    '...' => '.', 'END' => '*',
);

# Zero-based offsets into a fixed-column ATOM record.
my $RESNAME_OFFSET = 17;    # columns 18-20: residue name
my $CHAIN_OFFSET   = 21;    # column  22   : chain identifier
my $RESIDUE_ID_LEN = 6;     # columns 22-27: chain ID + resSeq + iCode
my $MIN_ATOM_LEN   = 26;    # shortest line that still holds resSeq
my $FASTA_WIDTH    = 60;    # residues per output line

die "Usage: pdb2fasta <basename>\nA separate output file will be created for each strand in the PDB file.\n"
    unless @ARGV == 1;

my $basename = $ARGV[0];
my $pdb_file = "$basename.pdb";

# Put the file, stripped of its line endings (LF or CRLF), in an array.
open(my $pdb_fh, '<', $pdb_file) or die "Could not open $pdb_file\n";
my @pdb;
while (my $line = <$pdb_fh>) {
    $line =~ s/\r?\n\z//;
    push @pdb, $line;
}
close($pdb_fh);

# Extract the COMPND and ATOM records.  ATOM lines too short to carry a
# residue number are ignored rather than read past their end.
my @compnd = grep { /^COMPND/ } @pdb;
my @atoms  = grep { /^ATOM/ && length($_) >= $MIN_ATOM_LEN } @pdb;

# Create a suitable FASTA title line from the COMPND records.
my $title = '>' . join(' ', @compnd);
$title =~ s/COMPND//g;
$title =~ s/\s+/ /g;

my %strand = strands_from_atoms(@atoms);

if (!%strand) {
    warn "No ATOM records found in $pdb_file; no output written\n";
    exit 0;
}

# For each strand discovered, create a FASTA format file printed nicely
# with 60 residues per line.
for my $chain (sort keys %strand) {
    my $filename = ($chain eq ' ')
        ? "${basename}.tfa"
        : "${basename}_$chain.tfa";

    open(my $out_fh, '>', $filename)
        or die "Could not open $filename for writing";

    my $header;
    if (length($title) > 2) {
        $header = ($chain eq ' ') ? "$title\n" : "$title - strand $chain\n";
    }
    else {
        # No usable COMPND text: fall back to a timestamp and the path.
        my $time = localtime;
        my $dir  = getcwd();
        $dir = '' unless defined $dir;
        $header = "> $time, $dir/$ARGV[0]\n";
    }
    print {$out_fh} $header;

    my $sequence = $strand{$chain};
    for (my $offset = 0; $offset < length($sequence); $offset += $FASTA_WIDTH) {
        print {$out_fh} substr($sequence, $offset, $FASTA_WIDTH), "\n";
    }

    # Buffered write errors only surface at close.
    close($out_fh) or die "Could not close $filename: $!\n";
}

# Build the one-letter sequence of every strand from a list of ATOM records.
#
# Takes:   the ATOM lines, in file order.
# Returns: a hash (as a list) mapping chain ID => sequence string.
#
# A new residue starts whenever the chain ID, residue number or insertion
# code (columns 22-27) differs from that of the previous ATOM line.
# Residue names missing from %code_for are written as 'X' so that the
# sequence length still matches the number of residues seen.
sub strands_from_atoms {
    my @atom_lines = @_;

    my %sequence_of;
    my $previous_id = '';    # never equals a real, space-padded 6-char ID

    for my $line (@atom_lines) {
        # Pad so a line trimmed before the iCode column still compares
        # equal to an untrimmed line of the same residue.
        my $residue_id = sprintf('%-6s',
            substr($line, $CHAIN_OFFSET, $RESIDUE_ID_LEN));
        next if $residue_id eq $previous_id;
        $previous_id = $residue_id;

        # Nucleotide names are right-justified ("  A"), so drop the padding.
        my $name = uc substr($line, $RESNAME_OFFSET, 3);
        $name =~ s/\s+//g;

        my $chain = substr($line, $CHAIN_OFFSET, 1);
        $sequence_of{$chain} = '' unless exists $sequence_of{$chain};
        $sequence_of{$chain} .= exists $code_for{$name} ? $code_for{$name} : 'X';
    }

    return %sequence_of;
}