#!/usr/bin/perl

# pdb2fasta (C) T J R Cutts 1998

# pulled from
# http://iubio.bio.indiana.edu/R58699-62479-/news/bionet/software/9806.newsm
# Changed to use ATOM records; evidently whoever wrote this did not use PDB
# files much.  Often the SEQRES records do not correspond one-to-one to the
# ATOM records.
#  -jd

# This script extracts the ATOM records from a PDB file, and writes out
# separate FASTA format sequence files, one for each strand in the PDB
# file.  The title line of the FASTA file is constructed from the COMPND
# records of the PDB file, and the strand letter.

    %lookup=(
             'A' => 'A',
             'C' => 'C',
             'G' => 'G',
             'T' => 'T',
             'U' => 'U',
             
             'ALA' => 'A',
             'ASX' => 'B',
             'CYS' => 'C',
             'ASP' => 'D',
             'GLU' => 'E',
             'PHE' => 'F',
             'GLY' => 'G',
             'HIS' => 'H',
             'ILE' => 'I',
             'LYS' => 'K',
             'LEU' => 'L',
             'MET' => 'M',
             'ASN' => 'N',
             'PRO' => 'P',
             'GLN' => 'Q',
             'ARG' => 'R',
             'SER' => 'S',
             'THR' => 'T',
             'VAL' => 'V',
             'TRP' => 'W',
             'XXX' => 'X',
             'TYR' => 'Y',
             'GLX' => 'Z',
             '...' => '.',
             'END' => '*' 
             );

die "Usage: pdb2fasta <pdbfile>\nA separate output file will be created for each strand in the PDB file.\n"
    unless ($#ARGV==0);

@pdb=();


die "Could not open $ARGV[0].pdb\n" unless (open(P, "<$ARGV[0].pdb"));

# Put the file, stripped of its carriage returns, in an array.
foreach (<P>)
{
    chomp;
    push(@pdb, $_);
}

close(P);

%strand = ();

# Extract the SEQRES and COMPND records
# @seqres=grep(/^SEQRES/, @pdb);
@cmpnd=grep(/^COMPND/, @pdb);
@atom=grep(/^ATOM/, @pdb);

# Create a suitable FASTA title line from the COMPND records
$cmpnd='>'.join(' ', @cmpnd);
$cmpnd =~ s/COMPND//g;
$cmpnd =~ s/\s+/ /g;

# foreach $line (@seqres)
# {
#     # Split the SEQRES line from positions 19-70 into an array of residues
#     @residues = split(/\s+/, substr($line, 19, 51));
#     
#     # For each of these, uppercase the residue, look up its single letter
#     # equivalent in the hash at the top of this script, and then append
#     # that single letter to this strand's variable for output.
#     foreach $res (@residues)
#     {
#         $res = uc($res);
#         $strand{substr($line, 11, 1)} .= $lookup{$res};
#     }
#     
# }


$resnum_p = substr($atom[0], 22, 4);
$resname = substr($atom[0], 17, 3);
$resname = uc($resname);

$strand{substr($atom[0], 20, 1)} .= $lookup{$resname};

foreach $line (@atom) {
    $resnum_c = substr($line, 22, 4);
    if ($resnum_c == $resnum_p) {next;}
    $resnum_p = $resnum_c;
    $resname = substr($line, 17, 3);    
    $resname = uc($resname);
    $strand{substr($line, 20, 1)} .= $lookup{$resname};

}


# For each strand discovered, create a FASTA format file printed nicely
# with 60 residues per line.

foreach $key (keys %strand)
{
    ($basename) = $ARGV[0];# =~ /(.*)\..*/;
    $filename = ($key eq " ") ? "${basename}.tfa" : "${basename}_$key.tfa";
    open (O, ">$filename") ||
        die "Could not open $filename for writing";

    if (length $cmpnd > 2) {
	if ($key eq " "){
	    print O "$cmpnd\n";}
	else {
	    print O "$cmpnd - strand $key\n";}

    } else {
	$time = localtime;
	$dir = `pwd`;
	chomp $dir;
	print O "> $time, $dir/${ARGV[0]}\n";
    }
    
    for ($n = 0 ; $n < length($strand{$key}); $n+=60)
    {
        print O substr($strand{$key}, $n, 60)."\n";
    }

    close O;
}