#!/usr/local/bin/perl -w


BEGIN 
{
	push(@INC, "/Perl/lib");
	push(@INC, "/Perl/site/lib");
}

use lib "include";
use Bio::AlignIO;
use Bio::Structure::IO;
use strict;


calcWeights("identity", 98);



# PURPOSE: COMPUTES THE SUM-OF-PAIRS WEIGHTS FOR RESIDUES USING A SPECIFIED SCORING MATRIX(WAC,MIYATA,OR ID), 
# 		   THEN WRITES RESIDUE-WEIGHT PAIRS TO FILE 'weights.txt'

sub calcWeights
{
	my ( $line, @lineSplit, %structureRes );
	my ( @matrix,  %aa, $decPlace );
	my ( $alignIO, $aln );
	my $numSeq;
	my $status;
	my ( $sequence, @residue, $res, $column, @alnColumns );
	my ( $numCol, @col, $colLength, $sumPairs, $gapGapScore, @score, $numScore );
	my ( $colIndex, $i, $j );
	my ( $amino1, $amino2, $matrixAmino1, $matrixAmino2 );
	my $invWeight;
	my $key;
	
	#get input parameters passed from UI
	my $matrixType = shift;
	my $refSeqPos = shift;
	my $scaleFactor = shift;

	
	#open file that contains the defined PDB residue numbers
	open( SEQUENCE, "data/pdbseq.txt" ) || die("Cannot open sequence file for input");
	while ( $line = <SEQUENCE> ) 
	{
		chomp($line);
		@lineSplit = split(/\t/, $line);
	
		#record whether a residue number is defined in the pdb structure and distinguishes btw amino acid & het
		if($lineSplit[1] ne 'HET')
		{
			$structureRes{$lineSplit[0]} = 'aa';
		}
		else
		{
			$structureRes{$lineSplit[0]} = 'het';
		}
	}
	close(SEQUENCE);
	
	#if user specified IDENTITY matrix
	if( $matrixType eq "identity")
	{
		#identity matrix is easy to compose: just set diagonal to 1's and all other values in matrix to 0's
		for($i = 0; $i < 20; $i++)
		{
			for($j = 0; $j < 20; $j++)
			{
				if ($i==$j)
				{
					$matrix[$i][$j] = 0;
				}
				else
				{
					$matrix[$i][$j] = 1;
				}
			}
		}
		
		#set amino acid representation/order within matrix
		%aa =
		(
		   'A' => 0,
		   'R' => 1,
		   'N' => 2,
		   'D' => 3,
		   'C' => 4,
		   'Q' => 5,
		   'E' => 6, 
		   'G' => 7, 
		   'H' => 8, 
		   'I' => 9, 
		   'L' => 10, 
		   'K' => 11, 
		   'M' => 12, 
		   'F' => 13, 
		   'P' => 14, 
		   'S' => 15, 
		   'T' => 16, 
		   'W' => 17, 
		   'Y' => 18, 
		   'V' => 19,
		 );
		
		$gapGapScore = 1;
		$decPlace = "%.2f";
	}
	
	#if user specified WAC matrix		
	elsif ( $matrixType eq "wac" ) 
	{
		@matrix = 
		(
			[4],
			[ 0, 4 ],
			[ 0, 1, 4 ],
			[ -1, 0, 0, 4 ],
			[ 0, 0, 0, 0, 4 ],
			[ 0, 0, 0, 0, -1, 4 ],
			[ 0, 1, 1, 0, 0, 0, 4 ],
			[ -2, -1, -1, -1, -2, -2, 0, 4 ],
			[ -2, -2, -1, -1, -3, -2, 0, 0, 4 ],
			[ 0, 0, 1, 0, 0, 0, 1, -1, 0, 4 ],
			[ 0, -1, -1, 0, 0, -1, 0, -1, -1, 0, 4 ],
			[ -1, -1, 0, 0, -1, -1, 0, -1, -1, 0, 0, 4 ],
			[ -2, 0, 0, 0, -1, -1, 0, -1, -1, 0, 0, 2, 4 ],
			[ 1, 0, 0, 1, 1, 0, 0, -1, -1, 1, 0, 0, 0, 4 ],
			[ 0, -1, 0, 0, 0, -2, -1, -4, -4, 0, -1, -2, -2, 2, 4 ],
			[ 0, -1, 0, 0, 0, -1, -1, -3, -3, 0, -1, -1, -2, 2, 1, 4 ],
			[ 0, -1, 0, 0, 1, -2, -1, -3, -4, -1, -1, -2, -3, 1, 2, 1, 4 ],
			[ 0, -2, -2, 0, 0, -2, -1, -4, -4, 0, 0, -2, -2, 2, 0, 1, 0, 4 ],
			[ 0, -1, -1, -1, 0, -2, 0, -3, -4, 1, 0, -1, -2, 0, 0, 0, 0, 1, 4 ],
			[ 0, -1, -1, 0, 0, -2, -1, -3, -2, 0, 0, 0, -2, 2, 0, 1, 0, 2, 1, 4 ],
		);
		
		#transpose matrix to fill upper right hand corner
		for ( $i = 0 ; $i < 20 ; $i++ )
		{
			for ( $j = $i + 1 ; $j < 20 ; $j++ ) 
			{
				$matrix[$i][$j] = $matrix[$j][$i];
			}
		}
		
		#scale matrix values by 5 so that it's non-negative
		for ( $i = 0 ; $i < 20 ; $i++ ) 
		{
			for ( $j = 0 ; $j < 20 ; $j++ ) 
			{
				$matrix[$i][$j] += 5;
			}
		}
		
		#set amino acid representation/order within matrix
		%aa = 
		(
			'C' => 0,
			'S' => 1,
			'T' => 2,
			'P' => 3,
			'A' => 4,
			'G' => 5,
			'N' => 6,
			'D' => 7,
			'E' => 8,
			'Q' => 9,
			'H' => 10,
			'R' => 11,
			'K' => 12,
			'M' => 13,
			'I' => 14,
			'L' => 15,
			'V' => 16,
			'F' => 17,
			'Y' => 18,
			'W' => 19,
		);
		
		$gapGapScore = 0;
		$decPlace = "%.1f";
	}    
	
	#if user specified MIYATA matrix
	else  	#$matrixType eq "miyata" 
	{
		@matrix= 
		(
			[0, 1.33, 1.39, 2.22, 1.84, 1.45, 2.48, 3.26, 2.83, 3.48, 2.56, 3.27, 3.06, 0.86, 1.65, 1.63, 1.46, 2.24, 2.38, 3.34],
			[0,   0,  0.06, 0.97, 0.56, 0.87, 1.92, 2.48, 1.80, 2.40, 2.15, 2.94, 2.90, 1.79, 2.70, 2.62, 2.36, 3.17, 3.12, 4.17],
			[0,   0,  0,    0.91, 0.51, 0.90, 1.92, 2.46, 1.78, 2.37, 2.17, 2.96, 2.92, 1.85, 2.76, 2.69, 2.42, 3.23, 3.18, 4.23],
			[0,   0,  0,    0,    0.85, 1.70, 2.48, 2.78, 1.96, 2.37, 2.78, 3.54, 3.58, 2.76, 3.67, 3.60, 3.34, 4.14, 4.08, 5.13],
			[0,   0,  0,    0,    0,    0.89, 1.65, 2.06, 1.31, 1.87, 1.94, 2.71, 2.74, 2.15, 3.04, 2.95, 2.67, 3.45, 3.33, 4.38],
			[0,   0,  0,    0,    0,    0,    1.12, 1.83, 1.40, 2.05, 1.32, 2.10, 2.03, 1.42, 2.25, 2.14, 1.86, 2.60, 2.45, 3.50],
			[0,   0,  0,    0,    0,    0,    0,    0.84, 0.99, 1.47, 0.32, 1.06, 1.13, 2.13, 2.70, 2.57, 2.30, 2.81, 2.48, 3.42],
			[0,   0,  0,    0,    0,    0,    0,    0,    0.85, 0.90, 0.96, 1.14, 1.45, 2.97, 3.53, 3.39, 3.13, 3.59, 3.22, 4.08],
			[0,   0,  0,    0,    0,    0,    0,    0,    0,    0.65, 1.29, 1.84, 2.04, 2.76, 3.49, 3.37, 3.08, 3.70, 3.42, 4.39],
			[0,   0,  0,    0,    0,    0,    0,    0,    0,    0,    1.72, 2.05, 2.34, 3.40, 4.10, 3.98, 3.69, 4.27, 3.95, 4.88],
			[0,   0,  0,    0,    0,    0,    0,    0,    0,    0,    0,    0.79, 0.82, 2.11, 2.59, 2.45, 2.19, 2.63, 2.27, 3.16],
			[0,   0,  0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0.40, 2.70, 2.98, 2.84, 2.63, 2.85, 2.42, 3.11],
			[0,   0,  0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    2.43, 2.62, 2.49, 2.29, 2.47, 2.02, 2.72],
			[0,   0,  0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0.91, 0.85, 0.62, 1.43, 1.52, 2.51],
			[0,   0,  0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0.14, 0.41, 0.63, 0.94, 1.73],
			[0,   0,  0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0.29, 0.61, 0.86, 1.72],
			[0,   0,  0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0.82, 0.93, 1.89],
			[0,   0,  0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0.48, 1.11],
			[0,   0,  0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    1.06],
			[0,   0,  0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0   ],
		);
		
		#transpose matrix
		for($i = 0; $i < 20; $i++)
		{
			for($j = $i; $j < 20; $j++)
			{
				$matrix[$j][$i] = $matrix[$i][$j];
			}
		}
		
		#set amino acid representation/order within matrix
		 %aa =
		 (
		   'C' => 0,
		   'P' => 1,
		   'A' => 2,
		   'G' => 3,
		   'S' => 4,
		   'T' => 5,
		   'Q' => 6, 
		   'E' => 7, 
		   'N' => 8, 
		   'D' => 9, 
		   'H' => 10, 
		   'K' => 11, 
		   'R' => 12, 
		   'V' => 13, 
		   'L' => 14, 
		   'I' => 15, 
		   'M' => 16, 
		   'F' => 17, 
		   'Y' => 18, 
		   'W' => 19,
		 );
		 $gapGapScore = 6;
		 $decPlace = "%.1f";
	}
	
	
	#use bioperl module AlignIO to process the MSA and extract only columns in the MSA
	#for which the sequence of interest is ungapped
	$alignIO = new Bio::AlignIO( -format => 'clustalw', -file => 'align.aln' );
	$aln = $alignIO->next_aln();
	$numSeq = $aln->no_sequences;
	$status = $aln->splice_by_seq_pos($refSeqPos);
	if($status != 1)
	{
		#unsuccessful splicing by reference sequence 
		die("Can't locate reference sequence in MSA");
	}
	else
	{
		foreach my $seq ( $aln->each_alphabetically())
		#get an array of sequences objects sorted alphabetically by name and then by start point
		{   
			#look at one sequence object at a time
			$column = 1;
			$sequence = $seq->seq();    #retrieve the sequence from each sequence object
			@residue = split( "", $sequence );  #split into array of individual residues
			foreach $res (@residue) 
			{
				$alnColumns[$column] .= $res;
				$column++;
			}
		}
		
		#open file for writing weight associated with each defined residue in the PDB structure
		open( WEIGHTS, ">data/weights.txt" ) || die("Cannot open file for writing residue weights");
			
		#for each column, calculate the sum-of-pairs score by summing up
		#the corresponding matrix value for each pair of amino acids within that column
		#and then taking the average
		$numCol = @alnColumns;
		for ( $colIndex = 1 ; $colIndex < $numCol ; $colIndex++ ) 
		{
			$sumPairs  = 0;
			$column    = $alnColumns[$colIndex];
			@col       = split( "", $column );
			$colLength = @col;
		
			$numScore = 0;
			for ( $i = 0 ; $i < $colLength ; $i++ ) 
			{
				for ( $j = $i + 1 ; $j < $colLength ; $j++ ) 
				{
					$amino1 = $col[$i];
					$amino2 = $col[$j];
		
					if ( $amino1 eq '-' || $amino2 eq '-' ) 
					{
						$sumPairs += $gapGapScore;
						$numScore++;
					}
					else 
					{
						if(exists($aa{$amino1}) && exists($aa{$amino2}))
						{
							$matrixAmino1 = $aa{$amino1};
							$matrixAmino2 = $aa{$amino2};
						
							$sumPairs += $matrix[$matrixAmino1][$matrixAmino2];
							$numScore++;
						}
					}
				}
			}
		
			$score[$colIndex] = $sumPairs / $numScore;
			#keep weights to 1 decimal place
			$score[$colIndex] = sprintf( $decPlace, $score[$colIndex] * $scaleFactor );
		
			#only write residue-weight information to file if the residue
			#is defined in the PDB
			if (exists($structureRes{$colIndex}))
			{
				if ( $structureRes{$colIndex} eq 'aa' ) 
				{
					if($matrixType eq "wac")
					{
						#for wac, need to scale weights such that small weight <-> very conserved
						$invWeight = 10 - $score[$colIndex];
						print WEIGHTS "$colIndex\t$invWeight\n";
					}
					else    # $matrixType is "id" or "miyata"  
					{
						print WEIGHTS "$colIndex\t$score[$colIndex]\n";
					}
				}
			}
		}
		#now, all we have to do is print the HET numbers and give them weights of 0
		foreach $key (sort{$a <=> $b}(keys(%structureRes)))
		{
			if($structureRes{$key} eq 'het')
			{
				print WEIGHTS "$key\t",sprintf($decPlace, 0), "\n"	
			}
		}
		close(WEIGHTS);
	}
}
