"""
DEPRECATIOIN WARNING
===================
I am not using this script any longer for computing all of the possible pairwise combination, it
is simply not possible to do (would have to store ~ a trillion floting point values).


Computes the pairwise tanimoto scores between all drugs
and the drug with the given index (taken as input).

EXAMPLE
=======
python compute_tanimoto.py 10

Computes the pairwise tanimoto between all drugs and the 
drug at index 10. Indices start at 0.

CLUSTER
=======
This script was designed to be run on the cluster. To submit all
the jobs needed to complete the run use the following bash script.

for ((i=0;i<123591;i+=100)); do bsub -o output/s$i_n100.out python compute_tanimoto.py $i 100; done

"""

import os
import sys
import csv
import tempfile

SMILES_FILE = '123591_smiles.txt'
IDENTIFIER = 0
SMILES = 1

if __name__ == '__main__':
    
    start = int(sys.argv[1])
    num = int(sys.argv[2])
    
    drugs = [(sid, smiles) for sid, smiles in csv.reader(open(SMILES_FILE), delimiter='\t')]
    
    resfile = open('results/tanimotos_s%d_s%d.txt' % (start, start+num), 'w')
    writer = csv.writer(resfile)
    
    for index in range(start, start+num):
        
        print >> sys.stderr, "Computing tanimotos for drug at index %d: %s" % (index, drugs[index][IDENTIFIER])
        
        tfile = tempfile.NamedTemporaryFile(suffix='.smi')
        
        print >> tfile, drugs[index][SMILES]
        for sid, smiles in drugs:
            print >> tfile, smiles
        
        tfile.flush()
        
        outfile = tempfile.NamedTemporaryFile()
        
        os.system('babel -ismi %s -ofpt -xfMACCS > %s' % (tfile.name, outfile.name))
        
        scores = [float(x.split('=')[1].strip()) for x in open(outfile.name) if x[0] == '>' and x.find('=') != -1]
        
        # sanity check
        if not scores[index] == 1.0:
            print >> sys.stderr, "Sanity check FAILED on %d!! The molecule is not equivalent to itself!" % index
            continue
        
        if not len(scores) == len(drugs):
            print >> sys.stderr, "Sanity check FAILED on %d!! The length of the results vector is incorrect!" % index
            continue
        
        sid1 = drugs[index][IDENTIFIER]
        for i, score in enumerate(scores):
            sid2 = drugs[i][IDENTIFIER]
            writer.writerow([sid1, sid2, score])
        
        tfile.close()
        outfile.close()
    
    resfile.close()
        