"""
Discover adverse events significantly associated with drugs.
"""

import R
import os
import csv
import sys
import random
import MySQLdb
import tempfile
import operator
from namedmatrix import NamedMatrix

MAX_REPORTS = 120000
DRUG_DRUG_EXP = 100
DRUG_IND_EXP = 100
MIN_BG_REPORTS = 100

db = MySQLdb.connect(host="localhost", port=3307, user="root", passwd="enter_your_password",db="project_aers")
c = db.cursor()

#### ------------------------------------------------------------------------
# The code in p4.1 counted how many reports we have for each pair,
# which I then stored in pair_report_count, and we can use from there.
#### ------------------------------------------------------------------------

# Define the pair we are interested in.
stitch_id1 = sys.argv[1]
stitch_id2 = sys.argv[2]

print >> sys.stderr, "%s:%s: Building the drug data matrix and file." % (stitch_id1, stitch_id2)
# Populate the drug_data
query = """
select report_id, stitch_id, drug_name, umls_id, adverse_event
from drug_report_event
where stitch_id = '%s';
""" % stitch_id1
c.execute(query)
data1 = [row for row in c.fetchall()]

query = """
select distinct report_id
from drug_report_event
where stitch_id = '%s';
""" % stitch_id2
c.execute(query)
reports2 = [row[0] for row in c.fetchall()]

report_data = [row for row in data1 if row[0] in reports2]
report_ids = sorted(set([r[0] for r in report_data]))
umls_ids = sorted(set([r[3] for r in report_data]))

print >> sys.stderr, "%s:%s: Found %d reports with both drugs on the report." % (stitch_id1, stitch_id2, len(report_ids))

drug_data = dict()
for row in report_data:
    report_id = str(row[0])
    umls_id = str(row[3])
    if not drug_data.has_key(report_id):
        drug_data[report_id] = set()
    drug_data[report_id].add(umls_id)

# Cleanup
del report_data
del reports2
del data1

drug_data_file = tempfile.mkstemp(suffix=".csv")[-1]
drug_file = open(drug_data_file,'w')
writer = csv.writer(drug_file)
writer.writerow(['RowID'] + umls_ids)
for report_id in map(str, report_ids):
    row = [1 if umls_id in drug_data[report_id] else 0 for umls_id in umls_ids]
    writer.writerow([report_id] + row)

drug_file.close()

print >> sys.stderr, "%s:%s: Building the drug background data matrix and file." % (stitch_id1, stitch_id2)
while True:
    # Get the background data.
    query = """
    select report_id, umls_id
    from drug_report_event join
    (
        select corr_drug as stitch_id
        from 
        (
            select corr_drug
            from 
            (
                select stitch_id2 as corr_drug
                from corr_drug_drug
                where stitch_id1  = '%s'
                and corrected < 1e-%d
            ) a left join
            (
                select stitch_id1 as corr_drug
                from corr_drug_drug
                where stitch_id2  = '%s'
                and corrected < 1e-%d
            ) b using (corr_drug)
        ) a join
        (
            select corr_drug
            from 
            (
                select stitch_id2 as corr_drug
                from corr_drug_drug
                where stitch_id1  = '%s'
                and corrected < 1e-%d
            ) a left join
            (
                select stitch_id1 as corr_drug
                from corr_drug_drug
                where stitch_id2  = '%s'
                and corrected < 1e-%d
            ) b using (corr_drug)
        ) b using (corr_drug)
    ) a using (stitch_id)
    """ % tuple([stitch_id1, DRUG_DRUG_EXP]*2 + [stitch_id2, DRUG_DRUG_EXP]*2)
    num_bg_reports = c.execute(query)
    if num_bg_reports < MIN_BG_REPORTS:
        if DRUG_DRUG_EXP == 0:
            print >> sys.stderr, "%s:%s: Failed permenantly to retrieve enough background. Exiting." % (stitch_id1, stitch_id2)
            sys.exit(1)
        print >> sys.stderr, "%s:%s: Failed to retrieve enough background reports (%d), reducing significance threshold (1e-%d)." % (stitch_id1, stitch_id2, num_bg_reports, DRUG_DRUG_EXP)
        DRUG_DRUG_EXP /= 2
    else:
        break

drug_report_ids = report_ids
umls_id_set = set(umls_ids)
report_data = [row for row in c.fetchall() if len(set([row[1]])&umls_id_set) > 0]

print >> sys.stderr, "%s:%s: Found %d correlated drug reports." % (stitch_id1, stitch_id2, len(report_data))

report_ids = sorted(set([r[0] for r in report_data]) - set(drug_report_ids) )
if len(report_ids) > MAX_REPORTS:
    report_ids = random.sample(report_ids, MAX_REPORTS)

bg_data = dict()
for row in report_data:
    report_id = str(row[0])
    umls_id = str(row[1])
    
    if not bg_data.has_key(report_id):
        bg_data[report_id] = set()
    bg_data[report_id].add(umls_id)

# Cleanup
del report_data

bg_data_file = tempfile.mkstemp(suffix=".csv")[-1]
bg_file = open(bg_data_file,'w')
writer = csv.writer(bg_file)
writer.writerow(['RowID'] + umls_ids)
for report_id in map(str,report_ids):
    row = [1 if umls_id in bg_data[report_id] else 0 for umls_id in umls_ids]
    writer.writerow([report_id] + row)

bg_file.close()

# Last bit of cleanup
del drug_data
del bg_data

if len(report_ids) < 10 or len(drug_report_ids) < 10:
    print >> sys.stderr, "Did not have enough reports to warrant statistics. Quiting."
    sys.exit(1)

print >> sys.stderr, "%s:%s: Running the statistical analysis." % (stitch_id1, stitch_id2)

result_file = os.path.expanduser("~/Stanford/AltmanLab/aers/part4.d/%s:%s.csv" % (stitch_id1, stitch_id2))
r_string = """
bg_data <- read.csv('%s')
rownames(bg_data) <- bg_data[,1]
bg_data <- bg_data[,-1]

drug_data <- read.csv('%s')
rownames(drug_data) <- drug_data[,1]
drug_data <- drug_data[,-1]

results <- matrix(rep(c(NA,NA,NA),ncol(drug_data)), ncol=3)
colnames(results) <- c("pvalue","drug_mean","bg_mean")
rownames(results) <- colnames(drug_data)

nmeans <- 1000
ssize1 <- round(max(10,0.01*nrow(bg_data)))
ssize2 <- round(max(10,0.01*nrow(drug_data)))
write(paste("%s:%s:","nmeans=",nmeans,"sizes:",ssize1,ssize2, "lengths:",nrow(bg_data),nrow(drug_data)), stderr())

for (i in 1:nrow(results)) {
    # write(paste("Working on ",i,"of ",nrow(results)), stderr())
	vals1 <- bg_data[,i]
	vals2 <- drug_data[,i]
	
	v1means <- rep(0, nmeans)
	v1means <- unlist(lapply(v1means,function(x){mean(sample(vals1,ssize1))}))
	
	v2means <- rep(0, nmeans)
	v2means <- unlist(lapply(v2means,function(x){mean(sample(vals2,ssize2))}))
	
	results[i,1] <- t.test(v1means, v2means)$p.value
	results[i,2] <- mean(v2means)
	results[i,3] <- mean(v1means)
}

write.csv(results,file="%s",quote=F)
""" % (bg_data_file, drug_data_file, stitch_id1, stitch_id2, result_file)

R.runRString(r_string)

print >> sys.stderr, "%s:%s: Completed run, result saved to %s" % (stitch_id1, stitch_id2, result_file)

os.system("rm %s" % bg_data_file)
os.system("rm %s" % drug_data_file)