"""
Bias plots.

"""

import os
import csv
import sys
import math
import numpy
import MySQLdb

import rpy2.robjects as robjects

db = MySQLdb.connect(host="localhost", port=3307, user="root", passwd="enter_your_password",db="project_aers")
c = db.cursor()

query = """
select stitch_id1, stitch_id2, estimate
from corr_drug_drug_minreports100
where estimate > 0
and corrected < 1
and `database` = 'aers'
"""
c.execute(query)

corr_data = c.fetchall()

estimates = sorted([x[2] for x in corr_data])
num_bins = 50
bin_length = len(estimates)/num_bins
corr_bins = [(estimates[i*bin_length], estimates[(i+1)*bin_length]) for i in range(num_bins)]
corr_bins[-1] = (corr_bins[-1][0], max(estimates)+0.01)

#corr_bins = [(-10.0, -5.5), (-5.5, -4.5), (-4.5, -3.5), (-3.5, -3.0), (-3.0, -2.0), (-2.0, 0.0)]

corr_dict = dict()
for sid1, sid2, est in corr_data:
    if not sid1 in corr_dict:
        corr_dict[sid1] = dict()
    corr_dict[sid1][sid2] = est
    if not sid2 in corr_dict:
        corr_dict[sid2] = dict()
    corr_dict[sid2][sid1] = est

# query = """
# select stitch_id, umls_id
# from prop_pred_drug_events
# where a > 10
# and prr > 100
# """
query = """
select stitch_id, umls_id
from log2RR_pred_drug_events
where Nij > 10
and RR > 100
"""
print >> sys.stderr, query
c.execute(query)

gold_data = c.fetchall()
gold_dict = dict()

for sid, uid in gold_data:
    if not sid in gold_dict:
        gold_dict[sid] = set()
    gold_dict[sid].add(uid)


# query = """
# select stitch_id, umls_id, prr
# # from prop_pred_drug_events
# from pred_drug_events_e5
# # from pred_drug_events_fi # indications only
# where not prr is null
# """
query = """
select stitch_id, umls_id, RR
# from log2RR_pred_drug_events
from psm_drug_events
where not RR is null
"""
print >> sys.stderr, query
c.execute(query)

pred_data = c.fetchall()

prob_data = dict()
for bin in corr_bins:
    prob_data[bin] = list()


###### Setting up an Odds Ratio Test ###########
# We want to know how much more likely is it that a drug correlated is
# associated with the other drug's effect
#               Corr >= 0.1     |   Corr < 0.1
#   --------------------------------------------
#   PRR >= 2 |       a           |       b
#   PRR < 2  |       c           |       d

a,b,c,d = 0,0,0,0

# data_file = open(os.path.expanduser('~/Dropbox/all_prr_data_drg.csv'), 'w')
# writer = csv.writer(data_file)
for i,(sid, uid, prr) in enumerate(pred_data):
    if (i+1)%(len(pred_data)/10) == 0:
        print >> sys.stderr, ".",
    
    if sid in corr_dict:
        for corr_sid, corr_est in corr_dict[sid].items():
            if corr_sid in gold_dict and uid in gold_dict[corr_sid]:
                bin = [x for x in corr_bins if x[0] <= corr_est < x[1]][0]
                prob_data[bin].append(prr)
                # writer.writerow([prr, corr_est])
                if corr_est >= 0.1:
                    if prr >= 2:
                        a += 1
                    else:
                        c += 1
                else:
                    if prr >= 2:
                        b += 1
                    else:
                        d += 1

print >> sys.stderr, "\n", a, b, c, d

# data_file.close()

# PRR Curve
# data_file = open(os.path.expanduser('~/Dropbox/binned_prr_data_drg.csv'), 'w')
# writer = csv.writer(data_file)
# for i,bin in enumerate(corr_bins):
#     print "\t".join(map(str,[bin[0], bin[1], i, numpy.mean(prob_data[bin]), numpy.std(prob_data[bin]), len(prob_data[bin])]))
#     probs = prob_data[bin]
#     writer.writerow([(bin[0]+bin[1])/2.0, numpy.mean(probs), 1.96*numpy.std(probs)/math.sqrt(len(probs))])
# 
# data_file.close()
# 
# # Probability Curve
data_file = open(os.path.expanduser('~/Dropbox/binned_prr_probs_drg.csv'), 'w')
writer = csv.writer(data_file)
for i,bin in enumerate(corr_bins):
    probs = [float(prr > 10) for prr in prob_data[bin]]
    # print >> sys.stderr,  "\t".join(map(str,[bin[0], bin[1], i, numpy.mean(probs), numpy.std(probs), len(probs)]))
    writer.writerow([(bin[0]+bin[1])/2.0, numpy.mean(probs), 1.96*numpy.std(probs)/math.sqrt(len(probs))])

data_file.close()

print >> sys.stderr, robjects.r("""
data <- read.csv('~/Dropbox/binned_prr_probs_drg.csv', header=F)
#summary(lm(data$V2 ~ data$V1))
cor.test(data$V1, data$V2, method="spearman")
""")