"""
timeline_analysis.py

We would like to follow a drug (say Vioxx) and monitor the drugs signal for an Adverse Event (say myocardial infarction)
over time to see how the reporting progresses and if it can be spotted before its widely known.

The old timeline analysis was really stupid slow, so we are trying again with less queries.
"""

import os
import csv
import sys
import math
import numpy
import random
import MySQLdb
import operator
import rpy2.robjects as robjects

db = MySQLdb.connect(host="localhost", port=3307, user="root", passwd="enter_your_password",db="effect_aers")
c = db.cursor()

#####################################
#  Load the data.
#####################################

query = """
select stitch_id, isr_report_id
from aers2stitch
join drugs using (drug_name);
"""
c.execute(query)

drug_report = dict()
report_drug = dict()
for stitch_id, report_id in c.fetchall():
    if not stitch_id in drug_report:
        drug_report[stitch_id] = set()
    
    drug_report[stitch_id].add(report_id)
    
    if not report_id in report_drug:
        report_drug[report_id] = set()
    
    report_drug[report_id].add(stitch_id)

query = """
select indication_descrip_term, isr_report_id
from indications;
"""
c.execute(query)

report_indication = dict()
indication_report = dict()
for indication, report_id in c.fetchall():
    if not report_id in report_indication:
        report_indication[report_id] = set()
    
    report_indication[report_id].add(indication)
    
    if not indication in indication_report:
        indication_report[indication] = set()
    
    indication_report[indication].add(report_id)

query = """
select isr_report_id, umls_id
from reactions
join aers2umls using (event_descrip_term)
"""
c.execute(query)

report_event = dict()
for report_id, umls_id in c.fetchall():
    if not report_id in report_event:
        report_event[report_id] = set()
    
    report_event[report_id].add(umls_id)

report_ids_with_events = set(report_event.keys())

#####################################
#  Run the analysis
#####################################

MAX_BACKGROUND_REPORTS = 100000

# target_drug = 'CID000005090' # vioxx
# target_drug = 'CID000001302' # naproxen
# target_drug = 'CID000003672' # ibuprofen
target_drug = 'CID000002662' # celebrex

# target_drug = 'CID000077999' # rosiglitazone / avandia
# target_drug = 'CID000004829' # pioglitazone /actos, in same class as avandia
# target_drug = 'CID000004091' # metformin

# target_drug = 'CID000004723' # pemoline

sys.exit(1)

print >> sys.stderr, "Running analysis for %s" % (target_drug)

# 1. Get the reports we have for the given drug
drug_report_ids = drug_report[target_drug]

# 2. Extract all of the coprescribed drugs.
coprescribed_drugs = reduce(operator.or_, [report_drug[report_id] for report_id in drug_report_ids]) - set([target_drug])

# 3. Test each one for co-prescription significance.
print >> sys.stderr, "Finding Correlated Drugs: ",

correlated_drugs = set()
correlated_drugs_reports = set()
for i,stitch_id in enumerate(coprescribed_drugs):
    if i % (len(coprescribed_drugs)/10) == 0:
        print >> sys.stderr, ".",
    
    coprescribed_drug_report_ids = drug_report[stitch_id] & all_reports_for_year
    contingency = list()
    contingency.append( len(drug_report_ids & coprescribed_drug_report_ids) )
    contingency.append( len(drug_report_ids - coprescribed_drug_report_ids) )
    contingency.append( len(coprescribed_drug_report_ids - drug_report_ids) )
    contingency.append( len(all_reports_for_year - (drug_report_ids | coprescribed_drug_report_ids)) )
    
    pvalue = robjects.r("fisher.test(matrix(c(%s),nrow=2))$p.value" % ",".join(map(str,contingency)))[0]
    if pvalue < 0.05:
        correlated_drugs.add(stitch_id)
        correlated_drugs_reports |= coprescribed_drug_report_ids

print >> sys.stderr, " |"

# 4. Extract all of the indications for the given drug.
given_indications = reduce(operator.or_, [report_indication[report_id] for report_id in drug_report_ids if report_id in report_indication])

# 5. Identify significantly correlated indications.
print >> sys.stderr, "Finding Correlated Indications: ",

correlated_indications = set()
correlated_indications_reports = set()
for i, indication in enumerate(given_indications):
    if i % (len(given_indications)/10) == 0:
        print >> sys.stderr, ".",
    
    indication_report_ids = indication_report[indication] & all_reports_for_year
    contingency = list()
    contingency.append( len(drug_report_ids & indication_report_ids) )
    contingency.append( len(drug_report_ids - indication_report_ids) )
    contingency.append( len(indication_report_ids - drug_report_ids) )
    contingency.append( len(all_reports_for_year - (drug_report_ids | indication_report_ids)) )
    
    pvalue = robjects.r("fisher.test(matrix(c(%s),nrow=2))$p.value" % ",".join(map(str,contingency)))[0]
    if pvalue < 0.05:
        correlated_indications.add(indication)
        correlated_indications_reports |= indication_report_ids

print >> sys.stderr, " |"

background_report_ids =  correlated_indications_reports | correlated_drugs_reports

# We only want to look at reports which have events, so we filter.
background_report_ids = background_report_ids & report_ids_with_events
if len(background_report_ids) > MAX_BACKGROUND_REPORTS:
    background_report_ids = random.sample(list(background_report_ids), MAX_BACKGROUND_REPORTS)

foreground_report_ids = drug_report_ids & report_ids_with_events
if len(foreground_report_ids) > MAX_BACKGROUND_REPORTS:
    foreground_report_ids = random.sample(list(foreground_report_ids), MAX_BACKGROUND_REPORTS)

umls_ids = reduce(operator.or_, [report_event[report_id] for report_id in foreground_report_ids])

print >> sys.stderr, "Analyzing Adverse Events: ",
for i,umls_id in enumerate(umls_ids):
    if i % (len(umls_ids)/10) == 0:
        print >> sys.stderr, ".",
    
    background_vector = [1 if umls_id in report_event[report_id] else 0 for report_id in background_report_ids]
    foreground_vector = [1 if umls_id in report_event[report_id] else 0 for report_id in foreground_report_ids]
    
    if len(background_vector) < 10 or len(foreground_vector) < 10 or sum(background_vector) == 0:
        continue
    
    background_sample_size = max(10, len(background_vector)/10)
    foreground_sample_size = max(10, len(foreground_vector)/10)
    
    background_means = [numpy.mean(random.sample(background_vector, background_sample_size)) for i in range(100)]
    foreground_means = [numpy.mean(random.sample(foreground_vector, foreground_sample_size)) for i in range(100)]
    
    if sum(background_means) == 0 or sum(foreground_means) == 0:
        continue
    
    pvalue = robjects.r("t.test(c(%s), c(%s))$p.value" % (",".join(map(str,background_means)), ",".join(map(str,foreground_means))))[0]
    
    bg_mu = numpy.mean(background_means)
    bg_sd = numpy.std(background_means)
    fg_mu = numpy.mean(foreground_means)
    fg_sd = numpy.std(foreground_means)
    
    q = """
    INSERT INTO project_aers.timeline 
    (stitch_id, umls_id, year, fg_mu, fg_sd, fg_N, bg_mu, bg_sd, bg_N, pvalue) VALUES 
    ('%s', '%s', %s, %.10e, %.10e, %d, %.10e, %.10e, %d, %.10e)
    """ % (target_drug, umls_id, this_year, fg_mu, fg_sd, len(foreground_vector), bg_mu, bg_sd, len(background_vector), pvalue)
    r = c.execute(q)

print >> sys.stderr, " |"

this_year += 1