"""
timeline_analysis.py

We would like to follow a drug (say Vioxx) and monitor the drugs signal for an Adverse Event (say myocardial infarction)
over time to see how the reporting progresses and if it can be spotted before its widely known.

The old timeline analysis was really stupid slow, so we are trying again with less queries.
"""

import csv
import sys
import math
import numpy
import random
import MySQLdb
import operator
import rpy2.robjects as robjects

db = MySQLdb.connect(host="localhost", port=3307, user="root", passwd="enter_your_password",db="effect_aers")
c = db.cursor()

#####################################
#  Load the data.
#####################################

query = """
select isr_report_id, substring_index(event_date, '-',1), event_date
from demographics
where not event_date = '0000-00-00';
"""
c.execute(query)

year_report = dict()
report_year = dict()
for report_id, year, date in c.fetchall():
    year = int(year)
    if not year in year_report:
        year_report[year] = set()
    
    year_report[year].add(report_id)
    
    if not report_id in report_year:
        report_year[report_id] = year

query = """
select stitch_id, isr_report_id
from aers2stitch
join drugs using (drug_name);
"""
c.execute(query)

drug_report = dict()
report_drug = dict()
for stitch_id, report_id in c.fetchall():
    if not stitch_id in drug_report:
        drug_report[stitch_id] = set()
    
    drug_report[stitch_id].add(report_id)
    
    if not report_id in report_drug:
        report_drug[report_id] = set()
    
    report_drug[report_id].add(stitch_id)

query = """
select indication_descrip_term, isr_report_id
from indications;
"""
c.execute(query)

report_indication = dict()
indication_report = dict()
for indication, report_id in c.fetchall():
    if not report_id in report_indication:
        report_indication[report_id] = set()
    
    report_indication[report_id].add(indication)
    
    if not indication in indication_report:
        indication_report[indication] = set()
    
    indication_report[indication].add(report_id)

query = """
select isr_report_id, umls_id
from reactions
join aers2umls using (event_descrip_term)
"""
c.execute(query)

report_event = dict()
for report_id, umls_id in c.fetchall():
    if not report_id in report_event:
        report_event[report_id] = set()
    
    report_event[report_id].add(umls_id)

report_ids_with_events = set(report_event.keys())

#####################################
#  Run the analysis
#####################################

MAX_BACKGROUND_REPORTS = 100000

START_YEAR = 1995
END_YEAR = 2005

# target_drug = 'CID000005090' # vioxx
# target_drug = 'CID000001302' # naproxen
# target_drug = 'CID000003672' # ibuprofen
target_drug = 'CID000002662' # celebrex

# target_drug = 'CID000077999' # rosiglitazone / avandia
# target_drug = 'CID000004829' # pioglitazone /actos, in same class as avandia
# target_drug = 'CID000004091' # metformin

# target_drug = 'CID000004723' # pemoline



this_year = START_YEAR

while this_year <= END_YEAR:
    
    print >> sys.stderr, "Running analysis for %s as of the year %d" % (target_drug, this_year)
    
    all_reports_for_year = reduce(operator.or_, [reports for year,reports in year_report.items() if year <= this_year])
    
    # 1. Get the reports we have for the given drug by the current year.
    drug_report_ids = drug_report[target_drug] & all_reports_for_year
    
    # 2. Extract all of the coprescribed drugs.
    coprescribed_drugs = reduce(operator.or_, [report_drug[report_id] for report_id in drug_report_ids]) - set([target_drug])
    
    # 3. Test each one for co-prescription significance.
    print >> sys.stderr, "Finding Correlated Drugs: ",
    
    correlated_drugs = set()
    correlated_drugs_reports = set()
    for i,stitch_id in enumerate(coprescribed_drugs):
        if i % (len(coprescribed_drugs)/10) == 0:
            print >> sys.stderr, ".",
        
        coprescribed_drug_report_ids = drug_report[stitch_id] & all_reports_for_year
        contingency = list()
        contingency.append( len(drug_report_ids & coprescribed_drug_report_ids) )
        contingency.append( len(drug_report_ids - coprescribed_drug_report_ids) )
        contingency.append( len(coprescribed_drug_report_ids - drug_report_ids) )
        contingency.append( len(all_reports_for_year - (drug_report_ids | coprescribed_drug_report_ids)) )
        
        pvalue = robjects.r("fisher.test(matrix(c(%s),nrow=2))$p.value" % ",".join(map(str,contingency)))[0]
        if pvalue < 0.05:
            correlated_drugs.add(stitch_id)
            correlated_drugs_reports |= coprescribed_drug_report_ids
    
    print >> sys.stderr, " |"
    
    # 4. Extract all of the indications for the given drug.
    given_indications = reduce(operator.or_, [report_indication[report_id] for report_id in drug_report_ids if report_id in report_indication])
    
    # 5. Identify significantly correlated indications.
    print >> sys.stderr, "Finding Correlated Indications: ",
    
    correlated_indications = set()
    correlated_indications_reports = set()
    for i, indication in enumerate(given_indications):
        if i % (len(given_indications)/10) == 0:
            print >> sys.stderr, ".",
        
        indication_report_ids = indication_report[indication] & all_reports_for_year
        contingency = list()
        contingency.append( len(drug_report_ids & indication_report_ids) )
        contingency.append( len(drug_report_ids - indication_report_ids) )
        contingency.append( len(indication_report_ids - drug_report_ids) )
        contingency.append( len(all_reports_for_year - (drug_report_ids | indication_report_ids)) )
        
        pvalue = robjects.r("fisher.test(matrix(c(%s),nrow=2))$p.value" % ",".join(map(str,contingency)))[0]
        if pvalue < 0.05:
            correlated_indications.add(indication)
            correlated_indications_reports |= indication_report_ids
    
    print >> sys.stderr, " |"
    
    background_report_ids =  correlated_indications_reports | correlated_drugs_reports
    
    # We only want to look at reports which have events, so we filter.
    background_report_ids = background_report_ids & report_ids_with_events
    if len(background_report_ids) > MAX_BACKGROUND_REPORTS:
        background_report_ids = random.sample(list(background_report_ids), MAX_BACKGROUND_REPORTS)
    
    foreground_report_ids = drug_report_ids & report_ids_with_events
    if len(foreground_report_ids) > MAX_BACKGROUND_REPORTS:
        foreground_report_ids = random.sample(list(foreground_report_ids), MAX_BACKGROUND_REPORTS)
    
    umls_ids = reduce(operator.or_, [report_event[report_id] for report_id in foreground_report_ids])
    
    print >> sys.stderr, "Analyzing Adverse Events: ",
    for i,umls_id in enumerate(umls_ids):
        if i % (len(umls_ids)/10) == 0:
            print >> sys.stderr, ".",
        
        background_vector = [1 if umls_id in report_event[report_id] else 0 for report_id in background_report_ids]
        foreground_vector = [1 if umls_id in report_event[report_id] else 0 for report_id in foreground_report_ids]
        
        if len(background_vector) < 10 or len(foreground_vector) < 10 or sum(background_vector) == 0:
            continue
        
        background_sample_size = max(10, len(background_vector)/10)
        foreground_sample_size = max(10, len(foreground_vector)/10)
        
        background_means = [numpy.mean(random.sample(background_vector, background_sample_size)) for i in range(100)]
        foreground_means = [numpy.mean(random.sample(foreground_vector, foreground_sample_size)) for i in range(100)]
        
        if sum(background_means) == 0 or sum(foreground_means) == 0:
            continue
        
        pvalue = robjects.r("t.test(c(%s), c(%s))$p.value" % (",".join(map(str,background_means)), ",".join(map(str,foreground_means))))[0]
        
        bg_mu = numpy.mean(background_means)
        bg_sd = numpy.std(background_means)
        fg_mu = numpy.mean(foreground_means)
        fg_sd = numpy.std(foreground_means)
        
        q = """
        INSERT INTO project_aers.timeline 
        (stitch_id, umls_id, year, fg_mu, fg_sd, fg_N, bg_mu, bg_sd, bg_N, pvalue) VALUES 
        ('%s', '%s', %s, %.10e, %.10e, %d, %.10e, %.10e, %d, %.10e)
        """ % (target_drug, umls_id, this_year, fg_mu, fg_sd, len(foreground_vector), bg_mu, bg_sd, len(background_vector), pvalue)
        r = c.execute(q)
    
    print >> sys.stderr, " |"
    
    this_year += 1