"""
timeline_analysis.py

We would like to follow a drug (say Vioxx) and monitor the drugs signal for an Adverse Event (say myocardial infarction)
over time to see how the reporting progresses and if it can be spotted before its widely known.
"""

import csv
import sys
import math
import numpy
import random
import MySQLdb
import operator
import rpy2.robjects as robjects

def create_temporary_table(data, column_name, data_type='varchar', length=None):
    
    if length is None:
        length = max([len(d) for d in data])
    
    if data_type == 'int':
        length = 11
    
    query = """
    create table nick.temporary_table
    (`%s` %s(%s));
    """ % (column_name, data_type, length)
    c.execute(query)
    for value in data:
        c.execute("insert into nick.temporary_table (`%s`) values ('%s');" % (column_name, value))
    query = """
    alter table nick.temporary_table add index (`%s`);
    """ % column_name
    c.execute(query)

def drop_temporary_table():
    query = """
    drop table nick.temporary_table;
    """
    c.execute(query)

db = MySQLdb.connect(host="localhost", port=3307, user="root", passwd="enter_your_password",db="effect_aers")
c = db.cursor()

# You can use this query to get the stitch ids for the drug of interest.
# select *
# from aers2stitch
# where drug_name = 'vioxx';

stitch_id = 'CID000005090'

# This query is useful for figuring out when the start/end date should 

start_year = 1995
end_year = 2009

year = start_year

outfh = open('%s_timeline.csv' % (stitch_id), 'w')

while year <= end_year:
    
    print >> sys.stderr, "Running analysis on %s for year %d" % (stitch_id, year)
    
    query = """
    select isr_report_id
    from drugs
    join aers2stitch using (drug_name)
    join demographics using (isr_report_id)
    where stitch_id = "%s"
    and not event_date = '0000-00-00'
    and substring_index(event_date, '-',1) <= %d;
    """ % (stitch_id, year)
    c.execute(query)
    
    report_ids = set([row[0] for row in c.fetchall()])
    
    # 1. We need to get the correlated drugs.
    
    # 1a. First we need to know all the reports at this time.
    query = """
    select isr_report_id
    from drugs
    join demographics using (isr_report_id)
    join aers2stitch using (drug_name)
    where not event_date = '0000-00-00'
    and substring_index(event_date, '-',1) <= %d;
    """ % year
    c.execute(query)
    
    all_report_ids = set([row[0] for row in c.fetchall()])
    
    # 1b. Now we pull out all the drugs which are coprescribed.
    create_temporary_table(report_ids, 'isr_report_id', 'decimal', length='10,0')
    
    query = """
    select stitch_id
    from drugs
    join nick.temporary_table using (isr_report_id)
    join demographics using (isr_report_id)
    join aers2stitch using (drug_name)
    where not event_date = '0000-00-00'
    and substring_index(event_date, '-',1) <= %d;
    """ % year
    c.execute(query)
    
    drop_temporary_table()
    
    coprescribed_drugs = set([row[0] for row in c.fetchall()]) - set([stitch_id])
    
    # 1c. And all of their respective report ids.
    create_temporary_table(coprescribed_drugs, 'stitch_id', 'varchar', '30')
    
    query = """
    select stitch_id, isr_report_id
    from drugs
    join demographics using (isr_report_id)
    join aers2stitch using (drug_name)
    join nick.temporary_table using (stitch_id)
    where not event_date = '0000-00-00'
    and substring_index(event_date, '-',1) <= %d;
    """ % year
    c.execute(query)
    
    drop_temporary_table()
    
    drug_reports = dict()
    for sid, rid in c.fetchall():
        if not sid in drug_reports:
            drug_reports[sid] = set()
        
        drug_reports[sid].add(rid)
    
    # 1d. Build a contingency table for each drug to determine the significance
    #     of it's correlation with the given drug.
    #
    #   1,1: number of report ids shared by both the given and other drug
    #   1,2: number of report ids for only the given drug
    #   2,1: number of report ids for only the other drug
    #   2,2: number of report ids not for either drug
    correlated_drugs = list()
    for sid,rids in drug_reports.items():
        contingency = list()
        contingency.append( len(rids & report_ids) )
        contingency.append( len(report_ids - rids) )
        contingency.append( len(rids - report_ids) )
        contingency.append( len(all_report_ids - (rids | report_ids)) )
        
        if contingency[0] > 0:
            pvalue = robjects.r("fisher.test(matrix(c(%s),nrow=2))$p.value" % ",".join(map(str,contingency)))[0]
            if pvalue < 0.05:
                correlated_drugs.append(sid)
    
    # 2. We now need to identify the correlated indications.
    
    # 2a. Identify the candiate indications.
    query = """
    select indication_descrip_term
    from drugs
    join aers2stitch using (drug_name)
    join demographics using (isr_report_id)
    join indications using (isr_report_id)
    where stitch_id = "%s"
    and not event_date = '0000-00-00'
    and substring_index(event_date, '-',1) <= %d;
    """ % (stitch_id, year)
    c.execute(query)
    
    indications = set([row[0] for row in c.fetchall()])
    
    # 2b. Pull out the reports for the indications.
    create_temporary_table(indications, 'indication_descrip_term', 'varchar', '100')
    
    query = """
    select indication_descrip_term, isr_report_id
    from drugs
    join aers2stitch using (drug_name)
    join demographics using (isr_report_id)
    join indications using (isr_report_id)
    join nick.temporary_table using (indication_descrip_term)
    where not event_date = '0000-00-00'
    and substring_index(event_date, '-',1) <= %d;
    """ % year
    c.execute(query)
    
    drop_temporary_table()
    
    indication_reports = dict()
    for iid, rid in c.fetchall():
        if not iid in indication_reports:
            indication_reports[iid] = set()
        
        indication_reports[iid].add(rid)
    
    # 2c. Identify the significantly correlated reports.
    correlated_indications = list()
    for iid,rids in indication_reports.items():
        contingency = list()
        contingency.append( len(rids & report_ids) )
        contingency.append( len(report_ids - rids) )
        contingency.append( len(rids - report_ids) )
        contingency.append( len(all_report_ids - (rids | report_ids)) )
        
        if contingency[0] > 0:
            pvalue = robjects.r("fisher.test(matrix(c(%s),nrow=2))$p.value" % ",".join(map(str,contingency)))[0]
            if pvalue < 0.05:
                correlated_indications.append(iid)
    
    # 3. Pull out the report events that will make up the background.
    
    background_report_ids = reduce(operator.or_, [drug_reports[sid] for sid in correlated_drugs])
    background_report_ids |= reduce(operator.or_, [indication_reports[iid] for iid in correlated_indications])
    
    create_temporary_table(background_report_ids, 'isr_report_id', 'decimal', '10,0')
    
    query = """
    select isr_report_id, umls_id
    from reactions
    join aers2umls using (event_descrip_term)
    join nick.temporary_table using (isr_report_id)
    """
    c.execute(query)
    
    drop_temporary_table()
    
    background_report_events = dict()
    for rid,uid in c.fetchall():
        if not rid in background_report_events:
            background_report_events[rid] = set()
        
        background_report_events[rid].add(uid)
    
    # 4. Pull out the report events that will make up the cases.
    create_temporary_table(report_ids, 'isr_report_id', 'decimal', '10,0')
    
    query = """
    select isr_report_id, umls_id
    from reactions
    join aers2umls using (event_descrip_term)
    join nick.temporary_table using (isr_report_id)
    """
    c.execute(query)
    
    drop_temporary_table()
    
    report_events = dict()
    for rid,uid in c.fetchall():
        if not rid in report_events:
            report_events[rid] = set()
        
        report_events[rid].add(uid)
    
    umls_ids = reduce(operator.or_, background_report_events.values()) & reduce(operator.or_, report_events.values())
    
    writer = csv.writer(outfh)
    
    for uid in umls_ids:
        
        background_vector = [1 if uid in uids else 0 for uids in background_report_events.values()]
        foreground_vector = [1 if uid in uids else 0 for uids in report_events.values()]
        
        if len(foreground_vector) < 10 or len(background_vector) < 10:
            continue
        
        background_sample_size = max(10, len(background_vector)/10)
        foreground_sample_size = max(10, len(foreground_vector)/10)
        
        background_means = [numpy.mean(random.sample(background_vector, background_sample_size)) for i in range(100)]
        foreground_means = [numpy.mean(random.sample(foreground_vector, foreground_sample_size)) for i in range(100)]
        
        pvalue = robjects.r("t.test(c(%s), c(%s))$p.value" % (",".join(map(str,background_means)), ",".join(map(str,foreground_means))))[0]
        
        bg_mu = numpy.mean(background_means)
        bg_sd = numpy.std(background_means)
        fg_mu = numpy.mean(foreground_means)
        fg_sd = numpy.std(foreground_means)
        
        writer.writerow([stitch_id, uid, year, fg_mu, fg_sd, bg_mu, bg_sd, pvalue])
    
    year += 1

outfh.close()