<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">"""
Discover adverse events significantly associated with drugs.
"""

import R
import csv
import sys
import numpy
import random
import MySQLdb
import tempfile
import operator
from namedmatrix import NamedMatrix

db = MySQLdb.connect(host="localhost", port=3307, user="root", passwd="enter_your_password",db="project_aers")
c = db.cursor()

# Define the drug we are interested in.
# cid = 'CID000005090' # Vioxx
cid = sys.argv[1]

print &gt;&gt; sys.stderr, "%s: Building the drug data matrix and file." % cid
# Populate the drug_data
query = """
select report_id, stitch_id, drug_name, umls_id, adverse_event
from drug_report_event
where stitch_id = '%s';
""" % cid
c.execute(query)

report_data = [row for row in c.fetchall()]
drug_report_ids = sorted(set([r[0] for r in report_data]))
umls_ids = sorted(set([r[3] for r in report_data]))

drug_data = dict()
for row in report_data:
    report_id = str(row[0])
    umls_id = str(row[3])
    if not drug_data.has_key(umls_id):
        drug_data[umls_id] = set()
    drug_data[umls_id].add(report_id)

drug_data_file = tempfile.mkstemp(suffix=".csv")[-1]
nmeans = 20
nfreqs = 1000
drug_file = open(drug_data_file,'w')
writer = csv.writer(drug_file)

for j,umls_id in enumerate(umls_ids):
    if j % 100 == 0:
        print &gt;&gt; sys.stderr, ".",
    
    sample_size = max(10, int(1.0/ ((len(drug_data[umls_id])/float(len(drug_report_ids))))) / 100)
    # a = time.time()
    means = []
    for i in range(nmeans):
        freqs = []
        for i in range(nfreqs):
            reports = set(random.sample(drug_report_ids, sample_size))
            freqs.append( len( reports &amp; drug_data[umls_id] ) / float(sample_size) )
        means.append(numpy.mean(freqs))
    
    # print time.time()-a
    writer.writerow([umls_id] + means)

drug_file.close()

# Clean up some variables.
del drug_data
del report_data

print &gt;&gt; sys.stderr, "%s: Building the background data matrix and file." % cid

# Get the background data.
query = """
select report_id, stitch_id, umls_id, adverse_event
from drug_report_event join
(
	select stitch_id2 as stitch_id
	from corr_drug_drug
	where stitch_id1 = '%s'
	and corrected &lt; 0.05
) a
using (stitch_id);
""" % cid
c.execute(query)

report_data = [row for row in c.fetchall()]
bg_report_ids = map(str, sorted(set([r[0] for r in report_data]) - set(drug_report_ids) ))

bg_data = dict()

for row in report_data:
    report_id = str(row[0])
    umls_id = str(row[2])
    
    if not bg_data.has_key(umls_id):
        bg_data[umls_id] = set()
    bg_data[umls_id].add(report_id)

# ssize1 = int(min(100,max(10,0.01*len(bg_report_ids))))
# ssize2 = int(min(100,max(10,0.01*len(drug_report_ids))))

bg_data_file = tempfile.mkstemp(suffix=".csv")[-1]

# sample_size = 10

nmeans = 20
nfreqs = 1000
bg_file = open(bg_data_file,'w')
writer = csv.writer(bg_file)

for j,umls_id in enumerate(umls_ids):
    if j % 100 == 0:
        print &gt;&gt; sys.stderr, ".",
    
    sample_size = max(10, int(1.0/ ((len(bg_data[umls_id])/float(len(bg_report_ids))))) / 100)
    # a = time.time()
    means = []
    for i in range(nmeans):
        freqs = []
        for i in range(nfreqs):
            bg_reports = set(random.sample(bg_report_ids, sample_size))
            freqs.append( len( bg_reports &amp; bg_data[umls_id] ) / float(sample_size) )
        means.append(numpy.mean(freqs))
    
    # print time.time()-a
    writer.writerow([umls_id] + means)

bg_file.close()

print &gt;&gt; sys.stderr, "%s: Running the statistical analysis." % cid

result_file = "/Volumes/UsersHD/Users/nick/Stanford/AltmanLab/aers/part2/%s.csv" % cid
r_string = """
bg_data &lt;- read.csv('%s')
rownames(bg_data) &lt;- bg_data[,1]
bg_data &lt;- bg_data[,-1]

drug_data &lt;- read.csv('%s')
rownames(drug_data) &lt;- drug_data[,1]
drug_data &lt;- drug_data[,-1]

results &lt;- matrix(rep(c(NA,NA,NA),ncol(drug_data)), ncol=3)
colnames(results) &lt;- c("pvalue","drug_mean","bg_mean")
rownames(results) &lt;- colnames(drug_data)

nmeans &lt;- 1000
ssize1 &lt;- round(max(10,0.01*nrow(bg_data)))
ssize2 &lt;- round(max(10,0.01*nrow(drug_data)))
write(paste("%s:","nmeans=",nmeans,"sizes:",ssize1,ssize2, "lengths:",nrow(bg_data),nrow(drug_data)), stderr())

for (i in 1:nrow(results)) {
	#write(paste("Working on ",i,"of ",nrow(results)), stderr())
	vals1 &lt;- bg_data[,i]
	vals2 &lt;- drug_data[,i]
	
	v1means &lt;- rep(0, nmeans)
	v1means &lt;- unlist(lapply(v1means,function(x){mean(sample(vals1,ssize1))}))
	
	v2means &lt;- rep(0, nmeans)
	v2means &lt;- unlist(lapply(v2means,function(x){mean(sample(vals2,ssize2))}))
	
	results[i,1] &lt;- t.test(v1means, v2means)$p.value
	results[i,2] &lt;- mean(v2means)
	results[i,3] &lt;- mean(v1means)
}

write.csv(results,file="%s",quote=F)
""" % (bg_data_file, drug_data_file, cid, result_file)

R.runRString(r_string)

print &gt;&gt; sys.stderr, "%s: Completed run, result saved to %s" % (cid, result_file)
</pre></body></html>