!------------------------------------------------------------------------- ! Discover drug-adverse event interactions (part 4) ! ! In part 4 we are simply deconvolving for pairs drugs using all reports. ! ! (c) 2010 Nicholas P. Tatonetti, Stanford University ! nick.tatonetti@stanford.edu !------------------------------------------------------------------------- !------------------------------------------------------------------------- ! This is a remake of the first p4 script. The difference here is that ! we are now treating pairs as one entity and have recomputed all of the ! covariances based on pairs rather than individual drugs. This is a more ! appropriate way to generalize the method. ! ! ! CLUSTER RUNNING ! ! for ((i=1;i<14558;i+=250)); do bsub -e output/pairs_$i.txt -o results/pairs_$i.txt ./discover_ae_p4_pairs $i 250; done ! ! PRECISION - select the basic precision for the computations. ! module precision integer,parameter::R=kind(1.0) ! IEEE single precision or similar integer,parameter::D=kind(1.0d0) ! IEEE double precision or similar integer,parameter::I64=selected_int_kind(18) ! 64-bit int on most machines end module precision module functions use precision contains integer function num_pair_reports(drug1, drug2, stitch_reportcount, num_stitch, stitch2report, max_stitch_reports, num_reports) integer,intent(in)::drug1, drug2, num_stitch, num_reports, max_stitch_reports integer,intent(in)::stitch_reportcount(num_stitch) integer,intent(in)::stitch2report(num_stitch, max_stitch_reports) integer::reports(num_reports) reports = 0 do i=1,stitch_reportcount(drug1) reports(stitch2report(drug1, i)) = 1 enddo do i=1,stitch_reportcount(drug2) reports(stitch2report(drug2, i)) = reports(stitch2report(drug2, i)) + 1 enddo do i=1,num_reports if (reports(i).eq.2) then reports(i) = 1 else reports(i) = 0 endif enddo num_pair_reports = sum(reports) end function num_pair_reports subroutine vi_max(vector, length, max) integer,intent(in)::length integer,intent(in)::vector(length) integer,intent(inout)::max max = 0 do i=1,length if (i.eq.1) then max = vector(i) else if (vector(i).gt.max) then max = vector(i) endif endif enddo end subroutine vi_max subroutine next_rand(rand, randvec, randptr) integer,intent(inout)::randptr,rand integer,intent(in)::randvec(1100000) rand = randvec(randptr) randptr = randptr + 1 if (randptr.gt.1100000) then randptr = 1 endif end subroutine next_rand subroutine vector_mean(mu, vector, n) integer,intent(in)::n real(D),intent(out)::mu integer,intent(in)::vector(n) integer::i mu = 0.0 do i=1,n mu = mu + real(vector(i)) enddo mu = mu / real(n) end subroutine vector_mean subroutine vector_mean_real(mu, vector, n) integer,intent(in)::n real(D),intent(out)::mu real(D),intent(in)::vector(n) integer::i mu = 0.0 do i=1,n mu = mu + vector(i) enddo mu = mu / real(n) end subroutine vector_mean_real subroutine vector_std_real(std, mu, vector, n) integer,intent(in)::n real(D),intent(in)::mu real(D),intent(out)::std real(D),intent(in)::vector(n) integer::i std = 0.0 do i=1,n std = std + (vector(i) - mu)*(vector(i) - mu) enddo std = sqrt( std / real(n-1) ) end subroutine vector_std_real subroutine vector_sample(sample, size, vector, n, randvec, randptr) integer,intent(in)::size,n integer,intent(in)::vector(n) integer,intent(inout)::sample(size), randvec(1100000), randptr integer::i,randint do i=1,size call next_rand(randint, randvec, randptr) sample(i) = vector(mod(randint,n)+1) enddo end subroutine vector_sample subroutine bootstrap_statistics(mu, variance, vector, vector_length, randvec, randptr) real(D),intent(inout)::mu, variance integer,intent(in)::vector_length integer,intent(inout)::randptr integer,intent(inout)::randvec(1100000),vector(vector_length) real(D)::estimates(1000) integer::rand, sample_size, i integer,dimension(:),allocatable::sample sample_size = vector_length / 100 if (sample_size.lt.10) then sample_size = 10 endif if (sample_size.gt.10000) then sample_size = 10000 endif allocate(sample(sample_size)) sample = 0 do i=1,1000 call vector_sample( sample, sample_size, vector, vector_length, randvec, randptr ) call vector_mean( estimates(i), sample, sample_size) enddo call vector_mean_real( mu, estimates, 1000) call vector_std_real( variance, mu, estimates, 1000) end subroutine bootstrap_statistics end module functions program discover use precision use functions implicit none integer::i,j,k,l,m,i2 integer::start_index, step_size, stop_index ! This is initialized from the stdin to enable easy multi-processing. integer::num_stitch, num_reports, num_events, num_inds, min_bg_reports, num_pairs integer::max_reportcount, max_eventcount, max_indreportcount, num_reports_shared integer::drug1, drug2 real(D)::temp_real integer,dimension(:),allocatable::stitch_reportcount, report_umlscount, ind_reportcount integer,dimension(:),allocatable::shared_reports integer,dimension(:),allocatable::temp_vector, temp_vector2 integer,dimension(:,:),allocatable::report2umls ! Binary matrix that stores which events are on which reports integer,dimension(:,:),allocatable::stitch2report ! Binary matrix that stores which drugs are on which reports. integer,dimension(:,:),allocatable::ind2report ! Binary matrix that stores which indications are on which reports. integer,dimension(:,:),allocatable::pair2stitch ! Np x 2 matrix of the stitch ids real(D),dimension(4)::results ! A vector which stores fg_mean, fg_sd, fg_n, bg_mean, bg_sd, bg_n for the current interation. real(D),dimension(:,:),allocatable::pair_stitch, pair_ind real(D),dimension(:),allocatable::pair_stitch_threshold, pair_ind_threshold character*10::argument integer::randvec(1100000),randptr,randint ! ================= INITIALIZATION ==================== ! Initialize the data matrices and vectors. if (iargc().ne.2) then write(0,*) "Wrong number of arguments provided." endif call getarg(1, argument) read(argument, *) start_index call getarg(2, argument) read(argument, *) step_size stop_index = start_index + step_size - 1 write(0,*) "========= Discover Pair-AE: ", start_index, " to ~", stop_index, "==========" open(unit = 13, file = "random_numbers.vec") randptr = 726 do i=1,1100000 read(13,*) randvec(i) enddo open(unit = 10, file= "aers_entity_counts_pairs.csv") read(10, *), num_stitch, num_reports, num_events, num_inds, num_pairs close(10) if (num_pairs.lt.stop_index) then stop_index = num_pairs write(0,*) "Adjusting stop_index to ", stop_index endif write(0,*) "Found ", num_stitch, " drugs, ", num_reports, " reports, " write(0,*) "and ", num_inds, " indications, ", num_events, " adverse events," write(0,*) "and ", num_pairs, " pairs of drugs." allocate(pair_stitch(num_pairs, num_stitch)) allocate(pair_stitch_threshold(num_pairs)) allocate(pair_ind_threshold(num_pairs)) allocate(pair_ind(num_pairs, num_inds)) ! Load up how many reports we have for each drug. open(unit = 10, file= "aers_stitch_reportcount.txt") allocate(stitch_reportcount(num_stitch)) max_reportcount = 0 do i=1,num_stitch read(10, *) stitch_reportcount(i) if (stitch_reportcount(i).gt.max_reportcount) then max_reportcount = stitch_reportcount(i) endif enddo close(10) ! Load up the stitch2report open(unit = 11, file= "aers_stitch_reportlist.csv") allocate( stitch2report( num_stitch, max_reportcount ) ) stitch2report = 0 do i=1,num_stitch allocate( temp_vector( stitch_reportcount(i) ) ) read(11, *) temp_vector do j=1,stitch_reportcount(i) stitch2report(i, j) = temp_vector(j) enddo deallocate( temp_vector ) enddo close(11) write(0,*) "Loaded stitch2report matrix (", num_stitch, ",", max_reportcount, ")" ! Load up how many events we have for each report. open(unit = 10, file= "aers_report_umlscount.txt") allocate(report_umlscount(num_reports)) max_eventcount = 0 do i=1,num_reports read(10, *) report_umlscount(i) if (report_umlscount(i).gt.max_eventcount) then max_eventcount = report_umlscount(i) endif enddo close(10) ! Load up the event data. open(unit = 11, file= "aers_report_umlslist.csv") allocate( report2umls( num_reports, max_eventcount ) ) report2umls = 0 do i=1,num_reports allocate( temp_vector( report_umlscount(i) ) ) read(11, *) temp_vector do j=1,report_umlscount(i) report2umls(i, j) = temp_vector(j) enddo deallocate( temp_vector ) enddo close(11) write(0,*) "Loaded report2umls matrix (", num_reports, ",", max_eventcount, ")" ! Load up the indicaiton report data. open(unit = 10, file= "aers_ind_reportcount.txt") allocate(ind_reportcount(num_inds)) max_indreportcount = 0 do i=1,num_inds read(10, *) ind_reportcount(i) if (ind_reportcount(i).gt.max_indreportcount) then max_indreportcount = ind_reportcount(i) endif enddo close(10) open(unit = 11, file= "aers_ind_reportlist.csv") allocate( ind2report( num_inds, max_indreportcount ) ) ind2report = 0 do i=1,num_inds allocate( temp_vector( ind_reportcount(i) ) ) read(11, *) temp_vector do j=1,ind_reportcount(i) ind2report(i, j) = temp_vector(j) enddo deallocate( temp_vector ) enddo close(11) write(0,*) "Loaded ind2report matrix (", num_inds, ",", max_indreportcount, ")" ! Load in the members of each pair according to their fortran index (from stitch2index.csv). open(unit = 11, file="aers_pairs2stitch.txt") allocate(pair2stitch(num_pairs, 2)) do i=1,num_pairs read(11, *) pair2stitch(i,:) enddo close(11) write(0,*) "Loaded pair2stitch matrix (", num_pairs, ", 2 )" ! Load up the pair-drug correlation significance matrix. open(unit = 11, file= "aers_pair_stitch_est.csv") ! Type _e (estimates rather than pvalues are used) pair_stitch = 0 do i=1,num_pairs read(11, *) pair_stitch(i,:) enddo close(11) open(unit = 11, file= "aers_pair_ind_est.csv") ! Type _e pair_ind = 0 do i=1,num_pairs read(11, *) pair_ind(i,:) enddo close(11) ! =============== IDENTIFY THE PAIR-DRUG BG THRESHOLD ================== write(0,*) "Identifying the pair-drug correlation thresholds..." pair_stitch_threshold = -1 allocate(temp_vector(num_reports)) do i=1,num_pairs ! store the indices for the two drugs of the pair drug1 = pair2stitch(i,1) drug2 = pair2stitch(i,2) min_bg_reports = 5*num_pair_reports(drug1, drug2, stitch_reportcount, num_stitch, stitch2report, max_reportcount, num_reports) ! write(0,*) "Min bg reports for ", drug1, " and ", drug2, " is ", min_bg_reports temp_real = 1.0 ! Type _e temp_vector = 0 do while ((sum(temp_vector).lt.min_bg_reports).and.(temp_real.lt.10000)) ! Type _e temp_real = temp_real + 1.0 do j=1,num_stitch if (pair_stitch(i,j).ge.(1.0/temp_real)) then ! Type _e (if the correlation estimate is > 1/X) do k=1,stitch_reportcount(j) temp_vector(stitch2report(j,k)) = 1 enddo endif enddo ! Set reports for foreground back to zero do k=1,stitch_reportcount(drug1) temp_vector(stitch2report(drug1,k)) = 0 enddo do k=1,stitch_reportcount(drug2) temp_vector(stitch2report(drug2,k)) = 0 enddo enddo pair_stitch_threshold(i) = 1.0/temp_real ! Type _e ! write(0,*) "Threshold found was ", pair_stitch_threshold(i), "size was ", sum(temp_vector) enddo deallocate(temp_vector) ! write(0,*) pair_stitch_threshold(1:5) ! =============== IDENTIFY THE PAIR-INDICATION BG THRESHOLD ================== write(0,*) "Identifying the pair-ind correlation thresholds..." ! The following is set up for Type _e pair_ind_threshold = -1 allocate(temp_vector(num_reports)) do i=1,num_pairs ! store the indices for the two drugs of the pair drug1 = pair2stitch(i,1) drug2 = pair2stitch(i,2) min_bg_reports = 5*num_pair_reports(drug1, drug2, stitch_reportcount, num_stitch, stitch2report, max_reportcount, num_reports) temp_real = 1.0 temp_vector = 0 do while ((sum(temp_vector).lt.min_bg_reports).and.(temp_real.lt.10000)) temp_real = temp_real + 1.0 do j=1,num_inds if (pair_ind(i,j).ge.(1.0/temp_real)) then do k=1,ind_reportcount(j) temp_vector(ind2report(j,k)) = 1 enddo endif enddo ! Set reports for foreground back to zero do k=1,stitch_reportcount(drug1) temp_vector(stitch2report(drug1,k)) = 0 enddo do k=1,stitch_reportcount(drug2) temp_vector(stitch2report(drug2,k)) = 0 enddo enddo pair_ind_threshold(i) = 1.0/temp_real enddo deallocate(temp_vector) ! write(0,*) pair_ind_threshold(1:5) ! =============== BOOTSTRAP THOSE STATISTICS ================== results = 0 do i=start_index,stop_index drug1 = pair2stitch(i,1) drug2 = pair2stitch(i,2) allocate(temp_vector(num_reports)) temp_vector = 0 do k=1,stitch_reportcount(drug1) temp_vector( stitch2report(drug1, k) ) = 1 enddo do k=1,stitch_reportcount(drug2) temp_vector( stitch2report(drug2, k) ) = temp_vector( stitch2report(drug2, k) ) + 1 enddo num_reports_shared = 0 do k=1,num_reports if (temp_vector(k).eq.2) then num_reports_shared = num_reports_shared + 1 endif enddo allocate( shared_reports(num_reports_shared) ) shared_reports = -1 l = 1 do k=1,num_reports if (temp_vector(k).eq.2) then shared_reports(l) = k l = l + 1 endif enddo deallocate(temp_vector) write(0,*) 'Working on the drug pair: (', drug1, ', ', drug2, '), fg reports: ', num_reports_shared do j=1,num_events results = 0 ! Given a drug, i, and an adverse event, j... ! 1. Construct a vector of reports for this drug pair. !! Remember 'num_reports_shared' is holding the number of reports this pair shares. allocate( temp_vector(num_reports_shared) ) temp_vector = 0 do k=1,num_reports_shared ! 2. Set vector element equal to 1 if the event is on this report ! "k" is now the index of the report id we want to check for this event do l=1,report_umlscount(shared_reports(k)) if (report2umls(shared_reports(k),l).eq.j) then temp_vector(k) = 1 endif enddo enddo if (sum(temp_vector).gt.0) then call bootstrap_statistics(results(1), results(2), temp_vector, num_reports_shared, randvec, randptr) deallocate(temp_vector) allocate( temp_vector(num_reports) ) temp_vector = 0 ! Turn "on" the reports for the correlated drugs. do k=1,num_stitch ! If the "other" (ie. k) drug passes the muster for this drug pair we turn the report on if ((pair_stitch(i,k).ge.pair_stitch_threshold(i))) then do l=1,stitch_reportcount(k) temp_vector( stitch2report(k,l) ) = 1 enddo endif enddo ! Turn "on" the reports for the correlated indications. do k=1,num_inds ! If the "other" (ie. k) drug passes the muster for the pair we include it's reports. if ((pair_ind(i,k).ge.pair_ind_threshold(i))) then do l=1,ind_reportcount(k) temp_vector( ind2report(k,l) ) = 1 enddo endif enddo ! Turn "off" the reports for the drugs do k=1,stitch_reportcount(drug1) temp_vector( stitch2report(drug1, k) ) = 0 enddo do k=1,stitch_reportcount(drug2) temp_vector( stitch2report(drug2, k) ) = 0 enddo allocate( temp_vector2( sum(temp_vector) ) ) temp_vector2 = 0 m = 0 do k=1,num_reports if (temp_vector(k).eq.1) then m = m + 1 do l=1,report_umlscount(k) if (report2umls(k,l).eq.j) then temp_vector2(m) = 1 endif enddo endif enddo call bootstrap_statistics( results(3), results(4), temp_vector2, sum(temp_vector), randvec, randptr) write(6,"(I10,I10,I10,ES20.10,ES20.10,ES20.10,ES20.10,I10,I10)") drug1,drug2,j,results,num_reports_shared,sum(temp_vector) temp_vector2 = 1 deallocate( temp_vector2 ) endif ! end check if this drug pair has any shared reports deallocate( temp_vector ) enddo deallocate( shared_reports ) enddo ! ========= DEBUGGING STUFF ========== ! do i=1,num_stitch ! k = 0 ! do j=1,max_reportcount ! if (stitch2report(i,j).ne.0) then ! k = k + 1 ! endif ! enddo ! if (k.ne.stitch_reportcount(i)) then ! write(0,*) "Error in reading stitch2report data for stitch at index ", i ! endif ! enddo ! ! do i=1,num_reports ! k = 0 ! do j=1,max_eventcount ! if (report2umls(i,j).ne.0) then ! k = k + 1 ! endif ! enddo ! if (k.ne.report_umlscount(i)) then ! write(0,*) "Error in reading stitch2report data for stitch at index ", i ! endif ! enddo ! ! write(0,*) stitch_reportcount(1:10) ! We're done. deallocate(stitch_reportcount) end program discover