!-------------------------------------------------------------------------
! Discover drug-adverse event interactions (part 4)
! 
! In part 4 we are simply deconvolving for pairs drugs using all reports.
!
! (c) 2010 Nicholas P. Tatonetti, Stanford University
! nick.tatonetti@stanford.edu
!-------------------------------------------------------------------------

!-------------------------------------------------------------------------
! This is a remake of the first p4 script. The difference here is that
! we are now treating pairs as one entity and have recomputed all of the
! covariances based on pairs rather than individual drugs. This is a more
! appropriate way to generalize the method. 
!
! 
! CLUSTER RUNNING
! 
! for ((i=1;i<14558;i+=250)); do bsub -e output/pairs_$i.txt -o results/pairs_$i.txt ./discover_ae_p4_pairs $i 250; done
! 


! PRECISION - select the basic precision for the computations.
!

module precision
    integer,parameter::R=kind(1.0) ! IEEE single precision or similar
    integer,parameter::D=kind(1.0d0) ! IEEE double precision or similar
    integer,parameter::I64=selected_int_kind(18) ! 64-bit int on most machines
end module precision

module functions 
      use precision
      contains
      
      integer function num_pair_reports(drug1, drug2, stitch_reportcount, num_stitch, stitch2report, max_stitch_reports, num_reports)
        
        integer,intent(in)::drug1, drug2, num_stitch, num_reports, max_stitch_reports
        integer,intent(in)::stitch_reportcount(num_stitch)
        integer,intent(in)::stitch2report(num_stitch, max_stitch_reports)
        integer::reports(num_reports)
        
        reports = 0
        
        do i=1,stitch_reportcount(drug1)
          reports(stitch2report(drug1, i)) = 1
        enddo
        
        do i=1,stitch_reportcount(drug2)
          reports(stitch2report(drug2, i)) = reports(stitch2report(drug2, i)) + 1
        enddo
        
        do i=1,num_reports
          if (reports(i).eq.2) then
            reports(i) = 1
          else
            reports(i) = 0
          endif
        enddo
        
        num_pair_reports = sum(reports)
        
      end function num_pair_reports
      
      subroutine vi_max(vector, length, max)
            
            integer,intent(in)::length
            integer,intent(in)::vector(length)
            integer,intent(inout)::max
            
            max = 0
            do i=1,length
                  if (i.eq.1) then
                        max = vector(i)
                  else
                        if (vector(i).gt.max) then
                              max = vector(i)
                        endif
                  endif
            enddo
            
      end subroutine vi_max
      
      subroutine next_rand(rand, randvec, randptr)
            
            integer,intent(inout)::randptr,rand
            integer,intent(in)::randvec(1100000)
            
            rand = randvec(randptr)
            randptr = randptr + 1
            if (randptr.gt.1100000) then
                  randptr = 1
            endif
            
      end subroutine next_rand
      
      subroutine vector_mean(mu, vector, n)
            
            integer,intent(in)::n
            real(D),intent(out)::mu
            integer,intent(in)::vector(n)
            
            integer::i
            
            mu = 0.0
            do i=1,n
                  mu = mu + real(vector(i))
            enddo
            
            mu = mu / real(n)
            
      end subroutine vector_mean
      
      subroutine vector_mean_real(mu, vector, n)
            
            integer,intent(in)::n
            real(D),intent(out)::mu
            real(D),intent(in)::vector(n)
            
            integer::i
            
            mu = 0.0
            do i=1,n
                  mu = mu + vector(i)
            enddo
            
            mu = mu / real(n)
            
      end subroutine vector_mean_real
      
      subroutine vector_std_real(std, mu, vector, n)
            
            integer,intent(in)::n
            real(D),intent(in)::mu
            real(D),intent(out)::std
            real(D),intent(in)::vector(n)
            
            integer::i
            
            std = 0.0
            do i=1,n
                  std = std + (vector(i) - mu)*(vector(i) - mu)
            enddo
            
            std = sqrt( std / real(n-1) )
            
      end subroutine vector_std_real
      
      subroutine vector_sample(sample, size, vector, n, randvec, randptr)
            
            integer,intent(in)::size,n
            integer,intent(in)::vector(n)
            integer,intent(inout)::sample(size), randvec(1100000), randptr
            
            integer::i,randint
            do i=1,size
                  call next_rand(randint, randvec, randptr)
                  sample(i) = vector(mod(randint,n)+1)
            enddo
            
      end subroutine vector_sample
      
      subroutine bootstrap_statistics(mu, variance, vector, vector_length, randvec, randptr)
            
            real(D),intent(inout)::mu, variance
            integer,intent(in)::vector_length
            integer,intent(inout)::randptr
            integer,intent(inout)::randvec(1100000),vector(vector_length)
            
            real(D)::estimates(1000)
            integer::rand, sample_size, i
            integer,dimension(:),allocatable::sample
            
            sample_size = vector_length / 100
            if (sample_size.lt.10) then
                  sample_size = 10
            endif
            if (sample_size.gt.10000) then
                  sample_size = 10000
            endif
            
            allocate(sample(sample_size))
            sample = 0
            
            do i=1,1000
                  call vector_sample( sample, sample_size, vector, vector_length, randvec, randptr )
                  call vector_mean( estimates(i), sample, sample_size)
            enddo
            
            call vector_mean_real( mu, estimates, 1000)
            call vector_std_real( variance, mu, estimates, 1000)
            
      end subroutine bootstrap_statistics
      
end module functions

program discover
      
      use precision
      use functions
      implicit none
      
      integer::i,j,k,l,m,i2
      integer::start_index, step_size, stop_index ! This is initialized from the stdin to enable easy multi-processing.
      integer::num_stitch, num_reports, num_events, num_inds, min_bg_reports, num_pairs
      integer::max_reportcount, max_eventcount, max_indreportcount, num_reports_shared
      integer::drug1, drug2
      real(D)::temp_real
      
      integer,dimension(:),allocatable::stitch_reportcount, report_umlscount, ind_reportcount
      integer,dimension(:),allocatable::shared_reports
      integer,dimension(:),allocatable::temp_vector, temp_vector2
      integer,dimension(:,:),allocatable::report2umls ! Binary matrix that stores which events are on which reports
      integer,dimension(:,:),allocatable::stitch2report ! Binary matrix that stores which drugs are on which reports.
      integer,dimension(:,:),allocatable::ind2report ! Binary matrix that stores which indications are on which reports.
      integer,dimension(:,:),allocatable::pair2stitch ! Np x 2 matrix of the stitch ids
      real(D),dimension(4)::results ! A vector which stores fg_mean, fg_sd, fg_n, bg_mean, bg_sd, bg_n for the current interation.
      real(D),dimension(:,:),allocatable::pair_stitch, pair_ind
      real(D),dimension(:),allocatable::pair_stitch_threshold, pair_ind_threshold
      character*10::argument
      
      integer::randvec(1100000),randptr,randint
      
!     ================= INITIALIZATION ====================
      ! Initialize the data matrices and vectors.
      
      if (iargc().ne.2) then 
          write(0,*) "Wrong number of arguments provided."
      endif
      
      call getarg(1, argument)
      read(argument, *) start_index
      
      call getarg(2, argument)
      read(argument, *) step_size
      
      stop_index = start_index + step_size - 1
      write(0,*) "========= Discover Pair-AE: ", start_index, " to ~", stop_index, "=========="
      
      open(unit = 13, file = "random_numbers.vec")
      randptr = 726
      do i=1,1100000
            read(13,*) randvec(i)
      enddo
      
      open(unit = 10, file= "aers_entity_counts_pairs.csv")
      read(10, *), num_stitch, num_reports, num_events, num_inds, num_pairs
      close(10)
      
      if (num_pairs.lt.stop_index) then
            stop_index = num_pairs
            write(0,*) "Adjusting stop_index to ", stop_index
      endif
      
      write(0,*) "Found ", num_stitch, " drugs, ", num_reports, " reports, "
      write(0,*) "and ", num_inds, " indications, ", num_events, " adverse events,"
      write(0,*) "and ", num_pairs, " pairs of drugs."
      
      allocate(pair_stitch(num_pairs, num_stitch))
      allocate(pair_stitch_threshold(num_pairs))
      allocate(pair_ind_threshold(num_pairs))
      allocate(pair_ind(num_pairs, num_inds))
      
      ! Load up how many reports we have for each drug.
      open(unit = 10, file= "aers_stitch_reportcount.txt")
      
      allocate(stitch_reportcount(num_stitch))
      max_reportcount = 0
      do i=1,num_stitch
            read(10, *) stitch_reportcount(i)
            if (stitch_reportcount(i).gt.max_reportcount) then
                  max_reportcount = stitch_reportcount(i)
            endif
      enddo
      close(10)
      
      ! Load up the stitch2report
      open(unit = 11, file= "aers_stitch_reportlist.csv")
      
      allocate( stitch2report( num_stitch, max_reportcount ) )
      stitch2report = 0
      
      do i=1,num_stitch
            allocate( temp_vector( stitch_reportcount(i) ) )
            read(11, *) temp_vector
            do j=1,stitch_reportcount(i)
                  stitch2report(i, j) = temp_vector(j)
            enddo
            deallocate( temp_vector )
      enddo
      close(11)
      
      write(0,*) "Loaded stitch2report matrix (", num_stitch, ",", max_reportcount, ")"
      
      ! Load up how many events we have for each report.
      open(unit = 10, file= "aers_report_umlscount.txt")
      
      allocate(report_umlscount(num_reports))
      max_eventcount = 0
      do i=1,num_reports
            read(10, *) report_umlscount(i)
            if (report_umlscount(i).gt.max_eventcount) then
                  max_eventcount = report_umlscount(i)
            endif
      enddo
      close(10)
      
      ! Load up the event data.
      open(unit = 11, file= "aers_report_umlslist.csv")
      
      allocate( report2umls( num_reports, max_eventcount ) )
      report2umls = 0
      
      do i=1,num_reports
            allocate( temp_vector( report_umlscount(i) ) )
            read(11, *) temp_vector
            do j=1,report_umlscount(i)
                  report2umls(i, j) = temp_vector(j)
            enddo
            deallocate( temp_vector )
      enddo
      close(11)
      
      write(0,*) "Loaded report2umls matrix (", num_reports, ",", max_eventcount, ")"
      
      ! Load up the indicaiton report data.
      open(unit = 10, file= "aers_ind_reportcount.txt")
      
      allocate(ind_reportcount(num_inds))
      max_indreportcount = 0
      do i=1,num_inds
            read(10, *) ind_reportcount(i)
            if (ind_reportcount(i).gt.max_indreportcount) then
                  max_indreportcount = ind_reportcount(i)
            endif
      enddo
      close(10)
      
      open(unit = 11, file= "aers_ind_reportlist.csv")
      allocate( ind2report( num_inds, max_indreportcount ) )
      ind2report = 0
      
      do i=1,num_inds
            allocate( temp_vector( ind_reportcount(i) ) )
            read(11, *) temp_vector
            do j=1,ind_reportcount(i)
                  ind2report(i, j) = temp_vector(j)
            enddo
            deallocate( temp_vector )
      enddo
      close(11)
      
      write(0,*) "Loaded ind2report matrix (", num_inds, ",", max_indreportcount, ")"
      
      ! Load in the members of each pair according to their fortran index (from stitch2index.csv).
      open(unit = 11, file="aers_pairs2stitch.txt")
      allocate(pair2stitch(num_pairs, 2))
      do i=1,num_pairs
        read(11, *) pair2stitch(i,:)
      enddo
      close(11)
      
      write(0,*) "Loaded pair2stitch matrix (", num_pairs, ", 2 )"
      
      ! Load up the pair-drug correlation significance matrix.
      open(unit = 11, file= "aers_pair_stitch_est.csv") ! Type _e (estimates rather than pvalues are used)
      pair_stitch = 0
      do i=1,num_pairs
            read(11, *) pair_stitch(i,:)
      enddo
      close(11)
      
      open(unit = 11, file= "aers_pair_ind_est.csv") ! Type _e
      pair_ind = 0
      do i=1,num_pairs
            read(11, *) pair_ind(i,:)
      enddo
      close(11)
      
!     =============== IDENTIFY THE PAIR-DRUG BG THRESHOLD ==================
      
      write(0,*) "Identifying the pair-drug correlation thresholds..."
      
      pair_stitch_threshold = -1
      allocate(temp_vector(num_reports))
      do i=1,num_pairs
            ! store the indices for the two drugs of the pair
            drug1 = pair2stitch(i,1)
            drug2 = pair2stitch(i,2)
            
            min_bg_reports = 5*num_pair_reports(drug1, drug2, stitch_reportcount, num_stitch, stitch2report, max_reportcount, num_reports)
!             write(0,*) "Min bg reports for ", drug1, " and ", drug2, " is ", min_bg_reports
            
            temp_real = 1.0 ! Type _e
            temp_vector = 0
            
            do while ((sum(temp_vector).lt.min_bg_reports).and.(temp_real.lt.10000)) ! Type _e
                  
                  temp_real = temp_real + 1.0
                  do j=1,num_stitch
                        if (pair_stitch(i,j).ge.(1.0/temp_real)) then ! Type _e (if the correlation estimate is > 1/X)
                              do k=1,stitch_reportcount(j)
                                    temp_vector(stitch2report(j,k)) = 1
                              enddo
                        endif
                  enddo
                  ! Set reports for foreground back to zero
                  do k=1,stitch_reportcount(drug1)
                        temp_vector(stitch2report(drug1,k)) = 0
                  enddo
                  do k=1,stitch_reportcount(drug2)
                        temp_vector(stitch2report(drug2,k)) = 0
                  enddo
            enddo
            pair_stitch_threshold(i) = 1.0/temp_real ! Type _e
!             write(0,*) "Threshold found was ", pair_stitch_threshold(i), "size was ", sum(temp_vector)
      enddo
      deallocate(temp_vector)
!       write(0,*) pair_stitch_threshold(1:5)
      
!     =============== IDENTIFY THE PAIR-INDICATION BG THRESHOLD ==================
      
      write(0,*) "Identifying the pair-ind correlation thresholds..."
      
      ! The following is set up for Type _e
      pair_ind_threshold = -1
      allocate(temp_vector(num_reports))
      do i=1,num_pairs
            ! store the indices for the two drugs of the pair
            drug1 = pair2stitch(i,1)
            drug2 = pair2stitch(i,2)
            
            min_bg_reports = 5*num_pair_reports(drug1, drug2, stitch_reportcount, num_stitch, stitch2report, max_reportcount, num_reports)
            
            temp_real = 1.0
            temp_vector = 0
            do while ((sum(temp_vector).lt.min_bg_reports).and.(temp_real.lt.10000))
                  temp_real = temp_real + 1.0
                  do j=1,num_inds
                        if (pair_ind(i,j).ge.(1.0/temp_real)) then
                              do k=1,ind_reportcount(j)
                                    temp_vector(ind2report(j,k)) = 1
                              enddo
                        endif
                  enddo
                  ! Set reports for foreground back to zero
                  do k=1,stitch_reportcount(drug1)
                        temp_vector(stitch2report(drug1,k)) = 0
                  enddo
                  do k=1,stitch_reportcount(drug2)
                        temp_vector(stitch2report(drug2,k)) = 0
                  enddo
            enddo
            pair_ind_threshold(i) = 1.0/temp_real
      enddo
      deallocate(temp_vector)
!       write(0,*) pair_ind_threshold(1:5)
      
      
!     =============== BOOTSTRAP THOSE STATISTICS ==================
      results = 0
      do i=start_index,stop_index
          drug1 = pair2stitch(i,1)
          drug2 = pair2stitch(i,2)
          
          allocate(temp_vector(num_reports))
          temp_vector = 0
          do k=1,stitch_reportcount(drug1)
              temp_vector( stitch2report(drug1, k) ) = 1
          enddo
          do k=1,stitch_reportcount(drug2)
              temp_vector( stitch2report(drug2, k) ) = temp_vector( stitch2report(drug2, k) ) + 1
          enddo
          
          num_reports_shared = 0
          do k=1,num_reports
              if (temp_vector(k).eq.2) then
                  num_reports_shared = num_reports_shared + 1
              endif
          enddo
          
          allocate( shared_reports(num_reports_shared) )
          
          shared_reports = -1
          l = 1
          do k=1,num_reports
              if (temp_vector(k).eq.2) then
                  shared_reports(l) = k
                  l = l + 1
              endif
          enddo
          
          deallocate(temp_vector)
          
          write(0,*) 'Working on the drug pair: (', drug1, ', ', drug2, '), fg reports: ', num_reports_shared
          
          do j=1,num_events
              
              results = 0
              ! Given a drug, i, and an adverse event, j...
              ! 1. Construct a vector of reports for this drug pair.
              !! Remember 'num_reports_shared' is holding the number of reports this pair shares.
              
              allocate( temp_vector(num_reports_shared) )
              temp_vector = 0
              
              do k=1,num_reports_shared
                  ! 2. Set vector element equal to 1 if the event is on this report
                  ! "k" is now the index of the report id we want to check for this event
                  do l=1,report_umlscount(shared_reports(k))
                      if (report2umls(shared_reports(k),l).eq.j) then
                          temp_vector(k) = 1
                      endif
                  enddo
              enddo
              
              if (sum(temp_vector).gt.0) then
                  
                  call bootstrap_statistics(results(1), results(2), temp_vector, num_reports_shared, randvec, randptr)
                  
                  deallocate(temp_vector)
                  
                  allocate( temp_vector(num_reports) )
                  temp_vector = 0
                  
                  ! Turn "on" the reports for the correlated drugs.
                  do k=1,num_stitch
                        ! If the "other" (ie. k) drug passes the muster for this drug pair we turn the report on
                        if ((pair_stitch(i,k).ge.pair_stitch_threshold(i))) then
                              do l=1,stitch_reportcount(k)
                                    temp_vector( stitch2report(k,l) ) = 1
                              enddo
                        endif
                  enddo
                  
                  ! Turn "on" the reports for the correlated indications.
                  do k=1,num_inds
                        ! If the "other" (ie. k) drug passes the muster for the pair we include it's reports.
                        if ((pair_ind(i,k).ge.pair_ind_threshold(i))) then
                              do l=1,ind_reportcount(k)
                                    temp_vector( ind2report(k,l) ) = 1
                              enddo
                        endif
                  enddo
                  
                  ! Turn "off" the reports for the drugs
                  do k=1,stitch_reportcount(drug1)
                      temp_vector( stitch2report(drug1, k) ) = 0
                  enddo
                  do k=1,stitch_reportcount(drug2)
                      temp_vector( stitch2report(drug2, k) ) = 0
                  enddo
                  
                  allocate( temp_vector2( sum(temp_vector) ) )
                  temp_vector2 = 0
                  m = 0
                  do k=1,num_reports
                      if (temp_vector(k).eq.1) then
                          m = m + 1
                          do l=1,report_umlscount(k)
                              if (report2umls(k,l).eq.j) then
                                  temp_vector2(m) = 1
                              endif
                          enddo
                      endif
                  enddo
                  
                  call bootstrap_statistics( results(3), results(4), temp_vector2, sum(temp_vector), randvec, randptr)
                  
                  write(6,"(I10,I10,I10,ES20.10,ES20.10,ES20.10,ES20.10,I10,I10)") drug1,drug2,j,results,num_reports_shared,sum(temp_vector)
                  
                  temp_vector2 = 1
                  deallocate( temp_vector2 )
                  
              endif ! end check if this drug pair has any shared reports
              deallocate( temp_vector )
          enddo
          deallocate( shared_reports )
      enddo
      
!     ========= DEBUGGING STUFF ==========
!       do i=1,num_stitch
!             k = 0
!             do j=1,max_reportcount
!                   if (stitch2report(i,j).ne.0) then
!                         k = k + 1
!                   endif
!             enddo
!             if (k.ne.stitch_reportcount(i)) then
!                   write(0,*) "Error in reading stitch2report data for stitch at index ", i
!             endif
!       enddo
!       
!       do i=1,num_reports
!             k = 0
!             do j=1,max_eventcount
!                   if (report2umls(i,j).ne.0) then
!                         k = k + 1
!                   endif
!             enddo
!             if (k.ne.report_umlscount(i)) then
!                   write(0,*) "Error in reading stitch2report data for stitch at index ", i
!             endif
!       enddo
!       
!      write(0,*) stitch_reportcount(1:10)
      
      ! We're done.
      deallocate(stitch_reportcount)
      
end program discover