Merge pull request #18 from bartongroup/development

v0.4.4
bartongroup · Apr 19, 2017 · 7b329c0 · 7b329c0
2 parents 9f5a9f7 + e3b3c6d
commit 7b329c0
Show file tree

Hide file tree

Showing 33 changed files with 1,317 additions and 907 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: rats
-Version: 0.4.2
-Date: 2017-04-05
+Version: 0.4.4
+Date: 2017-04-19
 Title: Relative Abundance of Transcripts
 Encoding: UTF-8
 Authors: c(person("Kimon Froussios", role=c("aut"), email="[email protected]"),

diff --git a/NAMESPACE b/NAMESPACE
@@ -5,9 +5,14 @@ export(call_DTU)
 export(combine_sim_dtu)
 export(countrange_sim)
 export(denest_sleuth_boots)
+export(dtu_plurality_summary)
 export(dtu_summary)
-export(g.test)
+export(dtu_switch_summary)
+export(g.test.1)
+export(g.test.2)
 export(get_dtu_ids)
+export(get_plurality_ids)
+export(get_switch_ids)
 export(group_samples)
 export(plot_gene)
 export(plot_overview)

diff --git a/R/func.R b/R/func.R
@@ -166,12 +166,12 @@ parameters_are_good <- function(slo, annot, name_A, name_B, varname, COUNTS_COL,
       if (minboots < 100) {
         warnmsg["toofewboots"] <- "Your quantifications have few bootstrap iterations, which reduces reproducibility of the calls."
       }
-      if (qbootnum < 100) {
+      if (qbootnum < 100 && qbootnum != 0) {
         warnmsg["toolowbootnum"] <- "The requested qbootnum is low, which reduces reproducibility of the calls."
       }
       bootcombos <- minboots^numsamples  # Conservative estimate.
       if (qbootnum >= bootcombos/100)
-        warnmsg["toomanyboots"] <- "The requested number of quantification bootstraps is very high, relative to the supplied data. Over 1% chance of duplicate iterations."
+        warnmsg["toomanyboots"] <- "The requested number of quantification bootstraps is very high, relatively to the supplied data. Over 1% chance of duplicate iterations."
       if (qbootnum >= maxmatrix/dim(annot)[1])
         return(list("error"=TRUE,"message"="The requested number of quantification bootstraps would exceed the maximum capacity of an R matrix."))
     } # else it is probably count data and qboot will be auto-set to FALSE
@@ -302,12 +302,10 @@ alloc_out <- function(annot, full){
     Genes <- data.table("parent_id"=as.vector(unique(annot$parent_id)),
                         "elig"=NA, "sig"=NA, "elig_fx"=NA, "quant_reprod"=NA, "rep_reprod"=NA, "DTU"=NA, "transc_DTU"=NA,
                         "known_transc"=NA_integer_, "detect_transc"=NA_integer_, "elig_transc"=NA_integer_,
-                        "pvalAB"=NA_real_, "pvalBA"=NA_real_, "pvalAB_corr"=NA_real_, "pvalBA_corr"=NA_real_,
-                        "quant_p_meanAB"=NA_real_, "quant_p_meanBA"=NA_real_, "quant_p_stdevAB"=NA_real_, "quant_p_stdevBA"=NA_real_,
-                        "quant_p_minAB"=NA_real_, "quant_p_minBA"=NA_real_, "quant_p_maxAB"=NA_real_, "quant_p_maxBA"=NA_real_,
+                        "pval"=NA_real_, "pval_corr"=NA_real_,
+                        "quant_p_mean"=NA_real_, "quant_p_stdev"=NA_real_, "quant_p_min"=NA_real_, "quant_p_max"=NA_real_, 
                         "quant_na_freq"=NA_real_, "quant_dtu_freq"=NA_real_,
-                        "rep_p_meanAB"=NA_real_, "rep_p_meanBA"=NA_real_, "rep_p_stdevAB"=NA_real_, "rep_p_stdevBA"=NA_real_,
-                        "rep_p_minAB"=NA_real_, "rep_p_minBA"=NA_real_, "rep_p_maxAB"=NA_real_, "rep_p_maxBA"=NA_real_,
+                        "rep_p_mean"=NA_real_, "rep_p_stdev"=NA_real_, "rep_p_min"=NA_real_, "rep_p_max"=NA_real_,
                         "rep_na_freq"=NA_real_, "rep_dtu_freq"=NA_real_)
     Transcripts <- data.table("target_id"=annot$target_id, "parent_id"=annot$parent_id,
                               "elig_xp"=NA, "elig"=NA, "sig"=NA, "elig_fx"=NA, "quant_reprod"=NA, "rep_reprod"=NA, "DTU"=NA, "gene_DTU"=NA,
@@ -324,8 +322,7 @@ alloc_out <- function(annot, full){
     Parameters <- list("num_replic_A"=NA_integer_, "num_replic_B"=NA_integer_)
     Genes <- data.table("parent_id"=levels(as.factor(annot$parent_id)), "DTU"=NA,
                         "elig_transc"=NA_integer_, "elig"=NA, "elig_fx"=NA,
-                        "pvalAB"=NA_real_, "pvalBA"=NA_real_,
-                        "pvalAB_corr"=NA_real_, "pvalBA_corr"=NA_real_, "sig"=NA)
+                        "pval"=NA_real_, "pval_corr"=NA_real_, "sig"=NA)
     Transcripts <- data.table("target_id"=annot$target_id, "parent_id"=annot$parent_id, "DTU"=NA,
                               "sumA"=NA_real_, "sumB"=NA_real_,  # sum across replicates of means across bootstraps
                               "totalA"=NA_real_, "totalB"=NA_real_,  # sum of all transcripts for that gene
@@ -359,9 +356,9 @@ alloc_out <- function(annot, full){
 #' @param threads Number of threads (POSIX systems only).
 #' @return list
 #'
+#' @import utils
 #' @import parallel
 #' @import data.table
-#' @import stats
 #'
 calculate_DTU <- function(counts_A, counts_B, tx_filter, test_transc, test_genes, full, count_thresh, p_thresh, dprop_thresh, correction, threads= 1) {
   if (packageVersion("data.table") >= "1.9.8")  # Ensure data.table complies.
@@ -410,65 +407,94 @@ calculate_DTU <- function(counts_A, counts_B, tx_filter, test_transc, test_genes
 
     #---------- TESTS
 
-    # Proportion test.
+    # Transcript-level test.
     if (test_transc) {
       Transcripts[(elig), pval := unlist( mclapply( as.data.frame(t(Transcripts[(elig), .(sumA, sumB, totalA, totalB)])),
-                                                    function(row) { prop.test(x = row[c(1, 2)],
-                                                                               n = row[c(3, 4)],
-                                                                               correct = TRUE)[["p.value"]]
+                                                    function(row) {
+                                                      return( g.test.2(obsx= c(row[1], row[3]-row[1]), obsy= c(row[2], row[4]-row[2])) )
                                                     }, mc.cores= threads, mc.allow.recursive= FALSE, mc.preschedule= TRUE)
                                           ) ]
       Transcripts[(elig), pval_corr := p.adjust(pval, method=correction)]
       Transcripts[(elig), sig := pval_corr < p_thresh]
       Transcripts[(elig), DTU := sig & elig_fx]
     }
 
-    # G test.
+    # Gene-level test.
     if (test_genes) {
-      Genes[(elig), c("pvalAB", "pvalBA") := t( as.data.frame( mclapply(Genes[(elig), parent_id],
+      Genes[(elig), pval := t( as.data.frame( mclapply(Genes[(elig), parent_id],
                                         function(parent) {
                                             # Extract all relevant data to avoid repeated look ups in the large table.
-                                            subdt <- Transcripts[parent, .(sumA, sumB, propA, propB)]  # All isoforms, including not detected ones.
-                                            pAB <- g.test(x = subdt[, sumA], p = subdt[, propB])
-                                            pBA <- g.test(x = subdt[, sumB], p = subdt[, propA])
-                                            return(c(pAB, pBA))
+                                            subdt <- Transcripts[parent, .(sumA, sumB)]  # All isoforms, including not detected ones.
+                                            return( g.test.2(obsx= subdt$sumA, obsy= subdt$sumB) )
                                         }, mc.cores= threads, mc.preschedule= TRUE, mc.allow.recursive= FALSE)
                 )) ]
-      Genes[(elig), pvalAB_corr := p.adjust(pvalAB, method=correction)]
-      Genes[(elig), pvalBA_corr := p.adjust(pvalBA, method=correction)]
-      Genes[(elig), sig := pvalAB_corr < p_thresh & pvalBA_corr < p_thresh]
+      Genes[(elig), pval_corr := p.adjust(pval, method=correction)]
+      Genes[(elig), sig := pval_corr < p_thresh]
       Genes[(elig), DTU := sig & elig_fx]
     }
   })
   return(resobj)
 }
 
 
+
 #================================================================================
 #' Log-likelihood test of goodness of fit.
 #'
-#' @param x	a numeric vector of positive numbers, with at least one non-zero value.
-#' @param p	a vector of probabilities of the same length of x.
-#'
-#' Sourced and adapted from from:
-#' V3.3 Pete Hurd Sept 29 2001. [email protected]
-#' http://www.psych.ualberta.ca/~phurd/cruft/g.test.r
+#' @param obsx	a numeric vector of finite positive counts, with at least one non-zero value.
+#' @param px	a vector of probabilities of the same length as xobs. ( sum(px) <= 1 )
 #'
+#' The order of values in the two vectors should be the same.
+#' If any value of xp is zero and the corresponding xobs is not, g.test.1 will always reject the hypothesis.
+#' No corrections are applied. 
+#' No input checks are applied, as RATs needs to run this millions of times.
+#' 
 #' @import stats
 #' @export
-#'
-g.test <- function(x, p = rep(1/length(x), length(x)))
-{
-  n <- sum(x)
-  E <- n * p
-  names(E) <- names(x)
-  g <- 0
-  for (i in 1:length(x)){
-    if (x[i] != 0) g <- g + x[i] * log(x[i]/E[i])
-  }
-  q <- 1
-  STATISTIC <- G <- 2*g/q
-  PARAMETER <- length(x) - 1
-  PVAL <- pchisq(STATISTIC, PARAMETER, lower.tail = FALSE)
+#
+g.test.1 <- function(obsx, px) {
+  n = length(obsx)
+  sx <- sum(obsx)
+  ex <- sx * px  # expected values
+  G <- 2 * sum( sapply (seq.int(1, n, 1), 
+                        function (i) { if (obsx[i] != 0) { return(obsx[i] * log(obsx[i]/ex[i])) } else { return(0) } }) )
+  return( pchisq(G, df= n - 1, lower.tail= FALSE) )
 }
 
+#================================================================================
+#' Log-likelihood test of independence.
+#'
+#' For two sets of observations.
+#'
+#' @param obsx	a numeric vector of positive counts, with at least one non-zero value.
+#' @param obsy	a numeric vector of positive counts of same length as obsx, with at least one non-zero value.
+#'
+#' The order of values in the two vectors should be the same.
+#' No corrections are applied. 
+#' No input checks are applied, as RATs needs to run this millions of times. 
+#'
+#' @import stats
+#' @export
+#
+g.test.2 <- function(obsx, obsy) {
+  n = length(obsx)
+  # Row and column sums.
+  sx <- sum(obsx)
+  sy <- sum(obsy)
+  sv <- obsx + obsy
+  st <- sx + sy
+  # Marginal probabilities.
+  mpx <- sx / st
+  mpy <- sy / st
+  mpv <- sapply(sv, function(v) {v/st})
+  # Expected values.
+  ex <- mpx * mpv * st
+  ey <- mpy * mpv * st
+  # Statistic.
+  G <- 2 * sum( sum( sapply (seq.int(1, n, 1), 
+                             function (i) { if (obsx[i] != 0) { return(obsx[i] * log(obsx[i]/ex[i])) } else { return(0) } }) ),
+                sum( sapply (seq.int(1, n, 1), 
+                             function (i) { if (obsy[i] != 0) { return(obsy[i] * log(obsy[i]/ey[i])) } else { return(0) } }) )
+              )
+  return( pchisq(G, df= n - 1, lower.tail= FALSE) )
+}