Merge pull request #57 from bartongroup/development
Merge dev into master as 0.6.4
fruce-ki authored May 18, 2018
2 parents dc95c94 + 67e0231 commit b0eeabf
Showing 21 changed files with 449 additions and 384 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
@@ -1,2 +1,3 @@
^.*\.Rproj$
^\.Rproj\.user$
^CONTRIBUTING*
11 changes: 11 additions & 0 deletions CONTRIBUTING.md
@@ -0,0 +1,11 @@
# How to contribute

If you are handy with R and inclined to help out, you can propose enhancements and bug fixes via pull request to the development branch.

If you decide to do so, please make my life easier by following these points:
- [ ] Keep it simple. I will not accept any large-scale changes without prior communication, discussion and agreement.
- [ ] Document your code and generally follow good, legible coding practices. If I can't figure out what you've done, I cannot authorise the merge.
- [ ] Avoid adding new dependencies, unless absolutely necessary.
- [ ] Your changes must pass the existing tests and R CMD check (see the sketch after this list).
- [ ] Any new functionality must come with suitable additional tests.
- [ ] Submit your pull request to the development branch, not the master.
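
A minimal sketch of the local checks a pull request would typically go through, assuming the devtools and testthat packages are installed; adapt to your own workflow:

```r
# Run from the package root. Assumes the devtools and testthat packages are installed.
library(devtools)

document()   # regenerate NAMESPACE and man/ pages from the roxygen comments
test()       # run the existing testthat suite under tests/
check()      # build the package and run R CMD check on it
```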
12 changes: 6 additions & 6 deletions DESCRIPTION
@@ -1,30 +1,30 @@
Type: Package
Package: rats
Version: 0.6.3
Date: 2018-03-13
Version: 0.6.4
Date: 2018-05-18
Title: Relative Abundance of Transcripts
Encoding: UTF-8
Authors@R: c(person("Kimon Froussios", role=c("aut"), email="[email protected]"),
person("Kira Mourão", role=c("aut"), email="[email protected]"),
person("Nick J. Schurch", role=c("cre"), email="[email protected]"))
Author: Kimon Froussios [aut], Kira Mourão [aut], Nick Schurch [cre]
Maintainer: Kimon Froussios <[email protected]>
Description: Given a set of transcript abundances or a Sleuth output object, and a transcript-to-gene index of
Description: Given a set of transcript abundances and a transcript-to-gene index of
identifiers, this package identifies genes where differential transcript usage occurs.
License: MIT + file LICENSE
VignetteBuilder: knitr
Depends:
matrixStats,
data.table (>= 1.9.8)
Imports:
utils,
stats,
parallel,
matrixStats
Suggests:
ggplot2 (>= 2.2.0),
rhdf5,
wasabi,
shiny
Suggests:
shiny,
testthat,
knitr,
rmarkdown
4 changes: 0 additions & 4 deletions NAMESPACE
@@ -22,11 +22,7 @@ export(sim_boot_data)
export(sim_count_data)
export(tidy_annot)
import(data.table)
import(ggplot2)
import(matrixStats)
import(parallel)
import(rhdf5)
import(shiny)
import(stats)
import(utils)
import(wasabi)
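
With ggplot2, rhdf5, shiny and wasabi no longer imported via the NAMESPACE, calls into those packages need to be fully qualified, as the R/input.R diff below does with wasabi::prepare_fish_for_sleuth() and rhdf5::h5read(). A common pattern for optional dependencies is to guard the qualified call; a rough sketch (the check and message are illustrative assumptions, not RATs' actual wording):

```r
# Hypothetical guard for an optional dependency before reading a kallisto/salmon abundance file.
if (!requireNamespace("rhdf5", quietly = TRUE)) {
  stop("Package 'rhdf5' is needed to read abundance.h5 files. Please install it.")
}
ids <- rhdf5::h5read("abundance.h5", "/aux/ids")  # path is a placeholder
```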
14 changes: 8 additions & 6 deletions R/func.R
@@ -243,19 +243,21 @@ alloc_out <- function(annot, full, n=1){
"elig"=NA, "sig"=NA, "elig_fx"=NA, "quant_reprod"=NA, "rep_reprod"=NA, "DTU"=NA, "transc_DTU"=NA,
"known_transc"=NA_integer_, "detect_transc"=NA_integer_, "elig_transc"=NA_integer_, "maxDprop"=NA_real_,
"pval"=NA_real_, "pval_corr"=NA_real_,
"quant_p_mean"=NA_real_, "quant_p_stdev"=NA_real_, "quant_p_min"=NA_real_, "quant_p_max"=NA_real_,
"quant_p_median"=NA_real_, "quant_p_min"=NA_real_, "quant_p_max"=NA_real_,
"quant_na_freq"=NA_real_, "quant_dtu_freq"=NA_real_,
"rep_p_mean"=NA_real_, "rep_p_stdev"=NA_real_, "rep_p_min"=NA_real_, "rep_p_max"=NA_real_,
"rep_p_median"=NA_real_, "rep_p_min"=NA_real_, "rep_p_max"=NA_real_,
"rep_na_freq"=NA_real_, "rep_dtu_freq"=NA_real_)
Transcripts <- data.table("target_id"=annot$target_id, "parent_id"=annot$parent_id,
"elig_xp"=NA, "elig"=NA, "sig"=NA, "elig_fx"=NA, "quant_reprod"=NA, "rep_reprod"=NA, "DTU"=NA, "gene_DTU"=NA,
"meanA"=NA_real_, "meanB"=NA_real_, "stdevA"=NA_real_, "stdevB"=NA_real_,
"sumA"=NA_real_, "sumB"=NA_real_, "log2FC"=NA_real_, "totalA"=NA_real_, "totalB"=NA_real_,
"propA"=NA_real_, "propB"=NA_real_, "Dprop"=NA_real_,
"pval"=NA_real_, "pval_corr"=NA_real_,
"quant_p_mean"=NA_real_, "quant_p_stdev"=NA_real_, "quant_p_min"=NA_real_,"quant_p_max"=NA_real_,
"quant_p_median"=NA_real_, "quant_p_min"=NA_real_, "quant_p_max"=NA_real_,
"quant_Dprop_mean"=NA_real_, "quant_Dprop_stdev"=NA_real_, "quant_Dprop_min"=NA_real_, "quant_Dprop_max"=NA_real_,
"quant_na_freq"=NA_real_, "quant_dtu_freq"=NA_real_,
"rep_p_mean"=NA_real_, "rep_p_stdev"=NA_real_, "rep_p_min"=NA_real_,"rep_p_max"=NA_real_,
"rep_p_median"=NA_real_, "rep_p_min"=NA_real_, "rep_p_max"=NA_real_,
"rep_Dprop_mean"=NA_real_, "rep_Dprop_stdev"=NA_real_, "rep_Dprop_min"=NA_real_, "rep_Dprop_max"=NA_real_,
"rep_na_freq"=NA_real_, "rep_dtu_freq"=NA_real_)
CountData <- list()
} else {
@@ -353,8 +355,8 @@ calculate_DTU <- function(counts_A, counts_B, tx_filter, test_transc, test_genes

# Transcript-level test.
if (test_transc) {
if(any(Transcripts$elig)){ # Extreme case. If nothing is eligible, table subsetting by `elig` is nonsense and crashes.
# We can just skip testing altogether, as all output fields are initialized with NA already.
if(any(Transcripts$elig)){ # If nothing is eligible, table subsetting by `elig` is nonsense and crashes.
# In that case we can just skip testing altogether, as all output fields are initialized with NA already.
Transcripts[(elig), pval := unlist( mclapply( as.data.frame(t(Transcripts[(elig), .(sumA, sumB, totalA, totalB)])),
function(row) {
return( g.test.2(obsx= c(row[1], row[3]-row[1]), obsy= c(row[2], row[4]-row[2])) )
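
The transcript-level test above compares, for each eligible transcript, the transcript's reads against the remaining reads of its gene in the two conditions. A sketch of how those observed vectors are assembled for one hypothetical transcript (the numbers are made up; g.test.2() is the internal G-test helper shown in the diff):

```r
# Hypothetical counts for a single transcript and its parent gene.
sumA <- 120; totalA <- 400   # transcript reads and total gene reads in condition A
sumB <- 300; totalB <- 450   # transcript reads and total gene reads in condition B

# Observed vectors, built as in calculate_DTU():
obsx <- c(sumA, totalA - sumA)   # c(120, 280): transcript vs rest of gene, condition A
obsy <- c(sumB, totalB - sumB)   # c(300, 150): transcript vs rest of gene, condition B
# g.test.2(obsx, obsy) then tests whether the transcript's share of its gene differs between conditions.
```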
103 changes: 55 additions & 48 deletions R/input.R
@@ -32,27 +32,27 @@ annot2ids <- function(annotfile, transc_header= "target_id", gene_header= "paren
#' then applies TPM normalisation using the info available from the abundance.h5 files.
#'
#' Converting, normalising and importing multiple bootstrapped abundance files takes a bit of time.
#' IMPORTANT: This function is currently not intended for importing non-bootstrapped quantifications.
#'
#' \code{wasabi} automatically skips format conversion if a folder already contains an \code{abundance.h5} file.
#'
#' @param A_paths (character) A vector of strings, listing the directory paths to the quantifications for the first condition. One directory per replicate. The directory name should be a unique identifier for the sample.
#' @param B_paths (character) A vector of strings, listing the directory paths to the quantifications for the second condition. One directory per replicate. The directory name should be a unique identifier for the sample.
#' @param A_paths (character) A vector of strings, listing the directory paths to the quantifications for the first condition. One directory per replicate, without trailing path dividers. The directory name should be a unique identifier for the sample.
#' @param B_paths (character) A vector of strings, listing the directory paths to the quantifications for the second condition. One directory per replicate, without trailing path dividers. The directory name should be a unique identifier for the sample.
#' @param annot (data.frame) A table matching transcript identifiers to gene identifiers. This should be the same that you used for quantification and that you will use with \code{call_DTU()}. It is used to order the transcripts consistently throughout RATs.
#' @param scaleto (double) Scaling factor for normalised abundances. (Default 1000000 gives TPM). If a numeric vector is supplied instead, its length must match the total number of samples. The value order should correspond to the samples in group A followed by group B. This allows each sample to be scaled to its own actual library size, allowing higher-throughput samples to carry more weight in deciding DTU.
#' @param half_cooked (logical) If TRUE, input is already in \code{Kallisto} h5 format and \code{wasabi} conversion will be skipped. Wasabi automatically skips conversion if abundance.h5 is present, so this parameter is redundant, unless wasabi is not installed. (Default FALSE)
#' @param beartext (logical) Instead of importing bootstrap data from the \code{abundance.h5} file of each sample, import it from plaintext files in a \code{bootstraps} subdirectory created by running \code{kallisto}'s \code{h5dump} subcommand (Default FALSE). This workaround circumvents some mysterious .h5 parsing issues on certain systems.
#' @param TARGET_COL The name of the column for the transcript identifiers in \code{annot}. (Default \code{"target_id"})
#' @param PARENT_COL The name of the column for the gene identifiers in \code{annot}. (Default \code{"parent_id"})
#' @param half_cooked (logical) If TRUE, input is already in \code{Kallisto} h5 format and \code{wasabi} conversion will be skipped. Wasabi automatically skips conversion if abundance.h5 is present, so this parameter is redundant, unless wasabi is not installed. (Default FALSE)
#' @param threads (integer) For parallel processing. (Default 1)
#' @param scaleto (double) Scaling factor for normalised abundances. (Default 1000000 gives TPM). If a numeric vector is supplied instead, its length must match the total number of samples. The value order should correspond to the samples in group A followed by group B. This allows each sample to be scaled to its own actual library size, allowing higher-throughput samples to carry more weight in deciding DTU.
#' @return A list of two, representing the TPM abundances per condition. These will be formatted in the RATs generic bootstrapped data input format.
#'
#' @import wasabi
#' @import rhdf5
#' @import data.table
#' @import parallel
#'
#' @export

fish4rodents <- function(A_paths, B_paths, annot, TARGET_COL="target_id", PARENT_COL="parent_id", half_cooked=FALSE, threads= 1L, scaleto= 1000000)
fish4rodents <- function(A_paths, B_paths, annot, TARGET_COL="target_id", PARENT_COL="parent_id", half_cooked=FALSE, beartext=FALSE, threads= 1L, scaleto= 1000000)
{
threads <- as.integer(threads) # Can't be decimal.
setDTthreads(1) # Not a typo. Threads used for larger mclapply blocks instead of single table operations.
@@ -62,7 +62,7 @@ fish4rodents <- function(A_paths, B_paths, annot, TARGET_COL="target_id", PARENT

# Wasabi?
if (!half_cooked) {
prepare_fish_for_sleuth(c(A_paths, B_paths))
wasabi::prepare_fish_for_sleuth(c(A_paths, B_paths))
}

# Sort out scaling.
@@ -78,46 +78,53 @@ fish4rodents <- function(A_paths, B_paths, annot, TARGET_COL="target_id", PARENT
sfB <- scaleto[(1+lA):(lA+lB)]
}

# Load and convert manually.
boots_A <- mclapply(1:lA, function(x) {
fil <- A_paths[x]
sf <- sfA[x]
ids <- as.data.table( h5read(file.path(fil, "abundance.h5"), "/aux/ids") )
counts <- as.data.table( h5read(file.path(fil, "abundance.h5"), "/bootstrap") )
effl <- h5read(file.path(fil, "abundance.h5"), "/aux/eff_lengths")
tpm <- as.data.table( lapply(counts, function (y) {
cpb <- y / effl
tcpb <- sf / sum(cpb)
return(cpb * tcpb)
}) )
dt <- as.data.table( cbind(ids, tpm) )
with(dt, setkey(dt, V1) )
names(dt)[1] <- TARGET_COL
# Order transcripts to match annotation.
dt <- merge(annot[, c(TARGET_COL), with=FALSE], dt, by=TARGET_COL, all=TRUE)
return (dt)
}, mc.cores = threads, mc.preschedule = TRUE, mc.allow.recursive = FALSE)

boots_B <- mclapply(1:lB, function(x) {
fil <- B_paths[x]
sf <- sfB[x]
ids <- as.data.table( h5read(file.path(fil, "abundance.h5"), "/aux/ids") )
counts <- as.data.table( h5read(file.path(fil, "abundance.h5"), "/bootstrap") )
effl <- h5read(file.path(fil, "abundance.h5"), "/aux/eff_lengths")
tpm <- as.data.table( lapply(counts, function (y) {
cpb <- y / effl
tcpb <- sf / sum(cpb)
return(cpb * tcpb)
}) )
dt <- as.data.table( cbind(ids, tpm) )
with(dt, setkey(dt, V1) )
names(dt)[1] <- TARGET_COL
# Order transcripts to match annotation.
dt <- merge(annot[, c(TARGET_COL), with=FALSE], dt, by=TARGET_COL, all=TRUE)
return (dt)
}, mc.cores = threads, mc.preschedule = TRUE, mc.allow.recursive = FALSE)

return(list("boot_data_A"= boots_A, "boot_data_B"= boots_B))
# Load and convert.
res <- lapply(c('A', 'B'), function(cond) {
lc <- if (cond=='A') lA else lB # number of replicates in the current condition
boots <- mclapply(1:lc, function(x) {
# Get the correct files and scaling factors.
if (cond=='A') {
fil <- A_paths[x]
sf <- sfA[x]
} else {
fil <- B_paths[x]
sf <- sfB[x]
}
# If data from Kallisto plaintext...
if (beartext) {
# list the bootstrap files
bfils <- list.files(path=file.path(fil, "bootstraps"), full.names=TRUE, no..=TRUE)
bfils <- bfils[ grepl('bs_abundance', bfils) ]
# parse info.
meta <- fread(bfils[[1]], header=TRUE)[, c('target_id', 'eff_length'), with=FALSE]
# the IDs should all come out in the same order in every iteration file of a given sample,
# and the transcript lengths should not change either.
counts <- as.data.table( lapply(bfils, function(bf){ fread(bf, header=TRUE)[['est_counts']] }) ) # plaintext already has TPMs computed, but I stick with the raw counts
# for consistency with the .h5 mode and to allow normalisation to other target sizes
# If data from Salmon/Wasabi or Kallisto abundance.h5...
} else {
meta <- as.data.table(list( target_id=as.character(rhdf5::h5read(file.path(fil, "abundance.h5"), "/aux/ids")),
eff_length=as.numeric(rhdf5::h5read(file.path(fil, "abundance.h5"), "/aux/eff_lengths")) ))
counts <- as.data.table( rhdf5::h5read(file.path(fil, "abundance.h5"), "/bootstrap") )
}
# Normalise raw counts.
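# For each bootstrap column: divide counts by effective length to get counts-per-base,
# then rescale so the column sums to the scaling factor `sf` (1e6 by default, i.e. TPM).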
tpm <- as.data.table( lapply(counts, function (y) {
cpb <- y / meta$eff_length
tcpb <- sf / sum(cpb)
return(cpb * tcpb)
}) )
# Structure.
dt <- as.data.table( cbind(meta$target_id, tpm) )
with(dt, setkey(dt, V1) )
names(dt)[1] <- TARGET_COL
# Order transcripts to match annotation.
dt <- merge(annot[, c(TARGET_COL), with=FALSE], dt, by=TARGET_COL, all=TRUE)

return (dt)
}, mc.cores = threads, mc.preschedule = TRUE, mc.allow.recursive = FALSE)
})

names(res) <- c("boot_data_A", "boot_data_B")
return(res)
}
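
For orientation, a sketch of how the updated signature might be called; the directory paths, the `myannot` annotation table and the thread count are placeholders, and beartext=TRUE assumes kallisto's bootstraps were dumped to plaintext beforehand:

```r
# Hypothetical call to fish4rodents(); directory names and `myannot` are placeholders.
mydata <- fish4rodents(A_paths  = c("quant/ctrl_1", "quant/ctrl_2", "quant/ctrl_3"),
                       B_paths  = c("quant/treat_1", "quant/treat_2", "quant/treat_3"),
                       annot    = myannot,     # target_id / parent_id table, as used for call_DTU()
                       beartext = TRUE,        # read plaintext bootstraps from each sample's bootstraps/ subdirectory
                       threads  = 4L,
                       scaleto  = 1000000)     # scalar for TPM, or one value per sample for library-size weighting
# mydata$boot_data_A and mydata$boot_data_B can then be passed to call_DTU().
```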



