Merge pull request #68 from bartongroup/development

Development
bartongroup · Dec 17, 2021 · 14127ca · 14127ca
2 parents 58cc11f + 9a4579b
commit 14127ca
Show file tree

Hide file tree

Showing 62 changed files with 3,384 additions and 6,406 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -1,3 +1,5 @@
 ^.*\.Rproj$
 ^\.Rproj\.user$
 ^CONTRIBUTING*
+^doc$
+^Meta$
diff --git a/.gitattributes b/.gitattributes
@@ -1,3 +1,4 @@
 inst/doc/* linguist-documentation
+doc/* linguist-documentation
 vignettes/* linguist-documentation
 doc/* linguist-documentation
diff --git a/.gitignore b/.gitignore
@@ -4,3 +4,5 @@
 *.Rproj
 
 .DS_Store
+doc
+Meta
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,33 +1,34 @@
 Type: Package
 Package: rats
-Version: 0.6.4
-Date: 2018-05-18
+Version: 0.6.4.17
+Date: 2019-07-18
 Title: Relative Abundance of Transcripts
 Encoding: UTF-8
-Authors: c(person("Kimon Froussios", role=c("aut"), email="[email protected]"),
-             person("Kira Mourão", role=c("aut"), email="[email protected]"),
-             person("Nick J. Schurch", role=c("cre"), email="[email protected]"))
 Author: Kimon Froussios [aut], Kira Mourão [aut], Nick Schurch [cre]
-Maintainer: Kimon Froussios <k.froussios@dundee.ac.uk>
+Maintainer: Kimon Froussios <kimon.froussios@gmail.com>
 Description: Given a set of transript abundances and a transcript-to-gene index of
     identifiers, this package identifies genes where differential transcript usage occurs.
+Authors@R: c(person("Kimon Froussios", role=c("aut", "cre"), email="[email protected]"),
+             person("Kira Mourão", role=c("aut")),
+             person("Nick J. Schurch", role=c("aut")) )
 License: MIT + file LICENSE
 VignetteBuilder: knitr
-Depends:
-    data.table  (>= 1.9.8)
 Imports:
+    data.table  (>= 1.9.8),
     utils,
     stats,
     parallel,
-    matrixStats
-Suggests:
+    matrixStats,
     ggplot2  (>= 2.2.0),
+    rtracklayer,
     rhdf5,
-    wasabi,
+    wasabi, 
+    GenomicRanges
+Suggests:
+    ggbio,
     shiny,
-    testthat,
     knitr,
-    rmarkdown
+    testthat
 URL: https://github.com/bartongroup/Rats, http://www.compbio.dundee.ac.uk
 BugReports: https://github.com/bartongroup/RATS/issues
-RoxygenNote: 6.0.1
+RoxygenNote: 6.1.1
diff --git a/Meta/vignette.rds b/Meta/vignette.rds
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export(annot2ids)
+export(annot2models)
 export(call_DTU)
 export(denest_sleuth_boots)
 export(dtu_plurality_summary)
@@ -12,7 +13,9 @@ export(g.test.2)
 export(get_dtu_ids)
 export(get_plurality_ids)
 export(get_switch_ids)
+export(granges2ids)
 export(group_samples)
+export(gtf2ids)
 export(maxabs)
 export(plot_diagnostics)
 export(plot_gene)
@@ -22,7 +25,16 @@ export(sim_boot_data)
 export(sim_count_data)
 export(tidy_annot)
 import(data.table)
+import(ggplot2)
 import(matrixStats)
 import(parallel)
-import(stats)
+import(rhdf5)
 import(utils)
+import(wasabi)
+importFrom(GenomicRanges,makeGRangesListFromDataFrame)
+importFrom(GenomicRanges,mcols)
+importFrom(parallel,detectCores)
+importFrom(parallel,mclapply)
+importFrom(rtracklayer,import)
+importFrom(stats,p.adjust.methods)
+importFrom(stats,pchisq)
diff --git a/R/data_simulators.R b/R/data_simulators.R
diff --git a/R/func.R b/R/func.R
diff --git a/R/input.R b/R/input.R
@@ -1,35 +1,136 @@
+########## ########## ########## ########## ########## ########## ########## ########## ##########
+# Helper functions for setting up the input, if necessary.
+########## ########## ########## ########## ########## ########## ########## ########## ##########
+
+
+#================================================================================
+#' Extract matching transcript and gene IDs from a GRanges object.
+#'
+#' The GRanges must have at least the following metadata columns: `gene_id` and `transcript_id` (such as GRanges imported from GTF).
+#' GFF3-style metadata columns are currently not supported.
+#'
+#' @param annot A GRanges object with gene_id and transcript_id metadata columns.
+#' @param transc_header The title for the transcripts column in the output. (target_id)
+#' @param gene_header The title for the genes column in the output. (parent_id)
+#' @return A data.table with two columns, matching transcript IDs to gene IDs.
+#'
+#' @importFrom GenomicRanges mcols
+#' @import data.table
+#' @export
+#'
+granges2ids <- function(annot, transc_header= "target_id", gene_header= "parent_id")
+{
+  # If GTF style...
+  if ('transcript_id' %in% names(mcols(annot))) {
+    t2g <- unique(data.table(transcript_id=annot$transcript_id, gene_id=annot$gene_id))
+  # If GFF3 style...
+  } else {
+    stop('It seems you supplied a GFF3 file. This is currenlty not implemented. Please convert to GTF using available tools and try again.')
+  }
+  # Don't want any unpaired IDs (genes without transcripts are irrelevant, transcripts without gene are unusable).
+  t2g <- t2g[!is.na(t2g$gene_id) & !is.na(t2g$transcript_id), ]    # R Check really does not like data.table syntax... :(
+
+  names(t2g) <- c(transc_header, gene_header)
+  return(t2g)
+}
+
+
 #================================================================================
 #' Extract matching transcript and gene IDs from a GTF file.
 #'
-#' This function performs no file format validation. Please use the appropriate type.
-#' It is NOT compatible GFF3, as there is no rigid specification for ID attributes.
+#'Previously named annot2ids(). The old name will be discontinued.
+#'
+#' GFF3 not supported.
+#'
+#' @param annotfile A GTF file.
+#' @param transc_header The title for the transcripts column in the output. (target_id)
+#' @param gene_header The title for the genes column in the output. (parent_id)
+#' @return A data.table with two columns, matching transcript IDs to gene IDs.
+#'
+#' @importFrom rtracklayer import
+#' @export
+#'
+gtf2ids <- function(annotfile, transc_header= "target_id", gene_header= "parent_id")
+{
+  annot <- rtracklayer::import(annotfile)
+  granges2ids(annot, transc_header, gene_header)
+}
+
+
+#================================================================================
+#' (deprecated) Extract matching transcript and gene IDs from a GTF file.
+#'
+#' This function has been renamed to gtf2ids(). The old name will be discontinued.
+#' 
+#' GFF3 not supported.
 #'
 #' @param annotfile A GTF file.
-#' @param transc_header The tile for the transcripts column in the output. (target_id)
-#' @param gene_header The tile for the genes column in the output. (parent_id)
+#' @param transc_header The title for the transcripts column in the output. (target_id)
+#' @param gene_header The title for the genes column in the output. (parent_id)
 #' @return A data.table with two columns, matching transcript IDs to gene IDs.
 #'
 #' @export
 #'
 annot2ids <- function(annotfile, transc_header= "target_id", gene_header= "parent_id")
 {
-  annot <- as.data.table(read.delim(annotfile, comment.char= "#", header= FALSE))
-  annot <- annot[(grepl("transcript_id", annot[[9]]) & grepl("gene_id", annot[[9]])), ]
-  t2g <- data.table("transc_id" = gsub(".*transcript_id \"?(.+?)\"?;.*", "\\1", annot[[9]]),
-                    "gene_id" = gsub(".*gene_id \"?(.+?)\"?;.*", "\\1", annot[[9]]) )
-  # Clean up.
-  t2g <- unique(t2g)
-  names(t2g) <- c(transc_header, gene_header)
+  warning("This function has been renamed to 'gtf2ids()' for clarity. The old name 'annot2ids()' will be discontinued.")
+  gtf2ids(annotfile, transc_header, gene_header)
+}
 
-  return(t2g)
+
+#================================================================================
+#' Prepare GTF for gene model plotting.
+#' 
+#' Create GRanges objects for each transcript and bind them into a GRangeList for each gene.
+#' Then one can easily \code{ggbio::autoplot()} a gene's models by its gene ID.
+#' 
+#' @param annotfile A GTF file.
+#' @param threads Number of processing threads (Default 1).
+#' @return A list (by gene) of GRangeLists (by transcript).
+#'
+#' @importFrom rtracklayer import
+#' @import parallel
+#' @importFrom GenomicRanges makeGRangesListFromDataFrame
+#' @export
+#'
+annot2models <- function(annotfile, threads= 1L)
+{
+  message('This will take a few minutes...')
+  gr <- rtracklayer::import(annotfile)
+
+  # Split entries by gene and then by transcript.
+  lg <- list()
+
+  # If GTF style...
+  if ('transcript_id' %in% names(mcols(gr))) {
+    genes <- unique(gr$gene_id)
+    na <- is.na(genes)
+    if (any(na)){
+      warning("The annotation contains entries without gene_id assignment. These will be ommitted.")
+      genes <- genes[!na]
+      gr <- gr[!is.na(gr$gene_id)]
+    }
+    lg <- mclapply(genes, function(g){
+      features <- gr[gr$gene_id == g]
+      isoforms <- makeGRangesListFromDataFrame(as.data.frame(features), split.field='transcript_id', keep.extra.columns=TRUE)
+
+      return(isoforms)
+    }, mc.cores = threads)
+    names(lg) <- genes
+  # If GFF3 style...
+  } else {
+    stop("It looks like you supplied a GFF3 fileinstead of GTF. This is currently disabled as the result does not plot properly. Please convert to GTF first, using available tools, and try again.")
+  }
+
+  return(lg)
 }
 
 
 #================================================================================
 #' Import abundances directly from salmon and kallisto output.
 #'
-#' Uses \code{wasabi} to convert Salmon read counts format to Kallisto RHDF5 format,
-#' then applies TPM normalisation using the info available from the abundance.h5 files.
+#' Convert Salmon read counts format to Kallisto RHDF5 format,
+#' then apply TPM normalisation using the info available from the abundance.h5 files.
 #'
 #' Converting, normalising and importing multiple bootstrapped abundance files takes a bit of time.
 #' IMPORTANT: This function is currently not intended to be used to import non-bootstrapped quantifications.
@@ -40,55 +141,53 @@ annot2ids <- function(annotfile, transc_header= "target_id", gene_header= "paren
 #' @param B_paths (character) A vector of strings, listing the directory paths to the quantifications for the second condition. One directory per replicate, without trailing path dividers.. The directory name should be a unique identifier for the sample.
 #' @param annot (data.frame) A table matching transcript identifiers to gene identifiers. This should be the same that you used for quantification and that you will use with \code{call_DTU()}. It is used to order the transcripts consistently throughout RATs.
 #' @param scaleto (double) Scaling factor for normalised abundances. (Default 1000000 gives TPM). If a numeric vector is supplied instead, its length must match the total number of samples. The value order should correspond to the samples in group A followed by group B. This allows each sample to be scaled to its own actual library size, allowing higher-throughput samples to carry more weight in deciding DTU.
-#' @param half_cooked (logical) If TRUE, input is already in \code{Kallisto} h5 format and \code{wasabi} conversion will be skipped. Wasabi automatically skips conversion if abundance.h5 is present, so this parameter is redundant, unless wasabi is not installed. (Default FALSE)
-#' @param beartext (logical) Instead of importing bootstrap data from the \code{abundance.h5} file of each sample, import it from plaintext files in a \code{bootstraps} subdirectory created by running \code{kallisto}'s \code{h5dump} subcommand (Default FALSE). This workaround circumvents some mysterious .h5 parsing issues on certain systems.
+#' @param half_cooked (logical) If TRUE, input is already in \code{Kallisto} h5 format and \code{wasabi} conversion will be skipped. FALSE can also be used to force wasabi to repeat the conversion even if it exists. (Default FALSE)
+#' @param beartext (logical) Instead of importing bootstrap data from the \code{abundance.h5} file of each sample, import it from plaintext files in a \code{bootstrap} subdirectory created by running \code{kallisto}'s \code{h5dump} subcommand (Default FALSE). This workaround circumvents some mysterious .h5 parsing issues on certain systems.
 #' @param TARGET_COL The name of the column for the transcript identifiers in \code{annot}. (Default \code{"target_id"})
 #' @param PARENT_COL The name of the column for the gene identifiers in \code{annot}. (Default \code{"parent_id"})
 #' @param threads (integer) For parallel processing. (Default 1)
 #' @return A list of two, representing the TPM abundances per condition. These will be formatted in the RATs generic bootstrapped data input format.
 #'
-#' @import data.table
 #' @import parallel
+#' @import data.table
+#' @import rhdf5
+#' @import wasabi
 #'
 #' @export
 
 fish4rodents <- function(A_paths, B_paths, annot, TARGET_COL="target_id", PARENT_COL="parent_id", half_cooked=FALSE, beartext=FALSE, threads= 1L, scaleto= 1000000)
 {
   threads <- as.integer(threads)  # Can't be decimal.
-  setDTthreads(1)  # Not a typo. Threads used for larger mclapply blocks instead of single table operations.
+  setDTthreads(threads, restore_after_fork = TRUE)
 
   # Don't assume the annotation is ordered properly.
   annot <- tidy_annot(annot, TARGET_COL, PARENT_COL)
 
   # Wasabi?
-  if (!half_cooked) {
-    wasabi::prepare_fish_for_sleuth(c(A_paths, B_paths))
-  }
+  wasabi::prepare_fish_for_sleuth(c(A_paths, B_paths), force= !half_cooked)
 
-  # Sort out scaling.
-  lA <- length(A_paths)
-  lB <- length(B_paths)
-  sfA <- NULL
-  sfB <- NULL
-  if (length(scaleto) == 1) {
-    sfA <- rep(scaleto, lA)
-    sfB <- rep(scaleto, lB)
-  } else {
-    sfA <- scaleto[1:lA]
-    sfB <- scaleto[(1+lA):(lA+lB)]
+  # Sort out scaling factor per sample.
+  lgth <- c("A"=length(A_paths), "B"=length(B_paths))
+  sfac <- list("A"=NA_real_, "B"=NA_real_)
+  if (length(scaleto) == 1) {     # uniform scaling
+    sfac$A <- rep(scaleto, lgth$A)
+    sfac$B <- rep(scaleto, lgth$B)
+  } else {                        # individual scaling
+    sfac$A <- scaleto[1:lgth$A]
+    sfac$B <- scaleto[(1 + lgth$A):(lgth$A + lgth$B)]
   }
 
   # Load and convert.
   res <- lapply(c('A', 'B'), function(cond) {
-    boots_A <- mclapply(1:lA, function(x) {
-      # Get the correct files and scaling factors.
+    boots <- mclapply(1:lgth[cond], function(x) {
+      # Get the respective file and scaling factor.
+      sf <- sfac[[cond]][x]
       if (cond=='A') {
         fil <- A_paths[x]
-        sf <- sfA[x]
       } else {
         fil <- B_paths[x]
-        sf <- sfB[x]
       }
+
       # If data from Kallisto plaintext...
       if (beartext) {
         # list the bootstrap files
@@ -106,12 +205,14 @@ fish4rodents <- function(A_paths, B_paths, annot, TARGET_COL="target_id", PARENT
                                     eff_length=as.numeric(rhdf5::h5read(file.path(fil, "abundance.h5"), "/aux/eff_lengths")) ))
         counts <- as.data.table( rhdf5::h5read(file.path(fil, "abundance.h5"), "/bootstrap") )
       }
+
       # Normalise raw counts.
       tpm <- as.data.table( lapply(counts, function (y) {
         cpb <- y / meta$eff_length
         tcpb <- sf / sum(cpb)
         return(cpb * tcpb)
       }) )
+
       # Structure.
       dt <- as.data.table( cbind(meta$target_id, tpm) )
       with(dt, setkey(dt, V1) )
@@ -120,7 +221,9 @@ fish4rodents <- function(A_paths, B_paths, annot, TARGET_COL="target_id", PARENT
       dt <- merge(annot[, c(TARGET_COL), with=FALSE], dt, by=TARGET_COL, all=TRUE)
 
       return (dt)
-    }, mc.cores = threads, mc.preschedule = TRUE, mc.allow.recursive = FALSE)
+    }, mc.cores = min(threads,lgth[cond]), mc.preschedule = TRUE, mc.allow.recursive = FALSE)
+
+    return (boots)
   })
 
   names(res) <- c("boot_data_A", "boot_data_B")
@@ -157,8 +260,11 @@ fish4rodents <- function(A_paths, B_paths, annot, TARGET_COL="target_id", PARENT
 #'
 denest_sleuth_boots <- function(slo, annot, samples, COUNTS_COL="tpm", BS_TARGET_COL="target_id",
                                 TARGET_COL="target_id", PARENT_COL="parent_id", threads= 1) {
+
+  warning("DEPRECATED: denest_sleuth_boots() no longer serves a purpose in RATs and will be eventually removed.")
+
   # Ensure data.table complies.
-  setDTthreads(threads)
+  setDTthreads(threads, restore_after_fork = TRUE)
 
   # Compared to the size of other structures involved in calling DTU, this should make negligible difference.
   tx <- tidy_annot(annot, TARGET_COL, PARENT_COL)$target_id
-Original file line number
+Diff line change
@@ Expand Up / @@ -4,3 +4,5 @@ @@
     *.Rproj
     .DS_Store
+    doc
+    Meta