diff --git a/.Rbuildignore b/.Rbuildignore index 91114bf..6732fef 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,2 +1,3 @@ ^.*\.Rproj$ ^\.Rproj\.user$ +^CONTRIBUTING* diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..3125641 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,11 @@ +# How to contribute + +If you are handy with R and inclined to help out, you can propose enhancements and bug fixes via pull request to the development branch. + +If you decide to do so, please make my life easier by following these points: +- [ ] Keep it simple. I will not accept any large-scale changes without prior communication, discussion and aggreement. +- [ ] Document your code and generally follow good legible coding guidelines. If I can't figure out what you've done, I cannot authorise the merge. +- [ ] Avoid adding new dependencies, unless absolutely necessary. +- [ ] It must pass the existing tests and R CMD checks. +- [ ] Any new functionality must provide suitable additional tests. +- [ ] Submit your pull request to the development branch, not the master. diff --git a/DESCRIPTION b/DESCRIPTION index 779f6d0..61891dc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: rats -Version: 0.6.3 -Date: 2018-03-13 +Version: 0.6.4 +Date: 2018-05-18 Title: Relative Abundance of Transcripts Encoding: UTF-8 Authors: c(person("Kimon Froussios", role=c("aut"), email="k.froussios@dundee.ac.uk"), @@ -9,22 +9,22 @@ Authors: c(person("Kimon Froussios", role=c("aut"), email="k.froussios@dundee.ac person("Nick J. Schurch", role=c("cre"), email="n.schurch@dundee.ac.uk")) Author: Kimon Froussios [aut], Kira Mourão [aut], Nick Schurch [cre] Maintainer: Kimon Froussios -Description: Given a set of transript abundances or a Sleuth output object, and a transcript-to-gene index of +Description: Given a set of transript abundances and a transcript-to-gene index of identifiers, this package identifies genes where differential transcript usage occurs. License: MIT + file LICENSE VignetteBuilder: knitr Depends: - matrixStats, data.table (>= 1.9.8) Imports: utils, stats, parallel, + matrixStats +Suggests: ggplot2 (>= 2.2.0), rhdf5, wasabi, - shiny -Suggests: + shiny, testthat, knitr, rmarkdown diff --git a/NAMESPACE b/NAMESPACE index a460217..57195db 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -22,11 +22,7 @@ export(sim_boot_data) export(sim_count_data) export(tidy_annot) import(data.table) -import(ggplot2) import(matrixStats) import(parallel) -import(rhdf5) -import(shiny) import(stats) import(utils) -import(wasabi) diff --git a/R/func.R b/R/func.R index 830fe67..5863fae 100644 --- a/R/func.R +++ b/R/func.R @@ -243,9 +243,9 @@ alloc_out <- function(annot, full, n=1){ "elig"=NA, "sig"=NA, "elig_fx"=NA, "quant_reprod"=NA, "rep_reprod"=NA, "DTU"=NA, "transc_DTU"=NA, "known_transc"=NA_integer_, "detect_transc"=NA_integer_, "elig_transc"=NA_integer_, "maxDprop"=NA_real_, "pval"=NA_real_, "pval_corr"=NA_real_, - "quant_p_mean"=NA_real_, "quant_p_stdev"=NA_real_, "quant_p_min"=NA_real_, "quant_p_max"=NA_real_, + "quant_p_median"=NA_real_, "quant_p_min"=NA_real_, "quant_p_max"=NA_real_, "quant_na_freq"=NA_real_, "quant_dtu_freq"=NA_real_, - "rep_p_mean"=NA_real_, "rep_p_stdev"=NA_real_, "rep_p_min"=NA_real_, "rep_p_max"=NA_real_, + "rep_p_median"=NA_real_, "rep_p_min"=NA_real_, "rep_p_max"=NA_real_, "rep_na_freq"=NA_real_, "rep_dtu_freq"=NA_real_) Transcripts <- data.table("target_id"=annot$target_id, "parent_id"=annot$parent_id, "elig_xp"=NA, "elig"=NA, "sig"=NA, "elig_fx"=NA, "quant_reprod"=NA, "rep_reprod"=NA, "DTU"=NA, "gene_DTU"=NA, @@ -253,9 +253,11 @@ alloc_out <- function(annot, full, n=1){ "sumA"=NA_real_, "sumB"=NA_real_, "log2FC"=NA_real_, "totalA"=NA_real_, "totalB"=NA_real_, "propA"=NA_real_, "propB"=NA_real_, "Dprop"=NA_real_, "pval"=NA_real_, "pval_corr"=NA_real_, - "quant_p_mean"=NA_real_, "quant_p_stdev"=NA_real_, "quant_p_min"=NA_real_,"quant_p_max"=NA_real_, + "quant_p_median"=NA_real_, "quant_p_min"=NA_real_, "quant_p_max"=NA_real_, + "quant_Dprop_mean"=NA_real_, "quant_Dprop_stdev"=NA_real_, "quant_Dprop_min"=NA_real_, "quant_Dprop_max"=NA_real_, "quant_na_freq"=NA_real_, "quant_dtu_freq"=NA_real_, - "rep_p_mean"=NA_real_, "rep_p_stdev"=NA_real_, "rep_p_min"=NA_real_,"rep_p_max"=NA_real_, + "rep_p_median"=NA_real_, "rep_p_min"=NA_real_, "rep_p_max"=NA_real_, + "rep_Dprop_mean"=NA_real_, "rep_Dprop_stdev"=NA_real_, "rep_Dprop_min"=NA_real_, "rep_Dprop_max"=NA_real_, "rep_na_freq"=NA_real_, "rep_dtu_freq"=NA_real_) CountData <- list() } else { @@ -353,8 +355,8 @@ calculate_DTU <- function(counts_A, counts_B, tx_filter, test_transc, test_genes # Transcript-level test. if (test_transc) { - if(any(Transcripts$elig)){ # Extreme case. If nothing is eligible, table subsetting by `elig` is nonsense and crashes. - # We can just skip testing altegether, as all output fields are initialized with NA already. + if(any(Transcripts$elig)){ # If nothing is eligible, table subsetting by `elig` is nonsense and crashes. + # In that case we can just skip testing altegether, as all output fields are initialized with NA already. Transcripts[(elig), pval := unlist( mclapply( as.data.frame(t(Transcripts[(elig), .(sumA, sumB, totalA, totalB)])), function(row) { return( g.test.2(obsx= c(row[1], row[3]-row[1]), obsy= c(row[2], row[4]-row[2])) ) diff --git a/R/input.R b/R/input.R index 3d3364d..342bf18 100644 --- a/R/input.R +++ b/R/input.R @@ -32,27 +32,27 @@ annot2ids <- function(annotfile, transc_header= "target_id", gene_header= "paren #' then applies TPM normalisation using the info available from the abundance.h5 files. #' #' Converting, normalising and importing multiple bootstrapped abundance files takes a bit of time. +#' IMPORTANT: This function is currently not intended to be used to import non-bootstrapped quantifications. #' #' \code{wasabi} automatically skips format conversion if a folder already contains an \code{abundance.h5} file. #' -#' @param A_paths (character) A vector of strings, listing the directory paths to the quantifications for the first condition. One directory per replicate. The directory name should be a unique identifier for the sample. -#' @param B_paths (character) A vector of strings, listing the directory paths to the quantifications for the second condition. One directory per replicate. The directory name should be a unique identifier for the sample. +#' @param A_paths (character) A vector of strings, listing the directory paths to the quantifications for the first condition. One directory per replicate, without trailing path dividers. The directory name should be a unique identifier for the sample. +#' @param B_paths (character) A vector of strings, listing the directory paths to the quantifications for the second condition. One directory per replicate, without trailing path dividers.. The directory name should be a unique identifier for the sample. #' @param annot (data.frame) A table matching transcript identifiers to gene identifiers. This should be the same that you used for quantification and that you will use with \code{call_DTU()}. It is used to order the transcripts consistently throughout RATs. +#' @param scaleto (double) Scaling factor for normalised abundances. (Default 1000000 gives TPM). If a numeric vector is supplied instead, its length must match the total number of samples. The value order should correspond to the samples in group A followed by group B. This allows each sample to be scaled to its own actual library size, allowing higher-throughput samples to carry more weight in deciding DTU. +#' @param half_cooked (logical) If TRUE, input is already in \code{Kallisto} h5 format and \code{wasabi} conversion will be skipped. Wasabi automatically skips conversion if abundance.h5 is present, so this parameter is redundant, unless wasabi is not installed. (Default FALSE) +#' @param beartext (logical) Instead of importing bootstrap data from the \code{abundance.h5} file of each sample, import it from plaintext files in a \code{bootstraps} subdirectory created by running \code{kallisto}'s \code{h5dump} subcommand (Default FALSE). This workaround circumvents some mysterious .h5 parsing issues on certain systems. #' @param TARGET_COL The name of the column for the transcript identifiers in \code{annot}. (Default \code{"target_id"}) #' @param PARENT_COL The name of the column for the gene identifiers in \code{annot}. (Default \code{"parent_id"}) -#' @param half_cooked (logical) If TRUE, input is already in \code{Kallisto} h5 format and \code{wasabi} conversion will be skipped. Wasabi automatically skips conversion if abundance.h5 is present, so this parameter is redundant, unless wasabi is not installed. (Default FALSE) #' @param threads (integer) For parallel processing. (Default 1) -#' @param scaleto (double) Scaling factor for normalised abundances. (Default 1000000 gives TPM). If a numeric vector is supplied instead, its length must match the total number of samples. The value order should correspond to the samples in group A followed by group B. This allows each sample to be scaled to its own actual library size, allowing higher-throughput samples to carry more weight in deciding DTU. #' @return A list of two, representing the TPM abundances per condition. These will be formatted in the RATs generic bootstrapped data input format. #' -#' @import wasabi -#' @import rhdf5 #' @import data.table #' @import parallel #' #' @export -fish4rodents <- function(A_paths, B_paths, annot, TARGET_COL="target_id", PARENT_COL="parent_id", half_cooked=FALSE, threads= 1L, scaleto= 1000000) +fish4rodents <- function(A_paths, B_paths, annot, TARGET_COL="target_id", PARENT_COL="parent_id", half_cooked=FALSE, beartext=FALSE, threads= 1L, scaleto= 1000000) { threads <- as.integer(threads) # Can't be decimal. setDTthreads(1) # Not a typo. Threads used for larger mclapply blocks instead of single table operations. @@ -62,7 +62,7 @@ fish4rodents <- function(A_paths, B_paths, annot, TARGET_COL="target_id", PARENT # Wasabi? if (!half_cooked) { - prepare_fish_for_sleuth(c(A_paths, B_paths)) + wasabi::prepare_fish_for_sleuth(c(A_paths, B_paths)) } # Sort out scaling. @@ -78,46 +78,53 @@ fish4rodents <- function(A_paths, B_paths, annot, TARGET_COL="target_id", PARENT sfB <- scaleto[(1+lA):(lA+lB)] } - # Load and convert manually. - boots_A <- mclapply(1:lA, function(x) { - fil <- A_paths[x] - sf <- sfA[x] - ids <- as.data.table( h5read(file.path(fil, "abundance.h5"), "/aux/ids") ) - counts <- as.data.table( h5read(file.path(fil, "abundance.h5"), "/bootstrap") ) - effl <- h5read(file.path(fil, "abundance.h5"), "/aux/eff_lengths") - tpm <- as.data.table( lapply(counts, function (y) { - cpb <- y / effl - tcpb <- sf / sum(cpb) - return(cpb * tcpb) - }) ) - dt <- as.data.table( cbind(ids, tpm) ) - with(dt, setkey(dt, V1) ) - names(dt)[1] <- TARGET_COL - # Order transcripts to match annotation. - dt <- merge(annot[, c(TARGET_COL), with=FALSE], dt, by=TARGET_COL, all=TRUE) - return (dt) - }, mc.cores = threads, mc.preschedule = TRUE, mc.allow.recursive = FALSE) - - boots_B <- mclapply(1:lB, function(x) { - fil <- B_paths[x] - sf <- sfB[x] - ids <- as.data.table( h5read(file.path(fil, "abundance.h5"), "/aux/ids") ) - counts <- as.data.table( h5read(file.path(fil, "abundance.h5"), "/bootstrap") ) - effl <- h5read(file.path(fil, "abundance.h5"), "/aux/eff_lengths") - tpm <- as.data.table( lapply(counts, function (y) { - cpb <- y / effl - tcpb <- sf / sum(cpb) - return(cpb * tcpb) - }) ) - dt <- as.data.table( cbind(ids, tpm) ) - with(dt, setkey(dt, V1) ) - names(dt)[1] <- TARGET_COL - # Order transcripts to match annotation. - dt <- merge(annot[, c(TARGET_COL), with=FALSE], dt, by=TARGET_COL, all=TRUE) - return (dt) - }, mc.cores = threads, mc.preschedule = TRUE, mc.allow.recursive = FALSE) - - return(list("boot_data_A"= boots_A, "boot_data_B"= boots_B)) + # Load and convert. + res <- lapply(c('A', 'B'), function(cond) { + boots_A <- mclapply(1:lA, function(x) { + # Get the correct files and scaling factors. + if (cond=='A') { + fil <- A_paths[x] + sf <- sfA[x] + } else { + fil <- B_paths[x] + sf <- sfB[x] + } + # If data from Kallisto plaintext... + if (beartext) { + # list the bootstrap files + bfils <- list.files(path=file.path(fil, "bootstraps"), full.names=TRUE, no..=TRUE) + bfils <- bfils[ grepl('bs_abundance', bfils) ] + # parse info. + meta <- fread(bfils[[1]], header=TRUE)[, c('target_id', 'eff_length'), with=FALSE] + # the IDs should all come out in the same order in every iteration file of a given sample, + # and the transcript lengths should not change either. + counts <- as.data.table( lapply(bfils, function(bf){ fread(bf, header=TRUE)[['est_counts']] }) ) # plaintext already has TPMs computed, but I stick with the raw counts + # for consistency with the .h5 mode and to allow normalisation to other target sizes + # If data from Salmon/Wasabi or Kallisto abundance.h5... + } else { + meta <- as.data.table(list( target_id=as.character(rhdf5::h5read(file.path(fil, "abundance.h5"), "/aux/ids")), + eff_length=as.numeric(rhdf5::h5read(file.path(fil, "abundance.h5"), "/aux/eff_lengths")) )) + counts <- as.data.table( rhdf5::h5read(file.path(fil, "abundance.h5"), "/bootstrap") ) + } + # Normalise raw counts. + tpm <- as.data.table( lapply(counts, function (y) { + cpb <- y / meta$eff_length + tcpb <- sf / sum(cpb) + return(cpb * tcpb) + }) ) + # Structure. + dt <- as.data.table( cbind(meta$target_id, tpm) ) + with(dt, setkey(dt, V1) ) + names(dt)[1] <- TARGET_COL + # Order transcripts to match annotation. + dt <- merge(annot[, c(TARGET_COL), with=FALSE], dt, by=TARGET_COL, all=TRUE) + + return (dt) + }, mc.cores = threads, mc.preschedule = TRUE, mc.allow.recursive = FALSE) + }) + + names(res) <- c("boot_data_A", "boot_data_B") + return(res) } diff --git a/R/rats.R b/R/rats.R index bce8e9e..cb36c95 100644 --- a/R/rats.R +++ b/R/rats.R @@ -264,7 +264,7 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i if (verbose) myprogress <- utils::txtProgressBar(min = 0, max = numpairs, initial = 0, char = "=", width = NA, style = 3, file = stderr()) - repres <- lapply(1:numpairs, function(p) { # Single-threaded. Forking happens within calculate_DTU(). + repres <- lapply(1:numpairs, function(p) { # Single-threaded. Forking happens within calculate_DTU(). This allows call_DTU() to track and display progress. # Update progress. if (verbose) @@ -281,6 +281,7 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i with(pout, { return(list("pp" = Transcripts[, pval_corr], "pdtu" = Transcripts[, DTU], + "py" = Transcripts[, Dprop], "gp" = Genes[, pval_corr], "gdtu" = Genes[, DTU] )) }) }) @@ -299,28 +300,38 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i with(resobj, { if (test_transc) { eltr <- Transcripts[, elig] + pd <- as.matrix(as.data.table(mclapply(repres, function(p) { p[["pdtu"]] }, mc.cores= threads, mc.preschedule = TRUE, mc.allow.recursive = FALSE))) Transcripts[(elig), rep_dtu_freq := rowCounts(pd[eltr, ], value = TRUE, na.rm=TRUE) / numpairs] + pp <- as.matrix(as.data.table(mclapply(repres, function(p) { p[["pp"]] }, mc.cores= threads, mc.preschedule = TRUE, mc.allow.recursive = FALSE))) - Transcripts[(elig), rep_p_mean := rowMeans(pp[eltr, ], na.rm = TRUE)] - Transcripts[(elig), rep_p_stdev := rowSds(pp[eltr, ], na.rm = TRUE)] + Transcripts[(elig), rep_p_median := rowMedians(pp[eltr, ], na.rm = TRUE)] Transcripts[(elig), rep_p_min := rowMins(pp[eltr, ], na.rm = TRUE)] Transcripts[(elig), rep_p_max := rowMaxs(pp[eltr, ], na.rm = TRUE)] Transcripts[(elig), rep_na_freq := rowCounts(pp[eltr, ], value = NA, na.rm=FALSE) / numpairs] + + py <- as.matrix(as.data.table(mclapply(repres, function(p) { p[["py"]] }, mc.cores= threads, mc.preschedule = TRUE, mc.allow.recursive = FALSE))) + Transcripts[(elig), rep_Dprop_mean := rowMeans(py[eltr, ], na.rm = TRUE)] + Transcripts[(elig), rep_Dprop_stdev := rowSds(py[eltr, ], na.rm = TRUE)] + Transcripts[(elig), rep_Dprop_min := rowMins(py[eltr, ], na.rm = TRUE)] + Transcripts[(elig), rep_Dprop_max := rowMaxs(py[eltr, ], na.rm = TRUE)] + Transcripts[(elig & DTU), rep_reprod := (rep_dtu_freq >= rrep_thresh)] Transcripts[(elig & !DTU), rep_reprod := (rep_dtu_freq <= 1-rrep_thresh)] } if (test_genes) { elge <- Genes[, elig] - gres <- as.matrix(as.data.table(mclapply(repres, function(p) { p[["gp"]] }, mc.cores= threads, mc.preschedule = TRUE, mc.allow.recursive = FALSE))) + gdres <- as.matrix(as.data.table(mclapply(repres, function(p) { p[["gdtu"]] }, mc.cores= threads, mc.preschedule = TRUE, mc.allow.recursive = FALSE))) Genes[(elig), rep_dtu_freq := rowCounts(gdres[elge, ], value = TRUE, na.rm = TRUE) / numpairs] - Genes[(elig), rep_p_mean := rowMeans(gres[elge, ], na.rm = TRUE)] - Genes[(elig), rep_p_stdev := rowSds(gres[elge, ], na.rm = TRUE)] + + gres <- as.matrix(as.data.table(mclapply(repres, function(p) { p[["gp"]] }, mc.cores= threads, mc.preschedule = TRUE, mc.allow.recursive = FALSE))) + Genes[(elig), rep_p_median := rowMedians (gres[elge, ], na.rm = TRUE)] Genes[(elig), rep_p_min := rowMins(gres[elge, ], na.rm = TRUE)] Genes[(elig), rep_p_max := rowMaxs(gres[elge, ], na.rm = TRUE)] - Genes[(elig), rep_na_freq := rowCounts(gres[elge, ], value = NA, na.rm = FALSE) / numpairs] # It doesn't matter if AB or BA, affected identically by gene eligibility. + Genes[(elig), rep_na_freq := rowCounts(gres[elge, ], value = NA, na.rm = FALSE) / numpairs] + Genes[(elig & DTU), rep_reprod := (rep_dtu_freq >= rrep_thresh)] Genes[(elig & !DTU), rep_reprod := (rep_dtu_freq <= 1-rrep_thresh)] } @@ -346,7 +357,7 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i #----- Iterations - bootres <- lapply(1:qbootnum, function(b) { # Single-threaded. Forking happens within calculate_DTU(). + bootres <- lapply(1:qbootnum, function(b) { # Single-threaded. Forking happens within calculate_DTU(). This allows call_DTU() to track and display progress. # Update progress. if (verbose) setTxtProgressBar(myprogress, b) @@ -364,6 +375,7 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i with(bout, { return(list("pp" = Transcripts[, pval_corr], "pdtu" = Transcripts[, DTU], + "py" = Transcripts[, Dprop], "gp" = Genes[, pval_corr], "gdtu" = Genes[, DTU] )) }) }) @@ -383,25 +395,33 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i # !!! POSSIBLE source of ERRORS if bootstraps * transcripts exceed R's maximum matrix size. (due to number of either) !!! pd <- as.matrix(as.data.table(mclapply(bootres, function(b) { b[["pdtu"]] }, mc.cores= threads, mc.preschedule = TRUE, mc.allow.recursive = FALSE))) Transcripts[(elig), quant_dtu_freq := rowCounts(pd[Transcripts[, elig], ], value = TRUE, na.rm=TRUE) / qbootnum] + pp <- as.matrix(as.data.table(mclapply(bootres, function(b) { b[["pp"]] }, mc.cores= threads, mc.preschedule = TRUE, mc.allow.recursive = FALSE))) - Transcripts[(elig), quant_p_mean := rowMeans(pp[Transcripts[, elig], ], na.rm = TRUE)] - Transcripts[(elig), quant_p_stdev := rowSds(pp[Transcripts[, elig], ], na.rm = TRUE)] + Transcripts[(elig), quant_p_median := rowMedians(pp[Transcripts[, elig], ], na.rm = TRUE)] Transcripts[(elig), quant_p_min := rowMins(pp[Transcripts[, elig], ], na.rm = TRUE)] Transcripts[(elig), quant_p_max := rowMaxs(pp[Transcripts[, elig], ], na.rm = TRUE)] Transcripts[(elig), quant_na_freq := rowCounts(pp[Transcripts[, elig], ], value = NA, na.rm=FALSE) / qbootnum] + + py <- as.matrix(as.data.table(mclapply(bootres, function(b) { b[["py"]] }, mc.cores= threads, mc.preschedule = TRUE, mc.allow.recursive = FALSE))) + Transcripts[(elig), quant_Dprop_mean := rowMeans(py[Transcripts[, elig], ], na.rm = TRUE)] + Transcripts[(elig), quant_Dprop_stdev := rowSds(py[Transcripts[, elig], ], na.rm = TRUE)] + Transcripts[(elig), quant_Dprop_min := rowMins(py[Transcripts[, elig], ], na.rm = TRUE)] + Transcripts[(elig), quant_Dprop_max := rowMaxs(py[Transcripts[, elig], ], na.rm = TRUE)] + Transcripts[(elig & DTU), quant_reprod := (quant_dtu_freq >= qrep_thresh)] Transcripts[(elig & !DTU), quant_reprod := (quant_dtu_freq <= 1-qrep_thresh)] } if (test_genes) { # !!! POSSIBLE source of ERRORS if bootstraps * genes exceed R's maximum matrix size. (due to number of bootstraps) !!! - gres <- as.matrix(as.data.table(mclapply(bootres, function(b) { b[["gp"]] }, mc.cores= threads, mc.preschedule = TRUE, mc.allow.recursive = FALSE))) gdres <- as.matrix(as.data.table(mclapply(bootres, function(b) { b[["gdtu"]] }, mc.cores= threads, mc.preschedule = TRUE, mc.allow.recursive = FALSE))) Genes[(elig), quant_dtu_freq := rowCounts(gdres[Genes[, elig], ], value = TRUE, na.rm = TRUE) / qbootnum] - Genes[(elig), quant_p_mean := rowMeans(gres[Genes[, elig], ], na.rm = TRUE)] - Genes[(elig), quant_p_stdev := rowSds(gres[Genes[, elig], ], na.rm = TRUE)] + + gres <- as.matrix(as.data.table(mclapply(bootres, function(b) { b[["gp"]] }, mc.cores= threads, mc.preschedule = TRUE, mc.allow.recursive = FALSE))) + Genes[(elig), quant_p_median := rowMedians(gres[Genes[, elig], ], na.rm = TRUE)] Genes[(elig), quant_p_min := rowMins(gres[Genes[, elig], ], na.rm = TRUE)] Genes[(elig), quant_p_max := rowMaxs(gres[Genes[, elig], ], na.rm = TRUE)] - Genes[(elig), quant_na_freq := rowCounts(gres[Genes[, elig], ], value = NA, na.rm = FALSE) / qbootnum] # It doesn't matter if AB or BA, affected identically by gene eligibility. + Genes[(elig), quant_na_freq := rowCounts(gres[Genes[, elig], ], value = NA, na.rm = FALSE) / qbootnum] + Genes[(elig & DTU), quant_reprod := (quant_dtu_freq >= qrep_thresh)] Genes[(elig & !DTU), quant_reprod := (quant_dtu_freq <= 1-qrep_thresh)] } @@ -487,13 +507,13 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i # Drop the bootstrap columns, if unused. if (!qboot || !test_transc) - Transcripts[, c("quant_dtu_freq", "quant_p_mean", "quant_p_stdev", "quant_p_min", "quant_p_max", "quant_na_freq", "quant_reprod") := NULL] + Transcripts[, c("quant_dtu_freq", "quant_p_median", "quant_p_min", "quant_p_max", "quant_Dprop_mean", "quant_Dprop_stdev", "quant_Dprop_min", "quant_Dprop_max", "quant_na_freq", "quant_reprod") := NULL] if(!qboot || !test_genes) - Genes[, c("quant_dtu_freq", "quant_p_mean", "quant_p_stdev", "quant_p_min", "quant_p_max", "quant_na_freq", "quant_reprod") := NULL] + Genes[, c("quant_dtu_freq", "quant_p_median", "quant_p_min", "quant_p_max", "quant_na_freq", "quant_reprod") := NULL] if (!rboot || !test_transc) - Transcripts[, c("rep_dtu_freq", "rep_p_mean", "rep_p_stdev", "rep_p_min", "rep_p_max", "rep_na_freq", "rep_reprod") := NULL] + Transcripts[, c("rep_dtu_freq", "rep_p_median", "rep_p_min", "rep_p_max", "rep_Dprop_mean", "rep_Dprop_stdev", "rep_Dprop_min", "rep_Dprop_max", "rep_na_freq", "rep_reprod") := NULL] if(!rboot || !test_genes) - Genes[, c("rep_dtu_freq", "rep_p_mean", "rep_p_stdev", "rep_p_min", "rep_p_max", "rep_na_freq", "rep_reprod") := NULL] + Genes[, c("rep_dtu_freq", "rep_p_median", "rep_p_min", "rep_p_max", "rep_na_freq", "rep_reprod") := NULL] }) diff --git a/R/results.R b/R/results.R index 6e5bcc9..ca1e4dd 100644 --- a/R/results.R +++ b/R/results.R @@ -228,7 +228,6 @@ dtu_plurality_summary <- function(dtuo) { #' @return A ggplot2 object. Simply display/print it or you can also customize it. #' #' @import data.table -#' @import ggplot2 #' @export plot_gene <- function(dtuo, pid, style="bycondition", fillby=NA_character_, colourby=NA_character_, shapeby=NA_character_, isofcolvec=c("tomato", "lightblue", "forestgreen", "purple", "hotpink", "gold3"), @@ -294,10 +293,10 @@ plot_gene <- function(dtuo, pid, style="bycondition", fillby=NA_character_, colo shapeby <- "DTU" if (all("condition" != c(colourby, fillby))) stop("Either fillby or colourby must be set to 'condition' for this plot to be displayed correctly!") - result <- ggplot(vis_data, aes(x= isoform, y= vals, colour= vis_data[[colourby]], fill= vis_data[[fillby]])) + - facet_grid(type ~ ., scales= "free", switch="y") + - geom_jitter(aes(shape=vis_data[[shapeby]]), position=position_jitterdodge()) + - geom_boxplot(position=position_dodge(), alpha=0.3, outlier.shape= NA) + result <- ggplot2::ggplot(vis_data, ggplot2::aes(x= isoform, y= vals, colour= vis_data[[colourby]], fill= vis_data[[fillby]])) + + ggplot2::facet_grid(type ~ ., scales= "free", switch="y") + + ggplot2::geom_jitter(ggplot2::aes(shape=vis_data[[shapeby]]), position=ggplot2::position_jitterdodge()) + + ggplot2::geom_boxplot(position=ggplot2::position_dodge(), alpha=0.3, outlier.shape= NA) ### BY CONDITION. } else if (style=="bycondition") { if (is.na(fillby)) @@ -305,15 +304,15 @@ plot_gene <- function(dtuo, pid, style="bycondition", fillby=NA_character_, colo if (is.na(shapeby)) shapeby <- "DTU" colourby <- "replicate" - result <- ggplot(vis_data, aes(x= isoform, y= vals)) + - facet_grid(type ~ condition, scales= "free", switch="y") + - geom_path(aes(colour= replicate, group= replicate), position=position_dodge(width=0.5), alpha=0.7) + - geom_point(aes(colour= replicate, group= replicate, shape=vis_data[[shapeby]]), position=position_dodge(width=0.5)) + - geom_boxplot(aes(fill= vis_data[[fillby]]), alpha=0.25, outlier.shape= NA, colour="grey60") + result <- ggplot2::ggplot(vis_data, ggplot2::aes(x= isoform, y= vals)) + + ggplot2::facet_grid(type ~ condition, scales= "free", switch="y") + + ggplot2::geom_path(ggplot2::aes(colour= replicate, group= replicate), position=ggplot2::position_dodge(width=0.5), alpha=0.7) + + ggplot2::geom_point(ggplot2::aes(colour= replicate, group= replicate, shape=vis_data[[shapeby]]), position=ggplot2::position_dodge(width=0.5)) + + ggplot2::geom_boxplot(ggplot2::aes(fill= vis_data[[fillby]]), alpha=0.25, outlier.shape= NA, colour="grey60") if (fillby=="condition") - result <- result + guides(fill="none") + result <- result + ggplot2::guides(fill="none") if (shapeby=="none") - result <- result + guides(shape="none") + result <- result + ggplot2::guides(shape="none") ### BY CONDITION LINESONLY. } else if (style=="lines") { if (is.na(fillby)) @@ -321,38 +320,38 @@ plot_gene <- function(dtuo, pid, style="bycondition", fillby=NA_character_, colo if(is.na(shapeby)) shapeby <- "DTU" colourby <- "replicate" - result <- ggplot(vis_data, aes(x= isoform, y= vals, colour= replicate)) + - facet_grid(type ~ condition, scales= "free", switch="y") + - geom_path(aes(group= replicate, colour= vis_data[[colourby]]), position=position_dodge(width=0.2)) + - geom_point(aes(group= replicate, colour= vis_data[[colourby]], shape=vis_data[[shapeby]]), position=position_dodge(width=0.2)) + result <- ggplot2::ggplot(vis_data, ggplot2::aes(x= isoform, y= vals, colour= replicate)) + + ggplot2::facet_grid(type ~ condition, scales= "free", switch="y") + + ggplot2::geom_path(ggplot2::aes(group= replicate, colour= vis_data[[colourby]]), position=ggplot2::position_dodge(width=0.2)) + + ggplot2::geom_point(ggplot2::aes(group= replicate, colour= vis_data[[colourby]], shape=vis_data[[shapeby]]), position=ggplot2::position_dodge(width=0.2)) ### ERROR } else { stop("Unknown plot style.") } result <- result + - scale_fill_manual(values=colplt[[fillby]], name=fillby) + - scale_colour_manual(values=colplt[[colourby]], name=colourby) + - scale_shape_manual(values=shaplt[[shapeby]], name=shapeby) + - scale_y_continuous(limits= c(0, NA), sec.axis=dup_axis()) + + ggplot2::scale_fill_manual(values=colplt[[fillby]], name=fillby) + + ggplot2::scale_colour_manual(values=colplt[[colourby]], name=colourby) + + ggplot2::scale_shape_manual(values=shaplt[[shapeby]], name=shapeby) + + ggplot2::scale_y_continuous(limits= c(0, NA), sec.axis=ggplot2::dup_axis()) + # geom_hline(yintercept=0, size=rel(1.1)) + - labs(title= paste("gene:", pid), y= NULL, x= NULL) + - theme(axis.text.x= element_text(angle= 90), + ggplot2::labs(title= paste("gene:", pid), y= NULL, x= NULL) + + ggplot2::theme(axis.text.x= ggplot2::element_text(angle= 90), # axis.line.x= element_line(), - strip.background= element_rect(fill= "grey95"), - strip.text.y= element_text(size= rel(1.2)), - strip.text.x= element_text(size= rel(1.1)), - panel.grid.major.x= element_line(colour = "grey95"), - panel.grid.major.y= element_blank(), - panel.grid.minor= element_blank(), - panel.background= element_rect(fill = "white"), - panel.border = element_rect(colour = "black", fill=NA), - legend.key = element_rect(fill = 'white') ) + strip.background= ggplot2::element_rect(fill= "grey95"), + strip.text.y= ggplot2::element_text(size= ggplot2::rel(1.2)), + strip.text.x= ggplot2::element_text(size= ggplot2::rel(1.1)), + panel.grid.major.x= ggplot2::element_line(colour = "grey95"), + panel.grid.major.y= ggplot2::element_blank(), + panel.grid.minor= ggplot2::element_blank(), + panel.background= ggplot2::element_rect(fill = "white"), + panel.border = ggplot2::element_rect(colour = "black", fill=NA), + legend.key = ggplot2::element_rect(fill = 'white') ) if ( any(fillby == c("none", "isoform")) ) - result <- result + guides(fill="none") + result <- result + ggplot2::guides(fill="none") if ( any(colourby == c("none", "isoform")) ) - result <- result + guides(colour="none") + result <- result + ggplot2::guides(colour="none") if ( any(shapeby == c("none", "isoform")) ) - result <- result + guides(shape="none") + result <- result + ggplot2::guides(shape="none") return(result) }) } @@ -380,7 +379,6 @@ plot_gene <- function(dtuo, pid, style="bycondition", fillby=NA_character_, colo #' object was created without the transcript-level tests, this function will not work. #' #' @import data.table -#' @import ggplot2 #' @export plot_overview <- function(dtuo, type="volcano") { @@ -390,152 +388,140 @@ plot_overview <- function(dtuo, type="volcano") { if (any(type == c("transc_volcano", "tvolcano", "volcano"))) { mydata = Transcripts[, .(target_id, Dprop, -log10(pval_corr), DTU)] names(mydata)[3] <- "neglogP" - result <- ggplot() + - geom_point(aes(x=mydata[DTU==FALSE, Dprop], y=mydata[DTU==FALSE,neglogP], colour=mydata[DTU==FALSE, DTU]), alpha = 0.3, shape=20) + - geom_point(aes(x=mydata[DTU==TRUE, Dprop], y=mydata[DTU==TRUE,neglogP], colour=mydata[DTU==TRUE, DTU]), alpha = 0.3, shape=20) + - geom_vline(xintercept= Parameters$dprop_thresh, colour="grey50", size=rel(0.5)) + - geom_vline(xintercept= -Parameters$dprop_thresh, colour="grey50", size=rel(0.5)) + - geom_hline(yintercept= -Parameters$p_thresh, colour="grey50") + - ggtitle("Effect size VS significance (transcript level)") + - labs(x= "Isoform propotion difference", - y = "-log10 (Pval)") + - scale_x_continuous(breaks = seq(-1, 1, 0.2)) + - scale_y_continuous(expand=c(0,0)) + result <- ggplot2::ggplot() + + ggplot2::geom_point(ggplot2::aes(x=mydata[DTU==FALSE, Dprop], y=mydata[DTU==FALSE,neglogP], colour=mydata[DTU==FALSE, DTU]), alpha = 0.3, shape=20) + + ggplot2::geom_point(ggplot2::aes(x=mydata[DTU==TRUE, Dprop], y=mydata[DTU==TRUE,neglogP], colour=mydata[DTU==TRUE, DTU]), alpha = 0.3, shape=20) + + ggplot2::geom_vline(xintercept= Parameters$dprop_thresh, colour="grey50", size=ggplot2::rel(0.5)) + + ggplot2::geom_vline(xintercept= -Parameters$dprop_thresh, colour="grey50", size=ggplot2::rel(0.5)) + + ggplot2::geom_hline(yintercept= -Parameters$p_thresh, colour="grey50") + + ggplot2::ggtitle("Effect size VS significance (transcript level)") + + ggplot2::labs(x= "Isoform propotion difference", y = "-log10 (Pval)") + + ggplot2::scale_x_continuous(breaks = seq(-1, 1, 0.2)) + + ggplot2::scale_y_continuous(expand=c(0,0)) ### GENE VOLCANO } else if (any(type == c("gene_volcano", "gvolcano"))) { mydata = Genes[, .(parent_id, maxDprop, -log10(pval_corr), DTU)] names(mydata)[3] <- "neglogP" - result <- ggplot() + - geom_point(aes(x=mydata[DTU==FALSE, maxDprop], y=mydata[DTU==FALSE,neglogP], colour=mydata[DTU==FALSE, DTU]), alpha = 0.3, shape=20) + - geom_point(aes(x=mydata[DTU==TRUE, maxDprop], y=mydata[DTU==TRUE,neglogP], colour=mydata[DTU==TRUE, DTU]), alpha = 0.3, shape=20) + - geom_vline(xintercept= Parameters$dprop_thresh, colour="grey50", size=rel(0.5)) + - geom_vline(xintercept= -Parameters$dprop_thresh, colour="grey50", size=rel(0.5)) + - geom_hline(yintercept= -Parameters$p_thresh, colour="grey50") + - ggtitle("Effect size VS significance (gene level)") + - labs(x= "Largest difference in isoform propotion", - y = "-log10 (Pval)") + - scale_x_continuous(breaks = seq(-1, 1, 0.2)) + - scale_y_continuous(expand=c(0,0)) + result <- ggplot2::ggplot() + + ggplot2::geom_point(ggplot2::aes(x=mydata[DTU==FALSE, maxDprop], y=mydata[DTU==FALSE,neglogP], colour=mydata[DTU==FALSE, DTU]), alpha = 0.3, shape=20) + + ggplot2::geom_point(ggplot2::aes(x=mydata[DTU==TRUE, maxDprop], y=mydata[DTU==TRUE,neglogP], colour=mydata[DTU==TRUE, DTU]), alpha = 0.3, shape=20) + + ggplot2::geom_vline(xintercept= Parameters$dprop_thresh, colour="grey50", size=ggplot2::rel(0.5)) + + ggplot2::geom_vline(xintercept= -Parameters$dprop_thresh, colour="grey50", size=ggplot2::rel(0.5)) + + ggplot2::geom_hline(yintercept= -Parameters$p_thresh, colour="grey50") + + ggplot2::ggtitle("Effect size VS significance (gene level)") + + ggplot2::labs(x= "Largest difference in isoform propotion", y = "-log10 (Pval)") + + ggplot2::scale_x_continuous(breaks = seq(-1, 1, 0.2)) + + ggplot2::scale_y_continuous(expand=c(0,0)) ### GENE VOLCANO 2 } else if (any(type == c("gtvolcano"))) { mydata = Genes[, .(parent_id, maxDprop, -log10(pval_corr), transc_DTU)] names(mydata)[3] <- "neglogP" - result <- ggplot() + - geom_point(aes(x=mydata[transc_DTU==FALSE, maxDprop], y=mydata[transc_DTU==FALSE,neglogP], colour=mydata[transc_DTU==FALSE, transc_DTU]), alpha = 0.3, shape=20) + - geom_point(aes(x=mydata[transc_DTU==TRUE, maxDprop], y=mydata[transc_DTU==TRUE,neglogP], colour=mydata[transc_DTU==TRUE, transc_DTU]), alpha = 0.3, shape=20) + - geom_vline(xintercept= Parameters$dprop_thresh, colour="grey50", size=rel(0.5)) + - geom_vline(xintercept= -Parameters$dprop_thresh, colour="grey50", size=rel(0.5)) + - geom_hline(yintercept= -Parameters$p_thresh, colour="grey50") + - ggtitle("Effect size VS significance (transcript level)") + - labs(x= "Largest difference in isoform propotion", - y = "-log10 (Pval)") + - scale_x_continuous(breaks = seq(-1, 1, 0.2)) + - scale_y_continuous(expand=c(0,0)) + result <- ggplot2::ggplot() + + ggplot2::geom_point(ggplot2::aes(x=mydata[transc_DTU==FALSE, maxDprop], y=mydata[transc_DTU==FALSE,neglogP], colour=mydata[transc_DTU==FALSE, transc_DTU]), alpha = 0.3, shape=20) + + ggplot2::geom_point(ggplot2::aes(x=mydata[transc_DTU==TRUE, maxDprop], y=mydata[transc_DTU==TRUE,neglogP], colour=mydata[transc_DTU==TRUE, transc_DTU]), alpha = 0.3, shape=20) + + ggplot2::geom_vline(xintercept= Parameters$dprop_thresh, colour="grey50", size=ggplot2::rel(0.5)) + + ggplot2::geom_vline(xintercept= -Parameters$dprop_thresh, colour="grey50", size=ggplot2::rel(0.5)) + + ggplot2::geom_hline(yintercept= -Parameters$p_thresh, colour="grey50") + + ggplot2::ggtitle("Effect size VS significance (transcript level)") + + ggplot2::labs(x= "Largest difference in isoform propotion", y = "-log10 (Pval)") + + ggplot2::scale_x_continuous(breaks = seq(-1, 1, 0.2)) + + ggplot2::scale_y_continuous(expand=c(0,0)) ### TRADITIONAL VOLCANO } else if (any(type == "fcvolcano")) { mydata = Transcripts[, .(target_id, log2FC, -log10(pval_corr), DTU)] names(mydata)[3] <- "neglogP" - result <- ggplot(data = na.omit(mydata), aes(log2FC, neglogP, colour = DTU)) + - geom_point(shape=16, alpha = 0.3) + - ggtitle("Isoform abundance fold-change VS significance") + - labs(x = "log2 (FC)", - y = "-log10 (Pval)") + result <- ggplot2::ggplot(data = na.omit(mydata), ggplot2::aes(log2FC, neglogP, colour = DTU)) + + ggplot2::geom_point(shape=16, alpha = 0.3) + + ggplot2::ggtitle("Isoform abundance fold-change VS significance") + + ggplot2::labs(x = "log2 (FC)", y = "-log10 (Pval)") ### EFFECT SIZE } else if (type == "dprop") { - result <- ggplot(data= Transcripts[(elig), .(Dprop, DTU)], aes(x=Dprop, fill=DTU)) + - geom_histogram(binwidth = 0.02, position="identity", alpha = 0.5) + - geom_vline(xintercept= Parameters$dprop_thresh, colour="grey50", size=rel(0.5)) + - geom_vline(xintercept= -Parameters$dprop_thresh, colour="grey50", size=rel(0.5)) + - ggtitle("Effect size (transcript level)") + - labs(# x = paste("Prop in ", Parameters$cond_B, " (-) Prop in ", Parameters$cond_A, sep=""), - x = "Isoform proportion difference", - y = "Number of Transcripts") + - scale_x_continuous(breaks = seq(-1, 1, 0.2)) + - theme( axis.line.x = element_line() ) - maxy <- max(ggplot_build(result)$data[[1]]$y, na.rm=TRUE) + result <- ggplot2::ggplot(data= Transcripts[(elig), .(Dprop, DTU)], ggplot2::aes(x=Dprop, fill=DTU)) + + ggplot2::geom_histogram(binwidth = 0.02, position="identity", alpha = 0.5) + + ggplot2::geom_vline(xintercept= Parameters$dprop_thresh, colour="grey50", size=ggplot2::rel(0.5)) + + ggplot2::geom_vline(xintercept= -Parameters$dprop_thresh, colour="grey50", size=ggplot2::rel(0.5)) + + ggplot2::ggtitle("Effect size (transcript level)") + + ggplot2::labs(x = "Isoform proportion difference", y = "Number of Transcripts") + + ggplot2::scale_x_continuous(breaks = seq(-1, 1, 0.2)) + + ggplot2::theme( axis.line.x = ggplot2::element_line() ) + maxy <- max(ggplot2::ggplot_build(result)$data[[1]]$y, na.rm=TRUE) maxy <- maxy + maxy * 0.05 result <- result + - scale_y_continuous(limits=c(0, maxy), expand=c(0, 0), trans="sqrt", + ggplot2::scale_y_continuous(limits=c(0, maxy), expand=c(0, 0), trans="sqrt", breaks=c(100, 500, 1000, pretty(c(0, maxy), n=4))) ### MAXDPROP } else if (type == "maxdprop") { mydata = Genes[, .(parent_id, maxDprop, DTU)] - result <- ggplot(data = na.omit(mydata), aes(x=maxDprop, fill=DTU)) + - geom_histogram(binwidth = 0.02, position="identity", alpha = 0.4) + - geom_vline(xintercept= Parameters$dprop_thresh, colour="grey50", size=rel(0.5)) + - geom_vline(xintercept= -Parameters$dprop_thresh, colour="grey50", size=rel(0.5)) + - ggtitle("Effect size (gene level)") + - labs(# x = paste("max( Prop in ", Parameters$cond_B, " (-) Prop in ", Parameters$cond_A, " )", sep=""), - x = "Largest isoform proportion difference per gene", - y = "Number of genes") + - scale_x_continuous(limits=c(-1, 1), breaks = seq(-1, 1, 0.2)) + - theme(axis.line.x=element_line()) - maxy <- max(ggplot_build(result)$data[[1]]$y, na.rm=TRUE) + result <- ggplot2::ggplot(data = na.omit(mydata), ggplot2::aes(x=maxDprop, fill=DTU)) + + ggplot2::geom_histogram(binwidth = 0.02, position="identity", alpha = 0.4) + + ggplot2::geom_vline(xintercept= Parameters$dprop_thresh, colour="grey50", size=ggplot2::rel(0.5)) + + ggplot2::geom_vline(xintercept= -Parameters$dprop_thresh, colour="grey50", size=ggplot2::rel(0.5)) + + ggplot2::ggtitle("Effect size (gene level)") + + ggplot2::labs(x = "Largest isoform proportion difference per gene", y = "Number of genes") + + ggplot2::scale_x_continuous(limits=c(-1, 1), breaks = seq(-1, 1, 0.2)) + + ggplot2::theme(axis.line.x= ggplot2::element_line()) + maxy <- max(ggplot2::ggplot_build(result)$data[[1]]$y, na.rm=TRUE) maxy <- maxy + maxy * 0.05 result <- result + - scale_y_continuous(limits=c(0, maxy), expand=c(0, 0), trans="sqrt", + ggplot2::scale_y_continuous(limits=c(0, maxy), expand=c(0, 0), trans="sqrt", breaks=pretty(c(0, maxy), n=5)) ### FC vs DPROP } else if (type == "fcVSdprop") { - result <- ggplot(data = na.omit(Transcripts[, .(log2FC, Dprop, DTU)]), aes(x=Dprop, y=log2FC, colour=DTU)) + - geom_point(shape=20, alpha = 0.3) + - geom_vline(xintercept= Parameters$dprop_thresh, colour="grey50", size=rel(0.5)) + - geom_vline(xintercept= -Parameters$dprop_thresh, colour="grey50", size=rel(0.5)) + - ggtitle("Transcript abundance fold-change VS isoform proportion change") + - labs(# x = paste("Prop in ", Parameters$cond_B, " (-) Prop in ", Parameters$cond_A, sep=""), - x = "Proportion difference", - y = "log2 (FC)") + - theme( panel.grid.minor.y= element_line(colour= "grey95", size=rel(1.5)) ) + result <- ggplot2::ggplot(data = na.omit(Transcripts[, .(log2FC, Dprop, DTU)]), ggplot2::aes(x=Dprop, y=log2FC, colour=DTU)) + + ggplot2::geom_point(shape=20, alpha = 0.3) + + ggplot2::geom_vline(xintercept= Parameters$dprop_thresh, colour="grey50", size=ggplot2::rel(0.5)) + + ggplot2::geom_vline(xintercept= -Parameters$dprop_thresh, colour="grey50", size=ggplot2::rel(0.5)) + + ggplot2::ggtitle("Transcript abundance fold-change VS isoform proportion change") + + ggplot2::labs(x = "Proportion difference", y = "log2 (FC)") + + ggplot2::theme( panel.grid.minor.y= ggplot2::element_line(colour= "grey95", size=ggplot2::rel(1.5)) ) ### REPRODUCIBILITY vs DPROP } else if (type == "reprodVSdprop") { - result <- ggplot(data= na.omit(Transcripts[, .(Dprop, quant_dtu_freq, DTU)]), aes(x=Dprop, y=quant_dtu_freq, colour=DTU)) + - geom_point(shape=20, alpha=0.3) + - geom_vline(xintercept= Parameters$dprop_thresh, colour="grey50", size=rel(0.5)) + - geom_vline(xintercept= -Parameters$dprop_thresh, colour="grey50", size=rel(0.5)) + - geom_hline(yintercept= Parameters$quant_reprod_thresh, colour="grey50", size=rel(0.5)) + - ggtitle("Reproducibility VS effect size") + - labs(# x = paste("abs( Prop in ", Parameters$cond_B, " (-) Prop in ", Parameters$cond_A, " )", sep=""), - x = "Isoform proportion change", - y = "Pass frequency") + - scale_x_continuous(limits=c(-1, 1), breaks = seq(-1, 1, 0.2)) + - scale_y_continuous(limits= c(0, 1.01), expand= c(0, 0)) + - theme( axis.line.x = element_line() ) + result <- ggplot2::ggplot(data= na.omit(Transcripts[, .(Dprop, quant_dtu_freq, DTU)]), ggplot2::aes(x=Dprop, y=quant_dtu_freq, colour=DTU)) + + ggplot2::geom_point(shape=20, alpha=0.3) + + ggplot2::geom_vline(xintercept= Parameters$dprop_thresh, colour="grey50", size=ggplot2::rel(0.5)) + + ggplot2::geom_vline(xintercept= -Parameters$dprop_thresh, colour="grey50", size=ggplot2::rel(0.5)) + + ggplot2::geom_hline(yintercept= Parameters$quant_reprod_thresh, colour="grey50", size=ggplot2::rel(0.5)) + + ggplot2::ggtitle("Reproducibility VS effect size") + + ggplot2::labs(x = "Isoform proportion change", y = "Pass frequency") + + ggplot2::scale_x_continuous(limits=c(-1, 1), breaks = seq(-1, 1, 0.2)) + + ggplot2::scale_y_continuous(limits= c(0, 1.01), expand= c(0, 0)) + + ggplot2::theme( axis.line.x = ggplot2::element_line() ) ### REPRODUCIBILITY } else if (type == "reprod") { - result <- ggplot(data= na.omit(Genes[, .(quant_dtu_freq, DTU)]), aes(x=quant_dtu_freq, fill=DTU)) + - geom_histogram(binwidth = 0.02, position="identity", alpha = 0.5) + - geom_vline(xintercept= Parameters$quant_reprod_thresh, colour="grey50", size=rel(0.5)) + - scale_x_continuous(breaks = seq(0, 1, 0.2), expand=c(0, 0)) + - ggtitle("DTU Reproducibility") + - labs(x = "Pass frequency", y = "Number of Genes") + - theme( axis.line = element_line() ) - maxy <- max(ggplot_build(result)$data[[1]]$y, na.rm=TRUE) + result <- ggplot2::ggplot(data= na.omit(Genes[, .(quant_dtu_freq, DTU)]), ggplot2::aes(x=quant_dtu_freq, fill=DTU)) + + ggplot2::geom_histogram(binwidth = 0.02, position="identity", alpha = 0.5) + + ggplot2::geom_vline(xintercept= Parameters$quant_reprod_thresh, colour="grey50", size=ggplot2::rel(0.5)) + + ggplot2::scale_x_continuous(breaks = seq(0, 1, 0.2), expand=c(0, 0)) + + ggplot2::ggtitle("DTU Reproducibility") + + ggplot2::labs(x = "Pass frequency", y = "Number of Genes") + + ggplot2::theme( axis.line = ggplot2::element_line() ) + maxy <- max(ggplot2::ggplot_build(result)$data[[1]]$y, na.rm=TRUE) maxy <- maxy + maxy * 0.05 result <- result + - scale_y_continuous(limits=c(0, maxy), expand=c(0, 0), trans="sqrt", + ggplot2::scale_y_continuous(limits=c(0, maxy), expand=c(0, 0), trans="sqrt", breaks=c(100, 500, 1000, pretty(c(0, maxy), n=4))) + - coord_flip() + ggplot2::coord_flip() } else { stop("Unrecognized plot type!") } # Apply theme. result <- result + - scale_fill_manual(values=c("lightblue", "red")) + - scale_colour_manual(values=c("lightblue", "red")) + - guides(colour=guide_legend("DTU")) + - theme(panel.background= element_rect(fill= "white"), - panel.grid.major= element_line(colour= "grey95"), + ggplot2::scale_fill_manual(values=c("lightblue", "red")) + + ggplot2::scale_colour_manual(values=c("lightblue", "red")) + + ggplot2::guides(colour=ggplot2::guide_legend("DTU")) + + ggplot2::theme(panel.background= ggplot2::element_rect(fill= "white"), + panel.grid.major= ggplot2::element_line(colour= "grey95"), # panel.border = element_rect(colour = "black", fill=NA), - legend.key = element_rect(fill = 'white') ) + legend.key = ggplot2::element_rect(fill = 'white') ) return(result) }) @@ -550,7 +536,6 @@ plot_overview <- function(dtuo, type="volcano") { #' } #' @return A ggplot2 object. Simply display it or you can also customize it. #' @import data.table -#' @import ggplot2 #' #' @export plot_diagnostics <- function(dtuo, type="cormat") { @@ -564,17 +549,17 @@ plot_diagnostics <- function(dtuo, type="cormat") { names(mydata)[seq.int(dtuo$Parameters$num_replic_A+1, dtuo$Parameters$num_replic_A + dtuo$Parameters$num_replic_B)] <- paste0(dtuo$Parameters$cond_B, '_', names(mydata)[seq.int(dtuo$Parameters$num_replic_A+1, dtuo$Parameters$num_replic_A + dtuo$Parameters$num_replic_B)]) # Do all the pairwise correlations between the columns. Must leave out all rows that contain NA, otherwise all correlations become NA. corels <- melt(cor(mydata[complete.cases(mydata)])) - result <- ggplot() + - geom_tile(aes(x=corels[[1]], y=corels[[2]], fill=corels[[3]])) + - scale_fill_gradient(low="purple", high="white") + - xlab('Sample') + ylab('Sample') + ggtitle('Pairwise Pearson\'s correlations') + labs(fill='corr') + - theme(axis.text.x=element_text(angle=90)) + result <- ggplot2::ggplot() + + ggplot2::geom_tile(ggplot2::aes(x=corels[[1]], y=corels[[2]], fill=corels[[3]])) + + ggplot2::scale_fill_gradient(low="purple", high="white") + + ggplot2::xlab('Sample') + ggplot2::ylab('Sample') + ggplot2::ggtitle('Pairwise Pearson\'s correlations') + ggplot2::labs(fill='corr') + + ggplot2::theme(axis.text.x=ggplot2::element_text(angle=90)) } # Apply theme. result <- result + - theme(panel.background= element_rect(fill= "white"), - legend.key = element_rect(fill = 'white') ) + ggplot2::theme(panel.background= ggplot2::element_rect(fill= "white"), + legend.key = ggplot2::element_rect(fill = 'white') ) return(result) }) @@ -587,44 +572,42 @@ plot_diagnostics <- function(dtuo, type="cormat") { #' @param dtuo A DTU object. #' #' @import data.table -#' @import ggplot2 -#' @import shiny #' @export #' plot_shiny_volcano <- function(dtuo) { # Set up interface. - volcano_ui <- fluidPage( + volcano_ui <- shiny::fluidPage( # Prepare space for plot display. - fluidRow( - column(width= 12, - plotOutput("plot1", height= 600, - hover= hoverOpts(id= "plot_hover"), - click= clickOpts(id= "plot_click")) ) + shiny::fluidRow( + shiny::column(width= 12, + shiny::plotOutput("plot1", height= 600, + hover= shiny::hoverOpts(id= "plot_hover"), + click= shiny::clickOpts(id= "plot_click")) ) ), # Hover info. - fluidRow( - column(width= 12, - verbatimTextOutput("hover_info") ) + shiny::fluidRow( + shiny::column(width= 12, + shiny::verbatimTextOutput("hover_info") ) ), # Click info. - fluidRow( - column(width= 12, - verbatimTextOutput("gene_info") ) + shiny::fluidRow( + shiny::column(width= 12, + shiny::verbatimTextOutput("gene_info") ) ), - fluidRow( - column(width= 12, - verbatimTextOutput("transc_info") ) + shiny::fluidRow( + shiny::column(width= 12, + shiny::verbatimTextOutput("transc_info") ) ), - fluidRow( - column(width= 12, - plotOutput("plot2", height= 600)) ) + shiny::fluidRow( + shiny::column(width= 12, + shiny::plotOutput("plot2", height= 600)) ) ) with(dtuo, { # Set up mouse responses. volcano_server <- function(input, output) { # For storing which rows have been excluded - vals <- reactiveValues( + vals <- shiny::reactiveValues( keeprows = rep(TRUE, nrow(mtcars)) ) @@ -634,15 +617,15 @@ plot_shiny_volcano <- function(dtuo) { names(myp)[3] <- "neglogP" # Plot - output$plot1 <- renderPlot({ + output$plot1 <- shiny::renderPlot({ plot_overview(dtuo, "gvolcano") }) # Assign mouse hover action to hover info output space. - output$hover_info <- renderPrint({ + output$hover_info <- shiny::renderPrint({ cat("Mouse-hover info: \n") myhover <- input$plot_hover - points <- nearPoints(myp, myhover, xvar="maxDprop", yvar="neglogP", threshold= 5) + points <- shiny::nearPoints(myp, myhover, xvar="maxDprop", yvar="neglogP", threshold= 5) if(dim(points)[1] != 0) { pid <- noquote(points[, parent_id]) dtuo$Genes[pid, .(parent_id, known_transc, elig_transc, maxDprop, pval_corr)] @@ -650,19 +633,19 @@ plot_shiny_volcano <- function(dtuo) { }) # Assign mouse click action to click info output space. - output$gene_info <- renderPrint({ + output$gene_info <- shiny::renderPrint({ cat("Gene info (left click): \n") myclick <- input$plot_click - points <- nearPoints(myp, myclick, xvar="maxDprop", yvar="neglogP", threshold= 5) + points <- shiny::nearPoints(myp, myclick, xvar="maxDprop", yvar="neglogP", threshold= 5) if(dim(points)[1] != 0) { pid <- noquote(points[, parent_id]) dtuo$Genes[pid, ] } }) - output$transc_info <- renderPrint({ + output$transc_info <- shiny::renderPrint({ cat("Transcript info (left click): \n") myclick <- input$plot_click - points <- nearPoints(myp, myclick, xvar="maxDprop", yvar="neglogP", threshold= 5) + points <- shiny::nearPoints(myp, myclick, xvar="maxDprop", yvar="neglogP", threshold= 5) if(dim(points)[1] != 0) { pid <- noquote(points[, parent_id]) dtuo$Transcripts[pid, ] @@ -670,9 +653,9 @@ plot_shiny_volcano <- function(dtuo) { }) # Assign mouse click action to gene plot output space. - output$plot2 <- renderPlot({ + output$plot2 <- shiny::renderPlot({ myclick <- input$plot_click - points <- nearPoints(myp, myclick, xvar="maxDprop", yvar="neglogP", threshold= 5, addDist=TRUE) + points <- shiny::nearPoints(myp, myclick, xvar="maxDprop", yvar="neglogP", threshold= 5, addDist=TRUE) md <- which.min(points[, dist_]) pid <- points[md, parent_id] if(!is.na(pid[1])) @@ -681,7 +664,7 @@ plot_shiny_volcano <- function(dtuo) { } # Display - shinyApp(ui= volcano_ui, server= volcano_server) + shiny::shinyApp(ui= volcano_ui, server= volcano_server) }) } diff --git a/README.md b/README.md index 891ae27..a9c10af 100644 --- a/README.md +++ b/README.md @@ -46,24 +46,24 @@ We recommend studying the vignettes before using RATs. ### Dependencies The package depends on a few third-party packages, which you may need to install first, if they are not present already. -At present these are all required in order to install RATs, but most of them will be made optional in the next release: +Most of these relate to specific functionality that you may not wish to use, thus are optional: * Packages needed for computation ``` -install.packages(c("data.table", "matrixStats"), dependencies=TRUE) +install.packages(c("data.table", "matrixStats")) ``` -* Package needed for plotting results +* Package needed for plotting results (optional, recommended) ``` -install.packages("ggplot2", dependencies=TRUE) +install.packages("ggplot2") ``` -* Packages needed for importing abundances from Salmon/Kallisto output +* Packages needed for importing abundances from Salmon/Kallisto output (optional, recommended) ``` -install.packages("devtools", dependencies=TRUE) +install.packages("devtools") source("http://bioconductor.org/biocLite.R") @@ -74,10 +74,10 @@ biocLite("COMBINE-lab/wasabi") biocLite("rhdf5") ``` -* Package needed for interactive visualisation feature +* Package needed for interactive visualisation feature (optional) ``` -install.packages("shiny", dependencies=TRUE) +install.packages("shiny") ``` If you have trouble installing these dependencies, your system could be missing source compilers for C and/or Fortran, and possibly other libraries, which you can see by scrolling back through the installation output to look for the errors. Please refer to the R manual or the respective package documentation for help. diff --git a/inst/doc/input.R b/inst/doc/input.R index 8296540..7f791b1 100644 --- a/inst/doc/input.R +++ b/inst/doc/input.R @@ -136,6 +136,8 @@ myannot <- simdat[[1]] # Transcript and gene IDs for the above data. # TARGET_COL="transcript", PARENT_COL="gene") ## ---- eval=FALSE--------------------------------------------------------- +# # The following are equivalent. +# # # 1: # # Scale directly to library sizes at the import step. # mydata <- fish4rodents(A_paths= samples_A, B_paths= samples_B, @@ -167,7 +169,7 @@ myannot <- simdat[[1]] # Transcript and gene IDs for the above data. # # Scale TPMs to actual library sizes. # mydtu <- call_DTU(annot= myannot, boot_data_A= mydata$boot_data_A, # boot_data_B= mydata$boot_data_B, -# scaling=c(25, 26.7, 23, 50.0, 45, 48.46, 52.36)) +# scaling=c(25.123456, 26.65431, 23.131313, 50.0, 45.123132, 48.456654, 52.363636)) ## ---- eval=FALSE--------------------------------------------------------- # mydtu <- call_DTU(annot= myannot, boot_data_A= mydata$boot_data_A, diff --git a/inst/doc/input.Rmd b/inst/doc/input.Rmd index be59177..d90e223 100644 --- a/inst/doc/input.Rmd +++ b/inst/doc/input.Rmd @@ -218,8 +218,9 @@ mydata <- fish4rodents(A_paths= samples_A, B_paths= samples_B, annot= myannot, scaleto=100000000) ``` -The return value of `fish4rodents` is a list with two items: `$boot_data_A` and `$boot_data_B`. These two items are -formatted for RATs' generic bootstrapped input. The abundaces are normalised for isoform length and library-size. +The return value of `fish4rodents()` is a list with two items: `$boot_data_A` and `$boot_data_B`. These two items are +formatted for RATs' generic bootstrapped input. This helper function works *only* with bootstrapped quantifications. +The abundaces are normalised for isoform length and library-size. Scaling to 1M reads provides TPMs, which can be used by other tools as well. Other scaling options are discussed in a dedicated section later in this vignette. @@ -230,9 +231,11 @@ last segments of each path should be a directory with a unique identifying name 2. `annot` - The annotation table that you will use for calling DTU (and that was used for the quantification). This is needed to enforce consistent order of the transcripts in the tables, enabling efficient processing. -#### Optional arguments +#### Optional arguments (`fish4rodents`): * `scaleto` allows you to control the normalisation factor. (Default 1000000 gives TPM values). +* `half_cooked` indicates whether a kallisto-style abundance.h5 file already exists. (Default `FALSE`). `wasabi` automatically detects the presence of an abundance.h5 file, so this option is actually redundant and pointless. +* `beartext` directs `fish4rodents()` to read bootstrap data from plain-text files in a `bootstraps` subdirectory in each sample instead of parsing the abundance.h5 file of the sample. `kallisto` has the option to return plaintext results or to extract results from an existing abundance.h5 file to plaintext using its `h5dump` subcommand. (Default FALSE) And finally run DTU in the way already shown: @@ -462,6 +465,8 @@ Both `fish4rodents()` and `call_DTU()` support scaling by a single value or a ve directly to the desired library size(s), as in the examples below: ```{r, eval=FALSE} +# The following are equivalent. + # 1: # Scale directly to library sizes at the import step. mydata <- fish4rodents(A_paths= samples_A, B_paths= samples_B, @@ -493,7 +498,7 @@ mydata <- fish4rodents(A_paths= samples_A, B_paths= samples_B, # Scale TPMs to actual library sizes. mydtu <- call_DTU(annot= myannot, boot_data_A= mydata$boot_data_A, boot_data_B= mydata$boot_data_B, - scaling=c(25, 26.7, 23, 50.0, 45, 48.46, 52.36)) + scaling=c(25.123456, 26.65431, 23.131313, 50.0, 45.123132, 48.456654, 52.363636)) ``` You can mix and match scaling options as per your needs, so take care to ensure that the scaling you apply is appropriate. diff --git a/inst/doc/input.html b/inst/doc/input.html index 65d0336..1c4be7a 100644 --- a/inst/doc/input.html +++ b/inst/doc/input.html @@ -369,7 +369,7 @@

DTU with Salmon/Kallisto output

# Scale them to 1M reads for TPM values. mydata <- fish4rodents(A_paths= samples_A, B_paths= samples_B, annot= myannot, scaleto=100000000) -

The return value of fish4rodents is a list with two items: $boot_data_A and $boot_data_B. These two items are formatted for RATs’ generic bootstrapped input. The abundaces are normalised for isoform length and library-size. Scaling to 1M reads provides TPMs, which can be used by other tools as well. Other scaling options are discussed in a dedicated section later in this vignette.

+

The return value of fish4rodents() is a list with two items: $boot_data_A and $boot_data_B. These two items are formatted for RATs’ generic bootstrapped input. This helper function works only with bootstrapped quantifications. The abundaces are normalised for isoform length and library-size. Scaling to 1M reads provides TPMs, which can be used by other tools as well. Other scaling options are discussed in a dedicated section later in this vignette.

Mandatory arguments (fish4rodents):

    @@ -377,10 +377,12 @@

    Mandatory arguments (fish4rodents):

  1. annot - The annotation table that you will use for calling DTU (and that was used for the quantification). This is needed to enforce consistent order of the transcripts in the tables, enabling efficient processing.
-
-

Optional arguments

+
+

Optional arguments (fish4rodents):

  • scaleto allows you to control the normalisation factor. (Default 1000000 gives TPM values).
  • +
  • half_cooked indicates whether a kallisto-style abundance.h5 file already exists. (Default FALSE). wasabi automatically detects the presence of an abundance.h5 file, so this option is actually redundant and pointless.
  • +
  • beartext directs fish4rodents() to read bootstrap data from plain-text files in a bootstraps subdirectory in each sample instead of parsing the abundance.h5 file of the sample. kallisto has the option to return plaintext results or to extract results from an existing abundance.h5 file to plaintext using its h5dump subcommand. (Default FALSE)

And finally run DTU in the way already shown:

# 3. Run RATs with the bootstrapped table data format. 
@@ -531,7 +533,9 @@ 

Abundance scaling

To counter this, RATs provides the option to scale abundaces either equally by a single factor (such as average library size among samples) or by a vector of factors (one per sample). The former maintains any pre-existing library-size normalisation among samples. This is necessary for fold-change based methods, but RATs does not require it. Instead, using the respective actual library sizes of the samples allows the higher-throughput samples to have a bigger influence than the lower-throughput samples. This is particularly relevant if your samples have very dissimilar library sizes.

For flexibility with different types of input, these scaling options can be applied in either of two stages: The data import step by fish4rodents(), or the actual testing step by call_DTU(). In the example examined previously, fish4rodents() was instructed to create TPM abundances, by normalising to 1000000 reads. Such values are useful with certain other tools that a user may also intend to use. Subsequently, these TPMs were re-scaled to meet the library size of each sample, thus providing RATs with count-like abundance values that retain the normalisation by isoform length. However, it is not necessary to scale in two separate steps.

Both fish4rodents() and call_DTU() support scaling by a single value or a vector of values. If you don’t need the TPMs, you can scale directly to the desired library size(s), as in the examples below:

-
# 1:
+
# The following are equivalent.
+
+# 1:
 # Scale directly to library sizes at the import step.
 mydata <- fish4rodents(A_paths= samples_A, B_paths= samples_B, 
                        annot= myannot, 
@@ -562,7 +566,7 @@ 

Abundance scaling

# Scale TPMs to actual library sizes. mydtu <- call_DTU(annot= myannot, boot_data_A= mydata$boot_data_A, boot_data_B= mydata$boot_data_B, - scaling=c(25, 26.7, 23, 50.0, 45, 48.46, 52.36))
+ scaling=c(25.123456, 26.65431, 23.131313, 50.0, 45.123132, 48.456654, 52.363636))

You can mix and match scaling options as per your needs, so take care to ensure that the scaling you apply is appropriate. It is important to note, that if you simply run both methods with their respective defaults, you’ll effectively run RATs on TPM values, which is extremely underpowered and not recommended. Please, provide appropriate scaling factors for your data.

diff --git a/inst/doc/output.Rmd b/inst/doc/output.Rmd index 16f6b28..ebd9632 100644 --- a/inst/doc/output.Rmd +++ b/inst/doc/output.Rmd @@ -205,14 +205,12 @@ For positive DTU, `= (rep_dtu_freq >= Parameters$rep_reprod_thresh)`, for non-DT * `elig_transc` - (int) Number of eligible transcripts, aggregated from `Transcripts$elig`. * `pval` - (num) G test of independence p-value for the isoform ratios. * `pval_corr` - (num) Multiple testing corrected p-value. `pval_corr= p.adjust(pval, Parameters$correction)`. -* `quant_p_mean` - (num) Mean p-value across quantification bootstraps. -* `quant_p_stdev` - (num) Standard deviation of p-values across quantification bootstraps. -* `quant_p_min` - (num) Minimum observed p-value across quantification bootstraps. -* `quant_p_max` - (num) Maximum observed p-value across quantification bootstraps. +* `quant_p_median` - (num) Median of corrected p-values across quantification bootstraps. +* `quant_p_min` - (num) Minimum observed (corrected) p-value across quantification bootstraps. +* `quant_p_max` - (num) Maximum observed (corrected) p-value across quantification bootstraps. * `quant_na_freq` - (num) Fraction of quantification iterations in which `DTU` could not be determined (not meeting pre-filtering criteria). * `quant_dtu_freq` - (num) Fraction of replicate iterations that support a positive `DTU` classification. -* `rep_p_mean` - (num) Mean p-value across replication bootstraps. -* `rep_p_stdev` - (num) Standard deviation of p-values across replication bootstraps. +* `rep_p_median` - (num) Median p-value across replication bootstraps. * `rep_p_min` - (num) Minimum observed p-value across replication bootstraps. * `rep_p_max` - (num) Maximum observed p-value across replication bootstraps. * `rep_na_freq` - (num) Fraction of replication iterations in which `DTU` could not be determined (not meeting pre-filtering criteria). @@ -239,7 +237,7 @@ print( names(mydtu$Transcripts) ) * `elig` - (bool) Eligible for testing (meets the noise threshold and at least one other isoform is expressed). `= (elig_xp & totalA != 0 & totalB != 0 & (sumA != totalA | sumB != totalB))`. * `sig` - (bool) Statistically significant. `= (pval_corr < Parameters$p_thresh)`. * `elig_fx` - (bool) Eligible effect size. Proxy for biological significance. `= (Dprop > Parameters$dprop_thresh)`. -7. `quant_reprod` - (bool) Quantification reproducible. +*. `quant_reprod` - (bool) Quantification reproducible. For positive DTU, `= (quant_dtu_freq >= Parameters$quant_reprod_thresh)`, for non-DTU `= (quant_dtu_freq <= 1 - Parameters$quant_reprod_thresh)`. * `rep_reprod` - (bool) Replication reproducible. For positive DTU, `= (rep_dtu_freq >= Parameters$rep_reprod_thresh)`, for non-DTU `= (rep_dtu_freq <= 1 - Parameters$rep_reprod_thresh)`. @@ -254,16 +252,22 @@ For positive DTU, `= (rep_dtu_freq >= Parameters$rep_reprod_thresh)`, for non-DT * `Dprop` - (num) The difference in the proportion of the transcript between the two conditions (effect size). `= (probB - propA)`. * `pval` - (num) The proportion equality test P-value for the transcript. * `pval_corr` - (num) Multiple testing corrected p-value. `= p.adjust(pval, Parameters$correction)`. -* `quant_p_mean` - (num) Mean p-value across quantification bootstraps. -* `quant_p_stdev` - (num) Standard deviation of p-values across quantification bootstraps. -* `quant_p_min` - (num) Minimum observed p-value across quantification bootstraps. -* `quant_p_max` - (num) Maximum observed p-value across quantification bootstraps. +* `quant_p_median` - (num) Median of corrected p-value across quantification bootstraps. +* `quant_p_min` - (num) Minimum observed (corrected) p-value across quantification bootstraps. +* `quant_p_max` - (num) Maximum observed (corrected) p-value across quantification bootstraps. +* `quant_Dprop_mean` - (num) Mean effect size across quantification bootstraps. +* `quant_Dprop_stdev` - (num) Standard deviation of effect size across quantification bootstraps. +* `quant_Dprop_min` - (num) Minimum observed effect size across quantification bootstraps. +* `quant_Dprop_max` - (num) Maximum observed effect size across quantification bootstraps. * `quant_na_freq` - (num) Fraction of quantification iterations in which `DTU` could not be determined (not meeting pre-filtering criteria). * `quant_dtu_freq` - (num) Fraction of quantification iterations that support a positive `DTU` classification. -* `rep_p_mean` - (num) Mean p-value across replication bootstraps. -* `rep_p_stdev` - (num) Standard deviation of p-values across replication bootstraps. -* `rep_p_min` - (num) Minimum observed p-value across replication bootstraps. -* `rep_p_max` - (num) Maximum observed p-value across replication bootstraps. +* `rep_p_median` - (num) Median of corrected p-value across replication bootstraps. +* `rep_p_min` - (num) Minimum observed (corrected) p-value across replication bootstraps. +* `rep_p_max` - (num) Maximum observed (corrected) p-value across replication bootstraps. +* `rep_Dprop_mean` - (num) Mean effect size across replication bootstraps. +* `rep_Dprop_stdev` - (num) Standard deviation of effect size across replication bootstraps. +* `rep_Dprop_min` - (num) Minimum observed effect size across replication bootstraps. +* `rep_Dprop_max` - (num) Maximum observed effect size across replication bootstraps. * `rep_na_freq` - (num) Fraction of replication iterations in which `DTU` could not be determined (not meeting pre-filtering criteria). * `rep_dtu_freq` - (num) Fraction of replication iterations that support a positive `DTU` classification. diff --git a/inst/doc/output.html b/inst/doc/output.html index 3ff92f1..f547770 100644 --- a/inst/doc/output.html +++ b/inst/doc/output.html @@ -422,8 +422,8 @@

Genes

##  [1] "parent_id"      "elig"           "sig"            "elig_fx"       
 ##  [5] "quant_reprod"   "DTU"            "transc_DTU"     "known_transc"  
 ##  [9] "detect_transc"  "elig_transc"    "maxDprop"       "pval"          
-## [13] "pval_corr"      "quant_p_mean"   "quant_p_stdev"  "quant_p_min"   
-## [17] "quant_p_max"    "quant_na_freq"  "quant_dtu_freq"
+## [13] "pval_corr" "quant_p_median" "quant_p_min" "quant_p_max" +## [17] "quant_na_freq" "quant_dtu_freq"

The first few columns show the result of each decision step. The remaining columns list the values based on which the decisions were made. Pseudo-code formulas are shown to help understand how the different fields interact when making decisions.

  • parent_id - (str) Gene identifier.
  • @@ -439,14 +439,12 @@

    Genes

  • elig_transc - (int) Number of eligible transcripts, aggregated from Transcripts$elig.
  • pval - (num) G test of independence p-value for the isoform ratios.
  • pval_corr - (num) Multiple testing corrected p-value. pval_corr= p.adjust(pval, Parameters$correction).
  • -
  • quant_p_mean - (num) Mean p-value across quantification bootstraps.
  • -
  • quant_p_stdev - (num) Standard deviation of p-values across quantification bootstraps.
  • -
  • quant_p_min - (num) Minimum observed p-value across quantification bootstraps.
  • -
  • quant_p_max - (num) Maximum observed p-value across quantification bootstraps.
  • +
  • quant_p_median - (num) Median of corrected p-values across quantification bootstraps.
  • +
  • quant_p_min - (num) Minimum observed (corrected) p-value across quantification bootstraps.
  • +
  • quant_p_max - (num) Maximum observed (corrected) p-value across quantification bootstraps.
  • quant_na_freq - (num) Fraction of quantification iterations in which DTU could not be determined (not meeting pre-filtering criteria).
  • quant_dtu_freq - (num) Fraction of replicate iterations that support a positive DTU classification.
  • -
  • rep_p_mean - (num) Mean p-value across replication bootstraps.
  • -
  • rep_p_stdev - (num) Standard deviation of p-values across replication bootstraps.
  • +
  • rep_p_median - (num) Median p-value across replication bootstraps.
  • rep_p_min - (num) Minimum observed p-value across replication bootstraps.
  • rep_p_max - (num) Maximum observed p-value across replication bootstraps.
  • rep_na_freq - (num) Fraction of replication iterations in which DTU could not be determined (not meeting pre-filtering criteria).
  • @@ -460,26 +458,24 @@

    Transcripts

    The maximum likelihood-based G-test of independence is used to compare each given isoform’s proportions in the two conditions.

    # Transcripts table's fields.
     print( names(mydtu$Transcripts) )
    -
    ##  [1] "target_id"      "parent_id"      "elig_xp"        "elig"          
    -##  [5] "sig"            "elig_fx"        "quant_reprod"   "DTU"           
    -##  [9] "gene_DTU"       "meanA"          "meanB"          "stdevA"        
    -## [13] "stdevB"         "sumA"           "sumB"           "log2FC"        
    -## [17] "totalA"         "totalB"         "propA"          "propB"         
    -## [21] "Dprop"          "pval"           "pval_corr"      "quant_p_mean"  
    -## [25] "quant_p_stdev"  "quant_p_min"    "quant_p_max"    "quant_na_freq" 
    -## [29] "quant_dtu_freq"
    +
    ##  [1] "target_id"         "parent_id"         "elig_xp"          
    +##  [4] "elig"              "sig"               "elig_fx"          
    +##  [7] "quant_reprod"      "DTU"               "gene_DTU"         
    +## [10] "meanA"             "meanB"             "stdevA"           
    +## [13] "stdevB"            "sumA"              "sumB"             
    +## [16] "log2FC"            "totalA"            "totalB"           
    +## [19] "propA"             "propB"             "Dprop"            
    +## [22] "pval"              "pval_corr"         "quant_p_median"   
    +## [25] "quant_p_min"       "quant_p_max"       "quant_Dprop_mean" 
    +## [28] "quant_Dprop_stdev" "quant_Dprop_min"   "quant_Dprop_max"  
    +## [31] "quant_na_freq"     "quant_dtu_freq"
    • target_id - (str) Transcript identifier.
    • parent_id - (str) Gene identifier.
    • elig_xp - (bool) Eligible expression level. = (meanA >= Parameters$abund_thresh | meanB >= Parameters$abund_thresh).
    • elig - (bool) Eligible for testing (meets the noise threshold and at least one other isoform is expressed). = (elig_xp & totalA != 0 & totalB != 0 & (sumA != totalA | sumB != totalB)).
    • sig - (bool) Statistically significant. = (pval_corr < Parameters$p_thresh).
    • -
    • elig_fx - (bool) Eligible effect size. Proxy for biological significance. = (Dprop > Parameters$dprop_thresh).
    • -
    -
      -
    1. quant_reprod - (bool) Quantification reproducible. For positive DTU, = (quant_dtu_freq >= Parameters$quant_reprod_thresh), for non-DTU = (quant_dtu_freq <= 1 - Parameters$quant_reprod_thresh).
    2. -
    -
      +
    • elig_fx - (bool) Eligible effect size. Proxy for biological significance. = (Dprop > Parameters$dprop_thresh). *. quant_reprod - (bool) Quantification reproducible. For positive DTU, = (quant_dtu_freq >= Parameters$quant_reprod_thresh), for non-DTU = (quant_dtu_freq <= 1 - Parameters$quant_reprod_thresh).
    • rep_reprod - (bool) Replication reproducible. For positive DTU, = (rep_dtu_freq >= Parameters$rep_reprod_thresh), for non-DTU = (rep_dtu_freq <= 1 - Parameters$rep_reprod_thresh).
    • DTU - (bool) The Transcript is Differentially Used. = (sig & elig_fx & quant_reprod & rep_reprod). *. gene_DTU - (bool) Expanded from Genes$DTU. Indicates that the gene as a whole shows significant change in isoform ratios. = (Genes[parent_id, DTU]).
    • meanA and meanB - (num) The mean abundance across replicates, for each condition.
    • @@ -491,16 +487,22 @@

      Transcripts

    • Dprop - (num) The difference in the proportion of the transcript between the two conditions (effect size). = (probB - propA).
    • pval - (num) The proportion equality test P-value for the transcript.
    • pval_corr - (num) Multiple testing corrected p-value. = p.adjust(pval, Parameters$correction).
    • -
    • quant_p_mean - (num) Mean p-value across quantification bootstraps.
    • -
    • quant_p_stdev - (num) Standard deviation of p-values across quantification bootstraps.
    • -
    • quant_p_min - (num) Minimum observed p-value across quantification bootstraps.
    • -
    • quant_p_max - (num) Maximum observed p-value across quantification bootstraps.
    • +
    • quant_p_median - (num) Median of corrected p-value across quantification bootstraps.
    • +
    • quant_p_min - (num) Minimum observed (corrected) p-value across quantification bootstraps.
    • +
    • quant_p_max - (num) Maximum observed (corrected) p-value across quantification bootstraps.
    • +
    • quant_Dprop_mean - (num) Mean effect size across quantification bootstraps.
    • +
    • quant_Dprop_stdev - (num) Standard deviation of effect size across quantification bootstraps.
    • +
    • quant_Dprop_min - (num) Minimum observed effect size across quantification bootstraps.
    • +
    • quant_Dprop_max - (num) Maximum observed effect size across quantification bootstraps.
    • quant_na_freq - (num) Fraction of quantification iterations in which DTU could not be determined (not meeting pre-filtering criteria).
    • quant_dtu_freq - (num) Fraction of quantification iterations that support a positive DTU classification.
    • -
    • rep_p_mean - (num) Mean p-value across replication bootstraps.
    • -
    • rep_p_stdev - (num) Standard deviation of p-values across replication bootstraps.
    • -
    • rep_p_min - (num) Minimum observed p-value across replication bootstraps.
    • -
    • rep_p_max - (num) Maximum observed p-value across replication bootstraps.
    • +
    • rep_p_median - (num) Median of corrected p-value across replication bootstraps.
    • +
    • rep_p_min - (num) Minimum observed (corrected) p-value across replication bootstraps.
    • +
    • rep_p_max - (num) Maximum observed (corrected) p-value across replication bootstraps.
    • +
    • rep_Dprop_mean - (num) Mean effect size across replication bootstraps.
    • +
    • rep_Dprop_stdev - (num) Standard deviation of effect size across replication bootstraps.
    • +
    • rep_Dprop_min - (num) Minimum observed effect size across replication bootstraps.
    • +
    • rep_Dprop_max - (num) Maximum observed effect size across replication bootstraps.
    • rep_na_freq - (num) Fraction of replication iterations in which DTU could not be determined (not meeting pre-filtering criteria).
    • rep_dtu_freq - (num) Fraction of replication iterations that support a positive DTU classification.
    diff --git a/inst/doc/plots.html b/inst/doc/plots.html index bfa8c81..2b6091f 100644 --- a/inst/doc/plots.html +++ b/inst/doc/plots.html @@ -259,7 +259,7 @@

    Isoform abundances for a given gene

    When there are many replicates or many isoforms, it may be preferable to group abundances by isoform, making individual comparisons easier:

    # Grouping by isoform:
     plot_gene(mydtu, "MIX6", style="byisoform")
    -

    +

Overview plots

@@ -316,7 +316,7 @@

Change information layers

The fillby, colourby and shapeby parameters can be respectively used to control which information layers are encoded as fill, line/point colour, and point shape. Possible values are c("isoform", "condition", "DTU", "none", "replicate"). Be aware that some combinations of plot style and information layers are not possible. If safe to do so these will be silently ignored, otherwise an error message will appear.

# For a less busy look, any of the information layers can be disabled.
 plot_gene(mydtu, "MIX6", style="byisoform", colourby="none", shapeby="none")
-

+

The following command will approximate the older version of the gene plot (where DTU determined the fill colour):

plot_gene(mydtu, "MIX6", fillby="DTU", shapeby="none")

diff --git a/man/fish4rodents.Rd b/man/fish4rodents.Rd index 24298cc..c285b35 100644 --- a/man/fish4rodents.Rd +++ b/man/fish4rodents.Rd @@ -5,13 +5,13 @@ \title{Import abundances directly from salmon and kallisto output.} \usage{ fish4rodents(A_paths, B_paths, annot, TARGET_COL = "target_id", - PARENT_COL = "parent_id", half_cooked = FALSE, threads = 1L, - scaleto = 1e+06) + PARENT_COL = "parent_id", half_cooked = FALSE, beartext = FALSE, + threads = 1L, scaleto = 1e+06) } \arguments{ -\item{A_paths}{(character) A vector of strings, listing the directory paths to the quantifications for the first condition. One directory per replicate. The directory name should be a unique identifier for the sample.} +\item{A_paths}{(character) A vector of strings, listing the directory paths to the quantifications for the first condition. One directory per replicate, without trailing path dividers. The directory name should be a unique identifier for the sample.} -\item{B_paths}{(character) A vector of strings, listing the directory paths to the quantifications for the second condition. One directory per replicate. The directory name should be a unique identifier for the sample.} +\item{B_paths}{(character) A vector of strings, listing the directory paths to the quantifications for the second condition. One directory per replicate, without trailing path dividers.. The directory name should be a unique identifier for the sample.} \item{annot}{(data.frame) A table matching transcript identifiers to gene identifiers. This should be the same that you used for quantification and that you will use with \code{call_DTU()}. It is used to order the transcripts consistently throughout RATs.} @@ -21,6 +21,8 @@ fish4rodents(A_paths, B_paths, annot, TARGET_COL = "target_id", \item{half_cooked}{(logical) If TRUE, input is already in \code{Kallisto} h5 format and \code{wasabi} conversion will be skipped. Wasabi automatically skips conversion if abundance.h5 is present, so this parameter is redundant, unless wasabi is not installed. (Default FALSE)} +\item{beartext}{(logical) Instead of importing bootstrap data from the \code{abundance.h5} file of each sample, import it from plaintext files in a \code{bootstraps} subdirectory created by running \code{kallisto}'s \code{h5dump} subcommand (Default FALSE). This workaround circumvents some mysterious .h5 parsing issues on certain systems.} + \item{threads}{(integer) For parallel processing. (Default 1)} \item{scaleto}{(double) Scaling factor for normalised abundances. (Default 1000000 gives TPM). If a numeric vector is supplied instead, its length must match the total number of samples. The value order should correspond to the samples in group A followed by group B. This allows each sample to be scaled to its own actual library size, allowing higher-throughput samples to carry more weight in deciding DTU.} @@ -34,6 +36,7 @@ then applies TPM normalisation using the info available from the abundance.h5 fi } \details{ Converting, normalising and importing multiple bootstrapped abundance files takes a bit of time. +IMPORTANT: This function is currently not intended to be used to import non-bootstrapped quantifications. \code{wasabi} automatically skips format conversion if a folder already contains an \code{abundance.h5} file. } diff --git a/tests/testthat/test_1_internal-structures.R b/tests/testthat/test_1_internal-structures.R index 4d2e067..eb2d252 100644 --- a/tests/testthat/test_1_internal-structures.R +++ b/tests/testthat/test_1_internal-structures.R @@ -28,22 +28,22 @@ test_that("The reporting structures are created correctly", { expect_true(is.data.frame(full$Genes)) expect_true(is.data.frame(short$Genes)) - expect_equal(dim(full$Genes)[2], 26) + expect_equal(dim(full$Genes)[2], 24) expect_equal(dim(short$Genes)[2], 8) expect_named(full$Genes, c("parent_id", "elig", "sig", "elig_fx", "quant_reprod", "rep_reprod", "DTU", "transc_DTU", "known_transc", "detect_transc", "elig_transc", "maxDprop", "pval", "pval_corr", - "quant_p_mean", "quant_p_stdev", "quant_p_min", "quant_p_max", "quant_na_freq", "quant_dtu_freq", - "rep_p_mean", "rep_p_stdev", "rep_p_min", "rep_p_max", "rep_na_freq", "rep_dtu_freq") ) + "quant_p_median", "quant_p_min", "quant_p_max", "quant_na_freq", "quant_dtu_freq", + "rep_p_median", "rep_p_min", "rep_p_max", "rep_na_freq", "rep_dtu_freq") ) expect_named(short$Genes, c("parent_id", "DTU", "elig_transc", "elig", "elig_fx", "pval", "pval_corr", "sig") ) expect_true(is.data.frame(full$Transcripts)) expect_true(is.data.frame(short$Transcripts)) - expect_equal(dim(full$Transcripts)[2], 36) + expect_equal(dim(full$Transcripts)[2], 42) expect_equal(dim(short$Transcripts)[2], 17) expect_named(full$Transcripts, c("target_id", "parent_id", "elig_xp", "elig", "sig", "elig_fx", "quant_reprod", "rep_reprod", "DTU", "gene_DTU", "meanA", "meanB", "stdevA", "stdevB", "sumA", "sumB", "log2FC", "totalA", "totalB", "propA", "propB", "Dprop", "pval", "pval_corr", - "quant_p_mean", "quant_p_stdev", "quant_p_min","quant_p_max", "quant_na_freq", "quant_dtu_freq", - "rep_p_mean", "rep_p_stdev", "rep_p_min","rep_p_max", "rep_na_freq", "rep_dtu_freq") ) + "quant_p_median", "quant_p_min", "quant_p_max", "quant_Dprop_mean", "quant_Dprop_stdev", "quant_Dprop_min", "quant_Dprop_max", "quant_na_freq", "quant_dtu_freq", + "rep_p_median", "rep_p_min", "rep_p_max", "rep_Dprop_mean", "rep_Dprop_stdev", "rep_Dprop_min", "rep_Dprop_max", "rep_na_freq", "rep_dtu_freq") ) expect_named(short$Transcripts, c("target_id", "parent_id", "DTU", "sumA", "sumB", "log2FC", "totalA", "totalB", "elig_xp", "elig", "propA", "propB", "Dprop", "elig_fx", "pval", "pval_corr", "sig") ) diff --git a/tests/testthat/test_5_output.R b/tests/testthat/test_5_output.R index bdc7810..cd53467 100644 --- a/tests/testthat/test_5_output.R +++ b/tests/testthat/test_5_output.R @@ -20,26 +20,24 @@ test_that("The output structure is correct", { "rep_boot", "rep_reprod_thresh", "rep_bootnum", "seed", "reckless")) expect_true(is.data.frame(mydtu$Genes)) - expect_equal(dim(mydtu$Genes)[2], 26) + expect_equal(dim(mydtu$Genes)[2], 24) expect_named(mydtu$Genes, c("parent_id", "elig", "sig", "elig_fx", "quant_reprod", "rep_reprod", "DTU", "transc_DTU", "known_transc", "detect_transc", "elig_transc", "maxDprop", "pval", "pval_corr", - "quant_p_mean", "quant_p_stdev", "quant_p_min", "quant_p_max", "quant_na_freq", "quant_dtu_freq", - "rep_p_mean", "rep_p_stdev", "rep_p_min", "rep_p_max", "rep_na_freq", "rep_dtu_freq") ) + "quant_p_median", "quant_p_min", "quant_p_max", "quant_na_freq", "quant_dtu_freq", + "rep_p_median", "rep_p_min", "rep_p_max", "rep_na_freq", "rep_dtu_freq") ) expect_true(is.numeric(mydtu$Genes[["known_transc"]])) expect_true(is.numeric(mydtu$Genes[["detect_transc"]])) expect_true(is.numeric(mydtu$Genes[["maxDprop"]])) expect_true(is.numeric(mydtu$Genes[["pval"]])) expect_true(is.numeric(mydtu$Genes[["pval_corr"]])) expect_true(is.numeric(mydtu$Genes[["quant_dtu_freq"]])) - expect_true(is.numeric(mydtu$Genes[["quant_p_mean"]])) - expect_true(is.numeric(mydtu$Genes[["quant_p_stdev"]])) + expect_true(is.numeric(mydtu$Genes[["quant_p_median"]])) expect_true(is.numeric(mydtu$Genes[["quant_p_min"]])) expect_true(is.numeric(mydtu$Genes[["quant_p_max"]])) expect_true(is.numeric(mydtu$Genes[["quant_na_freq"]])) expect_true(is.logical(mydtu$Genes[["quant_reprod"]])) expect_true(is.numeric(mydtu$Genes[["rep_dtu_freq"]])) - expect_true(is.numeric(mydtu$Genes[["rep_p_mean"]])) - expect_true(is.numeric(mydtu$Genes[["rep_p_stdev"]])) + expect_true(is.numeric(mydtu$Genes[["rep_p_median"]])) expect_true(is.numeric(mydtu$Genes[["rep_p_min"]])) expect_true(is.numeric(mydtu$Genes[["rep_p_max"]])) expect_true(is.numeric(mydtu$Genes[["rep_na_freq"]])) @@ -51,11 +49,11 @@ test_that("The output structure is correct", { expect_true(is.logical(mydtu$Genes[["transc_DTU"]])) expect_true(is.data.frame(mydtu$Transcripts)) - expect_equal(dim(mydtu$Transcripts)[2], 36) + expect_equal(dim(mydtu$Transcripts)[2], 42) expect_named(mydtu$Transcripts, c("target_id", "parent_id", "elig_xp", "elig", "sig", "elig_fx", "quant_reprod", "rep_reprod", "DTU", "gene_DTU", "meanA", "meanB", "stdevA", "stdevB", "sumA", "sumB", "log2FC", "totalA", "totalB", "propA", "propB", "Dprop", "pval", "pval_corr", - "quant_p_mean", "quant_p_stdev", "quant_p_min","quant_p_max", "quant_na_freq", "quant_dtu_freq", - "rep_p_mean", "rep_p_stdev", "rep_p_min","rep_p_max", "rep_na_freq", "rep_dtu_freq") ) + "quant_p_median", "quant_p_min","quant_p_max", "quant_Dprop_mean", "quant_Dprop_stdev", "quant_Dprop_min","quant_Dprop_max", "quant_na_freq", "quant_dtu_freq", + "rep_p_median", "rep_p_min","rep_p_max", "rep_Dprop_mean", "rep_Dprop_stdev", "rep_Dprop_min","rep_Dprop_max", "rep_na_freq", "rep_dtu_freq") ) expect_true(is.logical(mydtu$Transcripts[["elig_xp"]])) expect_true(is.logical(mydtu$Transcripts[["elig"]])) expect_true(is.logical(mydtu$Transcripts[["elig_fx"]])) @@ -75,44 +73,49 @@ test_that("The output structure is correct", { expect_true(is.numeric(mydtu$Transcripts[["pval"]])) expect_true(is.numeric(mydtu$Transcripts[["pval_corr"]])) expect_true(is.numeric(mydtu$Transcripts[["quant_dtu_freq"]])) - expect_true(is.numeric(mydtu$Transcripts[["quant_p_mean"]])) - expect_true(is.numeric(mydtu$Transcripts[["quant_p_stdev"]])) + expect_true(is.numeric(mydtu$Transcripts[["quant_p_median"]])) expect_true(is.numeric(mydtu$Transcripts[["quant_p_min"]])) expect_true(is.numeric(mydtu$Transcripts[["quant_p_max"]])) + expect_true(is.numeric(mydtu$Transcripts[["quant_Dprop_mean"]])) + expect_true(is.numeric(mydtu$Transcripts[["quant_Dprop_stdev"]])) + expect_true(is.numeric(mydtu$Transcripts[["quant_Dprop_min"]])) + expect_true(is.numeric(mydtu$Transcripts[["quant_Dprop_max"]])) expect_true(is.numeric(mydtu$Transcripts[["quant_na_freq"]])) expect_true(is.logical(mydtu$Transcripts[["quant_reprod"]])) expect_true(is.numeric(mydtu$Transcripts[["rep_dtu_freq"]])) - expect_true(is.numeric(mydtu$Transcripts[["rep_p_mean"]])) - expect_true(is.numeric(mydtu$Transcripts[["rep_p_stdev"]])) + expect_true(is.numeric(mydtu$Transcripts[["rep_p_median"]])) expect_true(is.numeric(mydtu$Transcripts[["rep_p_min"]])) expect_true(is.numeric(mydtu$Transcripts[["rep_p_max"]])) + expect_true(is.numeric(mydtu$Transcripts[["rep_Dprop_mean"]])) + expect_true(is.numeric(mydtu$Transcripts[["rep_Dprop_stdev"]])) + expect_true(is.numeric(mydtu$Transcripts[["rep_Dprop_min"]])) + expect_true(is.numeric(mydtu$Transcripts[["rep_Dprop_max"]])) expect_true(is.numeric(mydtu$Transcripts[["rep_na_freq"]])) expect_true(is.logical(mydtu$Transcripts[["rep_reprod"]])) - expect_true(is.list(mydtu$Abundances)) expect_true(is.data.frame(mydtu$Abundances[[1]])) expect_true(is.data.frame(mydtu$Abundances[[2]])) mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", verbose = FALSE, qboot = FALSE) - expect_false(any(c("quant_dtu_freq", "quant_p_mean", "quant_p_stdev", "quant_p_min", "quant_p_max", "quant_na_freq", "quant_reprod") %in% names(mydtu$Genes))) - expect_false(any(c("quant_dtu_freq", "quant_p_mean", "quant_p_stdev", "quant_p_min", "quant_p_max", "quant_na_freq", "quant_reprod") %in% names(mydtu$Transcripts))) + expect_false(any(c("quant_dtu_freq", "quant_p_median", "quant_p_min", "quant_p_max", "quant_na_freq", "quant_reprod") %in% names(mydtu$Genes))) + expect_false(any(c("quant_dtu_freq", "quant_p_median", "quant_p_min", "quant_p_max", "quant_Dprop_mean", "quant_Dprop_stdev", "quant_Dprop_min", "quant_Dprop_max", "quant_na_freq", "quant_reprod") %in% names(mydtu$Transcripts))) mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", verbose = FALSE, rboot = FALSE) - expect_false(any(c("rep_dtu_freq", "rep_p_mean", "rep_p_stdev", "rep_p_min", "rep_p_max", "rep_na_freq", "rep_reprod") %in% names(mydtu$Genes))) - expect_false(any(c("rep_dtu_freq", "rep_p_mean", "rep_p_stdev", "rep_p_min", "rep_p_max", "rep_na_freq", "rep_reprod") %in% names(mydtu$Transcripts))) + expect_false(any(c("rep_dtu_freq", "rep_p_median", "rep_p_min", "rep_p_max", "rep_na_freq", "rep_reprod") %in% names(mydtu$Genes))) + expect_false(any(c("rep_dtu_freq", "rep_p_median", "rep_p_min", "rep_p_max", "rep_Dprop_mean", "rep_Dprop_stdev", "rep_Dprop_min", "rep_Dprop_max", "rep_na_freq", "rep_reprod") %in% names(mydtu$Transcripts))) mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", testmode="transc", qbootnum=2, rboot=TRUE, qboot=TRUE, verbose = FALSE) - expect_false(any(c("quant_dtu_freq", "quant_p_mean", "quant_p_stdev", "quant_p_min", "quant_p_max", "quant_na_freq", "quant_reprod", - "rep_dtu_freq", "rep_p_mean", "rep_p_stdev", "rep_p_min", "rep_p_max", "rep_na_freq", "rep_reprod") %in% names(mydtu$Genes))) - expect_true(all(c("quant_dtu_freq", "quant_p_mean", "quant_p_stdev", "quant_p_min", "quant_p_max", "quant_na_freq", "quant_reprod", - "rep_dtu_freq", "rep_p_mean", "rep_p_stdev", "rep_p_min", "rep_p_max", "rep_na_freq", "rep_reprod") %in% names(mydtu$Transcripts))) + expect_false(any(c("quant_dtu_freq", "quant_p_median", "quant_p_min", "quant_p_max", "quant_na_freq", "quant_reprod", + "rep_dtu_freq", "rep_p_median", "rep_p_min", "rep_p_max", "rep_na_freq", "rep_reprod") %in% names(mydtu$Genes))) + expect_true(all(c("quant_dtu_freq", "quant_p_median", "quant_p_min", "quant_p_max", "quant_Dprop_mean", "quant_Dprop_stdev", "quant_Dprop_min", "quant_Dprop_max", "quant_na_freq", "quant_reprod", + "rep_dtu_freq", "rep_p_median", "rep_p_min", "rep_p_max", "rep_Dprop_mean", "rep_Dprop_stdev", "rep_Dprop_min", "rep_Dprop_max", "rep_na_freq", "rep_reprod") %in% names(mydtu$Transcripts))) mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", testmode="genes", qbootnum=2, rboot=TRUE, qboot=TRUE, verbose = FALSE) - expect_false(any(c("quant_dtu_freq", "quant_p_mean", "quant_p_stdev", "quant_p_min", "quant_p_max", "quant_na_freq", "quant_reprod", - "rep_dtu_freq", "rep_p_mean", "rep_p_stdev", "rep_p_min", "rep_p_max", "rep_na_freq", "rep_reprod") %in% names(mydtu$Transcripts))) - expect_true(all(c("quant_dtu_freq", "quant_p_mean", "quant_p_stdev", "quant_p_min", "quant_p_max", "quant_na_freq", "quant_reprod", - "rep_dtu_freq", "rep_p_mean", "rep_p_stdev", "rep_p_min", "rep_p_max", "rep_na_freq", "rep_reprod") %in% names(mydtu$Genes))) + expect_false(any(c("quant_dtu_freq", "quant_p_median", "quant_p_min", "quant_p_max", "quant_Dprop_mean", "quant_Dprop_stdev", "quant_Dprop_min", "quant_Dprop_max", "quant_na_freq", "quant_reprod", + "rep_dtu_freq", "rep_p_median", "rep_p_min", "rep_p_max", "rep_Dprop_mean", "rep_Dprop_stdev", "rep_Dprop_min", "rep_Dprop_max", "rep_na_freq", "rep_reprod") %in% names(mydtu$Transcripts))) + expect_true(all(c("quant_dtu_freq", "quant_p_median", "quant_p_min", "quant_p_max", "quant_na_freq", "quant_reprod", + "rep_dtu_freq", "rep_p_median", "rep_p_min", "rep_p_max", "rep_na_freq", "rep_reprod") %in% names(mydtu$Genes))) }) @@ -171,15 +174,13 @@ test_that("The output content is complete", { expect_false(all(is.na(mydtu[[x]]$Genes[["transc_DTU"]]))) if (x>1) { expect_false(all(is.na(mydtu[[x]]$Genes[["quant_dtu_freq"]]))) - expect_false(all(is.na(mydtu[[x]]$Genes[["quant_p_mean"]]))) - expect_false(all(is.na(mydtu[[x]]$Genes[["quant_p_stdev"]]))) + expect_false(all(is.na(mydtu[[x]]$Genes[["quant_p_median"]]))) expect_false(all(is.na(mydtu[[x]]$Genes[["quant_p_min"]]))) expect_false(all(is.na(mydtu[[x]]$Genes[["quant_p_max"]]))) expect_false(all(is.na(mydtu[[x]]$Genes[["quant_na_freq"]]))) expect_false(all(is.na(mydtu[[x]]$Genes[["quant_reprod"]]))) expect_false(all(is.na(mydtu[[x]]$Genes[["rep_dtu_freq"]]))) - expect_false(all(is.na(mydtu[[x]]$Genes[["rep_p_mean"]]))) - expect_false(all(is.na(mydtu[[x]]$Genes[["rep_p_stdev"]]))) + expect_false(all(is.na(mydtu[[x]]$Genes[["rep_p_median"]]))) expect_false(all(is.na(mydtu[[x]]$Genes[["rep_p_min"]]))) expect_false(all(is.na(mydtu[[x]]$Genes[["rep_p_max"]]))) expect_false(all(is.na(mydtu[[x]]$Genes[["rep_na_freq"]]))) @@ -207,17 +208,23 @@ test_that("The output content is complete", { expect_false(all(is.na(mydtu[[x]]$Transcripts[["sig"]]))) if (x>1) { expect_false(all(is.na(mydtu[[x]]$Transcripts[["quant_dtu_freq"]]))) - expect_false(all(is.na(mydtu[[x]]$Transcripts[["quant_p_mean"]]))) - expect_false(all(is.na(mydtu[[x]]$Transcripts[["quant_p_stdev"]]))) + expect_false(all(is.na(mydtu[[x]]$Transcripts[["quant_p_median"]]))) expect_false(all(is.na(mydtu[[x]]$Transcripts[["quant_p_min"]]))) expect_false(all(is.na(mydtu[[x]]$Transcripts[["quant_p_max"]]))) + expect_false(all(is.na(mydtu[[x]]$Transcripts[["quant_Dprop_mean"]]))) + expect_false(all(is.na(mydtu[[x]]$Transcripts[["quant_Dprop_stdev"]]))) + expect_false(all(is.na(mydtu[[x]]$Transcripts[["quant_Dprop_min"]]))) + expect_false(all(is.na(mydtu[[x]]$Transcripts[["quant_Dprop_max"]]))) expect_false(all(is.na(mydtu[[x]]$Transcripts[["quant_na_freq"]]))) expect_false(all(is.na(mydtu[[x]]$Transcripts[["quant_reprod"]]))) expect_false(all(is.na(mydtu[[x]]$Transcripts[["rep_dtu_freq"]]))) - expect_false(all(is.na(mydtu[[x]]$Transcripts[["rep_p_mean"]]))) - expect_false(all(is.na(mydtu[[x]]$Transcripts[["rep_p_stdev"]]))) + expect_false(all(is.na(mydtu[[x]]$Transcripts[["rep_p_median"]]))) expect_false(all(is.na(mydtu[[x]]$Transcripts[["rep_p_min"]]))) expect_false(all(is.na(mydtu[[x]]$Transcripts[["rep_p_max"]]))) + expect_false(all(is.na(mydtu[[x]]$Transcripts[["rep_Dprop_mean"]]))) + expect_false(all(is.na(mydtu[[x]]$Transcripts[["rep_Dprop_stdev"]]))) + expect_false(all(is.na(mydtu[[x]]$Transcripts[["rep_Dprop_min"]]))) + expect_false(all(is.na(mydtu[[x]]$Transcripts[["rep_Dprop_max"]]))) expect_false(all(is.na(mydtu[[x]]$Transcripts[["rep_na_freq"]]))) expect_false(all(is.na(mydtu[[x]]$Transcripts[["rep_reprod"]]))) } diff --git a/tests/testthat/test_6_reports.R b/tests/testthat/test_6_reports.R index 0e1fce5..b344193 100644 --- a/tests/testthat/test_6_reports.R +++ b/tests/testthat/test_6_reports.R @@ -188,3 +188,12 @@ test_that("The overview plotting commands work", { expect_silent(plot_overview(dtuo=mydtu, type="reprod")) expect_silent(plot_overview(dtuo=mydtu, type="reprodVSdprop")) }) + +#============================================================================== +test_that("The diagnostic plots commands work", { + sim <- sim_boot_data() + mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", qbootnum=2, verbose = FALSE, reckless=TRUE) + + expect_silent(plot_diagnostics(mydtu)) + expect_silent(plot_diagnostics(dtuo=mydtu, type="cormat")) +}) diff --git a/vignettes/input.Rmd b/vignettes/input.Rmd index be59177..d90e223 100644 --- a/vignettes/input.Rmd +++ b/vignettes/input.Rmd @@ -218,8 +218,9 @@ mydata <- fish4rodents(A_paths= samples_A, B_paths= samples_B, annot= myannot, scaleto=100000000) ``` -The return value of `fish4rodents` is a list with two items: `$boot_data_A` and `$boot_data_B`. These two items are -formatted for RATs' generic bootstrapped input. The abundaces are normalised for isoform length and library-size. +The return value of `fish4rodents()` is a list with two items: `$boot_data_A` and `$boot_data_B`. These two items are +formatted for RATs' generic bootstrapped input. This helper function works *only* with bootstrapped quantifications. +The abundaces are normalised for isoform length and library-size. Scaling to 1M reads provides TPMs, which can be used by other tools as well. Other scaling options are discussed in a dedicated section later in this vignette. @@ -230,9 +231,11 @@ last segments of each path should be a directory with a unique identifying name 2. `annot` - The annotation table that you will use for calling DTU (and that was used for the quantification). This is needed to enforce consistent order of the transcripts in the tables, enabling efficient processing. -#### Optional arguments +#### Optional arguments (`fish4rodents`): * `scaleto` allows you to control the normalisation factor. (Default 1000000 gives TPM values). +* `half_cooked` indicates whether a kallisto-style abundance.h5 file already exists. (Default `FALSE`). `wasabi` automatically detects the presence of an abundance.h5 file, so this option is actually redundant and pointless. +* `beartext` directs `fish4rodents()` to read bootstrap data from plain-text files in a `bootstraps` subdirectory in each sample instead of parsing the abundance.h5 file of the sample. `kallisto` has the option to return plaintext results or to extract results from an existing abundance.h5 file to plaintext using its `h5dump` subcommand. (Default FALSE) And finally run DTU in the way already shown: @@ -462,6 +465,8 @@ Both `fish4rodents()` and `call_DTU()` support scaling by a single value or a ve directly to the desired library size(s), as in the examples below: ```{r, eval=FALSE} +# The following are equivalent. + # 1: # Scale directly to library sizes at the import step. mydata <- fish4rodents(A_paths= samples_A, B_paths= samples_B, @@ -493,7 +498,7 @@ mydata <- fish4rodents(A_paths= samples_A, B_paths= samples_B, # Scale TPMs to actual library sizes. mydtu <- call_DTU(annot= myannot, boot_data_A= mydata$boot_data_A, boot_data_B= mydata$boot_data_B, - scaling=c(25, 26.7, 23, 50.0, 45, 48.46, 52.36)) + scaling=c(25.123456, 26.65431, 23.131313, 50.0, 45.123132, 48.456654, 52.363636)) ``` You can mix and match scaling options as per your needs, so take care to ensure that the scaling you apply is appropriate. diff --git a/vignettes/output.Rmd b/vignettes/output.Rmd index 16f6b28..ebd9632 100644 --- a/vignettes/output.Rmd +++ b/vignettes/output.Rmd @@ -205,14 +205,12 @@ For positive DTU, `= (rep_dtu_freq >= Parameters$rep_reprod_thresh)`, for non-DT * `elig_transc` - (int) Number of eligible transcripts, aggregated from `Transcripts$elig`. * `pval` - (num) G test of independence p-value for the isoform ratios. * `pval_corr` - (num) Multiple testing corrected p-value. `pval_corr= p.adjust(pval, Parameters$correction)`. -* `quant_p_mean` - (num) Mean p-value across quantification bootstraps. -* `quant_p_stdev` - (num) Standard deviation of p-values across quantification bootstraps. -* `quant_p_min` - (num) Minimum observed p-value across quantification bootstraps. -* `quant_p_max` - (num) Maximum observed p-value across quantification bootstraps. +* `quant_p_median` - (num) Median of corrected p-values across quantification bootstraps. +* `quant_p_min` - (num) Minimum observed (corrected) p-value across quantification bootstraps. +* `quant_p_max` - (num) Maximum observed (corrected) p-value across quantification bootstraps. * `quant_na_freq` - (num) Fraction of quantification iterations in which `DTU` could not be determined (not meeting pre-filtering criteria). * `quant_dtu_freq` - (num) Fraction of replicate iterations that support a positive `DTU` classification. -* `rep_p_mean` - (num) Mean p-value across replication bootstraps. -* `rep_p_stdev` - (num) Standard deviation of p-values across replication bootstraps. +* `rep_p_median` - (num) Median p-value across replication bootstraps. * `rep_p_min` - (num) Minimum observed p-value across replication bootstraps. * `rep_p_max` - (num) Maximum observed p-value across replication bootstraps. * `rep_na_freq` - (num) Fraction of replication iterations in which `DTU` could not be determined (not meeting pre-filtering criteria). @@ -239,7 +237,7 @@ print( names(mydtu$Transcripts) ) * `elig` - (bool) Eligible for testing (meets the noise threshold and at least one other isoform is expressed). `= (elig_xp & totalA != 0 & totalB != 0 & (sumA != totalA | sumB != totalB))`. * `sig` - (bool) Statistically significant. `= (pval_corr < Parameters$p_thresh)`. * `elig_fx` - (bool) Eligible effect size. Proxy for biological significance. `= (Dprop > Parameters$dprop_thresh)`. -7. `quant_reprod` - (bool) Quantification reproducible. +*. `quant_reprod` - (bool) Quantification reproducible. For positive DTU, `= (quant_dtu_freq >= Parameters$quant_reprod_thresh)`, for non-DTU `= (quant_dtu_freq <= 1 - Parameters$quant_reprod_thresh)`. * `rep_reprod` - (bool) Replication reproducible. For positive DTU, `= (rep_dtu_freq >= Parameters$rep_reprod_thresh)`, for non-DTU `= (rep_dtu_freq <= 1 - Parameters$rep_reprod_thresh)`. @@ -254,16 +252,22 @@ For positive DTU, `= (rep_dtu_freq >= Parameters$rep_reprod_thresh)`, for non-DT * `Dprop` - (num) The difference in the proportion of the transcript between the two conditions (effect size). `= (probB - propA)`. * `pval` - (num) The proportion equality test P-value for the transcript. * `pval_corr` - (num) Multiple testing corrected p-value. `= p.adjust(pval, Parameters$correction)`. -* `quant_p_mean` - (num) Mean p-value across quantification bootstraps. -* `quant_p_stdev` - (num) Standard deviation of p-values across quantification bootstraps. -* `quant_p_min` - (num) Minimum observed p-value across quantification bootstraps. -* `quant_p_max` - (num) Maximum observed p-value across quantification bootstraps. +* `quant_p_median` - (num) Median of corrected p-value across quantification bootstraps. +* `quant_p_min` - (num) Minimum observed (corrected) p-value across quantification bootstraps. +* `quant_p_max` - (num) Maximum observed (corrected) p-value across quantification bootstraps. +* `quant_Dprop_mean` - (num) Mean effect size across quantification bootstraps. +* `quant_Dprop_stdev` - (num) Standard deviation of effect size across quantification bootstraps. +* `quant_Dprop_min` - (num) Minimum observed effect size across quantification bootstraps. +* `quant_Dprop_max` - (num) Maximum observed effect size across quantification bootstraps. * `quant_na_freq` - (num) Fraction of quantification iterations in which `DTU` could not be determined (not meeting pre-filtering criteria). * `quant_dtu_freq` - (num) Fraction of quantification iterations that support a positive `DTU` classification. -* `rep_p_mean` - (num) Mean p-value across replication bootstraps. -* `rep_p_stdev` - (num) Standard deviation of p-values across replication bootstraps. -* `rep_p_min` - (num) Minimum observed p-value across replication bootstraps. -* `rep_p_max` - (num) Maximum observed p-value across replication bootstraps. +* `rep_p_median` - (num) Median of corrected p-value across replication bootstraps. +* `rep_p_min` - (num) Minimum observed (corrected) p-value across replication bootstraps. +* `rep_p_max` - (num) Maximum observed (corrected) p-value across replication bootstraps. +* `rep_Dprop_mean` - (num) Mean effect size across replication bootstraps. +* `rep_Dprop_stdev` - (num) Standard deviation of effect size across replication bootstraps. +* `rep_Dprop_min` - (num) Minimum observed effect size across replication bootstraps. +* `rep_Dprop_max` - (num) Maximum observed effect size across replication bootstraps. * `rep_na_freq` - (num) Fraction of replication iterations in which `DTU` could not be determined (not meeting pre-filtering criteria). * `rep_dtu_freq` - (num) Fraction of replication iterations that support a positive `DTU` classification.