Merge pull request #57 from bartongroup/development
Merge dev into master as 0.6.4
fruce-ki authored May 18, 2018
2 parents dc95c94 + 67e0231 commit b0eeabf
Showing 21 changed files with 449 additions and 384 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
@@ -1,2 +1,3 @@
^.*\.Rproj$
^\.Rproj\.user$
^CONTRIBUTING*
11 changes: 11 additions & 0 deletions CONTRIBUTING.md
@@ -0,0 +1,11 @@
# How to contribute

If you are handy with R and inclined to help out, you can propose enhancements and bug fixes via pull request to the development branch.

If you decide to do so, please make my life easier by following these points:
- [ ] Keep it simple. I will not accept any large-scale changes without prior communication, discussion and agreement.
- [ ] Document your code and generally follow good, legible coding practices. If I can't figure out what you've done, I cannot authorise the merge.
- [ ] Avoid adding new dependencies, unless absolutely necessary.
- [ ] Your changes must pass the existing tests and R CMD check (see the sketch after this list).
- [ ] Any new functionality must come with suitable additional tests.
- [ ] Submit your pull request to the development branch, not the master.
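
A minimal sketch of the local checks a pull request would typically go through, assuming the devtools and testthat packages are installed; adapt to your own workflow:

```r
# Run from the package root. Assumes the devtools and testthat packages are installed.
library(devtools)

document()   # regenerate NAMESPACE and man/ pages from the roxygen comments
test()       # run the existing testthat suite under tests/
check()      # build the package and run R CMD check on it
```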
12 changes: 6 additions & 6 deletions DESCRIPTION
@@ -1,30 +1,30 @@
Type: Package
Package: rats
Version: 0.6.3
Date: 2018-03-13
Version: 0.6.4
Date: 2018-05-18
Title: Relative Abundance of Transcripts
Encoding: UTF-8
Authors@R: c(person("Kimon Froussios", role=c("aut"), email="[email protected]"),
person("Kira Mourão", role=c("aut"), email="[email protected]"),
person("Nick J. Schurch", role=c("cre"), email="[email protected]"))
Author: Kimon Froussios [aut], Kira Mourão [aut], Nick Schurch [cre]
Maintainer: Kimon Froussios <[email protected]>
Description: Given a set of transcript abundances or a Sleuth output object, and a transcript-to-gene index of
Description: Given a set of transcript abundances and a transcript-to-gene index of
identifiers, this package identifies genes where differential transcript usage occurs.
License: MIT + file LICENSE
VignetteBuilder: knitr
Depends:
matrixStats,
data.table (>= 1.9.8)
Imports:
utils,
stats,
parallel,
matrixStats
Suggests:
ggplot2 (>= 2.2.0),
rhdf5,
wasabi,
shiny
Suggests:
shiny,
testthat,
knitr,
rmarkdown
4 changes: 0 additions & 4 deletions NAMESPACE
@@ -22,11 +22,7 @@ export(sim_boot_data)
export(sim_count_data)
export(tidy_annot)
import(data.table)
import(ggplot2)
import(matrixStats)
import(parallel)
import(rhdf5)
import(shiny)
import(stats)
import(utils)
import(wasabi)
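
With ggplot2, rhdf5, shiny and wasabi no longer imported via the NAMESPACE, calls into those packages need to be fully qualified, as the R/input.R diff below does with wasabi::prepare_fish_for_sleuth() and rhdf5::h5read(). A common pattern for optional dependencies is to guard the qualified call; a rough sketch (the check and message are illustrative assumptions, not RATs' actual wording):

```r
# Hypothetical guard for an optional dependency before reading a kallisto/salmon abundance file.
if (!requireNamespace("rhdf5", quietly = TRUE)) {
  stop("Package 'rhdf5' is needed to read abundance.h5 files. Please install it.")
}
ids <- rhdf5::h5read("abundance.h5", "/aux/ids")  # path is a placeholder
```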
14 changes: 8 additions & 6 deletions R/func.R
@@ -243,19 +243,21 @@ alloc_out <- function(annot, full, n=1){
"elig"=NA, "sig"=NA, "elig_fx"=NA, "quant_reprod"=NA, "rep_reprod"=NA, "DTU"=NA, "transc_DTU"=NA,
"known_transc"=NA_integer_, "detect_transc"=NA_integer_, "elig_transc"=NA_integer_, "maxDprop"=NA_real_,
"pval"=NA_real_, "pval_corr"=NA_real_,
"quant_p_mean"=NA_real_, "quant_p_stdev"=NA_real_, "quant_p_min"=NA_real_, "quant_p_max"=NA_real_,
"quant_p_median"=NA_real_, "quant_p_min"=NA_real_, "quant_p_max"=NA_real_,
"quant_na_freq"=NA_real_, "quant_dtu_freq"=NA_real_,
"rep_p_mean"=NA_real_, "rep_p_stdev"=NA_real_, "rep_p_min"=NA_real_, "rep_p_max"=NA_real_,
"rep_p_median"=NA_real_, "rep_p_min"=NA_real_, "rep_p_max"=NA_real_,
"rep_na_freq"=NA_real_, "rep_dtu_freq"=NA_real_)
Transcripts <- data.table("target_id"=annot$target_id, "parent_id"=annot$parent_id,
"elig_xp"=NA, "elig"=NA, "sig"=NA, "elig_fx"=NA, "quant_reprod"=NA, "rep_reprod"=NA, "DTU"=NA, "gene_DTU"=NA,
"meanA"=NA_real_, "meanB"=NA_real_, "stdevA"=NA_real_, "stdevB"=NA_real_,
"sumA"=NA_real_, "sumB"=NA_real_, "log2FC"=NA_real_, "totalA"=NA_real_, "totalB"=NA_real_,
"propA"=NA_real_, "propB"=NA_real_, "Dprop"=NA_real_,
"pval"=NA_real_, "pval_corr"=NA_real_,
"quant_p_mean"=NA_real_, "quant_p_stdev"=NA_real_, "quant_p_min"=NA_real_,"quant_p_max"=NA_real_,
"quant_p_median"=NA_real_, "quant_p_min"=NA_real_, "quant_p_max"=NA_real_,
"quant_Dprop_mean"=NA_real_, "quant_Dprop_stdev"=NA_real_, "quant_Dprop_min"=NA_real_, "quant_Dprop_max"=NA_real_,
"quant_na_freq"=NA_real_, "quant_dtu_freq"=NA_real_,
"rep_p_mean"=NA_real_, "rep_p_stdev"=NA_real_, "rep_p_min"=NA_real_,"rep_p_max"=NA_real_,
"rep_p_median"=NA_real_, "rep_p_min"=NA_real_, "rep_p_max"=NA_real_,
"rep_Dprop_mean"=NA_real_, "rep_Dprop_stdev"=NA_real_, "rep_Dprop_min"=NA_real_, "rep_Dprop_max"=NA_real_,
"rep_na_freq"=NA_real_, "rep_dtu_freq"=NA_real_)
CountData <- list()
} else {
@@ -353,8 +355,8 @@ calculate_DTU <- function(counts_A, counts_B, tx_filter, test_transc, test_genes

# Transcript-level test.
if (test_transc) {
if(any(Transcripts$elig)){ # Extreme case. If nothing is eligible, table subsetting by `elig` is nonsense and crashes.
# We can just skip testing altogether, as all output fields are initialized with NA already.
if(any(Transcripts$elig)){ # If nothing is eligible, table subsetting by `elig` is nonsense and crashes.
# In that case we can just skip testing altogether, as all output fields are initialized with NA already.
Transcripts[(elig), pval := unlist( mclapply( as.data.frame(t(Transcripts[(elig), .(sumA, sumB, totalA, totalB)])),
function(row) {
return( g.test.2(obsx= c(row[1], row[3]-row[1]), obsy= c(row[2], row[4]-row[2])) )
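
The transcript-level test above compares, for each eligible transcript, the transcript's reads against the remaining reads of its gene in the two conditions. A sketch of how those observed vectors are assembled for one hypothetical transcript (the numbers are made up; g.test.2() is the internal G-test helper shown in the diff):

```r
# Hypothetical counts for a single transcript and its parent gene.
sumA <- 120; totalA <- 400   # transcript reads and total gene reads in condition A
sumB <- 300; totalB <- 450   # transcript reads and total gene reads in condition B

# Observed vectors, built as in calculate_DTU():
obsx <- c(sumA, totalA - sumA)   # c(120, 280): transcript vs rest of gene, condition A
obsy <- c(sumB, totalB - sumB)   # c(300, 150): transcript vs rest of gene, condition B
# g.test.2(obsx, obsy) then tests whether the transcript's share of its gene differs between conditions.
```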
103 changes: 55 additions & 48 deletions R/input.R
@@ -32,27 +32,27 @@ annot2ids <- function(annotfile, transc_header= "target_id", gene_header= "paren
#' then applies TPM normalisation using the info available from the abundance.h5 files.
#'
#' Converting, normalising and importing multiple bootstrapped abundance files takes a bit of time.
#' IMPORTANT: This function is currently not intended for importing non-bootstrapped quantifications.
#'
#' \code{wasabi} automatically skips format conversion if a folder already contains an \code{abundance.h5} file.
#'
#' @param A_paths (character) A vector of strings, listing the directory paths to the quantifications for the first condition. One directory per replicate. The directory name should be a unique identifier for the sample.
#' @param B_paths (character) A vector of strings, listing the directory paths to the quantifications for the second condition. One directory per replicate. The directory name should be a unique identifier for the sample.
#' @param A_paths (character) A vector of strings, listing the directory paths to the quantifications for the first condition. One directory per replicate, without trailing path dividers. The directory name should be a unique identifier for the sample.
#' @param B_paths (character) A vector of strings, listing the directory paths to the quantifications for the second condition. One directory per replicate, without trailing path dividers. The directory name should be a unique identifier for the sample.
#' @param annot (data.frame) A table matching transcript identifiers to gene identifiers. This should be the same that you used for quantification and that you will use with \code{call_DTU()}. It is used to order the transcripts consistently throughout RATs.
#' @param scaleto (double) Scaling factor for normalised abundances. (Default 1000000 gives TPM). If a numeric vector is supplied instead, its length must match the total number of samples. The value order should correspond to the samples in group A followed by group B. This allows each sample to be scaled to its own actual library size, allowing higher-throughput samples to carry more weight in deciding DTU.
#' @param half_cooked (logical) If TRUE, input is already in \code{Kallisto} h5 format and \code{wasabi} conversion will be skipped. Wasabi automatically skips conversion if abundance.h5 is present, so this parameter is redundant, unless wasabi is not installed. (Default FALSE)
#' @param beartext (logical) Instead of importing bootstrap data from the \code{abundance.h5} file of each sample, import it from plaintext files in a \code{bootstraps} subdirectory created by running \code{kallisto}'s \code{h5dump} subcommand (Default FALSE). This workaround circumvents some mysterious .h5 parsing issues on certain systems.
#' @param TARGET_COL The name of the column for the transcript identifiers in \code{annot}. (Default \code{"target_id"})
#' @param PARENT_COL The name of the column for the gene identifiers in \code{annot}. (Default \code{"parent_id"})
#' @param half_cooked (logical) If TRUE, input is already in \code{Kallisto} h5 format and \code{wasabi} conversion will be skipped. Wasabi automatically skips conversion if abundance.h5 is present, so this parameter is redundant, unless wasabi is not installed. (Default FALSE)
#' @param threads (integer) For parallel processing. (Default 1)
#' @param scaleto (double) Scaling factor for normalised abundances. (Default 1000000 gives TPM). If a numeric vector is supplied instead, its length must match the total number of samples. The value order should correspond to the samples in group A followed by group B. This allows each sample to be scaled to its own actual library size, allowing higher-throughput samples to carry more weight in deciding DTU.
#' @return A list of two, representing the TPM abundances per condition. These will be formatted in the RATs generic bootstrapped data input format.
#'
#' @import wasabi
#' @import rhdf5
#' @import data.table
#' @import parallel
#'
#' @export

fish4rodents <- function(A_paths, B_paths, annot, TARGET_COL="target_id", PARENT_COL="parent_id", half_cooked=FALSE, threads= 1L, scaleto= 1000000)
fish4rodents <- function(A_paths, B_paths, annot, TARGET_COL="target_id", PARENT_COL="parent_id", half_cooked=FALSE, beartext=FALSE, threads= 1L, scaleto= 1000000)
{
threads <- as.integer(threads) # Can't be decimal.
setDTthreads(1) # Not a typo. Threads used for larger mclapply blocks instead of single table operations.
@@ -62,7 +62,7 @@ fish4rodents <- function(A_paths, B_paths, annot, TARGET_COL="target_id", PARENT

# Wasabi?
if (!half_cooked) {
prepare_fish_for_sleuth(c(A_paths, B_paths))
wasabi::prepare_fish_for_sleuth(c(A_paths, B_paths))
}

# Sort out scaling.
@@ -78,46 +78,53 @@ fish4rodents <- function(A_paths, B_paths, annot, TARGET_COL="target_id", PARENT
sfB <- scaleto[(1+lA):(lA+lB)]
}

# Load and convert manually.
boots_A <- mclapply(1:lA, function(x) {
fil <- A_paths[x]
sf <- sfA[x]
ids <- as.data.table( h5read(file.path(fil, "abundance.h5"), "/aux/ids") )
counts <- as.data.table( h5read(file.path(fil, "abundance.h5"), "/bootstrap") )
effl <- h5read(file.path(fil, "abundance.h5"), "/aux/eff_lengths")
tpm <- as.data.table( lapply(counts, function (y) {
cpb <- y / effl
tcpb <- sf / sum(cpb)
return(cpb * tcpb)
}) )
dt <- as.data.table( cbind(ids, tpm) )
with(dt, setkey(dt, V1) )
names(dt)[1] <- TARGET_COL
# Order transcripts to match annotation.
dt <- merge(annot[, c(TARGET_COL), with=FALSE], dt, by=TARGET_COL, all=TRUE)
return (dt)
}, mc.cores = threads, mc.preschedule = TRUE, mc.allow.recursive = FALSE)

boots_B <- mclapply(1:lB, function(x) {
fil <- B_paths[x]
sf <- sfB[x]
ids <- as.data.table( h5read(file.path(fil, "abundance.h5"), "/aux/ids") )
counts <- as.data.table( h5read(file.path(fil, "abundance.h5"), "/bootstrap") )
effl <- h5read(file.path(fil, "abundance.h5"), "/aux/eff_lengths")
tpm <- as.data.table( lapply(counts, function (y) {
cpb <- y / effl
tcpb <- sf / sum(cpb)
return(cpb * tcpb)
}) )
dt <- as.data.table( cbind(ids, tpm) )
with(dt, setkey(dt, V1) )
names(dt)[1] <- TARGET_COL
# Order transcripts to match annotation.
dt <- merge(annot[, c(TARGET_COL), with=FALSE], dt, by=TARGET_COL, all=TRUE)
return (dt)
}, mc.cores = threads, mc.preschedule = TRUE, mc.allow.recursive = FALSE)

return(list("boot_data_A"= boots_A, "boot_data_B"= boots_B))
# Load and convert.
res <- lapply(c('A', 'B'), function(cond) {
lc <- if (cond=='A') lA else lB # number of replicates in the current condition
boots <- mclapply(1:lc, function(x) {
# Get the correct files and scaling factors.
if (cond=='A') {
fil <- A_paths[x]
sf <- sfA[x]
} else {
fil <- B_paths[x]
sf <- sfB[x]
}
# If data from Kallisto plaintext...
if (beartext) {
# list the bootstrap files
bfils <- list.files(path=file.path(fil, "bootstraps"), full.names=TRUE, no..=TRUE)
bfils <- bfils[ grepl('bs_abundance', bfils) ]
# parse info.
meta <- fread(bfils[[1]], header=TRUE)[, c('target_id', 'eff_length'), with=FALSE]
# the IDs should all come out in the same order in every iteration file of a given sample,
# and the transcript lengths should not change either.
counts <- as.data.table( lapply(bfils, function(bf){ fread(bf, header=TRUE)[['est_counts']] }) ) # plaintext already has TPMs computed, but I stick with the raw counts
# for consistency with the .h5 mode and to allow normalisation to other target sizes
# If data from Salmon/Wasabi or Kallisto abundance.h5...
} else {
meta <- as.data.table(list( target_id=as.character(rhdf5::h5read(file.path(fil, "abundance.h5"), "/aux/ids")),
eff_length=as.numeric(rhdf5::h5read(file.path(fil, "abundance.h5"), "/aux/eff_lengths")) ))
counts <- as.data.table( rhdf5::h5read(file.path(fil, "abundance.h5"), "/bootstrap") )
}
# Normalise raw counts.
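# For each bootstrap column: divide counts by effective length to get counts-per-base,
# then rescale so the column sums to the scaling factor `sf` (1e6 by default, i.e. TPM).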
tpm <- as.data.table( lapply(counts, function (y) {
cpb <- y / meta$eff_length
tcpb <- sf / sum(cpb)
return(cpb * tcpb)
}) )
# Structure.
dt <- as.data.table( cbind(meta$target_id, tpm) )
with(dt, setkey(dt, V1) )
names(dt)[1] <- TARGET_COL
# Order transcripts to match annotation.
dt <- merge(annot[, c(TARGET_COL), with=FALSE], dt, by=TARGET_COL, all=TRUE)

return (dt)
}, mc.cores = threads, mc.preschedule = TRUE, mc.allow.recursive = FALSE)
})

names(res) <- c("boot_data_A", "boot_data_B")
return(res)
}
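
For orientation, a sketch of how the updated signature might be called; the directory paths, the `myannot` annotation table and the thread count are placeholders, and beartext=TRUE assumes kallisto's bootstraps were dumped to plaintext beforehand:

```r
# Hypothetical call to fish4rodents(); directory names and `myannot` are placeholders.
mydata <- fish4rodents(A_paths  = c("quant/ctrl_1", "quant/ctrl_2", "quant/ctrl_3"),
                       B_paths  = c("quant/treat_1", "quant/treat_2", "quant/treat_3"),
                       annot    = myannot,     # target_id / parent_id table, as used for call_DTU()
                       beartext = TRUE,        # read plaintext bootstraps from each sample's bootstraps/ subdirectory
                       threads  = 4L,
                       scaleto  = 1000000)     # scalar for TPM, or one value per sample for library-size weighting
# mydata$boot_data_A and mydata$boot_data_B can then be passed to call_DTU().
```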



