Skip to content

Commit

Permalink
Merge pull request #68 from bartongroup/development
Browse files Browse the repository at this point in the history
Development
  • Loading branch information
fruce-ki authored Dec 17, 2021
2 parents 58cc11f + 9a4579b commit 14127ca
Show file tree
Hide file tree
Showing 62 changed files with 3,384 additions and 6,406 deletions.
2 changes: 2 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
^.*\.Rproj$
^\.Rproj\.user$
^CONTRIBUTING*
^doc$
^Meta$
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
inst/doc/* linguist-documentation
doc/* linguist-documentation
vignettes/* linguist-documentation
doc/* linguist-documentation
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@
*.Rproj

.DS_Store
doc
Meta
29 changes: 15 additions & 14 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,33 +1,34 @@
Type: Package
Package: rats
Version: 0.6.4
Date: 2018-05-18
Version: 0.6.4.17
Date: 2019-07-18
Title: Relative Abundance of Transcripts
Encoding: UTF-8
Authors: c(person("Kimon Froussios", role=c("aut"), email="[email protected]"),
person("Kira Mourão", role=c("aut"), email="[email protected]"),
person("Nick J. Schurch", role=c("cre"), email="[email protected]"))
Author: Kimon Froussios [aut], Kira Mourão [aut], Nick Schurch [cre]
Maintainer: Kimon Froussios <k.froussios@dundee.ac.uk>
Maintainer: Kimon Froussios <kimon.froussios@gmail.com>
Description: Given a set of transript abundances and a transcript-to-gene index of
identifiers, this package identifies genes where differential transcript usage occurs.
Authors@R: c(person("Kimon Froussios", role=c("aut", "cre"), email="[email protected]"),
person("Kira Mourão", role=c("aut")),
person("Nick J. Schurch", role=c("aut")) )
License: MIT + file LICENSE
VignetteBuilder: knitr
Depends:
data.table (>= 1.9.8)
Imports:
data.table (>= 1.9.8),
utils,
stats,
parallel,
matrixStats
Suggests:
matrixStats,
ggplot2 (>= 2.2.0),
rtracklayer,
rhdf5,
wasabi,
wasabi,
GenomicRanges
Suggests:
ggbio,
shiny,
testthat,
knitr,
rmarkdown
testthat
URL: https://github.com/bartongroup/Rats, http://www.compbio.dundee.ac.uk
BugReports: https://github.com/bartongroup/RATS/issues
RoxygenNote: 6.0.1
RoxygenNote: 6.1.1
Binary file added Meta/vignette.rds
Binary file not shown.
14 changes: 13 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Generated by roxygen2: do not edit by hand

export(annot2ids)
export(annot2models)
export(call_DTU)
export(denest_sleuth_boots)
export(dtu_plurality_summary)
Expand All @@ -12,7 +13,9 @@ export(g.test.2)
export(get_dtu_ids)
export(get_plurality_ids)
export(get_switch_ids)
export(granges2ids)
export(group_samples)
export(gtf2ids)
export(maxabs)
export(plot_diagnostics)
export(plot_gene)
Expand All @@ -22,7 +25,16 @@ export(sim_boot_data)
export(sim_count_data)
export(tidy_annot)
import(data.table)
import(ggplot2)
import(matrixStats)
import(parallel)
import(stats)
import(rhdf5)
import(utils)
import(wasabi)
importFrom(GenomicRanges,makeGRangesListFromDataFrame)
importFrom(GenomicRanges,mcols)
importFrom(parallel,detectCores)
importFrom(parallel,mclapply)
importFrom(rtracklayer,import)
importFrom(stats,p.adjust.methods)
importFrom(stats,pchisq)
177 changes: 115 additions & 62 deletions R/data_simulators.R

Large diffs are not rendered by default.

447 changes: 361 additions & 86 deletions R/func.R

Large diffs are not rendered by default.

182 changes: 144 additions & 38 deletions R/input.R
Original file line number Diff line number Diff line change
@@ -1,35 +1,136 @@
########## ########## ########## ########## ########## ########## ########## ########## ##########
# Helper functions for setting up the input, if necessary.
########## ########## ########## ########## ########## ########## ########## ########## ##########


#================================================================================
#' Extract matching transcript and gene IDs from a GRanges object.
#'
#' The GRanges must have at least the following metadata columns: `gene_id` and `transcript_id` (such as GRanges imported from GTF).
#' GFF3-style metadata columns are currently not supported.
#'
#' @param annot A GRanges object with gene_id and transcript_id metadata columns.
#' @param transc_header The title for the transcripts column in the output. (target_id)
#' @param gene_header The title for the genes column in the output. (parent_id)
#' @return A data.table with two columns, matching transcript IDs to gene IDs.
#'
#' @importFrom GenomicRanges mcols
#' @import data.table
#' @export
#'
granges2ids <- function(annot, transc_header= "target_id", gene_header= "parent_id")
{
# If GTF style...
if ('transcript_id' %in% names(mcols(annot))) {
t2g <- unique(data.table(transcript_id=annot$transcript_id, gene_id=annot$gene_id))
# If GFF3 style...
} else {
stop('It seems you supplied a GFF3 file. This is currenlty not implemented. Please convert to GTF using available tools and try again.')
}
# Don't want any unpaired IDs (genes without transcripts are irrelevant, transcripts without gene are unusable).
t2g <- t2g[!is.na(t2g$gene_id) & !is.na(t2g$transcript_id), ] # R Check really does not like data.table syntax... :(

names(t2g) <- c(transc_header, gene_header)
return(t2g)
}


#================================================================================
#' Extract matching transcript and gene IDs from a GTF file.
#'
#' This function performs no file format validation. Please use the appropriate type.
#' It is NOT compatible GFF3, as there is no rigid specification for ID attributes.
#'Previously named annot2ids(). The old name will be discontinued.
#'
#' GFF3 not supported.
#'
#' @param annotfile A GTF file.
#' @param transc_header The title for the transcripts column in the output. (target_id)
#' @param gene_header The title for the genes column in the output. (parent_id)
#' @return A data.table with two columns, matching transcript IDs to gene IDs.
#'
#' @importFrom rtracklayer import
#' @export
#'
gtf2ids <- function(annotfile, transc_header= "target_id", gene_header= "parent_id")
{
annot <- rtracklayer::import(annotfile)
granges2ids(annot, transc_header, gene_header)
}


#================================================================================
#' (deprecated) Extract matching transcript and gene IDs from a GTF file.
#'
#' This function has been renamed to gtf2ids(). The old name will be discontinued.
#'
#' GFF3 not supported.
#'
#' @param annotfile A GTF file.
#' @param transc_header The tile for the transcripts column in the output. (target_id)
#' @param gene_header The tile for the genes column in the output. (parent_id)
#' @param transc_header The title for the transcripts column in the output. (target_id)
#' @param gene_header The title for the genes column in the output. (parent_id)
#' @return A data.table with two columns, matching transcript IDs to gene IDs.
#'
#' @export
#'
annot2ids <- function(annotfile, transc_header= "target_id", gene_header= "parent_id")
{
annot <- as.data.table(read.delim(annotfile, comment.char= "#", header= FALSE))
annot <- annot[(grepl("transcript_id", annot[[9]]) & grepl("gene_id", annot[[9]])), ]
t2g <- data.table("transc_id" = gsub(".*transcript_id \"?(.+?)\"?;.*", "\\1", annot[[9]]),
"gene_id" = gsub(".*gene_id \"?(.+?)\"?;.*", "\\1", annot[[9]]) )
# Clean up.
t2g <- unique(t2g)
names(t2g) <- c(transc_header, gene_header)
warning("This function has been renamed to 'gtf2ids()' for clarity. The old name 'annot2ids()' will be discontinued.")
gtf2ids(annotfile, transc_header, gene_header)
}

return(t2g)

#================================================================================
#' Prepare GTF for gene model plotting.
#'
#' Create GRanges objects for each transcript and bind them into a GRangeList for each gene.
#' Then one can easily \code{ggbio::autoplot()} a gene's models by its gene ID.
#'
#' @param annotfile A GTF file.
#' @param threads Number of processing threads (Default 1).
#' @return A list (by gene) of GRangeLists (by transcript).
#'
#' @importFrom rtracklayer import
#' @import parallel
#' @importFrom GenomicRanges makeGRangesListFromDataFrame
#' @export
#'
annot2models <- function(annotfile, threads= 1L)
{
message('This will take a few minutes...')
gr <- rtracklayer::import(annotfile)

# Split entries by gene and then by transcript.
lg <- list()

# If GTF style...
if ('transcript_id' %in% names(mcols(gr))) {
genes <- unique(gr$gene_id)
na <- is.na(genes)
if (any(na)){
warning("The annotation contains entries without gene_id assignment. These will be ommitted.")
genes <- genes[!na]
gr <- gr[!is.na(gr$gene_id)]
}
lg <- mclapply(genes, function(g){
features <- gr[gr$gene_id == g]
isoforms <- makeGRangesListFromDataFrame(as.data.frame(features), split.field='transcript_id', keep.extra.columns=TRUE)

return(isoforms)
}, mc.cores = threads)
names(lg) <- genes
# If GFF3 style...
} else {
stop("It looks like you supplied a GFF3 fileinstead of GTF. This is currently disabled as the result does not plot properly. Please convert to GTF first, using available tools, and try again.")
}

return(lg)
}


#================================================================================
#' Import abundances directly from salmon and kallisto output.
#'
#' Uses \code{wasabi} to convert Salmon read counts format to Kallisto RHDF5 format,
#' then applies TPM normalisation using the info available from the abundance.h5 files.
#' Convert Salmon read counts format to Kallisto RHDF5 format,
#' then apply TPM normalisation using the info available from the abundance.h5 files.
#'
#' Converting, normalising and importing multiple bootstrapped abundance files takes a bit of time.
#' IMPORTANT: This function is currently not intended to be used to import non-bootstrapped quantifications.
Expand All @@ -40,55 +141,53 @@ annot2ids <- function(annotfile, transc_header= "target_id", gene_header= "paren
#' @param B_paths (character) A vector of strings, listing the directory paths to the quantifications for the second condition. One directory per replicate, without trailing path dividers.. The directory name should be a unique identifier for the sample.
#' @param annot (data.frame) A table matching transcript identifiers to gene identifiers. This should be the same that you used for quantification and that you will use with \code{call_DTU()}. It is used to order the transcripts consistently throughout RATs.
#' @param scaleto (double) Scaling factor for normalised abundances. (Default 1000000 gives TPM). If a numeric vector is supplied instead, its length must match the total number of samples. The value order should correspond to the samples in group A followed by group B. This allows each sample to be scaled to its own actual library size, allowing higher-throughput samples to carry more weight in deciding DTU.
#' @param half_cooked (logical) If TRUE, input is already in \code{Kallisto} h5 format and \code{wasabi} conversion will be skipped. Wasabi automatically skips conversion if abundance.h5 is present, so this parameter is redundant, unless wasabi is not installed. (Default FALSE)
#' @param beartext (logical) Instead of importing bootstrap data from the \code{abundance.h5} file of each sample, import it from plaintext files in a \code{bootstraps} subdirectory created by running \code{kallisto}'s \code{h5dump} subcommand (Default FALSE). This workaround circumvents some mysterious .h5 parsing issues on certain systems.
#' @param half_cooked (logical) If TRUE, input is already in \code{Kallisto} h5 format and \code{wasabi} conversion will be skipped. FALSE can also be used to force wasabi to repeat the conversion even if it exists. (Default FALSE)
#' @param beartext (logical) Instead of importing bootstrap data from the \code{abundance.h5} file of each sample, import it from plaintext files in a \code{bootstrap} subdirectory created by running \code{kallisto}'s \code{h5dump} subcommand (Default FALSE). This workaround circumvents some mysterious .h5 parsing issues on certain systems.
#' @param TARGET_COL The name of the column for the transcript identifiers in \code{annot}. (Default \code{"target_id"})
#' @param PARENT_COL The name of the column for the gene identifiers in \code{annot}. (Default \code{"parent_id"})
#' @param threads (integer) For parallel processing. (Default 1)
#' @return A list of two, representing the TPM abundances per condition. These will be formatted in the RATs generic bootstrapped data input format.
#'
#' @import data.table
#' @import parallel
#' @import data.table
#' @import rhdf5
#' @import wasabi
#'
#' @export

fish4rodents <- function(A_paths, B_paths, annot, TARGET_COL="target_id", PARENT_COL="parent_id", half_cooked=FALSE, beartext=FALSE, threads= 1L, scaleto= 1000000)
{
threads <- as.integer(threads) # Can't be decimal.
setDTthreads(1) # Not a typo. Threads used for larger mclapply blocks instead of single table operations.
setDTthreads(threads, restore_after_fork = TRUE)

# Don't assume the annotation is ordered properly.
annot <- tidy_annot(annot, TARGET_COL, PARENT_COL)

# Wasabi?
if (!half_cooked) {
wasabi::prepare_fish_for_sleuth(c(A_paths, B_paths))
}
wasabi::prepare_fish_for_sleuth(c(A_paths, B_paths), force= !half_cooked)

# Sort out scaling.
lA <- length(A_paths)
lB <- length(B_paths)
sfA <- NULL
sfB <- NULL
if (length(scaleto) == 1) {
sfA <- rep(scaleto, lA)
sfB <- rep(scaleto, lB)
} else {
sfA <- scaleto[1:lA]
sfB <- scaleto[(1+lA):(lA+lB)]
# Sort out scaling factor per sample.
lgth <- c("A"=length(A_paths), "B"=length(B_paths))
sfac <- list("A"=NA_real_, "B"=NA_real_)
if (length(scaleto) == 1) { # uniform scaling
sfac$A <- rep(scaleto, lgth$A)
sfac$B <- rep(scaleto, lgth$B)
} else { # individual scaling
sfac$A <- scaleto[1:lgth$A]
sfac$B <- scaleto[(1 + lgth$A):(lgth$A + lgth$B)]
}

# Load and convert.
res <- lapply(c('A', 'B'), function(cond) {
boots_A <- mclapply(1:lA, function(x) {
# Get the correct files and scaling factors.
boots <- mclapply(1:lgth[cond], function(x) {
# Get the respective file and scaling factor.
sf <- sfac[[cond]][x]
if (cond=='A') {
fil <- A_paths[x]
sf <- sfA[x]
} else {
fil <- B_paths[x]
sf <- sfB[x]
}

# If data from Kallisto plaintext...
if (beartext) {
# list the bootstrap files
Expand All @@ -106,12 +205,14 @@ fish4rodents <- function(A_paths, B_paths, annot, TARGET_COL="target_id", PARENT
eff_length=as.numeric(rhdf5::h5read(file.path(fil, "abundance.h5"), "/aux/eff_lengths")) ))
counts <- as.data.table( rhdf5::h5read(file.path(fil, "abundance.h5"), "/bootstrap") )
}

# Normalise raw counts.
tpm <- as.data.table( lapply(counts, function (y) {
cpb <- y / meta$eff_length
tcpb <- sf / sum(cpb)
return(cpb * tcpb)
}) )

# Structure.
dt <- as.data.table( cbind(meta$target_id, tpm) )
with(dt, setkey(dt, V1) )
Expand All @@ -120,7 +221,9 @@ fish4rodents <- function(A_paths, B_paths, annot, TARGET_COL="target_id", PARENT
dt <- merge(annot[, c(TARGET_COL), with=FALSE], dt, by=TARGET_COL, all=TRUE)

return (dt)
}, mc.cores = threads, mc.preschedule = TRUE, mc.allow.recursive = FALSE)
}, mc.cores = min(threads,lgth[cond]), mc.preschedule = TRUE, mc.allow.recursive = FALSE)

return (boots)
})

names(res) <- c("boot_data_A", "boot_data_B")
Expand Down Expand Up @@ -157,8 +260,11 @@ fish4rodents <- function(A_paths, B_paths, annot, TARGET_COL="target_id", PARENT
#'
denest_sleuth_boots <- function(slo, annot, samples, COUNTS_COL="tpm", BS_TARGET_COL="target_id",
TARGET_COL="target_id", PARENT_COL="parent_id", threads= 1) {

warning("DEPRECATED: denest_sleuth_boots() no longer serves a purpose in RATs and will be eventually removed.")

# Ensure data.table complies.
setDTthreads(threads)
setDTthreads(threads, restore_after_fork = TRUE)

# Compared to the size of other structures involved in calling DTU, this should make negligible difference.
tx <- tidy_annot(annot, TARGET_COL, PARENT_COL)$target_id
Expand Down
Loading

0 comments on commit 14127ca

Please sign in to comment.