Skip to content

Commit

Permalink
Merge pull request #18 from bartongroup/development
Browse files Browse the repository at this point in the history
v0.4.4
  • Loading branch information
fruce-ki authored Apr 19, 2017
2 parents 9f5a9f7 + e3b3c6d commit 7b329c0
Show file tree
Hide file tree
Showing 33 changed files with 1,317 additions and 907 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: rats
Version: 0.4.2
Date: 2017-04-05
Version: 0.4.4
Date: 2017-04-19
Title: Relative Abundance of Transcripts
Encoding: UTF-8
Authors: c(person("Kimon Froussios", role=c("aut"), email="[email protected]"),
Expand Down
7 changes: 6 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,14 @@ export(call_DTU)
export(combine_sim_dtu)
export(countrange_sim)
export(denest_sleuth_boots)
export(dtu_plurality_summary)
export(dtu_summary)
export(g.test)
export(dtu_switch_summary)
export(g.test.1)
export(g.test.2)
export(get_dtu_ids)
export(get_plurality_ids)
export(get_switch_ids)
export(group_samples)
export(plot_gene)
export(plot_overview)
Expand Down
112 changes: 69 additions & 43 deletions R/func.R
Original file line number Diff line number Diff line change
Expand Up @@ -166,12 +166,12 @@ parameters_are_good <- function(slo, annot, name_A, name_B, varname, COUNTS_COL,
if (minboots < 100) {
warnmsg["toofewboots"] <- "Your quantifications have few bootstrap iterations, which reduces reproducibility of the calls."
}
if (qbootnum < 100) {
if (qbootnum < 100 && qbootnum != 0) {
warnmsg["toolowbootnum"] <- "The requested qbootnum is low, which reduces reproducibility of the calls."
}
bootcombos <- minboots^numsamples # Conservative estimate.
if (qbootnum >= bootcombos/100)
warnmsg["toomanyboots"] <- "The requested number of quantification bootstraps is very high, relative to the supplied data. Over 1% chance of duplicate iterations."
warnmsg["toomanyboots"] <- "The requested number of quantification bootstraps is very high, relatively to the supplied data. Over 1% chance of duplicate iterations."
if (qbootnum >= maxmatrix/dim(annot)[1])
return(list("error"=TRUE,"message"="The requested number of quantification bootstraps would exceed the maximum capacity of an R matrix."))
} # else it is probably count data and qboot will be auto-set to FALSE
Expand Down Expand Up @@ -302,12 +302,10 @@ alloc_out <- function(annot, full){
Genes <- data.table("parent_id"=as.vector(unique(annot$parent_id)),
"elig"=NA, "sig"=NA, "elig_fx"=NA, "quant_reprod"=NA, "rep_reprod"=NA, "DTU"=NA, "transc_DTU"=NA,
"known_transc"=NA_integer_, "detect_transc"=NA_integer_, "elig_transc"=NA_integer_,
"pvalAB"=NA_real_, "pvalBA"=NA_real_, "pvalAB_corr"=NA_real_, "pvalBA_corr"=NA_real_,
"quant_p_meanAB"=NA_real_, "quant_p_meanBA"=NA_real_, "quant_p_stdevAB"=NA_real_, "quant_p_stdevBA"=NA_real_,
"quant_p_minAB"=NA_real_, "quant_p_minBA"=NA_real_, "quant_p_maxAB"=NA_real_, "quant_p_maxBA"=NA_real_,
"pval"=NA_real_, "pval_corr"=NA_real_,
"quant_p_mean"=NA_real_, "quant_p_stdev"=NA_real_, "quant_p_min"=NA_real_, "quant_p_max"=NA_real_,
"quant_na_freq"=NA_real_, "quant_dtu_freq"=NA_real_,
"rep_p_meanAB"=NA_real_, "rep_p_meanBA"=NA_real_, "rep_p_stdevAB"=NA_real_, "rep_p_stdevBA"=NA_real_,
"rep_p_minAB"=NA_real_, "rep_p_minBA"=NA_real_, "rep_p_maxAB"=NA_real_, "rep_p_maxBA"=NA_real_,
"rep_p_mean"=NA_real_, "rep_p_stdev"=NA_real_, "rep_p_min"=NA_real_, "rep_p_max"=NA_real_,
"rep_na_freq"=NA_real_, "rep_dtu_freq"=NA_real_)
Transcripts <- data.table("target_id"=annot$target_id, "parent_id"=annot$parent_id,
"elig_xp"=NA, "elig"=NA, "sig"=NA, "elig_fx"=NA, "quant_reprod"=NA, "rep_reprod"=NA, "DTU"=NA, "gene_DTU"=NA,
Expand All @@ -324,8 +322,7 @@ alloc_out <- function(annot, full){
Parameters <- list("num_replic_A"=NA_integer_, "num_replic_B"=NA_integer_)
Genes <- data.table("parent_id"=levels(as.factor(annot$parent_id)), "DTU"=NA,
"elig_transc"=NA_integer_, "elig"=NA, "elig_fx"=NA,
"pvalAB"=NA_real_, "pvalBA"=NA_real_,
"pvalAB_corr"=NA_real_, "pvalBA_corr"=NA_real_, "sig"=NA)
"pval"=NA_real_, "pval_corr"=NA_real_, "sig"=NA)
Transcripts <- data.table("target_id"=annot$target_id, "parent_id"=annot$parent_id, "DTU"=NA,
"sumA"=NA_real_, "sumB"=NA_real_, # sum across replicates of means across bootstraps
"totalA"=NA_real_, "totalB"=NA_real_, # sum of all transcripts for that gene
Expand Down Expand Up @@ -359,9 +356,9 @@ alloc_out <- function(annot, full){
#' @param threads Number of threads (POSIX systems only).
#' @return list
#'
#' @import utils
#' @import parallel
#' @import data.table
#' @import stats
#'
calculate_DTU <- function(counts_A, counts_B, tx_filter, test_transc, test_genes, full, count_thresh, p_thresh, dprop_thresh, correction, threads= 1) {
if (packageVersion("data.table") >= "1.9.8") # Ensure data.table complies.
Expand Down Expand Up @@ -410,65 +407,94 @@ calculate_DTU <- function(counts_A, counts_B, tx_filter, test_transc, test_genes

#---------- TESTS

# Proportion test.
# Transcript-level test.
if (test_transc) {
Transcripts[(elig), pval := unlist( mclapply( as.data.frame(t(Transcripts[(elig), .(sumA, sumB, totalA, totalB)])),
function(row) { prop.test(x = row[c(1, 2)],
n = row[c(3, 4)],
correct = TRUE)[["p.value"]]
function(row) {
return( g.test.2(obsx= c(row[1], row[3]-row[1]), obsy= c(row[2], row[4]-row[2])) )
}, mc.cores= threads, mc.allow.recursive= FALSE, mc.preschedule= TRUE)
) ]
Transcripts[(elig), pval_corr := p.adjust(pval, method=correction)]
Transcripts[(elig), sig := pval_corr < p_thresh]
Transcripts[(elig), DTU := sig & elig_fx]
}

# G test.
# Gene-level test.
if (test_genes) {
Genes[(elig), c("pvalAB", "pvalBA") := t( as.data.frame( mclapply(Genes[(elig), parent_id],
Genes[(elig), pval := t( as.data.frame( mclapply(Genes[(elig), parent_id],
function(parent) {
# Extract all relevant data to avoid repeated look ups in the large table.
subdt <- Transcripts[parent, .(sumA, sumB, propA, propB)] # All isoforms, including not detected ones.
pAB <- g.test(x = subdt[, sumA], p = subdt[, propB])
pBA <- g.test(x = subdt[, sumB], p = subdt[, propA])
return(c(pAB, pBA))
subdt <- Transcripts[parent, .(sumA, sumB)] # All isoforms, including not detected ones.
return( g.test.2(obsx= subdt$sumA, obsy= subdt$sumB) )
}, mc.cores= threads, mc.preschedule= TRUE, mc.allow.recursive= FALSE)
)) ]
Genes[(elig), pvalAB_corr := p.adjust(pvalAB, method=correction)]
Genes[(elig), pvalBA_corr := p.adjust(pvalBA, method=correction)]
Genes[(elig), sig := pvalAB_corr < p_thresh & pvalBA_corr < p_thresh]
Genes[(elig), pval_corr := p.adjust(pval, method=correction)]
Genes[(elig), sig := pval_corr < p_thresh]
Genes[(elig), DTU := sig & elig_fx]
}
})
return(resobj)
}



#================================================================================
#' Log-likelihood test of goodness of fit.
#'
#' @param x a numeric vector of positive numbers, with at least one non-zero value.
#' @param p a vector of probabilities of the same length of x.
#'
#' Sourced and adapted from from:
#' V3.3 Pete Hurd Sept 29 2001. [email protected]
#' http://www.psych.ualberta.ca/~phurd/cruft/g.test.r
#' @param obsx a numeric vector of finite positive counts, with at least one non-zero value.
#' @param px a vector of probabilities of the same length as xobs. ( sum(px) <= 1 )
#'
#' The order of values in the two vectors should be the same.
#' If any value of xp is zero and the corresponding xobs is not, g.test.1 will always reject the hypothesis.
#' No corrections are applied.
#' No input checks are applied, as RATs needs to run this millions of times.
#'
#' @import stats
#' @export
#'
g.test <- function(x, p = rep(1/length(x), length(x)))
{
n <- sum(x)
E <- n * p
names(E) <- names(x)
g <- 0
for (i in 1:length(x)){
if (x[i] != 0) g <- g + x[i] * log(x[i]/E[i])
}
q <- 1
STATISTIC <- G <- 2*g/q
PARAMETER <- length(x) - 1
PVAL <- pchisq(STATISTIC, PARAMETER, lower.tail = FALSE)
#
g.test.1 <- function(obsx, px) {
n = length(obsx)
sx <- sum(obsx)
ex <- sx * px # expected values
G <- 2 * sum( sapply (seq.int(1, n, 1),
function (i) { if (obsx[i] != 0) { return(obsx[i] * log(obsx[i]/ex[i])) } else { return(0) } }) )
return( pchisq(G, df= n - 1, lower.tail= FALSE) )
}

#================================================================================
#' Log-likelihood test of independence.
#'
#' For two sets of observations.
#'
#' @param obsx a numeric vector of positive counts, with at least one non-zero value.
#' @param obsy a numeric vector of positive counts of same length as obsx, with at least one non-zero value.
#'
#' The order of values in the two vectors should be the same.
#' No corrections are applied.
#' No input checks are applied, as RATs needs to run this millions of times.
#'
#' @import stats
#' @export
#
g.test.2 <- function(obsx, obsy) {
n = length(obsx)
# Row and column sums.
sx <- sum(obsx)
sy <- sum(obsy)
sv <- obsx + obsy
st <- sx + sy
# Marginal probabilities.
mpx <- sx / st
mpy <- sy / st
mpv <- sapply(sv, function(v) {v/st})
# Expected values.
ex <- mpx * mpv * st
ey <- mpy * mpv * st
# Statistic.
G <- 2 * sum( sum( sapply (seq.int(1, n, 1),
function (i) { if (obsx[i] != 0) { return(obsx[i] * log(obsx[i]/ex[i])) } else { return(0) } }) ),
sum( sapply (seq.int(1, n, 1),
function (i) { if (obsy[i] != 0) { return(obsy[i] * log(obsy[i]/ey[i])) } else { return(0) } }) )
)
return( pchisq(G, df= n - 1, lower.tail= FALSE) )
}
Loading

0 comments on commit 7b329c0

Please sign in to comment.