From 8ee32508f403814d303cd00fb5c54726c90f8cc8 Mon Sep 17 00:00:00 2001 From: Aaron Lun Date: Sun, 17 Nov 2024 16:16:55 -0800 Subject: [PATCH] Switch to scrapper for marker detection and clustering. (#283) This affects aggregateReference, plotMarkerHeatmap and trainSingleR with de.method="wilcox" or de.method="t". It should improve performance, reduce dependencies, and avoid sensitivity to the random seed during aggregation. --- DESCRIPTION | 11 +- NAMESPACE | 9 -- R/aggregateReference.R | 173 ++++++++++---------------------- R/plotMarkerHeatmap.R | 32 ++++-- R/trainSingleR.R | 100 +++++++++++++----- inst/NEWS.Rd | 8 ++ man/aggregateReference.Rd | 15 +-- man/plotMarkerHeatmap.Rd | 18 ++-- man/trainSingleR.Rd | 27 +++-- tests/testthat/test-aggregate.R | 70 +------------ tests/testthat/test-train.R | 70 +++++++++---- 11 files changed, 252 insertions(+), 281 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index fba6e12..62c7d65 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: SingleR Title: Reference-Based Single-Cell RNA-Seq Annotation -Version: 2.9.0 -Date: 2024-10-28 +Version: 2.9.1 +Date: 2024-11-17 Authors@R: c(person("Dvir", "Aran", email="dvir.aran@ucsf.edu", role=c("aut", "cph")), person("Aaron", "Lun", email="infinite.monkeys.with.keyboards@gmail.com", role=c("ctb", "cre")), person("Daniel", "Bunis", role="ctb"), @@ -19,13 +19,11 @@ Imports: DelayedArray, DelayedMatrixStats, BiocParallel, - BiocSingular, BiocNeighbors, stats, utils, Rcpp, - beachmat (>= 2.21.1), - parallel + beachmat (>= 2.21.1) LinkingTo: Rcpp, beachmat, @@ -39,8 +37,7 @@ Suggests: BiocGenerics, SingleCellExperiment, scuttle, - scater, - scran, + scrapper, scRNAseq, ggplot2, pheatmap, diff --git a/NAMESPACE b/NAMESPACE index c004009..280d79d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -33,12 +33,9 @@ importFrom(BiocNeighbors,KmknnParam) importFrom(BiocNeighbors,defineBuilder) importFrom(BiocParallel,SerialParam) importFrom(BiocParallel,bpisup) -importFrom(BiocParallel,bpmapply) importFrom(BiocParallel,bpnworkers) importFrom(BiocParallel,bpstart) importFrom(BiocParallel,bpstop) -importFrom(BiocSingular,bsparam) -importFrom(BiocSingular,runPCA) importFrom(DelayedArray,DelayedArray) importFrom(DelayedArray,colsum) importFrom(DelayedArray,getAutoBPPARAM) @@ -49,9 +46,7 @@ importFrom(DelayedArray,setAutoBPPARAM) importFrom(DelayedArray,sweep) importFrom(DelayedMatrixStats,rowAnyNAs) importFrom(DelayedMatrixStats,rowMedians) -importFrom(DelayedMatrixStats,rowVars) importFrom(Matrix,rowMeans) -importFrom(Matrix,t) importFrom(Rcpp,sourceCpp) importFrom(S4Vectors,"metadata<-") importFrom(S4Vectors,DataFrame) @@ -63,16 +58,12 @@ importFrom(S4Vectors,selfmatch) importFrom(SummarizedExperiment,SummarizedExperiment) importFrom(SummarizedExperiment,assay) importFrom(beachmat,initializeCpp) -importFrom(beachmat,realizeFileBackedMatrix) importFrom(methods,as) importFrom(methods,is) -importFrom(parallel,nextRNGStream) -importFrom(stats,kmeans) importFrom(stats,mad) importFrom(stats,median) importFrom(stats,rnorm) importFrom(stats,rpois) -importFrom(stats,runif) importFrom(utils,head) importFrom(utils,relist) useDynLib(SingleR) diff --git a/R/aggregateReference.R b/R/aggregateReference.R index 5ef63e7..729b2fc 100644 --- a/R/aggregateReference.R +++ b/R/aggregateReference.R @@ -14,8 +14,9 @@ #' @param ntop Integer scalar specifying the number of highly variable genes to use for the PCA step. #' @param subset.row Integer, character or logical vector indicating the rows of \code{ref} to use for k-means clustering. #' @param check.missing Logical scalar indicating whether rows should be checked for missing values (and if found, removed). -#' @param BPPARAM A \linkS4class{BiocParallelParam} object indicating how parallelization should be performed. -#' @param BSPARAM A \linkS4class{BiocSingularParam} object indicating which SVD algorithm should be used in \code{\link{runPCA}}. +#' @param num.threads Integer scalar specifying the number to threads to use. +#' @param BPPARAM Deprecated, use \code{num.threads} instead. +#' @param BSPARAM Deprecated and ignored. #' #' @details #' With single-cell reference datasets, it is often useful to aggregate individual cells into pseudo-bulk samples to serve as a reference. @@ -35,13 +36,13 @@ #' If \code{ncenters} is greater than the number of samples for a label and/or \code{power=1}, no aggregation is performed. #' Setting \code{power=0} will aggregate all cells of a label into a single pseudo-bulk profile. #' -#' In practice, k-means clustering is actually performed on the first \code{rank} principal components as computed using \code{\link{runPCA}}. -#' The use of PCs compacts the data for more efficient operation of \code{\link{kmeans}}; +#' In practice, k-means clustering is actually performed on the first \code{rank} principal components as computed using \code{\link[scrapper]{runPca}}. +#' The use of PCs compacts the data for more efficient operation of \code{\link[scrapper]{clusterKmeans}}; #' it also removes some of the high-dimensional noise to highlight major factors of within-label heterogenity. #' Note that the PCs are only used for clustering and the full expression profiles are still used for the final averaging. #' Users can disable the PCA step by setting \code{rank=Inf}. #' -#' By default, we speed things up by only using the top \code{ntop} genes with the largest variances in the PCA. +#' By default, we speed things up by only using the top \code{ntop} genes with the largest variances in the PCA, as identified with \code{\link[scrapper]{modelGeneVariances}}. #' More subsetting of the matrix prior to the PCA can be achieved by setting \code{subset.row} to an appropriate indexing vector. #' This option may be useful for clustering based on known genes of interest but retaining all genes in the aggregated results. #' (If both options are set, subsetting by \code{subset.row} is done first, and then the top \code{ntop} genes are selected.) @@ -75,36 +76,61 @@ #' @export #' @importFrom SummarizedExperiment SummarizedExperiment #' @importFrom S4Vectors DataFrame -#' @importFrom BiocParallel SerialParam bpnworkers bpmapply -#' @importFrom BiocSingular bsparam -aggregateReference <- function(ref, labels, ncenters=NULL, power=0.5, ntop=1000, assay.type="logcounts", - rank=20, subset.row=NULL, check.missing=TRUE, BPPARAM=SerialParam(), BSPARAM=bsparam()) +#' @importFrom Matrix rowMeans +#' @importFrom DelayedArray sweep colsum DelayedArray +#' @importFrom BiocParallel SerialParam bpnworkers +aggregateReference <- function( + ref, + labels, + ncenters=NULL, + power=0.5, + ntop=1000, + assay.type="logcounts", + rank=20, + subset.row=NULL, + check.missing=TRUE, + num.threads=bpnworkers(BPPARAM), + BPPARAM=SerialParam(), + BSPARAM=NULL) { by.label <- split(seq_along(labels), labels) if (is(ref, "SummarizedExperiment")) { ref <- assay(ref, i=assay.type) } + ref <- DelayedArray(ref) - FRAGMENTER <- function(x, i, j) { - x <- x[,j,drop=FALSE] # this is usually smaller, so do this first. - if (!is.null(i)) { - x <- x[i,,drop=FALSE] + output.vals <- vector("list", length(by.label)) + names(output.vals) <- names(by.label) + for (lab in names(by.label)) { + chosen <- by.label[[lab]] + current <- ref[,chosen,drop=FALSE] + + cur.ncenters <- ncenters + if (is.null(cur.ncenters)) { + cur.ncenters <- floor(ncol(current)^power) } - x - } - all.seeds <- .define_seeds(length(by.label)) + if (cur.ncenters <= 1) { + output <- matrix(rowMeans(current), dimnames=list(rownames(current), NULL)) + } else { + # Doing a mini-analysis here: PCA on HVGs followed by k-means. + stats <- scrapper::modelGeneVariances(current, num.threads=num.threads) + keep <- scrapper::chooseHighlyVariableGenes(stats$statistics$residuals, top=ntop) + sub <- current[keep,,drop=FALSE] + + if (rank <= min(dim(sub))-1L) { + pcs <- scrapper::runPca(sub, number=rank, num.threads=num.threads)$components + } else { + pcs <- as.matrix(sub) + } + + clustered <- scrapper::clusterKmeans(pcs, k=cur.ncenters, num.threads=num.threads) + agg <- colsum(current, clustered$cluster) + tab <- table(clustered$cluster)[colnames(agg)] + output <- sweep(agg, 2, tab, "/") + } - if (is.null(BPPARAM) || is(BPPARAM, "SerialParam") || is(BPPARAM, "MulticoreParam")) { - output.vals <- bpmapply(chosen=by.label, .seed=all.seeds, - FUN=function(chosen, x, ...) .aggregate_internal(FRAGMENTER(x, i=subset.row, j=chosen), ...), - MoreArgs=list(x=ref, ncenters=ncenters, power=power, rank=rank, check.missing=check.missing, ntop=ntop, BSPARAM=BSPARAM), - BPPARAM=BPPARAM, SIMPLIFY=FALSE, USE.NAMES=FALSE) - } else { - by.mat <- lapply(by.label, FRAGMENTER, x=ref, i=subset.row) - output.vals <- bpmapply(by.mat, .seed=all.seeds, FUN=.aggregate_internal, - MoreArgs=list(ncenters=ncenters, power=power, rank=rank, check.missing=check.missing, ntop=ntop, BSPARAM=BSPARAM), - BPPARAM=BPPARAM, SIMPLIFY=FALSE, USE.NAMES=FALSE) + output.vals[[lab]] <- output } if (length(output.vals)==0L) { @@ -119,98 +145,3 @@ aggregateReference <- function(ref, labels, ncenters=NULL, power=0.5, ntop=1000, colnames(output) <- sprintf("%s.%s", output.labels, sequence(num)) output } - -#' @importFrom stats kmeans -#' @importFrom utils head -#' @importFrom Matrix t rowMeans -#' @importFrom BiocSingular runPCA -#' @importFrom DelayedArray sweep colsum DelayedArray getAutoBPPARAM setAutoBPPARAM -#' @importFrom DelayedMatrixStats rowVars -#' @importFrom beachmat realizeFileBackedMatrix -.aggregate_internal <- function(current, ncenters, power, rank, ntop, check.missing, BSPARAM, .seed) { - oldseed <- .get_seed() - on.exit(.set_seed(oldseed)) - - old <- RNGkind("L'Ecuyer-CMRG") - on.exit(RNGkind(old[1]), add=TRUE, after=FALSE) - assign(".Random.seed", .seed, envir = .GlobalEnv) - - old.bp <- getAutoBPPARAM() - setAutoBPPARAM(SerialParam()) - on.exit(setAutoBPPARAM(old.bp), add=TRUE, after=FALSE) - - cur.ncenters <- ncenters - if (is.null(cur.ncenters)) { - cur.ncenters <- floor(ncol(current)^power) - } - - if (cur.ncenters <= 1) { - val <- matrix(rowMeans(current), dimnames=list(rownames(current), NULL)) - } else if (cur.ncenters >= ncol(current)) { - val <- current - } else { - to.use <- current - - if (ntop <= nrow(current)) { - o <- order(rowVars(to.use), decreasing=TRUE) - to.use <- to.use[head(o, ntop),,drop=FALSE] - } - - to.use <- t(to.use) - to.use <- realizeFileBackedMatrix(to.use) - - # Identifying the top PCs to avoid realizing the entire matrix. - if (rank <= min(dim(to.use))-1L) { - pcs <- runPCA(to.use, rank=rank, get.rotation=FALSE, BSPARAM=BSPARAM)$x - } else { - pcs <- as.matrix(to.use) - } - - clustered <- kmeans(pcs, centers=cur.ncenters) - val <- colsum(DelayedArray(current), clustered$cluster) - tab <- table(clustered$cluster)[colnames(val)] - val <- sweep(val, 2, tab, "/") - } - - val -} - -#' @importFrom stats runif -#' @importFrom parallel nextRNGStream -.define_seeds <- function(n) { - if (!n) { - return(list()) - } - - runif(1) # bumping the RNG so that repeated calls to this function generate different results. - - oldseed <- .get_seed() - on.exit(.set_seed(oldseed)) - - old <- RNGkind("L'Ecuyer-CMRG") - on.exit(RNGkind(old[1]), add=TRUE, after=FALSE) - - seeds <- vector("list", n) - seeds[[1L]] <- .Random.seed - for (i in seq_len(n - 1L)) { - seeds[[i + 1L]] <- nextRNGStream(seeds[[i]]) - } - - seeds -} - -.get_seed <- function( ) { - if (exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE)) { - get(".Random.seed", envir = .GlobalEnv, inherits = FALSE) - } else { - NULL - } -} - -.set_seed <- function(oldseed) { - if (!is.null(oldseed)) { - assign(".Random.seed", oldseed, envir = .GlobalEnv) - } else { - rm(.Random.seed, envir = .GlobalEnv) - } -} diff --git a/R/plotMarkerHeatmap.R b/R/plotMarkerHeatmap.R index a9a6eaa..9b1dfe9 100644 --- a/R/plotMarkerHeatmap.R +++ b/R/plotMarkerHeatmap.R @@ -13,12 +13,14 @@ #' Defaults to all available labels. #' @param assay.type Integer scalar or string specifying the matrix of expression values to use if \code{test} is a \linkS4class{SummarizedExperiment}. #' @param use.pruned Logical scalar indicating whether the pruned labels should be used instead. -#' @param order.by String specifying the column of the output of \code{\link[scran]{scoreMarkers}} with which to sort for interesting markers. +#' @param order.by.effect String specifying the effect size from \code{\link[scrapper]{scoreMarkers}} with which to sort for interesting markers. +#' @param order.by.summary String specifying the summary statistic from \code{\link[scrapper]{scoreMarkers}} with which to sort for interesting markers. #' @param display.row.names Character vector of length equal to the number of rows of \code{test}, #' containing the names of the features to show on the heatmap (e.g., to replace IDs with symbols). #' If \code{NULL}, the existing row names of \code{test} are used. #' @param top Integer scalar indicating the top most interesting markers to show in the heatmap. -#' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying the parallelization scheme to use for marker scoring. +#' @param num.threads Integer scalar specifying the number to threads to use. +#' @param BPPARAM Deprecated, use \code{num.threads} instead. #' #' @return #' The output of \code{\link[pheatmap]{pheatmap}} is returned showing the heatmap on the current graphics device. @@ -27,8 +29,8 @@ #' This function creates a heatmap where each row is a marker gene for \code{label} and each column is a cell in the test dataset. #' The aim is to check the effectiveness of the reference-derived markers for distinguishing between labels in the test dataset. #' Useful markers from the reference should show strong upregulation in \code{label} compared to all \code{other.labels}. -#' We identify such markers by scoring all reference-derived markers with \code{\link[scran]{scoreMarkers}} on the \code{test} expression. -#' The \code{top} markers based on the specified \code{order.by} field are shown in the heatmap. +#' We identify such markers by scoring all reference-derived markers with \code{\link[scrapper]{scoreMarkers}} on the \code{test} expression. +#' The \code{top} markers based on the specified \code{order.by.*} fields are shown in the heatmap. #' If only one label is present, markers are ranked by average abundance intead. #' #' @author Aaron Lun @@ -38,11 +40,11 @@ #' #' plotMarkerHeatmap(pred, test, pred$labels[1]) #' plotMarkerHeatmap(pred, test, pred$labels[1], use.pruned=TRUE) -#' plotMarkerHeatmap(pred, test, pred$labels[1], order.by="rank.AUC") +#' plotMarkerHeatmap(pred, test, pred$labels[1], order.by.effect="auc") #' #' @export #' @importFrom S4Vectors metadata -#' @importFrom BiocParallel SerialParam +#' @importFrom BiocParallel SerialParam bpnworkers #' @importFrom Matrix rowMeans #' @importFrom utils head plotMarkerHeatmap <- function( @@ -53,8 +55,10 @@ plotMarkerHeatmap <- function( assay.type="logcounts", display.row.names=NULL, use.pruned=FALSE, - order.by="rank.logFC.cohen", + order.by.effect="cohens.d", + order.by.summary="min.rank", top=20, + num.threads=bpnworkers(BPPARAM), BPPARAM = SerialParam()) { test <- .to_clean_matrix(test, assay.type, check.missing=FALSE) @@ -87,9 +91,17 @@ plotMarkerHeatmap <- function( # Prioritize the markers with interesting variation in the test data for # visualization. If we only have one label, we use the most abundant markers. if (length(unique(predictions)) > 1L) { - interesting <- scran::scoreMarkers(test, predictions, BPPARAM=BPPARAM) - stats <- interesting[[label]] - o <- order(stats[[order.by]], decreasing=!startsWith(order.by, "rank.")) + interesting <- scrapper::scoreMarkers( + test, + predictions, + num.threads=num.threads, + compute.auc=(order.by.effect=="auc"), + compute.cohens.d=(order.by.effect=="cohens.d"), + compute.delta.mean=(order.by.effect=="delta.mean"), + compute.delta.detected=(order.by.effect=="delta.detected") + ) + stats <- interesting[[order.by.effect]][[label]][[order.by.summary]] + o <- order(stats, decreasing=(order.by.summary!="min.rank")) to.show <- rownames(test)[o] } else { abundance <- rowMeans(test) diff --git a/R/trainSingleR.R b/R/trainSingleR.R index 8f0b9dd..787f36a 100644 --- a/R/trainSingleR.R +++ b/R/trainSingleR.R @@ -35,7 +35,7 @@ #' If \code{de.method="classic"}, defaults to \code{500 * (2/3) ^ log2(N)} where \code{N} is the number of unique labels. #' Otherwise, defaults to 10. #' Ignored if \code{genes} is a list of markers/DE genes. -#' @param de.args Named list of additional arguments to pass to \code{\link[scran]{pairwiseTTests}} or \code{\link[scran]{pairwiseWilcox}} when \code{de.method="wilcox"} or \code{"t"}. +#' @param de.args Named list of additional arguments to pass to \code{\link[scrapper]{scoreMarkers}} when \code{de.method="wilcox"} or \code{"t"}. #' Ignored if \code{genes} is a list of markers/DE genes. #' @param aggr.ref Logical scalar indicating whether references should be aggregated to pseudo-bulk samples for speed, see \code{\link{aggregateReference}}. #' @param aggr.args Further arguments to pass to \code{\link{aggregateReference}} when \code{aggr.ref=TRUE}. @@ -47,8 +47,7 @@ #' @param BNPARAM A \linkS4class{BiocNeighborParam} object specifying how the neighbor search index should be constructed. #' @param approximate Deprecated, use \code{BNPARAM} instead. #' @param num.threads Integer scalar specifying the number of threads to use for index building. -#' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying how parallelization should be performed. -#' Relevant for marker detection if \code{genes = NULL}, aggregation if \code{aggr.ref = TRUE}, and \code{NA} checking if \code{check.missing = TRUE}. +#' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying how parallelization should be performed when \code{check.missing = TRUE}. #' @param restrict A character vector of gene names to use for marker selection. #' By default, all genes in \code{ref} are used. #' @param test.genes Character vector of the names of the genes in the test dataset, i.e., the row names of \code{test} in \code{\link{classifySingleR}}. @@ -163,11 +162,23 @@ #' trained #' length(trained$markers$unique) #' -#' # Alternatively, computing and supplying a set of label-specific markers. -#' by.t <- scran::pairwiseTTests(assay(ref, 2), ref$label, direction="up") -#' markers <- scran::getTopMarkers(by.t[[1]], by.t[[2]], n=10) -#' trained <- trainSingleR(ref, ref$label, genes=markers) -#' length(trained$markers$unique) +#' # Alternatively, supplying a custom set of markers from pairwise comparisons. +#' all.labels <- unique(ref$label) +#' custom.markers <- list() +#' for (x in all.labels) { +#' current.markers <- lapply(all.labels, function(x) sample(rownames(ref), 20)) +#' names(current.markers) <- all.labels +#' current.markers[[x]] <- character(0) +#' custom.markers[[x]] <- current.markers +#' } +#' custom.trained <- trainSingleR(ref, ref$label, genes=custom.markers) +#' +#' # Alternatively, supplying a custom set of markers for each label. +#' custom.markers <- list() +#' for (x in all.labels) { +#' custom.markers[[x]] <- sample(rownames(ref), 20) +#' } +#' custom.trained <- trainSingleR(ref, ref$label, genes=custom.markers) #' #' @export #' @importFrom S4Vectors List isSingleString metadata metadata<- @@ -247,7 +258,7 @@ trainSingleR <- function( de.args=de.args, restrict=restrict, test.genes=test.genes, - BPPARAM=BPPARAM + num.threads=num.threads ) if (aggr.ref) { @@ -283,7 +294,7 @@ trainSingleR <- function( } } -.identify_genes <- function(ref, labels, genes, de.method, de.n, test.genes, restrict, de.args, BPPARAM) { +.identify_genes <- function(ref, labels, genes, de.method, de.n, test.genes, restrict, de.args, num.threads) { if (length(labels)!=ncol(ref)) { stop("number of labels must be equal to number of cells") } @@ -317,7 +328,7 @@ trainSingleR <- function( if (genes != "de") { .Deprecated(old="genes = \"", genes, "\"") } - genes <- .get_genes_by_de(ref, labels, de.n=de.n, de.method=de.method, de.args=de.args, BPPARAM=BPPARAM) + genes <- .get_genes_by_de(ref, labels, de.n=de.n, de.method=de.method, de.args=de.args, num.threads=num.threads) } genes @@ -365,24 +376,65 @@ trainSingleR <- function( .is_solo <- function(trained) !is.null(trained$ref) #' @importFrom utils head -.get_genes_by_de <- function(ref, labels, de.method="classic", de.n=NULL, de.args=list(), BPPARAM=SerialParam()) { +.get_genes_by_de <- function(ref, labels, de.method="classic", de.n=NULL, de.args=list(), num.threads=1) { if (de.method=="classic") { - getClassicMarkers(ref=ref, labels=labels, de.n=de.n, check.missing=FALSE, BPPARAM=BPPARAM) + return(getClassicMarkers(ref=ref, labels=labels, de.n=de.n, check.missing=FALSE, num.threads=num.threads)) + } + + if (de.method=="t") { + compute.auc <- FALSE + compute.cohens.d <- TRUE + effect.size <- "cohens.d" + upregulation.boundary <- 0 } else { - if (de.method=="t") { - FUN <- scran::pairwiseTTests - } else { - FUN <- scran::pairwiseWilcox - } + compute.auc <- TRUE + compute.cohens.d <- FALSE + effect.size <- "auc" + upregulation.boundary <- 0.5 + } - pairwise <- do.call(FUN, c(list(x=ref, groups=labels, direction="up", log.p=TRUE, BPPARAM=BPPARAM), de.args)) - if (is.null(de.n)) { - de.n <- 10 - } + pairwise <- do.call( + scrapper::scoreMarkers, + c( + list( + ref, + groups=labels, + num.threads=num.threads, + all.pairwise=TRUE, + compute.delta.detected=FALSE, + compute.delta.mean=FALSE, + compute.auc=compute.auc, + compute.cohens.d=compute.cohens.d + ), + de.args + ) + )[[effect.size]] + + if (is.null(de.n)) { + de.n <- 10 + } + + all.labels <- dimnames(pairwise)[[1]] + all.genes <- dimnames(pairwise)[[3]] + output <- vector("list", length(all.labels)) + names(output) <- all.labels - collected <- scran::getTopMarkers(pairwise$statistics, pairwise$pairs, n=de.n, pval.field="log.p.value", fdr.field="log.FDR", fdr.threshold=log(0.05)) - lapply(collected, as.list) + for (g1 in all.labels) { + current <- vector("list", length(all.labels)) + names(current) <- all.labels + for (g2 in all.labels) { + if (g1 == g2) { + current[[g2]] <- character(0) + next + } + stats <- pairwise[g2, g1,] # remember, second dimension is the first group in the comparison. + keep <- which(stats > upregulation.boundary) + o <- order(stats[keep], decreasing=TRUE) + current[[g2]] <- all.genes[keep[head(o, de.n)]] + } + output[[g1]] <- current } + output } .convert_per_label_set <- function(genes) { diff --git a/inst/NEWS.Rd b/inst/NEWS.Rd index 4c7c6bc..64f7daf 100644 --- a/inst/NEWS.Rd +++ b/inst/NEWS.Rd @@ -2,6 +2,14 @@ \title{SingleR News} \encoding{UTF-8} +\section{Version 2.10.0}{\itemize{ +\item Switch to \pkg{scrapper} for DE detection when \code{de.method="t"} or \code{de.method="wilcox"} in \code{trainSingleR()}. +This should give similar results to but is faster than the previous \pkg{scran} functions. + +\item Switch to \pkg{scrapper} for variance calculation, PCA and clustering in \code{aggregateReference()}. +This should be faster than the previous \pkg{BiocSingular} and \code{stats::kmeans} functions, and avoids the need to consider the seed. +}} + \section{Version 2.8.0}{\itemize{ \item Added the \code{test.genes=} argument to \code{trainSingleR()}, to restrict marker detection to only those genes in the test dataset. This is also checked against \code{rownames(test)} in \code{classifySingleR()} to ensure that the test's feature space is consistent with the space used during training. diff --git a/man/aggregateReference.Rd b/man/aggregateReference.Rd index 5dde288..5dbcc76 100644 --- a/man/aggregateReference.Rd +++ b/man/aggregateReference.Rd @@ -14,8 +14,9 @@ aggregateReference( rank = 20, subset.row = NULL, check.missing = TRUE, + num.threads = bpnworkers(BPPARAM), BPPARAM = SerialParam(), - BSPARAM = bsparam() + BSPARAM = NULL ) } \arguments{ @@ -39,9 +40,11 @@ if \code{ref} is a \linkS4class{SummarizedExperiment} object.} \item{check.missing}{Logical scalar indicating whether rows should be checked for missing values (and if found, removed).} -\item{BPPARAM}{A \linkS4class{BiocParallelParam} object indicating how parallelization should be performed.} +\item{num.threads}{Integer scalar specifying the number to threads to use.} -\item{BSPARAM}{A \linkS4class{BiocSingularParam} object indicating which SVD algorithm should be used in \code{\link{runPCA}}.} +\item{BPPARAM}{Deprecated, use \code{num.threads} instead.} + +\item{BSPARAM}{Deprecated and ignored.} } \value{ A \linkS4class{SummarizedExperiment} object with a \code{"logcounts"} assay containing a matrix of aggregated expression values, @@ -69,13 +72,13 @@ This ensures that labels with more cells have more resolved representatives. If \code{ncenters} is greater than the number of samples for a label and/or \code{power=1}, no aggregation is performed. Setting \code{power=0} will aggregate all cells of a label into a single pseudo-bulk profile. -In practice, k-means clustering is actually performed on the first \code{rank} principal components as computed using \code{\link{runPCA}}. -The use of PCs compacts the data for more efficient operation of \code{\link{kmeans}}; +In practice, k-means clustering is actually performed on the first \code{rank} principal components as computed using \code{\link[scrapper]{runPca}}. +The use of PCs compacts the data for more efficient operation of \code{\link[scrapper]{clusterKmeans}}; it also removes some of the high-dimensional noise to highlight major factors of within-label heterogenity. Note that the PCs are only used for clustering and the full expression profiles are still used for the final averaging. Users can disable the PCA step by setting \code{rank=Inf}. -By default, we speed things up by only using the top \code{ntop} genes with the largest variances in the PCA. +By default, we speed things up by only using the top \code{ntop} genes with the largest variances in the PCA, as identified with \code{\link[scrapper]{modelGeneVariances}}. More subsetting of the matrix prior to the PCA can be achieved by setting \code{subset.row} to an appropriate indexing vector. This option may be useful for clustering based on known genes of interest but retaining all genes in the aggregated results. (If both options are set, subsetting by \code{subset.row} is done first, and then the top \code{ntop} genes are selected.) diff --git a/man/plotMarkerHeatmap.Rd b/man/plotMarkerHeatmap.Rd index 4603b22..050bb66 100644 --- a/man/plotMarkerHeatmap.Rd +++ b/man/plotMarkerHeatmap.Rd @@ -12,8 +12,10 @@ plotMarkerHeatmap( assay.type = "logcounts", display.row.names = NULL, use.pruned = FALSE, - order.by = "rank.logFC.cohen", + order.by.effect = "cohens.d", + order.by.summary = "min.rank", top = 20, + num.threads = bpnworkers(BPPARAM), BPPARAM = SerialParam() ) } @@ -39,11 +41,15 @@ If \code{NULL}, the existing row names of \code{test} are used.} \item{use.pruned}{Logical scalar indicating whether the pruned labels should be used instead.} -\item{order.by}{String specifying the column of the output of \code{\link[scran]{scoreMarkers}} with which to sort for interesting markers.} +\item{order.by.effect}{String specifying the effect size from \code{\link[scrapper]{scoreMarkers}} with which to sort for interesting markers.} + +\item{order.by.summary}{String specifying the summary statistic from \code{\link[scrapper]{scoreMarkers}} with which to sort for interesting markers.} \item{top}{Integer scalar indicating the top most interesting markers to show in the heatmap.} -\item{BPPARAM}{A \linkS4class{BiocParallelParam} object specifying the parallelization scheme to use for marker scoring.} +\item{num.threads}{Integer scalar specifying the number to threads to use.} + +\item{BPPARAM}{Deprecated, use \code{num.threads} instead.} } \value{ The output of \code{\link[pheatmap]{pheatmap}} is returned showing the heatmap on the current graphics device. @@ -55,8 +61,8 @@ Create a heatmap of the log-normalized expression for the most interesting marke This function creates a heatmap where each row is a marker gene for \code{label} and each column is a cell in the test dataset. The aim is to check the effectiveness of the reference-derived markers for distinguishing between labels in the test dataset. Useful markers from the reference should show strong upregulation in \code{label} compared to all \code{other.labels}. -We identify such markers by scoring all reference-derived markers with \code{\link[scran]{scoreMarkers}} on the \code{test} expression. -The \code{top} markers based on the specified \code{order.by} field are shown in the heatmap. +We identify such markers by scoring all reference-derived markers with \code{\link[scrapper]{scoreMarkers}} on the \code{test} expression. +The \code{top} markers based on the specified \code{order.by.*} fields are shown in the heatmap. If only one label is present, markers are ranked by average abundance intead. } \examples{ @@ -65,7 +71,7 @@ example(SingleR, echo=FALSE) plotMarkerHeatmap(pred, test, pred$labels[1]) plotMarkerHeatmap(pred, test, pred$labels[1], use.pruned=TRUE) -plotMarkerHeatmap(pred, test, pred$labels[1], order.by="rank.AUC") +plotMarkerHeatmap(pred, test, pred$labels[1], order.by.effect="auc") } \author{ diff --git a/man/trainSingleR.Rd b/man/trainSingleR.Rd index ca17f58..1ae9cea 100644 --- a/man/trainSingleR.Rd +++ b/man/trainSingleR.Rd @@ -68,7 +68,7 @@ If \code{de.method="classic"}, defaults to \code{500 * (2/3) ^ log2(N)} where \c Otherwise, defaults to 10. Ignored if \code{genes} is a list of markers/DE genes.} -\item{de.args}{Named list of additional arguments to pass to \code{\link[scran]{pairwiseTTests}} or \code{\link[scran]{pairwiseWilcox}} when \code{de.method="wilcox"} or \code{"t"}. +\item{de.args}{Named list of additional arguments to pass to \code{\link[scrapper]{scoreMarkers}} when \code{de.method="wilcox"} or \code{"t"}. Ignored if \code{genes} is a list of markers/DE genes.} \item{aggr.ref}{Logical scalar indicating whether references should be aggregated to pseudo-bulk samples for speed, see \code{\link{aggregateReference}}.} @@ -92,8 +92,7 @@ If true and any missing values are found, the rows containing these values are s \item{BNPARAM}{A \linkS4class{BiocNeighborParam} object specifying how the neighbor search index should be constructed.} -\item{BPPARAM}{A \linkS4class{BiocParallelParam} object specifying how parallelization should be performed. -Relevant for marker detection if \code{genes = NULL}, aggregation if \code{aggr.ref = TRUE}, and \code{NA} checking if \code{check.missing = TRUE}.} +\item{BPPARAM}{A \linkS4class{BiocParallelParam} object specifying how parallelization should be performed when \code{check.missing = TRUE}.} } \value{ For a single reference, a \linkS4class{List} is returned containing: @@ -204,11 +203,23 @@ trained <- trainSingleR(ref, ref$label) trained length(trained$markers$unique) -# Alternatively, computing and supplying a set of label-specific markers. -by.t <- scran::pairwiseTTests(assay(ref, 2), ref$label, direction="up") -markers <- scran::getTopMarkers(by.t[[1]], by.t[[2]], n=10) -trained <- trainSingleR(ref, ref$label, genes=markers) -length(trained$markers$unique) +# Alternatively, supplying a custom set of markers from pairwise comparisons. +all.labels <- unique(ref$label) +custom.markers <- list() +for (x in all.labels) { + current.markers <- lapply(all.labels, function(x) sample(rownames(ref), 20)) + names(current.markers) <- all.labels + current.markers[[x]] <- character(0) + custom.markers[[x]] <- current.markers +} +custom.trained <- trainSingleR(ref, ref$label, genes=custom.markers) + +# Alternatively, supplying a custom set of markers for each label. +custom.markers <- list() +for (x in all.labels) { + custom.markers[[x]] <- sample(rownames(ref), 20) +} +custom.trained <- trainSingleR(ref, ref$label, genes=custom.markers) } \seealso{ diff --git a/tests/testthat/test-aggregate.R b/tests/testthat/test-aggregate.R index b5267b3..d66c66e 100644 --- a/tests/testthat/test-aggregate.R +++ b/tests/testthat/test-aggregate.R @@ -45,8 +45,7 @@ test_that("aggregateReference works as expected for no aggregation", { test_that("aggregateReference skips PCA when the requested rank is too high", { labels <- sample(LETTERS, ncol(sce), replace=TRUE) ref <- aggregateReference(sce, labels) - - expect_error(out <- aggregateReference(sce, labels, rank=1000, BSPARAM="WHEEE"), NA) # should not even require the BSPARAM. + expect_error(out <- aggregateReference(sce, labels, rank=1000), NA) # should not even require the BSPARAM. expect_identical(ncol(ref), ncol(out)) }) @@ -56,70 +55,3 @@ test_that("aggregateReference works as expected for empty inputs", { expect_identical(nrow(aggr), nrow(sce)) expect_identical(ncol(aggr), 0L) }) - -test_that("aggregateReference selects HVGs correctly", { - mat <- assay(sce) - keep <- sample(nrow(mat), 50) - mat[keep,] <- mat[keep,] * 100 - - labels <- sample(LETTERS, ncol(sce), replace=TRUE) - set.seed(10) - aggr <- aggregateReference(mat, labels, ntop=50) - - set.seed(10) - ref <- aggregateReference(mat[keep,], labels, ntop=Inf) - - expect_identical(aggr[keep,], ref) -}) - -test_that("aggregateReference seed setter behaves correctly", { - # Seed is different for each set of labels. - set.seed(10) - aggr <- aggregateReference(BiocGenerics::cbind(sce, sce), rep(1:2, each=ncol(sce))) - - N <- ncol(aggr)/2 - expect_false(identical(unname(assay(aggr)[,1:N]), unname(assay(aggr)[,N+1:N]))) - - # Seed is different for different runs. - labels <- sample(LETTERS, ncol(sce), replace=TRUE) - - set.seed(10) - X1 <- aggregateReference(sce, labels) # double usage is intentional. - X2 <- aggregateReference(sce, labels) - expect_false(identical(X1, X2)) - - # You get the same results with the same seed, and different results with different seeds. - labels <- sample(LETTERS, ncol(sce), replace=TRUE) - - set.seed(20) - ref <- aggregateReference(sce, labels) - - set.seed(20) - out <- aggregateReference(sce, labels) - expect_identical(ref, out) - - set.seed(30) - out <- aggregateReference(sce, labels) - expect_false(identical(ref, out)) - - # You get same results regardless of parallelization. - # Note that SnowParam requires us to disable the failsafes, - # otherwise setAutoBPPARAM() doesn't work properly. - setAutoBPPARAM(SerialParam()) - - set.seed(20) - out <- aggregateReference(sce, labels, BPPARAM=BiocParallel::SnowParam(2)) - expect_identical(ref, out) - - # The seed is unset properly for downstream applications. - set.seed(10) - aggregateReference(sce, labels) - test1 <- runif(10) - - set.seed(10) - aggregateReference(sce, labels, BPPARAM=BiocParallel::SnowParam(2)) - test2 <- runif(10) - expect_identical(test1, test2) - - setAutoBPPARAM(FAIL) -}) diff --git a/tests/testthat/test-train.R b/tests/testthat/test-train.R index 24a7260..a9bef02 100644 --- a/tests/testthat/test-train.R +++ b/tests/testthat/test-train.R @@ -65,29 +65,59 @@ test_that("trainSingleR fails correctly for a list of lists of genes", { }) test_that("trainSingleR works correctly for other DE testing methods", { - # For Wilcox. - by.t <- scran::pairwiseWilcox(logcounts(training), training$label, direction="up") - markers <- scran::getTopMarkers(by.t[[1]], by.t[[2]], n=10) + effects <- scrapper::scoreMarkers(logcounts(training), training$label, all.pairwise=TRUE) - ref <- trainSingleR(training, training$label, genes='de', de.method="wilcox") - trained <- trainSingleR(training, training$label, genes=markers) - expect_identical(ref$markers, trained$markers) + VERIFY <- function(ref.markers, effect.sizes, hard.limit, extra) { + expect_identical(sort(names(ref.markers)), sort(unique(training$label))) - # For t-tests. - by.t <- scran::pairwiseTTests(logcounts(training), training$label, direction="up") - markers <- scran::getTopMarkers(by.t[[1]], by.t[[2]], n=10) + for (n in names(ref.markers)) { + current.markers <- ref.markers[[n]] + expect_identical(sort(names(current.markers)), sort(unique(training$label))) + expect_identical(current.markers[[n]], character(0)) - ref <- trainSingleR(training, training$label, genes='de', de.method="t") - trained <- trainSingleR(training, training$label, genes=markers) - expect_identical(ref$markers, trained$markers) + for (n2 in setdiff(names(current.markers), n)) { + my.effects <- effect.sizes[n2, n,] + is.chosen <- rownames(training) %in% current.markers[[n2]] + expect_gte(min(my.effects[is.chosen]), max(my.effects[!is.chosen])) + expect_gt(min(my.effects[is.chosen]), hard.limit) + + if (!is.null(extra)) { + extra(n, n2, current.markers[[n2]]) + } + } + } + } - # Responds to the requested number of genes. - by.t <- scran::pairwiseTTests(logcounts(training), training$label, direction="up", lfc=1) - markers <- scran::getTopMarkers(by.t[[1]], by.t[[2]], n=20) + # For Wilcox. + ref <- trainSingleR(training, training$label, genes='de', de.method="wilcox") + VERIFY(ref$markers$full, effects$auc, 0.5, extra=NULL) - ref <- trainSingleR(training, training$label, genes='de', de.method="t", de.n=20, de.args=list(lfc=1)) - trained <- trainSingleR(training, training$label, genes=markers) - expect_identical(ref$markers, trained$markers) + # For t-tests. + ref <- trainSingleR(training, training$label, genes='de', de.method="t") + VERIFY(ref$markers$full, effects$cohens.d, 0, extra=function(n, n2, markers) { + expect_lte(length(markers), 10) + left <- Matrix::rowMeans(logcounts(training)[markers, training$label == n]) + right <- Matrix::rowMeans(logcounts(training)[markers, training$label == n2]) + expect_true(all(left > right)) + }) + + # Behaves with a large number of genes. + ref <- trainSingleR(training, training$label, genes='de', de.method="t", de.n=10000) + VERIFY(ref$markers$full, effects$cohens.d, 0, extra=function(n, n2, markers) { + expect_gt(length(markers), 10) + left <- Matrix::rowMeans(logcounts(training)[markers, training$label == n]) + right <- Matrix::rowMeans(logcounts(training)[markers, training$label == n2]) + expect_true(all(left > right)) + }) + + # Responds to threshold specification. + thresh.effects <- scrapper::scoreMarkers(logcounts(training), training$label, threshold=1, all.pairwise=TRUE) + ref <- trainSingleR(training, training$label, genes='de', de.method="t", de.args=list(threshold=1)) + VERIFY(ref$markers$full, thresh.effects$cohens.d, 0, extra=function(n, n2, markers) { + left <- Matrix::rowMeans(logcounts(training)[markers, training$label == n]) + right <- Matrix::rowMeans(logcounts(training)[markers, training$label == n2]) + expect_true(all(left > right + 1)) + }) }) test_that("trainSingleR is robust to non-character labels", { @@ -161,14 +191,12 @@ test_that("trainSingleR behaves with multiple references, plus recomputation", { }) test_that("trainSingleR behaves with aggregation turned on", { - set.seed(10000) suppressWarnings(out <- trainSingleR(training, training$label, aggr.ref=TRUE)) expect_true(ncol(out$ref) <= ncol(training)) - set.seed(10000) suppressWarnings(out2 <- trainSingleR(ref=list(training, training), label=list(training$label, training$label), aggr.ref=TRUE)) expect_identical(out2[[1]]$ref, out$ref) - expect_false(identical(out2[[2]]$ref, out$ref)) # different k-means initialization. + expect_identical(out2[[2]]$ref, out$ref) }) test_that("trainSingleR behaves with silly inputs", {