Skip to content

Commit

Permalink
Switch to scrapper for marker detection and clustering. (#283)
Browse files Browse the repository at this point in the history
This affects aggregateReference, plotMarkerHeatmap and trainSingleR with
de.method="wilcox" or de.method="t". It should improve performance, reduce
dependencies, and avoid sensitivity to the random seed during aggregation.
  • Loading branch information
LTLA authored Nov 18, 2024
1 parent dcdc885 commit 8ee3250
Show file tree
Hide file tree
Showing 11 changed files with 252 additions and 281 deletions.
11 changes: 4 additions & 7 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: SingleR
Title: Reference-Based Single-Cell RNA-Seq Annotation
Version: 2.9.0
Date: 2024-10-28
Version: 2.9.1
Date: 2024-11-17
Authors@R: c(person("Dvir", "Aran", email="[email protected]", role=c("aut", "cph")),
person("Aaron", "Lun", email="[email protected]", role=c("ctb", "cre")),
person("Daniel", "Bunis", role="ctb"),
Expand All @@ -19,13 +19,11 @@ Imports:
DelayedArray,
DelayedMatrixStats,
BiocParallel,
BiocSingular,
BiocNeighbors,
stats,
utils,
Rcpp,
beachmat (>= 2.21.1),
parallel
beachmat (>= 2.21.1)
LinkingTo:
Rcpp,
beachmat,
Expand All @@ -39,8 +37,7 @@ Suggests:
BiocGenerics,
SingleCellExperiment,
scuttle,
scater,
scran,
scrapper,
scRNAseq,
ggplot2,
pheatmap,
Expand Down
9 changes: 0 additions & 9 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,9 @@ importFrom(BiocNeighbors,KmknnParam)
importFrom(BiocNeighbors,defineBuilder)
importFrom(BiocParallel,SerialParam)
importFrom(BiocParallel,bpisup)
importFrom(BiocParallel,bpmapply)
importFrom(BiocParallel,bpnworkers)
importFrom(BiocParallel,bpstart)
importFrom(BiocParallel,bpstop)
importFrom(BiocSingular,bsparam)
importFrom(BiocSingular,runPCA)
importFrom(DelayedArray,DelayedArray)
importFrom(DelayedArray,colsum)
importFrom(DelayedArray,getAutoBPPARAM)
Expand All @@ -49,9 +46,7 @@ importFrom(DelayedArray,setAutoBPPARAM)
importFrom(DelayedArray,sweep)
importFrom(DelayedMatrixStats,rowAnyNAs)
importFrom(DelayedMatrixStats,rowMedians)
importFrom(DelayedMatrixStats,rowVars)
importFrom(Matrix,rowMeans)
importFrom(Matrix,t)
importFrom(Rcpp,sourceCpp)
importFrom(S4Vectors,"metadata<-")
importFrom(S4Vectors,DataFrame)
Expand All @@ -63,16 +58,12 @@ importFrom(S4Vectors,selfmatch)
importFrom(SummarizedExperiment,SummarizedExperiment)
importFrom(SummarizedExperiment,assay)
importFrom(beachmat,initializeCpp)
importFrom(beachmat,realizeFileBackedMatrix)
importFrom(methods,as)
importFrom(methods,is)
importFrom(parallel,nextRNGStream)
importFrom(stats,kmeans)
importFrom(stats,mad)
importFrom(stats,median)
importFrom(stats,rnorm)
importFrom(stats,rpois)
importFrom(stats,runif)
importFrom(utils,head)
importFrom(utils,relist)
useDynLib(SingleR)
173 changes: 52 additions & 121 deletions R/aggregateReference.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@
#' @param ntop Integer scalar specifying the number of highly variable genes to use for the PCA step.
#' @param subset.row Integer, character or logical vector indicating the rows of \code{ref} to use for k-means clustering.
#' @param check.missing Logical scalar indicating whether rows should be checked for missing values (and if found, removed).
#' @param BPPARAM A \linkS4class{BiocParallelParam} object indicating how parallelization should be performed.
#' @param BSPARAM A \linkS4class{BiocSingularParam} object indicating which SVD algorithm should be used in \code{\link{runPCA}}.
#' @param num.threads Integer scalar specifying the number to threads to use.
#' @param BPPARAM Deprecated, use \code{num.threads} instead.
#' @param BSPARAM Deprecated and ignored.
#'
#' @details
#' With single-cell reference datasets, it is often useful to aggregate individual cells into pseudo-bulk samples to serve as a reference.
Expand All @@ -35,13 +36,13 @@
#' If \code{ncenters} is greater than the number of samples for a label and/or \code{power=1}, no aggregation is performed.
#' Setting \code{power=0} will aggregate all cells of a label into a single pseudo-bulk profile.
#'
#' In practice, k-means clustering is actually performed on the first \code{rank} principal components as computed using \code{\link{runPCA}}.
#' The use of PCs compacts the data for more efficient operation of \code{\link{kmeans}};
#' In practice, k-means clustering is actually performed on the first \code{rank} principal components as computed using \code{\link[scrapper]{runPca}}.
#' The use of PCs compacts the data for more efficient operation of \code{\link[scrapper]{clusterKmeans}};
#' it also removes some of the high-dimensional noise to highlight major factors of within-label heterogenity.
#' Note that the PCs are only used for clustering and the full expression profiles are still used for the final averaging.
#' Users can disable the PCA step by setting \code{rank=Inf}.
#'
#' By default, we speed things up by only using the top \code{ntop} genes with the largest variances in the PCA.
#' By default, we speed things up by only using the top \code{ntop} genes with the largest variances in the PCA, as identified with \code{\link[scrapper]{modelGeneVariances}}.
#' More subsetting of the matrix prior to the PCA can be achieved by setting \code{subset.row} to an appropriate indexing vector.
#' This option may be useful for clustering based on known genes of interest but retaining all genes in the aggregated results.
#' (If both options are set, subsetting by \code{subset.row} is done first, and then the top \code{ntop} genes are selected.)
Expand Down Expand Up @@ -75,36 +76,61 @@
#' @export
#' @importFrom SummarizedExperiment SummarizedExperiment
#' @importFrom S4Vectors DataFrame
#' @importFrom BiocParallel SerialParam bpnworkers bpmapply
#' @importFrom BiocSingular bsparam
aggregateReference <- function(ref, labels, ncenters=NULL, power=0.5, ntop=1000, assay.type="logcounts",
rank=20, subset.row=NULL, check.missing=TRUE, BPPARAM=SerialParam(), BSPARAM=bsparam())
#' @importFrom Matrix rowMeans
#' @importFrom DelayedArray sweep colsum DelayedArray
#' @importFrom BiocParallel SerialParam bpnworkers
aggregateReference <- function(
ref,
labels,
ncenters=NULL,
power=0.5,
ntop=1000,
assay.type="logcounts",
rank=20,
subset.row=NULL,
check.missing=TRUE,
num.threads=bpnworkers(BPPARAM),
BPPARAM=SerialParam(),
BSPARAM=NULL)
{
by.label <- split(seq_along(labels), labels)
if (is(ref, "SummarizedExperiment")) {
ref <- assay(ref, i=assay.type)
}
ref <- DelayedArray(ref)

FRAGMENTER <- function(x, i, j) {
x <- x[,j,drop=FALSE] # this is usually smaller, so do this first.
if (!is.null(i)) {
x <- x[i,,drop=FALSE]
output.vals <- vector("list", length(by.label))
names(output.vals) <- names(by.label)
for (lab in names(by.label)) {
chosen <- by.label[[lab]]
current <- ref[,chosen,drop=FALSE]

cur.ncenters <- ncenters
if (is.null(cur.ncenters)) {
cur.ncenters <- floor(ncol(current)^power)
}
x
}

all.seeds <- .define_seeds(length(by.label))
if (cur.ncenters <= 1) {
output <- matrix(rowMeans(current), dimnames=list(rownames(current), NULL))
} else {
# Doing a mini-analysis here: PCA on HVGs followed by k-means.
stats <- scrapper::modelGeneVariances(current, num.threads=num.threads)
keep <- scrapper::chooseHighlyVariableGenes(stats$statistics$residuals, top=ntop)
sub <- current[keep,,drop=FALSE]

if (rank <= min(dim(sub))-1L) {
pcs <- scrapper::runPca(sub, number=rank, num.threads=num.threads)$components
} else {
pcs <- as.matrix(sub)
}

clustered <- scrapper::clusterKmeans(pcs, k=cur.ncenters, num.threads=num.threads)
agg <- colsum(current, clustered$cluster)
tab <- table(clustered$cluster)[colnames(agg)]
output <- sweep(agg, 2, tab, "/")
}

if (is.null(BPPARAM) || is(BPPARAM, "SerialParam") || is(BPPARAM, "MulticoreParam")) {
output.vals <- bpmapply(chosen=by.label, .seed=all.seeds,
FUN=function(chosen, x, ...) .aggregate_internal(FRAGMENTER(x, i=subset.row, j=chosen), ...),
MoreArgs=list(x=ref, ncenters=ncenters, power=power, rank=rank, check.missing=check.missing, ntop=ntop, BSPARAM=BSPARAM),
BPPARAM=BPPARAM, SIMPLIFY=FALSE, USE.NAMES=FALSE)
} else {
by.mat <- lapply(by.label, FRAGMENTER, x=ref, i=subset.row)
output.vals <- bpmapply(by.mat, .seed=all.seeds, FUN=.aggregate_internal,
MoreArgs=list(ncenters=ncenters, power=power, rank=rank, check.missing=check.missing, ntop=ntop, BSPARAM=BSPARAM),
BPPARAM=BPPARAM, SIMPLIFY=FALSE, USE.NAMES=FALSE)
output.vals[[lab]] <- output
}

if (length(output.vals)==0L) {
Expand All @@ -119,98 +145,3 @@ aggregateReference <- function(ref, labels, ncenters=NULL, power=0.5, ntop=1000,
colnames(output) <- sprintf("%s.%s", output.labels, sequence(num))
output
}

#' @importFrom stats kmeans
#' @importFrom utils head
#' @importFrom Matrix t rowMeans
#' @importFrom BiocSingular runPCA
#' @importFrom DelayedArray sweep colsum DelayedArray getAutoBPPARAM setAutoBPPARAM
#' @importFrom DelayedMatrixStats rowVars
#' @importFrom beachmat realizeFileBackedMatrix
.aggregate_internal <- function(current, ncenters, power, rank, ntop, check.missing, BSPARAM, .seed) {
oldseed <- .get_seed()
on.exit(.set_seed(oldseed))

old <- RNGkind("L'Ecuyer-CMRG")
on.exit(RNGkind(old[1]), add=TRUE, after=FALSE)
assign(".Random.seed", .seed, envir = .GlobalEnv)

old.bp <- getAutoBPPARAM()
setAutoBPPARAM(SerialParam())
on.exit(setAutoBPPARAM(old.bp), add=TRUE, after=FALSE)

cur.ncenters <- ncenters
if (is.null(cur.ncenters)) {
cur.ncenters <- floor(ncol(current)^power)
}

if (cur.ncenters <= 1) {
val <- matrix(rowMeans(current), dimnames=list(rownames(current), NULL))
} else if (cur.ncenters >= ncol(current)) {
val <- current
} else {
to.use <- current

if (ntop <= nrow(current)) {
o <- order(rowVars(to.use), decreasing=TRUE)
to.use <- to.use[head(o, ntop),,drop=FALSE]
}

to.use <- t(to.use)
to.use <- realizeFileBackedMatrix(to.use)

# Identifying the top PCs to avoid realizing the entire matrix.
if (rank <= min(dim(to.use))-1L) {
pcs <- runPCA(to.use, rank=rank, get.rotation=FALSE, BSPARAM=BSPARAM)$x
} else {
pcs <- as.matrix(to.use)
}

clustered <- kmeans(pcs, centers=cur.ncenters)
val <- colsum(DelayedArray(current), clustered$cluster)
tab <- table(clustered$cluster)[colnames(val)]
val <- sweep(val, 2, tab, "/")
}

val
}

#' @importFrom stats runif
#' @importFrom parallel nextRNGStream
.define_seeds <- function(n) {
if (!n) {
return(list())
}

runif(1) # bumping the RNG so that repeated calls to this function generate different results.

oldseed <- .get_seed()
on.exit(.set_seed(oldseed))

old <- RNGkind("L'Ecuyer-CMRG")
on.exit(RNGkind(old[1]), add=TRUE, after=FALSE)

seeds <- vector("list", n)
seeds[[1L]] <- .Random.seed
for (i in seq_len(n - 1L)) {
seeds[[i + 1L]] <- nextRNGStream(seeds[[i]])
}

seeds
}

.get_seed <- function( ) {
if (exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE)) {
get(".Random.seed", envir = .GlobalEnv, inherits = FALSE)
} else {
NULL
}
}

.set_seed <- function(oldseed) {
if (!is.null(oldseed)) {
assign(".Random.seed", oldseed, envir = .GlobalEnv)
} else {
rm(.Random.seed, envir = .GlobalEnv)
}
}
32 changes: 22 additions & 10 deletions R/plotMarkerHeatmap.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,14 @@
#' Defaults to all available labels.
#' @param assay.type Integer scalar or string specifying the matrix of expression values to use if \code{test} is a \linkS4class{SummarizedExperiment}.
#' @param use.pruned Logical scalar indicating whether the pruned labels should be used instead.
#' @param order.by String specifying the column of the output of \code{\link[scran]{scoreMarkers}} with which to sort for interesting markers.
#' @param order.by.effect String specifying the effect size from \code{\link[scrapper]{scoreMarkers}} with which to sort for interesting markers.
#' @param order.by.summary String specifying the summary statistic from \code{\link[scrapper]{scoreMarkers}} with which to sort for interesting markers.
#' @param display.row.names Character vector of length equal to the number of rows of \code{test},
#' containing the names of the features to show on the heatmap (e.g., to replace IDs with symbols).
#' If \code{NULL}, the existing row names of \code{test} are used.
#' @param top Integer scalar indicating the top most interesting markers to show in the heatmap.
#' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying the parallelization scheme to use for marker scoring.
#' @param num.threads Integer scalar specifying the number to threads to use.
#' @param BPPARAM Deprecated, use \code{num.threads} instead.
#'
#' @return
#' The output of \code{\link[pheatmap]{pheatmap}} is returned showing the heatmap on the current graphics device.
Expand All @@ -27,8 +29,8 @@
#' This function creates a heatmap where each row is a marker gene for \code{label} and each column is a cell in the test dataset.
#' The aim is to check the effectiveness of the reference-derived markers for distinguishing between labels in the test dataset.
#' Useful markers from the reference should show strong upregulation in \code{label} compared to all \code{other.labels}.
#' We identify such markers by scoring all reference-derived markers with \code{\link[scran]{scoreMarkers}} on the \code{test} expression.
#' The \code{top} markers based on the specified \code{order.by} field are shown in the heatmap.
#' We identify such markers by scoring all reference-derived markers with \code{\link[scrapper]{scoreMarkers}} on the \code{test} expression.
#' The \code{top} markers based on the specified \code{order.by.*} fields are shown in the heatmap.
#' If only one label is present, markers are ranked by average abundance intead.
#'
#' @author Aaron Lun
Expand All @@ -38,11 +40,11 @@
#'
#' plotMarkerHeatmap(pred, test, pred$labels[1])
#' plotMarkerHeatmap(pred, test, pred$labels[1], use.pruned=TRUE)
#' plotMarkerHeatmap(pred, test, pred$labels[1], order.by="rank.AUC")
#' plotMarkerHeatmap(pred, test, pred$labels[1], order.by.effect="auc")
#'
#' @export
#' @importFrom S4Vectors metadata
#' @importFrom BiocParallel SerialParam
#' @importFrom BiocParallel SerialParam bpnworkers
#' @importFrom Matrix rowMeans
#' @importFrom utils head
plotMarkerHeatmap <- function(
Expand All @@ -53,8 +55,10 @@ plotMarkerHeatmap <- function(
assay.type="logcounts",
display.row.names=NULL,
use.pruned=FALSE,
order.by="rank.logFC.cohen",
order.by.effect="cohens.d",
order.by.summary="min.rank",
top=20,
num.threads=bpnworkers(BPPARAM),
BPPARAM = SerialParam())
{
test <- .to_clean_matrix(test, assay.type, check.missing=FALSE)
Expand Down Expand Up @@ -87,9 +91,17 @@ plotMarkerHeatmap <- function(
# Prioritize the markers with interesting variation in the test data for
# visualization. If we only have one label, we use the most abundant markers.
if (length(unique(predictions)) > 1L) {
interesting <- scran::scoreMarkers(test, predictions, BPPARAM=BPPARAM)
stats <- interesting[[label]]
o <- order(stats[[order.by]], decreasing=!startsWith(order.by, "rank."))
interesting <- scrapper::scoreMarkers(
test,
predictions,
num.threads=num.threads,
compute.auc=(order.by.effect=="auc"),
compute.cohens.d=(order.by.effect=="cohens.d"),
compute.delta.mean=(order.by.effect=="delta.mean"),
compute.delta.detected=(order.by.effect=="delta.detected")
)
stats <- interesting[[order.by.effect]][[label]][[order.by.summary]]
o <- order(stats, decreasing=(order.by.summary!="min.rank"))
to.show <- rownames(test)[o]
} else {
abundance <- rowMeans(test)
Expand Down
Loading

0 comments on commit 8ee3250

Please sign in to comment.