Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch to scrapper for marker detection and clustering. #283

Merged
merged 7 commits into from
Nov 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 4 additions & 7 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: SingleR
Title: Reference-Based Single-Cell RNA-Seq Annotation
Version: 2.9.0
Date: 2024-10-28
Version: 2.9.1
Date: 2024-11-17
Authors@R: c(person("Dvir", "Aran", email="[email protected]", role=c("aut", "cph")),
person("Aaron", "Lun", email="[email protected]", role=c("ctb", "cre")),
person("Daniel", "Bunis", role="ctb"),
Expand All @@ -19,13 +19,11 @@ Imports:
DelayedArray,
DelayedMatrixStats,
BiocParallel,
BiocSingular,
BiocNeighbors,
stats,
utils,
Rcpp,
beachmat (>= 2.21.1),
parallel
beachmat (>= 2.21.1)
LinkingTo:
Rcpp,
beachmat,
Expand All @@ -39,8 +37,7 @@ Suggests:
BiocGenerics,
SingleCellExperiment,
scuttle,
scater,
scran,
scrapper,
scRNAseq,
ggplot2,
pheatmap,
Expand Down
9 changes: 0 additions & 9 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,9 @@ importFrom(BiocNeighbors,KmknnParam)
importFrom(BiocNeighbors,defineBuilder)
importFrom(BiocParallel,SerialParam)
importFrom(BiocParallel,bpisup)
importFrom(BiocParallel,bpmapply)
importFrom(BiocParallel,bpnworkers)
importFrom(BiocParallel,bpstart)
importFrom(BiocParallel,bpstop)
importFrom(BiocSingular,bsparam)
importFrom(BiocSingular,runPCA)
importFrom(DelayedArray,DelayedArray)
importFrom(DelayedArray,colsum)
importFrom(DelayedArray,getAutoBPPARAM)
Expand All @@ -49,9 +46,7 @@ importFrom(DelayedArray,setAutoBPPARAM)
importFrom(DelayedArray,sweep)
importFrom(DelayedMatrixStats,rowAnyNAs)
importFrom(DelayedMatrixStats,rowMedians)
importFrom(DelayedMatrixStats,rowVars)
importFrom(Matrix,rowMeans)
importFrom(Matrix,t)
importFrom(Rcpp,sourceCpp)
importFrom(S4Vectors,"metadata<-")
importFrom(S4Vectors,DataFrame)
Expand All @@ -63,16 +58,12 @@ importFrom(S4Vectors,selfmatch)
importFrom(SummarizedExperiment,SummarizedExperiment)
importFrom(SummarizedExperiment,assay)
importFrom(beachmat,initializeCpp)
importFrom(beachmat,realizeFileBackedMatrix)
importFrom(methods,as)
importFrom(methods,is)
importFrom(parallel,nextRNGStream)
importFrom(stats,kmeans)
importFrom(stats,mad)
importFrom(stats,median)
importFrom(stats,rnorm)
importFrom(stats,rpois)
importFrom(stats,runif)
importFrom(utils,head)
importFrom(utils,relist)
useDynLib(SingleR)
173 changes: 52 additions & 121 deletions R/aggregateReference.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@
#' @param ntop Integer scalar specifying the number of highly variable genes to use for the PCA step.
#' @param subset.row Integer, character or logical vector indicating the rows of \code{ref} to use for k-means clustering.
#' @param check.missing Logical scalar indicating whether rows should be checked for missing values (and if found, removed).
#' @param BPPARAM A \linkS4class{BiocParallelParam} object indicating how parallelization should be performed.
#' @param BSPARAM A \linkS4class{BiocSingularParam} object indicating which SVD algorithm should be used in \code{\link{runPCA}}.
#' @param num.threads Integer scalar specifying the number to threads to use.
#' @param BPPARAM Deprecated, use \code{num.threads} instead.
#' @param BSPARAM Deprecated and ignored.
#'
#' @details
#' With single-cell reference datasets, it is often useful to aggregate individual cells into pseudo-bulk samples to serve as a reference.
Expand All @@ -35,13 +36,13 @@
#' If \code{ncenters} is greater than the number of samples for a label and/or \code{power=1}, no aggregation is performed.
#' Setting \code{power=0} will aggregate all cells of a label into a single pseudo-bulk profile.
#'
#' In practice, k-means clustering is actually performed on the first \code{rank} principal components as computed using \code{\link{runPCA}}.
#' The use of PCs compacts the data for more efficient operation of \code{\link{kmeans}};
#' In practice, k-means clustering is actually performed on the first \code{rank} principal components as computed using \code{\link[scrapper]{runPca}}.
#' The use of PCs compacts the data for more efficient operation of \code{\link[scrapper]{clusterKmeans}};
#' it also removes some of the high-dimensional noise to highlight major factors of within-label heterogenity.
#' Note that the PCs are only used for clustering and the full expression profiles are still used for the final averaging.
#' Users can disable the PCA step by setting \code{rank=Inf}.
#'
#' By default, we speed things up by only using the top \code{ntop} genes with the largest variances in the PCA.
#' By default, we speed things up by only using the top \code{ntop} genes with the largest variances in the PCA, as identified with \code{\link[scrapper]{modelGeneVariances}}.
#' More subsetting of the matrix prior to the PCA can be achieved by setting \code{subset.row} to an appropriate indexing vector.
#' This option may be useful for clustering based on known genes of interest but retaining all genes in the aggregated results.
#' (If both options are set, subsetting by \code{subset.row} is done first, and then the top \code{ntop} genes are selected.)
Expand Down Expand Up @@ -75,36 +76,61 @@
#' @export
#' @importFrom SummarizedExperiment SummarizedExperiment
#' @importFrom S4Vectors DataFrame
#' @importFrom BiocParallel SerialParam bpnworkers bpmapply
#' @importFrom BiocSingular bsparam
aggregateReference <- function(ref, labels, ncenters=NULL, power=0.5, ntop=1000, assay.type="logcounts",
rank=20, subset.row=NULL, check.missing=TRUE, BPPARAM=SerialParam(), BSPARAM=bsparam())
#' @importFrom Matrix rowMeans
#' @importFrom DelayedArray sweep colsum DelayedArray
#' @importFrom BiocParallel SerialParam bpnworkers
aggregateReference <- function(
ref,
labels,
ncenters=NULL,
power=0.5,
ntop=1000,
assay.type="logcounts",
rank=20,
subset.row=NULL,
check.missing=TRUE,
num.threads=bpnworkers(BPPARAM),
BPPARAM=SerialParam(),
BSPARAM=NULL)
{
by.label <- split(seq_along(labels), labels)
if (is(ref, "SummarizedExperiment")) {
ref <- assay(ref, i=assay.type)
}
ref <- DelayedArray(ref)

FRAGMENTER <- function(x, i, j) {
x <- x[,j,drop=FALSE] # this is usually smaller, so do this first.
if (!is.null(i)) {
x <- x[i,,drop=FALSE]
output.vals <- vector("list", length(by.label))
names(output.vals) <- names(by.label)
for (lab in names(by.label)) {
chosen <- by.label[[lab]]
current <- ref[,chosen,drop=FALSE]

cur.ncenters <- ncenters
if (is.null(cur.ncenters)) {
cur.ncenters <- floor(ncol(current)^power)
}
x
}

all.seeds <- .define_seeds(length(by.label))
if (cur.ncenters <= 1) {
output <- matrix(rowMeans(current), dimnames=list(rownames(current), NULL))
} else {
# Doing a mini-analysis here: PCA on HVGs followed by k-means.
stats <- scrapper::modelGeneVariances(current, num.threads=num.threads)
keep <- scrapper::chooseHighlyVariableGenes(stats$statistics$residuals, top=ntop)
sub <- current[keep,,drop=FALSE]

if (rank <= min(dim(sub))-1L) {
pcs <- scrapper::runPca(sub, number=rank, num.threads=num.threads)$components
} else {
pcs <- as.matrix(sub)
}

clustered <- scrapper::clusterKmeans(pcs, k=cur.ncenters, num.threads=num.threads)
agg <- colsum(current, clustered$cluster)
tab <- table(clustered$cluster)[colnames(agg)]
output <- sweep(agg, 2, tab, "/")
}

if (is.null(BPPARAM) || is(BPPARAM, "SerialParam") || is(BPPARAM, "MulticoreParam")) {
output.vals <- bpmapply(chosen=by.label, .seed=all.seeds,
FUN=function(chosen, x, ...) .aggregate_internal(FRAGMENTER(x, i=subset.row, j=chosen), ...),
MoreArgs=list(x=ref, ncenters=ncenters, power=power, rank=rank, check.missing=check.missing, ntop=ntop, BSPARAM=BSPARAM),
BPPARAM=BPPARAM, SIMPLIFY=FALSE, USE.NAMES=FALSE)
} else {
by.mat <- lapply(by.label, FRAGMENTER, x=ref, i=subset.row)
output.vals <- bpmapply(by.mat, .seed=all.seeds, FUN=.aggregate_internal,
MoreArgs=list(ncenters=ncenters, power=power, rank=rank, check.missing=check.missing, ntop=ntop, BSPARAM=BSPARAM),
BPPARAM=BPPARAM, SIMPLIFY=FALSE, USE.NAMES=FALSE)
output.vals[[lab]] <- output
}

if (length(output.vals)==0L) {
Expand All @@ -119,98 +145,3 @@ aggregateReference <- function(ref, labels, ncenters=NULL, power=0.5, ntop=1000,
colnames(output) <- sprintf("%s.%s", output.labels, sequence(num))
output
}

#' @importFrom stats kmeans
#' @importFrom utils head
#' @importFrom Matrix t rowMeans
#' @importFrom BiocSingular runPCA
#' @importFrom DelayedArray sweep colsum DelayedArray getAutoBPPARAM setAutoBPPARAM
#' @importFrom DelayedMatrixStats rowVars
#' @importFrom beachmat realizeFileBackedMatrix
.aggregate_internal <- function(current, ncenters, power, rank, ntop, check.missing, BSPARAM, .seed) {
oldseed <- .get_seed()
on.exit(.set_seed(oldseed))

old <- RNGkind("L'Ecuyer-CMRG")
on.exit(RNGkind(old[1]), add=TRUE, after=FALSE)
assign(".Random.seed", .seed, envir = .GlobalEnv)

old.bp <- getAutoBPPARAM()
setAutoBPPARAM(SerialParam())
on.exit(setAutoBPPARAM(old.bp), add=TRUE, after=FALSE)

cur.ncenters <- ncenters
if (is.null(cur.ncenters)) {
cur.ncenters <- floor(ncol(current)^power)
}

if (cur.ncenters <= 1) {
val <- matrix(rowMeans(current), dimnames=list(rownames(current), NULL))
} else if (cur.ncenters >= ncol(current)) {
val <- current
} else {
to.use <- current

if (ntop <= nrow(current)) {
o <- order(rowVars(to.use), decreasing=TRUE)
to.use <- to.use[head(o, ntop),,drop=FALSE]
}

to.use <- t(to.use)
to.use <- realizeFileBackedMatrix(to.use)

# Identifying the top PCs to avoid realizing the entire matrix.
if (rank <= min(dim(to.use))-1L) {
pcs <- runPCA(to.use, rank=rank, get.rotation=FALSE, BSPARAM=BSPARAM)$x
} else {
pcs <- as.matrix(to.use)
}

clustered <- kmeans(pcs, centers=cur.ncenters)
val <- colsum(DelayedArray(current), clustered$cluster)
tab <- table(clustered$cluster)[colnames(val)]
val <- sweep(val, 2, tab, "/")
}

val
}

#' @importFrom stats runif
#' @importFrom parallel nextRNGStream
.define_seeds <- function(n) {
if (!n) {
return(list())
}

runif(1) # bumping the RNG so that repeated calls to this function generate different results.

oldseed <- .get_seed()
on.exit(.set_seed(oldseed))

old <- RNGkind("L'Ecuyer-CMRG")
on.exit(RNGkind(old[1]), add=TRUE, after=FALSE)

seeds <- vector("list", n)
seeds[[1L]] <- .Random.seed
for (i in seq_len(n - 1L)) {
seeds[[i + 1L]] <- nextRNGStream(seeds[[i]])
}

seeds
}

.get_seed <- function( ) {
if (exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE)) {
get(".Random.seed", envir = .GlobalEnv, inherits = FALSE)
} else {
NULL
}
}

.set_seed <- function(oldseed) {
if (!is.null(oldseed)) {
assign(".Random.seed", oldseed, envir = .GlobalEnv)
} else {
rm(.Random.seed, envir = .GlobalEnv)
}
}
32 changes: 22 additions & 10 deletions R/plotMarkerHeatmap.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,14 @@
#' Defaults to all available labels.
#' @param assay.type Integer scalar or string specifying the matrix of expression values to use if \code{test} is a \linkS4class{SummarizedExperiment}.
#' @param use.pruned Logical scalar indicating whether the pruned labels should be used instead.
#' @param order.by String specifying the column of the output of \code{\link[scran]{scoreMarkers}} with which to sort for interesting markers.
#' @param order.by.effect String specifying the effect size from \code{\link[scrapper]{scoreMarkers}} with which to sort for interesting markers.
#' @param order.by.summary String specifying the summary statistic from \code{\link[scrapper]{scoreMarkers}} with which to sort for interesting markers.
#' @param display.row.names Character vector of length equal to the number of rows of \code{test},
#' containing the names of the features to show on the heatmap (e.g., to replace IDs with symbols).
#' If \code{NULL}, the existing row names of \code{test} are used.
#' @param top Integer scalar indicating the top most interesting markers to show in the heatmap.
#' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying the parallelization scheme to use for marker scoring.
#' @param num.threads Integer scalar specifying the number to threads to use.
#' @param BPPARAM Deprecated, use \code{num.threads} instead.
#'
#' @return
#' The output of \code{\link[pheatmap]{pheatmap}} is returned showing the heatmap on the current graphics device.
Expand All @@ -27,8 +29,8 @@
#' This function creates a heatmap where each row is a marker gene for \code{label} and each column is a cell in the test dataset.
#' The aim is to check the effectiveness of the reference-derived markers for distinguishing between labels in the test dataset.
#' Useful markers from the reference should show strong upregulation in \code{label} compared to all \code{other.labels}.
#' We identify such markers by scoring all reference-derived markers with \code{\link[scran]{scoreMarkers}} on the \code{test} expression.
#' The \code{top} markers based on the specified \code{order.by} field are shown in the heatmap.
#' We identify such markers by scoring all reference-derived markers with \code{\link[scrapper]{scoreMarkers}} on the \code{test} expression.
#' The \code{top} markers based on the specified \code{order.by.*} fields are shown in the heatmap.
#' If only one label is present, markers are ranked by average abundance intead.
#'
#' @author Aaron Lun
Expand All @@ -38,11 +40,11 @@
#'
#' plotMarkerHeatmap(pred, test, pred$labels[1])
#' plotMarkerHeatmap(pred, test, pred$labels[1], use.pruned=TRUE)
#' plotMarkerHeatmap(pred, test, pred$labels[1], order.by="rank.AUC")
#' plotMarkerHeatmap(pred, test, pred$labels[1], order.by.effect="auc")
#'
#' @export
#' @importFrom S4Vectors metadata
#' @importFrom BiocParallel SerialParam
#' @importFrom BiocParallel SerialParam bpnworkers
#' @importFrom Matrix rowMeans
#' @importFrom utils head
plotMarkerHeatmap <- function(
Expand All @@ -53,8 +55,10 @@ plotMarkerHeatmap <- function(
assay.type="logcounts",
display.row.names=NULL,
use.pruned=FALSE,
order.by="rank.logFC.cohen",
order.by.effect="cohens.d",
order.by.summary="min.rank",
top=20,
num.threads=bpnworkers(BPPARAM),
BPPARAM = SerialParam())
{
test <- .to_clean_matrix(test, assay.type, check.missing=FALSE)
Expand Down Expand Up @@ -87,9 +91,17 @@ plotMarkerHeatmap <- function(
# Prioritize the markers with interesting variation in the test data for
# visualization. If we only have one label, we use the most abundant markers.
if (length(unique(predictions)) > 1L) {
interesting <- scran::scoreMarkers(test, predictions, BPPARAM=BPPARAM)
stats <- interesting[[label]]
o <- order(stats[[order.by]], decreasing=!startsWith(order.by, "rank."))
interesting <- scrapper::scoreMarkers(
test,
predictions,
num.threads=num.threads,
compute.auc=(order.by.effect=="auc"),
compute.cohens.d=(order.by.effect=="cohens.d"),
compute.delta.mean=(order.by.effect=="delta.mean"),
compute.delta.detected=(order.by.effect=="delta.detected")
)
stats <- interesting[[order.by.effect]][[label]][[order.by.summary]]
o <- order(stats, decreasing=(order.by.summary!="min.rank"))
to.show <- rownames(test)[o]
} else {
abundance <- rowMeans(test)
Expand Down
Loading
Loading