SingleR-inc · LTLA · Nov 18, 2024 · Nov 17, 2024 · Nov 17, 2024 · Nov 17, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: SingleR
 Title: Reference-Based Single-Cell RNA-Seq Annotation
-Version: 2.9.0
-Date: 2024-10-28
+Version: 2.9.1
+Date: 2024-11-17
 Authors@R: c(person("Dvir", "Aran", email="[email protected]", role=c("aut", "cph")),
     person("Aaron", "Lun", email="[email protected]", role=c("ctb", "cre")),
     person("Daniel", "Bunis", role="ctb"),
@@ -19,13 +19,11 @@ Imports:
     DelayedArray,
     DelayedMatrixStats,
     BiocParallel,
-    BiocSingular,
     BiocNeighbors,
     stats,
     utils,
     Rcpp,
-    beachmat (>= 2.21.1),
-    parallel
+    beachmat (>= 2.21.1)
 LinkingTo:
     Rcpp,
     beachmat,
@@ -39,8 +37,7 @@ Suggests:
     BiocGenerics,
     SingleCellExperiment,
     scuttle,
-    scater,
-    scran,
+    scrapper,
     scRNAseq,
     ggplot2,
     pheatmap,

diff --git a/NAMESPACE b/NAMESPACE
@@ -33,12 +33,9 @@ importFrom(BiocNeighbors,KmknnParam)
 importFrom(BiocNeighbors,defineBuilder)
 importFrom(BiocParallel,SerialParam)
 importFrom(BiocParallel,bpisup)
-importFrom(BiocParallel,bpmapply)
 importFrom(BiocParallel,bpnworkers)
 importFrom(BiocParallel,bpstart)
 importFrom(BiocParallel,bpstop)
-importFrom(BiocSingular,bsparam)
-importFrom(BiocSingular,runPCA)
 importFrom(DelayedArray,DelayedArray)
 importFrom(DelayedArray,colsum)
 importFrom(DelayedArray,getAutoBPPARAM)
@@ -49,9 +46,7 @@ importFrom(DelayedArray,setAutoBPPARAM)
 importFrom(DelayedArray,sweep)
 importFrom(DelayedMatrixStats,rowAnyNAs)
 importFrom(DelayedMatrixStats,rowMedians)
-importFrom(DelayedMatrixStats,rowVars)
 importFrom(Matrix,rowMeans)
-importFrom(Matrix,t)
 importFrom(Rcpp,sourceCpp)
 importFrom(S4Vectors,"metadata<-")
 importFrom(S4Vectors,DataFrame)
@@ -63,16 +58,12 @@ importFrom(S4Vectors,selfmatch)
 importFrom(SummarizedExperiment,SummarizedExperiment)
 importFrom(SummarizedExperiment,assay)
 importFrom(beachmat,initializeCpp)
-importFrom(beachmat,realizeFileBackedMatrix)
 importFrom(methods,as)
 importFrom(methods,is)
-importFrom(parallel,nextRNGStream)
-importFrom(stats,kmeans)
 importFrom(stats,mad)
 importFrom(stats,median)
 importFrom(stats,rnorm)
 importFrom(stats,rpois)
-importFrom(stats,runif)
 importFrom(utils,head)
 importFrom(utils,relist)
 useDynLib(SingleR)
diff --git a/R/aggregateReference.R b/R/aggregateReference.R
@@ -14,8 +14,9 @@
 #' @param ntop Integer scalar specifying the number of highly variable genes to use for the PCA step.
 #' @param subset.row Integer, character or logical vector indicating the rows of \code{ref} to use for k-means clustering. 
 #' @param check.missing Logical scalar indicating whether rows should be checked for missing values (and if found, removed).
-#' @param BPPARAM A \linkS4class{BiocParallelParam} object indicating how parallelization should be performed.
-#' @param BSPARAM A \linkS4class{BiocSingularParam} object indicating which SVD algorithm should be used in \code{\link{runPCA}}.
+#' @param num.threads Integer scalar specifying the number to threads to use.
+#' @param BPPARAM Deprecated, use \code{num.threads} instead.
+#' @param BSPARAM Deprecated and ignored.
 #' 
 #' @details
 #' With single-cell reference datasets, it is often useful to aggregate individual cells into pseudo-bulk samples to serve as a reference.
@@ -35,13 +36,13 @@
 #' If \code{ncenters} is greater than the number of samples for a label and/or \code{power=1}, no aggregation is performed.
 #' Setting \code{power=0} will aggregate all cells of a label into a single pseudo-bulk profile.
 #'
-#' In practice, k-means clustering is actually performed on the first \code{rank} principal components as computed using \code{\link{runPCA}}.
-#' The use of PCs compacts the data for more efficient operation of \code{\link{kmeans}};
+#' In practice, k-means clustering is actually performed on the first \code{rank} principal components as computed using \code{\link[scrapper]{runPca}}.
+#' The use of PCs compacts the data for more efficient operation of \code{\link[scrapper]{clusterKmeans}};
 #' it also removes some of the high-dimensional noise to highlight major factors of within-label heterogenity.
 #' Note that the PCs are only used for clustering and the full expression profiles are still used for the final averaging.
 #' Users can disable the PCA step by setting \code{rank=Inf}.
 #'
-#' By default, we speed things up by only using the top \code{ntop} genes with the largest variances in the PCA.
+#' By default, we speed things up by only using the top \code{ntop} genes with the largest variances in the PCA, as identified with \code{\link[scrapper]{modelGeneVariances}}.
 #' More subsetting of the matrix prior to the PCA can be achieved by setting \code{subset.row} to an appropriate indexing vector.
 #' This option may be useful for clustering based on known genes of interest but retaining all genes in the aggregated results.
 #' (If both options are set, subsetting by \code{subset.row} is done first, and then the top \code{ntop} genes are selected.)
@@ -75,36 +76,61 @@
 #' @export
 #' @importFrom SummarizedExperiment SummarizedExperiment
 #' @importFrom S4Vectors DataFrame
-#' @importFrom BiocParallel SerialParam bpnworkers bpmapply
-#' @importFrom BiocSingular bsparam
-aggregateReference <- function(ref, labels, ncenters=NULL, power=0.5, ntop=1000, assay.type="logcounts", 
-    rank=20, subset.row=NULL, check.missing=TRUE, BPPARAM=SerialParam(), BSPARAM=bsparam())
+#' @importFrom Matrix rowMeans
+#' @importFrom DelayedArray sweep colsum DelayedArray
+#' @importFrom BiocParallel SerialParam bpnworkers
+aggregateReference <- function(
+    ref,
+    labels,
+    ncenters=NULL,
+    power=0.5,
+    ntop=1000,
+    assay.type="logcounts",
+    rank=20,
+    subset.row=NULL,
+    check.missing=TRUE,
+    num.threads=bpnworkers(BPPARAM),
+    BPPARAM=SerialParam(),
+    BSPARAM=NULL)
 {
     by.label <- split(seq_along(labels), labels)
     if (is(ref, "SummarizedExperiment")) {
         ref <- assay(ref, i=assay.type)
     }
+    ref <- DelayedArray(ref)
 
-    FRAGMENTER <- function(x, i, j) {
-        x <- x[,j,drop=FALSE] # this is usually smaller, so do this first.
-        if (!is.null(i)) {
-            x <- x[i,,drop=FALSE]
+    output.vals <- vector("list", length(by.label))
+    names(output.vals) <- names(by.label)
+    for (lab in names(by.label)) {
+        chosen <- by.label[[lab]]
+        current <- ref[,chosen,drop=FALSE]
+
+        cur.ncenters <- ncenters
+        if (is.null(cur.ncenters)) {
+            cur.ncenters <- floor(ncol(current)^power)
         }
-        x
-    }
 
-    all.seeds <- .define_seeds(length(by.label))
+        if (cur.ncenters <= 1) {
+            output <- matrix(rowMeans(current), dimnames=list(rownames(current), NULL))
+        } else {
+            # Doing a mini-analysis here: PCA on HVGs followed by k-means.
+            stats <- scrapper::modelGeneVariances(current, num.threads=num.threads)
+            keep <- scrapper::chooseHighlyVariableGenes(stats$statistics$residuals, top=ntop)
+            sub <- current[keep,,drop=FALSE]
+
+            if (rank <= min(dim(sub))-1L) {
+                pcs <- scrapper::runPca(sub, number=rank, num.threads=num.threads)$components
+            } else {
+                pcs <- as.matrix(sub)
+            }
+
+            clustered <- scrapper::clusterKmeans(pcs, k=cur.ncenters, num.threads=num.threads)
+            agg <- colsum(current, clustered$cluster)
+            tab <- table(clustered$cluster)[colnames(agg)]
+            output <- sweep(agg, 2, tab, "/")
+        }
 
-    if (is.null(BPPARAM) || is(BPPARAM, "SerialParam") || is(BPPARAM, "MulticoreParam")) {
-        output.vals <- bpmapply(chosen=by.label, .seed=all.seeds,
-            FUN=function(chosen, x, ...) .aggregate_internal(FRAGMENTER(x, i=subset.row, j=chosen), ...), 
-            MoreArgs=list(x=ref, ncenters=ncenters, power=power, rank=rank, check.missing=check.missing, ntop=ntop, BSPARAM=BSPARAM),
-            BPPARAM=BPPARAM, SIMPLIFY=FALSE, USE.NAMES=FALSE)
-    } else {
-        by.mat <- lapply(by.label, FRAGMENTER, x=ref, i=subset.row)
-        output.vals <- bpmapply(by.mat, .seed=all.seeds, FUN=.aggregate_internal,
-            MoreArgs=list(ncenters=ncenters, power=power, rank=rank, check.missing=check.missing, ntop=ntop, BSPARAM=BSPARAM),
-            BPPARAM=BPPARAM, SIMPLIFY=FALSE, USE.NAMES=FALSE)
+        output.vals[[lab]] <- output
     }
 
     if (length(output.vals)==0L) {
@@ -119,98 +145,3 @@ aggregateReference <- function(ref, labels, ncenters=NULL, power=0.5, ntop=1000,
     colnames(output) <- sprintf("%s.%s", output.labels, sequence(num))
     output
 }
-
-#' @importFrom stats kmeans
-#' @importFrom utils head
-#' @importFrom Matrix t rowMeans
-#' @importFrom BiocSingular runPCA
-#' @importFrom DelayedArray sweep colsum DelayedArray getAutoBPPARAM setAutoBPPARAM
-#' @importFrom DelayedMatrixStats rowVars
-#' @importFrom beachmat realizeFileBackedMatrix
-.aggregate_internal <- function(current, ncenters, power, rank, ntop, check.missing, BSPARAM, .seed) {
-    oldseed <- .get_seed()
-    on.exit(.set_seed(oldseed))
-
-    old <- RNGkind("L'Ecuyer-CMRG")
-    on.exit(RNGkind(old[1]), add=TRUE, after=FALSE)
-    assign(".Random.seed", .seed, envir = .GlobalEnv)
-
-    old.bp <- getAutoBPPARAM()
-    setAutoBPPARAM(SerialParam())
-    on.exit(setAutoBPPARAM(old.bp), add=TRUE, after=FALSE)
-
-    cur.ncenters <- ncenters
-    if (is.null(cur.ncenters)) {
-        cur.ncenters <- floor(ncol(current)^power)
-    }
-
-    if (cur.ncenters <= 1) {
-        val <- matrix(rowMeans(current), dimnames=list(rownames(current), NULL))
-    } else if (cur.ncenters >= ncol(current)) {
-        val <- current
-    } else {
-        to.use <- current 
-
-        if (ntop <= nrow(current)) {
-            o <- order(rowVars(to.use), decreasing=TRUE)
-            to.use <- to.use[head(o, ntop),,drop=FALSE]
-        }
-
-        to.use <- t(to.use)
-        to.use <- realizeFileBackedMatrix(to.use)
-
-        # Identifying the top PCs to avoid realizing the entire matrix.
-        if (rank <= min(dim(to.use))-1L) {
-            pcs <- runPCA(to.use, rank=rank, get.rotation=FALSE, BSPARAM=BSPARAM)$x
-        } else {
-            pcs <- as.matrix(to.use)
-        }
-
-        clustered <- kmeans(pcs, centers=cur.ncenters)
-        val <- colsum(DelayedArray(current), clustered$cluster)
-        tab <- table(clustered$cluster)[colnames(val)]
-        val <- sweep(val, 2, tab, "/")
-    }
-
-    val
-}
-
-#' @importFrom stats runif
-#' @importFrom parallel nextRNGStream
-.define_seeds <- function(n) {
-    if (!n) {
-        return(list())
-    }
-
-    runif(1) # bumping the RNG so that repeated calls to this function generate different results.
-
-    oldseed <- .get_seed()
-    on.exit(.set_seed(oldseed))
-
-    old <- RNGkind("L'Ecuyer-CMRG")
-    on.exit(RNGkind(old[1]), add=TRUE, after=FALSE)
-
-    seeds <- vector("list", n)
-    seeds[[1L]] <- .Random.seed
-    for (i in seq_len(n - 1L)) {
-        seeds[[i + 1L]] <- nextRNGStream(seeds[[i]])
-    }
-
-    seeds
-}
-
-.get_seed <- function( ) {
-    if (exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE)) {
-        get(".Random.seed", envir = .GlobalEnv, inherits = FALSE)
-    } else {
-        NULL
-    }
-}
-
-.set_seed <- function(oldseed) {
-    if (!is.null(oldseed)) {
-        assign(".Random.seed", oldseed, envir = .GlobalEnv)
-    } else {
-        rm(.Random.seed, envir = .GlobalEnv)
-    }
-}
diff --git a/R/plotMarkerHeatmap.R b/R/plotMarkerHeatmap.R
@@ -13,12 +13,14 @@
 #' Defaults to all available labels.
 #' @param assay.type Integer scalar or string specifying the matrix of expression values to use if \code{test} is a \linkS4class{SummarizedExperiment}.
 #' @param use.pruned Logical scalar indicating whether the pruned labels should be used instead.
-#' @param order.by String specifying the column of the output of \code{\link[scran]{scoreMarkers}} with which to sort for interesting markers.
+#' @param order.by.effect String specifying the effect size from \code{\link[scrapper]{scoreMarkers}} with which to sort for interesting markers.
+#' @param order.by.summary String specifying the summary statistic from \code{\link[scrapper]{scoreMarkers}} with which to sort for interesting markers.
 #' @param display.row.names Character vector of length equal to the number of rows of \code{test},
 #' containing the names of the features to show on the heatmap (e.g., to replace IDs with symbols).
 #' If \code{NULL}, the existing row names of \code{test} are used.
 #' @param top Integer scalar indicating the top most interesting markers to show in the heatmap.
-#' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying the parallelization scheme to use for marker scoring.
+#' @param num.threads Integer scalar specifying the number to threads to use.
+#' @param BPPARAM Deprecated, use \code{num.threads} instead.
 #'
 #' @return 
 #' The output of \code{\link[pheatmap]{pheatmap}} is returned showing the heatmap on the current graphics device.
@@ -27,8 +29,8 @@
 #' This function creates a heatmap where each row is a marker gene for \code{label} and each column is a cell in the test dataset.
 #' The aim is to check the effectiveness of the reference-derived markers for distinguishing between labels in the test dataset.
 #' Useful markers from the reference should show strong upregulation in \code{label} compared to all \code{other.labels}. 
-#' We identify such markers by scoring all reference-derived markers with \code{\link[scran]{scoreMarkers}} on the \code{test} expression.
-#' The \code{top} markers based on the specified \code{order.by} field are shown in the heatmap.
+#' We identify such markers by scoring all reference-derived markers with \code{\link[scrapper]{scoreMarkers}} on the \code{test} expression.
+#' The \code{top} markers based on the specified \code{order.by.*} fields are shown in the heatmap.
 #' If only one label is present, markers are ranked by average abundance intead. 
 #'
 #' @author Aaron Lun
@@ -38,11 +40,11 @@
 #'
 #' plotMarkerHeatmap(pred, test, pred$labels[1])
 #' plotMarkerHeatmap(pred, test, pred$labels[1], use.pruned=TRUE)
-#' plotMarkerHeatmap(pred, test, pred$labels[1], order.by="rank.AUC")
+#' plotMarkerHeatmap(pred, test, pred$labels[1], order.by.effect="auc")
 #' 
 #' @export
 #' @importFrom S4Vectors metadata
-#' @importFrom BiocParallel SerialParam
+#' @importFrom BiocParallel SerialParam bpnworkers
 #' @importFrom Matrix rowMeans
 #' @importFrom utils head
 plotMarkerHeatmap <- function(
@@ -53,8 +55,10 @@ plotMarkerHeatmap <- function(
     assay.type="logcounts",
     display.row.names=NULL,
     use.pruned=FALSE,
-    order.by="rank.logFC.cohen",
+    order.by.effect="cohens.d",
+    order.by.summary="min.rank",
     top=20,
+    num.threads=bpnworkers(BPPARAM),
     BPPARAM = SerialParam()) 
 {
     test <- .to_clean_matrix(test, assay.type, check.missing=FALSE)
@@ -87,9 +91,17 @@ plotMarkerHeatmap <- function(
     # Prioritize the markers with interesting variation in the test data for
     # visualization. If we only have one label, we use the most abundant markers.
     if (length(unique(predictions)) > 1L) {
-        interesting <- scran::scoreMarkers(test, predictions, BPPARAM=BPPARAM)
-        stats <- interesting[[label]]
-        o <- order(stats[[order.by]], decreasing=!startsWith(order.by, "rank."))
+        interesting <- scrapper::scoreMarkers(
+            test,
+            predictions,
+            num.threads=num.threads,
+            compute.auc=(order.by.effect=="auc"),
+            compute.cohens.d=(order.by.effect=="cohens.d"),
+            compute.delta.mean=(order.by.effect=="delta.mean"),
+            compute.delta.detected=(order.by.effect=="delta.detected")
+        )
+        stats <- interesting[[order.by.effect]][[label]][[order.by.summary]]
+        o <- order(stats, decreasing=(order.by.summary!="min.rank"))
         to.show <- rownames(test)[o]
     } else {
         abundance <- rowMeans(test)