Added utility to plot marker heatmaps for easier diagnostics.

SingleR-inc · Oct 26, 2024 · 1520a39 · 1520a39
1 parent ee2d7f9
commit 1520a39
Show file tree

Hide file tree

Showing 6 changed files with 208 additions and 10 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: SingleR
 Title: Reference-Based Single-Cell RNA-Seq Annotation
-Version: 2.7.6
-Date: 2024-10-10
+Version: 2.7.7
+Date: 2024-10-26
 Authors@R: c(person("Dvir", "Aran", email="[email protected]", role=c("aut", "cph")),
     person("Aaron", "Lun", email="[email protected]", role=c("ctb", "cre")),
     person("Daniel", "Bunis", role="ctb"),

diff --git a/NAMESPACE b/NAMESPACE
@@ -18,6 +18,7 @@ export(getClassicMarkers)
 export(getDeltaFromMedian)
 export(matchReferences)
 export(plotDeltaDistribution)
+export(plotMarkerHeatmap)
 export(plotScoreDistribution)
 export(plotScoreHeatmap)
 export(pruneScores)

diff --git a/R/plotMarkerHeatmap.R b/R/plotMarkerHeatmap.R
@@ -0,0 +1,108 @@
+#' Plot a heatmap of the markers for a label
+#'
+#' Create a heatmap of the log-normalized expression for the most interesting markers of a particular label.
+#'
+#' @param results A \linkS4class{DataFrame} containing the output from \code{\link{SingleR}},
+#' \code{\link{classifySingleR}}, \code{\link{combineCommonResults}}, or \code{\link{combineRecomputedResults}}.
+#' @param test A numeric matrix of log-normalized expression values where rows are genes and columns are cells.
+#' Each row should be named with the gene name.
+#'
+#' Alternatively, a \linkS4class{SummarizedExperiment} object containing such a matrix.
+#' @param label String specifying the label of interest.
+#' @param other.labels Character vector specifying the other labels to be compared to \code{label} when finding interesting markers.
+#' Defaults to all available labels.
+#' @param assay.type Integer scalar or string specifying the matrix of expression values to use if \code{test} is a \linkS4class{SummarizedExperiment}.
+#' @param use.pruned Logical scalar indicating whether the pruned labels should be used instead.
+#' @param order.by String specifying the column of the output of \code{\link[scran]{scoreMarkers}} with which to sort for interesting markers.
+#' @param top Integer scalar indicating the top most interesting markers to show in the heatmap.
+#' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying the parallelization scheme to use for marker scoring.
+#'
+#' @return 
+#' The output of \code{\link[pheatmap]{pheatmap}} is returned showing the heatmap on the current graphics device.
+#'
+#' @details
+#' This function creates a heatmap where each row is a marker gene for \code{label} and each column is a cell in the test dataset.
+#' The aim is to check the effectiveness of the reference-derived markers for distinguishing between labels in the test dataset.
+#' Useful markers from the reference should show strong upregulation in \code{label} compared to all \code{other.labels}. 
+#' We identify such markers by scoring all reference-derived markers with \code{\link[scran]{scoreMarkers}} on the \code{test} expression.
+#' The \code{top} markers based on the specified \code{order.by} field are shown in the heatmap.
+#' If only one label is present, markers are ranked by average abundance intead. 
+#'
+#' @author Aaron Lun
+#' @examples
+#' # Running the SingleR() example.
+#' example(SingleR, echo=FALSE)
+#'
+#' plotMarkerHeatmap(pred, test, pred$labels[1])
+#' plotMarkerHeatmap(pred, test, pred$labels[1], use.pruned=TRUE)
+#' plotMarkerHeatmap(pred, test, pred$labels[1], order.by="rank.AUC")
+#' 
+#' @export
+#' @importFrom S4Vectors metadata
+#' @importFrom BiocParallel SerialParam
+#' @importFrom Matrix rowMeans
+#' @importFrom utils head
+plotMarkerHeatmap <- function(
+    results,
+    test,
+    label,
+    other.labels=NULL,
+    assay.type="logcounts",
+    use.pruned=FALSE,
+    order.by="rank.logFC.cohen",
+    top=20,
+    BPPARAM = SerialParam()) 
+{
+    test <- .to_clean_matrix(test, assay.type, check.missing=FALSE)
+    all.markers <- metadata(results)$de.genes[[label]]
+
+    if (use.pruned) {
+        labfield <- "pruned.labels"
+    } else {
+        labfield <- "labels"
+    }
+    predictions <- results[[labfield]]
+
+    ckeep <- seq_len(ncol(test))
+    if (!is.null(other.labels)) {
+        ckeep <- predictions %in% other.labels
+        predictions <- predictions[ckeep]
+        for (n in names(all.markers)) {
+            if (!(n %in% other.labels) && n != label) {
+                all.markers[[n]] <- NULL
+            }
+        }
+    } else if (anyNA(predictions)) {
+        ckeep <- !is.na(predictions)
+        predictions <- predictions[ckeep]
+    }
+
+    rkeep <- rownames(test) %in% unlist(all.markers)
+    test <- test[rkeep,ckeep,drop=FALSE]
+
+    # Prioritize the markers with interesting variation in the test data for
+    # visualization. If we only have one label, we use the most abundant markers.
+    if (length(unique(predictions)) > 1L) {
+        interesting <- scran::scoreMarkers(test, predictions, BPPARAM=BPPARAM)
+        stats <- interesting[[label]]
+        o <- order(stats[[order.by]], decreasing=!startsWith(order.by, "rank."))
+        to.show <- rownames(test)[o]
+    } else {
+        abundance <- rowMeans(test)
+        to.show <- names(abundance)[order(abundance, decreasing=TRUE)]
+    }
+
+    to.show <- head(to.show, top)
+    limits <- range(test, na.rm=TRUE)
+    colnames(test) <- seq_len(ncol(test))
+    col.order <- order(predictions)
+
+    pheatmap::pheatmap(
+        test[to.show,col.order,drop=FALSE],
+        breaks=seq(limits[1], limits[2], length.out=26),
+        color=viridis::viridis(25),
+        annotation_col=data.frame(labels=predictions, row.names=colnames(test)),
+        cluster_col=FALSE,
+        show_colnames=FALSE
+    )
+}
diff --git a/man/plotMarkerHeatmap.Rd b/man/plotMarkerHeatmap.Rd
diff --git a/tests/testthat/test-plotMarkerHeatmap.R b/tests/testthat/test-plotMarkerHeatmap.R
@@ -0,0 +1,27 @@
+# library(SingleR); library(testthat); source("setup.R"); source("test-plotMarkerHeatmap.R")
+
+colnames(test) <- sprintf("cell_%i", seq_len(ncol(test)))
+pred <- SingleR(test=test, ref=training, labels=training$label, genes="de")
+
+test_that("We can produce heatmaps of markers", {
+    expect_s3_class(plotMarkerHeatmap(pred, test, pred$labels[1]), "pheatmap")
+})
+
+test_that("marker heatmaps handle pruned NAs correctly", {
+    pred$pruned.labels[1] <- NA
+    lab <- na.omit(unique(pred$pruned.labels))[1]
+    out <- plotMarkerHeatmap(pred, test, lab, use.pruned=TRUE)
+    expect_s3_class(out, "pheatmap")
+})
+
+test_that("marker heatmaps work with a subset of other labels", {
+    expect_s3_class(plotMarkerHeatmap(pred, test, pred$labels[1], other.labels=unique(pred$labels)[1:2]), "pheatmap")
+})
+
+test_that("marker heatmap falls back to average abundances", {
+    lab <- pred$labels[1]
+    keep <- pred$labels == lab
+    pred <- pred[keep,]
+    test <- test[,keep,drop=FALSE]
+    expect_s3_class(plotMarkerHeatmap(pred, test, lab), "pheatmap")
+})
diff --git a/vignettes/SingleR.Rmd b/vignettes/SingleR.Rmd
@@ -152,18 +152,12 @@ summary(is.na(pred.grun$pruned.labels))
 ```
 
 Finally, a simple yet effective diagnostic is to examine the expression of the marker genes for each label in the test dataset.
-We extract the identity of the markers from the metadata of the `SingleR()` results and use them in the  `plotHeatmap()` function from `r Biocpkg("scater")`, as shown below for beta cell markers.
+We extract the identity of the markers from the metadata of the `SingleR()` results and use them in the  `plotMarkerHeatmap()` function, as shown below for beta cell markers.
 If a cell in the test dataset is confidently assigned to a particular label, we would expect it to have strong expression of that label's markers.
 At the very least, it should exhibit upregulation of those markers relative to cells assigned to other labels.
 
 ```{r}
-all.markers <- metadata(pred.grun)$de.genes
-sceG$labels <- pred.grun$labels
-
-# Beta cell-related markers
-library(scater)
-plotHeatmap(sceG, order_columns_by="labels",
-    features=unique(unlist(all.markers$beta))) 
+plotMarkerHeatmap(pred.grun, sceG, label="beta")
 ```
 
 # FAQs