Cleaned up the per-reference scores from combineRecomputedResults.

Now we return a DataFrame of nested Dataframes, where each inner DataFrame corresponds to a reference and contains the identity of the best label and the recomputed score in that reference. This is effectively a sparse version of the previous "expanded with NA" format, which is easier to read and saves memory. We also update the plotting functions to work with this new format, mostly by regenerated the previous expanded format as needed.
SingleR-inc · Dec 15, 2024 · c24b091 · c24b091
1 parent 320d4bb
commit c24b091
Show file tree

Hide file tree

Showing 9 changed files with 74 additions and 73 deletions.
diff --git a/R/combineRecomputedResults.R b/R/combineRecomputedResults.R
@@ -19,9 +19,8 @@
 #' @return A \linkS4class{DataFrame} is returned containing the annotation statistics for each cell or cluster (row).
 #' This mimics the output of \code{\link{classifySingleR}} and contains the following fields:
 #' \itemize{
-#' \item \code{scores}, a numeric matrix of correlations containing the \emph{recomputed} scores.
-#' For any given cell, entries of this matrix are only non-\code{NA} for the assigned label in each reference;
-#' scores are not recomputed for the other labels.
+#' \item \code{scores}, a DataFrame of DataFrames containing the \emph{recomputed} scores for the best label in each reference.
+#' Each nested DataFrame corresponds to a reference and contains \code{labels} (the best label for that cell in this reference) and \code{scores} (the recomputed score).
 #' \item \code{labels}, a character vector containing the per-cell combined label across references.
 #' \item \code{reference}, an integer vector specifying the reference from which the combined label was derived.
 #' \item \code{delta.next}, a numeric vector containing the difference between the best and next-best score.
@@ -186,21 +185,19 @@ combineRecomputedResults <- function(
     ) 
 
     # Organizing the outputs.
+    if (is.null(names(results))) {
+        names(results) <- sprintf("ref%i", seq_along(results))
+    }
+
     base.scores <- vector("list", length(results))
-    for (r in seq_along(base.scores)) {
-        mat <- results[[r]]$scores
-        mat[] <- NA_real_
-        idx <- cbind(seq_len(nrow(mat)), collated[[r]] + 1L)
-        mat[idx] <- irun$scores[,r]
-        base.scores[[r]] <- mat
+    names(base.scores) <- names(results)
+    for (i in seq_along(base.scores)) {
+        base.scores[[i]] <- DataFrame(labels=results[[i]]$labels, trained[[i]]$labels$unique, scores=irun$scores[,i])
     }
 
-    all.scores <- do.call(cbind, base.scores)
+    all.scores <- DataFrame(lapply(base.scores, I))
     output <- DataFrame(scores = I(all.scores), row.names=rownames(results[[1]]))
-    metadata(output)$label.origin <- .create_label_origin(base.scores)
-
-    chosen <- irun$best + 1L
-    cbind(output, .combine_result_frames(chosen, irun$delta, results))
+    cbind(output, .combine_result_frames(irun$best + 1L, irun$delta, results))
 }
 
 #' @importFrom S4Vectors DataFrame
@@ -228,19 +225,7 @@ combineRecomputedResults <- function(
 
     output$reference <- chosen
     output$delta.next <- delta
-
-    if (is.null(names(results))) {
-        names(results) <- sprintf("ref%i", seq_along(results))
-    }
     output$orig.results <- do.call(DataFrame, lapply(results, I))
 
     output
 }
-
-#' @importFrom S4Vectors DataFrame
-.create_label_origin <- function(collected.scores) {
-    DataFrame(
-        label=unlist(lapply(collected.scores, colnames)),
-        reference=rep(seq_along(collected.scores), vapply(collected.scores, ncol, 0L))
-    )
-}
diff --git a/R/plotDeltaDistribution.R b/R/plotDeltaDistribution.R
@@ -76,7 +76,7 @@
 plotDeltaDistribution <- function(
     results,
     show = c("delta.med", "delta.next"),
-    labels.use = colnames(results$scores),
+    labels.use = NULL,
     references = NULL,
     chosen.only = TRUE,
     size = 2,
@@ -135,6 +135,14 @@ plotDeltaDistribution <- function(
             pruned <- is.na(current.results$pruned.labels)
         }
 
+        if (is.null(labels.use)) {
+            if (is(results$scores, "DataFrame")) {
+                labels.use <- unlist(lapply(results$scores, function(x) unique(x$labels)))
+            } else {
+                labels.use <- colnames(results$scores)
+            }
+        }
+
         # Actually creating the plot
         plots[[i]] <- .plot_delta_distribution(
             values=values, labels=labels, pruned=pruned, labels.use=labels.use,

diff --git a/R/plotScoreDistribution.R b/R/plotScoreDistribution.R
@@ -91,7 +91,7 @@
 plotScoreDistribution <- function(
     results,
     show = NULL,
-    labels.use = colnames(results$scores),
+    labels.use = NULL,
     references = NULL,
     scores.use = NULL,
     calls.use = 0,
@@ -137,11 +137,15 @@ plotScoreDistribution <- function(
         chosen <- references[i]
         if (chosen==0L) {
             current.results <- results
+            scores <- current.results$scores
+            if (is(scores, "DataFrame")) { # i.e., from combineRecomputedResults.
+                scores <- .expand_recomputed_scores(scores)
+            }
         } else {
             current.results <- results$orig.results[[chosen]]
+            scores <- current.results$scores
         }
 
-        scores <- current.results$scores
         scores.title <- .values_title(is.combined, chosen, ref.names, show)
 
         # Pulling out the labels to use in this iteration.
@@ -154,6 +158,10 @@ plotScoreDistribution <- function(
             prune.calls <- current.results$pruned.labels
         }
 
+        if (is.null(labels.use)) {
+            labels.use <- colnames(scores)
+        }
+
         # Actually creating the plot
         plots[[i]] <- .plot_score_distribution(
             scores=scores, labels=labels, prune.calls=prune.calls, labels.use=labels.use, 

diff --git a/R/plotScoreHeatmap.R b/R/plotScoreHeatmap.R
@@ -223,11 +223,15 @@ plotScoreHeatmap <- function(results, cells.use = NULL, labels.use = NULL,
         chosen.scores <- scores.use[i]
         if (chosen.scores==0L) {
             score.results <- results
+            scores <- score.results$scores
+            if (is(scores, "DataFrame")) { # i.e., from combineRecomputedResults.
+                scores <- .expand_recomputed_scores(scores)
+            }
         } else {
             score.results <- results$orig.results[[chosen.scores]]
+            scores <- score.results$scores
         }
 
-        scores <- score.results$scores
         rownames(scores) <- rownames(results)
         scores.title <- .values_title(is.combined, chosen.scores, ref.names, "Scores")
         scores.labels <- score.results$labels
@@ -245,6 +249,10 @@ plotScoreHeatmap <- function(results, cells.use = NULL, labels.use = NULL,
         names(labels) <- names(prune.calls) <- rownames(scores)
         labels.title <- .values_title(is.combined, chosen.calls, ref.names, "Labels")
 
+        if (is.null(labels.use)) {
+            labels.use <- colnames(scores)
+        }
+
         # Actually creating the heatmap.
         output <- .plot_score_heatmap(
             scores=scores,
@@ -465,10 +473,6 @@ plotScoreHeatmap <- function(results, cells.use = NULL, labels.use = NULL,
 .trim_byLabel_and_normalize_scores <- function(
     scores, labels.use, max.labels, normalize, scores.title, scores.labels) {
 
-    # Trim by labels (remove any with no scores)
-    all.na <- apply(scores, 2, FUN = function(x) all(is.na(x)))
-    scores <- scores[,!all.na, drop = FALSE]
-
     # Trim by labels (labels.use)
     if (!is.null(labels.use)) {
         labels.use <- labels.use[labels.use %in% colnames(scores)]
@@ -481,15 +485,15 @@ plotScoreHeatmap <- function(results, cells.use = NULL, labels.use = NULL,
 
     # Trim by labels (max.labels), using primarily the most frequent labels.
     times.best <- table(factor(scores.labels, levels = unique(colnames(scores))))[colnames(scores)]
-    if (!any(is.na(scores))) {
+    if (!anyNA(scores)) {
         # To break ties, we sort by the scaled maximum if there are no NAs.
         # This is done _before_ within-cell normalization of the scores,
         # after which it makes little sense to compare scores between cells.
         secondary <- rowMaxs(scale(t(scores)), na.rm = TRUE)
     } else {
         # If there are NAs - usually from combineRecomputedResults -
         # we sort by the frequency of non-NA occurrences.
-        secondary <- apply(scores, 2, FUN = function(x) sum(!is.na(x)))
+        secondary <- colSums(!is.na(scores))
     }
     to.keep <- order(times.best, secondary, decreasing=TRUE)
     to.keep <- head(to.keep, max.labels)
@@ -650,3 +654,16 @@ plotScoreHeatmap <- function(results, cells.use = NULL, labels.use = NULL,
         next.color.index.discrete = next.color.index.discrete,
         next.color.index.numeric = next.color.index.numeric)
 }
+
+.expand_recomputed_scores <- function(scores) {
+    expanded.scores <- vector("list", ncol(scores))
+    for (i in seq_along(expanded.scores)) {
+        curscores <- scores[[i]]
+        u <- unique(curscores$labels)
+        expanded <- matrix(NA_real_, nrow(curscores), length(u))
+        expanded[cbind(seq_len(nrow(curscores)), match(curscores$labels, u))] <- curscores$scores
+        colnames(expanded) <- u
+        expanded.scores[[i]] <- expanded
+    }
+    do.call(cbind, expanded.scores)
+}
diff --git a/man/combineRecomputedResults.Rd b/man/combineRecomputedResults.Rd
diff --git a/man/plotDeltaDistribution.Rd b/man/plotDeltaDistribution.Rd
diff --git a/man/plotScoreDistribution.Rd b/man/plotScoreDistribution.Rd
diff --git a/tests/testthat/test-heatmap.R b/tests/testthat/test-heatmap.R
@@ -314,32 +314,14 @@ test_that("heatmap multi-ref - 'na.color'", {
         "#000000")
 })
 
-test_that("heatmap multi-ref - labels with no scores are removed", {
-    combined$scores <- cbind(combined$scores, "f" = NA)
-    expect_true("f" %in% colnames(combined$scores))
-    expect_false("f" %in% rownames(plotScoreHeatmap(results = combined, silent = TRUE, return.data = TRUE,
-            scores.use = 0)$mat))
-})
-
 test_that("heatmap multi-ref - labels with least calls/calcs are removed by 'max.labels'", {
-    combined$scores <- cbind(combined$scores, "neverCalled" = 1) # actual score is immaterial
-    combined$scores <- cbind(combined$scores, "rarelyCalc" = NA)
-    combined$scores[1,"rarelyCalc"] <- 1 # Needs at least one score to not be removed anyway.
-    expect_true(all(c("neverCalled", "rarelyCalc") %in% colnames(combined$scores)))
-
-    # Both there with no trimming
-    expect_true(all(c("neverCalled", "rarelyCalc") %in% rownames(plotScoreHeatmap(results = combined, silent = TRUE, return.data = TRUE, scores.use = 0,
-        max.labels = 40)$mat)))
-
-    # The rarely picked for calculation "rarelyCalc" label should be removed first
-    expect_true("neverCalled" %in% rownames(plotScoreHeatmap(results = combined, silent = TRUE, return.data = TRUE, scores.use = 0,
-        max.labels = 11)$mat))
-    expect_false("rarelyCalc" %in% rownames(plotScoreHeatmap(results = combined, silent = TRUE, return.data = TRUE, scores.use = 0,
-        max.labels = 11)$mat))
+    combined$scores[[1]][1,"labels"] <- "rarelyCalc"
 
-    # The never picked as final label "neverCalled" label should be removed next
-    expect_false("neverCalled" %in% rownames(plotScoreHeatmap(results = combined, silent = TRUE, return.data = TRUE, scores.use = 0,
-        max.labels = 10)$mat))
+    # Present with no trimming
+    expect_true("rarelyCalc" %in% rownames(plotScoreHeatmap(results = combined, silent = TRUE, return.data = TRUE, scores.use = 0,
+        max.labels = 40)$mat))
+
+    # The rarely picked for calculation "rarelyCalc" label should be removed. 
     expect_false("rarelyCalc" %in% rownames(plotScoreHeatmap(results = combined, silent = TRUE, return.data = TRUE, scores.use = 0,
         max.labels = 10)$mat))
 })

diff --git a/tests/testthat/test-recomputed.R b/tests/testthat/test-recomputed.R
@@ -30,12 +30,17 @@ test_that("combineRecomputedResults works as expected (light check)", {
     combined <- combineRecomputedResults(
         results=list(pred1, pred2), 
         test=test,
-        trained=list(train1, train2))
+        trained=list(train1, train2),
+        fine.tune=FALSE
+    )
 
-    # Checking the sanity of the output.
-    obs <- apply(combined$scores, 1, FUN=function(x) colnames(combined$scores)[!is.na(x)])
-    ref <- rbind(pred1$labels, pred2$labels) 
-    expect_identical(obs, ref)
+    expect_identical(combined$scores$ref1$labels, pred1$labels)
+    expect_identical(combined$scores$ref2$labels, pred2$labels)
+
+    aggregated.scores <- do.call(cbind, lapply(combined$scores, function(x) x$scores))
+    aggregated.labels <- do.call(cbind, lapply(combined$scores, function(x) as.character(x$labels)))
+    expect_identical(max.col(aggregated.scores), combined$reference)
+    expect_identical(aggregated.labels[cbind(seq_len(nrow(aggregated.labels)), max.col(aggregated.scores))], combined$labels)
 
     expect_true(all(combined$labels == pred1$labels | combined$labels==pred2$labels))
     expect_true(all(combined$first.labels == pred1$first.labels | combined$first.labels==pred2$first.labels))
@@ -46,9 +51,6 @@ test_that("combineRecomputedResults works as expected (light check)", {
         is.na(combined$pruned.labels)==is.na(pred1$pruned.labels) | 
         is.na(combined$pruned.labels)==is.na(pred2$pruned.labels)
     ))
-
-    top <- apply(combined$scores, 1, FUN=function(x) colnames(combined$scores)[which.max(x)])
-    expect_identical(top, combined$labels)
 })
 
 test_that("combineRecomputedResults matrix fragmentation works as expected", {