Skip to content

Commit

Permalink
Cleaned up the per-reference scores from combineRecomputedResults.
Browse files Browse the repository at this point in the history
Now we return a DataFrame of nested Dataframes, where each inner DataFrame
corresponds to a reference and contains the identity of the best label and the
recomputed score in that reference. This is effectively a sparse version of the
previous "expanded with NA" format, which is easier to read and saves memory.

We also update the plotting functions to work with this new format, mostly by
regenerated the previous expanded format as needed.
  • Loading branch information
LTLA committed Dec 15, 2024
1 parent 320d4bb commit c24b091
Show file tree
Hide file tree
Showing 9 changed files with 74 additions and 73 deletions.
37 changes: 11 additions & 26 deletions R/combineRecomputedResults.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,8 @@
#' @return A \linkS4class{DataFrame} is returned containing the annotation statistics for each cell or cluster (row).
#' This mimics the output of \code{\link{classifySingleR}} and contains the following fields:
#' \itemize{
#' \item \code{scores}, a numeric matrix of correlations containing the \emph{recomputed} scores.
#' For any given cell, entries of this matrix are only non-\code{NA} for the assigned label in each reference;
#' scores are not recomputed for the other labels.
#' \item \code{scores}, a DataFrame of DataFrames containing the \emph{recomputed} scores for the best label in each reference.
#' Each nested DataFrame corresponds to a reference and contains \code{labels} (the best label for that cell in this reference) and \code{scores} (the recomputed score).
#' \item \code{labels}, a character vector containing the per-cell combined label across references.
#' \item \code{reference}, an integer vector specifying the reference from which the combined label was derived.
#' \item \code{delta.next}, a numeric vector containing the difference between the best and next-best score.
Expand Down Expand Up @@ -186,21 +185,19 @@ combineRecomputedResults <- function(
)

# Organizing the outputs.
if (is.null(names(results))) {
names(results) <- sprintf("ref%i", seq_along(results))
}

base.scores <- vector("list", length(results))
for (r in seq_along(base.scores)) {
mat <- results[[r]]$scores
mat[] <- NA_real_
idx <- cbind(seq_len(nrow(mat)), collated[[r]] + 1L)
mat[idx] <- irun$scores[,r]
base.scores[[r]] <- mat
names(base.scores) <- names(results)
for (i in seq_along(base.scores)) {
base.scores[[i]] <- DataFrame(labels=results[[i]]$labels, trained[[i]]$labels$unique, scores=irun$scores[,i])
}

all.scores <- do.call(cbind, base.scores)
all.scores <- DataFrame(lapply(base.scores, I))
output <- DataFrame(scores = I(all.scores), row.names=rownames(results[[1]]))
metadata(output)$label.origin <- .create_label_origin(base.scores)

chosen <- irun$best + 1L
cbind(output, .combine_result_frames(chosen, irun$delta, results))
cbind(output, .combine_result_frames(irun$best + 1L, irun$delta, results))
}

#' @importFrom S4Vectors DataFrame
Expand Down Expand Up @@ -228,19 +225,7 @@ combineRecomputedResults <- function(

output$reference <- chosen
output$delta.next <- delta

if (is.null(names(results))) {
names(results) <- sprintf("ref%i", seq_along(results))
}
output$orig.results <- do.call(DataFrame, lapply(results, I))

output
}

#' @importFrom S4Vectors DataFrame
.create_label_origin <- function(collected.scores) {
DataFrame(
label=unlist(lapply(collected.scores, colnames)),
reference=rep(seq_along(collected.scores), vapply(collected.scores, ncol, 0L))
)
}
10 changes: 9 additions & 1 deletion R/plotDeltaDistribution.R
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@
plotDeltaDistribution <- function(
results,
show = c("delta.med", "delta.next"),
labels.use = colnames(results$scores),
labels.use = NULL,
references = NULL,
chosen.only = TRUE,
size = 2,
Expand Down Expand Up @@ -135,6 +135,14 @@ plotDeltaDistribution <- function(
pruned <- is.na(current.results$pruned.labels)
}

if (is.null(labels.use)) {
if (is(results$scores, "DataFrame")) {
labels.use <- unlist(lapply(results$scores, function(x) unique(x$labels)))
} else {
labels.use <- colnames(results$scores)
}
}

# Actually creating the plot
plots[[i]] <- .plot_delta_distribution(
values=values, labels=labels, pruned=pruned, labels.use=labels.use,
Expand Down
12 changes: 10 additions & 2 deletions R/plotScoreDistribution.R
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@
plotScoreDistribution <- function(
results,
show = NULL,
labels.use = colnames(results$scores),
labels.use = NULL,
references = NULL,
scores.use = NULL,
calls.use = 0,
Expand Down Expand Up @@ -137,11 +137,15 @@ plotScoreDistribution <- function(
chosen <- references[i]
if (chosen==0L) {
current.results <- results
scores <- current.results$scores
if (is(scores, "DataFrame")) { # i.e., from combineRecomputedResults.
scores <- .expand_recomputed_scores(scores)
}
} else {
current.results <- results$orig.results[[chosen]]
scores <- current.results$scores
}

scores <- current.results$scores
scores.title <- .values_title(is.combined, chosen, ref.names, show)

# Pulling out the labels to use in this iteration.
Expand All @@ -154,6 +158,10 @@ plotScoreDistribution <- function(
prune.calls <- current.results$pruned.labels
}

if (is.null(labels.use)) {
labels.use <- colnames(scores)
}

# Actually creating the plot
plots[[i]] <- .plot_score_distribution(
scores=scores, labels=labels, prune.calls=prune.calls, labels.use=labels.use,
Expand Down
31 changes: 24 additions & 7 deletions R/plotScoreHeatmap.R
Original file line number Diff line number Diff line change
Expand Up @@ -223,11 +223,15 @@ plotScoreHeatmap <- function(results, cells.use = NULL, labels.use = NULL,
chosen.scores <- scores.use[i]
if (chosen.scores==0L) {
score.results <- results
scores <- score.results$scores
if (is(scores, "DataFrame")) { # i.e., from combineRecomputedResults.
scores <- .expand_recomputed_scores(scores)
}
} else {
score.results <- results$orig.results[[chosen.scores]]
scores <- score.results$scores
}

scores <- score.results$scores
rownames(scores) <- rownames(results)
scores.title <- .values_title(is.combined, chosen.scores, ref.names, "Scores")
scores.labels <- score.results$labels
Expand All @@ -245,6 +249,10 @@ plotScoreHeatmap <- function(results, cells.use = NULL, labels.use = NULL,
names(labels) <- names(prune.calls) <- rownames(scores)
labels.title <- .values_title(is.combined, chosen.calls, ref.names, "Labels")

if (is.null(labels.use)) {
labels.use <- colnames(scores)
}

# Actually creating the heatmap.
output <- .plot_score_heatmap(
scores=scores,
Expand Down Expand Up @@ -465,10 +473,6 @@ plotScoreHeatmap <- function(results, cells.use = NULL, labels.use = NULL,
.trim_byLabel_and_normalize_scores <- function(
scores, labels.use, max.labels, normalize, scores.title, scores.labels) {

# Trim by labels (remove any with no scores)
all.na <- apply(scores, 2, FUN = function(x) all(is.na(x)))
scores <- scores[,!all.na, drop = FALSE]

# Trim by labels (labels.use)
if (!is.null(labels.use)) {
labels.use <- labels.use[labels.use %in% colnames(scores)]
Expand All @@ -481,15 +485,15 @@ plotScoreHeatmap <- function(results, cells.use = NULL, labels.use = NULL,

# Trim by labels (max.labels), using primarily the most frequent labels.
times.best <- table(factor(scores.labels, levels = unique(colnames(scores))))[colnames(scores)]
if (!any(is.na(scores))) {
if (!anyNA(scores)) {
# To break ties, we sort by the scaled maximum if there are no NAs.
# This is done _before_ within-cell normalization of the scores,
# after which it makes little sense to compare scores between cells.
secondary <- rowMaxs(scale(t(scores)), na.rm = TRUE)
} else {
# If there are NAs - usually from combineRecomputedResults -
# we sort by the frequency of non-NA occurrences.
secondary <- apply(scores, 2, FUN = function(x) sum(!is.na(x)))
secondary <- colSums(!is.na(scores))
}
to.keep <- order(times.best, secondary, decreasing=TRUE)
to.keep <- head(to.keep, max.labels)
Expand Down Expand Up @@ -650,3 +654,16 @@ plotScoreHeatmap <- function(results, cells.use = NULL, labels.use = NULL,
next.color.index.discrete = next.color.index.discrete,
next.color.index.numeric = next.color.index.numeric)
}

.expand_recomputed_scores <- function(scores) {
expanded.scores <- vector("list", ncol(scores))
for (i in seq_along(expanded.scores)) {
curscores <- scores[[i]]
u <- unique(curscores$labels)
expanded <- matrix(NA_real_, nrow(curscores), length(u))
expanded[cbind(seq_len(nrow(curscores)), match(curscores$labels, u))] <- curscores$scores
colnames(expanded) <- u
expanded.scores[[i]] <- expanded
}
do.call(cbind, expanded.scores)
}
5 changes: 2 additions & 3 deletions man/combineRecomputedResults.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/plotDeltaDistribution.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/plotScoreDistribution.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

30 changes: 6 additions & 24 deletions tests/testthat/test-heatmap.R
Original file line number Diff line number Diff line change
Expand Up @@ -314,32 +314,14 @@ test_that("heatmap multi-ref - 'na.color'", {
"#000000")
})

test_that("heatmap multi-ref - labels with no scores are removed", {
combined$scores <- cbind(combined$scores, "f" = NA)
expect_true("f" %in% colnames(combined$scores))
expect_false("f" %in% rownames(plotScoreHeatmap(results = combined, silent = TRUE, return.data = TRUE,
scores.use = 0)$mat))
})

test_that("heatmap multi-ref - labels with least calls/calcs are removed by 'max.labels'", {
combined$scores <- cbind(combined$scores, "neverCalled" = 1) # actual score is immaterial
combined$scores <- cbind(combined$scores, "rarelyCalc" = NA)
combined$scores[1,"rarelyCalc"] <- 1 # Needs at least one score to not be removed anyway.
expect_true(all(c("neverCalled", "rarelyCalc") %in% colnames(combined$scores)))

# Both there with no trimming
expect_true(all(c("neverCalled", "rarelyCalc") %in% rownames(plotScoreHeatmap(results = combined, silent = TRUE, return.data = TRUE, scores.use = 0,
max.labels = 40)$mat)))

# The rarely picked for calculation "rarelyCalc" label should be removed first
expect_true("neverCalled" %in% rownames(plotScoreHeatmap(results = combined, silent = TRUE, return.data = TRUE, scores.use = 0,
max.labels = 11)$mat))
expect_false("rarelyCalc" %in% rownames(plotScoreHeatmap(results = combined, silent = TRUE, return.data = TRUE, scores.use = 0,
max.labels = 11)$mat))
combined$scores[[1]][1,"labels"] <- "rarelyCalc"

# The never picked as final label "neverCalled" label should be removed next
expect_false("neverCalled" %in% rownames(plotScoreHeatmap(results = combined, silent = TRUE, return.data = TRUE, scores.use = 0,
max.labels = 10)$mat))
# Present with no trimming
expect_true("rarelyCalc" %in% rownames(plotScoreHeatmap(results = combined, silent = TRUE, return.data = TRUE, scores.use = 0,
max.labels = 40)$mat))

# The rarely picked for calculation "rarelyCalc" label should be removed.
expect_false("rarelyCalc" %in% rownames(plotScoreHeatmap(results = combined, silent = TRUE, return.data = TRUE, scores.use = 0,
max.labels = 10)$mat))
})
Expand Down
18 changes: 10 additions & 8 deletions tests/testthat/test-recomputed.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,17 @@ test_that("combineRecomputedResults works as expected (light check)", {
combined <- combineRecomputedResults(
results=list(pred1, pred2),
test=test,
trained=list(train1, train2))
trained=list(train1, train2),
fine.tune=FALSE
)

# Checking the sanity of the output.
obs <- apply(combined$scores, 1, FUN=function(x) colnames(combined$scores)[!is.na(x)])
ref <- rbind(pred1$labels, pred2$labels)
expect_identical(obs, ref)
expect_identical(combined$scores$ref1$labels, pred1$labels)
expect_identical(combined$scores$ref2$labels, pred2$labels)

aggregated.scores <- do.call(cbind, lapply(combined$scores, function(x) x$scores))
aggregated.labels <- do.call(cbind, lapply(combined$scores, function(x) as.character(x$labels)))
expect_identical(max.col(aggregated.scores), combined$reference)
expect_identical(aggregated.labels[cbind(seq_len(nrow(aggregated.labels)), max.col(aggregated.scores))], combined$labels)

expect_true(all(combined$labels == pred1$labels | combined$labels==pred2$labels))
expect_true(all(combined$first.labels == pred1$first.labels | combined$first.labels==pred2$first.labels))
Expand All @@ -46,9 +51,6 @@ test_that("combineRecomputedResults works as expected (light check)", {
is.na(combined$pruned.labels)==is.na(pred1$pruned.labels) |
is.na(combined$pruned.labels)==is.na(pred2$pruned.labels)
))

top <- apply(combined$scores, 1, FUN=function(x) colnames(combined$scores)[which.max(x)])
expect_identical(top, combined$labels)
})

test_that("combineRecomputedResults matrix fragmentation works as expected", {
Expand Down

0 comments on commit c24b091

Please sign in to comment.