From b16b7e1008345833a154a5ca0e81d68c7df6b455 Mon Sep 17 00:00:00 2001 From: Andrew Ghazi <6763470+andrewGhazi@users.noreply.github.com> Date: Sat, 7 Sep 2024 23:05:51 -0400 Subject: [PATCH] minor formatting changes to cell type annotation --- episodes/cell_type_annotation.Rmd | 54 ++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/episodes/cell_type_annotation.Rmd b/episodes/cell_type_annotation.Rmd index a3f731a..e21af69 100644 --- a/episodes/cell_type_annotation.Rmd +++ b/episodes/cell_type_annotation.Rmd @@ -25,8 +25,11 @@ editor_options: ```{r chunk-opts, include=FALSE} knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE) library(BiocStyle) +options(digits = 3) ``` +Again we'll start by loading the libraries we'll be using: + ```{r setup} library(AUCell) library(MouseGastrulationData) @@ -34,11 +37,13 @@ library(SingleR) library(bluster) library(scater) library(scran) +library(pheatmap) +library(GSEABase) ``` ## Data retrieval -We'll be using the same set of WT chimeric mouse embryo data: +We'll be using the fifth processed sample from the WT chimeric mouse embryo data: ```{r data, message = FALSE} sce <- WTChimeraData(samples = 5, type = "processed") @@ -49,7 +54,9 @@ To speed up the computations, we take a random subset of 1,000 cells. ```{r} set.seed(123) + ind <- sample(ncol(sce), 1000) + sce <- sce[,ind] ``` @@ -59,6 +66,7 @@ The SCE object needs to contain log-normalized expression counts as well as PCA ```{r preproc, warning = FALSE} sce <- logNormCounts(sce) + sce <- runPCA(sce) ``` @@ -102,6 +110,7 @@ colLabels(sce) <- clusterCells(sce, use.dimred = "PCA", table(colLabels(sce)) ``` +You can see we ended up with `r length(table(colLabels(sce)))` clusters of varying sizes. We can now overlay the cluster labels as color on a UMAP plot: @@ -143,7 +152,9 @@ cluster when compared to another cluster. ```{r marker-detect} rownames(sce) <- rowData(sce)$SYMBOL + markers <- findMarkers(sce, test.type = "wilcox", direction = "up", lfc = 1) + markers ``` @@ -249,15 +260,17 @@ most of the examples here are derived. ```{r ref-data, message = FALSE} ref <- EmbryoAtlasData(samples = 29) + ref -table(ref$stage) ``` -To speed up the computations, we subsample the dataset to 1,000 cells. +In order to reduce the computational load, we subsample the dataset to 1,000 cells. ```{r} set.seed(123) + ind <- sample(ncol(ref), 1000) + ref <- ref[,ind] ``` @@ -268,15 +281,18 @@ tab <- sort(table(ref$celltype), decreasing = TRUE) tab ``` +We need the normalized log counts, so we add those on: + ```{r ref-preproc} ref <- logNormCounts(ref) ``` Some cleaning - remove cells of the reference dataset for which the cell -type annotation is missing. +type annotation is missing: ```{r na-celltype} nna <- !is.na(ref$celltype) + ref <- ref[,nna] ``` @@ -285,7 +301,9 @@ to remove noise prior to subsequent annotation tasks. ```{r low-abu-ct} abu.ct <- names(tab)[tab >= 10] + ind <- ref$celltype %in% abu.ct + ref <- ref[,ind] ``` @@ -293,16 +311,20 @@ Restrict to genes shared between query and reference dataset. ```{r shared-genes} rownames(ref) <- rowData(ref)$SYMBOL -isect <- intersect(rownames(sce), rownames(ref)) -sce <- sce[isect,] -ref <- ref[isect,] + +shared_genes <- intersect(rownames(sce), rownames(ref)) + +sce <- sce[shared_genes,] + +ref <- ref[shared_genes,] ``` Convert sparse assay matrices to regular dense matrices for input to -SingleR. +SingleR: ```{r sparse-to-dense-mat} sce.mat <- as.matrix(assay(sce, "logcounts")) + ref.mat <- as.matrix(assay(ref, "logcounts")) ``` @@ -337,8 +359,8 @@ type, indicating that the clustering represents finer subdivisions within these cell types. ```{r, fig.width = 10, fig.height = 10} -library(pheatmap) tab <- table(anno = res$pruned.labels, cluster = colLabels(sce)) + pheatmap(log2(tab + 10), color = colorRampPalette(c("white", "blue"))(101)) ``` @@ -350,6 +372,7 @@ experts. ```{r anno-vs-preanno, fig.width = 10, fig.height = 10} tab <- table(res$pruned.labels, sce$celltype.mapped) + pheatmap(log2(tab + 10), color = colorRampPalette(c("white", "blue"))(101)) ``` @@ -389,10 +412,11 @@ the markers are available. We demonstrate this approach using cell type markers derived from the mouse embryo atlas dataset. ```{r atlas-markers} -library(scran) wilcox.z <- pairwiseWilcox(ref, ref$celltype, lfc = 1, direction = "up") + markers.z <- getTopMarkers(wilcox.z$statistics, wilcox.z$pairs, pairwise = FALSE, n = 50) + lengths(markers.z) ``` @@ -414,19 +438,22 @@ set, but involving only the top ranking genes by expression in each cell. ```{r create-gsets, message = FALSE} -library(GSEABase) all.sets <- lapply(names(markers.z), function(x) GeneSet(markers.z[[x]], setName = x)) + all.sets <- GeneSetCollection(all.sets) + all.sets ``` ```{r aucell} -library(AUCell) rankings <- AUCell_buildRankings(as.matrix(counts(sce)), plotStats = FALSE, verbose = FALSE) + cell.aucs <- AUCell_calcAUC(all.sets, rankings) + results <- t(assay(cell.aucs)) + head(results) ``` @@ -443,7 +470,9 @@ they should at least represent closely related cell states). ```{r anno-vs-cluster} new.labels <- colnames(results)[max.col(results)] + tab <- table(new.labels, sce$celltype.mapped) + tab ``` @@ -460,6 +489,7 @@ may indicate that its gene set is not sufficiently informative. ```{r auc-dist, results="hide", fig.width = 10, fig.height = 10} par(mfrow = c(3,3)) + AUCell_exploreThresholds(cell.aucs[1:9], plotHist = TRUE, assign = TRUE) ```