From f667320225cce98775c757d489149b8cfb618afd Mon Sep 17 00:00:00 2001 From: csmagnano Date: Fri, 3 May 2024 09:40:06 -0400 Subject: [PATCH] Updated further reading and heading levels in all lessons --- episodes/cell_type_annotation.Rmd | 33 +++++++++++++----------- episodes/hca.Rmd | 42 +++++++++++++++---------------- episodes/intro-sce.Rmd | 24 +++++++++--------- episodes/large_data.Rmd | 38 +++++++++++++++------------- episodes/multi-sample.Rmd | 34 +++++++++++++------------ 5 files changed, 91 insertions(+), 80 deletions(-) diff --git a/episodes/cell_type_annotation.Rmd b/episodes/cell_type_annotation.Rmd index 538e684..7be6dc3 100644 --- a/episodes/cell_type_annotation.Rmd +++ b/episodes/cell_type_annotation.Rmd @@ -23,7 +23,7 @@ exercises: 15 # Minutes of exercises in the lesson :::::::::::::::::::::::::::::::::::::::::::::::: -# Setup +## Setup ```{r setup, message = FALSE} library(BiocStyle) @@ -35,7 +35,7 @@ library(scater) library(scran) ``` -# Data retrieval +## Data retrieval ```{r data, message = FALSE} sce <- WTChimeraData(samples = 5, type = "processed") @@ -50,14 +50,14 @@ ind <- sample(ncol(sce), 1000) sce <- sce[,ind] ``` -# Preprocessing +## Preprocessing ```{r preproc, warning = FALSE} sce <- logNormCounts(sce) sce <- runPCA(sce) ``` -# Clustering +## Clustering Clustering is an unsupervised learning procedure that is used to empirically define groups of cells with similar expression profiles. @@ -104,7 +104,7 @@ sce <- runUMAP(sce, dimred = "PCA") plotReducedDim(sce, "UMAP", color_by = "label") ``` -# Marker gene detection +## Marker gene detection To interpret clustering results as obtained in the previous section, we identify the genes that drive separation between clusters. 
These marker genes allow us to @@ -156,7 +156,7 @@ top.markers <- head(rownames(markers[[1]])) plotExpression(sce, features = top.markers, x = "label", color_by = "label") ``` -# Cell type annotation +## Cell type annotation The most challenging task in scRNA-seq data analysis is arguably the interpretation of the results. @@ -182,7 +182,7 @@ reference datasets where each sample or cell has already been annotated with its putative biological state by domain experts. Here, we will demonstrate both approaches on the wild-type chimera dataset. -## Assigning cell labels from reference data +### Assigning cell labels from reference data A conceptually straightforward annotation approach is to compare the single-cell expression profiles with previously annotated reference datasets. @@ -303,7 +303,7 @@ tab <- table(res$pruned.labels, sce$celltype.mapped) pheatmap(log2(tab + 10), color = colorRampPalette(c("white", "blue"))(101)) ``` -## Assigning cell labels from gene sets +### Assigning cell labels from gene sets A related strategy is to explicitly identify sets of marker genes that are highly expressed in each individual cell. @@ -397,19 +397,15 @@ a fitted three-component mixture, and the grey curve represents a fitted normal distribution. Vertical lines represent threshold estimates corresponding to each estimate of the distribution. -# Session Info +## Session Info ```{r sessionInfo} sessionInfo() ``` -# Further Reading -* OSCA book, [Chapters 5-7](https://bioconductor.org/books/release/OSCA.basic/clustering.html) -* Assigning cell types with SingleR ([the book](https://bioconductor.org/books/release/SingleRBook/)). -* The [AUCell](https://bioconductor.org/packages/AUCell) package vignette. 
-# Exercises +## Exercises :::::::::::::::::::::::::::::::::: challenge @@ -484,6 +480,15 @@ TODO ::::::::::::::::::::::::::::::::::::::::::::: +:::::::::::::: checklist +## Further Reading + +* OSCA book, [Chapters 5-7](https://bioconductor.org/books/release/OSCA.basic/clustering.html) +* Assigning cell types with SingleR ([the book](https://bioconductor.org/books/release/SingleRBook/)). +* The [AUCell](https://bioconductor.org/packages/AUCell) package vignette. + +:::::::::::::: + ::::::::::::::::::::::::::::::::::::: keypoints - TODO diff --git a/episodes/hca.Rmd b/episodes/hca.Rmd index 65473e5..3d621be 100644 --- a/episodes/hca.Rmd +++ b/episodes/hca.Rmd @@ -18,7 +18,7 @@ exercises: 10 # Minutes of exercises in the lesson :::::::::::::::::::::::::::::::::::::::::::::::: -# HCA Project +## HCA Project The Human Cell Atlas (HCA) is a large project that aims to learn from and map every cell type in the human body. The project extracts spatial and molecular @@ -27,7 +27,7 @@ international collaborative that charts healthy cells in the human body at all ages. There are about 37.2 trillion cells in the human body. To read more about the project, head over to their website at https://www.humancellatlas.org. -# CELLxGENE +## CELLxGENE CELLxGENE is a database and a suite of tools that help scientists to find, download, explore, analyze, annotate, and publish single cell data. It includes @@ -35,7 +35,7 @@ several analytic and visualization tools to help you to discover single cell data patterns. To see the list of tools, browse to https://cellxgene.cziscience.com/. -# CELLxGENE | Census +## CELLxGENE | Census The Census provides efficient computational tooling to access, query, and analyze all single-cell RNA data from CZ CELLxGENE Discover. 
Using a new access @@ -44,7 +44,7 @@ through TileDB-SOMA, or get slices in AnnData or Seurat objects, thus accelerating your research by significantly minimizing data harmonization at https://chanzuckerberg.github.io/cellxgene-census/. -# The CuratedAtlasQueryR Project +## The CuratedAtlasQueryR Project To systematically characterize the immune system across tissues, demographics and multiple studies, single cell transcriptomics data was harmonized from the @@ -71,7 +71,7 @@ accessing atlas-level datasets programmatically and reproducibly. ![](figures/curatedAtlasQuery.png) -# Data Sources in R / Bioconductor +## Data Sources in R / Bioconductor There are a few options to access single cell data with R / Bioconductor. @@ -81,7 +81,7 @@ There are a few options to access single cell data with R / Bioconductor. | [cellxgenedp](https://bioconductor.org/packages/cellxgenedp) | [CellxGene](https://cellxgene.cziscience.com/) | Human and mouse SC data including HCA | | [CuratedAtlasQueryR](https://stemangiola.github.io/CuratedAtlasQueryR/) | [CellxGene](https://cellxgene.cziscience.com/) | fine-grained query capable CELLxGENE data including HCA | -# Installation +## Installation ```{r, eval=FALSE} if (!requireNamespace("BiocManager", quietly = TRUE)) @@ -90,14 +90,14 @@ if (!requireNamespace("BiocManager", quietly = TRUE)) BiocManager::install("CuratedAtlasQueryR") ``` -# Package load +## Package load ```{r, include = TRUE, results = "hide", message = FALSE, warning = FALSE} library(CuratedAtlasQueryR) library(dplyr) ``` -# HCA Metadata +## HCA Metadata The metadata allows the user to get a lay of the land of what is available via the package. In this example, we are using the sample database URL which @@ -115,7 +115,7 @@ metadata |> glimpse() ``` -# A note on the piping operator +## A note on the piping operator The vignette materials provided by `CuratedAtlasQueryR` show the use of the 'native' R pipe (implemented after R version `4.1.0`). 
For those not familiar @@ -136,7 +136,7 @@ iris |> aggregate(. ~ Species, data = _, mean) ``` -# Summarizing the metadata +## Summarizing the metadata For each distinct tissue and dataset combination, count the number of datasets by tissue type. @@ -147,13 +147,13 @@ metadata |> count(tissue) ``` -# Columns available in the metadata +## Columns available in the metadata ```{r, message = FALSE} head(names(metadata), 10) ``` -# Available assays +## Available assays ```{r} metadata |> @@ -161,7 +161,7 @@ metadata |> count(assay) ``` -# Available organisms +## Available organisms ```{r} metadata |> @@ -169,14 +169,14 @@ metadata |> count(organism) ``` -## Download single-cell RNA sequencing counts +### Download single-cell RNA sequencing counts The data can be provided as either "counts" or counts per million "cpm" as given by the `assays` argument in the `get_single_cell_experiment()` function. By default, the `SingleCellExperiment` provided will contain only the 'counts' data. -### Query raw counts +#### Query raw counts ```{r, message = FALSE} single_cell_counts <- @@ -192,7 +192,7 @@ single_cell_counts <- single_cell_counts ``` -### Query counts scaled per million +#### Query counts scaled per million This is helpful if just few genes are of interest, as they can be compared across samples. @@ -208,7 +208,7 @@ metadata |> get_single_cell_experiment(assays = "cpm") ``` -### Extract only a subset of genes +#### Extract only a subset of genes ```{r, message = FALSE} single_cell_counts <- @@ -224,7 +224,7 @@ single_cell_counts <- single_cell_counts ``` -### Extracting counts as a Seurat object +#### Extracting counts as a Seurat object If needed, the H5 `SingleCellExperiment` can be converted into a Seurat object. 
Note that it may take a long time and use a lot of memory depending on how many @@ -244,9 +244,9 @@ single_cell_counts <- single_cell_counts ``` -## Save your `SingleCellExperiment` +### Save your `SingleCellExperiment` -### Saving as HDF5 +#### Saving as HDF5 The recommended way of saving these `SingleCellExperiment` objects, if necessary, is to use `saveHDF5SummarizedExperiment` from the `HDF5Array` @@ -256,7 +256,7 @@ package. single_cell_counts |> saveHDF5SummarizedExperiment("single_cell_counts") ``` -# Exercises +## Exercises :::::::::::::::::::::::::::::::::: challenge diff --git a/episodes/intro-sce.Rmd b/episodes/intro-sce.Rmd index 5f8587a..bf756cd 100644 --- a/episodes/intro-sce.Rmd +++ b/episodes/intro-sce.Rmd @@ -20,7 +20,7 @@ exercises: 10 # Minutes of exercises in the lesson :::::::::::::::::::::::::::::::::::::::::::::::: -# Setup +## Setup ```{r setup, message = FALSE, warning=FALSE} library(SummarizedExperiment) @@ -29,9 +29,9 @@ library(MouseGastrulationData) library(BiocStyle) ``` -# Bioconductor +## Bioconductor -## Overview +### Overview Within the R ecosystem, the Bioconductor project provides tools for the analysis and comprehension of high-throughput genomics data. The scope of the project covers microarray data, various forms of sequencing (RNA-seq, ChIP-seq, bisulfite, genotyping, etc.), proteomics, flow cytometry and more. @@ -39,7 +39,7 @@ One of Bioconductor's main selling points is the use of common data structures t allowing code written by different people (from different organizations, in different countries) to work together seamlessly in complex analyses. By extending R to genomics, Bioconductor serves as a powerful addition to the computational biologist's toolkit. -## Installing Bioconductor Packages +### Installing Bioconductor Packages The default repository for R packages is the [Comprehensive R Archive Network](https://cran.r-project.org/mirrors.html) (CRAN), which is home to over 13,000 different R packages. 
We can easily install packages from CRAN - say, the popular `r CRANpkg("ggplot2")` package for data visualization - by opening up R and typing in: @@ -78,7 +78,7 @@ BiocManager::install("scater") Packages only need to be installed once, and then they are available for all subsequent uses of a particular R installation. There is no need to repeat the installation every time we start R. -## Finding relevant packages +### Finding relevant packages To find relevant Bioconductor packages, one useful resource is the [BiocViews](https://bioconductor.org/packages/release/BiocViews.html) page. This provides a hierarchically organized view of annotations associated with each Bioconductor package. @@ -87,7 +87,7 @@ This gives us a listing of all Bioconductor packages that might be useful for ou CRAN uses the similar concept of ["Task views"](https://cran.r-project.org/web/views/), though this is understandably more general than genomics. For example, the [Cluster task view page](https://cran.r-project.org/web/views/Cluster.html) lists an assortment of packages that are relevant to cluster analyses. -## Staying up to date +### Staying up to date Updating all R/Bioconductor packages is as simple as running `BiocManager::install()` without any arguments. This will check for more recent versions of each package (within a Bioconductor release) and prompt the user to update if any are available. @@ -96,7 +96,7 @@ This will check for more recent versions of each package (within a Bioconductor BiocManager::install() ``` -# The `SingleCellExperiment` class +## The `SingleCellExperiment` class One of the main strengths of the Bioconductor project lies in the use of a common data infrastructure that powers interoperability across packages. @@ -110,7 +110,7 @@ knitr::include_graphics("http://bioconductor.org/books/3.17/OSCA.intro/images/Si Let's start with an example dataset. 
-```{r, message = FALSE} +```{r, message = FALSE, warning=FALSE} sce <- WTChimeraData(samples=5) sce ``` @@ -121,7 +121,7 @@ The _getter_ methods are used to extract information from the slots and the _set Depending on the object, slots can contain different types of data (e.g., numeric matrices, lists, etc.). We will here review the main slots of the SingleCellExperiment class as well as their getter/setter methods. -## The `assays` +### The `assays` This is arguably the most fundamental part of the object that contains the count matrix, and potentially other matrices with transformed data. We can access the _list_ of matrices with the `assays` function and individual matrices with the `assay` function. If one of these matrices is called "counts", we can use the special `counts` getter (and the analogous `logcounts`). @@ -132,7 +132,7 @@ counts(sce)[1:3, 1:3] You will notice that in this case we have a sparse matrix of class "dgTMatrix" inside the object. More generally, any "matrix-like" object can be used, e.g., dense matrices or HDF5-backed matrices (see "Working with large data"). -## The `colData` and `rowData` +### The `colData` and `rowData` Conceptually, these are two data frames that annotate the columns and the rows of your assay, respectively. @@ -151,7 +151,7 @@ sce$my_sum <- colSums(counts(sce)) colData(sce) ``` -## The `reducedDims` +### The `reducedDims` Everything that we have described so far (except for the `counts` getter) is part of the `SummarizedExperiment` class that SingleCellExperiment extends. You can find a complete lesson on the `SummarizedExperiment` class [here](https://carpentries-incubator.github.io/bioc-intro/60-next-steps.html). 
@@ -196,7 +196,7 @@ Combining two objects: The `MouseGastrulationData` package contains several data :::::::::::::: checklist -# Further Reading +## Further Reading * OSCA book, [Introduction](https://bioconductor.org/books/release/OSCA.intro) diff --git a/episodes/large_data.Rmd b/episodes/large_data.Rmd index 8311c02..1e973b6 100644 --- a/episodes/large_data.Rmd +++ b/episodes/large_data.Rmd @@ -25,7 +25,7 @@ exercises: 2 # Minutes of exercises in the lesson library(BiocStyle) ``` -# Motivation +## Motivation Advances in scRNA-seq technologies have increased the number of cells that can be assayed in routine experiments. @@ -39,7 +39,7 @@ increasing size of scRNA-seq data sets. This section discusses how we can use various aspects of the Bioconductor ecosystem to tune our analysis pipelines for greater speed and efficiency. -# Out of memory representations +## Out of memory representations The count matrix is the central structure around which our analyses are based. In most of the previous chapters, this has been held fully in memory as a dense @@ -126,7 +126,7 @@ in-memory representations on HPC systems with plentiful memory, and then distributing file-backed counterparts to individual users for exploration and visualization on their personal machines. -# Parallelization +## Parallelization Parallelization of calculations across genes or cells is an obvious strategy for speeding up scRNA-seq analysis workflows. @@ -136,7 +136,7 @@ computing throughout the Bioconductor ecosystem, manifesting as a `BPPARAM` argument in compatible functions. We can also use `BiocParallel` with more expressive functions directly through the package's interface. -### Basic use +#### Basic use ```{r,include=TRUE,results="hide",message=FALSE,warning=FALSE} library(BiocParallel) @@ -206,9 +206,9 @@ parallelization backends involve (i) setting up one or more separate R sessions, session. 
Depending on the nature and size of the task, this overhead may outweigh any benefit from parallel computing. -# Fast approximations +## Fast approximations -## Nearest neighbor searching +### Nearest neighbor searching Identification of neighbouring cells in PC or expression space is a common procedure that is used in many functions, e.g., `buildSNNGraph()`, `doubletCells()`. @@ -273,7 +273,7 @@ approx <- findKNN(Y, k = 20, BNPARAM = AnnoyParam()) mean(exact$index != approx$index) ``` -## Singular value decomposition +### Singular value decomposition The singular value decomposition (SVD) underlies the PCA used throughout our analyses, e.g., in `denoisePCA()`, `fastMNN()`, `doubletCells()`. @@ -313,9 +313,9 @@ of power iterations (`q=`). We tend to prefer IRLBA as its default behavior is more accurate, though RSVD is much faster for file-backed matrices. -# Interoperability with popular single-cell analysis ecosytems +## Interoperability with popular single-cell analysis ecosystems -## Seurat +### Seurat [Seurat](https://satijalab.org/seurat) is an R package designed for QC, analysis, and exploration of single-cell RNA-seq data. Seurat can be used to identify and @@ -390,7 +390,7 @@ Idents(sobj) <- "celltype.mapped" sobj ``` -## Scanpy +### Scanpy [Scanpy](https://scanpy.readthedocs.io) is a scalable toolkit for analyzing single-cell gene expression data built jointly with @@ -440,18 +440,14 @@ The resulting H5AD file can then be read into Python using scanpy's [read_h5ad](https://scanpy.readthedocs.io/en/stable/generated/scanpy.read_h5ad.html) function and then directly used in compatible Python-based analysis frameworks. -# Session Info +## Session Info ```{r sessionInfo} sessionInfo() ``` -# Further Reading -* OSCA book, [Chapter 14](https://bioconductor.org/books/release/OSCA.advanced/dealing-with-big-data.html): Dealing with big data -* The `BiocParallel` `r Biocpkg("BiocParallel", vignette = "Introduction_To_BiocParallel.html", label = "intro vignette")`. 
- -# Exercises +## Exercises :::::::::::::::::::::::::::::::::: challenge @@ -524,6 +520,14 @@ Use Seurat's `DimPlot` function. ::::::::::::::::::::::: +:::::::::::::: checklist +## Further Reading + +* OSCA book, [Chapter 14](https://bioconductor.org/books/release/OSCA.advanced/dealing-with-big-data.html): Dealing with big data +* The `BiocParallel` `r Biocpkg("BiocParallel", vignette = "Introduction_To_BiocParallel.html", label = "intro vignette")`. + +:::::::::::::: + ::::::::::::::::::::::::::::::::::::: keypoints - Out-of-memory representations can be used to work with single-cell datasets that are too large to fit in memory @@ -533,4 +537,4 @@ Use Seurat's `DimPlot` function. :::::::::::::::::::::::::::::::::::::::::::::::: -# References +## References diff --git a/episodes/multi-sample.Rmd b/episodes/multi-sample.Rmd index db1c618..f185a5b 100644 --- a/episodes/multi-sample.Rmd +++ b/episodes/multi-sample.Rmd @@ -21,7 +21,7 @@ exercises: 15 # Minutes of exercises in the lesson :::::::::::::::::::::::::::::::::::::::::::::::: -# Setup and data exploration +## Setup and data exploration As said, we will use the the wild-type data from the Tal1 chimera experiment: @@ -89,7 +89,7 @@ There are evident sample effects. Depending on the analysis that you want to per For now, let's assume that we want to remove this effect. -# Correcting batch effects +## Correcting batch effects We correct the effect of samples by aid of the `correctExperiment` function in the `batchelor` package and using the `sample` `colData` column as batch. @@ -119,7 +119,7 @@ Once we removed the sample batch effect, we can proceed with the Differential Expression Analysis. 
-# Differential Expression +## Differential Expression In order to perform a Differential Expression Analysis, we need to identify groups of cells across samples/conditions (depending on the experimental @@ -131,7 +131,7 @@ In our case we will focus on this second aspect to group cells according to the already annotated cell types to proceed with the computation of the pseudo-bulk samples. -## Pseudo-bulk samples +### Pseudo-bulk samples To compute differences between groups of cells, a possible way is to compute pseudo-bulk samples, where we mediate the gene signal of all the cells @@ -155,7 +155,7 @@ summed ``` -## Differential Expression Analysis +### Differential Expression Analysis The main advantage of using pseudo-bulk samples is the possibility to use well-tested methods for differential analysis like `edgeR` and `DESeq2`, we will @@ -307,7 +307,7 @@ cur.results[order(cur.results$PValue),] ``` -# Differential Abundance +## Differential Abundance With DA we test for differences between clusters across conditions, to investigate which clusters change accordingly to the treatment (the tomato injection in our case). @@ -333,7 +333,7 @@ y.ab <- estimateDisp(y.ab, design, trend="none") fit.ab <- glmQLFit(y.ab, design, robust=TRUE, abundance.trend=FALSE) ``` -## Background on compositional effect +### Background on compositional effect As mentioned before, in DA we don't normalize our data with `calcNormFactors` function, because this approach considers that most of the input features do not vary between conditions. @@ -355,7 +355,7 @@ consider this aspect. We now look at different approaches for handling the compositional effect. 
-## Assuming most labels do not change +### Assuming most labels do not change We can use a similar approach used during the DEGs analysis, assuming that most labels are not changing, in particular if we think about the low number of DEGs @@ -379,7 +379,7 @@ summary(decideTests(res2)) topTags(res2, n=10) ``` -## Testing against a log-fold change threshold +### Testing against a log-fold change threshold This other approach assumes that the composition bias introduces a spurious log2-fold change of no more than a \tau quantity for a non-DA label. In other words, we interpret this as the maximum log-fold change in the total number of cells given by DA in other labels. @@ -394,18 +394,13 @@ topTags(res.lfc) Addionally, the choice of \tau can be guided by other external experimental data, like a previous or a pilot experiment. -# Session Info +## Session Info ```{r, tidy=TRUE} sessionInfo() ``` - -# Further Reading - -* OSCA book, Multi-sample analysis, [Chapters 1, 4, and 6](https://bioconductor.org/books/release/OSCA.multisample) - -# Exercises +## Exercises :::::::::::::::::::::::::::::::::: challenge @@ -444,6 +439,13 @@ TODO ::::::::::::::::::::::::::::::::::::::::::::: +:::::::::::::: checklist +## Further Reading + +* OSCA book, Multi-sample analysis, [Chapters 1, 4, and 6](https://bioconductor.org/books/release/OSCA.multisample) + +:::::::::::::: + ::::::::::::::::::::::::::::::::::::: keypoints - TODO