Merge pull request #77 from stemangiola/README-and-docs

update docs and README
stemangiola · Feb 18, 2023 · 4b08621 · 4b08621
2 parents 02c8a94 + 48a370c
commit 4b08621
Show file tree

Hide file tree

Showing 8 changed files with 321 additions and 132 deletions.
diff --git a/R/query.R b/R/query.R
@@ -370,6 +370,41 @@ get_seurat <- function(...) {
 #' @importFrom dplyr tbl
 #' @importFrom httr progress
 #' @importFrom cli cli_alert_info
+#' 
+#' @details 
+#' 
+#' The metadata was collected from the Bioconductor package `cellxgenedp`. Its vignette `using_cellxgenedp` provides an overview of the columns in the metadata.
+#' The data for which the column `organism_name` included "Homo sapiens" was collected from `cellxgenedp`.
+#' 
+#' The columns `dataset_id` and `file_id` link the datasets explorable through `CuratedAtlasQueryR` and `cellxgenedp` to the CELLxGENE portal.
+#' 
+#' Our representation harmonises the metadata at the dataset, sample and cell levels in a single coherent database table.
+#' 
+#' Dataset-specific columns (definitions available at cellxgene.cziscience.com)
+#' `cell_count`, `collection_id`, `created_at.x`, `created_at.y`, `dataset_deployments`, `dataset_id`, `file_id`, `filename`, `filetype`, `is_primary_data.y`, `is_valid`, `linked_genesets`, `mean_genes_per_cell`, `name`, `published`, `published_at`, `revised_at`, `revision`, `s3_uri`, `schema_version`, `tombstone`, `updated_at.x`, `updated_at.y`, `user_submitted`, `x_normalization`
+#' 
+#' Sample-specific columns (definitions available at cellxgene.cziscience.com)
+#' 
+#' `.sample`, `.sample_name`, `age_days`, `assay`, `assay_ontology_term_id`, `development_stage`, `development_stage_ontology_term_id`, `ethnicity`, `ethnicity_ontology_term_id`, `experiment___`, `organism`, `organism_ontology_term_id`, `sample_placeholder`, `sex`, `sex_ontology_term_id`, `tissue`, `tissue_harmonised`, `tissue_ontology_term_id`, `disease`, `disease_ontology_term_id`, `is_primary_data.x`
+#' 
+#' Cell-specific columns (definitions available at cellxgene.cziscience.com)
+#' 
+#' `.cell`, `cell_type`, `cell_type_ontology_term_id`, `cell_type_harmonised`, `confidence_class`, `cell_annotation_azimuth_l2`, `cell_annotation_blueprint_singler`
+#' 
+#' Through harmonisation and curation we introduced custom columns that are not present in the original CELLxGENE metadata:
+#' 
+#' - `tissue_harmonised`: a coarser tissue name for better filtering
+#' - `age_days`: the number of days corresponding to the age
+#' - `cell_type_harmonised`: the consensus call identity (for immune cells) using the original and three novel annotations using Seurat Azimuth and SingleR
+#' - `confidence_class`: an ordinal class of how confident `cell_type_harmonised` is. 1 is complete consensus, 2 is three out of four, and so on.
+#' - `cell_annotation_azimuth_l2`: Azimuth cell annotation
+#' - `cell_annotation_blueprint_singler`: SingleR cell annotation using Blueprint reference
+#' - `cell_annotation_blueprint_monaco`: SingleR cell annotation using Monaco reference
+#' - `sample_id_db`: Sample subdivision for internal use
+#' - `file_id_db`: File subdivision for internal use
+#' - `.sample`: Sample ID
+#' - `.sample_name`: How samples were defined
+#' 
 get_metadata <- function(
     remote_url = "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/metadata.0.2.2.parquet",
     cache_directory = get_default_cache_dir()

diff --git a/README.Rmd b/README.Rmd
@@ -23,12 +23,13 @@ knitr::opts_chunk$set(
 knitr::include_graphics(c("man/figures/logo.png"))
 ```
 
-```{r, echo=FALSE, out.height = c("58px"), out.width = c("155x", "129px", "202px", "219px")}
+```{r, echo=FALSE, out.height = c("58px"), out.width = c("155x", "129px", "202px", "219px", "180px")}
 knitr::include_graphics(c(
   "man/figures/svcf_logo.jpeg", 
   "man/figures/czi_logo.png", 
   "man/figures/bioconductor_logo.jpg",
-    "man/figures/vca_logo.png"
+    "man/figures/vca_logo.png",
+  "man/figures/nectar_logo.png"
 ))
 ```
 
@@ -58,31 +59,14 @@ library(stringr)
 get_metadata()
 ```
 
-### Explore the tissue 
+### Explore the number of datasets per tissue
 
 ```{r}
 get_metadata() |>
-    dplyr::distinct(tissue, file_id) 
+  dplyr::distinct(tissue, dataset_id) |> 
+  dplyr::count(tissue)
 ```
 
-```{r}
-#> # Source:     SQL [?? x 2]
-#> # Database:   sqlite 3.40.0 [[email protected]:5432/metadata]
-#> # Ordered by: desc(n)
-#>    tissue                      n
-#>    <chr>                 <int64>
-#>  1 blood                      47
-#>  2 heart left ventricle       46
-#>  3 cortex of kidney           31
-#>  4 renal medulla              29
-#>  5 lung                       27
-#>  6 liver                      24
-#>  7 middle temporal gyrus      24
-#>  8 kidney                     19
-#>  9 intestine                  18
-#> 10 thymus                     17
-#> # … with more rows
-```
 
 
 ## Download single-cell RNA sequencing counts 
@@ -161,36 +145,110 @@ single_cell_counts
 
 We can gather all natural killer cells and plot the distribution of CD56 (NCAM1) across all tissues
 
+```{r, eval=FALSE, echo=FALSE}
+library(tidySingleCellExperiment)
+library(ggplot2)
+
+# Styled plots. Chunk is neither evaluated nor echoed (eval=FALSE, echo=FALSE);
+# presumably it generated the HLA_A_*_plot.png figures -- TODO confirm.
+# Plot by disease
+get_metadata() |>
+  # Keep CD14 monocytes and drop one file by its `file_id_db`
+  filter(cell_type_harmonised == "cd14 mono") |>
+  filter(file_id_db != "c5a05f23f9784a3be3bfa651198a48eb") |>
+
+  # Get counts per million for the HLA-A gene
+  get_SingleCellExperiment(assays = "cpm", features = "HLA-A") |>
+
+  # Add the feature to the table as a wide column (`HLA.A`)
+  join_features("HLA-A", shape = "wide") |>
+
+  # Rank x axis by per-disease median; `na.rm` so missing values are ignored
+  as_tibble() |>
+  with_groups(disease, ~ .x |> mutate(median_count = median(`HLA.A`, na.rm = TRUE))) |>
+
+  # Plot
+  ggplot(aes(fct_reorder(disease, median_count, .desc = TRUE), `HLA.A`, color = file_id)) +
+  geom_jitter(shape = ".") +
+
+  # Style
+  guides(color = "none") +
+  scale_y_log10() +
+  theme_bw() +
+  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
+  xlab("Disease") +
+  ggtitle("HLA-A in CD14 monocytes. Coloured by dataset")
+
+# Plot by tissue
+get_metadata() |>
+  # Keep CD14 monocytes and drop one file by its `file_id_db`
+  filter(cell_type_harmonised == "cd14 mono") |>
+  filter(file_id_db != "c5a05f23f9784a3be3bfa651198a48eb") |>
+
+  # Get counts per million for the HLA-A gene
+  get_SingleCellExperiment(assays = "cpm", features = "HLA-A") |>
+
+  # Add the feature to the table as a wide column (`HLA.A`)
+  join_features("HLA-A", shape = "wide") |>
+
+  # Rank x axis by per-tissue median; `na.rm` so missing values are ignored
+  as_tibble() |>
+  with_groups(tissue_harmonised, ~ .x |> mutate(median_count = median(`HLA.A`, na.rm = TRUE))) |>
+
+  # Plot
+  ggplot(aes(fct_reorder(tissue_harmonised, median_count, .desc = TRUE), `HLA.A`, color = file_id)) +
+  geom_jitter(shape = ".") +
+
+  # Style
+  guides(color = "none") +
+  scale_y_log10() +
+  theme_bw() +
+  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
+  xlab("Tissue") +
+  ggtitle("HLA-A in CD14 monocytes. Coloured by dataset")
+
+```
+
 ```{r, eval=FALSE}
 library(tidySingleCellExperiment)
 library(ggplot2)
 
+get_metadata() |>
+  # Keep only cells whose harmonised cell type is "cd14 mono"
+  filter(cell_type_harmonised=="cd14 mono") |>
+
+  # Get counts per million for the HLA-A gene
+  get_SingleCellExperiment(assays = "cpm", features = "HLA-A") |> 
+  
+  # Add HLA-A as a wide column (`HLA.A`), then plot it by disease, coloured by file
+  join_features("HLA-A", shape = "wide") |> 
+  ggplot(aes( disease, `HLA.A`,color = file_id)) +
+  geom_jitter(shape=".") 
+```
+
+```{r, echo=FALSE, message=FALSE, warning=FALSE}
+knitr::include_graphics("man/figures/HLA_A_disease_plot.png")
+```
+
+```{r, eval=FALSE}
+
 get_metadata() |> 
     
   # Filter and subset
   filter(cell_type_harmonised=="nk") |> 
-  select(.cell, file_id_db, disease, file_id, tissue_harmonised) |> 
-  
+
   # Get counts per million for NCAM1 gene 
   get_SingleCellExperiment(assays = "cpm", features = "NCAM1") |> 
 
-	# Get transcriptional abundance for plotting with `tidySingleCellExperiment`
-  join_features("NCAM1", shape = "wide") |> 
-	
 	# Plot
+  join_features("NCAM1", shape = "wide") |> 
   ggplot(aes( tissue_harmonised, NCAM1,color = file_id)) +
-  geom_jitter(shape=".") +
-	
-	# Style
-  guides(color="none") +
-  scale_y_log10() +
-  theme_bw() +
-  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1))
+  geom_jitter(shape=".") 
 
 ```
 
 ```{r, echo=FALSE, message=FALSE, warning=FALSE}
-knitr::include_graphics("man/figures/NCAM1_figure.png")
+knitr::include_graphics("man/figures/HLA_A_tissue_plot.png")
 ```
 
 # Cell metadata