Skip to content

Commit

Permalink
Merge pull request #77 from stemangiola/README-and-docs
Browse files Browse the repository at this point in the history
update docs and README
  • Loading branch information
stemangiola authored Feb 18, 2023
2 parents 02c8a94 + 48a370c commit 4b08621
Show file tree
Hide file tree
Showing 8 changed files with 321 additions and 132 deletions.
35 changes: 35 additions & 0 deletions R/query.R
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,41 @@ get_seurat <- function(...) {
#' @importFrom dplyr tbl
#' @importFrom httr progress
#' @importFrom cli cli_alert_info
#'
#' @details
#'
#' The metadata was collected from the Bioconductor package `cellxgenedp`. Its vignette `using_cellxgenedp` provides an overview of the columns in the metadata.
#' The data for which the column `organism_name` included "Homo sapiens" was collected from `cellxgenedp`.
#'
#' The columns `dataset_id` and `file_id` link the datasets explorable through `CuratedAtlasQueryR` and `cellxgenedp` to the CELLxGENE portal.
#'
#' Our representation harmonises the metadata at dataset, sample and cell levels in a single coherent database table.
#'
#' Dataset-specific columns (definitions available at cellxgene.cziscience.com)
#'
#' `cell_count`, `collection_id`, `created_at.x`, `created_at.y`, `dataset_deployments`, `dataset_id`, `file_id`, `filename`, `filetype`, `is_primary_data.y`, `is_valid`, `linked_genesets`, `mean_genes_per_cell`, `name`, `published`, `published_at`, `revised_at`, `revision`, `s3_uri`, `schema_version`, `tombstone`, `updated_at.x`, `updated_at.y`, `user_submitted`, `x_normalization`
#'
#' Sample-specific columns (definitions available at cellxgene.cziscience.com)
#'
#' `.sample`, `.sample_name`, `age_days`, `assay`, `assay_ontology_term_id`, `development_stage`, `development_stage_ontology_term_id`, `ethnicity`, `ethnicity_ontology_term_id`, `experiment___`, `organism`, `organism_ontology_term_id`, `sample_placeholder`, `sex`, `sex_ontology_term_id`, `tissue`, `tissue_harmonised`, `tissue_ontology_term_id`, `disease`, `disease_ontology_term_id`, `is_primary_data.x`
#'
#' Cell-specific columns (definitions available at cellxgene.cziscience.com)
#'
#' `.cell`, `cell_type`, `cell_type_ontology_term_id`, `cell_type_harmonised`, `confidence_class`, `cell_annotation_azimuth_l2`, `cell_annotation_blueprint_singler`
#'
#' Through harmonisation and curation we introduced custom columns that are not present in the original CELLxGENE metadata:
#'
#' - `tissue_harmonised`: a coarser tissue name for better filtering
#' - `age_days`: the number of days corresponding to the age
#' - `cell_type_harmonised`: the consensus cell-type identity (for immune cells), derived from the original annotation and three novel annotations obtained with Seurat Azimuth and SingleR
#' - `confidence_class`: an ordinal class of how confident `cell_type_harmonised` is. 1 is complete consensus, 2 is three out of four, and so on.
#' - `cell_annotation_azimuth_l2`: Azimuth cell annotation
#' - `cell_annotation_blueprint_singler`: SingleR cell annotation using Blueprint reference
#' - `cell_annotation_blueprint_monaco`: SingleR cell annotation using Monaco reference
#' - `sample_id_db`: Sample subdivision for internal use
#' - `file_id_db`: File subdivision for internal use
#' - `.sample`: Sample ID
#' - `.sample_name`: How samples were defined
#'
get_metadata <- function(
remote_url = "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/metadata.0.2.2.parquet",
cache_directory = get_default_cache_dir()
Expand Down
128 changes: 93 additions & 35 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,13 @@ knitr::opts_chunk$set(
knitr::include_graphics(c("man/figures/logo.png"))
```

```{r, echo=FALSE, out.height = c("58px"), out.width = c("155x", "129px", "202px", "219px")}
```{r, echo=FALSE, out.height = c("58px"), out.width = c("155x", "129px", "202px", "219px", "180px")}
knitr::include_graphics(c(
"man/figures/svcf_logo.jpeg",
"man/figures/czi_logo.png",
"man/figures/bioconductor_logo.jpg",
"man/figures/vca_logo.png"
"man/figures/vca_logo.png",
"man/figures/nectar_logo.png"
))
```

Expand Down Expand Up @@ -58,31 +59,14 @@ library(stringr)
get_metadata()
```

### Explore the tissue
### Explore the number of datasets per tissue

```{r}
get_metadata() |>
dplyr::distinct(tissue, file_id)
dplyr::distinct(tissue, dataset_id) |>
dplyr::count(tissue)
```

```{r}
#> # Source: SQL [?? x 2]
#> # Database: sqlite 3.40.0 [[email protected]:5432/metadata]
#> # Ordered by: desc(n)
#> tissue n
#> <chr> <int64>
#> 1 blood 47
#> 2 heart left ventricle 46
#> 3 cortex of kidney 31
#> 4 renal medulla 29
#> 5 lung 27
#> 6 liver 24
#> 7 middle temporal gyrus 24
#> 8 kidney 19
#> 9 intestine 18
#> 10 thymus 17
#> # … with more rows
```


## Download single-cell RNA sequencing counts
Expand Down Expand Up @@ -161,36 +145,110 @@ single_cell_counts

We can gather all natural killer cells and plot the distribution of CD56 (NCAM1) across all tissues

```{r, eval=FALSE, echo=FALSE}
library(tidySingleCellExperiment)
library(ggplot2)
# Plots with styling
# Plot by disease
get_metadata() |>
# Filter and subset
filter(cell_type_harmonised=="cd14 mono") |>
filter(file_id_db != "c5a05f23f9784a3be3bfa651198a48eb") |>
# Get counts per million for HLA-A gene
get_SingleCellExperiment(assays = "cpm", features = "HLA-A") |>
# Add feature to table
join_features("HLA-A", shape = "wide") |>
# Rank x axis
as_tibble() |>
with_groups(disease, ~ .x |> mutate(median_count = median(`HLA.A`, rm.na=TRUE))) |>
# Plot
ggplot(aes( fct_reorder(disease, median_count,.desc = TRUE), `HLA.A`,color = file_id)) +
geom_jitter(shape=".") +
# Style
guides(color="none") +
scale_y_log10() +
theme_bw() +
theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
xlab("Disease") +
ggtitle("HLA-A in CD14 monocytes. Coloured by dataset")
# Plot by tissue
get_metadata() |>
# Filter and subset
filter(cell_type_harmonised=="cd14 mono") |>
filter(file_id_db != "c5a05f23f9784a3be3bfa651198a48eb") |>
# Get counts per million for HLA-A gene
get_SingleCellExperiment(assays = "cpm", features = "HLA-A") |>
# Add feature to table
join_features("HLA-A", shape = "wide") |>
# Rank x axis
as_tibble() |>
with_groups(tissue_harmonised, ~ .x |> mutate(median_count = median(`HLA.A`, rm.na=TRUE))) |>
# Plot
ggplot(aes( fct_reorder(tissue_harmonised, median_count,.desc = TRUE), `HLA.A`,color = file_id)) +
geom_jitter(shape=".") +
# Style
guides(color="none") +
scale_y_log10() +
theme_bw() +
theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
xlab("Disease") +
ggtitle("HLA-A in CD14 monocytes. Coloured by dataset")
```

```{r, eval=FALSE}
library(tidySingleCellExperiment)
library(ggplot2)
get_metadata() |>
# Filter and subset
filter(cell_type_harmonised=="cd14 mono") |>
# Get counts per million for HLA-A gene
get_SingleCellExperiment(assays = "cpm", features = "HLA-A") |>
# Plot
join_features("HLA-A", shape = "wide") |>
ggplot(aes( disease, `HLA.A`,color = file_id)) +
geom_jitter(shape=".")
```

```{r, echo=FALSE, message=FALSE, warning=FALSE}
knitr::include_graphics("man/figures/HLA_A_disease_plot.png")
```

```{r, eval=FALSE}
get_metadata() |>
# Filter and subset
filter(cell_type_harmonised=="nk") |>
select(.cell, file_id_db, disease, file_id, tissue_harmonised) |>
# Get counts per million for NCAM1 gene
get_SingleCellExperiment(assays = "cpm", features = "NCAM1") |>
# Get transcriptional abundance for plotting with `tidySingleCellExperiment`
join_features("NCAM1", shape = "wide") |>
# Plot
join_features("NCAM1", shape = "wide") |>
ggplot(aes( tissue_harmonised, NCAM1,color = file_id)) +
geom_jitter(shape=".") +
# Style
guides(color="none") +
scale_y_log10() +
theme_bw() +
theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1))
geom_jitter(shape=".")
```

```{r, echo=FALSE, message=FALSE, warning=FALSE}
knitr::include_graphics("man/figures/NCAM1_figure.png")
knitr::include_graphics("man/figures/HLA_A_tissue_plot.png")
```

# Cell metadata
Expand Down
Loading

0 comments on commit 4b08621

Please sign in to comment.