diff --git a/DESCRIPTION b/DESCRIPTION index a2deb87..f3e425e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: CuratedAtlasQueryR Title: Queries the Human Cell Atlas -Version: 0.3.1 +Version: 0.4.0 Authors@R: c( person( "Stefano", @@ -126,3 +126,4 @@ LazyDataCompression: xz URL: https://github.com/stemangiola/CuratedAtlasQueryR BugReports: https://github.com/stemangiola/CuratedAtlasQueryR/issues VignetteBuilder: knitr +Roxygen: list(markdown = TRUE) diff --git a/R/query.R b/R/query.R index c79d8cb..129c86f 100644 --- a/R/query.R +++ b/R/query.R @@ -11,15 +11,19 @@ assay_map <- c( ) REMOTE_URL <- "https://swift.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/harmonised-human-atlas" +COUNTS_VERSION <- "0.2" -#' Given a data frame of HCA metadata, returns a SingleCellExperiment object -#' corresponding to the samples in that data frame +#' Gets a SingleCellExperiment from curated metadata +#' +#' Given a data frame of Curated Atlas metadata obtained from [get_metadata()], +#' returns a [`SingleCellExperiment::SingleCellExperiment-class`] object corresponding to the samples in that +#' data frame #' #' @param data A data frame containing, at minimum, a `.sample` column, which #' corresponds to a single cell sample ID. This can be obtained from the #' [get_metadata()] function. -#' @param assays A character vector whose elements must be either "counts" and/or -#' "cpm", representing the corresponding assay(s) you want to request. +#' @param assays A character vector whose elements must be either "counts" +#' and/or "cpm", representing the corresponding assay(s) you want to request. #' @param repository A character vector of length one. If provided, it should be #' an HTTP URL pointing to the location where the single cell data is stored. #' @param cache_directory An optional character vector of length one. If @@ -51,7 +55,7 @@ REMOTE_URL <- "https://swift.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3e #' #' @export #' -#' +#' get_SingleCellExperiment <- function( data, assays = c("counts", "cpm"), @@ -79,9 +83,10 @@ get_SingleCellExperiment <- function( cli_alert_info("Realising metadata.") raw_data <- collect(data) inherits(raw_data, "tbl") |> assert_that() - has_name(raw_data, c(".cell", "file_id_db")) |> assert_that() + has_name(raw_data, c("_cell", "file_id_db")) |> assert_that() - cache_directory |> dir.create(showWarnings = FALSE) + versioned_cache_directory = file.path(cache_directory, COUNTS_VERSION) + versioned_cache_directory |> dir.create(showWarnings = FALSE, recursive = TRUE) subdirs <- assay_map[assays] @@ -100,7 +105,7 @@ get_SingleCellExperiment <- function( as.character() |> sync_assay_files( url = parsed_repo, - cache_dir = cache_directory, + cache_dir = versioned_cache_directory, files = _, subdirs = subdirs ) @@ -111,7 +116,7 @@ get_SingleCellExperiment <- function( imap(function(current_subdir, current_assay) { # Build up an SCE for each assay dir_prefix <- file.path( - cache_directory, + versioned_cache_directory, current_subdir ) @@ -172,14 +177,14 @@ group_to_sce <- function(i, df, dir_prefix, features) { sce <- loadHDF5SummarizedExperiment(sce_path) # The cells we select here are those that are both available in the SCE # object, and requested for this particular file - cells <- colnames(sce) |> intersect(df$.cell) + cells <- colnames(sce) |> intersect(df$`_cell`) # We need to make the cell names globally unique, which we can guarantee # by adding a suffix that is derived from file_id_db, which is the grouping # variable new_cellnames <- paste0(cells, "_", i) new_coldata <- df |> - mutate(original_cell_id = .data$.cell, .cell = new_cellnames) |> - column_to_rownames(".cell") |> + mutate(original_cell_id = .data$`_cell`, `_cell` = new_cellnames) |> + column_to_rownames("_cell") |> as("DataFrame") features |> @@ -333,10 +338,12 @@ get_seurat <- function(...) { get_SingleCellExperiment(...) |> as.Seurat(data = NULL) } +#' Gets the Curated Atlas metadata as a data frame. +#' #' Downloads a parquet database of the Human Cell Atlas metadata to a local #' cache, and then opens it as a data frame. It can then be filtered and #' passed into [get_SingleCellExperiment()] -#' to obtain a [`SingleCellExperiment`](SingleCellExperiment::SingleCellExperiment-class) +#' to obtain a [`SingleCellExperiment::SingleCellExperiment-class`] #' #' @param remote_url Optional character vector of length 1. An HTTP URL pointing #' to the location of the parquet database. @@ -364,10 +371,10 @@ get_seurat <- function(...) { #' @importFrom httr progress #' @importFrom cli cli_alert_info get_metadata <- function( - remote_url = "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata-sqlite/metadata.parquet", + remote_url = "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/metadata.0.2.2.parquet", cache_directory = get_default_cache_dir() ) { - db_path <- file.path(cache_directory, "metadata.parquet") + db_path <- file.path(cache_directory, "metadata.0.2.2.parquet") sync_remote_file( remote_url, db_path, diff --git a/man/get_SingleCellExperiment.Rd b/man/get_SingleCellExperiment.Rd index 6bf9aac..8c5bc59 100644 --- a/man/get_SingleCellExperiment.Rd +++ b/man/get_SingleCellExperiment.Rd @@ -2,8 +2,7 @@ % Please edit documentation in R/query.R \name{get_SingleCellExperiment} \alias{get_SingleCellExperiment} -\title{Given a data frame of HCA metadata, returns a SingleCellExperiment object -corresponding to the samples in that data frame} +\title{Gets a SingleCellExperiment from curated metadata} \usage{ get_SingleCellExperiment( data, @@ -14,12 +13,12 @@ get_SingleCellExperiment( ) } \arguments{ -\item{data}{A data frame containing, at minimum, a `.sample` column, which +\item{data}{A data frame containing, at minimum, a \code{.sample} column, which corresponds to a single cell sample ID. This can be obtained from the -[get_metadata()] function.} +\code{\link[=get_metadata]{get_metadata()}} function.} -\item{assays}{A character vector whose elements must be either "counts" and/or -"cpm", representing the corresponding assay(s) you want to request.} +\item{assays}{A character vector whose elements must be either "counts" +and/or "cpm", representing the corresponding assay(s) you want to request.} \item{cache_directory}{An optional character vector of length one. If provided, it should indicate a local file path where any remotely accessed @@ -33,11 +32,12 @@ the counts for. By default counts for all features will be returned.} } \value{ A SingleCellExperiment object, with one assay for each value in the - assays argument +assays argument } \description{ -Given a data frame of HCA metadata, returns a SingleCellExperiment object -corresponding to the samples in that data frame +Given a data frame of Curated Atlas metadata obtained from \code{\link[=get_metadata]{get_metadata()}}, +returns a \code{\link[SingleCellExperiment:SingleCellExperiment]{SingleCellExperiment::SingleCellExperiment}} object corresponding to the samples in that +data frame } \examples{ meta <- get_metadata() |> head(2) diff --git a/man/get_metadata.Rd b/man/get_metadata.Rd index f82560f..76dcf9d 100644 --- a/man/get_metadata.Rd +++ b/man/get_metadata.Rd @@ -2,14 +2,11 @@ % Please edit documentation in R/query.R \name{get_metadata} \alias{get_metadata} -\title{Downloads a parquet database of the Human Cell Atlas metadata to a local -cache, and then opens it as a data frame. It can then be filtered and -passed into [get_SingleCellExperiment()] -to obtain a [`SingleCellExperiment`](SingleCellExperiment::SingleCellExperiment-class)} +\title{Gets the Curated Atlas metadata as a data frame.} \usage{ get_metadata( remote_url = - "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata-sqlite/metadata.parquet", + "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/metadata.0.2.2.parquet", cache_directory = get_default_cache_dir() ) } @@ -23,15 +20,15 @@ metadata.parquet} } \value{ A lazy data.frame subclass containing the metadata. You can interact - with this object using most standard dplyr functions. For string matching, - it is recommended that you use `stringr::str_like` to filter character - columns, as `stringr::str_match` will not work. +with this object using most standard dplyr functions. For string matching, +it is recommended that you use \code{stringr::str_like} to filter character +columns, as \code{stringr::str_match} will not work. } \description{ -Downloads a parquet database of the Human Cell Atlas metadata to a local -cache, and then opens it as a data frame. It can then be filtered and -passed into [get_SingleCellExperiment()] -to obtain a [`SingleCellExperiment`](SingleCellExperiment::SingleCellExperiment-class) +Downloads a parquet database of the Human Cell Atlas metadata to a local +cache, and then opens it as a data frame. It can then be filtered and +passed into \code{\link[=get_SingleCellExperiment]{get_SingleCellExperiment()}} +to obtain a \code{\link[SingleCellExperiment:SingleCellExperiment]{SingleCellExperiment::SingleCellExperiment}} } \examples{ library(dplyr) diff --git a/man/get_seurat.Rd b/man/get_seurat.Rd index 2e2a491..fb5189b 100644 --- a/man/get_seurat.Rd +++ b/man/get_seurat.Rd @@ -11,11 +11,11 @@ get_seurat(...) \item{...}{ Arguments passed on to \code{\link[=get_SingleCellExperiment]{get_SingleCellExperiment}} \describe{ - \item{\code{data}}{A data frame containing, at minimum, a `.sample` column, which + \item{\code{data}}{A data frame containing, at minimum, a \code{.sample} column, which corresponds to a single cell sample ID. This can be obtained from the -[get_metadata()] function.} - \item{\code{assays}}{A character vector whose elements must be either "counts" and/or -"cpm", representing the corresponding assay(s) you want to request.} +\code{\link[=get_metadata]{get_metadata()}} function.} + \item{\code{assays}}{A character vector whose elements must be either "counts" +and/or "cpm", representing the corresponding assay(s) you want to request.} \item{\code{repository}}{A character vector of length one. If provided, it should be an HTTP URL pointing to the location where the single cell data is stored.} \item{\code{cache_directory}}{An optional character vector of length one. If @@ -27,7 +27,7 @@ the counts for. By default counts for all features will be returned.} } \value{ A Seurat object containing the same data as a call to - get_SingleCellExperiment. +get_SingleCellExperiment. } \description{ Given a data frame of HCA metadata, returns a Seurat object corresponding to diff --git a/tests/testthat/test-query.R b/tests/testthat/test-query.R index 92d491a..6ff3c02 100755 --- a/tests/testthat/test-query.R +++ b/tests/testthat/test-query.R @@ -2,7 +2,7 @@ library(CuratedAtlasQueryR) test_that("get_SingleCellExperiment() correctly handles duplicate cell IDs", { meta <- get_metadata() |> - dplyr::filter(.cell == "868417_1") |> + dplyr::filter(`_cell` == "868417_1") |> dplyr::collect() sce <- get_SingleCellExperiment(meta) # This query should return multiple cells, despite querying only 1 cell ID diff --git a/vignettes/Introduction.Rmd b/vignettes/Introduction.Rmd index feefda1..7991996 100644 --- a/vignettes/Introduction.Rmd +++ b/vignettes/Introduction.Rmd @@ -31,7 +31,7 @@ knitr::include_graphics(c( "../man/figures/svcf_logo.jpeg", "../man/figures/czi_logo.png", "../man/figures/bioconductor_logo.jpg", - "../man/figures/vca_logo.png" + "../man/figures/vca_logo.png" )) ```