Skip to content

Commit

Permalink
add suggestions and optimise code from code review
Browse files Browse the repository at this point in the history
  • Loading branch information
myushen committed Mar 26, 2024
1 parent 506ee5a commit b6ee0b2
Show file tree
Hide file tree
Showing 6 changed files with 66 additions and 75 deletions.
14 changes: 10 additions & 4 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -16,23 +16,27 @@ importFrom(DBI,dbDisconnect)
importFrom(HDF5Array,HDF5RealizationSink)
importFrom(HDF5Array,loadHDF5SummarizedExperiment)
importFrom(HDF5Array,saveHDF5SummarizedExperiment)
importFrom(S4Vectors,"metadata<-")
importFrom(S4Vectors,DataFrame)
importFrom(S4Vectors,metadata)
importFrom(Seurat,as.SingleCellExperiment)
importFrom(SeuratObject,as.Seurat)
importFrom(SeuratObject,as.sparse)
importFrom(SingleCellExperiment,"reducedDims<-")
importFrom(SingleCellExperiment,SingleCellExperiment)
importFrom(SingleCellExperiment,combineCols)
importFrom(SingleCellExperiment,reducedDims)
importFrom(SingleCellExperiment,rowData)
importFrom(SummarizedExperiment,"assayNames<-")
importFrom(SummarizedExperiment,"assays<-")
importFrom(SummarizedExperiment,"colData<-")
importFrom(SummarizedExperiment,assay)
importFrom(SummarizedExperiment,assays)
importFrom(SummarizedExperiment,colData)
importFrom(assertthat,assert_that)
importFrom(assertthat,has_name)
importFrom(checkmate,check_character)
importFrom(checkmate,check_directory_exists)
importFrom(checkmate,check_file_exists)
importFrom(checkmate,check_set_equal)
importFrom(checkmate,check_subset)
importFrom(checkmate,check_tibble)
importFrom(checkmate,check_true)
importFrom(cli,cli_abort)
importFrom(cli,cli_alert_info)
Expand All @@ -43,6 +47,7 @@ importFrom(dbplyr,remote_con)
importFrom(dbplyr,sql)
importFrom(dplyr,as_tibble)
importFrom(dplyr,collect)
importFrom(dplyr,distinct)
importFrom(dplyr,filter)
importFrom(dplyr,group_by)
importFrom(dplyr,inner_join)
Expand Down Expand Up @@ -75,6 +80,7 @@ importFrom(purrr,set_names)
importFrom(purrr,walk)
importFrom(rlang,.data)
importFrom(stats,setNames)
importFrom(stringr,str_detect)
importFrom(stringr,str_remove_all)
importFrom(stringr,str_replace_all)
importFrom(tibble,column_to_rownames)
Expand Down
4 changes: 1 addition & 3 deletions R/counts.R
Original file line number Diff line number Diff line change
Expand Up @@ -219,9 +219,7 @@ group_to_sce <- function(i, df, dir_prefix, features) {

if (length(cells) < nrow(df)){
single_line_str(
"Some cells were filtered out while loading {head(df$file_id_db, 1)}
because of extremely low counts. The
number of cells in the SingleCellExperiment will be less than the
"The number of cells in the SingleCellExperiment will be less than the
number of cells you have selected from the metadata.
Are cell IDs duplicated? Or, do cell IDs correspond to the counts file?
"
Expand Down
28 changes: 11 additions & 17 deletions R/counts_per_million.R
Original file line number Diff line number Diff line change
@@ -1,48 +1,42 @@
#' Generating counts per million from SingleCellExperiment object
#' Generating counts per million from a SingleCellExperiment object
#'
#' @param input_sce_obj A SingleCellExperiment object read from RDS
#' @param output_dir A character vector of counts per million path
#' @param hd5_file_dir A character vector of HDF5 file path after converting from RDS
#' @return A directory stores counts per million
#' @importFrom SingleCellExperiment SingleCellExperiment
#' @importFrom dplyr tbl
#' @importFrom glue glue
#' @importFrom SummarizedExperiment assay assays assays<-
#' @importFrom HDF5Array saveHDF5SummarizedExperiment
#' @importFrom purrr map
#' @noRd
get_counts_per_million <- function(input_sce_obj, output_dir, hd5_file_dir) {

# Create directories
output_dir |> dirname() |> dir.create( showWarnings = FALSE, recursive = TRUE)

# Save SCE to the cache directory counts folder
input_sce_obj |> saveHDF5SummarizedExperiment(hd5_file_dir)

data <- input_sce_obj

# Avoid completely empty cells
col_sums <- colSums(as.matrix(data@assays@data$X))
which_to_select <- which(col_sums >0 & col_sums < Inf)
col_sums <- colSums(as.matrix(assay(data)))
selected_cols <- which(col_sums >0 & col_sums < Inf)

sce <- SingleCellExperiment(list(counts_per_million = scuttle::calculateCPM(data[,which_to_select ,drop=FALSE ], assay.type = "X")))
rownames(sce) <- rownames(data[,which_to_select ])
colnames(sce) <- colnames(data[,which_to_select ])
sce <- SingleCellExperiment(list(counts_per_million = scuttle::calculateCPM(data[,selected_cols ,drop=FALSE ], assay.type = "X")))
rownames(sce) <- rownames(data[,selected_cols ])
colnames(sce) <- colnames(data[,selected_cols ])

# Avoid scaling zeros
which_to_select_not <- setdiff(1:ncol(data), which_to_select)
sce_zero <- SingleCellExperiment(list(counts_per_million = data@assays@data$X[,which_to_select_not ,drop=FALSE ]))
rownames(sce_zero) <- rownames(data[,which_to_select_not ])
colnames(sce_zero) <- colnames(data[,which_to_select_not ])
sce_zero <- SingleCellExperiment(list(counts_per_million = assay(data)[, !selected_cols ,drop=FALSE ]))
rownames(sce_zero) <- rownames(data[, !selected_cols ])
colnames(sce_zero) <- colnames(data[, !selected_cols ])

sce <- sce |> cbind(sce_zero)

sce <- sce[,colnames(data)]

rm(data)
gc()

# Check if there is a memory issue
SummarizedExperiment::assays(sce) <- SummarizedExperiment::assays(sce) |> map(DelayedArray::realize)
assays(sce) <- assays(sce) |> map(DelayedArray::realize)

sce |> saveHDF5SummarizedExperiment(output_dir)
}
Expand Down
86 changes: 41 additions & 45 deletions R/import_metadata_and_counts.R
Original file line number Diff line number Diff line change
@@ -1,19 +1,21 @@
#' Checks importing criteria for new atlas
#' Generates counts per million from provided raw counts
#' Import and process metadata and counts for a SingleCellExperiment object
#'
#' @param sce_obj A SingleCellExperiment object from RDS
#' @param cache_dir Optional character vector of length 1. A file path on
#' your local system to a directory (not a file) that will be used to store
#' `metadata.parquet`
#' @export
#' @return A metadata.parquet strip from SCE object.
#' @return A metadata.parquet strip from the SingleCellExperiment object.
#' Directories store counts and counts per million in the provided cache directory.
#' @importFrom assertthat assert_that
#' @importFrom checkmate check_tibble check_directory_exists check_set_equal check_true check_character check_subset check_file_exists
#' @importFrom dplyr tbl select
#' @importFrom checkmate check_true check_character check_subset
#' @importFrom dplyr select distinct pull
#' @importFrom cli cli_alert_info
#' @importFrom glue glue
#' @importFrom rlang .data
#' @importFrom SingleCellExperiment reducedDims rowData reducedDims<-
#' @importFrom S4Vectors metadata metadata<-
#' @importFrom SummarizedExperiment assay
#' @importFrom stringr str_detect
#' @examples
#' data(sample_sce_obj)
#' import_metadata_counts(sample_sce_obj,
Expand All @@ -25,91 +27,85 @@ import_metadata_counts <- function(
original_dir <- file.path(cache_dir, "original")

# Identify metadata and counts matrix
metadata_tbl <- sce_obj@metadata$data
counts_matrix <- sce_obj@assays@data$X
metadata_tbl <- metadata(sce_obj)$data
counts_matrix <- assay(sce_obj)

# Identify whether genes in SingleCellxExperiment object are in ensembl nomenclature
genes <- rowData(sce_obj) |> rownames()
assert_that(sce_obj |> inherits( "SingleCellExperiment"),
msg = "sce_obj is not identified as SingleCellExperiment object.")
assert_that(!str_detect(genes, "^ENSG%") |> all(),
msg = "Gene names in SingleCellExperiment object cannot contain Ensembl IDs.")
assert_that(all(counts_matrix >= 0),
msg = "Counts for SingleCellExperiment cannot be negative.")

# Convert to tibble if metadata_tbl is not a tibble
# Convert to tibble if not provided
metadata_tbl <- metadata_tbl |> as_tibble()

# Create file_id_db from dataset_id
metadata_tbl <-
metadata_tbl |> mutate(file_id_db = .data$dataset_id |> openssl::md5() |> as.character())
sce_obj@metadata$data <- metadata_tbl
metadata(sce_obj)$data <- metadata_tbl

# Remove existing reducedDim slot to enable get_SCE API functionality
if (length(names(SingleCellExperiment::reducedDims(sce_obj))) >0 ) {
SingleCellExperiment::reducedDims(sce_obj) <- NULL
if (length(names(reducedDims(sce_obj))) >0 ) {
reducedDims(sce_obj) <- NULL
}

# create original and cpm folders in cache directory if not exist (in order to append new counts to existing ones)
# Create original and cpm folders in the cache directory if not exist
if (!dir.exists(original_dir)) {
dir.create(cache_dir, "original", recursive = TRUE)
cache_dir |> file.path("original") |> dir.create(recursive = TRUE)
}

if (!dir.exists(file.path(cache_dir, "cpm"))) {
dir.create(cache_dir, "cpm", recursive = TRUE)
cache_dir |> file.path("cpm") |> dir.create(recursive = TRUE)
}

# existing metadata genes
genes <- get_metadata() |> head(1) |>
get_single_cell_experiment(cache_directory = cache_dir) |> rownames()

# check count H5 directory name not included in the cache directory original
# Check whether count H5 directory has been generated
all(!metadata_tbl$file_id_db %in% dir(original_dir)) |>
check_true() |>
assert_that(msg = "Count H5 directory name should not duplicate with that in cache directory")

counts_data <- sce_obj@assays@data$X
assert_that(inherits(sce_obj, "SingleCellExperiment"),
msg = "SingleCellExperiment Object is not created from metadata.")
assert_that(!"^ENS" %in% genes,
msg = "Gene names cannot contain Ensembl IDs.")
assert_that(all(counts_data >= 0),
msg = "Counts for SingleCellExperiment cannot be negative.")
assert_that(msg = "The filename for count assay (file_id_db) already exists in the cache directory.")

# check the metadata contains cell_, file_id_db, sample_ with correct types
# Check the metadata contains cell_, file_id_db, sample_ with correct types
check_true("cell_" %in% colnames(metadata_tbl))
check_true("file_id_db" %in% names(metadata_tbl))
pull(metadata_tbl, .data$cell_) |> class() |> check_character()
select(metadata_tbl, .data$file_id_db) |> class() |> check_character()
#metadata_tbl |> select(cell_, file_id_db) |> sapply(class) |> check_character()

# check cell_ values in metadata_tbl is unique
(anyDuplicated(metadata_tbl$cell_) == 0 ) |> assert_that(msg = "cell_ in the metadata must be unique")
# Check cell_ values in metadata_tbl is unique
(anyDuplicated(metadata_tbl$cell_) == 0 ) |> assert_that(msg = "Cell names (cell_) in the metadata must be unique.")

# check cell_ values are not duplicated when join with parquet
cells <- select(get_metadata(), .data$cell_) |> as_tibble()
(!any(metadata_tbl$cell_ %in% cells$cell_)) |> assert_that(msg = "cell_ in the metadata should not duplicate with that exists in API")
# Check cell_ values are not duplicated when join with parquet
cells <- select(get_metadata(cache_directory = cache_dir), .data$cell_) |> as_tibble()
(!any(metadata_tbl$cell_ %in% cells$cell_)) |>
assert_that(msg = "Cell names (cell_) should not clash with cells that already exist in the atlas.")

# check age_days is either -99 or greater than 365
# Check age_days is either -99 or greater than 365
if (any(colnames(metadata_tbl) == "age_days")) {
assert_that(all(metadata_tbl$age_days==-99 | metadata_tbl$age_days> 365),
msg = "age_days should be either -99 for unknown or greater than 365")
msg = "age_days should be either -99 for unknown or greater than 365.")
}

# check sex capitalisation then convert to lower case
# Check sex capitalisation then convert to lower case
if (any(colnames(metadata_tbl) == "sex")) {
metadata_tbl <- metadata_tbl |> mutate(sex = tolower(.data$sex))
dplyr::distinct(metadata_tbl, .data$sex) |> dplyr::pull() |> check_subset(c("female","male","unknown"))
distinct(metadata_tbl, .data$sex) |> pull(.data$sex) |> check_subset(c("female","male","unknown"))
}
counts_path <-
select(metadata_tbl, .data$file_id_db) |> mutate(
original_path = file.path(original_dir, basename(.data$file_id_db)),
cpm_path = file.path(cache_dir, "cpm", basename(.data$file_id_db))
)

# if checkpoints above pass, generate cpm
cli_alert_info("Generating cpm from {.path {metadata_tbl$file_id_db}}. ")

# Generate cpm from counts
cli_alert_info("Generating cpm from {.path {metadata_tbl$file_id_db}}. ")
get_counts_per_million(input_sce_obj = sce_obj, output_dir = counts_path$cpm_path, hd5_file_dir = counts_path$original_path)
saveHDF5SummarizedExperiment(sce_obj, counts_path$original_path, replace=TRUE)

cli_alert_info("cpm are generated in {.path {counts_path$cpm_path}}. ")

# check metadata sample file ID match the count file ID in cache directory
all(metadata_tbl |> pull(.data$file_id_db) %in% dir(original_dir)) |>
assert_that(msg = "The metadata sample file ID and the count file ID does not match")
assert_that(msg = "The filename for count assay, which matches the file_id_db column in the metadata, already exists in the cache directory.")

# convert metadata_tbl to parquet if above checkpoints pass
arrow::write_parquet(metadata_tbl, file.path(cache_dir, "metadata.parquet"))
Expand Down
1 change: 0 additions & 1 deletion man/get_default_cache_dir.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 3 additions & 5 deletions man/import_metadata_counts.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit b6ee0b2

Please sign in to comment.