add suggestions and optimise code from code review

stemangiola · Mar 26, 2024 · b6ee0b2 · b6ee0b2
1 parent 506ee5a
commit b6ee0b2
Show file tree

Hide file tree

Showing 6 changed files with 66 additions and 75 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -16,23 +16,27 @@ importFrom(DBI,dbDisconnect)
 importFrom(HDF5Array,HDF5RealizationSink)
 importFrom(HDF5Array,loadHDF5SummarizedExperiment)
 importFrom(HDF5Array,saveHDF5SummarizedExperiment)
+importFrom(S4Vectors,"metadata<-")
 importFrom(S4Vectors,DataFrame)
+importFrom(S4Vectors,metadata)
 importFrom(Seurat,as.SingleCellExperiment)
 importFrom(SeuratObject,as.Seurat)
 importFrom(SeuratObject,as.sparse)
+importFrom(SingleCellExperiment,"reducedDims<-")
 importFrom(SingleCellExperiment,SingleCellExperiment)
 importFrom(SingleCellExperiment,combineCols)
+importFrom(SingleCellExperiment,reducedDims)
+importFrom(SingleCellExperiment,rowData)
 importFrom(SummarizedExperiment,"assayNames<-")
+importFrom(SummarizedExperiment,"assays<-")
 importFrom(SummarizedExperiment,"colData<-")
+importFrom(SummarizedExperiment,assay)
+importFrom(SummarizedExperiment,assays)
 importFrom(SummarizedExperiment,colData)
 importFrom(assertthat,assert_that)
 importFrom(assertthat,has_name)
 importFrom(checkmate,check_character)
-importFrom(checkmate,check_directory_exists)
-importFrom(checkmate,check_file_exists)
-importFrom(checkmate,check_set_equal)
 importFrom(checkmate,check_subset)
-importFrom(checkmate,check_tibble)
 importFrom(checkmate,check_true)
 importFrom(cli,cli_abort)
 importFrom(cli,cli_alert_info)
@@ -43,6 +47,7 @@ importFrom(dbplyr,remote_con)
 importFrom(dbplyr,sql)
 importFrom(dplyr,as_tibble)
 importFrom(dplyr,collect)
+importFrom(dplyr,distinct)
 importFrom(dplyr,filter)
 importFrom(dplyr,group_by)
 importFrom(dplyr,inner_join)
@@ -75,6 +80,7 @@ importFrom(purrr,set_names)
 importFrom(purrr,walk)
 importFrom(rlang,.data)
 importFrom(stats,setNames)
+importFrom(stringr,str_detect)
 importFrom(stringr,str_remove_all)
 importFrom(stringr,str_replace_all)
 importFrom(tibble,column_to_rownames)

diff --git a/R/counts.R b/R/counts.R
@@ -219,9 +219,7 @@ group_to_sce <- function(i, df, dir_prefix, features) {
 
     if (length(cells) < nrow(df)){
         single_line_str(
-            "Some cells were filtered out while loading {head(df$file_id_db, 1)}
-            because of extremely low counts. The
-            number of cells in the SingleCellExperiment will be less than the
+            "The number of cells in the SingleCellExperiment will be less than the
             number of cells you have selected from the metadata.
             Are cell IDs duplicated? Or, do cell IDs correspond to the counts file?
             "

diff --git a/R/counts_per_million.R b/R/counts_per_million.R
@@ -1,48 +1,42 @@
-#' Generating counts per million from SingleCellExperiment object
+#' Generating counts per million from a SingleCellExperiment object
 #'
 #' @param input_sce_obj A SingleCellExperiment object read from RDS
 #' @param output_dir A character vector of counts per million path
 #' @param hd5_file_dir A character vector of HDF5 file path after converting from RDS
 #' @return A directory stores counts per million 
 #' @importFrom SingleCellExperiment SingleCellExperiment
-#' @importFrom dplyr tbl
-#' @importFrom glue glue
+#' @importFrom SummarizedExperiment assay assays assays<-
 #' @importFrom HDF5Array saveHDF5SummarizedExperiment
 #' @importFrom purrr map
 #' @noRd
 get_counts_per_million <- function(input_sce_obj, output_dir, hd5_file_dir) {
-
-  # Create directories
-  output_dir |>  dirname() |> dir.create( showWarnings = FALSE, recursive = TRUE)
 
   # Save SCE to the cache directory counts folder
   input_sce_obj |> saveHDF5SummarizedExperiment(hd5_file_dir)
 
   data <- input_sce_obj
 
   # Avoid completely empty cells
-  col_sums <- colSums(as.matrix(data@assays@data$X))
-  which_to_select <- which(col_sums >0 & col_sums < Inf)
+  col_sums <- colSums(as.matrix(assay(data)))
+  selected_cols <- which(col_sums >0 & col_sums < Inf)
 
-  sce <- SingleCellExperiment(list(counts_per_million = scuttle::calculateCPM(data[,which_to_select ,drop=FALSE ], assay.type = "X")))
-  rownames(sce) <- rownames(data[,which_to_select  ])
-  colnames(sce) <- colnames(data[,which_to_select  ])
+  sce <- SingleCellExperiment(list(counts_per_million = scuttle::calculateCPM(data[,selected_cols ,drop=FALSE ], assay.type = "X")))
+  rownames(sce) <- rownames(data[,selected_cols  ])
+  colnames(sce) <- colnames(data[,selected_cols  ])
 
   # Avoid scaling zeros
-  which_to_select_not <- setdiff(1:ncol(data), which_to_select)
-  sce_zero <- SingleCellExperiment(list(counts_per_million = data@assays@data$X[,which_to_select_not ,drop=FALSE ]))
-  rownames(sce_zero) <- rownames(data[,which_to_select_not  ])
-  colnames(sce_zero) <- colnames(data[,which_to_select_not ])
+  sce_zero <- SingleCellExperiment(list(counts_per_million = assay(data)[, !selected_cols ,drop=FALSE ]))
+  rownames(sce_zero) <- rownames(data[, !selected_cols  ])
+  colnames(sce_zero) <- colnames(data[, !selected_cols ])
 
   sce <- sce |> cbind(sce_zero)
 
   sce <- sce[,colnames(data)]
 
   rm(data)
-  gc()
 
   # Check if there is a memory issue 
-  SummarizedExperiment::assays(sce) <- SummarizedExperiment::assays(sce) |> map(DelayedArray::realize)
+  assays(sce) <- assays(sce) |> map(DelayedArray::realize)
 
   sce |> saveHDF5SummarizedExperiment(output_dir)
 } 

diff --git a/R/import_metadata_and_counts.R b/R/import_metadata_and_counts.R
@@ -1,19 +1,21 @@
-#' Checks importing criteria for new atlas 
-#' Generates counts per million from provided raw counts
+#' Import and process metadata and counts for a SingleCellExperiment object
 #'
 #' @param sce_obj A SingleCellExperiment object from RDS
 #' @param cache_dir Optional character vector of length 1. A file path on
 #'   your local system to a directory (not a file) that will be used to store
 #'   `metadata.parquet`
 #' @export
-#' @return A metadata.parquet strip from SCE object. 
+#' @return A metadata.parquet strip from the SingleCellExperiment object. 
 #' Directories store counts and counts per million in the provided cache directory.
 #' @importFrom assertthat assert_that
-#' @importFrom checkmate check_tibble check_directory_exists check_set_equal check_true check_character check_subset check_file_exists
-#' @importFrom dplyr tbl select
+#' @importFrom checkmate check_true check_character check_subset
+#' @importFrom dplyr select distinct pull
 #' @importFrom cli cli_alert_info
-#' @importFrom glue glue
 #' @importFrom rlang .data
+#' @importFrom SingleCellExperiment reducedDims rowData reducedDims<-
+#' @importFrom S4Vectors metadata metadata<-
+#' @importFrom SummarizedExperiment assay
+#' @importFrom stringr str_detect
 #' @examples
 #' data(sample_sce_obj)
 #' import_metadata_counts(sample_sce_obj,
@@ -25,91 +27,85 @@ import_metadata_counts <- function(
   original_dir <- file.path(cache_dir, "original")
 
   # Identify metadata and counts matrix
-  metadata_tbl <- sce_obj@metadata$data
-  counts_matrix <- sce_obj@assays@data$X
+  metadata_tbl <- metadata(sce_obj)$data
+  counts_matrix <- assay(sce_obj)
+
+  # Identify whether genes in SingleCellxExperiment object are in ensembl nomenclature
+  genes <- rowData(sce_obj) |> rownames()
+  assert_that(sce_obj |> inherits( "SingleCellExperiment"),
+              msg = "sce_obj is not identified as SingleCellExperiment object.")
+  assert_that(!str_detect(genes, "^ENSG%") |> all(), 
+              msg = "Gene names in SingleCellExperiment object cannot contain Ensembl IDs.")
+  assert_that(all(counts_matrix >= 0),
+              msg = "Counts for SingleCellExperiment cannot be negative.")
 
-  # Convert to tibble if metadata_tbl is not a tibble
+  # Convert to tibble if not provided
   metadata_tbl <- metadata_tbl |> as_tibble()
 
   # Create file_id_db from dataset_id
   metadata_tbl <-
     metadata_tbl |> mutate(file_id_db = .data$dataset_id |> openssl::md5() |> as.character())
-  sce_obj@metadata$data <- metadata_tbl
+  metadata(sce_obj)$data <- metadata_tbl
 
   # Remove existing reducedDim slot to enable get_SCE API functionality 
-  if (length(names(SingleCellExperiment::reducedDims(sce_obj))) >0 ) {
-    SingleCellExperiment::reducedDims(sce_obj) <- NULL
+  if (length(names(reducedDims(sce_obj))) >0 ) {
+    reducedDims(sce_obj) <- NULL
   }
 
-  # create original and cpm folders in cache directory if not exist (in order to append new counts to existing ones)
+  # Create original and cpm folders in the cache directory if not exist
   if (!dir.exists(original_dir)) {
-    dir.create(cache_dir, "original", recursive = TRUE)
+    cache_dir |> file.path("original") |> dir.create(recursive = TRUE)
   }
 
   if (!dir.exists(file.path(cache_dir, "cpm"))) {
-    dir.create(cache_dir, "cpm", recursive = TRUE)
+    cache_dir |> file.path("cpm") |> dir.create(recursive = TRUE)
   }
 
-  # existing metadata genes
-  genes <- get_metadata() |> head(1) |> 
-    get_single_cell_experiment(cache_directory = cache_dir) |> rownames()
-
-  # check count H5 directory name not included in the cache directory original
+  # Check whether count H5 directory has been generated
   all(!metadata_tbl$file_id_db %in% dir(original_dir)) |>
     check_true() |>
-    assert_that(msg = "Count H5 directory name should not duplicate with that in cache directory")
-
-  counts_data <- sce_obj@assays@data$X
-  assert_that(inherits(sce_obj, "SingleCellExperiment"),
-              msg = "SingleCellExperiment Object is not created from metadata.")
-  assert_that(!"^ENS" %in% genes,
-              msg = "Gene names cannot contain Ensembl IDs.")
-  assert_that(all(counts_data >= 0),
-              msg = "Counts for SingleCellExperiment cannot be negative.")
+    assert_that(msg = "The filename for count assay (file_id_db) already exists in the cache directory.")
 
-  # check the metadata contains cell_, file_id_db, sample_ with correct types
+  # Check the metadata contains cell_, file_id_db, sample_ with correct types
   check_true("cell_" %in% colnames(metadata_tbl))
   check_true("file_id_db" %in% names(metadata_tbl)) 
   pull(metadata_tbl, .data$cell_) |> class() |> check_character()
   select(metadata_tbl, .data$file_id_db) |> class() |> check_character()
-  #metadata_tbl |> select(cell_, file_id_db) |> sapply(class) |> check_character()
 
-  # check cell_ values in metadata_tbl is unique
-  (anyDuplicated(metadata_tbl$cell_) == 0 ) |> assert_that(msg = "cell_ in the metadata must be unique")
+  # Check cell_ values in metadata_tbl is unique
+  (anyDuplicated(metadata_tbl$cell_) == 0 ) |> assert_that(msg = "Cell names (cell_) in the metadata must be unique.")
 
-  # check cell_ values are not duplicated when join with parquet
-  cells <- select(get_metadata(), .data$cell_) |> as_tibble()
-  (!any(metadata_tbl$cell_ %in% cells$cell_)) |> assert_that(msg = "cell_ in the metadata should not duplicate with that exists in API")
+  # Check cell_ values are not duplicated when join with parquet
+  cells <- select(get_metadata(cache_directory = cache_dir), .data$cell_) |> as_tibble()
+  (!any(metadata_tbl$cell_ %in% cells$cell_)) |> 
+    assert_that(msg = "Cell names (cell_) should not clash with cells that already exist in the atlas.")
 
-  # check age_days is either -99 or greater than 365
+  # Check age_days is either -99 or greater than 365
   if (any(colnames(metadata_tbl) == "age_days")) {
     assert_that(all(metadata_tbl$age_days==-99 | metadata_tbl$age_days> 365),
-                msg = "age_days should be either -99 for unknown or greater than 365")
+                msg = "age_days should be either -99 for unknown or greater than 365.")
   }
 
-  # check sex capitalisation then convert to lower case 
+  # Check sex capitalisation then convert to lower case 
   if (any(colnames(metadata_tbl) == "sex")) {
     metadata_tbl <- metadata_tbl |> mutate(sex = tolower(.data$sex))
-    dplyr::distinct(metadata_tbl, .data$sex) |> dplyr::pull() |> check_subset(c("female","male","unknown"))
+    distinct(metadata_tbl, .data$sex) |> pull(.data$sex) |> check_subset(c("female","male","unknown"))
   }
   counts_path <-
     select(metadata_tbl, .data$file_id_db) |> mutate(
       original_path = file.path(original_dir, basename(.data$file_id_db)),
       cpm_path = file.path(cache_dir, "cpm", basename(.data$file_id_db))
     )
 
-  # if checkpoints above pass, generate cpm
-  cli_alert_info("Generating cpm from {.path {metadata_tbl$file_id_db}}. ")
-
   # Generate cpm from counts
+  cli_alert_info("Generating cpm from {.path {metadata_tbl$file_id_db}}. ")
   get_counts_per_million(input_sce_obj = sce_obj, output_dir = counts_path$cpm_path, hd5_file_dir = counts_path$original_path)
   saveHDF5SummarizedExperiment(sce_obj, counts_path$original_path, replace=TRUE)
-
   cli_alert_info("cpm are generated in {.path {counts_path$cpm_path}}. ")
 
   # check metadata sample file ID match the count file ID in cache directory
   all(metadata_tbl |> pull(.data$file_id_db) %in% dir(original_dir)) |> 
-    assert_that(msg = "The metadata sample file ID and the count file ID does not match")
+    assert_that(msg = "The filename for count assay, which matches the file_id_db column in the metadata, already exists in the cache directory.")
 
   # convert metadata_tbl to parquet if above checkpoints pass
   arrow::write_parquet(metadata_tbl, file.path(cache_dir, "metadata.parquet"))

diff --git a/man/get_default_cache_dir.Rd b/man/get_default_cache_dir.Rd
diff --git a/man/import_metadata_counts.Rd b/man/import_metadata_counts.Rd