update pipeline, 0.2.2

stemangiola · stemangiola · Apr 23, 2023 · Mar 28, 2023 · Apr 23, 2023 · Apr 23, 2023
commit 8b7e5d163d85e18ad553ec1635b689037ce70559
diff --git a/dev/DB2_files.R b/dev/DB2_files.R
diff --git a/dev/DB_files.R b/dev/DB_files.R
@@ -10,13 +10,13 @@ library(scMerge)
 library(glue)
 library(DelayedArray)
 library(HDF5Array)
-library(HCAquery)
+# library(CuratedAtlasQueryR)
 library(openssl)
 
 
 # CREATE MAKEFILE
 tab = "\t"
-root_directory = "/vast/scratch/users/mangiola.s/human_cell_atlas"
+root_directory = "/vast/projects/cellxgene_curated"
 splitted_light_data_directory = "/vast/projects/RCP/human_cell_atlas/splitted_light_data" #glue("{root_directory}/splitted_light_data")
 DB_data_directory = glue("{root_directory}/splitted_DB_data")
 gene_names = glue("{root_directory}/gene_names.rds")

diff --git a/dev/annotate_files.R b/dev/annotate_files.R
@@ -16,7 +16,7 @@ library(celldex)
 library(SingleR)
 library(glmGamPoi)
 source("utility.R")
-library(HCAquery)
+library(CuratedAtlasQueryR)
 library(BiocParallel)
 library(scuttle)
 

diff --git a/dev/annotation_harmonise.R b/dev/annotation_harmonise.R
diff --git a/dev/build_makefile_annotate_files.R b/dev/build_makefile_annotate_files.R
@@ -127,7 +127,7 @@ metadata_df |>
 	))  |>
 	pull(commands) |>
 	unlist() |>
-	write_lines(glue("~/PostDoc/HCAquery/dev/annotate_files.makeflow"))
+	write_lines(glue("~/PostDoc/CuratedAtlasQueryR/dev/annotate_files.makeflow"))
 
 
 
diff --git a/dev/get_gene_names.R b/dev/get_gene_names.R
@@ -3,23 +3,29 @@ library(SingleCellExperiment)
 library(tidyverse)
 library(purrr)
 library(glue)
-library(HCAquery)
+library(CuratedAtlasQueryR)
+library(HDF5Array)
+
+library(dbplyr)
+library(DBI)
+library(duckdb)
+
 
 # Read arguments
 args = commandArgs(trailingOnly=TRUE)
-root_directory = "/vast/projects/RCP/human_cell_atlas" # args[[1]]
-metadata_sql = glue("{root_directory}/metadata_annotated.sqlite")
+metadata_DB = "/vast/projects/cellxgene_curated/metadata_annotated_0.2.3.parquet"
+root_directory = "/vast/projects/cellxgene_curated" # args[[1]]
 raw_data_directory = glue("{root_directory}/splitted_data_0.2")
 
-
-
 samples =
-	# get_metadata(metadata_sql) |>
-	readRDS("/vast/projects/RCP/human_cell_atlas/metadata_annotated.rds") |>
-	distinct(file_id, .sample) |>
+  duckdb() |>
+  dbConnect(drv = _, read_only = TRUE) |>
+  tbl(metadata_DB) |>
+	distinct(file_id, sample_) |>
+  as_tibble() |> 
 	group_by(file_id) |>
 	slice(1) |>
-	pull(.sample)
+	pull(sample_)
 
 # Read gene names
 dir(raw_data_directory, full.names = TRUE) |>

diff --git a/dev/get_metadata.R b/dev/get_metadata.R
@@ -13,8 +13,7 @@ library(openssl)
 
 # # CREATE MAKEFILE
 # tab = "\t"
-# root_directory = "/vast/projects/RCP/human_cell_atlas"
-# # my_root_directory = "/vast/scratch/users/mangiola.s/human_cell_atlas"
+# root_directory = "/vast/projects/cellxgene_curated"
 # metadata_directory = glue("{root_directory}/metadata_0.2")
 # raw_data_directory = glue("{root_directory}/raw_data")
 # files_metadata = glue("{root_directory}/files_metadata.rds")
@@ -54,7 +53,7 @@ library(openssl)
 # 	glue("CATEGORY=merge_metadata\nMEMORY=80024\nCORES=1\nWALL_TIME=10000"),
 # 	glue("{metadata_path}:{paste(output_files_path, collapse = \" \")} {files_metadata}\n{tab}Rscript merge_metadata.R {paste(output_files_path, collapse = \" \")} {files_metadata} {metadata_path}")
 # )  |>
-# 	write_lines(glue("~/PostDoc/HCAquery/dev/get_metadata.makeflow"))
+# 	write_lines(glue("~/PostDoc/CuratedAtlasQueryR/dev/get_metadata.makeflow"))
 
 source("utility.R")
 

diff --git a/dev/light_files.R b/dev/light_files.R
@@ -108,7 +108,7 @@ library(HDF5Array)
 # 												 	))  |>
 # 	pull(commands) |>
 # 	unlist() |>
-# 	write_lines(glue("~/PostDoc/HCAquery/dev/light_files.makeflow"))
+# 	write_lines(glue("~/PostDoc/CuratedAtlasQueryR/dev/light_files.makeflow"))
 
 
 

diff --git a/dev/merge_metadata.R b/dev/merge_metadata.R
@@ -42,6 +42,28 @@ common_colnames =
 
 print(common_colnames)
 
+# # # Get the rest of uncommon metadata
+# uncommon_metadata =
+#   input_file_paths  |>
+#   enframe(value = "file") |>
+#   mutate(metadata_not_harmonised = imap(
+#     file,
+#     ~ .x %>%
+#       readRDS() |>
+#       select(-one_of(common_colnames), cell_ = .cell, file_id) |> select(cell_, file_id, everything()) |>
+#       mutate(file_id = file_id |> as.character())
+#   )) |>
+#   mutate(file_id = map_chr(metadata_not_harmonised, ~ .x |> distinct(file_id) |> pull(file_id))) |>
+#   select(-name, -file) |>
+#   select(file_id, metadata_not_harmonised) |>
+#   mutate(saved = map2(
+#    metadata_not_harmonised,  file_id,
+#     ~ .x %>%
+#      {print(.y); (.)} |>
+#      saveRDS(glue("/vast/projects/cellxgene_curated/metadata_non_harmonised_0.2/{.y}.rds"), compress = "xz")
+#   ))
+
+
 # Get all metadata
 
 metadata =

diff --git a/dev/metadata_cell_type.csv b/dev/metadata_cell_type.csv
@@ -302,10 +302,10 @@ contractile cell,
 cord blood hematopoietic stem cell,
 cortical cell of adrenal gland,
 cultured cell,
-DN3 thymocyte,
-DN4 thymocyte,
-double negative thymocyte,
-"double-positive, alpha-beta thymocyte",
+DN3 thymocyte,immune
+DN4 thymocyte,immune
+double negative thymocyte,immune
+"double-positive, alpha-beta thymocyte",immune
 duodenum glandular cell,
 early promyelocyte,
 embryonic stem cell,
@@ -316,16 +316,16 @@ enterocyte,
 enterocyte of epithelium of large intestine,
 enterocyte of epithelium of small intestine,
 enteroendocrine cell,
-enucleate erythrocyte,
+enucleate erythrocyte,immune
 enucleated reticulocyte,
 epicardial adipocyte,
 epidermal cell,
-epidermal Langerhans cell,
-erythroblast,
-erythrocyte,
-erythroid lineage cell,
-erythroid progenitor cell,
-"erythroid progenitor cell, mammalian",
+epidermal Langerhans cell,immune
+erythroblast,immune
+erythrocyte,immune
+erythroid lineage cell,immune
+erythroid progenitor cell,immune
+"erythroid progenitor cell, mammalian",immune
 eukaryotic cell,
 extravillous trophoblast,
 eye photoreceptor cell,
@@ -363,7 +363,7 @@ kidney granular cell,
 kidney interstitial cell,
 kidney interstitial fibroblast,
 Kupffer cell,
-Langerhans cell,
+Langerhans cell,immune
 large intestine goblet cell,
 late promyelocyte,
 lens fiber cell,
@@ -472,7 +472,7 @@ syncytiotrophoblast cell,
 taste receptor cell,
 tendon cell,
 theca cell,
-thymocyte,
+thymocyte,immune
 thyroid follicular cell,
 tongue muscle cell,
 tracheal goblet cell,
@@ -492,4 +492,8 @@ vasa recta ascending limb cell,
 vasa recta descending limb cell,
 vascular leptomeningeal cell,
 vascular lymphangioblast,
-ventricular cardiac muscle cell,
+ventricular cardiac muscle cell,
+enterocyte of epithelium proper of ileum,
+smooth muscle fiber of ileum,
+ileal goblet cell,
+enteroendocrine cell of small intestine,
diff --git a/dev/reannotate_cd4.R b/dev/reannotate_cd4.R
diff --git a/dev/reannotate_cd8.R b/dev/reannotate_cd8.R
diff --git a/dev/reannotate_monocytes.R b/dev/reannotate_monocytes.R
diff --git a/dev/scale_files.R b/dev/scale_files.R
@@ -19,16 +19,16 @@ library(glmGamPoi)
 
 # # # CREATE MAKEFILE
 # tab = "\t"
-# root_directory = "/vast/projects/RCP/human_cell_atlas"
-# split_data_directory = glue("{root_directory}/splitted_DB2_data")
-# scaled_data_directory = glue("{root_directory}/splitted_DB2_data_scaled")
-#
+# root_directory = "/vast/projects/cellxgene_curated"
+# split_data_directory = glue("{root_directory}/splitted_DB2_data_0.2")
+# scaled_data_directory = glue("{root_directory}/splitted_DB2_data_scaled_0.2.1")
+# 
 # dir(split_data_directory) |>
 # 	map( ~ glue("{scaled_data_directory}/{.x}:{split_data_directory}/{.x}\n{tab}Rscript scale_files.R {split_data_directory}/{.x} {scaled_data_directory}/{.x}")
 # ) |>
-# 	prepend(glue("CATEGORY=scale_data\nMEMORY=100000\nCORES=2\nWALL_TIME=30000")) |>
+# 	prepend(glue("CATEGORY=scale_data\nMEMORY=10000\nCORES=2\nWALL_TIME=30000")) |>
 # 	unlist()  |>
-# 	write_lines(glue("~/PostDoc/HCAquery/dev/scale_files.makeflow"))
+# 	write_lines(glue("~/PostDoc/CuratedAtlasQueryR/dev/scale_files.makeflow"))
 
 
 
@@ -46,7 +46,8 @@ output_file |>  dirname() |> dir.create( showWarnings = FALSE, recursive = TRUE)
 data = loadHDF5SummarizedExperiment(input_file	)
 
 # Avoid completely empty cells
-which_to_select = which(colSums(data@assays@data$X) >0)
+col_sums = colSums(data@assays@data$X)
+which_to_select = which(col_sums >0 & col_sums < Inf)
 
 sce = SingleCellExperiment(list(counts_per_million = scuttle::calculateCPM(data[,which_to_select ,drop=FALSE ], assay.type = "X")))
 rownames(sce) = rownames(data[,which_to_select  ])