Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow h5 seurat for anndata #111

Merged
merged 3 commits into from
Apr 23, 2023
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
update pipeline, 0.2.2
stemangiola committed Apr 23, 2023
commit 8b7e5d163d85e18ad553ec1635b689037ce70559
206 changes: 140 additions & 66 deletions dev/DB2_files.R

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions dev/DB_files.R
Original file line number Diff line number Diff line change
@@ -10,13 +10,13 @@ library(scMerge)
library(glue)
library(DelayedArray)
library(HDF5Array)
library(HCAquery)
# library(CuratedAtlasQueryR)
library(openssl)


# CREATE MAKEFILE
# Constants and paths used by the makeflow generation in this script.

# Literal tab character.
# NOTE(review): presumably the recipe-line indent for the generated makeflow
# rules, as in the other dev scripts -- confirm against the code further down.
tab = "\t"

# Root of the curated data. It is assigned exactly once: a preceding
# assignment to a scratch-space path was overwritten before anything read it,
# so it was a dead store and has been dropped.
root_directory = "/vast/projects/cellxgene_curated"

# NOTE(review): this directory is hard-coded under a different root than
# root_directory; the root-relative form is kept in the trailing comment.
splitted_light_data_directory = "/vast/projects/RCP/human_cell_atlas/splitted_light_data" #glue("{root_directory}/splitted_light_data")

# Locations derived from root_directory.
DB_data_directory = glue("{root_directory}/splitted_DB_data")
gene_names = glue("{root_directory}/gene_names.rds")
2 changes: 1 addition & 1 deletion dev/annotate_files.R
Original file line number Diff line number Diff line change
@@ -16,7 +16,7 @@ library(celldex)
library(SingleR)
library(glmGamPoi)
source("utility.R")
library(HCAquery)
library(CuratedAtlasQueryR)
library(BiocParallel)
library(scuttle)

403 changes: 373 additions & 30 deletions dev/annotation_harmonise.R

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dev/build_makefile_annotate_files.R
Original file line number Diff line number Diff line change
@@ -127,7 +127,7 @@ metadata_df |>
)) |>
pull(commands) |>
unlist() |>
write_lines(glue("~/PostDoc/HCAquery/dev/annotate_files.makeflow"))
write_lines(glue("~/PostDoc/CuratedAtlasQueryR/dev/annotate_files.makeflow"))



24 changes: 15 additions & 9 deletions dev/get_gene_names.R
Original file line number Diff line number Diff line change
@@ -3,23 +3,29 @@ library(SingleCellExperiment)
library(tidyverse)
library(purrr)
library(glue)
library(HCAquery)
library(CuratedAtlasQueryR)
library(HDF5Array)

library(dbplyr)
library(DBI)
library(duckdb)


# Read arguments
# Command-line arguments are captured here, but every path below is
# hard-coded; the `args[[1]]` alternative survives only as a trailing comment.
args = commandArgs(trailingOnly=TRUE)
# NOTE(review): root_directory is assigned twice in this section. This first
# value is the one interpolated into metadata_sql on the next line.
root_directory = "/vast/projects/RCP/human_cell_atlas" # args[[1]]
metadata_sql = glue("{root_directory}/metadata_annotated.sqlite")
# Parquet metadata file; this is the table source used when building `samples`.
metadata_DB = "/vast/projects/cellxgene_curated/metadata_annotated_0.2.3.parquet"
# Second assignment: from here on root_directory is the curated project root,
# which is the value raw_data_directory is built from.
root_directory = "/vast/projects/cellxgene_curated" # args[[1]]
raw_data_directory = glue("{root_directory}/splitted_data_0.2")



# Pick one sample identifier per file from the parquet metadata.
#
# The metadata is queried through a read-only DuckDB connection. The
# connection is held in a variable (rather than created anonymously inside the
# pipe) so that it can be closed once the result is in memory.
metadata_connection = dbConnect(duckdb(), read_only = TRUE)

samples =
  metadata_connection |>
  tbl(metadata_DB) |>
  distinct(file_id, sample_) |>
  # Collect into a local tibble so the grouping and slicing below run in R
  as_tibble() |>
  group_by(file_id) |>
  # NOTE(review): no ordering is imposed before this point, so which sample is
  # kept for a file depends on the row order returned by the query -- confirm
  # that any sample of the file is acceptable here.
  slice(1) |>
  pull(sample_)

# The result is fully materialised above, so the database can be released.
dbDisconnect(metadata_connection, shutdown = TRUE)

# Read gene names
dir(raw_data_directory, full.names = TRUE) |>
5 changes: 2 additions & 3 deletions dev/get_metadata.R
Original file line number Diff line number Diff line change
@@ -13,8 +13,7 @@ library(openssl)

# # CREATE MAKEFILE
# tab = "\t"
# root_directory = "/vast/projects/RCP/human_cell_atlas"
# # my_root_directory = "/vast/scratch/users/mangiola.s/human_cell_atlas"
# root_directory = "/vast/projects/cellxgene_curated"
# metadata_directory = glue("{root_directory}/metadata_0.2")
# raw_data_directory = glue("{root_directory}/raw_data")
# files_metadata = glue("{root_directory}/files_metadata.rds")
@@ -54,7 +53,7 @@ library(openssl)
# glue("CATEGORY=merge_metadata\nMEMORY=80024\nCORES=1\nWALL_TIME=10000"),
# glue("{metadata_path}:{paste(output_files_path, collapse = \" \")} {files_metadata}\n{tab}Rscript merge_metadata.R {paste(output_files_path, collapse = \" \")} {files_metadata} {metadata_path}")
# ) |>
# write_lines(glue("~/PostDoc/HCAquery/dev/get_metadata.makeflow"))
# write_lines(glue("~/PostDoc/CuratedAtlasQueryR/dev/get_metadata.makeflow"))

source("utility.R")

2 changes: 1 addition & 1 deletion dev/light_files.R
Original file line number Diff line number Diff line change
@@ -108,7 +108,7 @@ library(HDF5Array)
# )) |>
# pull(commands) |>
# unlist() |>
# write_lines(glue("~/PostDoc/HCAquery/dev/light_files.makeflow"))
# write_lines(glue("~/PostDoc/CuratedAtlasQueryR/dev/light_files.makeflow"))



22 changes: 22 additions & 0 deletions dev/merge_metadata.R
Original file line number Diff line number Diff line change
@@ -42,6 +42,28 @@ common_colnames =

print(common_colnames)

# # # Get the rest of uncommon metadata
# uncommon_metadata =
# input_file_paths |>
# enframe(value = "file") |>
# mutate(metadata_not_harmonised = imap(
# file,
# ~ .x %>%
# readRDS() |>
# select(-one_of(common_colnames), cell_ = .cell, file_id) |> select(cell_, file_id, everything()) |>
# mutate(file_id = file_id |> as.character())
# )) |>
# mutate(file_id = map_chr(metadata_not_harmonised, ~ .x |> distinct(file_id) |> pull(file_id))) |>
# select(-name, -file) |>
# select(file_id, metadata_not_harmonised) |>
# mutate(saved = map2(
# metadata_not_harmonised, file_id,
# ~ .x %>%
# {print(.y); (.)} |>
# saveRDS(glue("/vast/projects/cellxgene_curated/metadata_non_harmonised_0.2/{.y}.rds"), compress = "xz")
# ))


# Get all metadata

metadata =
32 changes: 18 additions & 14 deletions dev/metadata_cell_type.csv
Original file line number Diff line number Diff line change
@@ -302,10 +302,10 @@ contractile cell,
cord blood hematopoietic stem cell,
cortical cell of adrenal gland,
cultured cell,
DN3 thymocyte,
DN4 thymocyte,
double negative thymocyte,
"double-positive, alpha-beta thymocyte",
DN3 thymocyte,immune
DN4 thymocyte,immune
double negative thymocyte,immune
"double-positive, alpha-beta thymocyte",immune
duodenum glandular cell,
early promyelocyte,
embryonic stem cell,
@@ -316,16 +316,16 @@ enterocyte,
enterocyte of epithelium of large intestine,
enterocyte of epithelium of small intestine,
enteroendocrine cell,
enucleate erythrocyte,
enucleate erythrocyte,immune
enucleated reticulocyte,
epicardial adipocyte,
epidermal cell,
epidermal Langerhans cell,
erythroblast,
erythrocyte,
erythroid lineage cell,
erythroid progenitor cell,
"erythroid progenitor cell, mammalian",
epidermal Langerhans cell,immune
erythroblast,immune
erythrocyte,immune
erythroid lineage cell,immune
erythroid progenitor cell,immune
"erythroid progenitor cell, mammalian",immune
eukaryotic cell,
extravillous trophoblast,
eye photoreceptor cell,
@@ -363,7 +363,7 @@ kidney granular cell,
kidney interstitial cell,
kidney interstitial fibroblast,
Kupffer cell,
Langerhans cell,
Langerhans cell,immune
large intestine goblet cell,
late promyelocyte,
lens fiber cell,
@@ -472,7 +472,7 @@ syncytiotrophoblast cell,
taste receptor cell,
tendon cell,
theca cell,
thymocyte,
thymocyte,immune
thyroid follicular cell,
tongue muscle cell,
tracheal goblet cell,
@@ -492,4 +492,8 @@ vasa recta ascending limb cell,
vasa recta descending limb cell,
vascular leptomeningeal cell,
vascular lymphangioblast,
ventricular cardiac muscle cell,
ventricular cardiac muscle cell,
enterocyte of epithelium proper of ileum,
smooth muscle fiber of ileum,
ileal goblet cell,
enteroendocrine cell of small intestine,
33 changes: 0 additions & 33 deletions dev/reannotate_cd4.R

This file was deleted.

34 changes: 0 additions & 34 deletions dev/reannotate_cd8.R

This file was deleted.

34 changes: 0 additions & 34 deletions dev/reannotate_monocytes.R

This file was deleted.

15 changes: 8 additions & 7 deletions dev/scale_files.R
Original file line number Diff line number Diff line change
@@ -19,16 +19,16 @@ library(glmGamPoi)

# # # CREATE MAKEFILE
# tab = "\t"
# root_directory = "/vast/projects/RCP/human_cell_atlas"
# split_data_directory = glue("{root_directory}/splitted_DB2_data")
# scaled_data_directory = glue("{root_directory}/splitted_DB2_data_scaled")
#
# root_directory = "/vast/projects/cellxgene_curated"
# split_data_directory = glue("{root_directory}/splitted_DB2_data_0.2")
# scaled_data_directory = glue("{root_directory}/splitted_DB2_data_scaled_0.2.1")
#
# dir(split_data_directory) |>
# map( ~ glue("{scaled_data_directory}/{.x}:{split_data_directory}/{.x}\n{tab}Rscript scale_files.R {split_data_directory}/{.x} {scaled_data_directory}/{.x}")
# ) |>
# prepend(glue("CATEGORY=scale_data\nMEMORY=100000\nCORES=2\nWALL_TIME=30000")) |>
# prepend(glue("CATEGORY=scale_data\nMEMORY=10000\nCORES=2\nWALL_TIME=30000")) |>
# unlist() |>
# write_lines(glue("~/PostDoc/HCAquery/dev/scale_files.makeflow"))
# write_lines(glue("~/PostDoc/CuratedAtlasQueryR/dev/scale_files.makeflow"))



@@ -46,7 +46,8 @@ output_file |> dirname() |> dir.create( showWarnings = FALSE, recursive = TRUE)
# Load the saved experiment from the input path
data = loadHDF5SummarizedExperiment(input_file )

# Avoid completely empty cells, and cells whose total is not finite.
# The column totals of assay X are computed once and reused for both
# conditions instead of running a second pass over the assay.
col_sums = colSums(data@assays@data$X)
which_to_select = which(col_sums >0 & col_sums < Inf)

# Counts per million, computed on the retained columns only
sce = SingleCellExperiment(list(counts_per_million = scuttle::calculateCPM(data[,which_to_select ,drop=FALSE ], assay.type = "X")))
# Carry the row names of the input over to the new object
rownames(sce) = rownames(data[,which_to_select ])