From 118496a2fb352990a62de3e881995af16c490106 Mon Sep 17 00:00:00 2001
From: Ally Hawkins <ally.hawkins@ccdatalab.org>
Date: Tue, 21 Jan 2025 12:26:06 -0600
Subject: [PATCH] use geneset tsv

---
 .../cell-type-ewings/references/README.md     |  3 +
 .../scripts/aucell-ews-signatures/01-aucell.R | 62 +++++++++++++------
 2 files changed, 45 insertions(+), 20 deletions(-)

diff --git a/analyses/cell-type-ewings/references/README.md b/analyses/cell-type-ewings/references/README.md
index e9b0ef508..9f536e8ba 100644
--- a/analyses/cell-type-ewings/references/README.md
+++ b/analyses/cell-type-ewings/references/README.md
@@ -89,3 +89,6 @@ Wrenn _et al._ also used found that the following additional gene sets were high
 
 - [`HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION`](https://www.gsea-msigdb.org/gsea/msigdb/human/geneset/HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION.html)
 - [`REGULATION_OF_EXTRACELLULAR_MATRIX_ORGANIZATION`](https://www.gsea-msigdb.org/gsea/msigdb/human/geneset/GOBP_REGULATION_OF_EXTRACELLULAR_MATRIX_ORGANIZATION.html)
+
+The `msigdb-gene-sets.tsv` file contains a list of any gene sets that are from `MSigDB` that may be useful for analysis (such as those mentioned above). 
+All gene sets in this file are used when running `AUCell` as part of `run-aucell-ews-signatures.sh`. 
diff --git a/analyses/cell-type-ewings/scripts/aucell-ews-signatures/01-aucell.R b/analyses/cell-type-ewings/scripts/aucell-ews-signatures/01-aucell.R
index 0d2f8e123..0678b480c 100644
--- a/analyses/cell-type-ewings/scripts/aucell-ews-signatures/01-aucell.R
+++ b/analyses/cell-type-ewings/scripts/aucell-ews-signatures/01-aucell.R
@@ -24,6 +24,13 @@ option_list <- list(
       All TSV files in the directory will be used and must contain the `ensembl_gene_id` column.
       File names will be used as the name of the gene set."
   ),
+  make_option(
+    opt_str = c("--msigdb_genesets"),
+    type = "character", 
+    default = file.path(project_root, "references", "msigdb-gene-sets.tsv"),
+    help = "Path to TSV file containing all gene sets from MSigDB to use with AUCell.
+      Must contain columns with `name`, `geneset`, `category`, and `subcategory`."
+  ), 
   make_option(
     opt_str = c("--max_rank_threshold"),
     type = "double",
@@ -58,6 +65,7 @@ opt <- parse_args(OptionParser(option_list = option_list))
 # make sure input files exist
 stopifnot(
   "sce file does not exist" = file.exists(opt$sce_file),
+  "MSigDB gene set file does not exist" = file.exists(opt$msigdb_genesets),
   "max_rank_threshold must be between 0 and 1" = (opt$max_rank_threshold <= 1 & opt$max_rank_threshold > 0)
 )
 
@@ -87,6 +95,9 @@ sce <- readr::read_rds(opt$sce_file)
 genes_to_remove <- rowData(sce)$detected > 0 
 filtered_sce <- sce[genes_to_remove , ]
 
+# read in gene sets to use with msigdb
+genesets_df <- readr::read_tsv(opt$msigdb_genesets)
+
 # Prep gene sets ---------------------------------------------------------------
 
 # get list of files with custom gene sets to use 
@@ -103,30 +114,41 @@ custom_genes_list <- gene_set_files |>
       unique()
   })
 
-# grab msigdb gene sets 
-ews_gene_sets <- c(
-  "staege" = "STAEGE_EWING_FAMILY_TUMOR",
-  "miyagawa_up" = "MIYAGAWA_TARGETS_OF_EWSR1_ETS_FUSIONS_UP",
-  "miyagawa_down" = "MIYAGAWA_TARGETS_OF_EWSR1_ETS_FUSIONS_DN",
-  "zhang"= "ZHANG_TARGETS_OF_EWSR1_FLI1_FUSION", 
-  "riggi_up" = "RIGGI_EWING_SARCOMA_PROGENITOR_UP",
-  "riggi_down" = "RIGGI_EWING_SARCOMA_PROGENITOR_DN",
-  "kinsey_up" = "KINSEY_TARGETS_OF_EWSR1_FLII_FUSION_UP",
-  "kinsey_down"= "KINSEY_TARGETS_OF_EWSR1_FLII_FUSION_DN"
-)
+# get list of categories that we need to grab from msigdb
+category_list <- genesets_df |> 
+  dplyr::select(category, subcategory) |>
+  unique() |>
+  purrr::transpose()
+
+# list of genesets and names 
+geneset_list <- genesets_df$geneset |> 
+  purrr::set_names(genesets_df$name)
 
 # pull gene sets from msigbdr
-# all gene sets are part of C2, CGP
-msig_genes_df <- msigdbr::msigdbr(
-  species = "Homo sapiens",
-  category = "C2",
-  subcategory = "CGP"
-) |>
+# first pull out info for each category and then pull out specific genes for geneset 
+msig_genes_df <- category_list |> 
+  purrr::map(\(category_list){
+    
+    # replace subcategory with default NULL 
+    # can't use NULL in tsv since it gets read in as a character 
+    if(is.na(category_list$subcategory)){
+      subcategory <- NULL
+    } else {
+      subcategory <- category_list$subcategory
+    }
+    
+    msigdbr::msigdbr(
+      species = "Homo sapiens",
+      category = category_list$category,
+      subcategory = subcategory
+    )
+  }) |>
+  dplyr::bind_rows() |>
   # only keep relevant gene sets
-  dplyr::filter(gs_name %in% ews_gene_sets)
+  dplyr::filter(gs_name %in% geneset_list)
 
 # create named list of genes in each gene set
-msig_genes_list <- ews_gene_sets |>
+msig_genes_list <- geneset_list |>
   purrr::map(\(name){
     genes <- msig_genes_df |>
       dplyr::filter(gs_name == name) |>
@@ -152,7 +174,7 @@ auc_results <- AUCell::AUCell_run(
   counts_mtx, 
   collection, 
   aucMaxRank = max_rank,
-  #BPPARAM = bp_param
+  BPPARAM = bp_param
 )
 
 # Get threshold ----------------------------------------------------------------