SysBioChalmers · feiranl · Apr 23, 2024 · May 21, 2023 · May 21, 2023 · May 22, 2023
diff --git a/code/DepMapGeneEss/ConvertGeneEssInputData.R b/code/DepMapGeneEss/ConvertGeneEssInputData.R
@@ -0,0 +1,70 @@
+library(tidyverse)
+
+dataFolder = "C:/Work/MatlabCode/components/human-GEM/Human-GEMDepMapEval/Human-GEM/code/DepMapGeneEss/data/"
+
+
+d = read_csv(paste0(dataFolder, "CCLE_expression_full.csv"))
+dim(d)#1377 52055
+
+#transpose
+ds = as_tibble(cbind(gene = names(d)[-1], t(as.matrix(d[,-1]))))
+colnames(ds)[-1] = d[[1]]
+
+#make sure the columns are numeric instead of chr
+ds2 = ds
+for (i in 2:ncol(ds)) {
+  ds2[[i]] = as.numeric(ds[[i]])
+
+}
+
+#convert the gene names
+
+#To get ensembl: 
+#pattern = ".*\\(([A-Z0-9]*)\\)"
+#newGenes = str_match(ds$gene, pattern)
+#ds2$gene = newGenes[,2]
+#fix the ERCC genes
+#ds2$gene[is.na(newGenes[,2])] = ds$gene[is.na(newGenes[,2])];
+
+#To get gene symbols:
+#It is a bit tricky, not all genes follow the pattern. Some are like LINC00328-2P (ENSG00000225016),
+#some just an ensembl id (we then take the assembl id), some ERCC 
+newGenes = ds2$gene
+x  = strsplit(ds2$gene[!ERCCGenesSel], " ")
+for(i in 1:length(x)) {
+  newGenes[i] = x[[i]][1] #handles all cases
+}
+length(newGenes)
+length(unique(newGenes)) #not the same, we need to merge a few rows, done later
+
+ds2$gene = newGenes
+
+ds2
+
+
+#now convert the data. It is currently as log2(TPM + 1)
+dsTPM = ds2
+dsTPM[,-1] = 2^ds2[,-1] - 1
+colSums(dsTPM[,-1])#very minor roundoff differences, ok
+
+#Sum up the rows that have the same gene name
+duplGenes = unique(dsTPM$gene[duplicated(dsTPM$gene)])
+length(duplGenes)#9
+dsTPM[dsTPM$gene %in% duplGenes,1:10]
+#CCDC39 is a good example to test
+#is 1.59 + 0.150 in the first row, those should be summed up
+
+rowsToRem = rep(FALSE, nrow(dsTPM))
+for (i in 1:length(duplGenes)) {
+  inds = which(dsTPM$gene == duplGenes[i])
+  dsTPM[inds[1], -1] = colSums(dsTPM[inds, -1]) #the first row now gets the sum of all rows
+  rowsToRem[inds[-1]] = TRUE #the other rows are marked for deletion
+}
+sum(rowsToRem) #9, looks good
+#test: 
+dsTPM[[2]][dsTPM$gene == "CCDC39"] #1.74 0.15, as expected, ok
+dsTPM$gene[rowsToRem] # CCDC39 is in there, ok
+dsTPM2 = dsTPM[!rowsToRem,]
+
+write_tsv(dsTPM2, paste0(dataFolder, "DepMap_tpm_gene_symbols.txt"))
+
diff --git a/code/DepMapGeneEss/Instructions.txt b/code/DepMapGeneEss/Instructions.txt
@@ -0,0 +1,27 @@
+1. First  make sure you have the DepMap data - we currently use DepMap version 2021Q3. You need to go to the downloads section at
+https://depmap.org/portal/download/all/ , select 2021Q3.
+Download and place the files Achilles_gene_effect.csv and CCLE_expression_full.csv in the Human-GEM/code/DepMapGeneEss/data folder.
+2. Run the R script ConvertGeneEssInputData.R. Note that you need to adjust the folder - R is hopeless when it comes to 
+relative paths from the file - the information just doesn't exist. This will generate the file DepMap_tpm_gene_symbols.txt
+3. Run PrepDepMapData.m - you may have to modify the folder here as well. This will take an hour or so and generates the files
+arrayDataDepMap.mat and prepDataGeneSymbols.mat.
+4. Create a folder on the cluster with a suitable name, for example "GeneEssDepMap". Within that folder, create a folder named "components".
+Copy RAVEN, COBRA, and Human-GEM to components, they need to reside under those exact names, because those and underlying folders will be
+added to the matlab path. Make sure the Human-GEM folder Human-GEM/code/DepMapGeneEss/data is copied as well with
+the files that we generated above.
+5. Go back to your top folder ("GeneEssDepMap"). Create a folder DepMapRuns. Copy the files run_gen_depmap_models.sh, 
+run_evaless_depmap.sh, generate_DepMap_models_ftINIT_Cluster.m, and getTaskEssentialGenesCluster.m to that folder. Also create a folder 
+called "logs" in that folder.
+6. Make sure you are in the DepMapRuns folder. First, generate the models using the following line (also available in the .sh script):
+sbatch -o logs/run_gen_depmap_models-%A-%a.log --array=1-40 run_gen_depmap_models.sh
+It may be needed to init cobra and init RAVEN (SetRavenSolver etc.). TODO: Describe this here.
+7. Run sbatch -o logs/run_evaless_depmap-%A-%a.log --array=1-40 run_evaless_depmap.sh after the model generation has completed
+8. Download all produced files to the data folder on your computer. These are found in components/Human-GEM/code/DepMapGeneEss/data
+9. Run evaluateDepMapEssentialityPredictions.m. Note that you may need to change the folder in the file and that there are some things in 
+the file that need to be modified to match what you want to do. The file currently only gets stats for one model version, but can easily be expanded
+to take in data from more model versions, although these also need to be processed as above as well.
+10. Run PlotGeneEss.R in R to make a violin plot of MCC for each model. The code needs some modification to fix paths and number of models.
+
+
+
+
diff --git a/code/DepMapGeneEss/PlotGeneEss.R b/code/DepMapGeneEss/PlotGeneEss.R
@@ -0,0 +1,62 @@
+library(ggplot2)
+library(tidyverse)
+
+setwd("C:/Work/MatlabCode/components/human-GEM/Human-GEMDepMapEval/Human-GEM/code/DepMapGeneEss") #Modify to match your file system
+
+
+figPath = "figures/"
+
+
+############################################
+# Fig 1B - Gene essentiality
+############################################
+
+# load data
+# fns = c("data/geneEss_model1.txt",
+#         "data/geneEss_newalg.txt",
+#         "data/geneEss_newalg2.txt"
+#         )
+fns = c("data/geneEss_model1.txt")
+
+
+#names = c("tINIT","ftINIT 1+0", "ftINIT 1+1")
+names = c("Human2")
+
+gea_res = NULL
+for (i in 1:length(fns)) {
+  x = read.delim(file = fns[i], sep='\t', stringsAsFactors=F)
+  x$model = names[i]
+  gea_res = rbind(gea_res, x)
+}
+gea_res$model = factor(gea_res$model, as.character(names)[1:length(fns)])  # to enforce the model order
+
+
+color_palette <- c('#B5D39B','#6B97BC','#E7B56C')  # light green, light blue, light yellow
+
+p1B = ggplot(gea_res, aes(x = model, y = MCC, fill = model)) +
+  geom_violin(trim=F, show.legend=F, scale='count') +
+  scale_fill_manual(values=color_palette) +
+  theme_classic() + 
+  ylab('MCC') +
+  xlab('') +
+  theme(text = element_text(size=14),
+        axis.text.x = element_text(angle=90, hjust=1, vjust=0.5,
+                                   color='black', size=14),
+        axis.text.y = element_text(color='black', size=14),
+        axis.line.x = element_blank()) +
+  ylim(c(0.08,0.40)) # + #manipulate these numbers to include all data
+  #ylim(c(0,0.5)) # +
+p1B
+
+
+ggsave(
+  paste0(figPath, "FigGeneEss.png"),
+  plot = p1B,
+  width = 3.5, height = 3.2, dpi = 300)
+
+ggsave(
+  paste0(figPath, "FigGeneEss.eps"),
+  plot = p1B,
+  width = 3.5, height = 3.2, dpi = 300)
+
+
diff --git a/code/DepMapGeneEss/PrepDepMapData.m b/code/DepMapGeneEss/PrepDepMapData.m
@@ -0,0 +1,51 @@
+%we are using depmap version 2021_Q3
+
+%Prepares the DepMap data, specifically by filtering out RNA-Seq samples for which no CRISPR data exists
+%cd C:/Work/MatlabCode/components/human-GEM/Human-GEMDepMapEval/Human-GEM/code/DepMapGeneEss %to be used if not running the whole file, may need to change this
+cd(fileparts(which(mfilename))); %to be used if running the whole file
+
+%% Load and prepare DepMap RNA-Seq data (cell lines)
+
+% load RNA-Seq data from txt file
+rna_data = readtable('data/DepMap_tpm_gene_symbols.txt');
+
+% load gene essentiality data (Achilles gene effect)
+ach_data = readtable('data/Achilles_gene_effect.csv');
+samples = ach_data.DepMap_ID;  % extract sample IDs
+
+% filter RNA-Seq data to only include samples for which we have
+% essentiality data
+cellLineNames = rna_data.Properties.VariableNames;
+%now replace '_' with '-'
+cellLineNames = strrep(cellLineNames, '_', '-');
+
+%{'Original column heading: 'ACH-001113''}
+
+keep = ismember(cellLineNames, samples);
+
+sum(keep) %891
+sum(keep)/length(keep) % 65%, seems reasonable
+
+% add RNA-Seq data to arrayData
+arrayDataDepMap.genes = rna_data.gene;
+arrayDataDepMap.tissues = cellLineNames(keep)';
+arrayDataDepMap.levels = table2array(rna_data(:, keep));
+arrayDataDepMap.threshold = 1;
+
+
+% save tINIT inputs
+save('data/arrayDataDepMap.mat','arrayDataDepMap');
+
+%Generate ftINIT prepData - only needs to be done once. Can take up to an hour to run
+model = importYaml('../../model/Human-GEM.yml');
+[model.grRules, skipped] = simplifyGrRules(model.grRules, true);%takes a few minutes to run
+prepData = prepHumanModelForftINIT(model, true, '../../data/metabolicTasks/metabolicTasks_Essential.txt', '../../model/reactions.tsv');
+save('data/prepDataGeneSymbols.mat', 'prepData')
+
+
+
+
+
+
+
+
diff --git a/code/DepMapGeneEss/data/DepMap data files go here.txt b/code/DepMapGeneEss/data/DepMap data files go here.txt
diff --git a/code/DepMapGeneEss/enrichment_test.m b/code/DepMapGeneEss/enrichment_test.m
@@ -0,0 +1,52 @@
+%This file is for convenience copied from the Human1 paper.
+function [penr,pdep] = enrichment_test(pop,sample,successes)
+%enrichment_test caculates p-value of enrichment of successes in a sample.
+%
+% [PENR,PDEP] = enrichment_test(POP,SAMPLE,SUCCESSES) evaluates the
+% significance of enrichment (and depletion) of SUCCESSES in a SAMPLE drawn
+% from population POP using the hypergeometric test.
+%
+%
+%--------------------------------- INPUTS ---------------------------------
+%
+% pop           Vector of genes comprising the population from
+%               which samples are drawn.
+%
+% sample        Vector of genes sampled from the population.
+%
+% successes     Vector of genes in the population that are defined as
+%               "successes". 
+%               
+%               The function will test if these "success" genes are 
+%               significantly depleted or enriched in the sample, given
+%               that they were drawn from the population.
+%
+% EXAMPLE: If a metabolite set enrichment analysis (MSEA) is being
+%          performed, then SAMPLE is the list of metabolites of interest,
+%          and SUCCESSES is a metabolite set (e.g., TCA cycle metabolites).
+%          POP is the list of all metabolites from which SAMPLE and
+%          SUCCESSES are drawn.
+%
+%--------------------------------- OUTPUTS --------------------------------
+%
+% penr          p-value associated with enrichment of successes in sample.
+%
+% pdep          p-value associated with depletion of successes in sample.
+%
+%
+% ***** WARNING: P-VALUES ARE NOT ADJUSTED FOR FALSE DISCOVERY RATE! *****
+%
+
+x = numel(intersect(successes,sample));  % calc # of successes in sample
+m = numel(pop);  % calc size of population
+k = numel(intersect(successes,pop));
+n = numel(sample);
+
+penr = hygecdf(x-1,m,k,n,'upper');
+pdep = hygecdf(x,m,k,n);
+
+
+
+
+
+