-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_kegg_enrichment.R
84 lines (72 loc) · 3.44 KB
/
get_kegg_enrichment.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#' Identify KEGG enrichment for all annotated kegg terms for an organism using Fsher tests
#'
#' Rogan Grant 20212-08-25
#'
#' @param deseq_object a DESeq2 object with library normalization pre-calculated using DESeq(), vst(), etc
#' @param goi vector genes of interest in ensemble gene ID format. For fisher test: just IDs. For KS test: vector of scores with IDs as names.
#' @param kegg_organism abbreviated organism for kegg database (defaults to mouse "mmu")
#' @param gene_conversion conversion from ensembl_gene_id to external_gene_name (generally from biomart)
#' @param expression_cutoff the minimum counts detected for a given gene to be considered expressed
#' @param cores number of cores to use (defaults to 1)
#' @return a dataframe containing the significantly enriched GO terms and enrichment scores if requested
#' @export
get_kegg_enrichment = function(deseq_object, goi, kegg_organism = "mmu",
gene_conversion, expression_cutoff = 1, cores = 1)
{
#get kegg lists
kegg_pathways = keggList("pathway", kegg_organism)
kegg_ids = names(kegg_pathways) %>%
gsub("path:", "", .)
full_kegg_names = paste(as.character(kegg_pathways), names(kegg_pathways), sep = "; ")
kegg_lists = mclapply(kegg_ids, function(path){
kegg = keggGet(path)
genes = kegg[[1]][["GENE"]]
if(is.null(genes))
{
return(NA)
} else
{
#follows format ORF, description, so remove ORF and strip gene name
genes = genes[c(F, T)] #skip ORFs
genes = substring(genes, first = 1, last = (regexpr("\\;", genes) - 1))
}
return(genes) }, mc.cores = cores)
names(kegg_lists) = full_kegg_names
#get all expressed genes
universe = counts(deseq_object, normalized = T) %>%
as.data.frame() %>%
rownames_to_column("ensembl_gene_id") %>%
left_join(., gene_conversion) %>%
dplyr::filter(!is.na(external_gene_name) & external_gene_name != "") %>%
column_to_rownames("external_gene_name") %>%
dplyr::select(-ensembl_gene_id) %>%
as.matrix() %>%
.[rowSums(.) > expression_cutoff, ] %>%
rownames()
#now perform fisher tests of enrichment for each group
enrichments = mclapply(kegg_lists, function(kegg){
non_hits = setdiff(gene_universe, goi)
#make contingency table
hits_in_kegg = length(intersect(goi, kegg))
hits_not_in_kegg = length(setdiff(goi, kegg))
non_hits_in_kegg = length(intersect(non_hits, kegg))
non_hits_not_in_kegg = length(setdiff(non_hits, kegg))
contingency = rbind(c(hits_in_kegg, non_hits_in_kegg),
c(hits_not_in_kegg, non_hits_not_in_kegg))
rownames(contingency) = c("in_kegg_term", "not_in_kegg_term")
colnames(contingency) = c("DEG", "not_DEG")
#get expected frequencies using chisq
expected = suppressWarnings(chisq.test(contingency)$expected)
fisher = fisher.test(contingency, alternative = "greater")
#format as in topGO
out_df = data.frame(expected = expected["in_kegg_term", "DEG"],
observed = contingency["in_kegg_term", "DEG"],
fold_enrichment = contingency["in_kegg_term", "DEG"] / expected["in_kegg_term", "DEG"],
pval = fisher$p.value)
return(out_df) }, mc.cores = cores)
enrich_df = bind_rows(enrichments) %>%
dplyr::mutate(pathway = names(kegg_lists),
padj = p.adjust(pval, method = "fdr")) %>%
dplyr::arrange(padj)
return(enrich_df)
}