-
Notifications
You must be signed in to change notification settings - Fork 0
/
REST_expression_predictive modeling_v2.R
256 lines (207 loc) · 10.2 KB
/
REST_expression_predictive modeling_v2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
############################################
###Goal1: to find top REST-target genes that are negatively correlated with REST on mRNA level in GBM samples###
###function to convert gene IDs to ensembl Ids###
SYMBOLtoENS <- function(genes = character()) {
require("EnsDb.Hsapiens.v86")
mapIds <- mapIds(EnsDb.Hsapiens.v86, keys = genes, keytype = "SYMBOL")
mapIds_df <- as.data.frame(mapIds)
mapIds_df$mapIds
}
###opposite conversion###
ENStoSYMBOL <- function(genes = character()) {
require("org.Hs.eg.db")
mapIds <- mapIds(org.Hs.eg.db, keys = genes, keytype = "ENSEMBL", column="SYMBOL")
mapIds_df <- as.data.frame(mapIds)
mapIds_df$mapIds
}
library(TCGAbiolinks)
library(SummarizedExperiment)
###querying TCGA-GBM RNA-Seq dataset###
query_GBM <- GDCquery(project = "TCGA-GBM",
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "HTSeq - FPKM-UQ",
legacy = FALSE,
sample.type = c("Primary Tumor", "Recurrent Tumor"),
access = "open")
GDCdownload(query_GBM)
###.rda file was created on 2/13/2021 and can be found in repo###
e_gbm <- load(file = "TCGA_exp.gbm.rda")
colData(data)
assays(data)
###cleaning and filtering expression data###
exp_Prep <- TCGAanalyze_Preprocessing(object = data,
cor.cut = 0.6,
datatype = "HTSeq - FPKM-UQ")
exp_Filt <- TCGAanalyze_Filtering(tabDF = exp_Prep,
method = "quantile",
qnt.cut = 0.25)
###transposing the matrix of counts###
t_exp_matrix <- t(exp_Filt)
dim(t_exp_matrix)
###obtaining correlation coefficients###
REST_position <- which(colnames(t_exp_matrix) == "ENSG00000084093")
out_cor <- t(cor(t_exp_matrix[, REST_position], t_exp_matrix[, -REST_position]))
out_cor <- as.data.frame(out_cor)
###ranking genes###
library(data.table)
setDT(out_cor, keep.rownames = "Genes")
out_cor_ranked <- out_cor[order(out_cor$V1),]
head(out_cor_ranked, 100)
colnames(out_cor_ranked)[2] <- "Pearson r"
###obtaining p-values and q-values###
library(dplyr)
out_cor_ranked %>%
mutate(p_value = NA) %>%
mutate(q_value = NA)
for (i in 1:nrow(out_cor_ranked)) {
gene_correl <- which(colnames(t_exp_matrix) == out_cor_ranked$Genes[i])
out_cor_ranked$p_value[i] = cor.test(t_exp_matrix[, REST_position], t_exp_matrix[, gene_correl])$p.value
}
library(WGCNA)
qqs <- qvalue(out_cor_ranked$p_value)
out_cor_ranked$q_value <- qqs$qvalues
###writing all ranked correlations to a csv file###
write.csv(as.data.frame(out_cor_ranked), "Output_TCGA_correlations_REST.csv")
###Goal2: to build regression-based predictive model to assess functional (silencing) activity of REST transcription factor based on its target genes###
###Method: forward regression with nested CV###
###feature selection: use top-100 negatively correlated genes and screen for those that have canonical/conserved RE-1 motif###
###list of genes with RE-1 motifs is from https://pubmed.ncbi.nlm.nih.gov/25990720/###
###convert ensembl top-100 REST-target gene IDs to hgnc-symbol###
feature_genes_ensembl <- out_cor_ranked$Genes[1:100]
feature_genes <- ENStoSYMBOL(feature_genes_ensembl)
###accessing published binding sites of REST in ESC cells and filtering genes with conserved RE-1 motifs###
RE1 <- as.data.frame(read_excel("Binding sites_REST_ESC_PMID25990720.xlsx", sheet = "Table S2 human (hg19)"))
RE1 <- RE1 %>%
filter(`non-alignable, alignable conserved or alignable non-conserved` == "conserved" & is.na(`gene name`) == F)
dim(RE1) ##1252 genes left
###finding overlap between our feature genes and RE-1 element-containing genes###
feature_genes_RE1 <- intersect(RE1$`gene name`, feature_genes)
###converting back to ensembl ids###
RESTTargetGenes <- SYMBOLtoENS(feature_genes_RE1)
###EDA:exploring GBM patients metrics that correlate with REST mRNA###
REST_mRNA <- c(t_exp_matrix[, "ENSG00000084093"])
IDH_status <- data$subtype_IDH.status
GBM_subtype <- data$subtype_Transcriptome.Subtype
age <- data$age_at_diagnosis
gender <- data$gender
df <- data.frame(REST= REST_mRNA, IDH_status = IDH_status,
GBM_subtype = GBM_subtype, age = age,
gender = gender)
mod = lm(REST~GBM_subtype+gender+age+IDH_status, data=df)
anova(mod) ## REST expression does not correlate with age, gender, and IDH status, but REST level is dependent on GBM molecular subtype###
REST_CL <- df %>%
dplyr::filter(GBM_subtype == "CL") #classical GBM
REST_ME <- df %>%
dplyr::filter(GBM_subtype == "ME") #mesenchymal GBM
REST_NE <- df %>%
dplyr::filter(GBM_subtype == "NE") #neural GBM
REST_PN <- df %>%
dplyr::filter(GBM_subtype == "PN") #proneural GBM
types <- c(rep("CL", length(REST_CL$REST)), rep("ME", length(REST_ME$REST)),
rep("NE", length(REST_NE$REST)), rep("PN", length(REST_PN$REST)))
REST_mRNA <- c(REST_CL$REST, REST_ME$REST, REST_NE$REST, REST_PN$REST)
library(ggplot2)
types_df <- data.frame(Type = types, REST_exp = REST_mRNA)
p <- ggplot(types_df, aes(x = Type, y=REST_exp, color = Type)) +
geom_boxplot() + labs(title="REST expression (TCGA-GBM)", x=NULL, y="FPKM")
p <- p+ theme(plot.title = element_text(size = 20))
p<- p+ theme(text = element_text(size=20),
axis.text.x = element_text(size=15),
axis.text.y = element_text(size=15))
p
###based on this plot, REST expression is the highest in most aggressive GBMs: classical and mesenchymal types.
###adding GBM subtype to expression data###
t_exp_matrix <- as.data.frame(t_exp_matrix)
t_exp_data <- cbind(t_exp_matrix, df$GBM_subtype)
names(t_exp_data)[names(t_exp_data) == 'df$GBM_subtype'] <- 'GBM_subtype'
t_exp_data <- t_exp_data %>%
dplyr::filter(is.na(GBM_subtype) == FALSE)
dim(t_exp_data) ##29 patients without GBM subtype info were removed###
###Fitting models for every REST target gene to find shared predictors###
FitFSwithNestedCV <- function(gene, data) {
require(tidyverse)
require(caret)
require(MASS)
require(nestfs)
require(data.table)
require(dplyr)
###Looking for top-10 correlated genes (pos/neg)###
gene_position <- which(colnames(data) == gene)
cor <- t(cor(data[, gene_position], data[, -gene_position]))
cor <- as.data.frame(cor)
cor[,"Genes"] <- rownames(cor)
cor_ranked <- cor[order(cor$V1),]
cor_ranked <- na.omit(cor_ranked)
predictors <- c(cor_ranked$Genes[1:10], cor_ranked$Genes[(nrow(cor_ranked)-9):nrow(cor_ranked)])
data_reduced <- data[,c(gene, predictors, "GBM_subtype")]
###Modeling with nested CV###
folds <- create.folds(5, nrow(data))
nest.res <- nested.fs(as.formula(paste(gene, "~ GBM_subtype")), data= data_reduced, family=gaussian(), folds=folds,
min.llk.diff = 10)
summary(nest.res)
}
###this loop may take a while.
results_mod <- list()
for (i in 1:length(RESTTargetGenes)) {
results_mod[i] <- FitFSwithNestedCV(RESTTargetGenes[i], data = t_exp_data)
}
total_res <- unlist(results_mod)
tail(sort(table(total_res)), 10)
###saving top genes predicting REST protein level (occur more than 4 times): MAPK8IP2, ATP1A3, RUNDC3A, CDK5R1, REST, TLCD3B
max_overlap <- c("ENSG00000008735", "ENSG00000105409", "ENSG00000108309",
"ENSG00000176749", "ENSG00000084093", "ENSG00000149926")
###Goal3: using top-overlapping predictors, to discriminate all 169 tumor samples into REST-low and REST-high###
cor(t_exp_data[,c(max_overlap)]) ###negative correlation between REST mRNA and all these genes
exp_data_final <- t_exp_matrix[,c(max_overlap)]
###k-means clustering of samples into two categories###
cl <- kmeans(exp_data_final, centers = 2, nstart = 10)
cl$centers
library(ggplot2)
p <- ggplot(data = exp_data_final,
aes(x= ENSG00000084093, y=ENSG00000108309, color = factor(cl$cluster))) +
geom_point() +
labs(title = "Clustering of TCGA-GBM samples", y = "RUNDC3A mRNA", x = "REST mRNA")
p
###Goal 4: to compare signaling pathways (proteomic data) between REST-low and REST-high samples using RPPA data to identify pathways for synergetic inhibition###
###uploading TCGA samples (mRNA expression, 6 top-predictor genes) from cBioPortal and RPPA data, separately###
RPPA <- as.data.frame(read_csv("TCGA-GBM-L3-S42.csv"))
mRNA_tested_samples <- as.data.frame(read_delim("TCGA_mRNA_tested_samples.txt"))
###filtering out missing values, e.g. samples with Not Performed (NP) expression testing
mRNA_tested_samples_filt <- mRNA_tested_samples %>%
dplyr::filter(REST != "NP")
mRNA_tested_samples_filt <- mRNA_tested_samples_filt[,-1]
matrix <- mRNA_tested_samples_filt[,2:7]
rownames(matrix) <- mRNA_tested_samples_filt[,1]
###clustering these samples into high- and low-REST
cl <- kmeans(matrix, centers = 2, nstart = 10)
cl$centers
res_cl <- as.data.frame(cl$cluster)
res_cl[,"Sample_ID"] <- rownames(res_cl)
head(res_cl)
rownames(res_cl) <- NULL
colnames(res_cl) <- c("Cluster", "Sample_ID")
###wrangling RPPA sample-IDs and mRNA-samples-IDs so they are compatible for joining
RPPA$Sample_ID <- substr(RPPA$Sample_ID,1,nchar(RPPA$Sample_ID)-15)
res_cl$Sample_ID <- substr(res_cl$Sample_ID,1,nchar(res_cl$Sample_ID)-3)
###merging RPPA and gene expression data###
merged_data <- merge(RPPA, res_cl, by = "Sample_ID")
dim(merged_data)
table(merged_data$Cluster)
###Analysis of both clusters separately + calculating average protein expression for every protein
merged_split <- split(merged_data, merged_data$Cluster)
proteins <- sapply(merged_split, function(x) colMeans(x[, 4:ncol(merged_data)]))
###sorting by cluster1 (high-REST) in the order of increasing the concentration###
proteins <- as.data.frame(proteins)
colnames(proteins) <- c("High-REST", "Low-REST")
proteins_sorted <- proteins[order(proteins$`High-REST`, decreasing = F),]
proteins_sorted
###printing out ratios
ratios <- proteins_sorted$`High-REST`/proteins_sorted$`Low-REST`
names(ratios) <- rownames(proteins_sorted)
###sorting in order of decreasing ratio High/Low-REST
ratios <- ratios[order(ratios, decreasing = T)]
par(mar=c(15,4,4,2))
barplot(ratios[1:20], col = "aquamarine",
main="High-REST associated proteins",ylab="Ratio High/Low REST",las=2)
###Conclusion: several pathways like EGFR or PAI inhibition may be promising to combine with REST targeting###