-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexpect_main.qmd
724 lines (551 loc) · 31.6 KB
/
expect_main.qmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
---
title: "Early identification of high-risk pregnancies to develop preeclampsia through non-invasive cell-free DNA methylation profiling"
subtitle: "The Early Prediction of prEgnancy Complications Testing or ExPECT study"
author: "Bram Van Gaever"
output-dir: quarto_output
format: html
code-fold: true
warning: false
message: false
toc: true
toc-expand: 2
toc-location: left
---
# Introduction
This notebook contains the code used to generate the results and plots of the ExPECT study. The code is fully reproducable using the data from the github repo. You can navigate through this page by using the bar on the left side. Code chunks are collapsed by default. To look at the code that generated a certain figure, click on the greyed out _Code_ titles. Like the one below this text, that chunk contains all packages and some helper functions/utilities that were used.
```{r}
#| label: "libs_and_utils"
#| output: false
# Chunck containing the used libraries, colors and some utility functions
set.seed(2341)
library(readxl)
library(limma)
library(bsseq)
library(RColorBrewer)
library(Rtsne)
library(pheatmap)
library(GenomicRanges)
library(biomaRt)
library(HDF5Array)
library(irlba)
library(kableExtra)
library(patchwork)
library(ggrepel)
library(ggbeeswarm)
library(seqsetvis)
library(biomaRt)
library(fuzzyjoin)
library(DSS)
library(ggpubr)
library(annotatr)
library(tidyverse)
library(compareGroups)
#Plotting colors
Green <- "#00A08A"
Orange <- "#F98300"
Blue <- "#5BBCD6"
Purple <- "#BF8AC2"
Brown <- "#B86D33"
Orange2 <- "#CC9F68"
Green2 <- "#7BB6AE"
Blue2 <- "#00778A"
Purple2 <- "#741078"
color_palette <- c(Green, Orange, Blue, Purple, Brown, Green2, Purple2, Blue2, Orange2)
gradient <- colorRampPalette(c(Blue, "#FFFFFF", Purple))(1000)
#"not in" operator
`%nin%` = Negate(`%in%`)
#Ranges to id funtction
ranges2id <- function(genomicRanges) {
idVector <- c(paste0(genomicRanges@seqnames, ":", genomicRanges@ranges@start, "-", genomicRanges@ranges@start + genomicRanges@ranges@width - 1))
}
#Id to ranges function
id2ranges <- function(idVector) {
idTib <- as_tibble(idVector) %>%
separate(value, into = c("seqnames", "start", "end"), sep = "[:-]", convert = T)
genomicRanges <- GRanges(seqnames =idTib$seqnames, ranges = IRanges(start = idTib$start, end = idTib$end))
}
#Heatmapping function
#Every row of the input matrix should contain data for one sample.
#Sample ids should be included as rownames that are also used in the annotation dataframe
meth_heatmap <- function(matrix, annotation, filename = NA,
gradient = colorRampPalette(c(Blue, "#FFFFFF", Purple))(1000),
colors = color_palette,
show_names = F, height = 15) {
#Make pheatmap color annotation
annotation_color_list = list()
for (column in colnames(annotation)) {
tmp_column = dplyr::select(annotation, all_of(column)) %>%
mutate(across(everything(), as.factor))
if (length(levels(tmp_column[[1]])) > 9) {
column_annotation = colorRampPalette(c("#FFFFFF", "#BF65C2"))(1000)
} else {
column_annotation = setNames(colors[1:length(levels(tmp_column[,1]))],
levels(tmp_column[,1]))
}
annotation_color_list[[column]] = column_annotation
}
#Generate the heatmap figure
heatmap = pheatmap(matrix,
color = gradient,
annotation_row = annotation,
annotation_colors = annotation_color_list,
show_colnames = FALSE,
border_color = NA,
show_rownames = show_names,
filename = filename,
clustering_distance_rows = "correlation",
height = height)
return(heatmap)
}
#Limma model fitting and result gathering fucntion
train_limma <- function(matrix, design) {
limma_fit = lmFit(matrix, design = design)
eB_fit = eBayes(limma_fit)
categories <- colnames(design)[-1]
results = list()
for (category in categories) {
top = topTable(eB_fit, coef = category, p.value = 0.01,
number = Inf, adjust.method = "BH")
results[[category]] = top
}
return(results)
}
```
# Data loading
A `bsseq` object and `.Rds` file containing normalized and clustered methylation values for the samples included is first loaded along with the annotation files. Both methylation data objects are generated by running the `smooth_and_cluster.R` script from the Scripts folder of this repo on _bismark_ `.cov` files, which are generated with the [nf-core/methylseq](https://github.com/nf-core/methylseq) pipeline. The pre-processing performed by the aformentioned script creates CpG clusters from the CpGs present in at least 50% of the samples. Methylation values are normalized and smoothed after which they are mapped to the clusters. Methylation values are then averaged across each cluster. The data loaded here is not stored in the github repo but will be available elsewhere (link to follow soon)
```{r}
#| label: data_loading
#| tbl-cap: "Annotation for samples included in this study"
#Load bsseq object
bs_expect <- loadHDF5SummarizedExperiment(dir = "Data/Methylation/Total")
#Load clustered methylation data
clusters_expect <- readRDS("Data/Methylation/Total/cluster_expect.Rds")
#Load study annotation
annotation <- read_csv("Data/expect_annotation.csv") %>% mutate(Group = ifelse(Group == "Pre-symptomatic", "Presymptomatic", "Symptomatic"))
long_samples <- read_table("Data/longitudinal_samples.txt")
#Split up annotations for ease of use
long_annotation <- annotation %>% filter(Sample_id %in% long_samples$Sample_id) %>% mutate(Group="longitudinal")
presympt_annotation <- annotation %>% filter(Gest_age_weeks < 14)
sympt_annotation <- annotation %>% filter(Gest_age_weeks >= 20) %>%
filter(Sample_id %nin% long_annotation$Sample_id)
filtered_annotation <- annotation %>% filter(Gest_age_weeks < 14 | Gest_age_weeks >= 20)
#Split up clusters for ease of use
sample_select <- presympt_annotation$Sample_id[presympt_annotation$Sample_id %in% rownames(pData(bs_expect))]
presympt_annotation <- presympt_annotation %>% filter(Sample_id %in% sample_select) %>% mutate(Group="Presymptomatic")
clusters_presympt <- clusters_expect %>% select(all_of(sample_select))
sample_select <- sympt_annotation$Sample_id[sympt_annotation$Sample_id %in% rownames(pData(bs_expect))]
sympt_annotation <- sympt_annotation %>% filter(Sample_id %in% sample_select) %>% mutate(Group="Symptomatic")
clusters_sympt <- clusters_expect %>% select(all_of(sample_select))
kable(annotation) %>%
kable_paper("hover") %>%
scroll_box(height = "200px")
```
# Data exploration
Sampling distributions for the different sample groups are plotted to get an overview of most common gestational ages at time of sample collection.
```{r}
#| label: sample_distribution
#| fig-cap: "Sample collection time overview per analysis group."
#Create a plot of the gestational ages per sample across the three cohorts, combine into one figure
(ggplot(data = presympt_annotation) +
geom_bar(aes(Gest_age_weeks, fill = Category), just = 0) +
scale_x_continuous(breaks = seq(min(presympt_annotation$Gest_age_weeks), max(presympt_annotation$Gest_age_weeks), 1)) +
scale_fill_manual(values = color_palette, labels = c("Control", "Preeclampsia")) +
xlab("")+
ylab("Number of samples")+
ggtitle("Presymptomatic cohort") +
theme_classic() +
ggplot(data = sympt_annotation) +
geom_bar(aes(Gest_age_weeks, fill = Category), just = 0) +
scale_x_continuous(breaks = seq(min(sympt_annotation$Gest_age_weeks), max(sympt_annotation$Gest_age_weeks), 1)) +
scale_y_continuous(breaks =c(2, 4, 6, 8, 10)) +
scale_fill_manual(values = color_palette, labels = c("Control", "Preeclampsia")) +
xlab("")+
ylab("Number of samples")+
ggtitle("Symptomatic cohort") +
theme_classic() ) /
ggplot(data = long_annotation) +
geom_bar(aes(Gest_age_weeks, fill = Category), just = 0) +
scale_x_continuous(breaks = seq(min(long_annotation$Gest_age_weeks), max(long_annotation$Gest_age_weeks), 1)) +
scale_y_continuous(breaks =c(2, 4, 6, 8, 10)) +
scale_fill_manual(values = color_palette, labels = c("Control", "Preeclampsia")) +
xlab("Gestational age at time of sampling (weeks)")+
ylab("Number of samples")+
ggtitle("Longitudinal cohort") +
theme_classic() +
patchwork::plot_annotation(tag_levels ="A") +
plot_layout(guides = "collect") & theme(legend.position = 'bottom')
cohort_overlap <- list("Symptomatic" = sympt_annotation$Patient[!duplicated(sympt_annotation$Patient)],
"Presymptomatic" = presympt_annotation$Patient[!duplicated(presympt_annotation$Patient)],
"Longitudinal" = long_annotation$Patient[!duplicated(long_annotation$Patient)])
ggvenn::ggvenn(cohort_overlap, fill_color = color_palette)
overlapping_patients <- intersect(sympt_annotation$Patient[!duplicated(sympt_annotation$Patient)],presympt_annotation$Patient[!duplicated(presympt_annotation$Patient)])
```
Dimensionality reduction is performed on the methylation values of the clusters to gain an overview of the samples and spot potential outliers.
```{r}
#| label: pca
#Perform a pca on the presymptomatic set
cluster_PCA_presympt <- prcomp_irlba(t(clusters_presympt), n = 10)
PCA_plot_data_presympt <- cluster_PCA_presympt$x %>% bind_cols(presympt_annotation)
#Perform a pca on the symptomatic set
cluster_PCA <- prcomp_irlba(t(clusters_sympt), n = 10)
PCA_plot_data <- cluster_PCA$x %>% bind_cols(sympt_annotation)
ggplot(PCA_plot_data_presympt) +
geom_point(aes(PC1, PC2, color = Category)) +
scale_color_manual(values = color_palette) +
theme_classic() +
ggtitle("Presymptomatic cohort") +
ggplot(PCA_plot_data) +
geom_point(aes(PC1, PC2, color = Category)) +
scale_color_manual(values = color_palette) +
theme_classic() +
ggtitle("Symptomatic cohort") +
patchwork::plot_annotation(tag_levels = "A")+ plot_layout(guides = "collect") & theme(legend.position = 'bottom')
```
## Comparison of genome-wide methylation levels
Genome-wide average methylation is compared between cases and controls for the pre-symptomatic and symptomatic cohorts. Both on an intra- and inter-cohort level.
```{r}
#| label: genome_wide_methylation
#| fig-cap: "General decrease in methylation during pregnancy, with a significant genome-wide hypomethylation in preeclampsia patients near late pregnancy."
#Compute mean methylation data for the Pre-symptomatic group
mean_meth_presympt <- apply(clusters_presympt, 2, mean) %>%
as_tibble() %>%
mutate(Sample_id = colnames(clusters_presympt)) %>%
left_join(presympt_annotation)
#Compare the means of this group with a Wilcox Rank Sum test
presympt_mean_test <- wilcox.test(mean_meth_presympt$value[mean_meth_presympt$Category == "PE"], mean_meth_presympt$value[mean_meth_presympt$Category == "Control"], conf.int = T, p.adjust.methods = "BH")
#Compute mean methylation data for the Symptomatic group
mean_meth_sympt <- apply(clusters_sympt, 2, mean) %>%
as_tibble() %>%
mutate(Sample_id = colnames(clusters_sympt)) %>%
right_join(sympt_annotation) %>%
mutate(Group = "Symptomatic")
#remove dependent samples for Wilcoxon test
mean_meth_sympt <- mean_meth_sympt %>% filter(Patient %nin% overlapping_patients)
#Compare the means of this group with a Wilcox Rank Sum test
sympt_mean_test <- wilcox.test(mean_meth_sympt$value[mean_meth_sympt$Category == "PE"], mean_meth_sympt$value[mean_meth_sympt$Category == "Control"], conf.int = T, p.adjust.methods = "BH")
#Combine both tibbles for plotting an overview
mean_meth_combined <- bind_rows(mean_meth_presympt, mean_meth_sympt) %>%
mutate(Category = paste(Group, Category))
#Create a boxplot
ggplot(mean_meth_combined, aes(Category, value)) +
geom_boxplot() +
geom_beeswarm(aes(color = Category), stroke = 1, shape = 1) +
stat_compare_means(comparisons = list(c("Presymptomatic Control", "Presymptomatic PE"),c("Symptomatic Control", "Symptomatic PE"),c("Presymptomatic Control", "Symptomatic Control"), c("Presymptomatic PE", "Symptomatic PE")),method ="wilcox.test", p.adjust.methods = "BH", label = "p.signif")+
scale_color_manual(values = color_palette) +
ylab("Mean genome wide methylation level") +
xlab("")+
guides(color = F)+
theme_classic()
```
Box-plots showing the mean genomic methylation level of controls and preeclampsia cases (PE). Methylation levels from presymptomatic samples, sampled between 11 and 20 weeks of gestation, show no significant difference. Methylation levels between cases and controls sampled after 20 weeks of gestation show a significantly lower methylation level of cases compared to controls. Significant methylation differences between pre-symptomatic and symptomatic groups can be observed for both preeclampsia cases and controls.
```{r}
#Calculate statistics on groups
filter(presympt_annotation, Category == "PE") %>% summarise(mean(Gest_age_weeks))
filter(presympt_annotation, Category == "Control") %>% summarise(mean(Gest_age_weeks))
filter(sympt_annotation, Category == "PE") %>% summarise(mean(Gest_age_weeks))
filter(sympt_annotation, Category == "Control") %>% summarise(mean(Gest_age_weeks))
total_study <- bind_rows(sympt_annotation, presympt_annotation) %>%
bind_rows(long_annotation) %>%
distinct()
total_patients <- total_study %>% distinct(Patient, .keep_all = T)
total_patients %>% group_by(Category) %>%
summarise(across(all_of(c("Gest_age_birth", "Gravida", "BMI")), list(mean = mean, sd = sd), na.rm = T))
total_patients %>% group_by(Category, Race) %>%
summarise(n= n())
compare_input <- total_patients %>% select(Gest_age_weeks, Category, Gravida, Fertility_treatment, BMI,Birth_term_category, Sex, Gest_age_birth)
compareGroups(Category ~ ., data=compare_input)
```
# DMR calculation
To define the methylation difference between PE and control samples DMR's are calculated in two ways. In the first method the *ad hoc* defined CpG clusters are used and a *limma* model is generated to calculate clusters that are DMR's. Clustering based on these DMR's can be seen below, use the tabs to select the number of DMR's used for clustering. As seen in the the largest difference can be observed between early pre-symptomatic samples and later symptomatic samples, therefore these two groups are also used separately for DMR calculation. DMR calculation on the entire dataset does not yield any significant DMR's between PE and control groups.
```{r}
#| label: limma_dmrs
#DMR calculation with limma
##dmr_design <- model.matrix(~Category + Batch + Sex + Gest_age_weeks, annotation)
##dmr_clusters_limma <- train_limma(clusters_expect, dmr_design) #This function is defined in the first code block
#DMR calculation on the entire set does not yield any significant DMR's
#Pre-symptomatic DMRs
dmr_design_presympt <- model.matrix(~Category + Batch + Sex + Gest_age_weeks, presympt_annotation)
dmr_presympt_limma <- train_limma(clusters_presympt, dmr_design_presympt)
dmrs_presympt_limma<- dmr_presympt_limma$CategoryPE %>% filter(abs(logFC) > 0.1) %>% filter(adj.P.Val < 0.01) %>% rownames()
#DMR calculation on the presymptomatic set does not yield many significant DMR's
#Symptomatic DMRs
dmr_design_sympt <- model.matrix(~Category + Batch + Sex+ Gest_age_weeks, sympt_annotation)
dmr_sympt_limma <- train_limma(clusters_sympt, dmr_design_sympt)
dmrs_sympt_limma<- dmr_sympt_limma$CategoryPE %>% filter(abs(logFC) > 0.1) %>% filter(adj.P.Val < 0.01) %>% rownames()
```
### Clustering
Heatmap plots are made to show the significance of the discovered DMR's, switch between the amount of DMR's used to generate the heatmap by using the tabset. The rows of the heatmaps represent samples, while columns represent the DMR's. Methylation values are shown on a gradient from blue to white to purple (with blue hues being <50% methylated regions and purple hues meaning >50% methylated regions).
::: panel-tabset
```{r}
#| label: annot_heatmap
sympt_heatmap_annot <- sympt_annotation %>% column_to_rownames(var = "Sample_id") %>%
dplyr::select(Category, Sex, Aspirin) %>%
mutate(across(Category:Aspirin, as.factor))
```
#### Top 500
```{r}
#| label: fig-heatmap_sympt500
p_dmr500_cluster_heatmap <- meth_heatmap(t(clusters_sympt[dmrs_sympt_limma,][1:500,]), sympt_heatmap_annot)
```
#### Top 250
```{r}
#| label: fig-heatmap_sympt250
p_dmr250_cluster_heatmap <- meth_heatmap(t(clusters_sympt[dmrs_sympt_limma,][1:250,]), sympt_heatmap_annot)
```
#### Top 100
```{r}
#| label: fig-heatmap_sympt100
p_dmr100_cluster_heatmap <- meth_heatmap(t(clusters_sympt[dmrs_sympt_limma,][1:100,]), sympt_heatmap_annot)
```
:::
### Presymptomatic clustering
Heatmap plots are made to show the significance of the discovered DMR's, switch between the amount of DMR's used to generate the heatmap by using the tabset. The rows of the heatmaps represent samples, while columns represent the DMR's. Methylation values are shown on a gradient from blue to white to purple (with blue hues being <50% methylated regions and purple hues meaning >50% methylated regions)
```{r}
#| label: fig-heatmap_presympt
presympt_heatmap_annot <- presympt_annotation %>% column_to_rownames(var = "Sample_id") %>%
dplyr::select(Category, Sex, Aspirin) %>%
mutate(across(Category:Aspirin, as.factor))
p_dmr_cluster_heatmap_pre <- meth_heatmap(t(clusters_presympt[dmrs_presympt_limma,]), presympt_heatmap_annot, height = 5)
```
## Overlap
DMR's are compared between the pre-symptomatic and symptomatic cohort. But only a small overlap in DMR's is found.
```{r}
#| label: overlap
#| fig-cap: "Only limited overlap is present in DMR's between early and late samples"
presympt_dmr_ranges <- id2ranges(dmrs_presympt_limma)
sympt_dmr_ranges <- id2ranges(dmrs_sympt_limma)
overlap_dmrs <- subsetByOverlaps(presympt_dmr_ranges, sympt_dmr_ranges)
ssvFeatureVenn(list(presympt_dmr_ranges,sympt_dmr_ranges), group_names = c("Pre-symptomatic cohort", "Symptomatic cohort"), counts_txt_size = 21)
```
## GO analysis
To explore enriched GO terms the `goana` package is used to perform a gene ontology analysis on the discovered DMR's. This is done three separate times, once on the genes in the presymptomatic set of DMR's, once on the genes in the symptomatic set of DMR's and a final time on the set of overlapping genes that are present in both sets of DMR's.
```{r}
#| label: tbl-annot
#| column: body-outset
#| tbl-cap: "Top GO terms in the overlapping DMR's"
mart <- useEnsembl("ensembl")
mart <- useDataset("hsapiens_gene_ensembl", mart)
attributes <- c("ensembl_gene_id","hgnc_symbol", "description", "chromosome_name", "start_position", "end_position", "strand", "entrezgene_id")
filters <- c("chromosomal_region")
dmr_presympt_finder_filtered <- dmr_presympt_limma$CategoryPE %>% filter(abs(logFC) > 0.1) %>%
rownames_to_column(var = "chromosomal_region") %>%
mutate(chromosomal_region = str_replace(chromosomal_region, "-", ":")) %>%
separate(chromosomal_region, into = c("chromosome_name", "start_position", "end_position"), sep = ":", convert = T, remove = F)
dmr_sympt_limma$CategoryPE <- dmr_sympt_limma$CategoryPE %>% filter(abs(logFC) > 0.1) %>%
rownames_to_column(var = "chromosomal_region") %>%
mutate(chromosomal_region = str_replace(chromosomal_region, "-", ":")) %>%
separate(chromosomal_region, into = c("chromosome_name", "start_position", "end_position"), sep = ":", convert = T, remove = F)
genes_presympt_finder <- getBM(attributes = attributes, filters = filters, values = dmr_presympt_finder_filtered$chromosomal_region, mart = mart)
genes_sympt_limma <- getBM(attributes = attributes, filters = filters, values = dmr_sympt_limma$CategoryPE$chromosomal_region, mart = mart)
#Perform GO analysis
GO_presympt <- goana(genes_presympt_finder$entrezgene_id, FDR = 0.05)
GO_sympt <- goana(genes_sympt_limma$entrezgene_id, FDR = 0.05)
unique_presympt <- genes_presympt_finder[genes_presympt_finder$ensembl_gene_id %nin% genes_sympt_limma$ensembl_gene_id,]
unique_sympt <- genes_sympt_limma[genes_sympt_limma$ensembl_gene_id %nin% genes_presympt_finder$ensembl_gene_id,]
overlap_genes <- genes_presympt_finder[genes_presympt_finder$ensembl_gene_id %in% genes_sympt_limma$ensembl_gene_id,]
GO_overlap <- goana(overlap_genes$entrezgene_id, FDR = 0.05)
```
::: panel-tabset
### Presymptomatic
```{r}
#| label : tbl-presympt_GO
kable(topGO(GO_presympt))
```
### Symptomatic
```{r}
#| label : tbl-sympt_GO
kable(topGO(GO_sympt))
```
### Overlap
```{r}
#| label : tbl-overlap_GO
kable(topGO(GO_overlap))
```
:::
## Genes of interest
```{r}
#| label: GOI
GOI <- c("FLT1", "PIGF", "ENG", "VEGFA", "HIF1A", "PAPPA")
#Get gene annotation for clusters
clusters_annotated <- clusters_expect %>%
rownames_to_column(var = "chromosomal_region") %>%
mutate(chromosomal_region = str_replace(chromosomal_region, "-", ":")) %>%
separate(chromosomal_region, into = c("chromosome_name", "start_position", "end_position"), sep = ":", convert = T, remove = F)
genes_clusters <- getBM(attributes = attributes, filters = filters, values = clusters_annotated$chromosomal_region, mart = mart)
#Annotate genes of interest
clusters_expect_annotated <- fuzzy_left_join(genes_clusters[genes_clusters$hgnc_symbol %in% GOI,], clusters_annotated,
by = c("chromosome_name", "start_position", "end_position"),
match_fun=list(`==`, `<=`, `>=`))
#Look at genes in different groups
PIGF_meth <- clusters_expect_annotated %>% filter(hgnc_symbol=="PIGF") %>%
select(contains("DNA")) %>%
t(.) %>%
as_tibble(rownames=NA) %>%
rownames_to_column(var="Sample_id") %>%
rowwise() %>%
mutate(Methylation = mean(c(V1, V2, V3))) %>%
ungroup() %>%
right_join(annotation) %>%
mutate(Category = paste(Group, Category))
ggplot(PIGF_meth, aes(Category, Methylation)) +
geom_boxplot() +
geom_beeswarm(aes(color = Category), stroke = 1, shape = 1) +
stat_compare_means(comparisons = list(c("Presymptomatic Control", "Presymptomatic PE"),c("Symptomatic Control", "Symptomatic PE"),c("Presymptomatic Control", "Symptomatic Control"), c("Presymptomatic PE", "Symptomatic PE")),method ="wilcox.test", p.adjust.methods = "BH", label = "p.signif")+
scale_color_manual(values = color_palette) +
ylab("Mean PIGF methylation level") +
xlab("")+
guides(color = F)+
theme_classic()
FLT1_meth <- clusters_expect_annotated %>% filter(hgnc_symbol=="FLT1") %>%
select(contains("DNA")) %>%
t(.) %>%
as_tibble(rownames=NA) %>%
rownames_to_column(var="Sample_id") %>%
rowwise() %>%
mutate(Methylation = mean(c_across(c(starts_with("V"))))) %>%
ungroup() %>%
right_join(annotation) %>%
mutate(Category = paste(Group, Category))
ggplot(FLT1_meth, aes(Category, Methylation)) +
geom_boxplot() +
geom_beeswarm(aes(color = Category), stroke = 1, shape = 1) +
stat_compare_means(comparisons = list(c("Presymptomatic Control", "Presymptomatic PE"),c("Symptomatic Control", "Symptomatic PE"),c("Presymptomatic Control", "Symptomatic Control"), c("Presymptomatic PE", "Symptomatic PE")),method ="wilcox.test", p.adjust.methods = "BH", label = "p.signif")+
scale_color_manual(values = color_palette) +
ylab("Mean FLT1 methylation level") +
xlab("")+
guides(color = F)+
theme_classic()
```
# Longitudinal analysis
```{r}
#| label: long_plots
#| fig-cap: "General genome-wide methylation trends across pregnancy differ between preeclampsia cases and controls."
#|
#Select only included samples
clusters_long <- clusters_expect %>% select(all_of(long_annotation$Sample_id))
#Get mean methylation values per sample
mean_meth_long <- t(clusters_long) %>%
rowMeans2(useNames = TRUE, na.rm = T) %>%
as_tibble(rownames = NA) %>%
rownames_to_column(var = "Sample_id") %>%
full_join(long_annotation)
#Look at methylation trends of PIGF and SFLT
annotated_long <- clusters_long %>%
rownames_to_column(var = "chromosomal_region") %>%
mutate(chromosomal_region = str_replace(chromosomal_region, "-", ":")) %>%
separate(chromosomal_region, into = c("chromosome_name", "start_position", "end_position"), sep = ":", convert = T, remove = F)
annotated_long <- fuzzy_left_join(genes_clusters[genes_clusters$hgnc_symbol %in% GOI,], annotated_long,
by = c("chromosome_name", "start_position", "end_position"),
match_fun=list(`==`, `<=`, `>=`))
# plot PIGF changes
PIGF_meth_long <- filter(annotated_long, hgnc_symbol == "PIGF") %>%
dplyr::select(long_annotation$Sample_id) %>%
summarise(across(everything(), mean, na.rm = T)) %>% t() %>%
as_tibble(rownames = NA) %>%
rownames_to_column(var = "Sample_id") %>%
dplyr::rename(PIGF = V1) %>%
left_join(long_annotation)
#Plot sFLT1
FLT_meth_long <- filter(annotated_long, hgnc_symbol == "FLT1") %>%
dplyr::select(long_annotation$Sample_id) %>%
summarise(across(everything(), mean, na.rm = T)) %>% t() %>%
as_tibble(rownames = NA) %>%
rownames_to_column(var = "Sample_id") %>%
dplyr::rename(FLT1 = V1) %>%
left_join(long_annotation)
(ggplot(mean_meth_long) +
geom_point(aes(x = Gest_age_weeks, y = value, color = Category)) +
geom_smooth(aes(x = Gest_age_weeks, y = value, color = Category, fill = Category))+
scale_color_manual(values = color_palette,labels = c("Control", "Preeclampsia")) +
scale_fill_manual(values = color_palette,labels = c("Control", "Preeclampsia")) +
labs(x = "Gestational age at time of sampling (weeks)", y = "Mean genome wide methylation level") +
theme_classic()) /
(ggplot(FLT_meth_long) +
geom_point(aes(Gest_age_weeks, FLT1, color = Category)) +
geom_smooth(aes(Gest_age_weeks, FLT1, color = Category, fill = Category)) +
scale_color_manual(values = color_palette,labels = c("Control", "Preeclampsia"))+
scale_fill_manual(values = color_palette,labels = c("Control", "Preeclampsia")) +
labs(x = "Gestational age at time of sampling (weeks)", y = "sFLT1 methylation value")+
guides(color = F, fill = F) +
theme_classic() +
ggplot(PIGF_meth_long) +
geom_point(aes(Gest_age_weeks, PIGF, color = Category)) +
geom_smooth(aes(Gest_age_weeks, PIGF, color = Category, fill = Category)) +
scale_color_manual(values = color_palette,labels = c("Control", "Preeclampsia"))+
scale_fill_manual(values = color_palette,labels = c("Control", "Preeclampsia")) +
labs(x = "Gestational age at time of sampling (weeks)", y = "PIGF methylation value")+
theme_classic()) + patchwork::plot_annotation(tag_levels = "A") + plot_layout(guides = "collect") & theme(legend.position = 'bottom')
```
Mean whole genome methylation follows a similar trend for preeclampsia patients and controls across pregnancy duration. At 11-13 weeks of GA, a lower global methylation is noted for preeclampsia cases (A) . Methylation of two preeclampsia related genes, sFLT-1 (B) and PIGF (C) follows slightly different patterns for controls and preeclampsia patients, ending at a lower methylation level in preeclampsia patients compared to controls.
## DMR's in the longitudinal cohort
Due to the small overlap between DMR's calculated on the pre-symptomatic and symptomatic cohorts, we hypothesize that these DMR's also undergo significant changes throughout the pregnancy duration. To investigate this groups of DMR's are extracted from the hierarchical clusterings made in the heatmap figures of the DMR section. The average methylation of these groups is then investigated over time.
::: panel-tabset
### Symptomatic DMR's
```{r}
#| label: long_dmr_clustering
#Get longitudinal cohort cluster methylation for symptomatic DMRs
long_sympt_dmr <- clusters_long[str_replace(dmr_sympt_limma$CategoryPE$chromosomal_region, "(?<=[0-9][0-9][0-9]):(?=[0-9][0-9])", "-") ,] %>% t() %>%
as_tibble()
colnames(long_sympt_dmr) <- str_replace_all(paste0("region_", colnames(long_sympt_dmr)), "[-:]", "_")
long_sympt_dmr <- long_sympt_dmr %>%
bind_cols(Sample_id = colnames(clusters_long)) %>%
relocate(Sample_id) %>%
full_join(long_annotation)
#Get clusters used in previous heatmapfigure (look at top 250 DMRs)
sympt_memb <- cutree(p_dmr250_cluster_heatmap$tree_col, k = 4)
names(sympt_memb) <- str_replace_all(paste0("region_", names(sympt_memb)), "[-:]", "_")
plot_regioncluster_long <- function(dmr_methylation, sample_ids, annotation, region_clusters) {
for (i in unique(region_clusters)) {
plot_data <- dmr_methylation %>% select(any_of(names(region_clusters[region_clusters == i]))) %>% as.matrix() %>%
rowMeans2(na.rm = T) %>% as_tibble() %>%
bind_cols(Sample_id = sample_ids) %>%
full_join(annotation)
plot <- ggplot(plot_data) +
geom_point(aes(Gest_age_weeks, value, color = Category)) +
geom_smooth(aes(Gest_age_weeks, value, color = Category, fill = Category)) +
scale_color_manual(values = color_palette, labels = c("Control", "Preeclampsia"))+
scale_fill_manual(values = color_palette, labels = c("Control", "Preeclampsia"))+
scale_y_continuous(limits = c(0,1)) +
xlab("Gestational age (weeks)") +
ylab("Methylation value") +
theme_classic()
if (exists("full_plot")) {
full_plot <- full_plot + plot
} else {
full_plot <- plot
}
}
(full_plot + plot_layout(guides = "collect") + patchwork::plot_annotation(tag_levels = "A")) & theme(legend.position = 'bottom')
}
plot_regioncluster_long(long_sympt_dmr, colnames(clusters_long), long_annotation, sympt_memb)
```
### Pre-symptomatic DMR's
```{r}
#| label : long_dmr_clustering_presympt
#|
#Get longitudinal cohort cluster methylation for pre-symptomatic DMRs
bs_long <- bs_expect[,long_annotation$Sample_id]
long_pre_dmr <- suppressWarnings(getMeth(bs_long, regions = dmr_presympt_finder_filtered, what = "perRegion", type = "smooth"))
rownames(long_pre_dmr) <- paste("region",dmr_presympt_finder_filtered$chr, dmr_presympt_finder_filtered$start, dmr_presympt_finder_filtered$end, sep = "_")
long_pre_dmr <- t(long_pre_dmr) %>% as_tibble(rownames = NA)
#Get clusters used in previous heatmapfigure (look at top 250 DMRs)
presympt_memb <- cutree(p_dmrfinder250_heatmap$tree_col, k = 4)
names(presympt_memb) <- str_replace_all(paste0("region_", dmr_presympt_finder_filtered[1:250,]$chromosomal_region), "[-:]", "_")
plot_regioncluster_long(long_pre_dmr, colnames(clusters_long), long_annotation, presympt_memb)
```
:::
```{r}
#| label: long_dmr
#| fig-cap: "A limited set of DMRs can differentiate most cases and controls in a gestational age independent manner"
#Compute DMRs that are consistent in the longitudinal group
long_dmr_design <- model.matrix(~ Category + Gest_age_weeks + Batch, long_annotation)
long_dmr <- train_limma(clusters_long, long_dmr_design)
heatmap_annot_long <- long_annotation %>% column_to_rownames(var = "Sample_id") %>%
select(Category, Gest_age_weeks, Batch) %>%
rename(`Gestational age (weeks)` = Gest_age_weeks)
hm_data <- t(clusters_long[rownames(long_dmr$CategoryPE),])
meth_heatmap(hm_data, annotation = heatmap_annot_long)
```
A total of `r dim(long_dmr$CategoryPE)[1]` DMR's are found to be significant between cases and controls that are independent of the gestational age of the sample. These DMR's arent able to fully make a destinction between cases and controls for all included samples.
# More
The code for the machine learning part of the paper can be found [here](expect_classifiers.qmd) or use the navigation at the top of this page.