-
Notifications
You must be signed in to change notification settings - Fork 0
/
PhD_thesis_code.Rmd
1428 lines (1211 loc) · 50.2 KB
/
PhD_thesis_code.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title: "An improved pipeline for LC-MS spectra processing and annotation"
always_allow_html: true
output:
html_document:
toc: true
# toc_float: false
number_sections: false
toc_depth: 4
author: Elzbieta Lauzikaite
date: '`r format(Sys.time(), "%d %B, %Y")`'
---
<style>
pre code, pre, code {
white-space: pre !important;
overflow-x: scroll !important;
word-break: keep-all !important;
word-wrap: initial !important;
}
</style>
```{r setup, echo = FALSE, include = FALSE}
options(width = 999)
knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
# Color Format
colFmt <- function(x, color) {
outputFormat <- knitr::opts_knit$get("rmarkdown.pandoc.to")
if (outputFormat == "latex") {
paste("\\textcolor{", color, "}{", x, "}", sep = "")
} else if (outputFormat == "html") {
paste("<font color='", color, "'>", x, "</font>", sep = "")
} else {
x
}
}
```
The details of the PhD thesis analyses are provided in this document. The **R** and **Python** scripts utilised in the analyses are sorted according to the chapters in which they appeared. Links to all required external libraries, as well as additional developed scripts are provided in the document.
`massFlowR` code developed during the PhD is available on its [GitHub repository](https://github.com/lauzikaite/massFlowR). Only the names of the functions and methods exported from the `massFlowR` namespace are provided in this document.
# Glossary
**PCS** - *pseudo chemical spectra* - list of structurally related co-eluting features
**ROI** - *regions of interest* - reference RT and m/z windows in which to search for a feature
**IPC** - *Imperial Phenome Centre*
# Computing environment
Analyses were performed on **R** and **Python** environments. Versions for the most version-sensitive libraries are provided.
```{r, eval=TRUE,message=FALSE,results='asis',echo=FALSE}
options(knitr.kable.NA = "")
library(magrittr)
library(kableExtra)
dt <- setNames(
data.frame(
c("3.0.0", NA),
c("3.4.4", NA),
c(NA, "1.2.1"),
row.names = c("XCMS", "nPYc")
),
nm = c("3.4.0", "3.5.1", "3.6")
)
kable(dt,
align = rep("c", 3)
) %>%
kable_styling("striped", full_width = TRUE, position = "left") %>%
add_header_above(c(" " = 1, "R" = 2, "Python" = 1))
```
To install all **R** dependencies required in this document, run the following function:
```{r}
# To install dependencies
install_bioconductor <- function() {
dependencies <- c("xcms", "MSnbase", "faahKO", "igraph", "doParallel", "foreach", "ggplot2", "viridis", "gridExtra", "tidyr", "cowplot", "viridis", "dplyr", "IPO", "peakPantheR")
installed <- installed.packages()
to_install <- subset(dependencies, !(dependencies %in% installed[, "Package"]))
if (length(to_install) != 0) {
if (!requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
message("Installing packages: ", to_install, " ...")
BiocManager::install(to_install)
} else {
message("All dependencies are already installed.")
}
}
install_bioconductor()
## finally, install massFlowR
devtools::install_github("lauzikaite/massflowR", dependencies = FALSE)
```
# Chapter 2
## Standard peakPantheR workflow
Endogenous metabolites were detected and integrated using R package `peakPantheR`, which is available on [Bioconductor](https://bioconductor.org/packages/release/bioc/html/peakPantheR.html). For details on the workflow, please refer to the [GitHub](https://github.com/phenomecentre/peakPantheR/) repository and provided tutorials.
The preliminary RT and m/z ROI were kindly provided by the IPC team. These regions were further optimised using `peakPantheR` package, with the final values provided in **Thesis Appendix A and Appendix B**. A standard `peakPantheR` worflow was applied as follows.
```{r}
library(peakPantheR)
# Inputs ----------------------------------------------------------------
meta <- read.csv(
file = "path to study metadata", # study metadata with sample acquisition order and sample type
header = TRUE, stringsAsFactors = FALSE
)
## select pooled QC samples only
selected_dt <- subset(meta[order(meta$run_order), ], class == "Study Pool")
# New annotation ----------------------------------------------------------
new_anno <- peakPantheR_loadAnnotationParamsCSV(CSVParamPath = "path to table with ROI values")
new_anno <- resetAnnotation(new_anno,
spectraPaths = meta$filepath,
spectraMetadata = meta,
useUROI = TRUE,
useFIR = TRUE
)
new_anno_res <- peakPantheR_parallelAnnotation(new_anno,
ncores = 6,
verbose = TRUE
)
final_anno <- new_anno_res$annotation
cols <- unname(setNames(viridis::viridis(n = nrow(meta)),
nm = seq(nrow(meta))
)[spectraMetadata(final_anno)$total_order])
outputAnnotationDiagnostic(final_anno,
saveFolder = "path to output directory",
savePlots = TRUE,
sampleColour = cols,
verbose = TRUE
)
outputAnnotationResult(final_anno,
saveFolder = "path to output directory",
annotationName = "study name",
verbose = TRUE
)
# Re-integrate missed samples ----------------------------------------------
miss_dt <- subset(meta, filepath %in% new_anno_res$failures$file)
miss_anno <- resetAnnotation(final_anno,
spectraPaths = new_anno_res$failures$file,
spectraMetadata = miss_dt,
useUROI = TRUE,
useFIR = TRUE
)
miss_anno_res <- peakPantheR_parallelAnnotation(miss_anno,
ncores = 0, ## serial, since macOS tend to fail mzML reading
verbose = TRUE
)
```
## IPO optimisation
It was attempted to optimise centWave peak-picking parameters with an open-source package `IPO`, which is available on [Bioconductor](https://www.bioconductor.org/packages/release/bioc/html/IPO.html).
Parameters which were not optimised by `IPO` were provided as a single starting value. Parameters that were set for optimisation were listed with lower and upper starting values:
* min_peakwidth - minimum peak width in chromatographic space, sec
* max_peakwidth - maximum peak width in chromatographic space, sec
* noise - centroids with intensity < noise are omitted from ROI detection
* prefilter - during ROI detection, mass traces are only retained if they contain at least [prefilter] peaks
* value_of_prefilter - during ROI detection, mass traces are only retained if they contain at least [prefilter] peaks with [value_of_prefilter] intensity
* snthresh - signal/noise ratio: ([maximum peak intensity] - [estimated baseline value]) / standard deviation of local chromatographic noise
```{r}
library(IPO)
## set centWave parameters for optimisation
ppparam <- getDefaultXcmsSetStartingParams('centWave')
ppparam$min_peakwidth <- c(1.5, 3)
ppparam$max_peakwidth <- c(5, 20)
ppparam$noise <- c(200, 1000)
ppparam$prefilter <- c(4, 10)
ppparam$value_of_prefilter <- c(500,10000)
ppparam$snthresh <- c(3, 5)
## set values not to be optimised
ppparam$ppm <- 25
ppparam$mzdiff <- 0.01
ppparam$integrate <- 2
ppparam$mzCenterFun <- 'wMean'
opp <- optimizeXcmsSet(files = files,
params = ppparam,
BPPARAM = MulticoreParam(workers = bw), # number of parallel workers
nSlaves = 1, # must be 1 if BPPARAM is used
subdir = "path to output directory"
)
print(opp$best_settings$result)
print(opp$best_settings$parameters)
```
## XCMS version 2 syntax
AIRWAVE study datasets were processed with the standard `XCMS` pipeline, using the **version 2 syntax**. The exact XCMS parameters vary from assay to assay, details for which are provided in **Thesis Chapter 2, Section 2.3.3 "XCMS pre-processing"**.
```{r}
library(xcms)
# Peak-picking ------------------------------------------------------------
xset <- xcmsSet(
files = "paths to mzML files",
method = "centWave",
peakwidth = c(1.5, 14),
ppm = 25,
noise = 500,
snthresh = 5,
mzdiff = 0.01,
prefilter = c(10, 3000),
mzCenterFun = "wMean",
integrate = 2,
fitgauss = FALSE,
verbose.columns = FALSE,
BPPARAM = MulticoreParam(workers = ...)
)
# Grouping ----------------------------------------------------------------
gset <- group(xset,
method = "density",
minfrac = 0,
minsamp = 0,
bw = 2,
mzwid = 0.01
)
# RT correction -------------------------------------------------------------
rset <- retcor.peakgroups(gset,
plottype = "none",
smooth = "loess",
missing = 10,
extra = 10,
span = 10
)
# Grouping -----------------------------------------------------------------
grset <- group(rset,
method = "density",
minfrac = 0,
minsamp = 0,
bw = 2,
mzwid = 0.01
)
# Filling peaks ------------------------------------------------------------
fset <- fillPeaks(grset,
method = "chrom",
BPPARAM = MulticoreParam(workers = bw)
)
```
## QC pipeline
Standard quality control pipeline was applied to every processed dataset. Python library `nPYc` was utilised. It is available through `pip`. For details please refer to its extensive [documentation](https://npyc-toolbox.readthedocs.io/en/latest/), or the [HitHub repository](https://github.com/phenomecentre/nPYc-Toolbox).
The main QC pipeline stages are outlined below.
```{python, eval=FALSE}
## Python script for post-processing QC assesment
import os
import matplotlib.pyplot as plt
import scipy
import pandas
import numpy
import pickle
import seaborn as sns
from plotly.offline import init_notebook_mode, iplot
import sys
import nPYc
import pyChemometrics
import copy
from nPYc.enumerations import VariableType, DatasetLevel, AssayRole, SampleType
from nPYc.utilities.normalisation import NullNormaliser, TotalAreaNormaliser, ProbabilisticQuotientNormaliser
## load dataset
dataset=nPYc.MSDataset('path to peak-picked dataset', fileType='XCMS', sop='GenericMS', ...)
dataset.addSampleInfo(descriptionFormat='Basic CSV', filePath='path to study metadata')
# =============================================================================
# Feature filtering
# =============================================================================
## is batch correction is desired?
datasetCorrected=dataset # if no batch correction is desired
datasetCorrected=nPYc.batchAndROCorrection.correctMSdataset(dataset, window=11) # if batch correction is desired
## extract RSD and correlation to dilution values for each feature
rsd=nPYc.utilities.rsdsBySampleType(datasetCorrected)
featuresOutput={
'peakid': dataset.featureMetadata['peakid'],
'rsd': nPYc.utilities.rsd(dataset.intensityData),
'rsd_StudyPool': rsd['Study Pool'],
'rsd_ExternalReference': rsd['External Reference'],
'cor_to_dilution': dataset.correlationToDilution
}
dt=pandas.DataFrame(data=featuresOutput)
## apply feature and sample masks
## feature correction according to standard nPYc criteria: correlation to dilution and RSD
datasetCorrected.updateMasks(
sampleTypes=[SampleType.StudySample, SampleType.StudyPool],
assayRoles=[AssayRole.Assay, AssayRole.PrecisionReference],
filterFeatures=True)
datasetCorrected.applyMasks()
# =============================================================================
# PCA: scores, loadings and correlation
# =============================================================================
## build cross-validated PCA model
PCAmodel=nPYc.multivariate.exploratoryAnalysisPCA(datasetCorrected, scaling=1)
## extract hotellings values
t2=PCAmodel.hotelling_T2(comps=[0,1])
angle=numpy.arange(-numpy.pi, numpy.pi, 0.01)
x=t2[0] * numpy.cos(angle)
y=t2[1] * numpy.sin(angle)
hotelling=pandas.DataFrame(x) # final table with values
## get PCA scores
scores=PCAmodel.scores
df=pandas.DataFrame(scores)
df.columns=['PC' + str(i + 1) for i in range(scores.shape[1])]
df['Sample File Name']=datasetCorrected.sampleMetadata['Sample File Name']
df['Class']=datasetCorrected.sampleMetadata['Class'] # final table with values
## variance explained by each PC
var=pandas.DataFrame(PCAmodel.modelParameters['VarExpRatio'])
var.columns=['Variance'] # final table with values
## PCA scores correlation with continuous data
cont_cor=list()
variables=['Backing', 'Collision', 'Detector', 'TOF', 'Run Order', 'Age']
for var_name in variables:
cor=nPYc.multivariate.pcaSignificance(values=PCAmodel.scores,
classes=datasetCorrected.sampleMetadata[var_name],
valueType='continuous')
cont_cor.append(cor)
cont_cor=pandas.DataFrame(cont_cor)
cont_cor.columns=['PC' + str(i + 1) for i in range(cont_cor.shape[1])]
cont_cor.index=variables
## PCA scores correlation with categorical data
cat_cor=list()
variables=['Well', 'Plate', 'Sample position', 'Sample batch', 'Gender', 'BMI_cat']
for var_name in variables:
cor=nPYc.multivariate.pcaSignificance(values=PCAmodel.scores,
classes=datasetCorrected.sampleMetadata[var_name],
valueType='categorical')
cat_cor.append(cor)
cat_cor=pandas.DataFrame(cat_cor)
cat_cor.columns=['PC' + str(i + 1) for i in range(cat_cor.shape[1])]
cat_cor.index=variables
## bind both tables together and save into one table
df=pandas.concat([cont_cor, cat_cor])
df['variable']=['continuous' if x in ['Backing', 'Collision', 'Detector', 'TOF', 'Run Order', 'Age'] else 'categorical' for x in df.index]
```
# Chapter 3
## Pipeline development
### Pseudo chemical spectra generation
#### Gaussian shape analysis
To evaluate how well EIC correlation distinguishes similar peaks among co-eluting peaks, a simulation was first performed with peaks of identical shape. A characteristic chromatographic peak, missing a few scans and fitting a Gaussian curve well, was identified in the raw data of a representative urine sample.
All required functions are provided in the *simulation-selfEIC-correlation.R* script. Load the required functions:
```{r, }
source(file = "./Chapter_3/simulation-selfEIC-correlation.R")
```
Simulation was performed using DEVSET study's pooled quality control sample **PipelineTesting_RPOS_ToF10_U1W24_SR**.
```{r}
## Peak-pick a raw file
rw <- MSnbase::readMSData("path to the raw mzML file",
msLevel. = 1,
mode = "onDisk")
cwt <- xcms::CentWaveParam(
ppm = 25,
snthresh = 5,
noise = 200,
prefilter = c(10, 5000),
peakwidth = c(1.5, 5),
mzdiff = 0,
verboseColumns = TRUE, # essential to extract Gaussian model values
fitgauss = TRUE # fits Gaussian model to the chromatographic shape
)
## to avoid a reported bug of centWave not returning Gaussian fit results:
## register serial backend and load XCMS fully instead of taking function from its namespace
## https://github.com/sneumann/xcms/issues/352
library(xcms)
BiocParallel::register(BiocParallel::SerialParam())
res <- findChromPeaks(object = rw,
param = cwt)
pks <- data.frame(chromPeaks(res))
pks <- pks[order(pks$into, decreasing = TRUE), ] # order by decreasing intensity to have nice peaks first
## extract EIC for every picked peak
eic <- xcms::chromatogram(rw,
rt = data.frame(rt_lower = pks$rtmin, rt_upper = pks$rtmax),
mz = data.frame(mz_lower = pks$mzmin, mz_upper = pks$mzmax)
)
clean_eic <- lapply(1:nrow(eic), function(ch) {
MSnbase::clean(eic[ch, ], na.rm = T)
})
## Identify a peak with an elution profile that fits gaussian the best
## Function will make a plot for every peak
fit <- lapply(1:nrow(pks),
FUN = checkPk,
eic = eic)
p <- 25 # this peak was selected at random from the appropriatelly looking peaks
## Perform self-EIC-correlation using the reference peak
corPk(pk = p,
out_dir = "path to output directory",
eic = eic)
```
#### EIC correlation of endogenous metabolites
EIC correlation of ions corresponding to 15 endogenous metabolites in DEVSET samples was performed. All required functions are provided in the *endogenous-metabolites-EICcorrelation.R* script. Load the required functions:
```{r, }
source(file = "./Chapter_3/endogenous-metabolites-EICcorrelation.R")
```
First, 15 `r colFmt("endogenous metabolites and their main adducts and in-source fragments", "Crimson")` were identified in the LC-MS spectra of all DEVSET samples using m/z and RT regions kindly provided by the IPC team. The integration regions were optimised using `peakPantheR` package. General script for the workflow is provided in this document's section [standard peakPantheR workflow](#standard-peakpanther-workflow).
```{r}
# Peak-pick DEVSET samples ------------------------------------------------
samples_rnames <- "paths to DEVSET raw mzML files"
raw <- MSnbase::readMSData(samples_rnames,
msLevel. = 1,
mode = "onDisk"
)
raw_ls <- split(raw, fromFile(raw))
cwt <- xcms::CentWaveParam(
ppm = 25,
snthresh = 5,
noise = 200,
prefilter = c(10, 5000),
peakwidth = c(1, 5),
mzdiff = 0,
verboseColumns = TRUE
)
doParallel::registerDoParallel(cores = 6)
result <- foreach::foreach(
f = samples_rnames,
.inorder = TRUE,
.errorhandling = "pass"
) %dopar% {
raw <- massFlowR:::readDATA(f = f)
res <- xcms::findChromPeaks(
object = raw,
param = cwt
)
pks <- data.frame(xcms::chromPeaks(res))
return(pks)
}
# Find features corresponding to metabolites ------------------------------
## mz and rt regions for endogenous metabolites were optimised using peakPantheR library
## script is provided in this document under Chapter 2, "Standard peakPantheR workflow"
## mz/rt regions for DEVSET samples are provided in Thesis Appendix B, Table B.1
standards <- read.csv(
file = "metabolites_roi.csv",
header = TRUE,
stringsAsFactors = FALSE
)
ids <- unique(standards$cpdID.metabolite)
selected_standards <- ids
roi <- prepROI(
data_dir = ppr_dir,
metabolites_ids = gsub("-", ".", standards$cpdID),
samples_fnames = samples_fnames
)
## find corresponding features
matches <- lapply(samples_ind, function(ns) {
dat <- result[[which(samples_ind == ns)]]
roi_ns <- roi[[which(samples_ind == ns)]]
lapply(seq(nrow(roi_ns)), function(n) {
dat[which(
dat$mz >= roi_ns$mzMin[n] &
dat$mz <= roi_ns$mzMax[n] &
dat$rt >= roi_ns$rtMin[n] &
dat$rt <= roi_ns$rtMax[n]
), c("mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "scpos", "into")]
})
})
## omit samples that failed to be integrated with ppR (if any)
sel_samples_ind <- samples_ind[-c(which(sapply(roi, is.null)))]
sel_samples_rnames <- samples$raw_filepath[sel_samples_ind]
sel_samples_fnames <- samples$filename[sel_samples_ind]
```
Next, the `r colFmt("EIC of the centWave features", "Crimson")` corresponding to metabolites adducts and in-source fragments were correlated.
```{r, }
# EIC correlation of endogenous metabolites ions ---------------------------
cor_results <- list()
for (id in ids) {
message("checking compound: ", id)
id_inds <- which(standards$cpdID.metabolite == id)
## correlated main adduct and its validated adduct(s)
matches_id <- lapply(sel_samples_ind, function(ns) {
matches_ind <- matches[[which(samples_ind == ns)]][id_inds]
matches_n <- unlist(lapply(matches_ind, nrow))
## if there are multiple matches, the the one which is closer in scpos to the adduct match
if (length(which(matches_n > 1)) > 0) {
if (length(which(matches_n == 1)) == 1) {
scpos_ref <- matches_ind[[which(matches_n == 1)]]$scpos
} else {
into_highest <- which.max(matches_ind[[1]]$into)
scpos_ref <- matches_ind[[1]]$scpos[into_highest]
}
lapply(matches_ind, function(dat) {
match_closest <- which.min(abs(dat$scpos - scpos_ref))
dat[match_closest, ]
})
} else {
matches_ind
}
})
## (2) extract eic
matches_eic <- lapply(sel_samples_ind, function(ns) {
matches_ind <- matches_id[[which(sel_samples_ind == ns)]]
## extract EIC if atleast two adducts are present
if (length(which(sapply(matches_ind, nrow) > 0)) >= 2) {
massFlowR:::extractEIC(raw_ls[[which(samples_ind == ns)]],
pks = do.call(rbind, matches_ind)
)
}
})
matches_eic_ind <- which(!sapply(matches_eic, is.null))
## (3) correlate eic of the main adduct (1st in the list) and all associated adducts
matches_cor <- lapply(matches_eic_ind, function(ns) {
eic <- matches_eic[[ns]]
rx <- MSnbase::rtime(eic[[1]])
unlist(lapply(2:length(eic), function(y) {
ry <- MSnbase::rtime(eic[[y]])
common_scan <- base::intersect(rx, ry)
if (length(common_scan) > 3) {
ix <-
as.numeric(MSnbase::intensity(eic[[1]])[which(rx %in% common_scan)]) ## main adduct
iy <-
as.numeric(MSnbase::intensity(eic[[y]])[which(ry %in% common_scan)]) ## iterator
cor(ix, iy, method = "pearson", use = "pairwise.complete.obs")
} else {
0
}
}))
})
## save output
cor_results[[length(cor_results) + 1]] <- matches_cor
}
## Make plots
cor_all <- lapply(selected_standards, function(id) {
data.frame(
cor = unlist(cor_results[[which(selected_standards == id)]]),
id = id,
metabolite = standards$cpdName[standards$cpdID.metabolite == id][1],
stringsAsFactors = FALSE
)
})
cor_all <- do.call(rbind, cor_all)
cor_all <- subset(cor_all, cor > 0)
library(ggplot2)
ggplot(data = cor_all) +
geom_density(aes(x = cor, group = metabolite), fill = "#440154FF", color = "white") +
scale_x_continuous("EIC correlation") +
scale_y_continuous("Density") +
geom_vline(aes(xintercept = 0.95), linetype = "dashed") +
facet_wrap(~metabolite, ncol = 3, scales = "free_y") +
theme_bw(base_size = 12) +
theme(
legend.position = "bottom",
panel.grid.major = ggplot2::element_blank(),
panel.grid.minor = ggplot2::element_blank(),
axis.line = ggplot2::element_line(size = 0.1)
)
```
### Feature alignment across samples
Spectral similarity between PCS that comprise of features corresponding to endogenous metabolites was analysed using three intensity scaling strategies:
1. No scaling
2. Square-root intensity scaling
3. Weight-based intensity scaling
Scaled spectra was then normalised to the total magnitude of the spectral vector. A simple spectral dot product function was then applied to determine spectral similarity between the PCS in the first DEVSET sample in which adducts of metabolite-of-interest were detected and all following samples. All required functions are provided in the *endogenous-metabolites-spectral-scaling.R* script. Load required functions:
```{r, }
source(file = "./Chapter_3/endogenous-metabolites-spectral-scaling.R")
```
Firstly, `r colFmt("features corresponding to endogenous metabolites", "Crimson")` were found as in the earlier [section](#eic-correlation-of-endogenous-metabolites).
Then, the `r colFmt("spectral similarity was compared", "Crimson")` using different scaling methods.
```{r}
library(dplyr)
library(ggplot2)
## all other required packaged are listed in the endogenous-metabolites-spectral-scaling.R script
## they need to be installed, but not loaded
# For each metabolite -----------------------------------------------------
for (id in ids) {
message("checking compound: ", id)
id_inds <- which(standards$cpdID.metabolite == id)
## extract main adduct and its validated adduct(s) from all samples
matches_id <- lapply(sel_samples_ind, function(ns) {
matches_ind <- matches[[which(samples_ind == ns)]][id_inds]
matches_n <- unlist(lapply(matches_ind, nrow))
## if there are multiple matches, take the one which is closer in 'scpos' to the adduct match
if (length(which(matches_n > 1)) > 0) {
if (length(which(matches_n == 1)) == 1) {
scpos_ref <- matches_ind[[which(matches_n == 1)]]$scpos
} else {
into_highest <- which.max(matches_ind[[1]]$into)
scpos_ref <- matches_ind[[1]]$scpos[into_highest]
}
lapply(matches_ind, function(dat) {
match_closest <- which.min(abs(dat$scpos - scpos_ref))
dat[match_closest, ]
})
} else {
matches_ind
}
})
## samples in which features corresponding to both adducts are available
## samples in which features corresponding to both adducts are in the same peakgr
peakgroups_id <- sapply(sel_samples_ind, function(ns) {
matches_ns_1 <- matches_id[[which(sel_samples_ind == ns)]][[1]]
matches_ns_2 <- matches_id[[which(sel_samples_ind == ns)]][[2]]
if (all(nrow(matches_ns_1) > 0 & nrow(matches_ns_2) > 0)) { # if none of the adducts have duplicating features
if (matches_ns_1$tmp_peakgr == matches_ns_2$tmp_peakgr) { # if both adducts are assigned to the same peakgroup
matches_ns_1$tmp_peakgr
} else {
0
}
} else {
0
}
})
## extract whole peakgroups from every sample
peakgroups_dat <- lapply(sel_samples_ind[which(peakgroups_id != 0)], function(ns) {
dat <- object@data[[ns]]
pkg <- peakgroups_id[[which(sel_samples_ind == ns)]]
dat <- dat[dat$tmp_peakgr == pkg, ]
if (nrow(dat) > 0) {
## scale vector un three different ways
# dat$into <- dat$into # unscaled, redundant
dat$into_sqrt <- sqrt(dat$into) # sqrt-scaled
dat$into_weight <- apply(setNames(dat[, c("mz", "into")], # weight-scaled
nm = c("mz", "into")
),
1,
## normalise scaled vector to the total magnitude of the spectral vector
FUN = scaleWEIGHT
)
## now normalise scaled vectors to the total magnitude of the spectral vecto
dat$into_norm <- normMAGN(into = dat$into) # no-scaling, normalised
dat$into_sqrt_norm <- normMAGN(into = dat$into_sqrt) # sqrt-scaling, nornalised
dat$into_weight_norm <- normMAGN(into = dat$into_weight) # weight-scaling, normalised
## sample number/index here refers to run order
dat$run_order <- ns
## mark whether extracted feature corresponds to either of the validated adducts
## 1 - the main ion
## 2 - any other ion in the peakPantheR list
## these will be used to color the spectral 'pins'
dat$adduct <- 0
dat$adduct[dat$tmp_peakid %in% matches_id[[which(sel_samples_ind == ns)]][[1]]$tmp_peakid] <- 1
dat$adduct[dat$tmp_peakid %in% matches_id[[which(sel_samples_ind == ns)]][[2]]$tmp_peakid] <- 2
return(dat)
}
})
peakgroups_dat <- do.call(rbind, peakgroups_dat)
## build spectra: reference spectra comes from the 1st sample that this peakgroup was built in
ref <- subset(peakgroups_dat, run_order == min(run_order))
## sample spectra: all other samples that contain this peakgroup
sample_specs <- subset(peakgroups_dat, run_order != min(run_order))
# obtain cosines ----------------------------------------------------------
cos <- data.frame()
for (ro in unique(sample_specs$run_order)) {
## extract peaks from the reference spectra and sample spectra
ds <- subset(sample_specs, run_order == ro)
all_peaks <- c(ref$mz, ds$mz)
## put all peaks mz values together
spec <- data.frame(
mz = seq(
from = min(all_peaks),
to = max(all_peaks),
by = 0.01
)
)
spec$bin <- 1:nrow(spec)
cos_ls <-
lapply(c("none", "sqrt", "weight"), function(scale_i) {
getCOS(
spec = spec, target_peaks = ref, matched_peaks = ds,
scale = scale_i, norm = TRUE
)
})
cos <- rbind(
cos,
data.frame(
run_order = ro,
no_scale_norm = cos_ls[[1]],
scale_sqrt_norm = cos_ls[[2]],
scale_weight_norm = cos_ls[[3]]
)
)
}
# make spectra plots ------------------------------------------------------
n_peaks <- peakgroups_dat %>%
group_by(run_order) %>%
summarise(n = n()) %>%
ungroup()
## how many peaks are in each sample?
cos$n_peaks <- n_peaks$n[match(cos$run_order, n_peaks$run_order)]
cos_long <- tidyr::gather(cos, key = "scaling", value = cos, -run_order, -n_peaks)
gg_labels <- setNames(c("No-scaling, normalised", "Sqrt-scaled, normalised", "Weight-scaled, normalised"),
nm = c("no_scale_norm", "scale_sqrt_norm", "scale_weight_norm")
)
## Plot distribution accr to number of features per pseudo chemical spectra
ggplot(cos_long) +
geom_boxplot(aes(x = as.factor(n_peaks), y = cos, fill = scaling), position = "dodge2") +
scale_fill_viridis_d(
begin = 0.1,
name = "Spectra processing",
labels = gg_labels
) +
facet_wrap(~scaling,
labeller = as_labeller(gg_labels)
) +
scale_y_continuous(name = "Spectral similarity") +
scale_x_discrete(name = "Number of features in pseudo chemical spectra") +
ggtitle(paste0(standards$cpdName[id_inds][1], ", ", id)) +
theme_bw(base_size = 12) +
theme(
plot.title = element_text(hjust = 0.5),
legend.position = "bottom",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(size = 0.1)
)
## Plot spectral similarity plots, multiple scenarios are possible for each metabolite:
## (A) unit and weight scaling gives the same cos
## (B) unit and weight scaling gives opposite cos
## (C) unit and weight scaling gives opposite cos
cos_A <- subset(cos, (no_scale_norm > 0.9 & scale_sqrt_norm > 0.9 & scale_weight_norm > 0.9) |
(no_scale_norm < 0.3 & scale_sqrt_norm < 0.3 & scale_weight_norm < 0.3))
cos_B <- subset(cos, (no_scale_norm > 0.9 & scale_sqrt_norm > 0.9 & scale_weight_norm < 0.3))
cos_C <- subset(cos, (no_scale_norm < 0.9 & scale_sqrt_norm < 0.9 & scale_weight_norm > 0.3))
## make single plot per metabolite depending on which scenario it falls under
if (nrow(cos_A) > 0) {
plotSCENARIO(scen = cos_A, peakgroups_dat = peakgroups_dat, scen_name = "A", ref = ref, id = id)
}
if (nrow(cos_B) > 0) {
plotSCENARIO(scen = cos_B, peakgroups_dat = peakgroups_dat, scen_name = "B", ref = ref, id = id)
}
if (nrow(cos_C) > 0) {
plotSCENARIO(scen = cos_C, peakgroups_dat = peakgroups_dat, scen_name = "C", ref = ref, id = id)
}
}
```
### Feature alignment validation
#### Pearson correlation of endogenous metabolites
Pearson intensity correlation of features corresponding to 15 endogenous metabolites in DEVSET samples.
Firstly, `r colFmt("features corresponding to endogenous metabolites", "Crimson")` are found as in the earlier [section](#eic-correlation-of-endogenous-metabolites).
Then, the `r colFmt("main ion of each metabolite is correlated", "Crimson")` with all its validated adducts and in-source fragments.
```{r, eval=FALSE}
# Pearson Iintensity correlation ------------------------------------------
int_cor_res <- data.frame(stringsAsFactors = FALSE)
for (id in selected_standards) {
message("checking compound: ", id)
id_inds <- which(standards$cpdID.metabolite == id)
## main adduct and its validated adduct(s)
matches_id <- lapply(sel_samples_ind, function(ns) {
matches_ind <- matches[[which(samples_ind == ns)]][id_inds]
matches_n <- unlist(lapply(matches_ind, nrow))
## if there are multiple matches, the the one which is closer in scpos to the adduct match
if (length(which(matches_n > 1)) > 0) {
if (length(which(matches_n == 1)) == 1) {
scpos_ref <- matches_ind[[which(matches_n == 1)]]$scpos
} else {
into_highest <- which.max(matches_ind[[1]]$into)
scpos_ref <- matches_ind[[1]]$scpos[into_highest]
}
lapply(matches_ind, function(dat) {
match_closest <- which.min(abs(dat$scpos - scpos_ref))
dat[match_closest, ]
})
} else {
matches_ind
}
})
main <- unlist(lapply(matches_id, function(s) {
if (nrow(s[[1]]) > 0) {
s[[1]]$into
} else {
NA
}
}))
## (A) correlation with true adduct(s)
cor_adducts <- lapply(2:length(id_inds), function(ind) {
other <- unlist(lapply(matches_id, function(s) {
if (nrow(s[[ind]]) > 0) {
s[[ind]]$into
} else {
NA
}
}))
if (length(other[!is.na(other)]) > 0) {
cor(main, other, method = "pearson", use = "pairwise.complete.obs")
} else {
NA
}
})
cor_adducts <- data.frame(
id = id,
main = standards$cpdName[id_inds[1]],
adduct = standards$cpdName[id_inds[-1]],
adduct_idn = standards$cpdID.n[id_inds[-1]],
cor = unlist(cor_adducts),
stringsAsFactors = FALSE
)
int_cor_res <- rbind(int_cor_res, cor_adducts)
}
```
## Comparison to other tools
### Synthetic data generation
All synthetic datasets were generated using a single peaks table obtained for a real quality control sample from the DEVSET study. Study data can be downloaded from the MetaboLights server (study identifier [MTBLS694](https://www.ebi.ac.uk/metabolights/MTBLS694)).
The reference sample used in synthetic data generation is **PipelineTesting_RPOS_ToF10_U1W24_SR**. Corresponding mzML file was processed with the `masflowR` pipeline to generate pseudo chemical spectra (here referred to as peak groups, which comprise of structurally related co-eluting features) as follows:
```{r}
library(massFlowR)
library(xcms)
fname <- "path to the raw mzML file"
## centWave parameters for peak-picking
cwt <- xcms::CentWaveParam(
ppm = 25,
snthresh = 5,
noise = 200,
prefilter = c(10, 5000),
peakwidth = c(1, 5),
mzdiff = 0,
verboseColumns = TRUE
)
## process the reference sample, function will write a .csv file in the out_dir
massFlowR::groupPEAKS(file = fname, cwt = cwt, out_dir = "output directory/")
```
Synthetic data was generated using functions defined in *synthetic-data-generation_functions.R script*. Load the required functions:
```{r}
source(file = "./Chapter_3/synthetic-data-generation.R")
```
First, the `r colFmt("nonlinear RT drifts", "Crimson")` were modeled using cubic spline function. Features corresponding to 15 validated metabolites were identified in the DEVSET quality control samples. The code for this step is provided in the earlier [section](#eic-correlation-of-endogenous-metabolites). The RT of the features corresponding to the metabolites was modeled as below. Only the final spline model output is needed for synthetic data generation (argument `drift` for function `simulateDATA`).
```{r}
# RT deviation for every feature ------------------------------------------
adducts <- lapply(ids, function(id) {
message("checking compound: ", id)
## take only the main adduct for the compound
id_ind <- which(standards$cpdID.metabolite == id)[1]
## extract corresponding feature from every sample
sapply(sel_samples_ind, function(ns) {
matches_ns <- matches[[which(samples_ind == ns)]][[id_ind]]
if (nrow(matches_ns) > 1) {
stop(ns)
} else {
if (nrow(matches_ns) == 0) {
return(NA)
}
}
return(matches_ns$rt)
})
})
## extract endogenous metabolites names
cpdNames <- unlist(lapply(ids, function(id) {
id_ind <- which(standards$cpdID.metabolite == id)[1]
standards$cpdName[id_ind]
}))
## make a summary data.frame with RT in every sample, for every metabolite
gdf <- data.frame(
compound = rep(cpdNames, each = length(sel_samples_fnames)),
compound_no = rep(seq(length(cpdNames)), each = length(sel_samples_fnames)),
sample = rep(seq(length(sel_samples_fnames)), length(cpdNames)),
rt = unlist(adducts),
stringsAsFactors = FALSE
)
gdf_spline <- setNames(gdf, nm = c("compound", "compound_no", "x", "y"))
## fit cubic smoothing spline to every metabolite's RT
gdf_spline <- lapply(unique(gdf$compound_no), function(cn) {
x <- gdf_spline[gdf_spline$compound_no == cn, "x"]
y <- gdf_spline[gdf_spline$compound_no == cn, "y"]
present <- which(!is.na(y))
mod <- stats::smooth.spline(
x = x[present],
y = y[present],
cv = TRUE
)
data.frame(
compound = cpdNames[cn],
compound_no = cn,
predict(mod),
stringsAsFactors = FALSE
)
})
gdf_spline <- do.call("rbind", gdf_spline)
## make summary plot
library(ggplot2)
ggplot(gdf, aes(x = sample, y = rt)) +
geom_point() +
facet_wrap(~compound_no,
scales = "free_y", ncol = 3,
labeller = as_labeller(setNames(cpdNames, nm = seq(length(cpdNames))))
) +
geom_line(
data = gdf_spline,
aes(x, y, group = compound_no), color = "#FDE725FF", size = 1
) +
scale_x_continuous(name = "Sample run order") +
theme_bw() +
ylab("Retention time") +
theme(legend.position = "none") +
## make truly minimal theme
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(size = 0.1)
)
## save spline models into a csv file to be used for synthetic data generaiton