-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathbiblio.bib
executable file
·1567 lines (1515 loc) · 86.8 KB
/
biblio.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
% Generated by Paperpile. Check out http://paperpile.com for more information.
% BibTeX export options can be customized via Settings -> BibTeX.
% km (targeted k-mer variant detection) preprint, bioRxiv 2018.
% Authors are listed individually ("Last, First" joined by " and ") so that
% styles can sort, abbreviate and truncate the author list correctly.
@ARTICLE{Software-km,
  title    = "Targeted variant detection in leukemia using unaligned {RNA-Seq}
              reads",
  author   = "Audemard, Eric Olivier and Gendron, Patrick and Lavall{\'e}e,
              Vincent-Philippe and H{\'e}bert, Jos{\'e}e and Sauvageau, Guy and
              Lemieux, S{\'e}bastien",
  abstract = "Mutations identified in each Acute Myeloid Leukemia (AML)
              patients are useful for prognosis and to select targeted
              therapies. Detection of such mutations by the analysis of
              Next-Generation Sequencing (NGS) data requires a computationally
              intensive read mapping step and application of several variant
              calling methods. Targeted mutation identification drastically
              shifts the usual tradeoff between accuracy and performance by
              concentrating all computations over a small portion of sequence
              space. Here, we present km, an efficient approach leveraging
              k-mer decomposition of reads to identify targeted mutations. Our
              approach is versatile, as it can detect single-base mutations,
              several types of insertions and deletions, as well as fusions. We
              used two independent AML cohorts (The Cancer Genome Atlas and
              Leucegene), to show that mutation detection by km is fast,
              accurate and mainly limited by sequencing depth. Therefore, km
              allows to establish fast diagnostics from NGS data, and could be
              suitable for clinical applications.",
  journal  = "bioRxiv",
  month    =  apr,
  year     =  2018,
  keywords = "Variant calling paper"
}
% RNAIndel preprint, bioRxiv 2019. The citation key is kept as exported so
% existing \cite commands keep working; only the field contents are repaired.
% Authors are listed individually ("Last, First" joined by " and ").
@ARTICLE{Kohei_Hagiwara_Liang_Ding_Michael_N_Edmonson_Stephen_V_Rice_Scott_Newman_Soheil_Meshinchi_Rhonda_E_Ries_Michael_Rusch_Jinghui_Zhang2019-ny,
  title    = "{RNAIndel}: a machine-learning framework for discovery of somatic
              coding indels using tumor {RNA-Seq} data",
  author   = "Hagiwara, Kohei and Ding, Liang and Edmonson, Michael N and Rice,
              Stephen V and Newman, Scott and Meshinchi, Soheil and Ries,
              Rhonda E and Rusch, Michael and Zhang, Jinghui",
  abstract = "Transcriptome sequencing (RNA-Seq) has been used for gene
              expression profiling and fusion detection but rarely for small
              insertion/deletion (indel) analysis due to the presence of
              artifacts generated during the PCR-based library preparation as
              well as alignment of spliced reads. Somatic indel calling is
              further challenged by the lack of matched normal RNA-Seq data. We
              present RNAIndel, a machine-learning based approach for
              classifying RNA-Seq indels into somatic, germline, and artifact
              by random forest models. RNAIndel was trained on tumor RNA-Seq of
              330 pediatric cancer patients for whom whole exome and PCR-free
              whole-genome sequencing of paired tumor-normal DNA samples were
              also performed. Feature selection characterized somatic indels as
              those that were not explained by the strand-slippage model, a
              widely accepted hypothesis to explain indel generation during DNA
              replication. The method was tested on two independent RNA-Seq
              datasets with variable library protocols and RNA-Seq read
              lengths. Despite the heterogeneity in RNA-Seq data acquisition,
              RNAIndel robustly predicted 87--93\% of somatic indels,
              recovering subclonal pathogenic indels that were missed by a
              500$\times$ targeted sequencing of DNA samples. With RNAIndel,
              researchers can perform somatic indel calling in the
              transcriptome, expanding the utility of RNA-Seq and enhancing the
              interpretability of somatic indels.",
  journal  = "bioRxiv",
  month    =  jan,
  year     =  2019,
  keywords = "To read;Variant calling paper"
}
@ARTICLE{Mose2019-vh,
title = "Improved Indel Detection in {DNA} and {RNA} via Realignment with
{ABRA2}",
author = "Mose, Lisle E and Perou, Charles M and Parker, Joel S",
abstract = "Motivation: Genomic variant detection from next-generation
sequencing (NGS) has become established as an extremely important
component of research and clinical diagnoses in both cancer and
Mendelian disorders. Insertions and deletions (indels) are a
common source of variation and can frequently impact
functionality, thus making their detection vitally important.
While substantial effort has gone into detecting indels from DNA,
there is still opportunity for improvement. Further, detection of
indels from RNA-Seq data has largely been an afterthought and
offers another critical area for variant detection. Results: We
present here ABRA2, a redesign of the original ABRA
implementation that offers support for realignment of both RNA
and DNA short reads. The process results in improved accuracy and
scalability including support for human whole genomes. Results
demonstrate substantial improvement in indel detection for a
variety of data types, including those that were not previously
supported by ABRA. Further, ABRA2 results in broad improvements
to variant calling accuracy across a wide range of
post-processing workflows including whole genomes, targeted
exomes, and transcriptome sequencing. Availability: ABRA2 is
implemented in a combination of Java and C/C++ and is freely
available to all from: https://github.com/mozack/abra2.
Supplementary information: Supplementary data are available at
Bioinformatics online.",
journal = "Bioinformatics",
month = jan,
year = 2019,
keywords = "RNA variant calling;Variant calling paper",
language = "en"
}
@UNPUBLISHED{Cleary2015-bl,
title = "Comparing Variant Call Files for Performance Benchmarking of
{Next-Generation} Sequencing Variant Calling Pipelines",
author = "Cleary, John G and Braithwaite, Ross and Gaastra, Kurt and
Hilbush, Brian S and Inglis, Stuart and Irvine, Sean A and
Jackson, Alan and Littin, Richard and Rathod, Mehul and Ware,
David and Zook, Justin M and Trigg, Len and De La Vega, Francisco
M",
abstract = "To evaluate and compare the performance of variant calling
methods and their confidence scores, comparisons between a test
call set and a ``gold standard'' need to be carried out.
Unfortunately, these comparisons are not straightforward with the
current Variant Call Files (VCF), which are the standard output
of most variant calling algorithms for high-throughput sequencing
data. Comparisons of VCFs are often confounded by the different
representations of indels, MNPs, and combinations thereof with
SNVs in complex regions of the genome, resulting in misleading
results. A variant caller is inherently a classification method
designed to score putative variants with confidence scores that
could permit controlling the rate of false positives (FP) or
false negatives (FN) for a given application. Receiver operator
curves (ROC) and the area under the ROC (AUC) are efficient
metrics to evaluate a test call set versus a gold standard.
However, in the case of VCF data this also requires a special
accounting to deal with discrepant representations. We developed
a novel algorithm for comparing variant call sets that deals with
complex call representation discrepancies and through a dynamic
programing method that minimizes false positives and negatives
globally across the entire call sets for accurate performance
evaluation of VCFs.",
journal = "bioRxiv",
pages = "023754",
month = aug,
year = 2015,
keywords = "Variant calling paper",
language = "en"
}
@ARTICLE{Tarasov2015-ex,
title = "Sambamba: fast processing of {NGS} alignment formats",
author = "Tarasov, Artem and Vilella, Albert J and Cuppen, Edwin and
Nijman, Isaac J and Prins, Pjotr",
abstract = "UNLABELLED: Sambamba is a high-performance robust tool and
library for working with SAM, BAM and CRAM sequence alignment
files; the most common file formats for aligned next generation
sequencing data. Sambamba is a faster alternative to samtools
that exploits multi-core processing and dramatically reduces
processing time. Sambamba is being adopted at sequencing centers,
not only because of its speed, but also because of additional
functionality, including coverage analysis and powerful filtering
capability. AVAILABILITY AND IMPLEMENTATION: Sambamba is free and
open source software, available under a GPLv2 license. Sambamba
can be downloaded and installed from
http://www.open-bio.org/wiki/Sambamba. Sambamba v0.5.0 was
released with doi:10.5281/zenodo.13200.",
journal = "Bioinformatics",
volume = 31,
number = 12,
pages = "2032--2034",
month = jun,
year = 2015,
keywords = "Variant calling paper",
language = "en"
}
@ARTICLE{Dobin2013-uc,
title = "{STAR}: ultrafast universal {RNA-seq} aligner",
author = "Dobin, Alexander and Davis, Carrie A and Schlesinger, Felix and
Drenkow, Jorg and Zaleski, Chris and Jha, Sonali and Batut,
Philippe and Chaisson, Mark and Gingeras, Thomas R",
abstract = "Motivation: Accurate alignment of high-throughput RNA-seq data
is a challenging and yet unsolved problem because of the
non-contiguous transcript structure, relatively short read
lengths and constantly increasing throughput of the sequencing
technologies. Currently available RNA-seq aligners suffer from
high mapping error rates, low mapping speed, read length
limitation and mapping biases. Results: To align our large (>80
billion reads) ENCODE Transcriptome RNA-seq dataset, we developed
the Spliced Transcripts Alignment to a Reference (STAR) software
based on a previously undescribed RNA-seq alignment algorithm
that uses sequential maximum mappable seed search in
uncompressed suffix arrays followed by seed clustering and
stitching procedure. STAR outperforms other aligners by a factor
of >50 in mapping speed, aligning to the human genome 550
million 2 $\times$ 76 bp paired-end reads per hour on a modest
12-core server, while at the same time improving alignment
sensitivity and precision. In addition to unbiased de novo
detection of canonical junctions, STAR can discover
non-canonical splices and chimeric (fusion) transcripts, and is
also capable of mapping full-length RNA sequences. Using Roche
454 sequencing of reverse transcription polymerase chain
reaction amplicons, we experimentally validated 1960 novel
intergenic splice junctions with an 80--90\% success rate,
corroborating the high precision of the STAR mapping
strategy. Availability and implementation: STAR is implemented as
a standalone C++ code. STAR is free open source software
distributed under GPLv3 license and can be downloaded from
http://code.google.com/p/rna-star/. Contact: dobin@cshl.edu.",
journal = "Bioinformatics",
publisher = "Oxford University Press",
volume = 29,
number = 1,
pages = "15--21",
month = jan,
year = 2013,
keywords = "Variant calling paper"
}
% GNU Parallel. Journal, volume, issue, pages and date follow the reference
% that the tool itself asks users to cite (see `parallel --citation`).
@ARTICLE{Tange_undated-th,
  title    = "{GNU} Parallel: The {Command-Line} Power Tool",
  author   = "Tange, Ole",
  journal  = ";login: The {USENIX} Magazine",
  volume   =  36,
  number   =  1,
  pages    = "42--47",
  month    =  feb,
  year     =  2011,
  keywords = "Variant calling paper"
}
% FastQC (software). The project page is given via howpublished, in the same
% form the other software entries in this file use.
@MISC{Andrews2010-fk,
  title        = "{FastQC}: a quality control tool for high throughput sequence
                  data",
  author       = "Andrews, Simon",
  howpublished = "\url{http://www.bioinformatics.babraham.ac.uk/projects/fastqc}",
  year         =  2010,
  keywords     = "Variant calling paper"
}
@MISC{Tcga_undated-qb,
title = "{TCGA-LAML}",
author = "{TCGA}",
keywords = "RNA variant calling;Variant calling paper"
}
@MISC{Li_undated-qy,
title = "seqtk: Toolkit for processing sequences in {FASTA/Q} formats",
booktitle = "seqtk",
author = "Li, Heng",
howpublished = "\url{https://github.com/lh3/seqtk}",
keywords = "Variant calling paper"
}
% Picard Tools (software). The author is an institution, so the name is wrapped
% in an extra brace pair to stop BibTeX splitting it into first/last parts.
@MISC{Institute_undated-qc,
  title        = "Picard Tools",
  booktitle    = "Picard Tools",
  author       = "{Broad Institute}",
  howpublished = "\url{http://broadinstitute.github.io/picard}",
  keywords     = "Variant calling paper"
}
@ARTICLE{McLaren2016-lv,
title = "The Ensembl Variant Effect Predictor",
author = "McLaren, William and Gil, Laurent and Hunt, Sarah E and Riat,
Harpreet Singh and Ritchie, Graham R S and Thormann, Anja and
Flicek, Paul and Cunningham, Fiona",
abstract = "The Ensembl Variant Effect Predictor is a powerful toolset for
the analysis, annotation, and prioritization of genomic variants
in coding and non-coding regions. It provides access to an
extensive collection of genomic annotation, with a variety of
interfaces to suit different requirements, and simple options for
configuring and extending analysis. It is open source, free to
use, and supports full reproducibility of results. The Ensembl
Variant Effect Predictor can simplify and accelerate variant
interpretation in a wide range of study designs.",
journal = "Genome Biol.",
volume = 17,
number = 1,
pages = "122",
month = jun,
year = 2016,
keywords = "Genome; NGS; SNP; Variant annotation;Variant calling paper",
language = "en"
}
@ARTICLE{Ramaswami2013-gm,
title = "Identifying {RNA} editing sites using {RNA} sequencing data alone",
author = "Ramaswami, Gokul and Zhang, Rui and Piskol, Robert and Keegan,
Liam P and Deng, Patricia and O'Connell, Mary A and Li, Jin Billy",
abstract = "We show that RNA editing sites can be called with high confidence
using RNA sequencing data from multiple samples across either
individuals or species, without the need for matched genomic DNA
sequence. We identified many previously unidentified editing
sites in both humans and Drosophila; our results nearly double
the known number of human protein recoding events. We also found
that human genes harboring conserved editing sites within Alu
repeats are enriched for neuronal functions.",
journal = "Nat. Methods",
volume = 10,
number = 2,
pages = "128--132",
month = feb,
year = 2013,
keywords = "Variant calling paper",
language = "en"
}
% RepeatMasker (software). Authors are listed individually ("Last, Initials"
% joined by " and "); the project home page is given via howpublished.
@MISC{Smit_AFA_Hubley_R_Green_P2013-tz,
  title        = "{RepeatMasker}",
  author       = "Smit, A F A and Hubley, R and Green, P",
  howpublished = "\url{http://www.repeatmasker.org}",
  year         =  2013,
  keywords     = "Variant calling paper"
}
% RADAR database paper (Nucleic Acids Res. database issue, 2014).
% The page range is written in full (D109--D113) rather than abbreviated.
@ARTICLE{Ramaswami2014-nl,
  title    = "{RADAR}: a rigorously annotated database of {A-to-I} {RNA}
              editing",
  author   = "Ramaswami, Gokul and Li, Jin Billy",
  abstract = "We present RADAR--a rigorously annotated database of A-to-I RNA
              editing (available at http://RNAedit.com). The identification of
              A-to-I RNA editing sites has been dramatically accelerated in the
              past few years by high-throughput RNA sequencing studies. RADAR
              includes a comprehensive collection of A-to-I RNA editing sites
              identified in humans (Homo sapiens), mice (Mus musculus) and
              flies (Drosophila melanogaster), together with extensive manually
              curated annotations for each editing site. RADAR also includes an
              expandable listing of tissue-specific editing levels for each
              editing site, which will facilitate the assignment of biological
              functions to specific editing sites.",
  journal  = "Nucleic Acids Res.",
  volume   =  42,
  number   = "Database issue",
  pages    = "D109--D113",
  month    =  jan,
  year     =  2014,
  keywords = "Variant calling paper",
  language = "en"
}
@ARTICLE{Forbes2017-qq,
title = "{COSMIC}: somatic cancer genetics at high-resolution",
author = "Forbes, Simon A and Beare, David and Boutselakis, Harry and
Bamford, Sally and Bindal, Nidhi and Tate, John and Cole,
Charlotte G and Ward, Sari and Dawson, Elisabeth and Ponting,
Laura and Stefancsik, Raymund and Harsha, Bhavana and Kok, Chai
Yin and Jia, Mingming and Jubb, Harry and Sondka, Zbyslaw and
Thompson, Sam and De, Tisham and Campbell, Peter J",
abstract = "COSMIC, the Catalogue of Somatic Mutations in Cancer
(http://cancer.sanger.ac.uk) is a high-resolution resource for
exploring targets and trends in the genetics of human cancer.
Currently the broadest database of mutations in cancer, the
information in COSMIC is curated by expert scientists, primarily
by scrutinizing large numbers of scientific publications. Over 4
million coding mutations are described in v78 (September 2016),
combining genome-wide sequencing results from 28 366 tumours with
complete manual curation of 23 489 individual publications
focused on 186 key genes and 286 key fusion pairs across all
cancers. Molecular profiling of large tumour numbers has also
allowed the annotation of more than 13 million non-coding
mutations, 18 029 gene fusions, 187 429 genome rearrangements, 1
271 436 abnormal copy number segments, 9 175 462 abnormal
expression variants and 7 879 142 differentially methylated CpG
dinucleotides. COSMIC now details the genetics of drug
resistance, novel somatic gene mutations which allow a tumour to
evade therapeutic cancer drugs. Focusing initially on highly
characterized drugs and genes, COSMIC v78 contains wide
resistance mutation profiles across 20 drugs, detailing the
recurrence of 301 unique resistance alleles across 1934
drug-resistant tumours. All information from the COSMIC database
is available freely on the COSMIC website.",
journal = "Nucleic Acids Res.",
volume = 45,
number = "D1",
pages = "D777--D783",
month = jan,
year = 2017,
keywords = "Variant calling paper",
language = "en"
}
@ARTICLE{Lek2016-im,
title = "Analysis of protein-coding genetic variation in 60,706 humans",
author = "Lek, Monkol and Karczewski, Konrad J and Minikel, Eric V and
Samocha, Kaitlin E and Banks, Eric and Fennell, Timothy and
O'Donnell-Luria, Anne H and Ware, James S and Hill, Andrew J and
Cummings, Beryl B and Tukiainen, Taru and Birnbaum, Daniel P and
Kosmicki, Jack A and Duncan, Laramie E and Estrada, Karol and
Zhao, Fengmei and Zou, James and Pierce-Hoffman, Emma and
Berghout, Joanne and Cooper, David N and Deflaux, Nicole and
DePristo, Mark and Do, Ron and Flannick, Jason and Fromer,
Menachem and Gauthier, Laura and Goldstein, Jackie and Gupta,
Namrata and Howrigan, Daniel and Kiezun, Adam and Kurki, Mitja I
and Moonshine, Ami Levy and Natarajan, Pradeep and Orozco, Lorena
and Peloso, Gina M and Poplin, Ryan and Rivas, Manuel A and
Ruano-Rubio, Valentin and Rose, Samuel A and Ruderfer, Douglas M
and Shakir, Khalid and Stenson, Peter D and Stevens, Christine
and Thomas, Brett P and Tiao, Grace and Tusie-Luna, Maria T and
Weisburd, Ben and Won, Hong-Hee and Yu, Dongmei and Altshuler,
David M and Ardissino, Diego and Boehnke, Michael and Danesh,
John and Donnelly, Stacey and Elosua, Roberto and Florez, Jose C
and Gabriel, Stacey B and Getz, Gad and Glatt, Stephen J and
Hultman, Christina M and Kathiresan, Sekar and Laakso, Markku and
McCarroll, Steven and McCarthy, Mark I and McGovern, Dermot and
McPherson, Ruth and Neale, Benjamin M and Palotie, Aarno and
Purcell, Shaun M and Saleheen, Danish and Scharf, Jeremiah M and
Sklar, Pamela and Sullivan, Patrick F and Tuomilehto, Jaakko and
Tsuang, Ming T and Watkins, Hugh C and Wilson, James G and Daly,
Mark J and MacArthur, Daniel G and {Exome Aggregation Consortium}",
abstract = "Large-scale reference data sets of human genetic variation are
critical for the medical and functional interpretation of DNA
sequence changes. Here we describe the aggregation and analysis
of high-quality exome (protein-coding region) DNA sequence data
for 60,706 individuals of diverse ancestries generated as part of
the Exome Aggregation Consortium (ExAC). This catalogue of human
genetic diversity contains an average of one variant every eight
bases of the exome, and provides direct evidence for the presence
of widespread mutational recurrence. We have used this catalogue
to calculate objective metrics of pathogenicity for sequence
variants, and to identify genes subject to strong selection
against various classes of mutation; identifying 3,230 genes with
near-complete depletion of predicted protein-truncating variants,
with 72\% of these genes having no currently established human
disease phenotype. Finally, we demonstrate that these data can be
used for the efficient filtering of candidate disease-causing
variants, and for the discovery of human 'knockout' variants in
protein-coding genes.",
journal = "Nature",
volume = 536,
number = 7616,
pages = "285--291",
month = aug,
year = 2016,
keywords = "Variant calling paper",
language = "en"
}
@ARTICLE{Sherry2001-eh,
title = "{dbSNP}: the {NCBI} database of genetic variation",
author = "Sherry, S T and Ward, M H and Kholodov, M and Baker, J and Phan,
L and Smigielski, E M and Sirotkin, K",
abstract = "In response to a need for a general catalog of genome variation
to address the large-scale sampling designs required by
association studies, gene mapping and evolutionary biology, the
National Center for Biotechnology Information (NCBI) has
established the dbSNP database [S.T.Sherry, M.Ward and K.
Sirotkin (1999) Genome Res., 9, 677-679]. Submissions to dbSNP
will be integrated with other sources of information at NCBI such
as GenBank, PubMed, LocusLink and the Human Genome Project data.
The complete contents of dbSNP are available to the public at
website: http://www.ncbi.nlm.nih.gov/SNP. The complete contents
of dbSNP can also be downloaded in multiple formats via anonymous
FTP at ftp://ncbi.nlm.nih.gov/snp/.",
journal = "Nucleic Acids Res.",
volume = 29,
number = 1,
pages = "308--311",
month = jan,
year = 2001,
keywords = "Variant calling paper",
language = "en"
}
@ARTICLE{Li2009-th,
title = "The Sequence {Alignment/Map} format and {SAMtools}",
author = "Li, Heng and Handsaker, Bob and Wysoker, Alec and Fennell, Tim
and Ruan, Jue and Homer, Nils and Marth, Gabor and Abecasis,
Goncalo and Durbin, Richard and {1000 Genome Project Data
Processing Subgroup}",
abstract = "SUMMARY: The Sequence Alignment/Map (SAM) format is a generic
alignment format for storing read alignments against reference
sequences, supporting short and long reads (up to 128 Mbp)
produced by different sequencing platforms. It is flexible in
style, compact in size, efficient in random access and is the
format in which alignments from the 1000 Genomes Project are
released. SAMtools implements various utilities for
post-processing alignments in the SAM format, such as indexing,
variant caller and alignment viewer, and thus provides universal
tools for processing read alignments. AVAILABILITY:
http://samtools.sourceforge.net.",
journal = "Bioinformatics",
volume = 25,
number = 16,
pages = "2078--2079",
month = aug,
year = 2009,
keywords = "Variant calling paper",
language = "en"
}
% The exported abstract of the entry below is truncated; its trailing ellipsis
% is written as \ldots{} so the entry stays ASCII-only for classic BibTeX.
@ARTICLE{Coudray2018-yw,
  title     = "Detection and benchmarking of somatic mutations in cancer
               genomes using {RNA-seq} data",
  author    = "Coudray, A and Battenhouse, A M and Bucher, P and Iyer, V R",
  abstract  = "To detect functional somatic mutations in tumor samples,
               whole-exome sequencing (WES) is often used for its reliability
               and relative low cost. RNA-seq, while generally used to measure
               gene expression, can potentially also be used for identification
               of somatic mutations. However there has been little systematic
               evaluation of the utility of RNA-seq for identifying somatic
               mutations. Here, we develop and evaluate a pipeline for
               processing RNA-seq data from glioblastoma multiforme (GBM)
               tumors in order to identify somatic mutations. The \ldots{}",
  journal   = "bioRxiv",
  publisher = "biorxiv.org",
  year      =  2018,
  keywords  = "RNA variant calling;Variant calling paper"
}
@UNPUBLISHED{Poplin2017-ae,
title = "Scaling accurate genetic variant discovery to tens of thousands
of samples",
author = "Poplin, Ryan and Ruano-Rubio, Valentin and DePristo, Mark A and
Fennell, Tim J and Carneiro, Mauricio O and Van der Auwera,
Geraldine A and Kling, David E and Gauthier, Laura D and
Levy-Moonshine, Ami and Roazen, David and Shakir, Khalid and
Thibault, Joel and Chandran, Sheila and Whelan, Chris and Lek,
Monkol and Gabriel, Stacey and Daly, Mark J and Neale, Benjamin
and MacArthur, Daniel G and Banks, Eric",
abstract = "Comprehensive disease gene discovery in both common and rare
diseases will require the efficient and accurate detection of all
classes of genetic variation across tens to hundreds of thousands
of human samples. We describe here a novel assembly-based
approach to variant calling, the GATK HaplotypeCaller (HC) and
Reference Confidence Model (RCM), that determines genotype
likelihoods independently per-sample but performs joint calling
across all samples within a project simultaneously. We show by
calling over 90,000 samples from the Exome Aggregation Consortium
(ExAC) that, in contrast to other algorithms, the HC-RCM scales
efficiently to very large sample sizes without loss in accuracy;
and that the accuracy of indel variant calling is superior in
comparison to other algorithms. More importantly, the HC-RCM
produces a fully squared-off matrix of genotypes across all
samples at every genomic position being investigated. The HC-RCM
is a novel, scalable, assembly-based algorithm with abundant
applications for population genetics and clinical studies.",
journal = "bioRxiv",
pages = "201178",
month = nov,
year = 2017,
keywords = "Variant calling paper",
language = "en"
}
% eSNV-detect (Nucleic Acids Res., 2014). The paper is identified by a single
% article number, so pages holds "e172" rather than a degenerate range.
% NOTE(review): the abstract is cut off mid-sentence in the export; restore it
% from the publisher record if the abstract field is ever printed.
@ARTICLE{Tang2014-df,
  title     = "The {eSNV-detect}: a computational system to identify expressed
               single nucleotide variants from transcriptome sequencing data",
  author    = "Tang, Xiaojia and Baheti, Saurabh and Shameer, Khader and
               Thompson, Kevin J and Wills, Quin and Niu, Nifang and Holcomb,
               Ilona N and Boutet, Stephane C and Ramakrishnan, Ramesh and
               Kachergus, Jennifer M and Kocher, Jean-Pierre A and
               Weinshilboum, Richard M and Wang, Liewei and Thompson, E Aubrey
               and Kalari, Krishna R",
  abstract  = "Abstract. Rapid development of next generation sequencing
               technology has enabled the identification of genomic alterations
               from short sequencing reads. There a",
  journal   = "Nucleic Acids Res.",
  publisher = "Oxford University Press",
  volume    =  42,
  number    =  22,
  pages     = "e172",
  month     =  dec,
  year      =  2014,
  keywords  = "rna; exome; breast cancer; rna, messenger;Variant calling paper"
}
@ARTICLE{Christoforides2013-cr,
title = "Identification of somatic mutations in cancer through
Bayesian-based analysis of sequenced genome pairs",
author = "Christoforides, Alexis and Carpten, John D and Weiss, Glen J and
Demeure, Michael J and Von Hoff, Daniel D and Craig, David W",
abstract = "BACKGROUND: The field of cancer genomics has rapidly adopted
next-generation sequencing (NGS) in order to study and
characterize malignant tumors with unprecedented resolution. In
particular for cancer, one is often trying to identify somatic
mutations--changes specific to a tumor and not within an
individual's germline. However, false positive and false negative
detections often result from lack of sufficient variant evidence,
contamination of the biopsy by stromal tissue, sequencing errors,
and the erroneous classification of germline variation as
tumor-specific. RESULTS: We have developed a generalized Bayesian
analysis framework for matched tumor/normal samples with the
purpose of identifying tumor-specific alterations such as single
nucleotide mutations, small insertions/deletions, and structural
variation. We describe our methodology, and discuss its
application to other types of paired-tissue analysis such as the
detection of loss of heterozygosity as well as allelic imbalance.
We also demonstrate the high level of sensitivity and specificity
in discovering simulated somatic mutations, for various
combinations of a) genomic coverage and b) emulated
heterogeneity. CONCLUSION: We present a Java-based implementation
of our methods named Seurat, which is made available for free
academic use. We have demonstrated and reported on the discovery
of different types of somatic change by applying Seurat to an
experimentally-derived cancer dataset using our methods; and have
discussed considerations and practices regarding the accurate
detection of somatic events in cancer genomes. Seurat is
available at https://sites.google.com/site/seuratsomatic.",
journal = "BMC Genomics",
volume = 14,
pages = "302",
month = may,
year = 2013,
keywords = "Variant calling paper",
language = "en"
}
% RADIA: somatic mutation detection from integrated RNA and DNA (PLoS One, 2014).
@article{Radenbaugh2014-cj,
  author   = {Radenbaugh, Amie J and Ma, Singer and Ewing, Adam and Stuart,
              Joshua M and Collisson, Eric A and Zhu, Jingchun and Haussler,
              David},
  title    = {{RADIA}: {RNA} and {DNA} integrated analysis for somatic mutation
              detection},
  journal  = {PLoS One},
  volume   = 9,
  number   = 11,
  pages    = {e111516},
  month    = nov,
  year     = 2014,
  keywords = {Variant calling paper},
  language = {en},
  abstract = {The detection of somatic single nucleotide variants is a crucial
              component to the characterization of the cancer genome. Mutation
              calling algorithms thus far have focused on comparing the normal
              and tumor genomes from the same individual. In recent years, it
              has become routine for projects like The Cancer Genome Atlas
              (TCGA) to also sequence the tumor RNA. Here we present RADIA (RNA
              and DNA Integrated Analysis), a novel computational method
              combining the patient-matched normal and tumor DNA with the tumor
              RNA to detect somatic mutations. The inclusion of the RNA
              increases the power to detect somatic mutations, especially at
              low DNA allelic frequencies. By integrating an individual's DNA
              and RNA, we are able to detect mutations that would otherwise be
              missed by traditional algorithms that examine only the DNA. We
              demonstrate high sensitivity (84\%) and very high precision (98\%
              and 99\%) for RADIA in patient data from endometrial carcinoma
              and lung adenocarcinoma from TCGA. Mutations with both high DNA
              and RNA read support have the highest validation rate of over
              99\%. We also introduce a simulation package that spikes in
              artificial mutations to patient data, rather than simulating
              sequencing data from a reference genome. We evaluate sensitivity
              on the simulation data and demonstrate our ability to rescue back
              mutations at low DNA allelic frequencies by including the RNA.
              Finally, we highlight mutations in important cancer genes that
              were rescued due to the incorporation of the RNA.}
}
% MultiQC tool paper (Bioinformatics, 2016).
% The CONTACT address at the end of the abstract had been replaced by an
% e-mail-protection placeholder; it is restored here.
% NOTE(review): confirm the address against the published abstract.
@ARTICLE{Ewels2016-rc,
title = "{MultiQC}: summarize analysis results for multiple tools and
samples in a single report",
author = "Ewels, Philip and Magnusson, M{\aa}ns and Lundin, Sverker and
K{\"a}ller, Max",
abstract = "MOTIVATION: Fast and accurate quality control is essential for
studies involving next-generation sequencing data. Whilst
numerous tools exist to quantify QC metrics, there is no common
approach to flexibly integrate these across tools and large
sample sets. Assessing analysis results across an entire project
can be time consuming and error prone; batch effects and outlier
samples can easily be missed in the early stages of analysis.
RESULTS: We present MultiQC, a tool to create a single report
visualising output from multiple tools across many samples,
enabling global trends and biases to be quickly identified.
MultiQC can plot data from many common bioinformatics tools and
is built to allow easy extension and customization. AVAILABILITY
AND IMPLEMENTATION: MultiQC is available with an GNU GPLv3
license on GitHub, the Python Package Index and Bioconda.
Documentation and example reports are available at
http://multiqc.info CONTACT: phil.ewels@scilifelab.se.",
journal = "Bioinformatics",
volume = 32,
number = 19,
pages = "3047--3048",
month = oct,
year = 2016,
keywords = "Variant calling paper",
language = "en"
}
% Subread aligner paper (Nucleic Acids Res., 2013).
% The tool name in the title is brace-protected so that bibliography styles
% which sentence-case titles do not print it as "subread".
@ARTICLE{Liao2013-ga,
title = "The {Subread} aligner: fast, accurate and scalable read mapping by
seed-and-vote",
author = "Liao, Yang and Smyth, Gordon K and Shi, Wei",
abstract = "Read alignment is an ongoing challenge for the analysis of data
from sequencing technologies. This article proposes an elegantly
simple multi-seed strategy, called seed-and-vote, for mapping
reads to a reference genome. The new strategy chooses the mapped
genomic location for the read directly from the seeds. It uses a
relatively large number of short seeds (called subreads)
extracted from each read and allows all the seeds to vote on the
optimal location. When the read length is <160 bp, overlapping
subreads are used. More conventional alignment algorithms are
then used to fill in detailed mismatch and indel information
between the subreads that make up the winning voting block. The
strategy is fast because the overall genomic location has already
been chosen before the detailed alignment is done. It is
sensitive because no individual subread is required to map
exactly, nor are individual subreads constrained to map close by
other subreads. It is accurate because the final location must be
supported by several different subreads. The strategy extends
easily to find exon junctions, by locating reads that contain
sets of subreads mapping to different exons of the same gene. It
scales up efficiently for longer reads.",
journal = "Nucleic Acids Res.",
volume = 41,
number = 10,
pages = "e108",
month = may,
year = 2013,
keywords = "Variant calling paper",
language = "en"
}
% Review of somatic SNV calling algorithms for NGS data
% (Comput. Struct. Biotechnol. J., 2018).
@article{Xu2018-bt,
  author   = {Xu, Chang},
  title    = {A review of somatic single nucleotide variant calling algorithms
              for next-generation sequencing data},
  journal  = {Comput. Struct. Biotechnol. J.},
  volume   = 16,
  pages    = {15--24},
  month    = feb,
  year     = 2018,
  keywords = {Benchmarking; Low-frequency mutation; Somatic mutation; Unique
              molecular identifier; Variant calling;Variant calling paper},
  language = {en},
  abstract = {Detection of somatic mutations holds great potential in cancer
              treatment and has been a very active research field in the past
              few years, especially since the breakthrough of the
              next-generation sequencing technology. A collection of variant
              calling pipelines have been developed with different underlying
              models, filters, input data requirements, and targeted
              applications. This review aims to enumerate these unique features
              of the state-of-the-art variant callers, in the hope to provide a
              practical guide for selecting the appropriate pipeline for
              specific applications. We will focus on the detection of somatic
              single nucleotide variants, ranging from traditional variant
              callers based on whole genome or exome sequencing of paired
              tumor-normal samples to recent low-frequency variant callers
              designed for targeted sequencing protocols with unique molecular
              identifiers. The variant callers have been extensively
              benchmarked with inconsistent performances across these studies.
              We will review the reference materials, datasets, and performance
              metrics that have been used in the benchmarking studies. In the
              end, we will discuss emerging trends and future directions of the
              variant calling algorithms.}
}
% VarDict variant caller paper (Nucleic Acids Res., 2016).
@article{Lai2016-ws,
  author   = {Lai, Zhongwu and Markovets, Aleksandra and Ahdesmaki, Miika and
              Chapman, Brad and Hofmann, Oliver and McEwen, Robert and Johnson,
              Justin and Dougherty, Brian and Barrett, J Carl and Dry, Jonathan
              R},
  title    = {{VarDict}: a novel and versatile variant caller for
              next-generation sequencing in cancer research},
  journal  = {Nucleic Acids Res.},
  volume   = 44,
  number   = 11,
  pages    = {e108},
  month    = jun,
  year     = 2016,
  keywords = {Finding optimal coverage;Variant calling paper},
  language = {en},
  abstract = {Accurate variant calling in next generation sequencing (NGS) is
              critical to understand cancer genomes better. Here we present
              VarDict, a novel and versatile variant caller for both DNA- and
              RNA-sequencing data. VarDict simultaneously calls SNV, MNV,
              InDels, complex and structural variants, expanding the detected
              genetic driver landscape of tumors. It performs local
              realignments on the fly for more accurate allele frequency
              estimation. VarDict performance scales linearly to sequencing
              depth, enabling ultra-deep sequencing used to explore tumor
              evolution or detect tumor DNA circulating in blood. In addition,
              VarDict performs amplicon aware variant calling for polymerase
              chain reaction (PCR)-based targeted sequencing often used in
              diagnostic settings, and is able to detect PCR artifacts.
              Finally, VarDict also detects differences in somatic and loss of
              heterozygosity variants between paired samples. VarDict
              reprocessing of The Cancer Genome Atlas (TCGA) Lung
              Adenocarcinoma dataset called known driver mutations in KRAS,
              EGFR, BRAF, PIK3CA and MET in 16\% more patients than previously
              published variant calls. We believe VarDict will greatly
              facilitate application of NGS in clinical cancer research.}
}
% Benchmark of SNV callers on highly admixed tumor data (PLoS One, 2017).
@article{Bohnert2017-if,
  author   = {Bohnert, Regina and Vivas, Sonia and Jansen, Gunther},
  title    = {Comprehensive benchmarking of {SNV} callers for highly admixed
              tumor data},
  journal  = {PLoS One},
  volume   = 12,
  number   = 10,
  pages    = {e0186175},
  month    = oct,
  year     = 2017,
  keywords = {RNA variant calling;Variant calling paper},
  language = {en},
  abstract = {Precision medicine attempts to individualize cancer therapy by
              matching tumor-specific genetic changes with effective targeted
              therapies. A crucial first step in this process is the reliable
              identification of cancer-relevant variants, which is considerably
              complicated by the impurity and heterogeneity of clinical tumor
              samples. We compared the impact of admixture of non-cancerous
              cells and low somatic allele frequencies on the sensitivity and
              precision of 19 state-of-the-art SNV callers. We studied both
              whole exome and targeted gene panel data and up to 13 distinct
              parameter configurations for each tool. We found vast differences
              among callers. Based on our comprehensive analyses we recommend
              joint tumor-normal calling with MuTect, EBCall or Strelka for
              whole exome somatic variant calling, and HaplotypeCaller or
              FreeBayes for whole exome germline calling. For targeted gene
              panel data on a single tumor sample, LoFreqStar performed best.
              We further found that tumor impurity and admixture had a negative
              impact on precision, and in particular, sensitivity in whole
              exome experiments. At admixture levels of 60\% to 90\% sometimes
              seen in pathological biopsies, sensitivity dropped significantly,
              even when variants were originally present in the tumor at 100\%
              allele frequency. Sensitivity to low-frequency SNVs improved with
              targeted panel data, but whole exome data allowed more efficient
              identification of germline variants. Effective somatic variant
              calling requires high-quality pathological samples with minimal
              admixture, a consciously selected sequencing strategy, and the
              appropriate variant calling tool with settings optimized for the
              chosen type of data.}
}
% Sequencing depth and differential expression in RNA-seq; introduces NOISeq
% (Genome Res., 2011).
@article{Tarazona2011-qy,
  author   = {Tarazona, Sonia and Garc{\'\i}a-Alcalde, Fernando and Dopazo,
              Joaqu{\'\i}n and Ferrer, Alberto and Conesa, Ana},
  title    = {Differential expression in {RNA-seq}: a matter of depth},
  journal  = {Genome Res.},
  volume   = 21,
  number   = 12,
  pages    = {2213--2223},
  month    = dec,
  year     = 2011,
  keywords = {Finding optimal coverage;Variant calling paper;sequencing depth,
              gene expression, AML},
  language = {en},
  abstract = {Next-generation sequencing (NGS) technologies are revolutionizing
              genome research, and in particular, their application to
              transcriptomics (RNA-seq) is increasingly being used for gene
              expression profiling as a replacement for microarrays. However,
              the properties of RNA-seq data have not been yet fully
              established, and additional research is needed for understanding
              how these data respond to differential expression analysis. In
              this work, we set out to gain insights into the characteristics
              of RNA-seq data analysis by studying an important parameter of
              this technology: the sequencing depth. We have analyzed how
              sequencing depth affects the detection of transcripts and their
              identification as differentially expressed, looking at aspects
              such as transcript biotype, length, expression level, and
              fold-change. We have evaluated different algorithms available for
              the analysis of RNA-seq and proposed a novel
              approach--NOISeq--that differs from existing methods in that it
              is data-adaptive and nonparametric. Our results reveal that most
              existing methodologies suffer from a strong dependency on
              sequencing depth for their differential expression calls and that
              this results in a considerable number of false positives that
              increases as the number of reads grows. In contrast, our proposed
              method models the noise distribution from the actual data, can
              therefore better adapt to the size of the data set, and is more
              effective in controlling the rate of false discoveries. This work
              discusses the true potential of RNA-seq for studying regulation
              at low expression ranges, the noise within RNA-seq data, and the
              issue of replication.}
}
% Experimental design and power calculation for RNA-seq
% (Methods Mol. Biol., 2016).
% NOTE(review): the abstract calls this a chapter; the entry type may need
% revisiting once the book details are known.
@article{Wu2016-kh,
  author   = {Wu, Zhijin and Wu, Hao},
  title    = {Experimental Design and Power Calculation for {RNA-seq}
              Experiments},
  journal  = {Methods Mol. Biol.},
  volume   = 1418,
  pages    = {379--390},
  year     = 2016,
  keywords = {Experimental design; Gene expression; RNA-Seq; Sample size;
              Statistical power;Finding optimal coverage;RNA variant
              calling;Variant calling paper;sequencing depth, gene expression,
              AML},
  language = {en},
  abstract = {Power calculation is a critical component of RNA-seq experimental
              design. The flexibility of RNA-seq experiment and the wide
              dynamic range of transcription it measures make it an attractive
              technology for whole transcriptome analysis. These features, in
              addition to the high dimensionality of RNA-seq data, bring
              complexity in experimental design, making an analytical power
              calculation no longer realistic. In this chapter we review the
              major factors that influence the statistical power of detecting
              differential expression, and give examples of power assessment
              using the R package PROPER.}
}
% Evaluation of variant calling tools on non-matched NGS data (Sci. Rep., 2017).
% The title is stored in plain title case: the hyphenated words are ordinary
% words, so they are not brace-protected and sentence-casing styles can
% lowercase them together with the rest of the title.
@ARTICLE{Sandmann2017-kc,
title = "Evaluating Variant Calling Tools for Non-Matched
Next-Generation Sequencing Data",
author = "Sandmann, Sarah and de Graaf, Aniek O and Karimi, Mohsen and van
der Reijden, Bert A and Hellstr{\"o}m-Lindberg, Eva and Jansen,
Joop H and Dugas, Martin",
abstract = "Valid variant calling results are crucial for the use of
next-generation sequencing in clinical routine. However, there
are numerous variant calling tools that usually differ in
algorithms, filtering strategies, recommendations and thus, also
in the output. We evaluated eight open-source tools regarding
their ability to call single nucleotide variants and short indels
with allelic frequencies as low as 1\% in non-matched
next-generation sequencing data: GATK HaplotypeCaller, Platypus,
VarScan, LoFreq, FreeBayes, SNVer, SAMtools and VarDict. We
analysed two real datasets from patients with myelodysplastic
syndrome, covering 54 Illumina HiSeq samples and 111 Illumina
NextSeq samples. Mutations were validated by re-sequencing on the
same platform, on a different platform and expert based review.
In addition we considered two simulated datasets with varying
coverage and error profiles, covering 50 samples each. In all
cases an identical target region consisting of 19 genes (42,322
bp) was analysed. Altogether, no tool succeeded in calling all
mutations. High sensitivity was always accompanied by low
precision. Influence of varying coverages- and background noise
on variant calling was generally low. Taking everything into
account, VarDict performed best. However, our results indicate
that there is a need to improve reproducibility of the results in
the context of multithreading.",
journal = "Sci. Rep.",
volume = 7,
pages = "43169",
month = feb,
year = 2017,
keywords = "RNA variant calling;Variant calling paper",
language = "en"
}
% SNP detection strategies for RNA-seq data (PLoS One, 2013).
% The project name "1000 Genomes" in the title is brace-protected so that
% sentence-casing bibliography styles do not lowercase it.
@ARTICLE{Quinn2013-oh,
title = "Development of strategies for {SNP} detection in {RNA-seq} data:
application to lymphoblastoid cell lines and evaluation using
{1000 Genomes} data",
author = "Quinn, Emma M and Cormican, Paul and Kenny, Elaine M and Hill,
Matthew and Anney, Richard and Gill, Michael and Corvin, Aiden P
and Morris, Derek W",
abstract = "Next-generation RNA sequencing (RNA-seq) maps and analyzes
transcriptomes and generates data on sequence variation in
expressed genes. There are few reported studies on analysis
strategies to maximize the yield of quality RNA-seq SNP data. We
evaluated the performance of different SNP-calling methods
following alignment to both genome and transcriptome by applying
them to RNA-seq data from a HapMap lymphoblastoid cell line
sample and comparing results with sequence variation data from
1000 Genomes. We determined that the best method to achieve high
specificity and sensitivity, and greatest number of SNP calls, is
to remove duplicate sequence reads after alignment to the genome
and to call SNPs using SAMtools. The accuracy of SNP calls is
dependent on sequence coverage available. In terms of
specificity, 89\% of RNA-seq SNPs calls were true variants where
coverage is >10X. In terms of sensitivity, at >10X coverage 92\%
of all expected SNPs in expressed exons could be detected.
Overall, the results indicate that RNA-seq SNP data are a very
useful by-product of sequence-based transcriptome analysis. If
RNA-seq is applied to disease tissue samples and assuming that
genes carrying mutations relevant to disease biology are being
expressed, a very high proportion of these mutations can be
detected.",
journal = "PLoS One",
volume = 8,
number = 3,
pages = "e58815",
month = mar,
year = 2013,
keywords = "Finding optimal coverage;RNA variant calling;Variant calling
paper",
language = "en"
}
% GATK framework paper (Genome Res., 2010).
% The proper name "Genome Analysis Toolkit" in the title is brace-protected so
% that sentence-casing bibliography styles do not lowercase it.
@ARTICLE{McKenna2010-mc,
title = "The {Genome Analysis Toolkit}: a {MapReduce} framework for
analyzing next-generation {DNA} sequencing data",
author = "McKenna, Aaron and Hanna, Matthew and Banks, Eric and Sivachenko,
Andrey and Cibulskis, Kristian and Kernytsky, Andrew and
Garimella, Kiran and Altshuler, David and Gabriel, Stacey and
Daly, Mark and DePristo, Mark A",
abstract = "Next-generation DNA sequencing (NGS) projects, such as the 1000
Genomes Project, are already revolutionizing our understanding of
genetic variation among individuals. However, the massive data
sets generated by NGS--the 1000 Genome pilot alone includes
nearly five terabases--make writing feature-rich, efficient, and
robust analysis tools difficult for even computationally
sophisticated individuals. Indeed, many professionals are limited
in the scope and the ease with which they can answer scientific
questions by the complexity of accessing and manipulating the
data produced by these machines. Here, we discuss our Genome
Analysis Toolkit (GATK), a structured programming framework
designed to ease the development of efficient and robust analysis
tools for next-generation DNA sequencers using the functional
programming philosophy of MapReduce. The GATK provides a small
but rich set of data access patterns that encompass the majority
of analysis tool needs. Separating specific analysis calculations
from common data management infrastructure enables us to optimize
the GATK framework for correctness, stability, and CPU and memory
efficiency and to enable distributed and shared memory
parallelization. We highlight the capabilities of the GATK by
describing the implementation and application of robust,
scale-tolerant tools like coverage calculators and single
nucleotide polymorphism (SNP) calling. We conclude that the GATK
programming framework enables developers and analysts to quickly
and easily write efficient and robust NGS tools, many of which
have already been incorporated into large-scale sequencing
projects like the 1000 Genomes Project and The Cancer Genome
Atlas.",
journal = "Genome Res.",
volume = 20,
number = 9,
pages = "1297--1303",
month = sep,
year = 2010,
keywords = "Variant calling paper",
language = "en"
}