% biblio.bib

% Generated by Paperpile. Check out http://paperpile.com for more information.
% BibTeX export options can be customized via Settings -> BibTeX.

% km: targeted variant detection from unaligned RNA-Seq reads (bioRxiv preprint).
% Authors are listed individually ("Last, First and ...") so styles can
% sort, abbreviate and truncate the list correctly.
@article{Software-km,
  author   = {Audemard, Eric Olivier and Gendron, Patrick and Lavall{\'e}e,
    Vincent-Philippe and H{\'e}bert, Jos{\'e}e and Sauvageau, Guy and
    Lemieux, S{\'e}bastien},
  title    = {Targeted variant detection in leukemia using unaligned {RNA-Seq}
    reads},
  journal  = {bioRxiv},
  month    = apr,
  year     = 2018,
  keywords = {Variant calling paper},
  abstract = {Mutations identified in each Acute Myeloid Leukemia (AML)
    patients are useful for prognosis and to select targeted
    therapies. Detection of such mutations by the analysis of
    Next-Generation Sequencing (NGS) data requires a computationally
    intensive read mapping step and application of several variant
    calling methods. Targeted mutation identification drastically
    shifts the usual tradeoff between accuracy and performance by
    concentrating all computations over a small portion of sequence
    space. Here, we present km, an efficient approach leveraging
    k-mer decomposition of reads to identify targeted mutations. Our
    approach is versatile, as it can detect single-base mutations,
    several types of insertions and deletions, as well as fusions. We
    used two independent AML cohorts (The Cancer Genome Atlas and
    Leucegene), to show that mutation detection by km is fast,
    accurate and mainly limited by sequencing depth. Therefore, km
    allows to establish fast diagnostics from NGS data, and could be
    suitable for clinical applications.}
}

% RNAIndel: somatic coding indel discovery from tumor RNA-Seq (bioRxiv preprint).
% The citation key is kept as exported so existing \cite commands keep working.
@article{Kohei_Hagiwara_Liang_Ding_Michael_N_Edmonson_Stephen_V_Rice_Scott_Newman_Soheil_Meshinchi_Rhonda_E_Ries_Michael_Rusch_Jinghui_Zhang2019-ny,
  author   = {Hagiwara, Kohei and Ding, Liang and Edmonson, Michael N and
    Rice, Stephen V and Newman, Scott and Meshinchi, Soheil and Ries,
    Rhonda E and Rusch, Michael and Zhang, Jinghui},
  title    = {{RNAIndel}: a machine-learning framework for discovery of somatic
    coding indels using tumor {RNA-Seq} data},
  journal  = {bioRxiv},
  month    = jan,
  year     = 2019,
  keywords = {To read;Variant calling paper},
  abstract = {Transcriptome sequencing (RNA-Seq) has been used for gene
    expression profiling and fusion detection but rarely for small
    insertion/deletion (indel) analysis due to the presence of
    artifacts generated during the PCR-based library preparation as
    well as alignment of spliced reads. Somatic indel calling is
    further challenged by the lack of matched normal RNA-Seq data. We
    present RNAIndel, a machine-learning based approach for
    classifying RNA-Seq indels into somatic, germline, and artifact
    by random forest models. RNAIndel was trained on tumor RNA-Seq of
    330 pediatric cancer patients for whom whole exome and PCR-free
    whole-genome sequencing of paired tumor-normal DNA samples were
    also performed. Feature selection characterized somatic indels as
    those that were not explained by the strand-slippage model, a
    widely accepted hypothesis to explain indel generation during DNA
    replication. The method was tested on two independent RNA-Seq
    datasets with variable library protocols and RNA-Seq read
    lengths. Despite the heterogeneity in RNA-Seq data acquisition,
    RNAIndel robustly predicted 87--93\% of somatic indels,
    recovering subclonal pathogenic indels that were missed by a
    500$\times$ targeted sequencing of DNA samples. With RNAIndel,
    researchers can perform somatic indel calling in the
    transcriptome, expanding the utility of RNA-Seq and enhancing the
    interpretability of somatic indels.}
}

% ABRA2: realignment-based indel detection for DNA and RNA reads.
@article{Mose2019-vh,
  author   = {Mose, Lisle E and Perou, Charles M and Parker, Joel S},
  title    = {Improved Indel Detection in {DNA} and {RNA} via Realignment with
    {ABRA2}},
  journal  = {Bioinformatics},
  month    = jan,
  year     = 2019,
  keywords = {RNA variant calling;Variant calling paper},
  language = {en},
  abstract = {Motivation: Genomic variant detection from next-generation
    sequencing (NGS) has become established as an extremely important
    component of research and clinical diagnoses in both cancer and
    Mendelian disorders. Insertions and deletions (indels) are a
    common source of variation and can frequently impact
    functionality, thus making their detection vitally important.
    While substantial effort has gone into detecting indels from DNA,
    there is still opportunity for improvement. Further, detection of
    indels from RNA-Seq data has largely been an afterthought and
    offers another critical area for variant detection. Results: We
    present here ABRA2, a redesign of the original ABRA
    implementation that offers support for realignment of both RNA
    and DNA short reads. The process results in improved accuracy and
    scalability including support for human whole genomes. Results
    demonstrate substantial improvement in indel detection for a
    variety of data types, including those that were not previously
    supported by ABRA. Further, ABRA2 results in broad improvements
    to variant calling accuracy across a wide range of
    post-processing workflows including whole genomes, targeted
    exomes, and transcriptome sequencing. Availability: ABRA2 is
    implemented in a combination of Java and C/C++ and is freely
    available to all from: https://github.com/mozack/abra2.
    Supplementary information: Supplementary data are available at
    Bioinformatics online.}
}

% Cleary et al. 2015: VCF comparison for benchmarking variant callers
% (bioRxiv preprint). The unpublished type requires a note field and ignores
% journal, so the preprint server and identifier are repeated in the note.
@unpublished{Cleary2015-bl,
  author   = {Cleary, John G and Braithwaite, Ross and Gaastra, Kurt and
    Hilbush, Brian S and Inglis, Stuart and Irvine, Sean A and
    Jackson, Alan and Littin, Richard and Rathod, Mehul and Ware,
    David and Zook, Justin M and Trigg, Len and De La Vega, Francisco
    M},
  title    = {Comparing Variant Call Files for Performance Benchmarking of
    {Next-Generation} Sequencing Variant Calling Pipelines},
  journal  = {bioRxiv},
  pages    = {023754},
  month    = aug,
  year     = 2015,
  note     = {bioRxiv preprint 023754},
  doi      = {10.1101/023754},
  keywords = {Variant calling paper},
  language = {en},
  abstract = {To evaluate and compare the performance of variant calling
    methods and their confidence scores, comparisons between a test
    call set and a ``gold standard'' need to be carried out.
    Unfortunately, these comparisons are not straightforward with the
    current Variant Call Files (VCF), which are the standard output
    of most variant calling algorithms for high-throughput sequencing
    data. Comparisons of VCFs are often confounded by the different
    representations of indels, MNPs, and combinations thereof with
    SNVs in complex regions of the genome, resulting in misleading
    results. A variant caller is inherently a classification method
    designed to score putative variants with confidence scores that
    could permit controlling the rate of false positives (FP) or
    false negatives (FN) for a given application. Receiver operator
    curves (ROC) and the area under the ROC (AUC) are efficient
    metrics to evaluate a test call set versus a gold standard.
    However, in the case of VCF data this also requires a special
    accounting to deal with discrepant representations. We developed
    a novel algorithm for comparing variant call sets that deals with
    complex call representation discrepancies and through a dynamic
    programing method that minimizes false positives and negatives
    globally across the entire call sets for accurate performance
    evaluation of VCFs.}
}

% Sambamba: multi-core processing of SAM/BAM/CRAM alignment files.
@article{Tarasov2015-ex,
  author   = {Tarasov, Artem and Vilella, Albert J and Cuppen, Edwin and
    Nijman, Isaac J and Prins, Pjotr},
  title    = {Sambamba: fast processing of {NGS} alignment formats},
  journal  = {Bioinformatics},
  volume   = 31,
  number   = 12,
  pages    = {2032--2034},
  month    = jun,
  year     = 2015,
  keywords = {Variant calling paper},
  language = {en},
  abstract = {UNLABELLED: Sambamba is a high-performance robust tool and
    library for working with SAM, BAM and CRAM sequence alignment
    files; the most common file formats for aligned next generation
    sequencing data. Sambamba is a faster alternative to samtools
    that exploits multi-core processing and dramatically reduces
    processing time. Sambamba is being adopted at sequencing centers,
    not only because of its speed, but also because of additional
    functionality, including coverage analysis and powerful filtering
    capability. AVAILABILITY AND IMPLEMENTATION: Sambamba is free and
    open source software, available under a GPLv2 license. Sambamba
    can be downloaded and installed from
    http://www.open-bio.org/wiki/Sambamba. Sambamba v0.5.0 was
    released with doi:10.5281/zenodo.13200.}
}

% STAR: spliced RNA-seq read aligner.
% The greater-than signs in the abstract are set in math mode so they render
% correctly under the default OT1 font encoding.
@article{Dobin2013-uc,
  author    = {Dobin, Alexander and Davis, Carrie A and Schlesinger, Felix and
    Drenkow, Jorg and Zaleski, Chris and Jha, Sonali and Batut,
    Philippe and Chaisson, Mark and Gingeras, Thomas R},
  title     = {{STAR}: ultrafast universal {RNA-seq} aligner},
  journal   = {Bioinformatics},
  publisher = {Oxford University Press},
  volume    = 29,
  number    = 1,
  pages     = {15--21},
  month     = jan,
  year      = 2013,
  keywords  = {Variant calling paper},
  abstract  = {Motivation: Accurate alignment of high-throughput RNA-seq data
    is a challenging and yet unsolved problem because of the
    non-contiguous transcript structure, relatively short read
    lengths and constantly increasing throughput of the sequencing
    technologies. Currently available RNA-seq aligners suffer from
    high mapping error rates, low mapping speed, read length
    limitation and mapping biases. Results: To align our large ($>$80
    billion reads) ENCODE Transcriptome RNA-seq dataset, we developed
    the Spliced Transcripts Alignment to a Reference (STAR) software
    based on a previously undescribed RNA-seq alignment algorithm
    that uses sequential maximum mappable seed search in
    uncompressed suffix arrays followed by seed clustering and
    stitching procedure. STAR outperforms other aligners by a factor
    of $>$50 in mapping speed, aligning to the human genome 550
    million 2 $\times$ 76 bp paired-end reads per hour on a modest
    12-core server, while at the same time improving alignment
    sensitivity and precision. In addition to unbiased de novo
    detection of canonical junctions, STAR can discover
    non-canonical splices and chimeric (fusion) transcripts, and is
    also capable of mapping full-length RNA sequences. Using Roche
    454 sequencing of reverse transcription polymerase chain
    reaction amplicons, we experimentally validated 1960 novel
    intergenic splice junctions with an 80--90\% success rate,
    corroborating the high precision of the STAR mapping
    strategy. Availability and implementation: STAR is implemented as
    a standalone C++ code. STAR is free open source software
    distributed under GPLv3 license and can be downloaded from
    http://code.google.com/p/rna-star/. Contact: dobin@cshl.edu.}
}

% GNU Parallel. Publication details follow the citation notice shipped with
% the tool (parallel --citation). The key still says "undated" so existing
% citations keep resolving.
@article{Tange_undated-th,
  author   = {Tange, Ole},
  title    = {{GNU} Parallel: The {Command-Line} Power Tool},
  journal  = {;login: The USENIX Magazine},
  volume   = 36,
  number   = 1,
  pages    = {42--47},
  month    = feb,
  year     = 2011,
  keywords = {Variant calling paper}
}

% FastQC: read-level quality control tool (software, cited by project URL).
@misc{Andrews2010-fk,
  author       = {Andrews, Simon},
  title        = {{FastQC}: a quality control tool for high throughput sequence
    data},
  howpublished = {\url{http://www.bioinformatics.babraham.ac.uk/projects/fastqc}},
  year         = 2010,
  keywords     = {Variant calling paper}
}

% TCGA-LAML data set; the author is a braced corporate name.
@misc{Tcga_undated-qb,
  author   = {{TCGA}},
  title    = {{TCGA-LAML}},
  keywords = {RNA variant calling;Variant calling paper}
}

% seqtk: FASTA/Q processing toolkit (software, cited by repository URL).
@misc{Li_undated-qy,
  author       = {Li, Heng},
  title        = {seqtk: Toolkit for processing sequences in {FASTA/Q} formats},
  booktitle    = {seqtk},
  howpublished = {\url{https://github.com/lh3/seqtk}},
  keywords     = {Variant calling paper}
}

% Picard Tools (software). The author is an institution, so it is
% double-braced to stop it being parsed as a personal name.
@misc{Institute_undated-qc,
  author       = {{Broad Institute}},
  title        = {Picard Tools},
  booktitle    = {Picard Tools},
  howpublished = {\url{http://broadinstitute.github.io/picard}},
  keywords     = {Variant calling paper}
}

% Ensembl Variant Effect Predictor (VEP).
@article{McLaren2016-lv,
  author   = {McLaren, William and Gil, Laurent and Hunt, Sarah E and Riat,
    Harpreet Singh and Ritchie, Graham R S and Thormann, Anja and
    Flicek, Paul and Cunningham, Fiona},
  title    = {The Ensembl Variant Effect Predictor},
  journal  = {Genome Biol.},
  volume   = 17,
  number   = 1,
  pages    = {122},
  month    = jun,
  year     = 2016,
  keywords = {Genome; NGS; SNP; Variant annotation;Variant calling paper},
  language = {en},
  abstract = {The Ensembl Variant Effect Predictor is a powerful toolset for
    the analysis, annotation, and prioritization of genomic variants
    in coding and non-coding regions. It provides access to an
    extensive collection of genomic annotation, with a variety of
    interfaces to suit different requirements, and simple options for
    configuring and extending analysis. It is open source, free to
    use, and supports full reproducibility of results. The Ensembl
    Variant Effect Predictor can simplify and accelerate variant
    interpretation in a wide range of study designs.}
}

% Ramaswami et al. 2013: calling RNA editing sites from RNA-seq alone.
@article{Ramaswami2013-gm,
  author   = {Ramaswami, Gokul and Zhang, Rui and Piskol, Robert and Keegan,
    Liam P and Deng, Patricia and O'Connell, Mary A and Li, Jin Billy},
  title    = {Identifying {RNA} editing sites using {RNA} sequencing data alone},
  journal  = {Nat. Methods},
  volume   = 10,
  number   = 2,
  pages    = {128--132},
  month    = feb,
  year     = 2013,
  keywords = {Variant calling paper},
  language = {en},
  abstract = {We show that RNA editing sites can be called with high confidence
    using RNA sequencing data from multiple samples across either
    individuals or species, without the need for matched genomic DNA
    sequence. We identified many previously unidentified editing
    sites in both humans and Drosophila; our results nearly double
    the known number of human protein recoding events. We also found
    that human genes harboring conserved editing sites within Alu
    repeats are enriched for neuronal functions.}
}

% RepeatMasker (software). The three authors are listed separately with
% "and" so they are not treated as one literal name.
@misc{Smit_AFA_Hubley_R_Green_P2013-tz,
  author       = {Smit, A F A and Hubley, R and Green, P},
  title        = {{RepeatMasker}},
  howpublished = {\url{http://www.repeatmasker.org}},
  year         = 2013,
  keywords     = {Variant calling paper}
}

% RADAR: annotated database of A-to-I RNA editing sites.
@article{Ramaswami2014-nl,
  author   = {Ramaswami, Gokul and Li, Jin Billy},
  title    = {{RADAR}: a rigorously annotated database of {A-to-I} {RNA}
    editing},
  journal  = {Nucleic Acids Res.},
  volume   = 42,
  number   = {Database issue},
  pages    = {D109--13},
  month    = jan,
  year     = 2014,
  keywords = {Variant calling paper},
  language = {en},
  abstract = {We present RADAR--a rigorously annotated database of A-to-I RNA
    editing (available at http://RNAedit.com). The identification of
    A-to-I RNA editing sites has been dramatically accelerated in the
    past few years by high-throughput RNA sequencing studies. RADAR
    includes a comprehensive collection of A-to-I RNA editing sites
    identified in humans (Homo sapiens), mice (Mus musculus) and
    flies (Drosophila melanogaster), together with extensive manually
    curated annotations for each editing site. RADAR also includes an
    expandable listing of tissue-specific editing levels for each
    editing site, which will facilitate the assignment of biological
    functions to specific editing sites.}
}

% COSMIC v78: Catalogue of Somatic Mutations in Cancer.
@article{Forbes2017-qq,
  author   = {Forbes, Simon A and Beare, David and Boutselakis, Harry and
    Bamford, Sally and Bindal, Nidhi and Tate, John and Cole,
    Charlotte G and Ward, Sari and Dawson, Elisabeth and Ponting,
    Laura and Stefancsik, Raymund and Harsha, Bhavana and Kok, Chai
    Yin and Jia, Mingming and Jubb, Harry and Sondka, Zbyslaw and
    Thompson, Sam and De, Tisham and Campbell, Peter J},
  title    = {{COSMIC}: somatic cancer genetics at high-resolution},
  journal  = {Nucleic Acids Res.},
  volume   = 45,
  number   = {D1},
  pages    = {D777--D783},
  month    = jan,
  year     = 2017,
  keywords = {Variant calling paper},
  language = {en},
  abstract = {COSMIC, the Catalogue of Somatic Mutations in Cancer
    (http://cancer.sanger.ac.uk) is a high-resolution resource for
    exploring targets and trends in the genetics of human cancer.
    Currently the broadest database of mutations in cancer, the
    information in COSMIC is curated by expert scientists, primarily
    by scrutinizing large numbers of scientific publications. Over 4
    million coding mutations are described in v78 (September 2016),
    combining genome-wide sequencing results from 28 366 tumours with
    complete manual curation of 23 489 individual publications
    focused on 186 key genes and 286 key fusion pairs across all
    cancers. Molecular profiling of large tumour numbers has also
    allowed the annotation of more than 13 million non-coding
    mutations, 18 029 gene fusions, 187 429 genome rearrangements, 1
    271 436 abnormal copy number segments, 9 175 462 abnormal
    expression variants and 7 879 142 differentially methylated CpG
    dinucleotides. COSMIC now details the genetics of drug
    resistance, novel somatic gene mutations which allow a tumour to
    evade therapeutic cancer drugs. Focusing initially on highly
    characterized drugs and genes, COSMIC v78 contains wide
    resistance mutation profiles across 20 drugs, detailing the
    recurrence of 301 unique resistance alleles across 1934
    drug-resistant tumours. All information from the COSMIC database
    is available freely on the COSMIC website.}
}

% ExAC: protein-coding variation in 60,706 exomes. The consortium at the end
% of the author list is a braced corporate name.
@article{Lek2016-im,
  author   = {Lek, Monkol and Karczewski, Konrad J and Minikel, Eric V and
    Samocha, Kaitlin E and Banks, Eric and Fennell, Timothy and
    O'Donnell-Luria, Anne H and Ware, James S and Hill, Andrew J and
    Cummings, Beryl B and Tukiainen, Taru and Birnbaum, Daniel P and
    Kosmicki, Jack A and Duncan, Laramie E and Estrada, Karol and
    Zhao, Fengmei and Zou, James and Pierce-Hoffman, Emma and
    Berghout, Joanne and Cooper, David N and Deflaux, Nicole and
    DePristo, Mark and Do, Ron and Flannick, Jason and Fromer,
    Menachem and Gauthier, Laura and Goldstein, Jackie and Gupta,
    Namrata and Howrigan, Daniel and Kiezun, Adam and Kurki, Mitja I
    and Moonshine, Ami Levy and Natarajan, Pradeep and Orozco, Lorena
    and Peloso, Gina M and Poplin, Ryan and Rivas, Manuel A and
    Ruano-Rubio, Valentin and Rose, Samuel A and Ruderfer, Douglas M
    and Shakir, Khalid and Stenson, Peter D and Stevens, Christine
    and Thomas, Brett P and Tiao, Grace and Tusie-Luna, Maria T and
    Weisburd, Ben and Won, Hong-Hee and Yu, Dongmei and Altshuler,
    David M and Ardissino, Diego and Boehnke, Michael and Danesh,
    John and Donnelly, Stacey and Elosua, Roberto and Florez, Jose C
    and Gabriel, Stacey B and Getz, Gad and Glatt, Stephen J and
    Hultman, Christina M and Kathiresan, Sekar and Laakso, Markku and
    McCarroll, Steven and McCarthy, Mark I and McGovern, Dermot and
    McPherson, Ruth and Neale, Benjamin M and Palotie, Aarno and
    Purcell, Shaun M and Saleheen, Danish and Scharf, Jeremiah M and
    Sklar, Pamela and Sullivan, Patrick F and Tuomilehto, Jaakko and
    Tsuang, Ming T and Watkins, Hugh C and Wilson, James G and Daly,
    Mark J and MacArthur, Daniel G and {Exome Aggregation Consortium}},
  title    = {Analysis of protein-coding genetic variation in 60,706 humans},
  journal  = {Nature},
  volume   = 536,
  number   = 7616,
  pages    = {285--291},
  month    = aug,
  year     = 2016,
  keywords = {Variant calling paper},
  language = {en},
  abstract = {Large-scale reference data sets of human genetic variation are
    critical for the medical and functional interpretation of DNA
    sequence changes. Here we describe the aggregation and analysis
    of high-quality exome (protein-coding region) DNA sequence data
    for 60,706 individuals of diverse ancestries generated as part of
    the Exome Aggregation Consortium (ExAC). This catalogue of human
    genetic diversity contains an average of one variant every eight
    bases of the exome, and provides direct evidence for the presence
    of widespread mutational recurrence. We have used this catalogue
    to calculate objective metrics of pathogenicity for sequence
    variants, and to identify genes subject to strong selection
    against various classes of mutation; identifying 3,230 genes with
    near-complete depletion of predicted protein-truncating variants,
    with 72\% of these genes having no currently established human
    disease phenotype. Finally, we demonstrate that these data can be
    used for the efficient filtering of candidate disease-causing
    variants, and for the discovery of human 'knockout' variants in
    protein-coding genes.}
}

% dbSNP: the NCBI database of genetic variation.
@article{Sherry2001-eh,
  author   = {Sherry, S T and Ward, M H and Kholodov, M and Baker, J and Phan,
    L and Smigielski, E M and Sirotkin, K},
  title    = {{dbSNP}: the {NCBI} database of genetic variation},
  journal  = {Nucleic Acids Res.},
  volume   = 29,
  number   = 1,
  pages    = {308--311},
  month    = jan,
  year     = 2001,
  keywords = {Variant calling paper},
  language = {en},
  abstract = {In response to a need for a general catalog of genome variation
    to address the large-scale sampling designs required by
    association studies, gene mapping and evolutionary biology, the
    National Center for Biotechnology Information (NCBI) has
    established the dbSNP database [S.T.Sherry, M.Ward and K.
    Sirotkin (1999) Genome Res., 9, 677-679]. Submissions to dbSNP
    will be integrated with other sources of information at NCBI such
    as GenBank, PubMed, LocusLink and the Human Genome Project data.
    The complete contents of dbSNP are available to the public at
    website: http://www.ncbi.nlm.nih.gov/SNP. The complete contents
    of dbSNP can also be downloaded in multiple formats via anonymous
    FTP at ftp://ncbi.nlm.nih.gov/snp/.}
}

% SAM format and SAMtools. The processing subgroup at the end of the author
% list is a braced corporate name.
@article{Li2009-th,
  author   = {Li, Heng and Handsaker, Bob and Wysoker, Alec and Fennell, Tim
    and Ruan, Jue and Homer, Nils and Marth, Gabor and Abecasis,
    Goncalo and Durbin, Richard and {1000 Genome Project Data
    Processing Subgroup}},
  title    = {The Sequence {Alignment/Map} format and {SAMtools}},
  journal  = {Bioinformatics},
  volume   = 25,
  number   = 16,
  pages    = {2078--2079},
  month    = aug,
  year     = 2009,
  keywords = {Variant calling paper},
  language = {en},
  abstract = {SUMMARY: The Sequence Alignment/Map (SAM) format is a generic
    alignment format for storing read alignments against reference
    sequences, supporting short and long reads (up to 128 Mbp)
    produced by different sequencing platforms. It is flexible in
    style, compact in size, efficient in random access and is the
    format in which alignments from the 1000 Genomes Project are
    released. SAMtools implements various utilities for
    post-processing alignments in the SAM format, such as indexing,
    variant caller and alignment viewer, and thus provides universal
    tools for processing read alignments. AVAILABILITY:
    http://samtools.sourceforge.net.}
}

% Coudray et al. 2018 (bioRxiv): somatic mutation detection from RNA-seq.
% The abstract is truncated in the export; its trailing ellipsis is written
% as \ldots so the entry stays pure ASCII for classic BibTeX.
@article{Coudray2018-yw,
  author    = {Coudray, A and Battenhouse, A M and Bucher, P and Iyer, V R},
  title     = {Detection and benchmarking of somatic mutations in cancer
    genomes using {RNA-seq} data},
  journal   = {bioRxiv},
  publisher = {biorxiv.org},
  year      = 2018,
  keywords  = {RNA variant calling;Variant calling paper},
  abstract  = {To detect functional somatic mutations in tumor samples,
    whole-exome sequencing (WES) is often used for its reliability
    and relative low cost. RNA-seq, while generally used to measure
    gene expression, can potentially also be used for identification
    of somatic mutations. However there has been little systematic
    evaluation of the utility of RNA-seq for identifying somatic
    mutations. Here, we develop and evaluate a pipeline for
    processing RNA-seq data from glioblastoma multiforme (GBM)
    tumors in order to identify somatic mutations. The \ldots{}}
}

% Poplin et al. 2017: GATK HaplotypeCaller joint calling (bioRxiv preprint).
% The unpublished type requires a note field and ignores journal, so the
% preprint server and identifier are repeated in the note.
@unpublished{Poplin2017-ae,
  author   = {Poplin, Ryan and Ruano-Rubio, Valentin and DePristo, Mark A and
    Fennell, Tim J and Carneiro, Mauricio O and Van der Auwera,
    Geraldine A and Kling, David E and Gauthier, Laura D and
    Levy-Moonshine, Ami and Roazen, David and Shakir, Khalid and
    Thibault, Joel and Chandran, Sheila and Whelan, Chris and Lek,
    Monkol and Gabriel, Stacey and Daly, Mark J and Neale, Benjamin
    and MacArthur, Daniel G and Banks, Eric},
  title    = {Scaling accurate genetic variant discovery to tens of thousands
    of samples},
  journal  = {bioRxiv},
  pages    = {201178},
  month    = nov,
  year     = 2017,
  note     = {bioRxiv preprint 201178},
  doi      = {10.1101/201178},
  keywords = {Variant calling paper},
  language = {en},
  abstract = {Comprehensive disease gene discovery in both common and rare
    diseases will require the efficient and accurate detection of all
    classes of genetic variation across tens to hundreds of thousands
    of human samples. We describe here a novel assembly-based
    approach to variant calling, the GATK HaplotypeCaller (HC) and
    Reference Confidence Model (RCM), that determines genotype
    likelihoods independently per-sample but performs joint calling
    across all samples within a project simultaneously. We show by
    calling over 90,000 samples from the Exome Aggregation Consortium
    (ExAC) that, in contrast to other algorithms, the HC-RCM scales
    efficiently to very large sample sizes without loss in accuracy;
    and that the accuracy of indel variant calling is superior in
    comparison to other algorithms. More importantly, the HC-RCM
    produces a fully squared-off matrix of genotypes across all
    samples at every genomic position being investigated. The HC-RCM
    is a novel, scalable, assembly-based algorithm with abundant
    applications for population genetics and clinical studies.}
}

% eSNV-detect: expressed SNV calling from transcriptome sequencing.
% NOTE(review): the abstract is cut off mid-sentence in the export
% ("There a"); restore it from the publisher page if the field is ever printed.
@article{Tang2014-df,
  author    = {Tang, Xiaojia and Baheti, Saurabh and Shameer, Khader and
    Thompson, Kevin J and Wills, Quin and Niu, Nifang and Holcomb,
    Ilona N and Boutet, Stephane C and Ramakrishnan, Ramesh and
    Kachergus, Jennifer M and Kocher, Jean-Pierre A and
    Weinshilboum, Richard M and Wang, Liewei and Thompson, E Aubrey
    and Kalari, Krishna R},
  title     = {The {eSNV-detect}: a computational system to identify expressed
    single nucleotide variants from transcriptome sequencing data},
  journal   = {Nucleic Acids Res.},
  publisher = {Oxford University Press},
  volume    = 42,
  number    = 22,
  pages     = {e172},
  month     = dec,
  year      = 2014,
  keywords  = {rna; exome; breast cancer; rna, messenger;Variant calling paper},
  abstract  = {Abstract. Rapid development of next generation sequencing
    technology has enabled the identification of genomic alterations
    from short sequencing reads. There a}
}

% Seurat: Bayesian somatic mutation calling on matched tumor/normal genomes.
@article{Christoforides2013-cr,
  author   = {Christoforides, Alexis and Carpten, John D and Weiss, Glen J and
    Demeure, Michael J and Von Hoff, Daniel D and Craig, David W},
  title    = {Identification of somatic mutations in cancer through
    Bayesian-based analysis of sequenced genome pairs},
  journal  = {BMC Genomics},
  volume   = 14,
  pages    = {302},
  month    = may,
  year     = 2013,
  keywords = {Variant calling paper},
  language = {en},
  abstract = {BACKGROUND: The field of cancer genomics has rapidly adopted
    next-generation sequencing (NGS) in order to study and
    characterize malignant tumors with unprecedented resolution. In
    particular for cancer, one is often trying to identify somatic
    mutations--changes specific to a tumor and not within an
    individual's germline. However, false positive and false negative
    detections often result from lack of sufficient variant evidence,
    contamination of the biopsy by stromal tissue, sequencing errors,
    and the erroneous classification of germline variation as
    tumor-specific. RESULTS: We have developed a generalized Bayesian
    analysis framework for matched tumor/normal samples with the
    purpose of identifying tumor-specific alterations such as single
    nucleotide mutations, small insertions/deletions, and structural
    variation. We describe our methodology, and discuss its
    application to other types of paired-tissue analysis such as the
    detection of loss of heterozygosity as well as allelic imbalance.
    We also demonstrate the high level of sensitivity and specificity
    in discovering simulated somatic mutations, for various
    combinations of a) genomic coverage and b) emulated
    heterogeneity. CONCLUSION: We present a Java-based implementation
    of our methods named Seurat, which is made available for free
    academic use. We have demonstrated and reported on the discovery
    of different types of somatic change by applying Seurat to an
    experimentally-derived cancer dataset using our methods; and have
    discussed considerations and practices regarding the accurate
    detection of somatic events in cancer genomes. Seurat is
    available at https://sites.google.com/site/seuratsomatic.}
}

@ARTICLE{Radenbaugh2014-cj,
  title    = "{RADIA}: {RNA} and {DNA} integrated analysis for somatic mutation
              detection",
  author   = "Radenbaugh, Amie J and Ma, Singer and Ewing, Adam and Stuart,
              Joshua M and Collisson, Eric A and Zhu, Jingchun and Haussler,
              David",
  abstract = "The detection of somatic single nucleotide variants is a crucial
              component to the characterization of the cancer genome. Mutation
              calling algorithms thus far have focused on comparing the normal
              and tumor genomes from the same individual. In recent years, it
              has become routine for projects like The Cancer Genome Atlas
              (TCGA) to also sequence the tumor RNA. Here we present RADIA (RNA
              and DNA Integrated Analysis), a novel computational method
              combining the patient-matched normal and tumor DNA with the tumor
              RNA to detect somatic mutations. The inclusion of the RNA
              increases the power to detect somatic mutations, especially at
              low DNA allelic frequencies. By integrating an individual's DNA
              and RNA, we are able to detect mutations that would otherwise be
              missed by traditional algorithms that examine only the DNA. We
              demonstrate high sensitivity (84\%) and very high precision (98\%
              and 99\%) for RADIA in patient data from endometrial carcinoma
              and lung adenocarcinoma from TCGA. Mutations with both high DNA
              and RNA read support have the highest validation rate of over
              99\%. We also introduce a simulation package that spikes in
              artificial mutations to patient data, rather than simulating
              sequencing data from a reference genome. We evaluate sensitivity
              on the simulation data and demonstrate our ability to rescue back
              mutations at low DNA allelic frequencies by including the RNA.
              Finally, we highlight mutations in important cancer genes that
              were rescued due to the incorporation of the RNA.",
  journal  = "PLoS One",
  volume   =  9,
  number   =  11,
  pages    = "e111516",
  month    =  nov,
  year     =  2014,
  keywords = "Variant calling paper",
  language = "en"
}

@ARTICLE{Ewels2016-rc,
  title    = "{MultiQC}: summarize analysis results for multiple tools and
              samples in a single report",
  author   = "Ewels, Philip and Magnusson, M{\aa}ns and Lundin, Sverker and
              K{\"a}ller, Max",
  abstract = "MOTIVATION: Fast and accurate quality control is essential for
              studies involving next-generation sequencing data. Whilst
              numerous tools exist to quantify QC metrics, there is no common
              approach to flexibly integrate these across tools and large
              sample sets. Assessing analysis results across an entire project
              can be time consuming and error prone; batch effects and outlier
              samples can easily be missed in the early stages of analysis.
              RESULTS: We present MultiQC, a tool to create a single report
              visualising output from multiple tools across many samples,
              enabling global trends and biases to be quickly identified.
              MultiQC can plot data from many common bioinformatics tools and
              is built to allow easy extension and customization. AVAILABILITY
              AND IMPLEMENTATION: MultiQC is available with a GNU GPLv3
              license on GitHub, the Python Package Index and Bioconda.
              Documentation and example reports are available at
              http://multiqc.info. CONTACT: phil.ewels@scilifelab.se.",
  journal  = "Bioinformatics",
  volume   =  32,
  number   =  19,
  pages    = "3047--3048",
  month    =  oct,
  year     =  2016,
  keywords = "Variant calling paper",
  language = "en"
}

@ARTICLE{Liao2013-ga,
  title    = "The Subread aligner: fast, accurate and scalable read mapping by
              seed-and-vote",
  author   = "Liao, Yang and Smyth, Gordon K and Shi, Wei",
  abstract = "Read alignment is an ongoing challenge for the analysis of data
              from sequencing technologies. This article proposes an elegantly
              simple multi-seed strategy, called seed-and-vote, for mapping
              reads to a reference genome. The new strategy chooses the mapped
              genomic location for the read directly from the seeds. It uses a
              relatively large number of short seeds (called subreads)
              extracted from each read and allows all the seeds to vote on the
              optimal location. When the read length is <160 bp, overlapping
              subreads are used. More conventional alignment algorithms are
              then used to fill in detailed mismatch and indel information
              between the subreads that make up the winning voting block. The
              strategy is fast because the overall genomic location has already
              been chosen before the detailed alignment is done. It is
              sensitive because no individual subread is required to map
              exactly, nor are individual subreads constrained to map close by
              other subreads. It is accurate because the final location must be
              supported by several different subreads. The strategy extends
              easily to find exon junctions, by locating reads that contain
              sets of subreads mapping to different exons of the same gene. It
              scales up efficiently for longer reads.",
  journal  = "Nucleic Acids Res.",
  volume   =  41,
  number   =  10,
  pages    = "e108",
  month    =  may,
  year     =  2013,
  keywords = "Variant calling paper",
  language = "en"
}

@ARTICLE{Xu2018-bt,
  title    = "A review of somatic single nucleotide variant calling algorithms
              for next-generation sequencing data",
  author   = "Xu, Chang",
  abstract = "Detection of somatic mutations holds great potential in cancer
              treatment and has been a very active research field in the past
              few years, especially since the breakthrough of the
              next-generation sequencing technology. A collection of variant
              calling pipelines have been developed with different underlying
              models, filters, input data requirements, and targeted
              applications. This review aims to enumerate these unique features
              of the state-of-the-art variant callers, in the hope to provide a
              practical guide for selecting the appropriate pipeline for
              specific applications. We will focus on the detection of somatic
              single nucleotide variants, ranging from traditional variant
              callers based on whole genome or exome sequencing of paired
              tumor-normal samples to recent low-frequency variant callers
              designed for targeted sequencing protocols with unique molecular
              identifiers. The variant callers have been extensively
              benchmarked with inconsistent performances across these studies.
              We will review the reference materials, datasets, and performance
              metrics that have been used in the benchmarking studies. In the
              end, we will discuss emerging trends and future directions of the
              variant calling algorithms.",
  journal  = "Comput. Struct. Biotechnol. J.",
  volume   =  16,
  pages    = "15--24",
  month    =  feb,
  year     =  2018,
  keywords = "Benchmarking; Low-frequency mutation; Somatic mutation; Unique
              molecular identifier; Variant calling;Variant calling paper",
  language = "en"
}

@ARTICLE{Lai2016-ws,
  title    = "{VarDict}: a novel and versatile variant caller for
              next-generation sequencing in cancer research",
  author   = "Lai, Zhongwu and Markovets, Aleksandra and Ahdesmaki, Miika and
              Chapman, Brad and Hofmann, Oliver and McEwen, Robert and Johnson,
              Justin and Dougherty, Brian and Barrett, J Carl and Dry, Jonathan
              R",
  abstract = "Accurate variant calling in next generation sequencing (NGS) is
              critical to understand cancer genomes better. Here we present
              VarDict, a novel and versatile variant caller for both DNA- and
              RNA-sequencing data. VarDict simultaneously calls SNV, MNV,
              InDels, complex and structural variants, expanding the detected
              genetic driver landscape of tumors. It performs local
              realignments on the fly for more accurate allele frequency
              estimation. VarDict performance scales linearly to sequencing
              depth, enabling ultra-deep sequencing used to explore tumor
              evolution or detect tumor DNA circulating in blood. In addition,
              VarDict performs amplicon aware variant calling for polymerase
              chain reaction (PCR)-based targeted sequencing often used in
              diagnostic settings, and is able to detect PCR artifacts.
              Finally, VarDict also detects differences in somatic and loss of
              heterozygosity variants between paired samples. VarDict
              reprocessing of The Cancer Genome Atlas (TCGA) Lung
              Adenocarcinoma dataset called known driver mutations in KRAS,
              EGFR, BRAF, PIK3CA and MET in 16\% more patients than previously
              published variant calls. We believe VarDict will greatly
              facilitate application of NGS in clinical cancer research.",
  journal  = "Nucleic Acids Res.",
  volume   =  44,
  number   =  11,
  pages    = "e108",
  month    =  jun,
  year     =  2016,
  keywords = "Finding optimal coverage;Variant calling paper",
  language = "en"
}

@ARTICLE{Bohnert2017-if,
  title    = "Comprehensive benchmarking of {SNV} callers for highly admixed
              tumor data",
  author   = "Bohnert, Regina and Vivas, Sonia and Jansen, Gunther",
  abstract = "Precision medicine attempts to individualize cancer therapy by
              matching tumor-specific genetic changes with effective targeted
              therapies. A crucial first step in this process is the reliable
              identification of cancer-relevant variants, which is considerably
              complicated by the impurity and heterogeneity of clinical tumor
              samples. We compared the impact of admixture of non-cancerous
              cells and low somatic allele frequencies on the sensitivity and
              precision of 19 state-of-the-art SNV callers. We studied both
              whole exome and targeted gene panel data and up to 13 distinct
              parameter configurations for each tool. We found vast differences
              among callers. Based on our comprehensive analyses we recommend
              joint tumor-normal calling with MuTect, EBCall or Strelka for
              whole exome somatic variant calling, and HaplotypeCaller or
              FreeBayes for whole exome germline calling. For targeted gene
              panel data on a single tumor sample, LoFreqStar performed best.
              We further found that tumor impurity and admixture had a negative
              impact on precision, and in particular, sensitivity in whole
              exome experiments. At admixture levels of 60\% to 90\% sometimes
              seen in pathological biopsies, sensitivity dropped significantly,
              even when variants were originally present in the tumor at 100\%
              allele frequency. Sensitivity to low-frequency SNVs improved with
              targeted panel data, but whole exome data allowed more efficient
              identification of germline variants. Effective somatic variant
              calling requires high-quality pathological samples with minimal
              admixture, a consciously selected sequencing strategy, and the
              appropriate variant calling tool with settings optimized for the
              chosen type of data.",
  journal  = "PLoS One",
  volume   =  12,
  number   =  10,
  pages    = "e0186175",
  month    =  oct,
  year     =  2017,
  keywords = "RNA variant calling;Variant calling paper",
  language = "en"
}

@ARTICLE{Tarazona2011-qy,
  title    = "Differential expression in {RNA-seq}: a matter of depth",
  author   = "Tarazona, Sonia and Garc{\'\i}a-Alcalde, Fernando and Dopazo,
              Joaqu{\'\i}n and Ferrer, Alberto and Conesa, Ana",
  abstract = "Next-generation sequencing (NGS) technologies are revolutionizing
              genome research, and in particular, their application to
              transcriptomics (RNA-seq) is increasingly being used for gene
              expression profiling as a replacement for microarrays. However,
              the properties of RNA-seq data have not been yet fully
              established, and additional research is needed for understanding
              how these data respond to differential expression analysis. In
              this work, we set out to gain insights into the characteristics
              of RNA-seq data analysis by studying an important parameter of
              this technology: the sequencing depth. We have analyzed how
              sequencing depth affects the detection of transcripts and their
              identification as differentially expressed, looking at aspects
              such as transcript biotype, length, expression level, and
              fold-change. We have evaluated different algorithms available for
              the analysis of RNA-seq and proposed a novel
              approach---NOISeq---that differs from existing methods in that it
              is data-adaptive and nonparametric. Our results reveal that most
              existing methodologies suffer from a strong dependency on
              sequencing depth for their differential expression calls and that
              this results in a considerable number of false positives that
              increases as the number of reads grows. In contrast, our proposed
              method models the noise distribution from the actual data, can
              therefore better adapt to the size of the data set, and is more
              effective in controlling the rate of false discoveries. This work
              discusses the true potential of RNA-seq for studying regulation
              at low expression ranges, the noise within RNA-seq data, and the
              issue of replication.",
  journal  = "Genome Res.",
  volume   =  21,
  number   =  12,
  pages    = "2213--2223",
  month    =  dec,
  year     =  2011,
  keywords = "Finding optimal coverage;Variant calling paper;sequencing depth,
              gene expression, AML",
  language = "en"
}

@ARTICLE{Wu2016-kh,
  title    = "Experimental Design and Power Calculation for {RNA-seq}
              Experiments",
  author   = "Wu, Zhijin and Wu, Hao",
  abstract = "Power calculation is a critical component of RNA-seq experimental
              design. The flexibility of RNA-seq experiment and the wide
              dynamic range of transcription it measures make it an attractive
              technology for whole transcriptome analysis. These features, in
              addition to the high dimensionality of RNA-seq data, bring
              complexity in experimental design, making an analytical power
              calculation no longer realistic. In this chapter we review the
              major factors that influence the statistical power of detecting
              differential expression, and give examples of power assessment
              using the R package PROPER.",
  journal  = "Methods Mol. Biol.",
  volume   =  1418,
  pages    = "379--390",
  year     =  2016,
  keywords = "Experimental design; Gene expression; RNA-Seq; Sample size;
              Statistical power;Finding optimal coverage;RNA variant
              calling;Variant calling paper;sequencing depth, gene expression,
              AML",
  language = "en"
}

@ARTICLE{Sandmann2017-kc,
  title    = "Evaluating Variant Calling Tools for {Non-Matched}
              {Next-Generation} Sequencing Data",
  author   = "Sandmann, Sarah and de Graaf, Aniek O and Karimi, Mohsen and van
              der Reijden, Bert A and Hellstr{\"o}m-Lindberg, Eva and Jansen,
              Joop H and Dugas, Martin",
  abstract = "Valid variant calling results are crucial for the use of
              next-generation sequencing in clinical routine. However, there
              are numerous variant calling tools that usually differ in
              algorithms, filtering strategies, recommendations and thus, also
              in the output. We evaluated eight open-source tools regarding
              their ability to call single nucleotide variants and short indels
              with allelic frequencies as low as 1\% in non-matched
              next-generation sequencing data: GATK HaplotypeCaller, Platypus,
              VarScan, LoFreq, FreeBayes, SNVer, SAMtools and VarDict. We
              analysed two real datasets from patients with myelodysplastic
              syndrome, covering 54 Illumina HiSeq samples and 111 Illumina
              NextSeq samples. Mutations were validated by re-sequencing on the
              same platform, on a different platform and expert based review.
              In addition we considered two simulated datasets with varying
              coverage and error profiles, covering 50 samples each. In all
              cases an identical target region consisting of 19 genes (42,322
              bp) was analysed. Altogether, no tool succeeded in calling all
              mutations. High sensitivity was always accompanied by low
              precision. Influence of varying coverages- and background noise
              on variant calling was generally low. Taking everything into
              account, VarDict performed best. However, our results indicate
              that there is a need to improve reproducibility of the results in
              the context of multithreading.",
  journal  = "Sci. Rep.",
  volume   =  7,
  pages    = "43169",
  month    =  feb,
  year     =  2017,
  keywords = "RNA variant calling;Variant calling paper",
  language = "en"
}

@ARTICLE{Quinn2013-oh,
  title    = "Development of strategies for {SNP} detection in {RNA-seq} data:
              application to lymphoblastoid cell lines and evaluation using
              1000 Genomes data",
  author   = "Quinn, Emma M and Cormican, Paul and Kenny, Elaine M and Hill,
              Matthew and Anney, Richard and Gill, Michael and Corvin, Aiden P
              and Morris, Derek W",
  abstract = "Next-generation RNA sequencing (RNA-seq) maps and analyzes
              transcriptomes and generates data on sequence variation in
              expressed genes. There are few reported studies on analysis
              strategies to maximize the yield of quality RNA-seq SNP data. We
              evaluated the performance of different SNP-calling methods
              following alignment to both genome and transcriptome by applying
              them to RNA-seq data from a HapMap lymphoblastoid cell line
              sample and comparing results with sequence variation data from
              1000 Genomes. We determined that the best method to achieve high
              specificity and sensitivity, and greatest number of SNP calls, is
              to remove duplicate sequence reads after alignment to the genome
              and to call SNPs using SAMtools. The accuracy of SNP calls is
              dependent on sequence coverage available. In terms of
              specificity, 89\% of RNA-seq SNPs calls were true variants where
              coverage is >10X. In terms of sensitivity, at >10X coverage 92\%
              of all expected SNPs in expressed exons could be detected.
              Overall, the results indicate that RNA-seq SNP data are a very
              useful by-product of sequence-based transcriptome analysis. If
              RNA-seq is applied to disease tissue samples and assuming that
              genes carrying mutations relevant to disease biology are being
              expressed, a very high proportion of these mutations can be
              detected.",
  journal  = "PLoS One",
  volume   =  8,
  number   =  3,
  pages    = "e58815",
  month    =  mar,
  year     =  2013,
  keywords = "Finding optimal coverage;RNA variant calling;Variant calling
              paper",
  language = "en"
}

@ARTICLE{McKenna2010-mc,
  title    = "The Genome Analysis Toolkit: a {MapReduce} framework for
              analyzing next-generation {DNA} sequencing data",
  author   = "McKenna, Aaron and Hanna, Matthew and Banks, Eric and Sivachenko,
              Andrey and Cibulskis, Kristian and Kernytsky, Andrew and
              Garimella, Kiran and Altshuler, David and Gabriel, Stacey and
              Daly, Mark and DePristo, Mark A",
  abstract = "Next-generation DNA sequencing (NGS) projects, such as the 1000
              Genomes Project, are already revolutionizing our understanding of
              genetic variation among individuals. However, the massive data
              sets generated by NGS---the 1000 Genome pilot alone includes
              nearly five terabases---make writing feature-rich, efficient, and
              robust analysis tools difficult for even computationally
              sophisticated individuals. Indeed, many professionals are limited
              in the scope and the ease with which they can answer scientific
              questions by the complexity of accessing and manipulating the
              data produced by these machines. Here, we discuss our Genome
              Analysis Toolkit (GATK), a structured programming framework
              designed to ease the development of efficient and robust analysis
              tools for next-generation DNA sequencers using the functional
              programming philosophy of MapReduce. The GATK provides a small
              but rich set of data access patterns that encompass the majority
              of analysis tool needs. Separating specific analysis calculations
              from common data management infrastructure enables us to optimize
              the GATK framework for correctness, stability, and CPU and memory
              efficiency and to enable distributed and shared memory
              parallelization. We highlight the capabilities of the GATK by
              describing the implementation and application of robust,
              scale-tolerant tools like coverage calculators and single
              nucleotide polymorphism (SNP) calling. We conclude that the GATK
              programming framework enables developers and analysts to quickly
              and easily write efficient and robust NGS tools, many of which
              have already been incorporated into large-scale sequencing
              projects like the 1000 Genomes Project and The Cancer Genome
              Atlas.",
  journal  = "Genome Res.",
  volume   =  20,
  number   =  9,
  pages    = "1297--1303",
  month    =  sep,
  year     =  2010,
  keywords = "Variant calling paper",
  language = "en"
}

@ARTICLE{Castle2014-vl,
  title    = "Mutated tumor alleles are expressed according to their {DNA}
              frequency",
  author   = "Castle, John C and Loewer, Martin and Boegel, Sebastian and
              Tadmor, Arbel D and Boisguerin, Valesca and de Graaf, Jos and
              Paret, Claudia and Diken, Mustafa and Kreiter, Sebastian and
              T{\"u}reci, {\"O}zlem and Sahin, Ugur",
  abstract = "The transcription of tumor mutations from DNA into RNA has
              implications for biology, epigenetics and clinical practice. It
              is not clear if mutations are in general transcribed and, if so,
              at what proportion to the wild-type allele. Here, we examined the
              correlation between DNA mutation allele frequency and RNA
              mutation allele frequency. We sequenced the exome and
              transcriptome of tumor cell lines with large copy number
              variations, identified heterozygous single nucleotide mutations
              and absolute DNA copy number, and determined the corresponding
              DNA and RNA mutation allele fraction. We found that 99\% of the
              DNA mutations in expressed genes are expressed as RNA. Moreover,
              we found a high correlation between the DNA and RNA mutation
              allele frequency. Exceptions are mutations that cause premature
              termination codons and therefore activate nonsense-mediated
              decay. Beyond this, we did not find evidence of any wide-scale
              mechanism, such as allele-specific epigenetic silencing,
              preferentially promoting mutated or wild-type alleles. In
              conclusion, our data strongly suggest that genes are equally
              transcribed from all alleles, mutated and wild-type, and thus
              transcribed in proportion to their DNA allele frequency.",
  journal  = "Sci. Rep.",
  volume   =  4,
  pages    = "4743",
  month    =  apr,
  year     =  2014,
  keywords = "Finding optimal coverage;RNA variant calling;Variant calling
              paper",
  language = "en"
}

% The abstract of the entry below uses $\lesssim$ (needs the amssymb package)
% in place of the original non-ASCII character U+2272.
@ARTICLE{Schurch2016-mw,
  title    = "How many biological replicates are needed in an {RNA-seq}
              experiment and which differential expression tool should you use?",
  author   = "Schurch, Nicholas J and Schofield, Piet{\'a} and Gierli{\'n}ski,
              Marek and Cole, Christian and Sherstnev, Alexander and Singh,
              Vijender and Wrobel, Nicola and Gharbi, Karim and Simpson, Gordon
              G and Owen-Hughes, Tom and Blaxter, Mark and Barton, Geoffrey J",
  abstract = "RNA-seq is now the technology of choice for genome-wide
              differential gene expression experiments, but it is not clear how
              many biological replicates are needed to ensure valid biological
              interpretation of the results or which statistical tools are best
              for analyzing the data. An RNA-seq experiment with 48 biological
              replicates in each of two conditions was performed to answer
              these questions and provide guidelines for experimental design.
              With three biological replicates, nine of the 11 tools evaluated
              found only 20\%-40\% of the significantly differentially
              expressed (SDE) genes identified with the full set of 42 clean
              replicates. This rises to >85\% for the subset of SDE genes
              changing in expression by more than fourfold. To achieve >85\%
              for all SDE genes regardless of fold change requires more than 20
              biological replicates. The same nine tools successfully control
              their false discovery rate at $\lesssim$5\% for all numbers of replicates,
              while the remaining two tools fail to control their FDR
              adequately, particularly for low numbers of replicates. For
              future RNA-seq experiments, these results suggest that at least
              six biological replicates should be used, rising to at least 12
              when it is important to identify SDE genes for all fold changes.
              If fewer than 12 replicates are used, a superior combination of
              true positive and false positive performances makes edgeR and
              DESeq2 the leading tools. For higher replicate numbers,
              minimizing false positives is more important and DESeq marginally
              outperforms the other tools.",
  journal  = "RNA",
  volume   =  22,
  number   =  6,
  pages    = "839--851",
  month    =  jun,
  year     =  2016,
  keywords = "RNA-seq; benchmarking; differential expression; experimental
              design; replication; statistical power; yeast;To read;Finding
              optimal coverage;Variant calling paper",
  language = "en"
}

@ARTICLE{Guo2014-pj,
  title    = "{RNAseqPS}: A Web Tool for Estimating Sample Size and Power for
              {RNAseq} Experiment",
  author   = "Guo, Yan and Zhao, Shilin and Li, Chung-I and Sheng, Quanhu and
              Shyr, Yu",
  abstract = "Sample size and power determination is the first step in the
              experimental design of a successful study. Sample size and power
              calculation is required for applications for National Institutes
              of Health (NIH) funding. Sample size and power calculation is
              well established for traditional biological studies such as mouse
              model, genome wide association study (GWAS), and microarray
              studies. Recent developments in high-throughput sequencing
              technology have allowed RNAseq to replace microarray as the
              technology of choice for high-throughput gene expression
              profiling. However, the sample size and power analysis of RNAseq
              technology is an underdeveloped area. Here, we present RNAseqPS,
              an advanced online RNAseq power and sample size calculation tool
              based on the Poisson and negative binomial distributions.
              RNAseqPS was built using the Shiny package in R. It provides an
              interactive graphical user interface that allows the users to
              easily conduct sample size and power analysis for RNAseq
              experimental design. RNAseqPS can be accessed directly at
              http://cqs.mc.vanderbilt.edu/shiny/RNAseqPS/.",
  journal  = "Cancer Inform.",
  volume   =  13,
  number   = "Suppl 6",
  pages    = "1--5",
  month    =  oct,
  year     =  2014,
  keywords = "RNAseq; experiment design; power analysis; sample size
              calculation;Finding optimal coverage;To read;RNA-Seq
              power;Variant calling paper",
  language = "en"
}

@ARTICLE{Ching2014-ai,
  title    = "Power analysis and sample size estimation for {RNA-Seq}
              differential expression",
  author   = "Ching, Travers and Huang, Sijia and Garmire, Lana X",
  abstract = "It is crucial for researchers to optimize RNA-seq experimental
              designs for differential expression detection. Currently, the
              field lacks general methods to estimate power and sample size for
              RNA-Seq in complex experimental designs, under the assumption of
              the negative binomial distribution. We simulate RNA-Seq count
              data based on parameters estimated from six widely different
              public data sets (including cell line comparison, tissue
              comparison, and cancer data sets) and calculate the statistical
              power in paired and unpaired sample experiments. We
              comprehensively compare five differential expression analysis
              packages (DESeq, edgeR, DESeq2, sSeq, and EBSeq) and evaluate
              their performance by power, receiver operator characteristic
              (ROC) curves, and other metrics including areas under the curve
              (AUC), Matthews correlation coefficient (MCC), and F-measures.
              DESeq2 and edgeR tend to give the best performance in general.
              Increasing sample size or sequencing depth increases power;
              however, increasing sample size is more potent than sequencing
              depth to increase power, especially when the sequencing depth
              reaches 20 million reads. Long intergenic noncoding RNAs
              (lincRNA) yields lower power relative to the protein coding
              mRNAs, given their lower expression level in the same RNA-Seq
              experiment. On the other hand, paired-sample RNA-Seq
              significantly enhances the statistical power, confirming the
              importance of considering the multifactor experimental design.
              Finally, a local optimal power is achievable for a given budget
              constraint, and the dominant contributing factor is sample size
              rather than the sequencing depth. In conclusion, we provide a
              power analysis tool
              (http://www2.hawaii.edu/\textasciitilde{}lgarmire/RNASeqPowerCalculator.htm) that
              captures the dispersion in the data and can serve as a practical
              reference under the budget constraint of RNA-Seq experiments.",
  journal  = "RNA",
  volume   =  20,
  number   =  11,
  pages    = "1684--1696",
  month    =  nov,
  year     =  2014,
  keywords = "RNA-Seq; bioinformatics; power analysis; sample size;
              simulation;Finding optimal coverage;Variant calling
              paper;sequencing depth, gene expression, AML",
  language = "en"
}

% featureCounts read summarization program (Liao, Smyth and Shi, 2014).
% The tool name is brace-protected in the title so that sentence-casing
% bibliography styles keep its camelCase spelling.
@ARTICLE{Liao2014-lj,
  title    = "{featureCounts}: an efficient general purpose program for
              assigning sequence reads to genomic features",
  author   = "Liao, Yang and Smyth, Gordon K and Shi, Wei",
  abstract = "MOTIVATION: Next-generation sequencing technologies generate
              millions of short sequence reads, which are usually aligned to a
              reference genome. In many applications, the key information
              required for downstream analysis is the number of reads mapping
              to each genomic feature, for example to each exon or each gene.
              The process of counting reads is called read summarization. Read
              summarization is required for a great variety of genomic analyses
              but has so far received relatively little attention in the
              literature. RESULTS: We present featureCounts, a read
              summarization program suitable for counting reads generated from
              either RNA or genomic DNA sequencing experiments. featureCounts
              implements highly efficient chromosome hashing and feature
              blocking techniques. It is considerably faster than existing
              methods (by an order of magnitude for gene-level summarization)
              and requires far less computer memory. It works with either
              single or paired-end reads and provides a wide range of options
              appropriate for different sequencing applications. AVAILABILITY
              AND IMPLEMENTATION: featureCounts is available under GNU General
              Public License as part of the Subread
              (http://subread.sourceforge.net) or Rsubread
              (http://www.bioconductor.org) software packages.",
  journal  = "Bioinformatics",
  volume   =  30,
  number   =  7,
  pages    = "923--930",
  month    =  apr,
  year     =  2014,
  keywords = "Variant calling paper",
  language = "en"
}

% SNP-tolerant detection of complex variants and splicing in short reads
% (Wu and Nacu, 2010).
@article{Wu2010-rt,
  author    = {Wu, Thomas D and Nacu, Serban},
  title     = {Fast and {SNP-tolerant} detection of complex variants and
               splicing in short reads},
  journal   = {Bioinformatics},
  publisher = {Oxford University Press},
  volume    = {26},
  number    = {7},
  pages     = {873--881},
  month     = apr,
  year      = {2010},
  keywords  = {Variant calling paper},
}

% MuTect: calling somatic point mutations in impure and heterogeneous cancer
% samples (Cibulskis et al., 2013).
@article{Cibulskis2013-vv,
  author    = {Cibulskis, Kristian and Lawrence, Michael S and Carter, Scott L
               and Sivachenko, Andrey and Jaffe, David and Sougnez, Carrie and
               Gabriel, Stacey and Meyerson, Matthew and Lander, Eric S and
               Getz, Gad},
  title     = {Sensitive detection of somatic point mutations in impure and
               heterogeneous cancer samples},
  abstract  = {The MuTect algorithm for calling somatic point mutations enables
               subclonal analysis of the whole-genome or whole-exome sequencing
               data being generated in large-scale cancer genomics projects.},
  journal   = {Nat. Biotechnol.},
  publisher = {Nature Research},
  volume    = {31},
  number    = {3},
  pages     = {213--219},
  month     = feb,
  year      = {2013},
  keywords  = {Variant calling paper;RNA variant calling},
  language  = {en},
}

% VarScan 2: somatic mutation and copy number alteration detection in exome
% data from tumor-normal pairs (Koboldt et al., 2012).
@article{Koboldt2012-wx,
  author   = {Koboldt, Daniel C and Zhang, Qunyuan and Larson, David E and
              Shen, Dong and McLellan, Michael D and Lin, Ling and Miller,
              Christopher A and Mardis, Elaine R and Ding, Li and Wilson,
              Richard K},
  title    = {{VarScan} 2: somatic mutation and copy number alteration
              discovery in cancer by exome sequencing},
  abstract = {Cancer is a disease driven by genetic variation and mutation.
              Exome sequencing can be utilized for discovering these variants
              and mutations across hundreds of tumors. Here we present an
              analysis tool, VarScan 2, for the detection of somatic mutations
              and copy number alterations (CNAs) in exome data from
              tumor-normal pairs. Unlike most current approaches, our algorithm
              reads data from both samples simultaneously; a heuristic and
              statistical algorithm detects sequence variants and classifies
              them by somatic status (germline, somatic, or LOH); while a
              comparison of normalized read depth delineates relative copy
              number changes. We apply these methods to the analysis of exome
              sequence data from 151 high-grade ovarian tumors characterized as
              part of the Cancer Genome Atlas (TCGA). We validated some 7790
              somatic coding mutations, achieving 93\% sensitivity and 85\%
              precision for single nucleotide variant (SNV) detection.
              Exome-based CNA analysis identified 29 large-scale alterations
              and 619 focal events per tumor on average. As in our previous
              analysis of these data, we observed frequent amplification of
              oncogenes (e.g., CCNE1, MYC) and deletion of tumor suppressors
              (NF1, PTEN, and CDKN2A). We searched for additional recurrent
              focal CNAs using the correlation matrix diagonal segmentation
              (CMDS) algorithm, which identified 424 significant events
              affecting 582 genes. Taken together, our results demonstrate the
              robust performance of VarScan 2 for somatic mutation and CNA
              detection and shed new light on the landscape of genetic
              alterations in ovarian cancer.},
  journal  = {Genome Res.},
  volume   = {22},
  number   = {3},
  pages    = {568--576},
  month    = mar,
  year     = {2012},
  keywords = {Variant calling paper},
  language = {en},
}

% Evaluation of tools and strategies for indel detection from RNA-seq data
% (Sun et al., 2016).
@article{Sun2016-te,
  author   = {Sun, Zhifu and Bhagwate, Aditya and Prodduturi, Naresh and Yang,
              Ping and Kocher, Jean-Pierre A},
  title    = {Indel detection from {RNA-seq} data: tool evaluation and
              strategies for accurate detection of actionable mutations},
  abstract = {Driver somatic mutations are a hallmark of a tumor that can be
              used for diagnosis and targeted therapy. Mutations are primarily
              detected from tumor DNA. As dynamic molecules of gene activities,
              transcriptome profiling by RNA sequence (RNA-seq) is becoming
              increasingly popular, which not only measures gene expression but
              also structural variations such as mutations and fusion
              transcripts. Although single-nucleotide variants (SNVs) can be
              easily identified from RNA-seq, intermediate long
              insertions/deletions (indels > 2 bases and less than sequence
              reads) cause significant challenges and are ignored by most
              RNA-seq analysis tools. This study evaluates commonly used
              RNA-seq analysis programs along with variant and somatic mutation
              callers in a series of data sets with simulated and known indels.
              The aim is to develop strategies for accurate indel detection.
              Our results show that the RNA-seq alignment is the most important
              step for indel identification and the evaluated programs have a
              wide range of sensitivity to map sequence reads with indels, from
              not at all to decently sensitive. The sensitivity is impacted by
              sequence read lengths. Most variant calling programs rely on hard
              evidence indels marked in the alignment and the programs with
              realignment may use soft-clipped reads for indel inferencing.
              Based on the observations, we have provided practical
              recommendations for indel detection when different RNA-seq
              aligners are used and demonstrated the best option with highly
              reliable results. With careful customization of bioinformatics
              algorithms, RNA-seq can be reliably used for both SNV and indel
              mutation detection that can be used for clinical decision-making.},
  journal  = {Brief. Bioinform.},
  month    = jul,
  year     = {2016},
  keywords = {EGFR; RNA sequencing; alignment; indels; mutation; variant
              calling;RNA variant calling;Variant calling paper},
  language = {en},
}

% Comparison of whole genome and exome sequencing for SNP detection
% sensitivity and biases (Meynert et al., 2014).
@article{Meynert2014-pa,
  author   = {Meynert, Alison M and Ansari, Morad and FitzPatrick, David R and
              Taylor, Martin S},
  title    = {Variant detection sensitivity and biases in whole genome and
              exome sequencing},
  abstract = {BACKGROUND: Less than two percent of the human genome is protein
              coding, yet that small fraction harbours the majority of known
              disease causing mutations. Despite rapidly falling whole genome
              sequencing (WGS) costs, much research and increasingly the
              clinical use of sequence data is likely to remain focused on the
              protein coding exome. We set out to quantify and understand how
              WGS compares with the targeted capture and sequencing of the
              exome (exome-seq), for the specific purpose of identifying single
              nucleotide polymorphisms (SNPs) in exome targeted regions.
              RESULTS: We have compared polymorphism detection sensitivity and
              systematic biases using a set of tissue samples that have been
              subject to both deep exome and whole genome sequencing. The
              scoring of detection sensitivity was based on sequence down
              sampling and reference to a set of gold-standard SNP calls for
              each sample. Despite evidence of incremental improvements in
              exome capture technology over time, whole genome sequencing has
              greater uniformity of sequence read coverage and reduced biases
              in the detection of non-reference alleles than exome-seq.
              Exome-seq achieves 95\% SNP detection sensitivity at a mean
              on-target depth of 40 reads, whereas WGS only requires a mean of
              14 reads. Known disease causing mutations are not biased towards
              easy or hard to sequence areas of the genome for either exome-seq
              or WGS. CONCLUSIONS: From an economic perspective, WGS is at
              parity with exome-seq for variant detection in the targeted
              coding regions. WGS offers benefits in uniformity of read
              coverage and more balanced allele ratio calls, both of which can
              in most cases be offset by deeper exome-seq, with the caveat that
              some exome-seq targets will never achieve sufficient mapped read
              depth for variant detection due to technical difficulties or
              probe failures. As WGS is intrinsically richer data that can
              provide insight into polymorphisms outside coding regions and
              reveal genomic rearrangements, it is likely to progressively
              replace exome-seq for many applications.},
  journal  = {BMC Bioinformatics},
  volume   = {15},
  pages    = {247},
  month    = jul,
  year     = {2014},
  keywords = {Finding optimal coverage;RNA variant calling;Variant calling
              paper},
  language = {en},
}

% Empirical model relating read depth to SNV detection sensitivity in exome
% sequencing (Meynert et al., 2013).
@article{Meynert2013-by,
  author   = {Meynert, Alison M and Bicknell, Louise S and Hurles, Matthew E
              and Jackson, Andrew P and Taylor, Martin S},
  title    = {Quantifying single nucleotide variant detection sensitivity in
              exome sequencing},
  abstract = {BACKGROUND: The targeted capture and sequencing of genomic
              regions has rapidly demonstrated its utility in genetic studies.
              Inherent in this technology is considerable heterogeneity of
              target coverage and this is expected to systematically impact our
              sensitivity to detect genuine polymorphisms. To fully interpret
              the polymorphisms identified in a genetic study it is often
              essential to both detect polymorphisms and to understand where
              and with what probability real polymorphisms may have been
              missed. RESULTS: Using down-sampling of 30 deeply sequenced
              exomes and a set of gold-standard single nucleotide variant (SNV)
              genotype calls for each sample, we developed an empirical model
              relating the read depth at a polymorphic site to the probability
              of calling the correct genotype at that site. We find that
              measured sensitivity in SNV detection is substantially worse than
              that predicted from the naive expectation of sampling from a
              binomial. This calibrated model allows us to produce single
              nucleotide resolution SNV sensitivity estimates which can be
              merged to give summary sensitivity measures for any arbitrary
              partition of the target sequences (nucleotide, exon, gene,
              pathway, exome). These metrics are directly comparable between
              platforms and can be combined between samples to give ``power
              estimates'' for an entire study. We estimate a local read depth
              of 13X is required to detect the alleles and genotype of a
              heterozygous SNV 95\% of the time, but only 3X for a homozygous
              SNV. At a mean on-target read depth of 20X, commonly used for
              rare disease exome sequencing studies, we predict 5-15\% of
              heterozygous and 1-4\% of homozygous SNVs in the targeted regions
              will be missed. CONCLUSIONS: Non-reference alleles in the
              heterozygote state have a high chance of being missed when
              commonly applied read coverage thresholds are used despite the
              widely held assumption that there is good polymorphism detection
              at these coverage levels. Such alleles are likely to be of
              functional importance in population based studies of rare
              diseases, somatic mutations in cancer and explaining the
              ``missing heritability'' of quantitative traits.},
  journal  = {BMC Bioinformatics},
  volume   = {14},
  pages    = {195},
  month    = jun,
  year     = {2013},
  keywords = {Finding optimal coverage;RNA variant calling;Variant calling
              paper},
  language = {en},
}

% Review of sequencing depth and coverage considerations in the design of
% next-generation sequencing experiments (Sims et al., 2014).
@article{Sims2014-pn,
  author   = {Sims, David and Sudbery, Ian and Ilott, Nicholas E and Heger,
              Andreas and Ponting, Chris P},
  title    = {Sequencing depth and coverage: key considerations in genomic
              analyses},
  abstract = {Sequencing technologies have placed a wide range of genomic
              analyses within the capabilities of many laboratories. However,
              sequencing costs often set limits to the amount of sequences that
              can be generated and, consequently, the biological outcomes that
              can be achieved from an experimental design. In this Review, we
              discuss the issue of sequencing depth in the design of
              next-generation sequencing experiments. We review current
              guidelines and precedents on the issue of coverage, as well as
              their underlying considerations, for four major study designs,
              which include de novo genome sequencing, genome resequencing,
              transcriptome sequencing and genomic location analyses (for
              example, chromatin immunoprecipitation followed by sequencing
              (ChIP-seq) and chromosome conformation capture (3C)).},
  journal  = {Nat. Rev. Genet.},
  volume   = {15},
  number   = {2},
  pages    = {121--132},
  month    = feb,
  year     = {2014},
  keywords = {Finding optimal coverage;Variant calling paper;sequencing depth,
              gene expression, AML},
}

% SNPiR: identification of SNPs from RNA-seq data (Piskol et al., 2013).
@article{Piskol2013-gb,
  author   = {Piskol, Robert and Ramaswami, Gokul and Li, Jin Billy},
  title    = {Reliable identification of genomic variants from {RNA-seq} data},
  abstract = {Identifying genomic variation is a crucial step for unraveling
              the relationship between genotype and phenotype and can yield
              important insights into human diseases. Prevailing methods rely
              on cost-intensive whole-genome sequencing (WGS) or whole-exome
              sequencing (WES) approaches while the identification of genomic
              variants from often existing RNA sequencing (RNA-seq) data
              remains a challenge because of the intrinsic complexity in the
              transcriptome. Here, we present a highly accurate approach termed
              SNPiR to identify SNPs in RNA-seq data. We applied SNPiR to
              RNA-seq data of samples for which WGS and WES data are also
              available and achieved high specificity and sensitivity. Of the
              SNPs called from the RNA-seq data, >98\% were also identified by
              WGS or WES. Over 70\% of all expressed coding variants were
              identified from RNA-seq, and comparable numbers of exonic
              variants were identified in RNA-seq and WES. Despite our method's
              limitation in detecting variants in expressed regions only, our
              results demonstrate that SNPiR outperforms current
              state-of-the-art approaches for variant detection from RNA-seq
              data and offers a cost-effective and reliable alternative for SNP
              discovery.},
  journal  = {Am. J. Hum. Genet.},
  volume   = {93},
  number   = {4},
  pages    = {641--651},
  month    = oct,
  year     = {2013},
  keywords = {RNA variant calling;Variant calling paper},
}

% Review of the major steps in RNA-seq data analysis (Conesa et al., 2016).
@article{Conesa2016-rs,
  author   = {Conesa, Ana and Madrigal, Pedro and Tarazona, Sonia and
              Gomez-Cabrero, David and Cervera, Alejandra and McPherson, Andrew
              and Szcze{\'s}niak, Micha{\l} Wojciech and Gaffney, Daniel J and
              Elo, Laura L and Zhang, Xuegong and Mortazavi, Ali},
  title    = {A survey of best practices for {RNA-seq} data analysis},
  abstract = {RNA-sequencing (RNA-seq) has a wide variety of applications, but
              no single analysis pipeline can be used in all cases. We review
              all of the major steps in RNA-seq data analysis, including
              experimental design, quality control, read alignment,
              quantification of gene and transcript levels, visualization,
              differential gene expression, alternative splicing, functional
              analysis, gene fusion detection and eQTL mapping. We highlight
              the challenges associated with each step. We discuss the analysis
              of small RNAs and the integration of RNA-seq with other
              functional genomics techniques. Finally, we discuss the outlook
              for novel technologies that are changing the state of the art in
              transcriptomics.},
  journal  = {Genome Biol.},
  volume   = {17},
  number   = {1},
  pages    = {13},
  month    = jan,
  year     = {2016},
  keywords = {Finding optimal coverage;Variant calling paper},
}

% RNA-sequencing analysis of core binding factor AML (Lavallee et al., 2016).
@article{Lavallee2016-sf,
  author   = {Lavall{\'e}e, Vincent-Philippe and Lemieux, S{\'e}bastien and
              Boucher, Genevi{\`e}ve and Gendron, Patrick and Boivin, Isabel
              and Armstrong, Richard Neil and Sauvageau, Guy and H{\'e}bert,
              Jos{\'e}e},
  title    = {{RNA-sequencing} analysis of core binding factor {AML} identifies
              recurrent {ZBTB7A} mutations and defines {RUNX1-CBFA2T3} fusion
              signature},
  journal  = {Blood},
  month    = mar,
  year     = {2016},
  keywords = {Leukemia;Variant calling paper;sequencing depth, gene expression,
              AML},
}


% SuperFreq preprint (bioRxiv, 2018). The unpublished entry type requires a
% note field, and standard styles do not print journal or pages for this type,
% so the note carries the venue and preprint identifier. The journal and pages
% fields are kept for tools that read them.
@UNPUBLISHED{Flensburg2018-ah,
  title    = "{SuperFreq}: Integrated mutation detection and clonal tracking in
              cancer",
  author   = "Flensburg, Christoffer and Sargeant, Tobias and Oshlack, Alicia
              and Majewski, Ian",
  abstract = "Motivation: Analysing multiple tumour samples from an individual
              cancer patient allows insight into the way the disease evolves.
              Monitoring the expansion and contraction of distinct clones helps
              to reveal the mutations that initiate the disease and those that
              drive progression; therefore, the ability to identify and track
              clones using genomics data is of great interest. Existing
              approaches for clonal tracking typically require the user to
              combine multiple tools that are not purpose-made. Furthermore,
              most methods require a matched normal (non-tumour) sample, which
              limits the scope of application. Results: We have built
              superFreq, a cancer exome sequencing analysis tool that calls and
              annotates somatic SNVs and CNAs and attributes them to clones.
              SuperFreq makes use of unrelated control samples and does not
              require matched normal samples. We demonstrate the ability of
              superFreq to track clones by combining real samples in known
              proportions to simulating a multi-sample analysis. In addition,
              we compared superFreq to other somatic SNV callers and CNA
              callers on exome sequencing data from cancer-normal pairs,
              including 304 participants gathered from 33 cancer types in The
              Cancer Genome Atlas (TCGA). SuperFreq offers a reliable platform
              to identify somatic mutations and to track clones. SuperFreq
              recalled 91\% of somatic SNVs identified by a consensus of four
              other methods, with a median of 1 additional somatic SNV per
              sample that was not found by any other method. CNA calls from
              superFreq showed good agreement with those generated by Sequenza,
              or those from ASCAT generated using matched SNP arrays. Using our
              simulated data set for testing multi-sample clonal tracking, we
              found that superFreq identified 93\% of clones with a cellular
              fraction of at least 50\%, and mutations were assigned to clones
              with high recall and close to 100\% precision. In addition,
              SuperFreq maintained a similar level of performance for most
              aspects of the analysis without a matched normal control.
              SuperFreq is a highly adaptable method and has already been used
              in multiple different projects. Availability: SuperFreq is
              implemented in R and available on github at
              https://github.com/ChristofferFlensburg/superFreq.",
  journal  = "bioRxiv",
  pages    = "380097",
  note     = "bioRxiv preprint 380097",
  month    =  jul,
  year     =  2018,
  language = "en"
}