diff --git a/src/main/scala/com/fulcrumgenomics/bam/Pileup.scala b/src/main/scala/com/fulcrumgenomics/bam/Pileup.scala deleted file mode 100644 index 9aa21d0ea..000000000 --- a/src/main/scala/com/fulcrumgenomics/bam/Pileup.scala +++ /dev/null @@ -1,217 +0,0 @@ -/* - * The MIT License - * - * Copyright (c) 2017 Fulcrum Genomics LLC - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -package com.fulcrumgenomics.bam - -import com.fulcrumgenomics.FgBioDef._ -import com.fulcrumgenomics.bam.api.SamRecord -import com.fulcrumgenomics.fasta.SequenceDictionary -import htsjdk.samtools.CigarOperator -import htsjdk.samtools.util.SequenceUtil - -import scala.collection.mutable -import scala.collection.mutable.ListBuffer - -/** - * Represents a pileup of reads/bases at a single genomic location. - */ -case class Pileup[A <: PileupEntry](refName: String, refIndex: Int, pos: Int, pile: Seq[A]) extends Iterable[A] { - override def iterator: Iterator[A] = pile.iterator - - /** - * Returns the depth of coverage at this position. This includes reads who have a base at - * this position as well as reads with deletions that span this position, but does NOT - * include insertions. - */ - lazy val depth = pile.count { - case i: InsertionEntry => false - case _ => true - } - - /** Returns the [[depth]] of the pileup (NOT the number of records in the pileup. */ - override def size: Int = depth - - /** Returns a copy of this pileup with only a single observation from any template. */ - def withoutOverlaps: Pileup[A] = { - val builder = IndexedSeq.newBuilder[A] - val names = new mutable.HashSet[String]() - this.pile.foreach { p => if (names.add(p.rec.name)) builder += p } - copy(pile=builder.result) - } - - /** Returns a copy of this pileup object without any indels. */ - def withoutIndels: Pileup[BaseEntry] = copy(pile=this.baseIterator.toSeq) - - /** Provides an iterator over the non-indel entries in the pile. */ - def baseIterator: Iterator[BaseEntry] = iterator.collect { case x: BaseEntry => x } -} - -/** - * Base trait for pileup entries that exposes the [[com.fulcrumgenomics.bam.api.SamRecord]] and the 0-based offset into - * the record's bases and qualities that is relevant to the pileup. - */ -sealed trait PileupEntry { - /** The SamRecord that the pileup entry represents. */ - val rec: SamRecord - - /** - * The zero-based offset within the [[com.fulcrumgenomics.bam.api.SamRecord]] that is represented. - * For matches and mismatches this is the offset of the base in question. - * For deletions it is the base prior to the deleted sequence. - * For insertions it is the first inserted base in the read. - */ - val offset: Int - - require(offset >= 0 && offset < rec.length, s"Offset must be between 0 and read length. Offset=$offset, rlen=${rec.length}") -} - -/** Pileup entry representing a match or mismatch. */ -case class BaseEntry(override val rec: SamRecord, override val offset: Int) extends PileupEntry { - val base: Byte = SequenceUtil.upperCase(rec.bases(offset)) - val qual: Byte = rec.quals(offset) - - /** Returns the 1-based position within the read in the read's current orientation. */ - def positionInRead: Int = offset + 1 - - /** - * Returns the 1-based position within the read in the order the read was sequenced. - * This can be thought of as the offset from the 5' end plus one. - */ - def positionInReadInReadOrder: Int = { - if (rec.negativeStrand) rec.length - offset else positionInRead - } - - /** Returns the base as it was read on the sequencer. */ - def baseInReadOrientation: Byte = if (rec.negativeStrand) SequenceUtil.complement(base) else base - -} - -/** Pileup entry representing an insertion. */ -case class InsertionEntry(override val rec: SamRecord, override val offset: Int) extends PileupEntry - -/** Pileup entry representing a deletion. */ -case class DeletionEntry(override val rec: SamRecord, override val offset: Int) extends PileupEntry - -/** - * Class that provides methods to build and filter [[Pileup]]s. - * - * @param dict the SequenceDictionary for the reference sequence - * @param minMapQ the minimum MAPQ to allow a read to contribute to the pileup - * @param minBaseQ the minimum base quality to allow a base to contribute to the pileup - * @param mappedPairsOnly if true only allow reads from read-pairs with both reads mapped to contribute to the pileup - * @param includeDuplicates if true allow reads flagged as duplicates to contribute to the pileup - * @param includeSecondaryAlignments if true allow secondary alignments to contribute to the pileup - * @param includeSupplementalAlignments if true allow supplemental alignments to contribute to the pileup - */ -class PileupBuilder(dict: SequenceDictionary, - minMapQ: Int = 20, - minBaseQ: Int = 20, - mappedPairsOnly: Boolean = true, - includeDuplicates: Boolean = false, - includeSecondaryAlignments: Boolean = false, - includeSupplementalAlignments: Boolean = false) { - - private val readFilters = new ListBuffer[SamRecord => Boolean]() - private val baseFilters = new ListBuffer[PileupEntry => Boolean]() - - // Setup the default read level filters - if (minMapQ > 0) readFilters += { r => r.mapq >= minMapQ } - if (mappedPairsOnly) readFilters += { r => r.paired && r.mapped && r.mateMapped } - if (!includeDuplicates) readFilters += { r => !r.duplicate } - if (!includeSecondaryAlignments) readFilters += { r => !r.secondary } - if (!includeSupplementalAlignments) readFilters += { r => !r.supplementary } - - // Setup the default base level filters - if (minBaseQ > 0) baseFilters += { - case p: BaseEntry => p.qual >= minBaseQ - case _ => true - } - - /** Adds a read level filter to the set of filters. */ - def withReadFilter(f: SamRecord => Boolean): PileupBuilder = yieldAndThen(this) { this.readFilters += f } - - /** Adds a base level filter to the set of filters. */ - def withBaseFilter(f: PileupEntry => Boolean): PileupBuilder = yieldAndThen(this) { this.baseFilters += f } - - /** Checks to see if all read filters accept the read. */ - protected def accepts(rec: SamRecord): Boolean = this.readFilters.forall(f => f(rec)) - - /** Checks to see if all read filters accept the read. */ - protected def accepts(entry: PileupEntry): Boolean = this.baseFilters.forall(f => f(entry)) - - /** - * Constructs a pileup, at the single requested position, from the records returned by the iterator. - * - * @param iterator an Iterator of SamRecords that may or may not overlap the position - * @param refName the name of the reference sequence/contig/chrom on which the position resides - * @param pos the 1-based position on the reference sequence at which to construct the pileup - */ - def build(iterator: Iterator[SamRecord], refName: String, pos: Int): Pileup[PileupEntry] = { - val refIndex = dict(refName).index - require(refIndex >= 0, s"Unknown reference sequence: ${refName}") - - val builder = IndexedSeq.newBuilder[PileupEntry] - iterator.bufferBetter - .dropWhile(r => r.refIndex < refIndex) - .takeWhile(r => r.refIndex == refIndex && r.start <= pos + 1) - .filterNot(r => r.unmapped) - .filter(r => r.start <= pos || startsWithInsertion(r)) - .filter(r => r.end >= pos) - .filter(r => this.readFilters.forall(f => f(r))) - .foreach { rec => - if (rec.start == pos + 1) { - // Must be an insertion at the start of the read - val entry = new InsertionEntry(rec, 0) - if (accepts(entry)) builder += entry - } - else { - rec.readPosAtRefPos(pos, returnLastBaseIfDeleted=false) match { - case 0 => - // Must be deleted in the read - val delPos = rec.readPosAtRefPos(pos, returnLastBaseIfDeleted=true) - val entry = DeletionEntry(rec, delPos-1) - if (accepts(entry)) builder += entry - case p => - val entry = BaseEntry(rec, p-1) - if (accepts(entry)) builder += entry - - // Also check to see if the subsequent base represents an insertion - if (p < rec.length - 1 && rec.refPosAtReadPos(p+1) == 0) { - val entry = InsertionEntry(rec, p) - if (accepts(entry)) builder += entry - } - } - } - } - - Pileup(refName=refName, refIndex=refIndex, pos=pos, pile=builder.result) - } - - /** Returns true if the read is mapped and the first non-clipping operator is an insertion. */ - private def startsWithInsertion(r: SamRecord) = { - r.mapped && - r.cigar.iterator.bufferBetter.dropWhile(_.operator.isClipping).headOption.exists(_.operator == CigarOperator.INSERTION) - } -} - diff --git a/src/main/scala/com/fulcrumgenomics/bam/pileup/Pileup.scala b/src/main/scala/com/fulcrumgenomics/bam/pileup/Pileup.scala new file mode 100644 index 000000000..7425ff038 --- /dev/null +++ b/src/main/scala/com/fulcrumgenomics/bam/pileup/Pileup.scala @@ -0,0 +1,110 @@ +/* + * The MIT License + * + * Copyright (c) 2017 Fulcrum Genomics LLC + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package com.fulcrumgenomics.bam.pileup + +import com.fulcrumgenomics.bam.api.SamRecord +import htsjdk.samtools.util.SequenceUtil + +import scala.collection.mutable + +/** Represents a pileup of SAM records and their alignments at a single genomic location. */ +case class Pileup[A <: PileupEntry](refName: String, refIndex: Int, pos: Int, pile: Seq[A]) extends Iterable[A] { + + /** Returns the depth of coverage at this position. This includes reads who have a base at this position as well as + * reads with deletions that span this position, but does NOT include insertions. + */ + lazy val depth: Int = pile.count { + case _: InsertionEntry => false + case _ => true + } + + /** An iterator over the elements in this pile. */ + override def iterator: Iterator[A] = pile.iterator + + /** Returns the [[depth]] of the pileup (NOT the number of records in the pileup). */ + override def size: Int = depth + + /** Returns a copy of this pileup with only a single observation from any template. */ + def withoutOverlaps: Pileup[A] = { + val builder = IndexedSeq.newBuilder[A] + val names = mutable.HashSet.empty[String] + names.sizeHint(pile.length) + pile.foreach { p => if (names.add(p.rec.name)) builder += p } + copy(pile = builder.result()) + } + + /** Returns a copy of this pileup object without any indels and only base entries. */ + def withoutIndels: Pileup[BaseEntry] = copy(pile = this.baseIterator.toSeq) + + /** Provides an iterator over the non-indel entries in the pileup. */ + def baseIterator: Iterator[BaseEntry] = this.iterator.collect { case x: BaseEntry => x } +} + +/** Base trait for pileup entries that exposes the [[SamRecord]] and the 0-based offset into the record's bases and + * qualities that is relevant to the pileup. + */ +sealed trait PileupEntry { + + /** The SamRecord that the pileup entry represents. */ + val rec: SamRecord + + /** The zero-based offset within the [[SamRecord]] that is represented. + * + * - For matches and mismatches this is the offset of the base in question. + * - For deletions it is the base prior to the deleted sequence. + * - For insertions it is the first inserted base in the read. + */ + val offset: Int + + require( + offset >= 0 && offset < rec.length, + s"Offset must be between 0 and read length. Offset: $offset, Read Length: ${rec.length}" + ) +} + +/** Pileup entry representing a match or mismatch. */ +case class BaseEntry(override val rec: SamRecord, override val offset: Int) extends PileupEntry { + val base: Byte = SequenceUtil.upperCase(rec.bases(offset)) + val qual: Byte = rec.quals(offset) + + /** Returns the 1-based position within the read in the read's current orientation. */ + def positionInRead: Int = offset + 1 + + /** Returns the 1-based position within the read in the order the read was sequenced. This can be thought of as the + * offset from the 5-prime end plus one. + */ + def positionInReadInReadOrder: Int = { + if (rec.negativeStrand) rec.length - offset else positionInRead + } + + /** Returns the base as it was read on the sequencer. */ + def baseInReadOrientation: Byte = if (rec.negativeStrand) SequenceUtil.complement(base) else base +} + +/** Pileup entry representing an insertion. */ +case class InsertionEntry(override val rec: SamRecord, override val offset: Int) extends PileupEntry + +/** Pileup entry representing a deletion. */ +case class DeletionEntry(override val rec: SamRecord, override val offset: Int) extends PileupEntry diff --git a/src/main/scala/com/fulcrumgenomics/bam/pileup/PileupBuilder.scala b/src/main/scala/com/fulcrumgenomics/bam/pileup/PileupBuilder.scala new file mode 100644 index 000000000..89963cf96 --- /dev/null +++ b/src/main/scala/com/fulcrumgenomics/bam/pileup/PileupBuilder.scala @@ -0,0 +1,226 @@ +/* + * The MIT License + * + * Copyright (c) 2017 Fulcrum Genomics LLC + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package com.fulcrumgenomics.bam.pileup + +import com.fulcrumgenomics.FgBioDef.FgBioEnum +import com.fulcrumgenomics.bam.Bams +import com.fulcrumgenomics.bam.api.{SamRecord, SamSource} +import com.fulcrumgenomics.bam.pileup.PileupBuilder.BamAccessPattern.{RandomAccess, Streaming} +import com.fulcrumgenomics.bam.pileup.PileupBuilder.PileupParameters +import com.fulcrumgenomics.commons.CommonsDef._ +import com.fulcrumgenomics.fasta.SequenceDictionary +import enumeratum.EnumEntry +import htsjdk.samtools.CigarOperator.{INSERTION => Insertion} + +import java.io.Closeable +import scala.collection.mutable.ArrayBuffer + +/** Companion object for [[PileupBuilder]]. */ +object PileupBuilder { + + /** Sensible defaults for various implementations of [[PileupBuilder]]. */ + trait PileupParameters { + + /** The minimum mapping quality a record must have to contribute to a pileup by default. */ + val minMapQ: Int = 20 + + /** The minimum base quality that a base must have to contribute to a pileup by default. */ + val minBaseQ: Int = 20 + + /** Only allow paired records with both records mapped to contribute to a pileup by default. */ + val mappedPairsOnly: Boolean = true + + /** Allow records flagged as duplicates to contribute to a pileup by default. */ + val includeDuplicates: Boolean = false + + /** Allow records flagged as secondary alignments to contribute to a pileup by default. */ + val includeSecondaryAlignments: Boolean = false + + /** Allow records flagged as supplementary alignments to contribute to a pileup by default. */ + val includeSupplementalAlignments: Boolean = false + + /** Exclude any record of an FR pair where the site requested is outside the insert by default. */ + val includeMapPositionsOutsideFrInsert: Boolean = false + } + + /** A singleton version of all sensible default pileup parameters. */ + object PileupDefaults extends PileupParameters + + /** A trait that all enumerations of BAM access pattern must extend. */ + sealed trait BamAccessPattern extends EnumEntry + + /** The various types of BAM access patterns. */ + object BamAccessPattern extends FgBioEnum[BamAccessPattern] { + override def values: IndexedSeq[BamAccessPattern] = findValues + + /** The type of BAM access pattern when queries are completed using random access. */ + case object RandomAccess extends BamAccessPattern + + /** The type of BAM access pattern when queries are completed using full BAM streaming. */ + case object Streaming extends BamAccessPattern + } + + /** Build a [[PileupBuilder]] from a [[SamSource]]. */ + def apply( + source: SamSource, + accessPattern: BamAccessPattern = RandomAccess, + minMapQ: Int = PileupDefaults.minMapQ, + minBaseQ: Int = PileupDefaults.minBaseQ, + mappedPairsOnly: Boolean = PileupDefaults.mappedPairsOnly, + includeDuplicates: Boolean = PileupDefaults.includeDuplicates, + includeSecondaryAlignments: Boolean = PileupDefaults.includeSecondaryAlignments, + includeSupplementalAlignments: Boolean = PileupDefaults.includeSupplementalAlignments, + includeMapPositionsOutsideFrInsert: Boolean = PileupDefaults.includeMapPositionsOutsideFrInsert, + ): PileupBuilder with Closeable = accessPattern match { + case RandomAccess => RandomAccessPileupBuilder( + source = source, + minMapQ = minMapQ, + minBaseQ = minBaseQ, + mappedPairsOnly = mappedPairsOnly, + includeDuplicates = includeDuplicates, + includeSecondaryAlignments = includeSecondaryAlignments, + includeSupplementalAlignments = includeSupplementalAlignments, + includeMapPositionsOutsideFrInsert = includeMapPositionsOutsideFrInsert, + ) + case Streaming => StreamingPileupBuilder( + source = source, + minMapQ = minMapQ, + minBaseQ = minBaseQ, + mappedPairsOnly = mappedPairsOnly, + includeDuplicates = includeDuplicates, + includeSecondaryAlignments = includeSecondaryAlignments, + includeSupplementalAlignments = includeSupplementalAlignments, + includeMapPositionsOutsideFrInsert = includeMapPositionsOutsideFrInsert, + ) + } + + /** Returns true if is in a mapped FR pair but the position is outside the insert coordinates of . + * Returns false if is in a mapped FR pair and the position is inside the insert coordinates of or + * is not in a mapped FR pair. + */ + private def positionIsOutsideFrInsert(rec: SamRecord, refIndex: Int, pos: Int): Boolean = { + rec.isFrPair && { + val (start, end) = Bams.insertCoordinates(rec) + rec.refIndex == refIndex && pos >= start && pos <= end + } + } + + /** Returns true if the first non-clipping operator is an insertion. */ + private def startsWithInsertion(rec: SamRecord): Boolean = { + rec.cigar.find(elem => !elem.operator.isClipping).exists(_.operator == Insertion) + } +} + +/** A trait that all pileup builders must extends. */ +trait PileupBuilder extends PileupParameters { + + /** The sequence dictionary associated with the records we will pileup. */ + val dict: SequenceDictionary + + /** Pileup records at this position. */ + def pileup(refName: String, pos: Int): Pileup[PileupEntry] + + /** Quickly check the SAM record to see if all the simple static per-read filters accept the read. */ + @inline private final def quickAcceptRecord(rec: SamRecord): Boolean = { + var compare = true + if (compare && minMapQ > 0) compare = rec.mapq >= minMapQ + if (compare && mappedPairsOnly) compare = rec.paired && rec.mapped && rec.mateMapped + if (compare && !includeDuplicates) compare = !rec.duplicate + if (compare && !includeSecondaryAlignments) compare = !rec.secondary + if (compare && !includeSupplementalAlignments) compare = !rec.supplementary + compare + } + + /** Quickly check the pileup entry to see if all the simple static per-base filters accept the entry. */ + @inline private final def quickAcceptEntry(entry: PileupEntry): Boolean = { + entry match { + case p: BaseEntry => p.qual >= minBaseQ + case _ => true + } + } + + /** Custom SAM record filters to further filter down pileups made from this builder. */ + protected val recFilters: ArrayBuffer[SamRecord => Boolean] = ArrayBuffer.empty[SamRecord => Boolean] + + /** Custom pileup entry filters to further filter down pileups made from this builder. */ + protected val entryFilters: ArrayBuffer[PileupEntry => Boolean] = ArrayBuffer.empty[PileupEntry => Boolean] + + /** Adds a filter to the set of SAM record filters; filters should return true to retain records and false to discard. */ + def withReadFilter(fn: SamRecord => Boolean): this.type = { recFilters.addOne(fn); this } + + /** Adds a filter to the set of pileup entry filters; filters should return true to retain entries and false to discard. */ + def withEntryFilter(fn: PileupEntry => Boolean): this.type = { entryFilters.addOne(fn); this } + + /** Checks to see if all SAM record filters accept the record. */ + protected def acceptRecord(rec: SamRecord): Boolean = quickAcceptRecord(rec) && recFilters.forall(fn => fn(rec)) + + /** Checks to see if all pileup entry filters accept the pileup entry. */ + protected def acceptEntry(entry: PileupEntry): Boolean = quickAcceptEntry(entry) && entryFilters.forall(fn => fn(entry)) + + /** Constructs a pileup, at the single requested position, from the records returned by the iterator. + * + * @param recs the collection of coordinate-sorted SAM records that may or may not overlap the position + * @param refName the name of the reference sequence on which the position resides + * @param pos the 1-based position on the reference sequence at which to construct the pileup + */ + protected def build(recs: IterableOnce[SamRecord], refName: String, pos: Int): Pileup[PileupEntry] = { + val refIndex = dict(refName).index.ensuring(_ >= 0, s"Unknown reference sequence name: $refName") + val pile = IndexedSeq.newBuilder[PileupEntry] + if (recs.knownSize >= 0) pile.sizeHint(pile.knownSize) + + @inline def testAndAdd(entry: PileupEntry): Unit = if (this.acceptEntry(entry)) pile.addOne(entry) + + recs.iterator.bufferBetter + .dropWhile(rec => rec.refIndex < refIndex) + .takeWhile(rec => rec.refIndex == refIndex && rec.start <= pos + 1) + .filter { rec => + var compare: Boolean = !rec.unmapped + if (compare) compare = this.acceptRecord(rec) + if (compare) compare = rec.end >= pos + if (compare) compare = rec.start <= pos || PileupBuilder.startsWithInsertion(rec) + if (compare) compare = if (!includeMapPositionsOutsideFrInsert && rec.isFrPair) { + PileupBuilder.positionIsOutsideFrInsert(rec, refIndex = refIndex, pos = pos) + } else { compare } + compare + } + .foreach { rec => + if (rec.start == pos + 1) { // This site must be an insertion in the read that is at the start of the read. + testAndAdd(InsertionEntry(rec, 0)) + } else { + val offset = rec.readPosAtRefPos(pos, returnLastBaseIfDeleted = false) + if (offset == 0) { // This site must be deleted within the read. + val deletionPosition = rec.readPosAtRefPos(pos, returnLastBaseIfDeleted = true) + testAndAdd(DeletionEntry(rec, deletionPosition - 1)) + } else { // This site must be a matched site within the read. + testAndAdd(BaseEntry(rec, offset - 1)) + // Also check to see if the subsequent base represents an insertion. + if (offset < rec.length - 1 && rec.refPosAtReadPos(offset + 1) == 0) testAndAdd(InsertionEntry(rec, offset)) + } + } + } + + Pileup(refName = refName, refIndex = refIndex, pos = pos, pile = pile.result()) + } +} diff --git a/src/main/scala/com/fulcrumgenomics/bam/pileup/RandomAccessPileupBuilder.scala b/src/main/scala/com/fulcrumgenomics/bam/pileup/RandomAccessPileupBuilder.scala new file mode 100644 index 000000000..978b37a0f --- /dev/null +++ b/src/main/scala/com/fulcrumgenomics/bam/pileup/RandomAccessPileupBuilder.scala @@ -0,0 +1,96 @@ +/* + * The MIT License + * + * Copyright (c) 2017 Fulcrum Genomics LLC + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package com.fulcrumgenomics.bam.pileup + +import com.fulcrumgenomics.bam.api.QueryType.Overlapping +import com.fulcrumgenomics.bam.api.SamSource +import com.fulcrumgenomics.bam.pileup.PileupBuilder._ +import com.fulcrumgenomics.fasta.SequenceDictionary + +import java.io.Closeable + +/** Companion object for [[RandomAccessPileupBuilder]]. */ +object RandomAccessPileupBuilder { + + /** Build a random access pileup builder from an indexed SAM source. */ + def apply( + source: SamSource, + minMapQ: Int = PileupDefaults.minMapQ, + minBaseQ: Int = PileupDefaults.minBaseQ, + mappedPairsOnly: Boolean = PileupDefaults.mappedPairsOnly, + includeDuplicates: Boolean = PileupDefaults.includeDuplicates, + includeSecondaryAlignments: Boolean = PileupDefaults.includeSecondaryAlignments, + includeSupplementalAlignments: Boolean = PileupDefaults.includeSupplementalAlignments, + includeMapPositionsOutsideFrInsert: Boolean = PileupDefaults.includeMapPositionsOutsideFrInsert, + ): RandomAccessPileupBuilder = { + require(source.indexed, "SAM source must be indexed for random access!") + new RandomAccessPileupBuilder( + source = source, + minMapQ = minMapQ, + minBaseQ = minBaseQ, + mappedPairsOnly = mappedPairsOnly, + includeDuplicates = includeDuplicates, + includeSecondaryAlignments = includeSecondaryAlignments, + includeSupplementalAlignments = includeSupplementalAlignments, + includeMapPositionsOutsideFrInsert = includeMapPositionsOutsideFrInsert, + ) + } +} + +/** A pileup builder that builds pileups using index-based BAM random access. + * + * @param source the indexed [[SamSource]] of records we will pileup. + * @param minBaseQ the minimum base quality that a base must have to contribute to a pileup. + * @param minMapQ the minimum mapping quality a record must have to contribute to a pileup. + * @param mappedPairsOnly if true, only allow paired records with both records mapped to contribute to a pileup. + * @param includeDuplicates if true, allow records flagged as duplicates to contribute to a pileup. + * @param includeSecondaryAlignments if true, allow records flagged as secondary alignments to contribute to a pileup. + * @param includeSupplementalAlignments if true, allow records flagged as supplementary alignments to contribute to a pileup. + * @param includeMapPositionsOutsideFrInsert if true, include any record of an FR pair where the site requested is outside the insert. + */ +class RandomAccessPileupBuilder private( + source: SamSource, + override val minMapQ: Int = PileupDefaults.minMapQ, + override val minBaseQ: Int = PileupDefaults.minBaseQ, + override val mappedPairsOnly: Boolean = PileupDefaults.mappedPairsOnly, + override val includeDuplicates: Boolean = PileupDefaults.includeDuplicates, + override val includeSecondaryAlignments: Boolean = PileupDefaults.includeSecondaryAlignments, + override val includeSupplementalAlignments: Boolean = PileupDefaults.includeSupplementalAlignments, + override val includeMapPositionsOutsideFrInsert: Boolean = PileupDefaults.includeMapPositionsOutsideFrInsert, +) extends PileupBuilder with Closeable { + + /** The sequence dictionary associated with the records we will pileup. */ + override val dict: SequenceDictionary = source.dict + + /** Pileup records at this position. */ + override def pileup(refName: String, pos: Int): Pileup[PileupEntry] = { + val end = Math.min(pos + 1, dict(refName).length) // Add 1bp to "end" in case a read has an insertion there. + val overlapping = source.query(refName, start = pos, end = end, Overlapping) + this.build(overlapping, refName = refName, pos = pos) + } + + /** Close the wrapped SAM source. */ + override def close(): Unit = source.close() +} diff --git a/src/main/scala/com/fulcrumgenomics/bam/pileup/StreamingPileupBuilder.scala b/src/main/scala/com/fulcrumgenomics/bam/pileup/StreamingPileupBuilder.scala new file mode 100644 index 000000000..ca856a2dc --- /dev/null +++ b/src/main/scala/com/fulcrumgenomics/bam/pileup/StreamingPileupBuilder.scala @@ -0,0 +1,185 @@ +/* + * The MIT License + * + * Copyright (c) 2022 Fulcrum Genomics LLC + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package com.fulcrumgenomics.bam.pileup + +import com.fulcrumgenomics.bam.api.SamOrder.Coordinate +import com.fulcrumgenomics.bam.api.{SamOrder, SamRecord, SamSource} +import com.fulcrumgenomics.bam.pileup.PileupBuilder._ +import com.fulcrumgenomics.bam.pileup.StreamingPileupBuilder.DefaultInitialCacheSize +import com.fulcrumgenomics.commons.CommonsDef._ +import com.fulcrumgenomics.coord.LocatableOrdering +import com.fulcrumgenomics.fasta.SequenceDictionary +import htsjdk.samtools.util.{Interval, Locatable} + +import java.io.Closeable +import scala.collection.mutable +import scala.language.reflectiveCalls + +/** Companion object for [[StreamingPileupBuilder]]. */ +object StreamingPileupBuilder { + + /** The default initial cache size for pre-allocating an array for a pileup of reads. Set to 300x coverage by default. */ + val DefaultInitialCacheSize: Int = 300 + + /** Build a streaming pileup builder from a coordinate sorted SAM source. */ + def apply( + source: SamSource, + minMapQ: Int = PileupDefaults.minMapQ, + minBaseQ: Int = PileupDefaults.minBaseQ, + mappedPairsOnly: Boolean = PileupDefaults.mappedPairsOnly, + includeDuplicates: Boolean = PileupDefaults.includeDuplicates, + includeSecondaryAlignments: Boolean = PileupDefaults.includeSecondaryAlignments, + includeSupplementalAlignments: Boolean = PileupDefaults.includeSupplementalAlignments, + includeMapPositionsOutsideFrInsert: Boolean = PileupDefaults.includeMapPositionsOutsideFrInsert, + initialCacheSize: Int = DefaultInitialCacheSize, + ): StreamingPileupBuilder = { + require(SamOrder(source.header).contains(Coordinate), "SAM source must be coordinate sorted!") + lazy val iterator = source.iterator + new StreamingPileupBuilder( + records = iterator, + dict = source.dict, + minMapQ = minMapQ, + minBaseQ = minBaseQ, + mappedPairsOnly = mappedPairsOnly, + includeDuplicates = includeDuplicates, + includeSecondaryAlignments = includeSecondaryAlignments, + includeSupplementalAlignments = includeSupplementalAlignments, + includeMapPositionsOutsideFrInsert = includeMapPositionsOutsideFrInsert, + source = Some(iterator) + ) + } + + /** Helper class to ensure pileups are locatable and can be used in coordinate comparison and ordering. */ + private implicit class LocatablePileup(pileup: Pileup[PileupEntry]) extends Locatable { + override def getContig: String = pileup.refName + override def getStart: Int = pileup.pos + override def getEnd: Int = pileup.pos + } +} + +/** A pileup builder that builds coordinate-maintaining or advancing pileups using lazy end-to-end BAM streaming. + * + * For every call to , this class will advance until SAM records are found that overlap the requested + * locus. These records will be loaded into an internal cache and then a pileup will be built from them. The pileup + * will be cached in the event a coordinate-maintaining call to is made in the future. If another + * coordinate-advancing call is made to then any previous SAM records are evicted from the internal cache that + * do not overlap the requested locus and will be advanced to find all additional SAM records that overlap + * the requested locus. + * + * @param records a by-name iterator of SAM records that is assumed to be coordinate-sorted. + * @param dict the sequence dictionary associated with the records we will pileup. + * @param minBaseQ the minimum base quality that a base must have to contribute to a pileup. + * @param minMapQ the minimum mapping quality a record must have to contribute to a pileup. + * @param mappedPairsOnly if true, only allow paired records with both records mapped to contribute to a pileup. + * @param includeDuplicates if true, allow records flagged as duplicates to contribute to a pileup. + * @param includeSecondaryAlignments if true, allow records flagged as secondary alignments to contribute to a pileup. + * @param includeSupplementalAlignments if true, allow records flagged as supplementary alignments to contribute to a pileup. + * @param includeMapPositionsOutsideFrInsert if true, include any record of an FR pair where the site requested is outside the insert. + * @param initialCacheSize the initial size for the internal SAM record cache, set this to your expected pileup depth. + */ +class StreamingPileupBuilder private( + records: => Iterator[SamRecord], + override val dict: SequenceDictionary, + override val minMapQ: Int = PileupDefaults.minMapQ, + override val minBaseQ: Int = PileupDefaults.minBaseQ, + override val mappedPairsOnly: Boolean = PileupDefaults.mappedPairsOnly, + override val includeDuplicates: Boolean = PileupDefaults.includeDuplicates, + override val includeSecondaryAlignments: Boolean = PileupDefaults.includeSecondaryAlignments, + override val includeSupplementalAlignments: Boolean = PileupDefaults.includeSupplementalAlignments, + override val includeMapPositionsOutsideFrInsert: Boolean = PileupDefaults.includeMapPositionsOutsideFrInsert, + initialCacheSize: Int = DefaultInitialCacheSize, + source: => Option[{ def close(): Unit }] = None, +) extends PileupBuilder with Closeable { + import com.fulcrumgenomics.bam.pileup.StreamingPileupBuilder.LocatablePileup + + /** Records that we've accumulated that could overlap another coordinate-advancing call to . */ + private val cache: mutable.ArrayBuffer[SamRecord] = new mutable.ArrayBuffer[SamRecord](initialCacheSize) + + /** Whether this builder is able to pileup more records from the input iterator of SAM records or not. */ + private var closed: Boolean = false + + /** A genomic ordering for any locatable that utilizes the sequence dictionary corresponding to the input records. */ + private val byCoordinate: Ordering[Locatable] = LocatableOrdering(dict) + + /** The last pileup we built over the input SAM records and is cached to save time if the locus is re-requested. */ + private var lastPileup: Option[Pileup[PileupEntry]] = None + + /** The underlying buffered stream of input SAM records which is lazily summoned. */ + private lazy val underlying: Iterator[SamRecord] = records.filter(_.mapped).bufferBetter + + /** Advance this builder to the next requested locus and add all possibly overlapping records to the cache. */ + @inline private def seekAndFillCache(refIndex: Int, pos: Int): Unit = { + // Drop records up until the next record stands a chance of overlapping the requested locus. Then, take records up + // until the next record stands a chance of having a start greater than the requested locus plus one. All records in + // this query have a chance of overlapping the locus so we must then filter down to only those that have an end + // location greater than or equal to the requested locus. Finally, these records will be added to the cache so that + // they can be evaluated for pileup at the requested locus. We add one to for the case when a record starts + // with an insertion. + val maybeOverlapping = underlying + .dropWhile(rec => rec.refIndex < refIndex || (rec.refIndex == refIndex && rec.end < pos)) + .takeWhile(rec => rec.refIndex == refIndex && rec.start <= pos + 1) + .filter(rec => rec.end >= pos) + cache.addAll(maybeOverlapping) + } + + /** Efficiently advance to the next coordinate-maintaining or coordinate-advancing locus and build a pileup there. */ + def pileup(refName: String, pos: Int): Pileup[PileupEntry] = { + require(!closed, "Cannot advance to a new locus if the pileup builder was closed!") + val currentLocus = new Interval(refName, pos, pos) + val refIndex = dict(refName).index.ensuring(_ >= 0, s"Reference name not in sequence dictionary: $refName") + + // If there is a last pileup defined and it was a locus prior to the requested locus then purge the cache of any + // records that will not overlap the requested locus, advance the iterator to accumulate maybe-overlapping records + // back into the cache, and finally pileup records from the requested locus using the cache. If there is a last + // pileup defined and the requested locus is at the same locus as the last pileup, then return the cached pileup. + // If there is a last pileup defined and the requested locus is prior to the last pileup then an exception will be + // raised since it means we are not coordinate-maintaining or coordinate-advancing. When there is no last pileup + // defined, advance the iterator to accumulate maybe-overlapping records into a fresh cache and then pileup records + // from the requested locus using the cache. + val pileup = lastPileup match { + case Some(last) if byCoordinate.lt(last, currentLocus) => + lastPileup = None // Set to `None` now so we can drop the object reference ASAP for garbage collection. + if (last.refIndex != refIndex) cache.clear() // If we've moved to a new reference sequence index, simply clear. + else cache.filterInPlace(rec => (rec.refIndex == refIndex && rec.end >= pos) || rec.refIndex > refIndex) + this.seekAndFillCache(refIndex = refIndex, pos = pos) + this.build(cache, refName = refName, pos = pos) + case Some(last) if byCoordinate.equiv(last, currentLocus) => last + case Some(last) => + throw new IllegalArgumentException( + s"Queried locus $refName:$pos is behind the previous pileup and should be " + + s"greater than or equal to: ${last.refName}:${last.pos}" + ) + case None => + this.seekAndFillCache(refIndex = refIndex, pos = pos) + this.build(cache, refName = refName, pos = pos) + } + + lastPileup = Some(pileup) + pileup + } + + /** Close this pileup builder and clear internal state so that all object references can be garbage collected. */ + override def close(): Unit = { closed = true; source.foreach(_.close()); cache.clear(); lastPileup = None } +} diff --git a/src/main/scala/com/fulcrumgenomics/coord/LocatableOrdering.scala b/src/main/scala/com/fulcrumgenomics/coord/LocatableOrdering.scala new file mode 100644 index 000000000..5b3b99693 --- /dev/null +++ b/src/main/scala/com/fulcrumgenomics/coord/LocatableOrdering.scala @@ -0,0 +1,20 @@ +package com.fulcrumgenomics.coord + +import com.fulcrumgenomics.fasta.SequenceDictionary +import htsjdk.samtools.util.Locatable + +/** Methods for building orderings of [[Locatable]] instances. */ +object LocatableOrdering { + + /** Build a coordinate-based ordering of [[Locatable]] instances. */ + def apply(dict: SequenceDictionary): Ordering[Locatable] = (x: Locatable, y: Locatable) => { + var compare = (dict.get(x.getContig), dict.get(y.getContig)) match { + case (Some(meta1), Some(meta2)) => meta1.index.compare(meta2.index) + case (None, _) => throw new NoSuchElementException(s"Sequence dictionary does not contain contig: ${x.getContig}") + case (_, None) => throw new NoSuchElementException(s"Sequence dictionary does not contain contig: ${y.getContig}") + } + if (compare == 0) compare = x.getStart.compare(y.getStart) + if (compare == 0) compare = x.getEnd.compare(y.getEnd) + compare + } +} diff --git a/src/main/scala/com/fulcrumgenomics/vcf/api/VcfSource.scala b/src/main/scala/com/fulcrumgenomics/vcf/api/VcfSource.scala index adf4de985..9eccbbde6 100644 --- a/src/main/scala/com/fulcrumgenomics/vcf/api/VcfSource.scala +++ b/src/main/scala/com/fulcrumgenomics/vcf/api/VcfSource.scala @@ -121,5 +121,12 @@ object VcfSource { val reader = AbstractFeatureReader.getFeatureReader(path.toUri.toString, codec, false) new VcfSource(reader, headerTransformer=headerTransformer) } -} + /** Return the only sample in the VCF source otherwise raise an exception. */ + def onlySample(source: VcfSource): String = { + source.header.samples.toList match { + case head :: Nil => head + case _ => throw new IllegalArgumentException(s"Source is not single-sample. Found samples: ${source.header.samples.mkString(", ")}") + } + } +} diff --git a/src/main/scala/com/fulcrumgenomics/vcf/filtration/FilterSomaticVcf.scala b/src/main/scala/com/fulcrumgenomics/vcf/filtration/FilterSomaticVcf.scala index 1bc0b2940..ae8eaff2e 100644 --- a/src/main/scala/com/fulcrumgenomics/vcf/filtration/FilterSomaticVcf.scala +++ b/src/main/scala/com/fulcrumgenomics/vcf/filtration/FilterSomaticVcf.scala @@ -24,190 +24,182 @@ package com.fulcrumgenomics.vcf.filtration -import com.fulcrumgenomics.FgBioDef._ -import com.fulcrumgenomics.bam._ -import com.fulcrumgenomics.bam.api.{QueryType, SamSource} +import com.fulcrumgenomics.bam.api.SamSource +import com.fulcrumgenomics.bam.pileup.PileupBuilder +import com.fulcrumgenomics.bam.pileup.PileupBuilder.{BamAccessPattern, PileupDefaults} +import com.fulcrumgenomics.bam.pileup.PileupBuilder.BamAccessPattern.RandomAccess import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool} +import com.fulcrumgenomics.commons.CommonsDef._ +import com.fulcrumgenomics.commons.io.Io import com.fulcrumgenomics.commons.util.LazyLogging import com.fulcrumgenomics.sopt.{arg, clp} -import com.fulcrumgenomics.util.Io -import com.fulcrumgenomics.vcf.api.{VcfHeader, VcfSource, VcfWriter} +import com.fulcrumgenomics.util.ProgressLogger +import com.fulcrumgenomics.vcf.api.{VcfSource, VcfWriter} import scala.collection.immutable.ListMap -@clp(group=ClpGroups.VcfOrBcf, description= - """ - |Applies one or more filters to a VCF of somatic variants. The VCF must contain genotype information - |for the tumor sample. If the VCF also contains genotypes for one or more other samples, the - |`--sample` option must be provided to specify the sample whose genotypes to examine and whose reads - |are present in the BAM file. - | - |Various options are available for filtering the reads coming from the BAM file, including - |`--min-mapping-quality`, `--min-base-quality` and `--paired-reads-only`. The latter filters to only - |paired end reads where both reads are mapped. Reads marked as duplicates, secondary alignments - |and supplemental alignments are all filtered out. - | - |Each available filter may generate annotations in the `INFO` field of the output VCF and optionally, - |if a threshold is specified, may apply one or more `FILTER`s to applicable variants. - | - |In previous versions of this tool, the only available filter was specific to A-base addition artifacts and was - |referred to as the 'End Repair Artifact Filter.' This filter has been renamed to 'A-tailing Artifact Filter', but - |its functionality is unchanged. The filter's associated command-line parameters, `INFO` field key, and `FILTER` tag - |have also been renamed accordingly, as described below. - | - |## Available Filters - | - |### A-tailing Artifact Filter (previously 'End Repair Artifact Filter') - | - |The A-tailing artifact filter attempts to measure the probability that a single-nucleotide mismatch is the product of - |errors in the template generated during the A-base addition steps that are common to many Illumina library - |preparation protocols. The artifacts occur if/when a recessed 3' end is incorrectly filled in with one or more - |adenines during A-base addition. Incorrect adenine incorporation presents specifically as errors to T at the - |beginning of reads (and in very short templates, as matching errors to A at the ends of reads). - | - |The filter adds the `INFO` field `ATAP` (previously `ERAP`) to SNVs with an A or T alternate allele. This field - |records the p-value representing the probability of the null hypothesis that the variant is a true mutation, so - |lower p-values indicate that the variant is more likely an A-tailing artifact. If a threshold p-value is specified, - |the `FILTER` tag `ATailingArtifact` (previously `EndRepairArtifact`) will be applied to variants with p-values less - |than or equal to the threshold. - | - |Two options are available: - | - |* `--a-tailing-distance` (previously `--end-repair-distance`) allows control over how close to the ends of - | reads/templates errors can be considered to be candidates for the A-tailing artifact. - | Higher values decrease the power of the test, so this should be set as low as possible - | given observed errors. - |* `--a-tailing-p-value` (previously `--end-repair-p-value`) the p-value at or below which a filter should be applied. - | If no value is supplied only the `INFO` annotation is produced and no `FILTER` is applied. - | - |### End Repair Fill-in Artifact Filter - | - |The end repair fill-in artifact filter attempts to measure the probability that a single-nucleotide mismatch is the - |product of an error in the template generated during the end repair fill-in step that is common to many Illumina - |library preparation protocols, in which single-stranded 3' overhangs are filled in to create a blunt end. These - |artifacts originate from single-stranded templates containing damaged bases, often as a consequence of oxidative - |damage. These DNA lesions, for example 8-oxoguanine, undergo mismatched pairing, which after PCR appear as mutations - |at the ends of reads. - | - |The filter adds the `INFO` field `ERFAP` to records SNVs. This field records the p-value representing the probability of the - |null hypothesis (e.g. that the variant is a true mutation), so lower p-values indicate that the variant is more likely - |an end repair fill-in artifact. If a threshold p-value is specified, then the `FILTER` tag `EndRepairFillInArtifact` - |will be applied to variants with p-values less than or equal to the threshold. - | - |Two options are available: - | - |* `--end-repair-fill-in-distance` allows control over how close to the ends of reads/templates errors can be - | considered to be candidates for the artifact. Higher values decrease the - | power of the test, so this should be set as low as possible given observed - | errors. - |* `--end-repair-fill-in-p-value` the p-value below which a filter should be applied. If no value is supplied - | only the annotation is produced and no filtering is performed. - """) -class FilterSomaticVcf -( @arg(flag='i', doc="Input VCF of somatic variant calls.") val input: PathToVcf, - @arg(flag='o', doc="Output VCF of filtered somatic variants.") val output: PathToVcf, - @arg(flag='b', doc="BAM file for the tumor sample.") val bam: PathToBam, - @arg(flag='s', doc="Sample name in VCF if `> 1` sample present.") val sample: Option[String] = None, - @arg(flag='m', doc="Minimum mapping quality for reads.") val minMappingQuality: Int = 30, - @arg(flag='q', doc="Minimum base quality.") val minBaseQuality: Int = 20, - @arg(flag='p', doc="Use only paired reads mapped in pairs.") val pairedReadsOnly: Boolean = false, - // Developer Note: filter-specific attributes should NOT be given flags as we will likely run - // out of flags that way and end up with a mix of +flag and -flag options. - @arg(doc="Distance from end of read to implicate A-base addition artifacts. Set to :none: to deactivate the filter.") val aTailingDistance: Option[Int] = Some(2), - @arg(doc="Minimum acceptable p-value for the A-base addition artifact test.") val aTailingPValue: Option[Double] = None, - @arg(doc="Distance from end of read to implicate end repair fill-in artifacts. Set to :none: to deactivate the filter.") val endRepairFillInDistance: Option[Int] = Some(15), - @arg(doc="Minimum acceptable p-value for the end repair fill-in artifact test.") val endRepairFillInPValue: Option[Double] = None +@clp( + description = + """Applies one or more filters to a VCF of somatic variants. The VCF must contain genotype information for the + |tumor sample. If the VCF also contains genotypes for one or more other samples, the `--sample` option must be + |provided to specify the sample whose genotypes to examine and whose reads are present in the BAM file. + | + |Various options are available for filtering the reads coming from the BAM file, including + |`--min-mapping-quality`, `--min-base-quality` and `--paired-reads-only`. The latter filters to only paired end + |reads where both reads are mapped. Reads marked as duplicates, secondary alignments and supplemental alignments + |are all filtered out. + | + |Each available filter may generate annotations in the `INFO` field of the output VCF and optionally, if a + |threshold is specified, may apply one or more `FILTER`s to applicable variants. + | + |In previous versions of this tool, the only available filter was specific to A-base addition artifacts and was + |referred to as the 'End Repair Artifact Filter.' This filter has been renamed to 'A-tailing Artifact Filter', but + |its functionality is unchanged. The filter's associated command-line parameters, `INFO` field key, and `FILTER` + |tag have also been renamed accordingly, as described below. + | + |## Available Filters + | + |### A-tailing Artifact Filter (previously 'End Repair Artifact Filter') + | + |The A-tailing artifact filter attempts to measure the probability that a single-nucleotide mismatch is the + |product of errors in the template generated during the A-base addition steps that are common to many Illumina + |library preparation protocols. The artifacts occur if/when a recessed 3' end is incorrectly filled in with one\ + |or more adenines during A-base addition. Incorrect adenine incorporation presents specifically as errors to T at + |the beginning of reads (and in very short templates, as matching errors to A at the ends of reads). + | + |The filter adds the `INFO` field `ATAP` (previously `ERAP`) to SNVs with an A or T alternate allele. This field + |records the p-value representing the probability of the null hypothesis that the variant is a true mutation, so + |lower p-values indicate that the variant is more likely an A-tailing artifact. If a threshold p-value is + |specified, the `FILTER` tag `ATailingArtifact` (previously `EndRepairArtifact`) will be applied to variants with + |p-values less than or equal to the threshold. + | + |Two options are available: + | + |* `--a-tailing-distance` (previously `--end-repair-distance`) allows control over how close to the ends of + | reads/templates errors can be considered to be candidates for the A-tailing artifact. + | Higher values decrease the power of the test, so this should be set as low as possible + | given observed errors. + |* `--a-tailing-p-value` (previously `--end-repair-p-value`) the p-value at or below which a filter should be + | applied. If no value is supplied only the `INFO` annotation is produced and no `FILTER` + | is applied. + | + |### End Repair Fill-in Artifact Filter + | + |The end repair fill-in artifact filter attempts to measure the probability that a single-nucleotide mismatch is + |the product of an error in the template generated during the end repair fill-in step that is common to many + |Illumina library preparation protocols, in which single-stranded 3' overhangs are filled in to create a blunt + |end. These artifacts originate from single-stranded templates containing damaged bases, often as a consequence + |of oxidative damage. These DNA lesions, for example 8-oxoguanine, undergo mismatched pairing, which after PCR + |appear as mutations at the ends of reads. + | + |The filter adds the `INFO` field `ERFAP` to records SNVs. This field records the p-value representing the + |probability of the null hypothesis (e.g. that the variant is a true mutation), so lower p-values indicate that + |the variant is more likely an end repair fill-in artifact. If a threshold p-value is specified, then the `FILTER` + |tag `EndRepairFillInArtifact` will be applied to variants with p-values less than or equal to the threshold. + | + |Two options are available: + | + |* `--end-repair-fill-in-distance` allows control over how close to the ends of reads/templates errors can be + | considered to be candidates for the artifact. Higher values decrease the + | power of the test, so this should be set as low as possible given observed + | errors. + |* `--end-repair-fill-in-p-value` the p-value below which a filter should be applied. If no value is supplied + | only the annotation is produced and no filtering is performed. + | + |## Performance Expectations + | + |By default `--access-pattern` will be set to `RandomAccess` and the input BAM will be queried using index-based + |random access. Random access is mandatory if the input VCF is not coordinate sorted. If random access is not + |requested and the input VCF is not coordinate sorted, then an exception will be raised on the first + |non-coordinate increasing VCF record found. The BAM must be coordinate sorted in all cases and additionally be + |indexed if random access is requested. + | + |Often, a VCF file will contain a sparse set of records that are scattered across a given territory within a + |genome (or the records will be sparsely scattered genome-wide). If the territory of the VCF records is markedly + |smaller than the territory of all aligned SAM records in the BAM file, then random access may be the most + |efficient BAM access pattern. However, there are cases where random access will be less efficient such as when + |the VCF is coordinate sorted and the variant call records are very densely packed across a similar territory as + |compared to all aligned SAM records. Such a case is common in deeply sequenced hybrid selection NGS experiments + |and setting `--access-pattern` to `Streaming` will often be the most efficient BAM access pattern. + """, + group = ClpGroups.VcfOrBcf +) class FilterSomaticVcf( + @arg(flag = 'i', doc = "Input VCF of somatic variant calls.") val input: PathToVcf, + @arg(flag = 'o', doc = "Output VCF of filtered somatic variants.") val output: PathToVcf, + @arg(flag = 'b', doc = "BAM file for the tumor sample.") val bam: PathToBam, + @arg(flag = 's', doc = "Sample name in VCF if `> 1` sample present.") val sample: Option[String] = None, + @arg(flag = 'm', doc = "Minimum mapping quality for reads.") val minMappingQuality: Int = PileupDefaults.minMapQ, + @arg(flag = 'q', doc = "Minimum base quality.") val minBaseQuality: Int = PileupDefaults.minBaseQ, + @arg(flag = 'p', doc = "Use only paired reads mapped in pairs.") val pairedReadsOnly: Boolean = false, + @arg(flag = 'A', doc = "The type of BAM access pattern to use.") val accessPattern: BamAccessPattern = RandomAccess, + @arg(doc = "Distance from 5-prime end of read to implicate A-base addition artifacts. Set to :none: to deactivate the filter.") + val aTailingDistance: Option[Int] = Some(2), + @arg(doc = "Minimum acceptable p-value for the A-base addition artifact test.") + val aTailingPValue: Option[Double] = None, + @arg(doc = "Distance from 5-prime end of read to implicate end repair fill-in artifacts. Set to :none: to deactivate the filter.") + val endRepairFillInDistance: Option[Int] = Some(15), + @arg(doc = "Minimum acceptable p-value for the end repair fill-in artifact test.") + val endRepairFillInPValue: Option[Double] = None ) extends FgBioTool with LazyLogging { Io.assertReadable(input) Io.assertReadable(bam) Io.assertCanWriteFile(output) - override def execute(): Unit = { - val vcfIn = VcfSource(input) - val bamIn = SamSource(bam) - val vcfSample = sample match { - case Some(s) => - validate(vcfIn.header.samples.contains(s), s"Sample $s not present in VCF.") - s - case None => - if (vcfIn.header.samples.size == 1) vcfIn.header.samples.head - else invalid("Must supply --sample when VCF contains more than one sample.") - } - - val filters = Seq.empty[ReadEndSomaticVariantFilter] ++ - aTailingDistance.map { dist => new ATailingArtifactLikelihoodFilter(dist, aTailingPValue) } ++ - endRepairFillInDistance.map { dist => new EndRepairFillInArtifactLikelihoodFilter(dist, endRepairFillInPValue) } - - if (filters.isEmpty) logger.warning("All filters are disabled and this tool will no-op!") - - val builder = new PileupBuilder(bamIn.dict, mappedPairsOnly=pairedReadsOnly, minBaseQ=minBaseQuality, minMapQ=minMappingQuality) - val out = makeWriter(output, vcfIn.header, filters) + /** All somatic variant filters that will be applied to the input VCF. If none are set, a warning will be logged. */ + private val somaticFilters: Seq[ReadEndSomaticVariantFilter] = Seq.empty ++ + aTailingDistance.map(dist => new ATailingArtifactLikelihoodFilter(dist, aTailingPValue)) ++ + endRepairFillInDistance.map(dist => new EndRepairFillInArtifactLikelihoodFilter(dist, endRepairFillInPValue)) + if (somaticFilters.isEmpty) logger.warning(s"${getClass.getSimpleName} filters are disabled and this tool will no-op!") - vcfIn.foreach { variant => - val gt = variant.genotypes(vcfSample) - val applicableFilters = filters.filter(_.appliesTo(gt)) - - val variantOut = if (applicableFilters.isEmpty) variant else { - val iterator = bamIn.query(variant.chrom, variant.pos, variant.pos, QueryType.Overlapping) - val pileup = filterPileup(builder.build(iterator, variant.chrom, variant.pos).withoutOverlaps) - iterator.close() + /** Execute the tool [[FilterSomaticVcf]]. */ + override def execute(): Unit = { + val records = SamSource(bam) + val source = VcfSource(input) + val progress = ProgressLogger(logger = logger, noun = "variants", verb = "written", unit = 1000) + + val piler = PileupBuilder( + records, + accessPattern = accessPattern, + mappedPairsOnly = pairedReadsOnly, + minBaseQ = minBaseQuality, + minMapQ = minMappingQuality, + ) - val allAnnotations = variant.attrs.toBuffer - val allFilters = variant.filters.toBuffer + val header = source.header.copy( + infos = source.header.infos ++ somaticFilters.flatMap(_.VcfInfoLines), + filters = source.header.filters ++ somaticFilters.flatMap(_.VcfFilterLines), + ) - applicableFilters.foreach { f => - // Generate the set of annotations, push them into the ctxBuilder, then also add any filters to the ctxBuilder - val annotations = f.annotations(pileup, gt) - allAnnotations ++= annotations - allFilters ++= f.filters(annotations) - } + val name = sample.getOrElse(VcfSource.onlySample(source)).ensuring( + sample => source.header.samples.contains(sample), + s"There is no genotype with the following sample in the input VCF/BCF: $sample" + ) - variant.copy(attrs = ListMap(allAnnotations.toSeq:_*), filters = allFilters.toSet) - } + val writer = VcfWriter(output, header = header) - out += variantOut - } + source.tapEach(progress.record).foreach { variant => + val updated = somaticFilters.filter(_.appliesTo(variant.genotypes(name))) match { + case Nil => variant + case filters => + val pileup = piler.pileup(refName = variant.chrom, pos = variant.pos).withoutOverlaps + val allAnnotations = variant.attrs.toBuffer + val allFilters = variant.filters.toBuffer - out.close() - vcfIn.safelyClose() - bamIn.safelyClose() - } + filters.foreach { filter => + val annotations = filter.annotations(pileup, variant.genotypes(name)) + allAnnotations ++= annotations + allFilters ++= filter.filters(annotations) + } - /** - * Filters the pileup to remove entries that are from PE reads who's insert size indicates that the - * aligned base we're looking at is _outside_ the insert! - */ - private def filterPileup[A <: PileupEntry](pileup: Pileup[A]): Pileup[A] = { - val filtered = pileup.pile.filter { e => - if (e.rec.isFrPair) { - val (start, end) = Bams.insertCoordinates(e.rec) - if (pileup.pos < start || pileup.pos > end) { - logger.warning( - s""" - |Ignoring read ${e.rec.name} mapped over variant site ${pileup.refName}:${pileup.pos} that has - |incompatible insert size yielding insert coordinates of ${pileup.refName}:$start-$end which do not overlap the variant. - """.stripMargin.trim.linesIterator.mkString(" ")) - false - } - else { - true - } - } - else { - true + variant.copy(attrs = ListMap.from(allAnnotations), filters = Set.from(allFilters)) } + writer.write(updated) } - pileup.copy(pile=filtered) - } - - /** Builds a VCF writer that had all the necessary headers. */ - private def makeWriter(output: PathToVcf, in: VcfHeader, filters: Seq[SomaticVariantFilter]): VcfWriter = { - val header = in.copy( - filters = (in.filters ++ filters.flatMap(_.VcfFilterLines)).distinct, - infos = (in.infos ++ filters.flatMap(_.VcfInfoLines)).distinct - ) - - VcfWriter(output, header) + piler.safelyClose() + records.safelyClose() + source.safelyClose() + writer.close() + progress.logLast() } } diff --git a/src/main/scala/com/fulcrumgenomics/vcf/filtration/ReadEndSomaticVariantFilter.scala b/src/main/scala/com/fulcrumgenomics/vcf/filtration/ReadEndSomaticVariantFilter.scala index 7722cf90c..6bd368f1c 100644 --- a/src/main/scala/com/fulcrumgenomics/vcf/filtration/ReadEndSomaticVariantFilter.scala +++ b/src/main/scala/com/fulcrumgenomics/vcf/filtration/ReadEndSomaticVariantFilter.scala @@ -24,7 +24,8 @@ package com.fulcrumgenomics.vcf.filtration -import com.fulcrumgenomics.bam.{Bams, BaseEntry, Pileup, PileupEntry} +import com.fulcrumgenomics.bam.Bams +import com.fulcrumgenomics.bam.pileup.{BaseEntry, Pileup, PileupEntry} import com.fulcrumgenomics.commons.util.LazyLogging import com.fulcrumgenomics.util.NumericTypes.{LogProbability => LnProb} import com.fulcrumgenomics.util.Sequences diff --git a/src/main/scala/com/fulcrumgenomics/vcf/filtration/SomaticVariantFilter.scala b/src/main/scala/com/fulcrumgenomics/vcf/filtration/SomaticVariantFilter.scala index cd1a8d9ff..777ac0151 100644 --- a/src/main/scala/com/fulcrumgenomics/vcf/filtration/SomaticVariantFilter.scala +++ b/src/main/scala/com/fulcrumgenomics/vcf/filtration/SomaticVariantFilter.scala @@ -24,7 +24,7 @@ package com.fulcrumgenomics.vcf.filtration -import com.fulcrumgenomics.bam.{Pileup, PileupEntry} +import com.fulcrumgenomics.bam.pileup.{Pileup, PileupEntry} import com.fulcrumgenomics.vcf.api.{Genotype, VcfFilterHeader, VcfInfoHeader} /** diff --git a/src/test/scala/com/fulcrumgenomics/bam/PileupTest.scala b/src/test/scala/com/fulcrumgenomics/bam/PileupTest.scala deleted file mode 100644 index 5d635d815..000000000 --- a/src/test/scala/com/fulcrumgenomics/bam/PileupTest.scala +++ /dev/null @@ -1,194 +0,0 @@ -/* - * The MIT License - * - * Copyright (c) 2017 Fulcrum Genomics LLC - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -package com.fulcrumgenomics.bam - -import com.fulcrumgenomics.testing.{ReferenceSetBuilder, SamBuilder, UnitSpec} -import htsjdk.samtools.reference.ReferenceSequenceFileFactory - -class PileupTest extends UnitSpec { - // A reference sequence for use below - private val ref = { - val builder = new ReferenceSetBuilder() - builder.add("chr1").add("A", 1000) - builder.add("chr2").add("C", 1000) - val path = builder.toTempFile() - ReferenceSequenceFileFactory.getReferenceSequenceFile(path) - } - - private val dict = { - import com.fulcrumgenomics.fasta.Converters.FromSAMSequenceDictionary - ref.getSequenceDictionary.fromSam - } - - "PileupBuilder.build" should "build a simple pileup from fragment reads" in { - val piler = new PileupBuilder(dict, mappedPairsOnly=false) - val builder = new SamBuilder(readLength=50, sd=Some(dict)) - 1 to 5 foreach { _ => builder.addFrag(start=100).foreach(_.bases = "A" * 50) } - 1 to 4 foreach { _ => builder.addFrag(start=100).foreach(_.bases = "C" * 50) } - 1 to 3 foreach { _ => builder.addFrag(start=100).foreach(_.bases = "G" * 50) } - 1 to 2 foreach { _ => builder.addFrag(start=100).foreach(_.bases = "T" * 50) } - 1 to 1 foreach { _ => builder.addFrag(start=100).foreach(_.bases = "N" * 50) } - - piler.build(builder.iterator, "chr1", 50).pile shouldBe empty - val pile = piler.build(builder.iterator, "chr1", 100).withoutIndels - pile.depth shouldBe 5 + 4 + 3 + 2 + 1 - pile.count(_.base == 'A') shouldBe 5 - pile.count(_.base == 'C') shouldBe 4 - pile.count(_.base == 'G') shouldBe 3 - pile.count(_.base == 'T') shouldBe 2 - pile.count(_.base == 'N') shouldBe 1 - } - - it should "build pileups from reads that contain indels" in { - val piler = new PileupBuilder(dict, mappedPairsOnly=false) - val builder = new SamBuilder(readLength=50, sd=Some(dict)) - builder.addFrag(name="q1", start=101, cigar="10M2D40M").foreach(_.bases = "A" * 50) - builder.addFrag(name="q2", start=101, cigar="10M2I38M").foreach(_.bases = "C" * 50) - builder.addFrag(name="q3", start=101, cigar="31M9I10M").foreach(_.bases = "G" * 50) - builder.addFrag(name="q4", start=101, cigar="30M9D20M").foreach(_.bases = "T" * 50) - builder.addFrag(name="q5", start=141, cigar="10I40M" ).foreach(_.bases = "N" * 50) - - // Before any of the indels, we should have 4 _bases_ - piler.build(builder.iterator, "chr1", 105).baseIterator.size shouldBe 4 - - // The locus before the first insertion and deletion (should include the insertion) - val p1 = piler.build(builder.iterator, "chr1", 110) - p1.depth shouldBe 4 - p1.iterator.size shouldBe 5 - p1.baseIterator.map(_.base.toChar).toSeq.sorted.mkString shouldBe "ACGT" - p1.iterator.collect{ case x: InsertionEntry => x }.map(_.rec.name).next() shouldBe "q2" - p1.baseIterator.toSeq should contain theSameElementsAs p1.withoutIndels.iterator.toSeq - - // The locus of the first deleted base - val p2 = piler.build(builder.iterator, "chr1", 111) - p2.depth shouldBe 4 - p2.iterator.size shouldBe 4 - p2.baseIterator.size shouldBe 3 - p2.baseIterator.map(_.base.toChar).toSeq.sorted.mkString shouldBe "CGT" - p2.iterator.collect{ case x: DeletionEntry => x }.map(_.rec.name).next() shouldBe "q1" - p2.baseIterator.toSeq should contain theSameElementsAs p2.withoutIndels.iterator.toSeq - - // The locus of the bigger indels - val p3 = piler.build(builder.iterator, "chr1", 131) - p3.depth shouldBe 4 - p3.iterator.size shouldBe 5 - p3.baseIterator.size shouldBe 3 - p3.baseIterator.map(_.base.toChar).toSeq.sorted.mkString shouldBe "ACG" - p3.iterator.collect{ case x: InsertionEntry => x }.map(_.rec.name).next() shouldBe "q3" - p3.iterator.collect{ case x: DeletionEntry => x }.map(_.rec.name).next() shouldBe "q4" - p3.baseIterator.toSeq should contain theSameElementsAs p3.withoutIndels.iterator.toSeq - - // Locus where there is a read with a leading indel - val p4 = piler.build(builder.iterator, "chr1", 140) - p4.depth shouldBe 4 // shouldn't count the leading indel - p4.iterator.size shouldBe 5 - p4.baseIterator.size shouldBe 4 - p4.baseIterator.map(_.base.toChar).toSeq.sorted.mkString shouldBe "ACGT" - p4.iterator.collect{ case x: InsertionEntry => x }.map(_.rec.name).next() shouldBe "q5" - p4.baseIterator.toSeq should contain theSameElementsAs p4.withoutIndels.iterator.toSeq - } - - it should "filter out reads below the min mapq" in { - val builder = new SamBuilder(readLength=50, sd=Some(dict)) - builder.addFrag(name="q1", start=101, mapq=9) - builder.addFrag(name="q2", start=101, mapq=10) - builder.addFrag(name="q3", start=101, mapq=11) - val pile = new PileupBuilder(dict, mappedPairsOnly=false, minMapQ=10).build(builder.iterator, "chr1", 105) - pile.depth shouldBe 2 - pile.exists(_.rec.name == "q1") shouldBe false - } - - it should "filter out reads below the min baseq" in { - val builder = new SamBuilder(readLength=50, sd=Some(dict), baseQuality=19) - builder.addFrag(name="q1", start=101) - builder ++= new SamBuilder(readLength=50, sd=Some(dict), baseQuality=20).addFrag(name="q2", start=101) - builder ++= new SamBuilder(readLength=50, sd=Some(dict), baseQuality=21).addFrag(name="q3", start=101) - val pile = new PileupBuilder(dict, mappedPairsOnly=false, minBaseQ=20).build(builder.iterator, "chr1", 105) - pile.depth shouldBe 2 - pile.exists(_.rec.name == "q1") shouldBe false - } - - it should "filter out reads that are not part of a mapped pair" in { - val builder = new SamBuilder(readLength=50, sd=Some(dict)) - builder.addFrag(name="q1", start=101) - builder.addPair(name="q2", start1=101, start2=101, unmapped2=true) - builder.addPair(name="q3", start1=101, start2=300) - val pile = new PileupBuilder(dict, mappedPairsOnly=true).build(builder.iterator, "chr1", 105) - pile.depth shouldBe 1 - pile.exists(r => r.rec.name == "q3" && r.rec.firstOfPair) shouldBe true - } - - it should "filter out reads and bases with custom filters" in { - val piler = new PileupBuilder(dict, mappedPairsOnly=false) - .withReadFilter(r => !r.name.startsWith("x")) - .withBaseFilter(_.offset > 5) - val builder = new SamBuilder(readLength=50, sd=Some(dict)) - builder.addFrag(name="q1", start=100) - builder.addFrag(name="x2", start=104) - builder.addFrag(name="q3", start=108) - builder.addFrag(name="q4", start=112) - - val pile = piler.build(builder.iterator, "chr1", 115) - pile.depth shouldBe 2 - pile.map(_.rec.name) should contain theSameElementsAs Seq("q1", "q3") - } - - it should "remove one half of each overlapping pair" in { - val builder = new SamBuilder(readLength=50, sd=Some(dict)) - builder.addPair(name="q1", start1=100, start2=110) - builder.addPair(name="q2", start1=110, start2=100) - builder.addPair(name="q3", start1= 50, start2=100) - val pile = new PileupBuilder(dict, mappedPairsOnly=true).build(builder.iterator, "chr1", 125) - - pile.depth shouldBe 5 - pile.withoutOverlaps.depth shouldBe 3 - pile.withoutOverlaps.groupBy(_.rec.name).values.foreach(xs => xs.size shouldBe 1) - } - - "PilupRec" should "report correct offsets/positions/bases" in { - val builder = new SamBuilder(readLength=50, sd=Some(dict), baseQuality=35) - builder.addPair(name="q1", start1=101, start2=201, bases1 = "A"*50, bases2 = "C"*50) - - val pile1 = new PileupBuilder(dict, mappedPairsOnly=true).build(builder.iterator, "chr1", 105) - pile1.depth shouldBe 1 - val p1 = pile1.pile.head.asInstanceOf[BaseEntry] - p1.base shouldBe 'A' - p1.baseInReadOrientation shouldBe 'A' - p1.qual shouldBe 35 - p1.offset shouldBe 4 - p1.positionInRead shouldBe 5 - p1.positionInReadInReadOrder shouldBe 5 - - val pile2 = new PileupBuilder(dict, mappedPairsOnly=true).build(builder.iterator, "chr1", 205) - pile2.depth shouldBe 1 - val p2 = pile2.pile.head.asInstanceOf[BaseEntry] - p2.base shouldBe 'C' - p2.baseInReadOrientation shouldBe 'G' - p2.qual shouldBe 35 - p2.offset shouldBe 4 - p2.positionInRead shouldBe 5 - p2.positionInReadInReadOrder shouldBe 46 - } -} diff --git a/src/test/scala/com/fulcrumgenomics/bam/pileup/PileupBuilderTest.scala b/src/test/scala/com/fulcrumgenomics/bam/pileup/PileupBuilderTest.scala new file mode 100644 index 000000000..967ad689b --- /dev/null +++ b/src/test/scala/com/fulcrumgenomics/bam/pileup/PileupBuilderTest.scala @@ -0,0 +1,402 @@ +/* + * The MIT License + * + * Copyright (c) 2017 Fulcrum Genomics LLC + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package com.fulcrumgenomics.bam.pileup + +import com.fulcrumgenomics.bam.api.SamOrder.{Coordinate, Unknown} +import com.fulcrumgenomics.bam.pileup.PileupBuilder.BamAccessPattern +import com.fulcrumgenomics.bam.pileup.PileupBuilder.BamAccessPattern.{RandomAccess, Streaming} +import com.fulcrumgenomics.bam.pileup.PileupBuilderTest._ +import com.fulcrumgenomics.commons.CommonsDef._ +import com.fulcrumgenomics.fasta.{SequenceDictionary, SequenceMetadata} +import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus} +import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} + +/** Companion object for [[PileupBuilderTest]]. */ +object PileupBuilderTest { + + /** The name for chromosome 1. */ + private val Chr1: String = "chr1" + + /** The reference sequence index for chromosome 1. */ + private val Chr1Index: Int = 0 + + /** The name for chromosome 2. */ + private val Chr2: String = "chr2" + + /** The reference sequence index for chromosome 2. */ + private val Chr2Index: Int = 1 + + /** The read length to use for most tests. */ + private val ReadLength: Int = 50 + + /** Small contig to use for tests. */ + private val SmallContigSequence: String = "ACGTGCAACGTGGCCTCAGTGATTATGCGC" * 100 + + /** The custom SAM/BAM sequence dictionary to use for all tests. */ + private val TestSequenceDictionary: SequenceDictionary = { + SequenceDictionary( + SequenceMetadata(Chr1, SmallContigSequence.length, index = Chr1Index), + SequenceMetadata(Chr2, SmallContigSequence.length, index = Chr2Index), + ) + } +} + +/** Unit tests for [[PileupBuilder]]. */ +class PileupBuilderTest extends UnitSpec { + + BamAccessPattern.values.foreach { accessPattern => + s"PileupBuilder.apply with $accessPattern BAM access" should "build a simple pileup of only matched/mismatched fragment reads" in { + val builder = new SamBuilder(readLength = ReadLength, sd = Some(TestSequenceDictionary), sort = Some(Coordinate)) + + 1 to 5 foreach { _ => builder.addFrag(start = 5).foreach(_.bases = "A" * ReadLength) } + 1 to 4 foreach { _ => builder.addFrag(start = 5).foreach(_.bases = "C" * ReadLength) } + 1 to 3 foreach { _ => builder.addFrag(start = 5).foreach(_.bases = "G" * ReadLength) } + 1 to 2 foreach { _ => builder.addFrag(start = 5).foreach(_.bases = "T" * ReadLength) } + 1 to 1 foreach { _ => builder.addFrag(start = 5).foreach(_.bases = "N" * ReadLength) } + + val source = builder.toSource + val piler = PileupBuilder(source, accessPattern = accessPattern, mappedPairsOnly = false) + val pile = piler.pileup(Chr1, 5).withoutIndels + + pile.depth shouldBe 5 + 4 + 3 + 2 + 1 + pile.count(_.base == 'A') shouldBe 5 + pile.count(_.base == 'C') shouldBe 4 + pile.count(_.base == 'G') shouldBe 3 + pile.count(_.base == 'T') shouldBe 2 + pile.count(_.base == 'N') shouldBe 1 + source.safelyClose() + piler.safelyClose() + } + + it should "pileup only matched/mismatched fragment reads while ignoring any that occur on a previous locus" in { + val builder = new SamBuilder(readLength = ReadLength, sd = Some(TestSequenceDictionary), sort = Some(Coordinate)) + + // Occurs on a previous reference sequence + 1 to 5 foreach { _ => builder.addFrag(start = 55, contig = Chr1Index).foreach(_.bases = "A" * ReadLength) } + + // Occurs in a previous position and does not overlap requested pileup locus (they are abutting). + 1 to 5 foreach { _ => builder.addFrag(start = 5, contig = Chr2Index).foreach(_.bases = "A" * ReadLength) } + + 1 to 5 foreach { _ => builder.addFrag(start = 55, contig = Chr2Index).foreach(_.bases = "A" * ReadLength) } + 1 to 4 foreach { _ => builder.addFrag(start = 55, contig = Chr2Index).foreach(_.bases = "C" * ReadLength) } + 1 to 3 foreach { _ => builder.addFrag(start = 55, contig = Chr2Index).foreach(_.bases = "G" * ReadLength) } + 1 to 2 foreach { _ => builder.addFrag(start = 55, contig = Chr2Index).foreach(_.bases = "T" * ReadLength) } + 1 to 1 foreach { _ => builder.addFrag(start = 55, contig = Chr2Index).foreach(_.bases = "N" * ReadLength) } + + val source = builder.toSource + val piler = PileupBuilder(source, accessPattern = accessPattern, mappedPairsOnly = false) + val pile = piler.pileup(Chr2, 55).withoutIndels + + pile.depth shouldBe 5 + 4 + 3 + 2 + 1 + pile.count(_.base == 'A') shouldBe 5 + pile.count(_.base == 'C') shouldBe 4 + pile.count(_.base == 'G') shouldBe 3 + pile.count(_.base == 'T') shouldBe 2 + pile.count(_.base == 'N') shouldBe 1 + source.safelyClose() + piler.safelyClose() + } + + it should "build a pileup that contains all edge cases of indels" in { + val builder = new SamBuilder(readLength = ReadLength, sd = Some(TestSequenceDictionary), sort = Some(Coordinate)) + + builder.addFrag(name = "q1", start = 101, cigar = "10M2D40M").foreach(_.bases = "A" * ReadLength) + builder.addFrag(name = "q2", start = 101, cigar = "10M2I38M").foreach(_.bases = "C" * ReadLength) + builder.addFrag(name = "q3", start = 101, cigar = "31M9I10M").foreach(_.bases = "G" * ReadLength) + builder.addFrag(name = "q4", start = 101, cigar = "30M9D20M").foreach(_.bases = "T" * ReadLength) + builder.addFrag(name = "q5", start = 141, cigar = "10I40M" ).foreach(_.bases = "N" * ReadLength) + + val source = builder.toSource + val piler = PileupBuilder(source, accessPattern = accessPattern, mappedPairsOnly = false) + + // Before any of the indels, we should have 4 _bases_ + piler.pileup(Chr1, 105).baseIterator.size shouldBe 4 + + // The locus before the first insertion and deletion (should include the insertion) + val p1 = piler.pileup(Chr1, 110) + p1.depth shouldBe 4 + p1.iterator.size shouldBe 5 + p1.baseIterator.map(_.base.toChar).toSeq.sorted.mkString shouldBe "ACGT" + p1.iterator.collect{ case x: InsertionEntry => x }.map(_.rec.name).next() shouldBe "q2" + p1.baseIterator.toSeq should contain theSameElementsAs p1.withoutIndels.iterator.toSeq + + // The locus of the first deleted base + val p2 = piler.pileup(Chr1, 111) + p2.depth shouldBe 4 + p2.iterator.size shouldBe 4 + p2.baseIterator.size shouldBe 3 + p2.baseIterator.map(_.base.toChar).toSeq.sorted.mkString shouldBe "CGT" + p2.iterator.collect{ case x: DeletionEntry => x }.map(_.rec.name).next() shouldBe "q1" + p2.baseIterator.toSeq should contain theSameElementsAs p2.withoutIndels.iterator.toSeq + + // The locus of the bigger indels + val p3 = piler.pileup(Chr1, 131) + p3.depth shouldBe 4 + p3.iterator.size shouldBe 5 + p3.baseIterator.size shouldBe 3 + p3.baseIterator.map(_.base.toChar).toSeq.sorted.mkString shouldBe "ACG" + p3.iterator.collect{ case x: InsertionEntry => x }.map(_.rec.name).next() shouldBe "q3" + p3.iterator.collect{ case x: DeletionEntry => x }.map(_.rec.name).next() shouldBe "q4" + p3.baseIterator.toSeq should contain theSameElementsAs p3.withoutIndels.iterator.toSeq + + // Locus where there is a read with a leading indel + val p4 = piler.pileup(Chr1, 140) + p4.depth shouldBe 4 // shouldn't count the leading indel + p4.iterator.size shouldBe 5 + p4.baseIterator.size shouldBe 4 + p4.baseIterator.map(_.base.toChar).toSeq.sorted.mkString shouldBe "ACGT" + p4.iterator.collect{ case x: InsertionEntry => x }.map(_.rec.name).next() shouldBe "q5" + p4.baseIterator.toSeq should contain theSameElementsAs p4.withoutIndels.iterator.toSeq + + source.safelyClose() + piler.safelyClose() + } + + it should "allow a repeated locus query for a coordinate-maintaining call" in { + val builder = new SamBuilder(readLength = ReadLength, sd = Some(TestSequenceDictionary), sort = Some(Coordinate)) + + builder.addFrag(start = 5).foreach(_.bases = "N" * ReadLength) + + val source = builder.toSource + val piler = PileupBuilder(source, accessPattern = accessPattern, mappedPairsOnly = false) + + val pile1 = piler.pileup(Chr1, 5).withoutIndels + pile1.depth shouldBe 1 + pile1.count(_.base == 'N') shouldBe 1 + + val pile2 = piler.pileup(Chr1, 5).withoutIndels + pile2.depth shouldBe 1 + pile2.count(_.base == 'N') shouldBe 1 + + source.safelyClose() + piler.safelyClose() + } + + it should "filter out reads below the minimum mapping quality" in { + val builder = new SamBuilder(readLength = ReadLength, sd = Some(TestSequenceDictionary), sort = Some(Coordinate)) + + builder.addFrag(name = "q1", start = 101, mapq = 9) + builder.addFrag(name = "q2", start = 101, mapq = 10) + builder.addFrag(name = "q3", start = 101, mapq = 11) + + val source = builder.toSource + val piler = PileupBuilder(source, accessPattern = accessPattern, minMapQ = 10, mappedPairsOnly = false) + val pile = piler.pileup(Chr1, 105) + + pile.depth shouldBe 2 + pile.exists(_.rec.name == "q1") shouldBe false + + source.safelyClose() + piler.safelyClose() + } + + it should "filter out base entries below the minimum base quality" in { + val builder = new SamBuilder(readLength = ReadLength, sd = Some(TestSequenceDictionary), baseQuality = 19, sort = Some(Coordinate)) + + builder.addFrag(name = "q1", start = 101) + builder ++= new SamBuilder( + readLength = ReadLength, + sd = Some(TestSequenceDictionary), + baseQuality = 20 + ).addFrag(name = "q2", start = 101) + builder ++= new SamBuilder( + readLength = ReadLength, + sd = Some(TestSequenceDictionary), + baseQuality = 21 + ).addFrag(name = "q3", start = 101) + + val source = builder.toSource + val piler = PileupBuilder(source, accessPattern = accessPattern, mappedPairsOnly = false) + val pile = piler.pileup(Chr1, 105) + + pile.depth shouldBe 2 + pile.exists(_.rec.name == "q1") shouldBe false + + source.safelyClose() + piler.safelyClose() + } + + it should "filter out reads that are not a part of a mapped pair" in { + val builder = new SamBuilder(readLength = ReadLength, sd = Some(TestSequenceDictionary), sort = Some(Coordinate)) + + builder.addFrag(name = "q1", start = 101) + builder.addPair(name = "q2", start1 = 101, start2 = 101, unmapped2 = true) + builder.addPair(name = "q3", start1 = 101, start2 = 300) + + val source = builder.toSource + val piler = PileupBuilder(source, accessPattern = accessPattern) + val pile = piler.pileup(Chr1, 105) + + pile.depth shouldBe 1 + pile.exists(r => r.rec.name == "q3" && r.rec.firstOfPair) shouldBe true + + source.safelyClose() + piler.safelyClose() + } + + it should "remove one half of each overlapping pair" in { + val builder = new SamBuilder(readLength = ReadLength, sd = Some(TestSequenceDictionary), sort = Some(Coordinate)) + + builder.addPair(name = "q1", start1 = 100, start2 = 110) + builder.addPair(name = "q2", start1 = 110, start2 = 100) + builder.addPair(name = "q3", start1 = 50, start2 = 100) + + val source = builder.toSource + val piler = PileupBuilder(source, accessPattern = accessPattern) + val pile = piler.pileup(Chr1, 125) + + pile.depth shouldBe 5 + pile.withoutOverlaps.depth shouldBe 3 + pile.withoutOverlaps.groupBy(_.rec.name).values.foreach(xs => xs.size shouldBe 1) + + source.safelyClose() + piler.safelyClose() + } + + it should "filter out records where a position is outside the insert for an FR pair" in { + val builder = new SamBuilder(readLength = ReadLength, sd = Some(TestSequenceDictionary), sort = Some(Coordinate)) + + builder.addPair(name = "q2", start1 = 101, start2 = 100) + + val source = builder.toSource + val piler = PileupBuilder(source, accessPattern = accessPattern) + + piler.pileup(Chr1, 100).depth shouldBe 0 + piler.pileup(Chr1, 101).depth shouldBe 2 + piler.pileup(Chr1, 100 + ReadLength - 1).depth shouldBe 2 + piler.pileup(Chr1, 101 + ReadLength - 1).depth shouldBe 0 + + source.safelyClose() + piler.safelyClose() + } + + it should "include records where a position is outside the insert for an FR pair if asked to" in { + val builder = new SamBuilder(readLength = ReadLength, sd = Some(TestSequenceDictionary), sort = Some(Coordinate)) + + builder.addPair(name = "q2", start1 = 101, start2 = 100) + + val source = builder.toSource + val piler = PileupBuilder(source, accessPattern = accessPattern, includeMapPositionsOutsideFrInsert = true) + + piler.pileup(Chr1, 100).depth shouldBe 1 + piler.pileup(Chr1, 101).depth shouldBe 2 + piler.pileup(Chr1, 100 + ReadLength - 1).depth shouldBe 2 + piler.pileup(Chr1, 101 + ReadLength - 1).depth shouldBe 1 + + source.safelyClose() + piler.safelyClose() + } + + it should "not filter out records where a position is outside what might look like an 'insert' for a non-FR pair" in { + val builder = new SamBuilder(readLength = ReadLength, sd = Some(TestSequenceDictionary), sort = Some(Coordinate)) + + builder.addPair(name = "q2", start1 = 101, start2 = 100, strand1 = Minus, strand2 = Plus) + + val source = builder.toSource + val piler = PileupBuilder(source, accessPattern = accessPattern) + + piler.pileup(Chr1, 100).depth shouldBe 1 + piler.pileup(Chr1, 101).depth shouldBe 2 + piler.pileup(Chr1, 100 + ReadLength - 1).depth shouldBe 2 + piler.pileup(Chr1, 101 + ReadLength - 1).depth shouldBe 1 + + source.safelyClose() + piler.safelyClose() + } + + it should "filter out records and entries with custom filters" in { + val builder = new SamBuilder(readLength = ReadLength, sd = Some(TestSequenceDictionary), sort = Some(Coordinate)) + + builder.addFrag(name = "q1", start = 100) + builder.addFrag(name = "x2", start = 104) + builder.addFrag(name = "q3", start = 108) + builder.addFrag(name = "q4", start = 112) + + val source = builder.toSource + val piler = PileupBuilder(source, accessPattern = accessPattern, mappedPairsOnly = false) + .withReadFilter(r => !r.name.startsWith("x")) + .withEntryFilter(_.offset > 5) + + val pile = piler.pileup(Chr1, 115) + + pile.depth shouldBe 2 + pile.map(_.rec.name) should contain theSameElementsAs Seq("q1", "q3") + + source.safelyClose() + piler.safelyClose() + } + + it should "report the correct positions and offsets from pileup entries" in { + val builder = new SamBuilder(readLength = ReadLength, sd = Some(TestSequenceDictionary), baseQuality = 35, sort = Some(Coordinate)) + + builder.addPair(name = "q1", start1 = 101, start2 = 201, bases1 = "A" * ReadLength, bases2 = "C" * ReadLength) + + val source = builder.toSource + + val piler1 = PileupBuilder(source, accessPattern = accessPattern) + val pile1 = piler1.pileup(Chr1, 105) + val p1 = pile1.pile.head.asInstanceOf[BaseEntry] + + pile1.depth shouldBe 1 + p1.base shouldBe 'A' + p1.baseInReadOrientation shouldBe 'A' + p1.qual shouldBe 35 + p1.offset shouldBe 4 + p1.positionInRead shouldBe 5 + p1.positionInReadInReadOrder shouldBe 5 + + val piler2 = PileupBuilder(source, accessPattern = accessPattern) + val pile2 = piler2.pileup(Chr1, 205) + val p2 = pile2.pile.head.asInstanceOf[BaseEntry] + + pile2.depth shouldBe 1 + p2.base shouldBe 'C' + p2.baseInReadOrientation shouldBe 'G' + p2.qual shouldBe 35 + p2.offset shouldBe 4 + p2.positionInRead shouldBe 5 + p2.positionInReadInReadOrder shouldBe 46 + + source.safelyClose() + piler1.safelyClose() + piler2.safelyClose() + } + } + + it should "raise an exception if the input SAM source is not coordinate sorted when BAM access is `RandomAccess`" in { + val builder = new SamBuilder(readLength = ReadLength, sd = Some(TestSequenceDictionary), sort = Some(Unknown)) + val source = builder.toSource + require(!source.indexed) + an[IllegalArgumentException] shouldBe thrownBy { PileupBuilder(source, accessPattern = RandomAccess) } + source.safelyClose() + } + + it should "raise an exception if the input SAM source is not coordinate sorted when BAM access is `Streaming`" in { + val builder = new SamBuilder(readLength = ReadLength, sd = Some(TestSequenceDictionary), sort = Some(Unknown)) + val source = builder.toSource + an[IllegalArgumentException] shouldBe thrownBy { PileupBuilder(source, accessPattern = Streaming) } + source.safelyClose() + } +} diff --git a/src/test/scala/com/fulcrumgenomics/bam/pileup/PileupTest.scala b/src/test/scala/com/fulcrumgenomics/bam/pileup/PileupTest.scala new file mode 100644 index 000000000..54253275d --- /dev/null +++ b/src/test/scala/com/fulcrumgenomics/bam/pileup/PileupTest.scala @@ -0,0 +1,89 @@ +/* + * The MIT License + * + * Copyright (c) 2017 Fulcrum Genomics LLC + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package com.fulcrumgenomics.bam.pileup + +import com.fulcrumgenomics.FgBioDef.SafelyClosable +import com.fulcrumgenomics.bam.api.SamOrder.Coordinate +import com.fulcrumgenomics.bam.pileup.PileupTest._ +import com.fulcrumgenomics.fasta.{SequenceDictionary, SequenceMetadata} +import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} + +/** Companion object for [[PileupTest]]. */ +object PileupTest { + + /** The name for chromosome 1. */ + private val Chr1: String = "chr1" + + /** The reference sequence index for chromosome 1. */ + private val Chr1Index: Int = 0 + + /** The read length to use for most tests. */ + private val ReadLength: Int = 50 + + /** Small contig to use for tests. */ + private val SmallContigSequence: String = "ACGTGCAACGTGGCCTCAGTGATTATGCGC" * 100 + + /** The custom SAM/BAM sequence dictionary to use for all tests. */ + private val TestSequenceDictionary: SequenceDictionary = { + SequenceDictionary(SequenceMetadata(Chr1, SmallContigSequence.length, index = Chr1Index)) + } +} + +/** Unit tests for [[Pileup]]. */ +class PileupTest extends UnitSpec { + + "BaseEntry" should "report the correct offsets/positions/bases" in { + val builder = new SamBuilder(readLength = ReadLength, sd = Some(TestSequenceDictionary), baseQuality = 35, sort = Some(Coordinate)) + builder.addPair(name = "q1", start1 = 101, start2 = 201, bases1 = "A" * ReadLength, bases2 = "C" * ReadLength) + + val source1 = builder.toSource + val piler1 = PileupBuilder(source1, mappedPairsOnly = true) + val pile1 = piler1.pileup(Chr1, 105) + pile1.depth shouldBe 1 + val p1 = pile1.pile.head.asInstanceOf[BaseEntry] + p1.base shouldBe 'A' + p1.baseInReadOrientation shouldBe 'A' + p1.qual shouldBe 35 + p1.offset shouldBe 4 + p1.positionInRead shouldBe 5 + p1.positionInReadInReadOrder shouldBe 5 + source1.safelyClose() + piler1.safelyClose() + + val source2 = builder.toSource + val piler2 = PileupBuilder(source2, mappedPairsOnly = true) + val pile2 = piler2.pileup(Chr1, 205) + pile2.depth shouldBe 1 + val p2 = pile2.pile.head.asInstanceOf[BaseEntry] + p2.base shouldBe 'C' + p2.baseInReadOrientation shouldBe 'G' + p2.qual shouldBe 35 + p2.offset shouldBe 4 + p2.positionInRead shouldBe 5 + p2.positionInReadInReadOrder shouldBe 46 + source2.safelyClose() + piler2.safelyClose() + } +} diff --git a/src/test/scala/com/fulcrumgenomics/bam/pileup/RandomAccessPileupBuilderTest.scala b/src/test/scala/com/fulcrumgenomics/bam/pileup/RandomAccessPileupBuilderTest.scala new file mode 100644 index 000000000..86774b702 --- /dev/null +++ b/src/test/scala/com/fulcrumgenomics/bam/pileup/RandomAccessPileupBuilderTest.scala @@ -0,0 +1,76 @@ +/* + * The MIT License + * + * Copyright (c) 2022 Fulcrum Genomics LLC + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package com.fulcrumgenomics.bam.pileup + +import com.fulcrumgenomics.bam.api.SamOrder.Coordinate +import com.fulcrumgenomics.bam.pileup.RandomAccessPileupBuilderTest._ +import com.fulcrumgenomics.commons.CommonsDef._ +import com.fulcrumgenomics.fasta.{SequenceDictionary, SequenceMetadata} +import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} + +/** Companion object for [[RandomAccessPileupBuilderTest]]. */ +object RandomAccessPileupBuilderTest { + + /** The name for chromosome 1. */ + private val Chr1: String = "chr1" + + /** The reference sequence index for chromosome 1. */ + private val Chr1Index: Int = 0 + + /** The read length to use for most tests. */ + private val ReadLength: Int = 50 + + /** Small contig to use for tests. */ + private val SmallContigSequence: String = "ACGTGCAACGTGGCCTCAGTGATTATGCGC" * 100 + + /** The custom SAM/BAM sequence dictionary to use for all tests. */ + private val TestSequenceDictionary: SequenceDictionary = { + SequenceDictionary(SequenceMetadata(Chr1, SmallContigSequence.length, index = Chr1Index)) + } +} + +/** Unit tests for [[RandomAccessPileupBuilder]]. */ +class RandomAccessPileupBuilderTest extends UnitSpec { + + "RandomAccessPileupBuilder" should "allow for query of a locus prior to the last locus" in { + val builder = new SamBuilder(readLength = ReadLength, sd = Some(TestSequenceDictionary), sort = Some(Coordinate)) + + builder.addFrag(start = 5).foreach(_.bases = "N" * ReadLength) + + val source = builder.toSource + val piler = RandomAccessPileupBuilder(source, mappedPairsOnly = false) + + val pile1 = piler.pileup(Chr1, 6).withoutIndels + pile1.depth shouldBe 1 + pile1.count(_.base == 'N') shouldBe 1 + + val pile2 = piler.pileup(Chr1, 5).withoutIndels + pile2.depth shouldBe 1 + pile2.count(_.base == 'N') shouldBe 1 + + source.safelyClose() + piler.safelyClose() + } +} diff --git a/src/test/scala/com/fulcrumgenomics/bam/pileup/StreamingPileupBuilderTest.scala b/src/test/scala/com/fulcrumgenomics/bam/pileup/StreamingPileupBuilderTest.scala new file mode 100644 index 000000000..badcb64ac --- /dev/null +++ b/src/test/scala/com/fulcrumgenomics/bam/pileup/StreamingPileupBuilderTest.scala @@ -0,0 +1,74 @@ +/* + * The MIT License + * + * Copyright (c) 2022 Fulcrum Genomics LLC + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package com.fulcrumgenomics.bam.pileup + +import com.fulcrumgenomics.bam.api.SamOrder.Coordinate +import com.fulcrumgenomics.bam.pileup.StreamingPileupBuilderTest._ +import com.fulcrumgenomics.commons.CommonsDef._ +import com.fulcrumgenomics.fasta.{SequenceDictionary, SequenceMetadata} +import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} + +/** Companion object for [[StreamingPileupBuilderTest]]. */ +object StreamingPileupBuilderTest { + + /** The name for chromosome 1. */ + private val Chr1: String = "chr1" + + /** The reference sequence index for chromosome 1. */ + private val Chr1Index: Int = 0 + + /** The read length to use for most tests. */ + private val ReadLength: Int = 50 + + /** Small contig to use for tests. */ + private val SmallContigSequence: String = "ACGTGCAACGTGGCCTCAGTGATTATGCGC" * 100 + + /** The custom SAM/BAM sequence dictionary to use for all tests. */ + private val TestSequenceDictionary: SequenceDictionary = { + SequenceDictionary(SequenceMetadata(Chr1, SmallContigSequence.length, index = Chr1Index)) + } +} + +/** Unit tests for [[StreamingPileupBuilder]]. */ +class StreamingPileupBuilderTest extends UnitSpec { + + "StreamingPileupBuilder" should "raise an exception if asked to query a locus prior to the last locus" in { + val builder = new SamBuilder(readLength = ReadLength, sd = Some(TestSequenceDictionary), sort = Some(Coordinate)) + + builder.addFrag(start = 5).foreach(_.bases = "N" * ReadLength) + + val source = builder.toSource + val piler = StreamingPileupBuilder(source, mappedPairsOnly = false) + + val pile1 = piler.pileup(Chr1, 6).withoutIndels + pile1.depth shouldBe 1 + pile1.count(_.base == 'N') shouldBe 1 + + an[IllegalArgumentException] shouldBe thrownBy { piler.pileup(Chr1, 5) } + + source.safelyClose() + piler.safelyClose() + } +} diff --git a/src/test/scala/com/fulcrumgenomics/coord/LocatableOrderingTest.scala b/src/test/scala/com/fulcrumgenomics/coord/LocatableOrderingTest.scala new file mode 100644 index 000000000..7b13d9f66 --- /dev/null +++ b/src/test/scala/com/fulcrumgenomics/coord/LocatableOrderingTest.scala @@ -0,0 +1,106 @@ +/* + * The MIT License + * + * Copyright (c) 2022 Fulcrum Genomics LLC + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package com.fulcrumgenomics.coord + +import com.fulcrumgenomics.coord.LocatableOrderingTest._ +import com.fulcrumgenomics.fasta.{SequenceDictionary, SequenceMetadata} +import com.fulcrumgenomics.testing.UnitSpec +import htsjdk.samtools.util.{Interval, Locatable} + +/** Companion object for [[LocatableOrderingTest]]. */ +object LocatableOrderingTest { + + /** The reference sequence name for chromosome 1. */ + private val Chr1: String = "chr1" + + /** The reference sequence name for chromosome 2. */ + private val Chr2: String = "chr2" + + /** The sequence dictionary for the ordering. */ + private val Dict: SequenceDictionary = SequenceDictionary(SequenceMetadata(Chr1), SequenceMetadata(Chr2)) + + /** The ordering of the given . */ + private val Ordering: Ordering[Locatable] = LocatableOrdering(Dict) +} + +/** Unit tests for [[LocatableOrdering]]. */ +class LocatableOrderingTest extends UnitSpec { + + "LocatableOrdering" should "know when two locatables are equivalent" in { + val interval1 = new Interval(Chr1, 1, 1) + val interval2 = new Interval(Chr1, 1, 1) + Ordering.compare(interval1, interval2) shouldBe 0 + } + + it should "know when one Locatable is more 'left' than another Locatable on the same contig" in { + val interval1 = new Interval(Chr1, 1, 1) + val interval2 = new Interval(Chr1, 2, 2) + Ordering.compare(interval1, interval2) should be < 0 + } + + it should "know when one Locatable is more 'right' than another Locatable on the same contig" in { + val interval1 = new Interval(Chr1, 2, 2) + val interval2 = new Interval(Chr1, 1, 2) + Ordering.compare(interval1, interval2) should be > 0 + } + + it should "know when one Locatable is more 'left' than another Locatable on a further 'right' contig" in { + val interval1 = new Interval(Chr1, 1, 1) + val interval2 = new Interval(Chr2, 1, 1) + Ordering.compare(interval1, interval2) should be < 0 + } + + it should "know when one Locatable is more 'right' than another Locatable on a further 'left' contig" in { + val interval1 = new Interval(Chr2, 1, 1) + val interval2 = new Interval(Chr1, 1, 1) + Ordering.compare(interval1, interval2) should be > 0 + } + + it should "raise an exception when any of the locatables are aligned to contigs that don't exist" in { + val interval1 = new Interval(Chr1, 1, 1) + val interval2 = new Interval("ChrDoesNotExist", 1, 1) + a[NoSuchElementException] shouldBe thrownBy { Ordering.compare(interval1, interval2) } + } + + it should "order genomic locatables first by contig, then start, then end" in { + val intervals = Seq( + new Interval(Chr2, 2, 5), + new Interval(Chr2, 2, 9), + new Interval(Chr1, 2, 5), + new Interval(Chr1, 2, 5), + new Interval(Chr1, 4, 5) + ) + val expected = Seq( + new Interval(Chr1, 2, 5), + new Interval(Chr1, 2, 5), + new Interval(Chr1, 4, 5), + new Interval(Chr2, 2, 5), + new Interval(Chr2, 2, 9) + ) + intervals.min(Ordering) shouldBe new Interval(Chr1, 2, 5) + intervals.max(Ordering) shouldBe new Interval(Chr2, 2, 9) + intervals.sorted(Ordering) should contain theSameElementsInOrderAs expected + } +} diff --git a/src/test/scala/com/fulcrumgenomics/vcf/api/VcfSourceTest.scala b/src/test/scala/com/fulcrumgenomics/vcf/api/VcfSourceTest.scala new file mode 100644 index 000000000..efb427814 --- /dev/null +++ b/src/test/scala/com/fulcrumgenomics/vcf/api/VcfSourceTest.scala @@ -0,0 +1,22 @@ +package com.fulcrumgenomics.vcf.api + +import com.fulcrumgenomics.FgBioDef.SafelyClosable +import com.fulcrumgenomics.testing.{UnitSpec, VcfBuilder} + +/** Unit tests for [[VcfSource]]. */ +class VcfSourceTest extends UnitSpec { + + /** The name for test sample number 1. */ + private val sample1 = "sample1" + + /** The name for test sample number 2. */ + private val sample2 = "sample2" + + "VcfUtil.onlySample" should "return the only sample in a VCF source" in { + val singleSample = VcfBuilder(samples = Seq(sample1)).toSource + val doubleSample = VcfBuilder(samples = Seq(sample1, sample2)).toSource + VcfSource.onlySample(singleSample) shouldBe sample1 + singleSample.safelyClose() + an[IllegalArgumentException] shouldBe thrownBy { VcfSource.onlySample(doubleSample) } + } +} diff --git a/src/test/scala/com/fulcrumgenomics/vcf/filtration/FilterSomaticVcfTest.scala b/src/test/scala/com/fulcrumgenomics/vcf/filtration/FilterSomaticVcfTest.scala index 3b8c9a6d2..dc8f03183 100644 --- a/src/test/scala/com/fulcrumgenomics/vcf/filtration/FilterSomaticVcfTest.scala +++ b/src/test/scala/com/fulcrumgenomics/vcf/filtration/FilterSomaticVcfTest.scala @@ -24,13 +24,18 @@ package com.fulcrumgenomics.vcf.filtration -import com.fulcrumgenomics.FgBioDef._ import com.fulcrumgenomics.bam.api.SamOrder +import com.fulcrumgenomics.bam.api.SamOrder.Coordinate +import com.fulcrumgenomics.bam.pileup.PileupBuilder.BamAccessPattern +import com.fulcrumgenomics.bam.pileup.PileupBuilder.BamAccessPattern.Streaming +import com.fulcrumgenomics.commons.CommonsDef._ import com.fulcrumgenomics.testing.VcfBuilder.Gt import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec, VcfBuilder} -import com.fulcrumgenomics.vcf.api.VcfSource +import com.fulcrumgenomics.vcf.api._ +/** Unit tests for [[FilterSomaticVcf]]. */ class FilterSomaticVcfTest extends UnitSpec { + // A pair of (paths to) VCFs for use in testing below private lazy val Seq(tumorOnlyVcf, tumorNormalVcf) = { Seq(Seq("tumor"), Seq("tumor", "normal")).map { samples => @@ -121,7 +126,7 @@ class FilterSomaticVcfTest extends UnitSpec { "FilterSomaticVcf" should "work on an empty VCF" in { val emptyVcf = VcfBuilder(Seq("tumor")).toTempFile() val filteredVcf = makeTempFile("filtered.", ".vcf") - new FilterSomaticVcf(input=emptyVcf, output=filteredVcf, bam=bam).execute() + new FilterSomaticVcf(input = emptyVcf, output = filteredVcf, bam = bam).execute() val reader = VcfSource(filteredVcf) reader.header.info.contains(ATailInfoKey) shouldBe true reader.header.filter.contains(ATailFilterKey) shouldBe true @@ -130,88 +135,160 @@ class FilterSomaticVcfTest extends UnitSpec { reader.safelyClose() } - it should "work on a single sample VCF" in { - val filteredVcf = makeTempFile("filtered.", ".vcf") - new FilterSomaticVcf(input=tumorOnlyVcf, output=filteredVcf, bam=bam).execute() - val variants = readVcfRecs(filteredVcf) - variants should have size 5 - variants(0).get[Float](ATailInfoKey).isDefined shouldBe true - variants(1).get[Float](ATailInfoKey).isDefined shouldBe true - variants(2).get[Float](ATailInfoKey).isDefined shouldBe false - variants(3).get[Float](ATailInfoKey).isDefined shouldBe true - variants(4).get[Float](ATailInfoKey).isDefined shouldBe false - variants.exists(_.filters.contains(ATailFilterKey)) shouldBe false // no threshold == no filtering - - variants(0).get[Float](EndRepairInfoKey).isDefined shouldBe true - variants(1).get[Float](EndRepairInfoKey).isDefined shouldBe true - variants(2).get[Float](EndRepairInfoKey).isDefined shouldBe false - variants(3).get[Float](EndRepairInfoKey).isDefined shouldBe true - variants(4).get[Float](EndRepairInfoKey).isDefined shouldBe true - variants.exists(_.filters.contains(EndRepairFilterKey)) shouldBe false // no threshold == no filtering - } + it should "raise an exception when stream BAM is true and variant records are not coordinate ordered" in { + val alleles = AlleleSet(Allele("C"), Allele("A")) + val attrs = Map("AD" -> Seq(8, 2)) + val gt = Genotype(alleles, sample = "sample1", calls = alleles.toIndexedSeq, attrs = attrs) - it should "fail on a single-sample VCF if an invalid sample name is provided" in { - val filteredVcf = makeTempFile("filtered.", ".vcf") - an[Exception] shouldBe thrownBy { new FilterSomaticVcf(input=tumorOnlyVcf, output=filteredVcf, bam=bam, sample=Some("WhoDis")).execute() } - } + val builder = new SamBuilder(sort = Some(Coordinate)) + builder.addFrag(start = 1) - it should "fail on a multi-sample VCF if no sample name is provided" in { - val filteredVcf = makeTempFile("filtered.", ".vcf") - an[Exception] shouldBe thrownBy { new FilterSomaticVcf(input=tumorNormalVcf, output=filteredVcf, bam=bam).execute() } - } + val input = makeTempFile(getClass.getSimpleName, ".vcf") + val output = makeTempFile(getClass.getSimpleName, ".vcf") + val bam = builder.toTempFile() - it should "fail on a multi-sample VCF if an invalid sample name is provided" in { - val filteredVcf = makeTempFile("filtered.", ".vcf") - an[Exception] shouldBe thrownBy { new FilterSomaticVcf(input=tumorNormalVcf, output=filteredVcf, bam=bam, sample=Some("WhoDis")).execute() } - } + val writer = VcfWriter(input, VcfBuilder.DefaultHeader.copy(samples = IndexedSeq("sample1"))) - it should "work on a multi-sample VCF if a sample name is given" in { - val filteredVcf = makeTempFile("filtered.", ".vcf") - new FilterSomaticVcf(input=tumorNormalVcf, output=filteredVcf, bam=bam, sample=Some("tumor")).execute() - val variants = readVcfRecs(filteredVcf) - variants should have size 5 - variants(0).get[Float](ATailInfoKey).isDefined shouldBe true - variants(1).get[Float](ATailInfoKey).isDefined shouldBe true - variants(2).get[Float](ATailInfoKey).isDefined shouldBe false - variants(3).get[Float](ATailInfoKey).isDefined shouldBe true - variants(4).get[Float](ATailInfoKey).isDefined shouldBe false - variants.exists(_.filters.contains(ATailFilterKey)) shouldBe false // no threshold == no filtering - - variants(0).get[Float](EndRepairInfoKey).isDefined shouldBe true - variants(1).get[Float](EndRepairInfoKey).isDefined shouldBe true - variants(2).get[Float](EndRepairInfoKey).isDefined shouldBe false - variants(3).get[Float](EndRepairInfoKey).isDefined shouldBe true - variants(4).get[Float](EndRepairInfoKey).isDefined shouldBe true - variants.exists(_.filters.contains(EndRepairFilterKey)) shouldBe false // no threshold == no filtering + writer.write(Variant(builder.dict.head.name, pos = 2, alleles = alleles, genotypes = Map("sample1" -> gt))) + writer.write(Variant(builder.dict.head.name, pos = 1, alleles = alleles, genotypes = Map("sample1" -> gt))) + writer.close() + + an[IllegalArgumentException] shouldBe thrownBy { + new FilterSomaticVcf( + input = input, + output = output, + bam = bam, + accessPattern = Streaming + ).execute() + } } - it should "apply filters if filter-specific p-value thresholds are supplied" in { - val filteredVcf = makeTempFile("filtered.", ".vcf") - new FilterSomaticVcf(input=tumorOnlyVcf, output=filteredVcf, bam=bam, sample=Some("tumor"), aTailingDistance=Some(4), aTailingPValue=Some(0.001), endRepairFillInPValue = Some(0.001)).execute() - val variants = readVcfRecs(filteredVcf) - variants should have size 5 - variants(0).get[Float](ATailInfoKey).isDefined shouldBe true - variants(1).get[Float](ATailInfoKey).isDefined shouldBe true - variants(2).get[Float](ATailInfoKey).isDefined shouldBe false - variants(3).get[Float](ATailInfoKey).isDefined shouldBe true - variants(4).get[Float](ATailInfoKey).isDefined shouldBe false - - variants(0).filters.contains(ATailFilterKey) shouldBe true - variants(1).filters.contains(ATailFilterKey) shouldBe false - variants(2).filters.contains(ATailFilterKey) shouldBe false - variants(3).filters.contains(ATailFilterKey) shouldBe true - variants(4).filters.contains(ATailFilterKey) shouldBe false - - variants(0).get[Float](EndRepairInfoKey).isDefined shouldBe true - variants(1).get[Float](EndRepairInfoKey).isDefined shouldBe true - variants(2).get[Float](EndRepairInfoKey).isDefined shouldBe false - variants(3).get[Float](EndRepairInfoKey).isDefined shouldBe true - variants(4).get[Float](EndRepairInfoKey).isDefined shouldBe true - - variants(0).filters.contains(EndRepairFilterKey) shouldBe true - variants(1).filters.contains(EndRepairFilterKey) shouldBe false - variants(2).filters.contains(EndRepairFilterKey) shouldBe false - variants(3).filters.contains(EndRepairFilterKey) shouldBe true - variants(4).filters.contains(EndRepairFilterKey) shouldBe true + BamAccessPattern.values.foreach { accessPattern => + it should s"work on a single sample VCF when BAM access is: $accessPattern" in { + val filteredVcf = makeTempFile("filtered.", ".vcf") + new FilterSomaticVcf(input = tumorOnlyVcf, output = filteredVcf, bam = bam, accessPattern = accessPattern).execute() + val variants = readVcfRecs(filteredVcf) + variants should have size 5 + variants(0).get[Float](ATailInfoKey).isDefined shouldBe true + variants(1).get[Float](ATailInfoKey).isDefined shouldBe true + variants(2).get[Float](ATailInfoKey).isDefined shouldBe false + variants(3).get[Float](ATailInfoKey).isDefined shouldBe true + variants(4).get[Float](ATailInfoKey).isDefined shouldBe false + variants.exists(_.filters.contains(ATailFilterKey)) shouldBe false // no threshold == no filtering + + variants(0).get[Float](EndRepairInfoKey).isDefined shouldBe true + variants(1).get[Float](EndRepairInfoKey).isDefined shouldBe true + variants(2).get[Float](EndRepairInfoKey).isDefined shouldBe false + variants(3).get[Float](EndRepairInfoKey).isDefined shouldBe true + variants(4).get[Float](EndRepairInfoKey).isDefined shouldBe true + variants.exists(_.filters.contains(EndRepairFilterKey)) shouldBe false // no threshold == no filtering + } + + it should s"fail on a single-sample VCF if an invalid sample name is provided when BAM access is: $accessPattern" in { + val filteredVcf = makeTempFile("filtered.", ".vcf") + an[AssertionError] shouldBe thrownBy { + new FilterSomaticVcf( + input = tumorOnlyVcf, + output = filteredVcf, + bam = bam, + sample = Some("WhoDis"), + accessPattern = accessPattern + ).execute() + } + } + + it should s"fail on a multi-sample VCF if no sample name is provided when BAM access is: $accessPattern" in { + val filteredVcf = makeTempFile("filtered.", ".vcf") + an[IllegalArgumentException] shouldBe thrownBy { + new FilterSomaticVcf( + input = tumorNormalVcf, + output = filteredVcf, + bam = bam, + accessPattern = accessPattern + ).execute() + } + } + + it should s"fail on a multi-sample VCF if an invalid sample name is provided when BAM access is: $accessPattern" in { + val filteredVcf = makeTempFile("filtered.", ".vcf") + an[AssertionError] shouldBe thrownBy { + new FilterSomaticVcf( + input = tumorNormalVcf, + output = filteredVcf, + bam = bam, + sample = Some("WhoDis"), + accessPattern = accessPattern + ).execute() + } + } + + it should s"work on a multi-sample VCF if a sample name is given when BAM access is: $accessPattern" in { + val filteredVcf = makeTempFile("filtered.", ".vcf") + + new FilterSomaticVcf( + input = tumorNormalVcf, + output = filteredVcf, + bam = bam, + sample = Some("tumor"), + accessPattern = accessPattern + ).execute() + + val variants = readVcfRecs(filteredVcf) + variants should have size 5 + variants(0).get[Float](ATailInfoKey).isDefined shouldBe true + variants(1).get[Float](ATailInfoKey).isDefined shouldBe true + variants(2).get[Float](ATailInfoKey).isDefined shouldBe false + variants(3).get[Float](ATailInfoKey).isDefined shouldBe true + variants(4).get[Float](ATailInfoKey).isDefined shouldBe false + variants.exists(_.filters.contains(ATailFilterKey)) shouldBe false // no threshold == no filtering + + variants(0).get[Float](EndRepairInfoKey).isDefined shouldBe true + variants(1).get[Float](EndRepairInfoKey).isDefined shouldBe true + variants(2).get[Float](EndRepairInfoKey).isDefined shouldBe false + variants(3).get[Float](EndRepairInfoKey).isDefined shouldBe true + variants(4).get[Float](EndRepairInfoKey).isDefined shouldBe true + variants.exists(_.filters.contains(EndRepairFilterKey)) shouldBe false // no threshold == no filtering + } + + it should s"apply filters if filter-specific p-value thresholds are supplied when BAM access is: $accessPattern" in { + val filteredVcf = makeTempFile("filtered.", ".vcf") + + new FilterSomaticVcf( + input = tumorOnlyVcf, + output = filteredVcf, + bam = bam, + sample = Some("tumor"), + aTailingDistance = Some(4), + aTailingPValue = Some(0.001), + endRepairFillInPValue = Some(0.001), + accessPattern = accessPattern + ).execute() + + val variants = readVcfRecs(filteredVcf) + variants should have size 5 + variants(0).get[Float](ATailInfoKey).isDefined shouldBe true + variants(1).get[Float](ATailInfoKey).isDefined shouldBe true + variants(2).get[Float](ATailInfoKey).isDefined shouldBe false + variants(3).get[Float](ATailInfoKey).isDefined shouldBe true + variants(4).get[Float](ATailInfoKey).isDefined shouldBe false + + variants(0).filters.contains(ATailFilterKey) shouldBe true + variants(1).filters.contains(ATailFilterKey) shouldBe false + variants(2).filters.contains(ATailFilterKey) shouldBe false + variants(3).filters.contains(ATailFilterKey) shouldBe true + variants(4).filters.contains(ATailFilterKey) shouldBe false + + variants(0).get[Float](EndRepairInfoKey).isDefined shouldBe true + variants(1).get[Float](EndRepairInfoKey).isDefined shouldBe true + variants(2).get[Float](EndRepairInfoKey).isDefined shouldBe false + variants(3).get[Float](EndRepairInfoKey).isDefined shouldBe true + variants(4).get[Float](EndRepairInfoKey).isDefined shouldBe true + + variants(0).filters.contains(EndRepairFilterKey) shouldBe true + variants(1).filters.contains(EndRepairFilterKey) shouldBe false + variants(2).filters.contains(EndRepairFilterKey) shouldBe false + variants(3).filters.contains(EndRepairFilterKey) shouldBe true + variants(4).filters.contains(EndRepairFilterKey) shouldBe true + } } } diff --git a/src/test/scala/com/fulcrumgenomics/vcf/filtration/ReadEndSomaticVariantFilterTest.scala b/src/test/scala/com/fulcrumgenomics/vcf/filtration/ReadEndSomaticVariantFilterTest.scala index 3f7561a13..b6ac7c3e9 100644 --- a/src/test/scala/com/fulcrumgenomics/vcf/filtration/ReadEndSomaticVariantFilterTest.scala +++ b/src/test/scala/com/fulcrumgenomics/vcf/filtration/ReadEndSomaticVariantFilterTest.scala @@ -24,8 +24,10 @@ package com.fulcrumgenomics.vcf.filtration -import com.fulcrumgenomics.FgBioDef._ -import com.fulcrumgenomics.bam.{BaseEntry, PileupBuilder} +import com.fulcrumgenomics.bam.api.SamOrder.Coordinate +import com.fulcrumgenomics.bam.pileup.PileupBuilder.BamAccessPattern.{RandomAccess, Streaming} +import com.fulcrumgenomics.bam.pileup._ +import com.fulcrumgenomics.commons.CommonsDef._ import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus} import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} import com.fulcrumgenomics.vcf.api.{AlleleSet, Genotype} @@ -116,7 +118,7 @@ class ReadEndSomaticVariantFilterTest extends UnitSpec { "EndRepairFillInArtifactLikelihoodFilter.annotations" should "compute a non-significant p-value when data is distributed throughout the reads" in { // Simulate a G>T non-artifact at bp 25 val filter = new EndRepairFillInArtifactLikelihoodFilter(distance=15) - val builder = new SamBuilder(readLength=50, baseQuality=40) + val builder = new SamBuilder(readLength=50, baseQuality=40, sort=Some(Coordinate)) // Add a ton of reference allele for (start <- Range.inclusive(1, 25); i <- Range.inclusive(1, 10); strand <- Seq(Plus, Minus)) { @@ -129,16 +131,20 @@ class ReadEndSomaticVariantFilterTest extends UnitSpec { } val refName = builder.dict(0).name - val pile = new PileupBuilder(dict=builder.dict, mappedPairsOnly=false).build(builder.iterator, refName, 25) + val source = builder.toSource + val piler = PileupBuilder(source, accessPattern=Streaming, mappedPairsOnly=false) + val pile = piler.pileup(refName, 25) val annotations = filter.annotations(pile, singleGenotype("G", "T")) annotations.contains(filter.readEndInfoLine.id) shouldBe true annotations(filter.readEndInfoLine.id).asInstanceOf[Double] should be > 0.5 + source.safelyClose() + piler.safelyClose() } it should "compute a significant p-value when data is heavily biased" in { // Simulate a G>T non-artifact at bp 25 val filter = new EndRepairFillInArtifactLikelihoodFilter(distance=15) - val builder = new SamBuilder(readLength=50, baseQuality=40) + val builder = new SamBuilder(readLength=50, baseQuality=40, sort=Some(Coordinate)) // Add a ton of reference allele at various positions for (start <- Range.inclusive(1, 25); i <- Range.inclusive(1, 5); strand <- Seq(Plus, Minus)) { @@ -151,10 +157,14 @@ class ReadEndSomaticVariantFilterTest extends UnitSpec { } val refName = builder.dict(0).name - val pile = new PileupBuilder(dict=builder.dict, mappedPairsOnly=false).build(builder.iterator, refName, 25) + val source = builder.toSource + val piler = PileupBuilder(source, accessPattern=Streaming, mappedPairsOnly=false) + val pile = piler.pileup(refName, 25) val annotations = filter.annotations(pile, singleGenotype("G", "T")) annotations.contains(filter.readEndInfoLine.id) shouldBe true annotations(filter.readEndInfoLine.id).asInstanceOf[Double] should be < 1e-6 + source.safelyClose() + piler.safelyClose() } @@ -220,7 +230,7 @@ class ReadEndSomaticVariantFilterTest extends UnitSpec { "ATailingArtifactLikelihoodFilter.annotations" should "compute a non-significant p-value when data is distributed throughout the reads" in { // Simulate a G>T non-artifact at bp 25 val filter = new ATailingArtifactLikelihoodFilter(distance=5) - val builder = new SamBuilder(readLength=50, baseQuality=40) + val builder = new SamBuilder(readLength=50, baseQuality=40, sort=Some(Coordinate)) // Add a ton of reference allele for (start <- Range.inclusive(1, 25); i <- Range.inclusive(1, 10); strand <- Seq(Plus, Minus)) { @@ -233,16 +243,20 @@ class ReadEndSomaticVariantFilterTest extends UnitSpec { } val refName = builder.dict(0).name - val pile = new PileupBuilder(dict=builder.dict, mappedPairsOnly=false).build(builder.iterator, refName, 25) + val source = builder.toSource + val piler = PileupBuilder(source, accessPattern=Streaming, mappedPairsOnly=false) + val pile = piler.pileup(refName, 25) val annotations = filter.annotations(pile, singleGenotype("G", "T")) annotations.contains(filter.readEndInfoLine.id) shouldBe true annotations(filter.readEndInfoLine.id).asInstanceOf[Double] should be > 0.5 + source.safelyClose() + piler.safelyClose() } it should "compute a significant p-value when data is heavily biased" in { // Simulate a G>T non-artifact at bp 25 val filter = new ATailingArtifactLikelihoodFilter(distance=5) - val builder = new SamBuilder(readLength=50, baseQuality=40) + val builder = new SamBuilder(readLength=50, baseQuality=40, sort=Some(Coordinate)) // Add a ton of reference allele at various positions for (start <- Range.inclusive(1, 25); i <- Range.inclusive(1, 5); strand <- Seq(Plus, Minus)) { @@ -255,16 +269,20 @@ class ReadEndSomaticVariantFilterTest extends UnitSpec { } val refName = builder.dict(0).name - val pile = new PileupBuilder(dict=builder.dict, mappedPairsOnly=false).build(builder.iterator, refName, 25) + val source = builder.toSource + val piler = PileupBuilder(source, accessPattern=Streaming, mappedPairsOnly=false) + val pile = piler.pileup(refName, 25) val annotations = filter.annotations(pile, singleGenotype("G", "T")) annotations.contains(filter.readEndInfoLine.id) shouldBe true annotations(filter.readEndInfoLine.id).asInstanceOf[Double] should be < 1e-6 + source.safelyClose() + piler.safelyClose() } it should "compute an intermediate p-value when data is heavily biased for both ref and alt" in { // Simulate a G>T non-artifact at bp 25 val filter = new ATailingArtifactLikelihoodFilter(distance=5) - val builder = new SamBuilder(readLength=50, baseQuality=40) + val builder = new SamBuilder(readLength=50, baseQuality=40, sort=Some(Coordinate)) // Add a ton of reference allele, 5/6ths in the first 5bp of the read for (start <- Range.inclusive(20, 25); i <- Range.inclusive(1, 10); strand <- Seq(Plus, Minus)) { @@ -277,30 +295,42 @@ class ReadEndSomaticVariantFilterTest extends UnitSpec { } val refName = builder.dict(0).name - val pile = new PileupBuilder(dict=builder.dict, mappedPairsOnly=false).build(builder.iterator, refName, 25) + val source = builder.toSource + val piler = PileupBuilder(source, accessPattern=Streaming, mappedPairsOnly=false) + val pile = piler.pileup(refName, 25) val annotations = filter.annotations(pile, singleGenotype("G", "T")) annotations.contains(filter.readEndInfoLine.id) shouldBe true annotations(filter.readEndInfoLine.id).asInstanceOf[Double] should be < 1e-4 + source.safelyClose() + piler.safelyClose() } it should "not throw an exception if there is no alt allele coverage or the pileup is empty" in { val filter = new ATailingArtifactLikelihoodFilter(distance=10) - val builder = new SamBuilder(readLength=50, baseQuality=40) + val builder = new SamBuilder(readLength=50, baseQuality=40, sort=Some(Coordinate)) val refName = builder.dict(0).name { // Test with just an empty pileup - val pile = new PileupBuilder(dict = builder.dict, mappedPairsOnly = false).build(builder.iterator, refName, 25) + val source = builder.toSource + val piler = PileupBuilder(source, accessPattern=RandomAccess, mappedPairsOnly = false) + val pile = piler.pileup(refName, 25) val annotations = filter.annotations(pile, singleGenotype("G", "T")) annotations.contains(filter.readEndInfoLine.id) shouldBe true + source.safelyClose() + piler.safelyClose() } { // And again with just a bunch of ref allele for (start <- Range.inclusive(1, 20); i <- Range.inclusive(1, 10); strand <- Seq(Plus, Minus)) { builder.addFrag(start = start, strand = strand, bases = "G" * 50) } - val pile = new PileupBuilder(dict = builder.dict, mappedPairsOnly = false).build(builder.iterator, refName, 25) + val source = builder.toSource + val piler = PileupBuilder(source, accessPattern=RandomAccess, mappedPairsOnly = false) + val pile = piler.pileup(refName, 25) val annotations = filter.annotations(pile, singleGenotype("G", "T")) annotations.contains(filter.readEndInfoLine.id) shouldBe true + source.safelyClose() + piler.safelyClose() } } }