Skip to content

Commit

Permalink
Ensure FilterSomaticVcf handles PASS variants correctly (#909)
Browse files Browse the repository at this point in the history
  • Loading branch information
clintval authored Apr 18, 2023
1 parent cbbef4a commit ad6e200
Show file tree
Hide file tree
Showing 3 changed files with 174 additions and 11 deletions.
7 changes: 3 additions & 4 deletions src/main/scala/com/fulcrumgenomics/testing/VcfBuilder.scala
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,13 @@

package com.fulcrumgenomics.testing

import java.nio.file.Files

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.testing.VcfBuilder.Gt
import com.fulcrumgenomics.vcf.api.Allele.NoCallAllele
import com.fulcrumgenomics.vcf.api.Variant.PassingFilters
import com.fulcrumgenomics.vcf.api._

import scala.collection.compat._
import java.nio.file.Files
import scala.collection.immutable.ListMap
import scala.collection.mutable
import scala.util.Try
Expand Down Expand Up @@ -166,7 +165,7 @@ class VcfBuilder private (initialHeader: VcfHeader) extends Iterable[Variant] {
require(!variants.contains(key), s"Variant already exists at position $chrom:$pos")
require(alleles.nonEmpty, s"Must specify at least one allele.")
info.keys.foreach(k => require(this._header.info.contains(k), s"No INFO header for key $k"))
filters.foreach(f => require(this._header.filter.contains(f), s"No FILTER header for key $f"))
filters.filterNot(PassingFilters.contains).foreach(f => require(this._header.filter.contains(f), s"No FILTER header for key $f"))
require(gts.map(_.sample).toSet.size == gts.size, s"Non-unique sample names in genotypes.")

val alleleSet = AlleleSet(alleles:_*)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,18 @@ package com.fulcrumgenomics.vcf.filtration

import com.fulcrumgenomics.bam.api.SamSource
import com.fulcrumgenomics.bam.pileup.PileupBuilder
import com.fulcrumgenomics.bam.pileup.PileupBuilder.{BamAccessPattern, PileupDefaults}
import com.fulcrumgenomics.bam.pileup.PileupBuilder.BamAccessPattern.RandomAccess
import com.fulcrumgenomics.bam.pileup.PileupBuilder.{BamAccessPattern, PileupDefaults}
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.commons.CommonsDef._
import com.fulcrumgenomics.commons.io.Io
import com.fulcrumgenomics.commons.util.LazyLogging
import com.fulcrumgenomics.sopt.{arg, clp}
import com.fulcrumgenomics.util.ProgressLogger
import com.fulcrumgenomics.vcf.api.Variant.PassingFilters
import com.fulcrumgenomics.vcf.api.{VcfSource, VcfWriter}

import scala.collection.immutable.ListMap
import scala.collection.mutable.ListBuffer

@clp(
description =
Expand Down Expand Up @@ -182,16 +183,21 @@ import scala.collection.immutable.ListMap
case Nil => variant
case filters =>
val pileup = piler.pileup(refName = variant.chrom, pos = variant.pos).withoutOverlaps
val allAnnotations = variant.attrs.toBuffer
val allFilters = variant.filters.toBuffer
val newAnnotations = new ListBuffer[(String, Any)]
val newFilters = new ListBuffer[String]

filters.foreach { filter =>
val annotations = filter.annotations(pileup, variant.genotypes(name))
allAnnotations ++= annotations
allFilters ++= filter.filters(annotations)
newAnnotations ++= annotations
newFilters ++= filter.filters(annotations)
}

val allFilters = {
if (variant.filters == PassingFilters && newFilters.nonEmpty) newFilters.toSet
else variant.filters ++ newFilters
}

variant.copy(attrs = ListMap.from(allAnnotations), filters = Set.from(allFilters))
variant.copy(attrs = variant.attrs ++ newAnnotations, filters = allFilters)
}
writer.write(updated)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import com.fulcrumgenomics.bam.pileup.PileupBuilder.BamAccessPattern.Streaming
import com.fulcrumgenomics.commons.CommonsDef._
import com.fulcrumgenomics.testing.VcfBuilder.Gt
import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec, VcfBuilder}
import com.fulcrumgenomics.vcf.api.Variant.PassingFilters
import com.fulcrumgenomics.vcf.api._

/** Unit tests for [[FilterSomaticVcf]]. */
Expand Down Expand Up @@ -163,6 +164,163 @@ class FilterSomaticVcfTest extends UnitSpec {
}
}

it should s"not change existing INFO data when adding new INFO data" in {
val samples = Seq("sample1")
val variants = VcfBuilder(samples)
variants.add(pos = 200, alleles = Seq("G", "A"), gts = Seq(Gt("sample1", "G/A")), info = Map("DP" -> 2))

val length = 40
val sequences = new SamBuilder(readLength = length, sort = Some(SamOrder.Coordinate))

// G>A SNP at 200 with low variant allele frequency but even coverage of reads
// - ATailingArtifactFilter should apply and output a high p-value
// - EndRepairArtifactFilter should apply and output a high p-value
for (pos <- 161 to 200; i <- 1 to 3) {
sequences.addPair(start1 = pos, start2 = pos + length, bases1 = "G" * length, bases2 = "G" * length)
sequences.addPair(start1 = pos - length, start2 = pos, bases1 = "G" * length, bases2 = "G" * length)
if (i == 1 && pos % 4 == 0) {
sequences.addPair(start1 = pos, start2 = pos + length, bases1 = "A" * length, bases2 = "A" * length)
sequences.addPair(start1 = pos - length, start2 = pos, bases1 = "A" * length, bases2 = "A" * length)
}
}

val output = makeTempFile(getClass.getSimpleName, ".vcf")

new FilterSomaticVcf(
input = variants.toTempFile(),
output = output,
bam = sequences.toTempFile(),
accessPattern = Streaming,
aTailingPValue = None,
endRepairFillInPValue = None,
).execute()

val Seq(annotated) = readVcfRecs(output)

annotated.pos shouldBe 200
annotated.get[Int]("DP").value shouldBe 2
annotated.get[Float](ATailInfoKey).value shouldBe 1.0
annotated.get[Float](EndRepairInfoKey).value shouldBe 1.0
annotated.filters shouldBe empty
}

it should s"not change existing FILTER data when adding new FILTER data" in {
val samples = Seq("sample1")
val variants = VcfBuilder(samples)
variants.add(pos = 200, alleles = Seq("G", "A"), gts = Seq(Gt("sample1", "G/A")), filters = Seq("LowQD"))

val length = 40
val sequences = new SamBuilder(readLength = length, sort = Some(SamOrder.Coordinate))

// G>A SNP at 200 with low variant allele frequency but even coverage of reads
// - ATailingArtifactFilter should apply and output a high p-value
// - EndRepairArtifactFilter should apply and output a high p-value
for (pos <- 161 to 200; i <- 1 to 3) {
sequences.addPair(start1 = pos, start2 = pos + length, bases1 = "G" * length, bases2 = "G" * length)
sequences.addPair(start1 = pos - length, start2 = pos, bases1 = "G" * length, bases2 = "G" * length)
if (i == 1 && pos % 4 == 0) {
sequences.addPair(start1 = pos, start2 = pos + length, bases1 = "A" * length, bases2 = "A" * length)
sequences.addPair(start1 = pos - length, start2 = pos, bases1 = "A" * length, bases2 = "A" * length)
}
}

val output = makeTempFile(getClass.getSimpleName, ".vcf")

new FilterSomaticVcf(
input = variants.toTempFile(),
output = output,
bam = sequences.toTempFile(),
accessPattern = Streaming,
aTailingPValue = Some(1.0),
endRepairFillInPValue = Some(1.0),
).execute()

val Seq(annotated) = readVcfRecs(output)

annotated.pos shouldBe 200
annotated.get[Float](ATailInfoKey).value shouldBe 1.0
annotated.get[Float](EndRepairInfoKey).value shouldBe 1.0
annotated.filters should contain theSameElementsAs Set("LowQD", ATailFilterKey, EndRepairFilterKey)
}

it should s"not remove PASS from the FILTER field if no new filters are added" in {
val samples = Seq("sample1")
val variants = VcfBuilder(samples)
variants.add(pos = 200, alleles = Seq("G", "A"), gts = Seq(Gt("sample1", "G/A")), filters = PassingFilters)

val length = 40
val sequences = new SamBuilder(readLength = length, sort = Some(SamOrder.Coordinate))

// G>A SNP at 200 with low variant allele frequency but even coverage of reads
// - ATailingArtifactFilter should apply and output a high p-value
// - EndRepairArtifactFilter should apply and output a high p-value
for (pos <- 161 to 200; i <- 1 to 3) {
sequences.addPair(start1 = pos, start2 = pos + length, bases1 = "G" * length, bases2 = "G" * length)
sequences.addPair(start1 = pos - length, start2 = pos, bases1 = "G" * length, bases2 = "G" * length)
if (i == 1 && pos % 4 == 0) {
sequences.addPair(start1 = pos, start2 = pos + length, bases1 = "A" * length, bases2 = "A" * length)
sequences.addPair(start1 = pos - length, start2 = pos, bases1 = "A" * length, bases2 = "A" * length)
}
}

val output = makeTempFile(getClass.getSimpleName, ".vcf")

new FilterSomaticVcf(
input = variants.toTempFile(),
output = output,
bam = sequences.toTempFile(),
accessPattern = Streaming,
aTailingPValue = None,
endRepairFillInPValue = None,
).execute()

val Seq(annotated) = readVcfRecs(output)

annotated.pos shouldBe 200
annotated.get[Float](ATailInfoKey).value shouldBe 1.0
annotated.get[Float](EndRepairInfoKey).value shouldBe 1.0
annotated.filters should contain theSameElementsAs PassingFilters
}

it should s"remove PASS from the FILTER field if a new filter is added" in {
val samples = Seq("sample1")
val variants = VcfBuilder(samples)
variants.add(pos = 200, alleles = Seq("G", "A"), gts = Seq(Gt("sample1", "G/A")), filters = PassingFilters)

val length = 40
val sequences = new SamBuilder(readLength = length, sort = Some(SamOrder.Coordinate))

// G>A SNP at 200 with low variant allele frequency but even coverage of reads
// - ATailingArtifactFilter should apply and output a high p-value
// - EndRepairArtifactFilter should apply and output a high p-value
for (pos <- 161 to 200; i <- 1 to 3) {
sequences.addPair(start1 = pos, start2 = pos + length, bases1 = "G" * length, bases2 = "G" * length)
sequences.addPair(start1 = pos - length, start2 = pos, bases1 = "G" * length, bases2 = "G" * length)
if (i == 1 && pos % 4 == 0) {
sequences.addPair(start1 = pos, start2 = pos + length, bases1 = "A" * length, bases2 = "A" * length)
sequences.addPair(start1 = pos - length, start2 = pos, bases1 = "A" * length, bases2 = "A" * length)
}
}

val output = makeTempFile(getClass.getSimpleName, ".vcf")

new FilterSomaticVcf(
input = variants.toTempFile(),
output = output,
bam = sequences.toTempFile(),
accessPattern = Streaming,
aTailingPValue = Some(1.0),
endRepairFillInPValue = Some(1.0),
).execute()

val Seq(annotated) = readVcfRecs(output)

annotated.pos shouldBe 200
annotated.get[Float](ATailInfoKey).value shouldBe 1.0
annotated.get[Float](EndRepairInfoKey).value shouldBe 1.0
annotated.filters should contain theSameElementsAs Set(ATailFilterKey, EndRepairFilterKey)
}

BamAccessPattern.values.foreach { accessPattern =>
it should s"work on a single sample VCF when BAM access is: $accessPattern" in {
val filteredVcf = makeTempFile("filtered.", ".vcf")
Expand Down

0 comments on commit ad6e200

Please sign in to comment.