Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure --min-reads is a hard filter in CallDuplexConsensusReads #1010

Merged
merged 5 commits into from
Sep 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ private[alignment] object CigarStats {
*/
case class Cigar(elems: IndexedSeq[CigarElem]) extends Iterable[CigarElem] {
// Cache whether or not the Cigar is coalesced already (i.e. has no pair of adjacent elements with the same operator)
private lazy val isCoalesced: Boolean = {
private lazy val isCoalesced: Boolean = elems.length == 1 || {
var itIs = true
var index = 0
while (index < elems.length-1 && itIs) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,16 @@ class DuplexConsensusCaller(override val readNamePrefix: String,
val y = groups.lift(1).getOrElse(Seq.empty)

if (hasMinimumNumberOfReads(x, y)) {
callDuplexConsensusRead(x, y)
val consensus = callDuplexConsensusRead(x, y)
if (consensus.forall(duplexHasMinimumNumberOfReads)) {
consensus
} else {
// Even though we had enough reads going into consensus calling, we can
// lose some in the process (e.g. to CIGAR filtering), and end up with
// consensus reads that will than fail the min-reads check
rejectRecords(groups.flatten, FilterMinReads)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We reject the original reads in the group, and not the consensus.

clintval marked this conversation as resolved.
Show resolved Hide resolved
Nil
}
}
else {
rejectRecords(groups.flatten, FilterMinReads)
Expand All @@ -204,7 +213,7 @@ class DuplexConsensusCaller(override val readNamePrefix: String,

}

/** Returns true if there enough reads according to the minReads option. */
/** Returns true if there are enough reads according to the minReads option. */
private def hasMinimumNumberOfReads(x: Seq[SamRecord], y: Seq[SamRecord]): Boolean = {
// Get the number of reads per strand, in decreasing (more stringent) order.
val (numXy: Int, numYx: Int) = {
Expand All @@ -215,6 +224,16 @@ class DuplexConsensusCaller(override val readNamePrefix: String,
this.minTotalReads <= numXy + numYx && this.minXyReads <= numXy && this.minYxReads <= numYx
}

/** Returns true if there are enough reads composing the consensus according to the minReads option. */
private def duplexHasMinimumNumberOfReads(consensus: SamRecord): Boolean = {
val (numXy: Int, numYx: Int) = {
val numAb = consensus[Int](ConsensusTags.PerRead.AbRawReadCount)
val numBa = consensus[Int](ConsensusTags.PerRead.BaRawReadCount)
if (numAb >= numBa) (numAb, numBa) else (numBa, numAb)
}
this.minTotalReads <= numXy + numYx && this.minXyReads <= numXy && this.minYxReads <= numYx
}

private def areAllSameStrand(reads: Seq[SamRecord]): Boolean = {
if (reads.nonEmpty) {
val ss1Flag = reads.head.negativeStrand
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ class FilterConsensusReads
FilterResult(keepRead=false, 0)
}
else {
val result = if (isDuplexRecord(rec)) filterDuplexConsensusRead(rec) else filterVanillaConsensusRead(rec)
val result = if (Umis.isFgbioDuplexConsensus(rec)) filterDuplexConsensusRead(rec) else filterVanillaConsensusRead(rec)
result.copy(keepRead = result.keepRead && fractionNoCalls(rec) <= this.maxNoCallFraction)
}
}
Expand Down Expand Up @@ -329,9 +329,6 @@ class FilterConsensusReads
}
}

/** Quick check to see if a record is a duplex record. */
private def isDuplexRecord(rec: SamRecord): Boolean = rec.get(ConsensusTags.PerRead.AbRawReadCount).isDefined

/** Reverses all the consensus tags if the right conditions are set. */
private def reverseConsensusTagsIfNeeded(rec: SamRecord): Unit = if (reversePerBaseTags && rec.negativeStrand) {
ConsensusTags.PerBase.TagsToReverse.foreach { tag =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,7 @@ trait UmiConsensusCaller[ConsensusRead <: SimpleRead] {
private def simplifyCigar(cigar: Cigar) = {
import CigarOperator._
if (cigar.forall(e => e.operator == M || e.operator == I || e.operator == D)) {
cigar
cigar.coalesce
clintval marked this conversation as resolved.
Show resolved Hide resolved
}
else {
val newElems = cigar.elems.map {
Expand Down
25 changes: 14 additions & 11 deletions src/main/scala/com/fulcrumgenomics/umi/Umis.scala
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
package com.fulcrumgenomics.umi

import com.fulcrumgenomics.bam.api.SamRecord
import com.fulcrumgenomics.umi.ConsensusTags
import com.fulcrumgenomics.umi.ConsensusTags.PerRead.{RawReadCount, AbRawReadCount, BaRawReadCount}
import com.fulcrumgenomics.util.Sequences

object Umis {
Expand Down Expand Up @@ -129,14 +129,17 @@ object Umis {
ch == 'A' || ch == 'C' || ch == 'G' || ch == 'T' || ch == 'N' || ch == '-'
}

/** Returns True if the record appears to be a consensus read,
* typically produced by fgbio's CallMolecularConsensusReads or CallDuplexConsensusReads.
*
* @param rec the record to check
* @return boolean indicating if the record is a consensus or not
*/
def isFgbioStyleConsensus(rec: SamRecord): Boolean = {
rec.contains(ConsensusTags.PerRead.RawReadCount) ||
(rec.contains(ConsensusTags.PerRead.AbRawReadCount) && rec.contains(ConsensusTags.PerRead.BaRawReadCount))
}
/** Returns True if the record appears to be a consensus sequence typically produced by fgbio's
* CallMolecularConsensusReads or CallDuplexConsensusReads.
*
* @param rec the record to check
* @return boolean indicating if the record is a consensus or not
*/
def isFgbioStyleConsensus(rec: SamRecord): Boolean = isFgbioSimplexConsensus(rec) || isFgbioDuplexConsensus(rec)

/** Returns true if the record appears to be a simplex consensus sequence. */
def isFgbioSimplexConsensus(rec: SamRecord): Boolean = rec.contains(RawReadCount) && !isFgbioDuplexConsensus(rec)

/** Returns true if the record appears to be a duplex consensus sequence. */
def isFgbioDuplexConsensus(rec: SamRecord): Boolean = rec.contains(AbRawReadCount) && rec.contains(BaRawReadCount)
}
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ class DuplexConsensusCallerTest extends UnitSpec with OptionValues {
}

/** A builder where we have one duplex, with five total reads, three on one strand and two on the other. */
private val builderForMinReads: SamBuilder = {
private def builderForMinReads(): SamBuilder = {
val builder = new SamBuilder(readLength=10, baseQuality=20)
builder.addPair(name="q1", start1=100, start2=200, strand1=Plus, strand2=Minus, bases1="AAAAAAAAAA", bases2="CCCCCCCCCC", attrs=Map(MI -> "foo/A"))
builder.addPair(name="q2", start1=100, start2=200, strand1=Plus, strand2=Minus, bases1="AAAAAAAAAA", bases2="CCCCCCCCCC", attrs=Map(MI -> "foo/A"))
Expand All @@ -171,18 +171,42 @@ class DuplexConsensusCallerTest extends UnitSpec with OptionValues {
}

it should "support the --min-reads option when one value is provided" in {
caller(minReads=Seq(3)).consensusReadsFromSamRecords(builderForMinReads.toSeq) shouldBe empty
caller(minReads=Seq(2)).consensusReadsFromSamRecords(builderForMinReads.toSeq) should have size 2
val builder = builderForMinReads()
caller(minReads=Seq(3)).consensusReadsFromSamRecords(builder.toSeq) shouldBe empty
caller(minReads=Seq(2)).consensusReadsFromSamRecords(builder.toSeq) should have size 2
}

it should "support the --min-reads option when two values are provided" in {
caller(minReads=Seq(5, 3)).consensusReadsFromSamRecords(builderForMinReads.toSeq) shouldBe empty
caller(minReads=Seq(5, 2)).consensusReadsFromSamRecords(builderForMinReads.toSeq) should have size 2
val builder = builderForMinReads()
caller(minReads=Seq(5, 3)).consensusReadsFromSamRecords(builder.toSeq) shouldBe empty
caller(minReads=Seq(5, 2)).consensusReadsFromSamRecords(builder.toSeq) should have size 2
}

it should "support the --min-reads option when three values are provided" in {
caller(minReads=Seq(5, 3, 3)).consensusReadsFromSamRecords(builderForMinReads.toSeq) shouldBe empty
caller(minReads=Seq(5, 3, 2)).consensusReadsFromSamRecords(builderForMinReads.toSeq) should have size 2
val builder = builderForMinReads()
caller(minReads=Seq(5, 3, 3)).consensusReadsFromSamRecords(builder.toSeq) shouldBe empty
caller(minReads=Seq(5, 3, 2)).consensusReadsFromSamRecords(builder.toSeq) should have size 2
}

it should "support the --min-reads option as a hard filter if some records are removed during consensus calling for not having the most common structural alignment" in {
val builder = builderForMinReads()
val records = builder.toSeq
caller(minReads=Seq(3)).consensusReadsFromSamRecords(records) shouldBe empty
caller(minReads=Seq(2)).consensusReadsFromSamRecords(records) should have size 2

// Test for a coalesced and simplified cigar
val similar1 = builder.addPair(name="q6", start1=200, start2=100, strand1=Minus, strand2=Plus, bases1="CCCCCCCCCC", bases2="AAAAAAAAAA", attrs=Map(MI -> "foo/B"), cigar1 = "10M")
caller(minReads=Seq(3)).consensusReadsFromSamRecords(records ++ similar1) should have size 2 // the new read pair is considered in the final consensus
caller(minReads=Seq(2)).consensusReadsFromSamRecords(records ++ similar1) should have size 2

// Test for a non-coalesced but simplified cigar
val similar2 = builder.addPair(name="q6", start1=200, start2=100, strand1=Minus, strand2=Plus, bases1="CCCCCCCCCC", bases2="AAAAAAAAAA", attrs=Map(MI -> "foo/B"), cigar1 = "5M5M")
caller(minReads=Seq(3)).consensusReadsFromSamRecords(records ++ similar2) should have size 2 // the new read pair is considered in the final consensus
caller(minReads=Seq(2)).consensusReadsFromSamRecords(records ++ similar2) should have size 2

val dissimilar = builder.addPair(name="q6", start1=200, start2=100, strand1=Minus, strand2=Plus, bases1="CCCCCCCCCC", bases2="AAAAAAAAAA", attrs=Map(MI -> "foo/B"), cigar1 = "5M1D5M")
caller(minReads=Seq(3)).consensusReadsFromSamRecords(records ++ dissimilar) shouldBe empty // the new read pair is *not* considered in the final consensus
caller(minReads=Seq(2)).consensusReadsFromSamRecords(records ++ dissimilar) should have size 2
}

it should "not saturate the qualities with deep AB and light BA coverage" in {
Expand Down
41 changes: 32 additions & 9 deletions src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ package com.fulcrumgenomics.umi

import com.fulcrumgenomics.bam.api.{SamOrder, SamRecord}
import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec}
import com.fulcrumgenomics.umi.ConsensusTags.PerRead.{RawReadCount, AbRawReadCount, BaRawReadCount}
import org.scalatest.OptionValues

class UmisTest extends UnitSpec with OptionValues {
Expand Down Expand Up @@ -112,19 +113,41 @@ class UmisTest extends UnitSpec with OptionValues {
an[Exception] should be thrownBy copyUmiFromReadName(rec=rec("NAME:CCKC-ACGT"))
}

"Umis.isConsensusRead" should "return false for reads without consensus tags" in {
"Umis.isFgbioStyleConsensus" should "return false for reads without consensus tags" in {
val builder = new SamBuilder(sort=Some(SamOrder.Coordinate), readLength=10, baseQuality=20)
builder.addFrag(start=100).exists(Umis.isFgbioStyleConsensus) shouldBe false
builder.addPair(start1=100, start2=100, unmapped2=true).exists(Umis.isFgbioStyleConsensus) shouldBe false
Umis.isFgbioStyleConsensus(builder.addFrag(start=100).value) shouldBe false
val pair = builder.addPair(start1=100, start2=100, unmapped2=true)
pair.length shouldBe 2
pair.forall(rec => Umis.isFgbioStyleConsensus(rec)) shouldBe false
}

it should "return true for reads with consensus tags" in {
val builder = new SamBuilder(sort=Some(SamOrder.Coordinate), readLength=10, baseQuality=20)
builder.addFrag(start=10, attrs=Map(ConsensusTags.PerRead.RawReadCount -> 10))
.exists(Umis.isFgbioStyleConsensus) shouldBe true
builder.addFrag(
start=10,
attrs=Map(ConsensusTags.PerRead.AbRawReadCount -> 10, ConsensusTags.PerRead.BaRawReadCount -> 10)
).exists(Umis.isFgbioStyleConsensus) shouldBe true
Umis.isFgbioStyleConsensus(builder.addFrag(start=10, attrs=Map(RawReadCount -> 10)).value) shouldBe true
Umis.isFgbioStyleConsensus(builder.addFrag(start=10, attrs=Map(AbRawReadCount -> 10, BaRawReadCount -> 10)).value) shouldBe true
}

"Umis.isFgbioSimplexConsensus" should "return true for reads with simplex only consensus tags" in {
val builder = new SamBuilder(sort=Some(SamOrder.Coordinate), readLength=10, baseQuality=20)
Umis.isFgbioSimplexConsensus(builder.addFrag(start=100).value) shouldBe false
Umis.isFgbioSimplexConsensus(builder.addFrag(start=10, attrs=Map(RawReadCount -> 10)).value) shouldBe true
Umis.isFgbioSimplexConsensus(builder.addFrag(start=10, attrs=Map(AbRawReadCount -> 10, BaRawReadCount -> 10)).value) shouldBe false
Umis.isFgbioSimplexConsensus(builder.addFrag(start=10, attrs=Map(RawReadCount -> 20, AbRawReadCount -> 10, BaRawReadCount -> 10)).value) shouldBe false

val pair = builder.addPair(start1=100, start2=100, unmapped2=true)
pair.length shouldBe 2
pair.forall(rec => Umis.isFgbioSimplexConsensus(rec)) shouldBe false
}

"Umis.isFgbioDuplexConsensus" should "return true for reads with duplex only consensus tags" in {
val builder = new SamBuilder(sort=Some(SamOrder.Coordinate), readLength=10, baseQuality=20)
Umis.isFgbioDuplexConsensus(builder.addFrag(start=100).value) shouldBe false
Umis.isFgbioDuplexConsensus(builder.addFrag(start=10, attrs=Map(RawReadCount -> 10)).value) shouldBe false
Umis.isFgbioDuplexConsensus(builder.addFrag(start=10, attrs=Map(AbRawReadCount -> 10, BaRawReadCount -> 10)).value) shouldBe true
Umis.isFgbioDuplexConsensus(builder.addFrag(start=10, attrs=Map(RawReadCount -> 20, AbRawReadCount -> 10, BaRawReadCount -> 10)).value) shouldBe true

val pair = builder.addPair(start1=100, start2=100, unmapped2=true)
pair.length shouldBe 2
pair.forall(rec => Umis.isFgbioDuplexConsensus(rec)) shouldBe false
}
}
Loading