Skip to content

Commit

Permalink
Bug fix: Consensus caller could improperly trim small inserts
Browse files Browse the repository at this point in the history
If the reads are longer than the insert size, but aligned with
insertions, the consensus read may have the insertion trimmed.
  • Loading branch information
nh13 committed Jan 14, 2022
1 parent 2941e4b commit 388ba43
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 8 deletions.
37 changes: 31 additions & 6 deletions src/main/scala/com/fulcrumgenomics/umi/UmiConsensusCaller.scala
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ import htsjdk.samtools.util.{Murmur3, SequenceUtil, TrimmingUtil}

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
import scala.math.{abs, min}
import scala.math.{abs, min, max}
/**
* Contains shared types and functions used when writing UMI-driven consensus
* callers that take in SamRecords and emit SamRecords.
Expand Down Expand Up @@ -245,11 +245,36 @@ trait UmiConsensusCaller[ConsensusRead <: SimpleRead] {
// end of the read, or the end of the insert, whichever is shorter!
val len = {
var index = if (!rec.isFrPair) trimToLength - 1 else {
// Adjust the insert size based on clipping on the 5' end of the given read.
// This *does not* consider any clipping on the 5' end of its mate.
val fivePrimeClipping = cigar.takeWhile(_.operator.isClipping).filter(_.operator == CigarOperator.S).sumBy(_.length)
val adjustedInsertSize = abs(rec.insertSize) + fivePrimeClipping
min(adjustedInsertSize, trimToLength) - 1
// If this read reads through the template (i.e. past the start of the mate), then we need to trim bases on the
// 3' end. If not, then we can use the full read. This *does not* consider any clipping on the 5' end of its mate.
val adjustedReadLength = {
if (rec.positiveStrand) {
val mateEnd = rec.mateEnd.getOrElse(rec.start + abs(rec.insertSize) - 1)
if (rec.end >= mateEnd) {
// the # of bases to keep is the last read base of where the mate ends
rec.readPosAtRefPos(pos=mateEnd, returnLastBaseIfDeleted=false)
} else {
// Find where the end of the read goes past the mate start, if at all
val refPosAtReadEnd = min(mateEnd, rec.end) // last 3' base
rec.readPosAtRefPos(pos=refPosAtReadEnd, returnLastBaseIfDeleted=false)
}
} else {
// Find where the start of the read that goes before the mate start, if at all
if (rec.start <= rec.mateStart) {
// the # of bases to keep is the last read base of where the mate ends
rec.length - rec.readPosAtRefPos(pos=rec.mateStart, returnLastBaseIfDeleted=false) + 1
} else {
// The # of bases soft-clipped on the 3' end of the read (3' end in sequencing order)
val threePrimeClipping = cigar.reverseIterator.takeWhile(_.operator.isClipping).filter(_.operator == CigarOperator.S).sumBy(_.length)
// get the maximum # of allowable clipped bases on the 3' end
val maxThreePrimeClipping = rec.start - rec.mateStart
// if we have more than allowed, trim the read
if (threePrimeClipping <= maxThreePrimeClipping) bases.length
else bases.length - threePrimeClipping + maxThreePrimeClipping
}
}
}
min(adjustedReadLength, trimToLength) - 1
}
while (index >= 0 && (bases(index) == NoCall)) index -= 1
index + 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -513,13 +513,13 @@ class VanillaUmiConsensusCallerTest extends UnitSpec with OptionValues {
val s1 = cc().toSourceRead(r1, minBaseQuality=2.toByte, trim=false).value
val s2 = cc().toSourceRead(r2, minBaseQuality=2.toByte, trim=false).value

s1.baseString shouldBe "A"*2 + "C"*38 // trimmed by ten bases at the 3' end
s1.baseString shouldBe "A"*2 + "C"*38 // trimmed by ten bases at the 3' end, 5 due to soft-clipping and the other due to past mate start
s1.cigar.toString shouldBe "10S30M"
s2.baseString shouldBe "C"*2 + "G"*36 // trimmed by twelve bases at the 3' end
s2.cigar.toString shouldBe "8S30M" // NB: cigar is reversed in toSourceRead
}

it should "not to trim based on insert size in the presence of soft-clipping" in {
it should "not to trim based on insert size in the presence of soft-clipping (-/+)" in {
val builder = new SamBuilder(readLength=142)
val Seq(r1, r2) = builder.addPair(start1=545, start2=493, strand1=Minus, strand2=Plus, cigar1="47S72M23S", cigar2="46S96M")
.map { r => r.bases = "A"*142; r }
Expand All @@ -532,6 +532,20 @@ class VanillaUmiConsensusCallerTest extends UnitSpec with OptionValues {
s2.cigar.toString shouldBe "46S96M"
}

it should "not to trim based on insert size in the presence of soft-clipping (+/-)" in {
val builder = new SamBuilder(readLength=142)
val Seq(r1, r2) = builder.addPair(start2=545, start1=493, strand2=Minus, strand1=Plus, cigar2="47S72M23S", cigar1="46S96M")
.map { r => r.bases = "A"*142; r }
val s1 = cc().toSourceRead(r1, minBaseQuality=2.toByte, trim=false).value
val s2 = cc().toSourceRead(r2, minBaseQuality=2.toByte, trim=false).value

// for R2, the leading 47S in the alignment is trimmed because
s1.baseString shouldBe "A"*142
s1.cigar.toString shouldBe "46S96M"
s2.baseString shouldBe "T"*142
s2.cigar.toString shouldBe "23S72M47S" // NB: cigar is reversed in toSourceRead
}

it should "not trim the source read based on insert size if the read is not an FR pair" in {
val builder = new SamBuilder(readLength=50)
val Seq(r1, r2) = builder.addPair(start1=11, start2=1, strand1=Plus, strand2=Plus).map { r => r.bases = "A"*50; r }
Expand Down Expand Up @@ -560,4 +574,18 @@ class VanillaUmiConsensusCallerTest extends UnitSpec with OptionValues {
source.cigar.toString() shouldBe "3M"
source.sam shouldBe Some(rec)
}

it should "not trim reads with an insert size shorter than the read length when there's an insertion in the middle" in {
val builder = new SamBuilder(readLength=100)
val Seq(r1, r2) = builder.addPair(
start1=1, start2=1, strand1=Plus, strand2=Minus, cigar1="40M20I40M", cigar2="40M20I40M"
).map { r => r.bases = "A"*100; r }
val s1 = cc().toSourceRead(r1, minBaseQuality=2.toByte, trim=false).value
val s2 = cc().toSourceRead(r2, minBaseQuality=2.toByte, trim=false).value

s1.baseString shouldBe "A"*100
s1.cigar.toString shouldBe "40M20I40M"
s2.baseString shouldBe "T"*100
s2.cigar.toString shouldBe "40M20I40M"
}
}

0 comments on commit 388ba43

Please sign in to comment.