From 388ba432df8179b6bc12e69e41bcfdd144b38da3 Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Thu, 13 Jan 2022 23:09:48 -0700 Subject: [PATCH] Bug fix: Consensus caller could improperly trim small inserts If the reads are longer than the insert size, but aligned with insertions, the consensus read may have the insertion trimmed. --- .../umi/UmiConsensusCaller.scala | 37 ++++++++++++++++--- .../umi/VanillaUmiConsensusCallerTest.scala | 32 +++++++++++++++- 2 files changed, 61 insertions(+), 8 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/umi/UmiConsensusCaller.scala b/src/main/scala/com/fulcrumgenomics/umi/UmiConsensusCaller.scala index 583f5a1db..4a6916c2e 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/UmiConsensusCaller.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/UmiConsensusCaller.scala @@ -36,7 +36,7 @@ import htsjdk.samtools.util.{Murmur3, SequenceUtil, TrimmingUtil} import scala.collection.mutable import scala.collection.mutable.ArrayBuffer -import scala.math.{abs, min} +import scala.math.{abs, min, max} /** * Contains shared types and functions used when writing UMI-driven consensus * callers that take in SamRecords and emit SamRecords. @@ -245,11 +245,36 @@ trait UmiConsensusCaller[ConsensusRead <: SimpleRead] { // end of the read, or the end of the insert, whichever is shorter! val len = { var index = if (!rec.isFrPair) trimToLength - 1 else { - // Adjust the insert size based on clipping on the 5' end of the given read. - // This *does not* consider any clipping on the 5' end of its mate. - val fivePrimeClipping = cigar.takeWhile(_.operator.isClipping).filter(_.operator == CigarOperator.S).sumBy(_.length) - val adjustedInsertSize = abs(rec.insertSize) + fivePrimeClipping - min(adjustedInsertSize, trimToLength) - 1 + // If this read reads through the template (i.e. past the start of the mate), then we need to trim bases on the + // 3' end. If not, then we can use the full read. This *does not* consider any clipping on the 5' end of its mate. + val adjustedReadLength = { + if (rec.positiveStrand) { + val mateEnd = rec.mateEnd.getOrElse(rec.start + abs(rec.insertSize) - 1) + if (rec.end >= mateEnd) { + // the # of bases to keep is the last read base of where the mate ends + rec.readPosAtRefPos(pos=mateEnd, returnLastBaseIfDeleted=false) + } else { + // Find where the end of the read goes past the mate start, if at all + val refPosAtReadEnd = min(mateEnd, rec.end) // last 3' base + rec.readPosAtRefPos(pos=refPosAtReadEnd, returnLastBaseIfDeleted=false) + } + } else { + // Find where the start of the read that goes before the mate start, if at all + if (rec.start <= rec.mateStart) { + // the # of bases to keep is the last read base of where the mate ends + rec.length - rec.readPosAtRefPos(pos=rec.mateStart, returnLastBaseIfDeleted=false) + 1 + } else { + // The # of bases soft-clipped on the 3' end of the read (3' end in sequencing order) + val threePrimeClipping = cigar.reverseIterator.takeWhile(_.operator.isClipping).filter(_.operator == CigarOperator.S).sumBy(_.length) + // get the maximum # of allowable clipped bases on the 3' end + val maxThreePrimeClipping = rec.start - rec.mateStart + // if we have more than allowed, trim the read + if (threePrimeClipping <= maxThreePrimeClipping) bases.length + else bases.length - threePrimeClipping + maxThreePrimeClipping + } + } + } + min(adjustedReadLength, trimToLength) - 1 } while (index >= 0 && (bases(index) == NoCall)) index -= 1 index + 1 diff --git a/src/test/scala/com/fulcrumgenomics/umi/VanillaUmiConsensusCallerTest.scala b/src/test/scala/com/fulcrumgenomics/umi/VanillaUmiConsensusCallerTest.scala index a73973deb..9b569143a 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/VanillaUmiConsensusCallerTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/VanillaUmiConsensusCallerTest.scala @@ -513,13 +513,13 @@ class VanillaUmiConsensusCallerTest extends UnitSpec with OptionValues { val s1 = cc().toSourceRead(r1, minBaseQuality=2.toByte, trim=false).value val s2 = cc().toSourceRead(r2, minBaseQuality=2.toByte, trim=false).value - s1.baseString shouldBe "A"*2 + "C"*38 // trimmed by ten bases at the 3' end + s1.baseString shouldBe "A"*2 + "C"*38 // trimmed by ten bases at the 3' end, 5 due to soft-clipping and the other due to past mate start s1.cigar.toString shouldBe "10S30M" s2.baseString shouldBe "C"*2 + "G"*36 // trimmed by twelve bases at the 3' end s2.cigar.toString shouldBe "8S30M" // NB: cigar is reversed in toSourceRead } - it should "not to trim based on insert size in the presence of soft-clipping" in { + it should "not to trim based on insert size in the presence of soft-clipping (-/+)" in { val builder = new SamBuilder(readLength=142) val Seq(r1, r2) = builder.addPair(start1=545, start2=493, strand1=Minus, strand2=Plus, cigar1="47S72M23S", cigar2="46S96M") .map { r => r.bases = "A"*142; r } @@ -532,6 +532,20 @@ class VanillaUmiConsensusCallerTest extends UnitSpec with OptionValues { s2.cigar.toString shouldBe "46S96M" } + it should "not to trim based on insert size in the presence of soft-clipping (+/-)" in { + val builder = new SamBuilder(readLength=142) + val Seq(r1, r2) = builder.addPair(start2=545, start1=493, strand2=Minus, strand1=Plus, cigar2="47S72M23S", cigar1="46S96M") + .map { r => r.bases = "A"*142; r } + val s1 = cc().toSourceRead(r1, minBaseQuality=2.toByte, trim=false).value + val s2 = cc().toSourceRead(r2, minBaseQuality=2.toByte, trim=false).value + + // for R2, the leading 47S in the alignment is trimmed because + s1.baseString shouldBe "A"*142 + s1.cigar.toString shouldBe "46S96M" + s2.baseString shouldBe "T"*142 + s2.cigar.toString shouldBe "23S72M47S" // NB: cigar is reversed in toSourceRead + } + it should "not trim the source read based on insert size if the read is not an FR pair" in { val builder = new SamBuilder(readLength=50) val Seq(r1, r2) = builder.addPair(start1=11, start2=1, strand1=Plus, strand2=Plus).map { r => r.bases = "A"*50; r } @@ -560,4 +574,18 @@ class VanillaUmiConsensusCallerTest extends UnitSpec with OptionValues { source.cigar.toString() shouldBe "3M" source.sam shouldBe Some(rec) } + + it should "not trim reads with an insert size shorter than the read length when there's an insertion in the middle" in { + val builder = new SamBuilder(readLength=100) + val Seq(r1, r2) = builder.addPair( + start1=1, start2=1, strand1=Plus, strand2=Minus, cigar1="40M20I40M", cigar2="40M20I40M" + ).map { r => r.bases = "A"*100; r } + val s1 = cc().toSourceRead(r1, minBaseQuality=2.toByte, trim=false).value + val s2 = cc().toSourceRead(r2, minBaseQuality=2.toByte, trim=false).value + + s1.baseString shouldBe "A"*100 + s1.cigar.toString shouldBe "40M20I40M" + s2.baseString shouldBe "T"*100 + s2.cigar.toString shouldBe "40M20I40M" + } }