From 686fc9b83ec6c40095ed4c39b56049f499edef25 Mon Sep 17 00:00:00 2001 From: znorgaard Date: Mon, 1 Jul 2024 15:54:03 -0700 Subject: [PATCH 01/13] feat: add isConsensus method --- .../com/fulcrumgenomics/bam/api/SamRecord.scala | 5 +++++ .../fulcrumgenomics/bam/api/SamRecordTest.scala | 14 ++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala b/src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala index cf05b54c6..40fce0b64 100644 --- a/src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala +++ b/src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala @@ -27,6 +27,7 @@ package com.fulcrumgenomics.bam.api import com.fulcrumgenomics.FgBioDef._ import com.fulcrumgenomics.alignment.Cigar +import com.fulcrumgenomics.umi.ConsensusTags.PerRead.AllPerReadTags import htsjdk.samtools import htsjdk.samtools.SamPairUtil.PairOrientation import htsjdk.samtools._ @@ -269,6 +270,10 @@ trait SamRecord { SamPairUtil.getPairOrientation(this) == PairOrientation.FR } + def isConsensus: Boolean = { + AllPerReadTags.exists(this.contains) + } + /** Clone method that does a "reasonably deep" clone. The bases and quals are cloned as is the attributes map, * though not the values in the attributes map. */ override def clone(): SamRecord = { diff --git a/src/test/scala/com/fulcrumgenomics/bam/api/SamRecordTest.scala b/src/test/scala/com/fulcrumgenomics/bam/api/SamRecordTest.scala index 38251e0c4..82f34e940 100644 --- a/src/test/scala/com/fulcrumgenomics/bam/api/SamRecordTest.scala +++ b/src/test/scala/com/fulcrumgenomics/bam/api/SamRecordTest.scala @@ -27,6 +27,7 @@ package com.fulcrumgenomics.bam.api import com.fulcrumgenomics.alignment.Cigar import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus} import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} +import com.fulcrumgenomics.umi.ConsensusTags.PerRead.AllPerReadTags import htsjdk.samtools.TextCigarCodec import org.scalatest.OptionValues @@ -289,4 +290,17 @@ class SamRecordTest extends UnitSpec with OptionValues { rec1.matesOverlap shouldBe None // Mate's start is not enclosed by rec, and mate's end cannot be determined rec2.matesOverlap.value shouldBe true // Mate's start is enclosed by rec, regardless of where mate end is } + + "SamRecord.isConsensus" should "return false for reads without consensus tags" in { + val builder = new SamBuilder(sort=Some(SamOrder.Coordinate), readLength=10, baseQuality=20) + builder.addFrag(start=100).exists(_.isConsensus) shouldBe false + builder.addPair(start1=100, start2=100, unmapped2=true).exists(_.isConsensus) shouldBe false + } + + it should "return true for reads with consensus tags" in { + val builder = new SamBuilder(sort=Some(SamOrder.Coordinate), readLength=10, baseQuality=20) + AllPerReadTags.forall( + tag => builder.addFrag(start=10, attrs=Map(tag -> 123)).exists(_.isConsensus) + ) shouldBe true + } } From ebfbac15717623e8bd473212394107a42445110b Mon Sep 17 00:00:00 2001 From: znorgaard Date: Mon, 22 Jul 2024 15:55:19 -0700 Subject: [PATCH 02/13] feat: assert DuplexSeqMetrics collected on grouped BAM --- .../fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala | 1 + .../umi/CollectDuplexSeqMetricsTest.scala | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala b/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala index 5d18b6d98..1e34e70ca 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala @@ -340,6 +340,7 @@ class CollectDuplexSeqMetrics // Assign reads a random number between 0 and 1 inclusive based on their read name group.foreach { rec => + if (rec.isConsensus) throw new IllegalArgumentException("Found consensus record. Expected UMI-grouped BAM") val intHash = math.abs(this.hasher.hashUnencodedChars(rec.name)) val doubleHash = intHash / MaxIntAsDouble rec.transientAttrs(HashKey) = doubleHash diff --git a/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala b/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala index d9d0ed16f..e6a01a92c 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala @@ -32,6 +32,7 @@ import com.fulcrumgenomics.bam.api.SamOrder import com.fulcrumgenomics.commons.util.SimpleCounter import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus} import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} +import com.fulcrumgenomics.umi.ConsensusTags.PerRead.AllPerReadTags import com.fulcrumgenomics.util.{Io, Metric, Rscript} import htsjdk.samtools.util.{Interval, IntervalList} import org.apache.commons.math3.distribution.NormalDistribution @@ -41,6 +42,7 @@ import scala.math.{max, min} class CollectDuplexSeqMetricsTest extends UnitSpec { private val MI = "MI" private val RX = "RX" + private val aD = "aD" // Case class to hold pointers to all the outputs of CollectDuplexSeqMetrics private case class Outputs(families: Path, duplexFamilies: Path, umis: Path, duplexUmis: Path, yields: Path, plots: Path) { @@ -312,6 +314,14 @@ class CollectDuplexSeqMetricsTest extends UnitSpec { duplexFamilies.find(f => f.ab_size == 6 && f.ba_size == 0).get.count shouldBe 1 } + it should "raise an exception if consensus records are provided" in { + val builder = new SamBuilder(readLength=100, sort=Some(SamOrder.TemplateCoordinate)) + AllPerReadTags.foreach { + tag => builder.addPair(contig=1, start1=1000, start2=1100, attrs=Map(RX -> "AAA-GGG", MI -> "1/A", tag -> 10)) + an[IllegalArgumentException] shouldBe thrownBy { exec(builder) } + } + } + "CollectDuplexSeqMetrics.updateUmiMetrics" should "not count duplex umis" in collector(duplex=false).foreach { c => val builder = new SamBuilder(readLength=10) builder.addPair(start1=100, start2=200, attrs=Map(RX -> "AAA-CCC", MI -> "1/A")) From 52642f543c95d15df910f41631c147e75adfbe55 Mon Sep 17 00:00:00 2001 From: znorgaard Date: Tue, 23 Jul 2024 10:01:52 -0700 Subject: [PATCH 03/13] refactor: move isConsensus --- .../com/fulcrumgenomics/bam/api/SamRecord.scala | 4 ---- .../umi/CollectDuplexSeqMetrics.scala | 1 - .../scala/com/fulcrumgenomics/umi/Umis.scala | 11 +++++++++++ .../fulcrumgenomics/bam/api/SamRecordTest.scala | 14 -------------- .../umi/CollectDuplexSeqMetricsTest.scala | 8 -------- .../com/fulcrumgenomics/umi/UmisTest.scala | 17 ++++++++++++++++- 6 files changed, 27 insertions(+), 28 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala b/src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala index 40fce0b64..061228806 100644 --- a/src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala +++ b/src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala @@ -270,10 +270,6 @@ trait SamRecord { SamPairUtil.getPairOrientation(this) == PairOrientation.FR } - def isConsensus: Boolean = { - AllPerReadTags.exists(this.contains) - } - /** Clone method that does a "reasonably deep" clone. The bases and quals are cloned as is the attributes map, * though not the values in the attributes map. */ override def clone(): SamRecord = { diff --git a/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala b/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala index 1e34e70ca..5d18b6d98 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala @@ -340,7 +340,6 @@ class CollectDuplexSeqMetrics // Assign reads a random number between 0 and 1 inclusive based on their read name group.foreach { rec => - if (rec.isConsensus) throw new IllegalArgumentException("Found consensus record. Expected UMI-grouped BAM") val intHash = math.abs(this.hasher.hashUnencodedChars(rec.name)) val doubleHash = intHash / MaxIntAsDouble rec.transientAttrs(HashKey) = doubleHash diff --git a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala index 08369a682..772f67351 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala @@ -26,6 +26,8 @@ package com.fulcrumgenomics.umi import com.fulcrumgenomics.bam.api.SamRecord +import com.fulcrumgenomics.umi.ConsensusTags.PerBase.AbRawReadCount +import com.fulcrumgenomics.umi.ConsensusTags.PerRead.{BaRawReadCount, RawReadCount} import com.fulcrumgenomics.util.Sequences object Umis { @@ -127,4 +129,13 @@ object Umis { @inline private def isValidUmiCharacter(ch: Char): Boolean = { ch == 'A' || ch == 'C' || ch == 'G' || ch == 'T' || ch == 'N' || ch == '-' } + + /** Tests if a record is a consensus or not + * + * @param rec the record to test + * @return boolean indicating if the record is a consensus or not + */ + def isConsensusRead(rec: SamRecord): Boolean = { + rec.contains(RawReadCount) | (rec.contains(AbRawReadCount) & rec.contains(BaRawReadCount)) + } } diff --git a/src/test/scala/com/fulcrumgenomics/bam/api/SamRecordTest.scala b/src/test/scala/com/fulcrumgenomics/bam/api/SamRecordTest.scala index 82f34e940..38251e0c4 100644 --- a/src/test/scala/com/fulcrumgenomics/bam/api/SamRecordTest.scala +++ b/src/test/scala/com/fulcrumgenomics/bam/api/SamRecordTest.scala @@ -27,7 +27,6 @@ package com.fulcrumgenomics.bam.api import com.fulcrumgenomics.alignment.Cigar import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus} import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} -import com.fulcrumgenomics.umi.ConsensusTags.PerRead.AllPerReadTags import htsjdk.samtools.TextCigarCodec import org.scalatest.OptionValues @@ -290,17 +289,4 @@ class SamRecordTest extends UnitSpec with OptionValues { rec1.matesOverlap shouldBe None // Mate's start is not enclosed by rec, and mate's end cannot be determined rec2.matesOverlap.value shouldBe true // Mate's start is enclosed by rec, regardless of where mate end is } - - "SamRecord.isConsensus" should "return false for reads without consensus tags" in { - val builder = new SamBuilder(sort=Some(SamOrder.Coordinate), readLength=10, baseQuality=20) - builder.addFrag(start=100).exists(_.isConsensus) shouldBe false - builder.addPair(start1=100, start2=100, unmapped2=true).exists(_.isConsensus) shouldBe false - } - - it should "return true for reads with consensus tags" in { - val builder = new SamBuilder(sort=Some(SamOrder.Coordinate), readLength=10, baseQuality=20) - AllPerReadTags.forall( - tag => builder.addFrag(start=10, attrs=Map(tag -> 123)).exists(_.isConsensus) - ) shouldBe true - } } diff --git a/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala b/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala index e6a01a92c..f1939c68d 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala @@ -314,14 +314,6 @@ class CollectDuplexSeqMetricsTest extends UnitSpec { duplexFamilies.find(f => f.ab_size == 6 && f.ba_size == 0).get.count shouldBe 1 } - it should "raise an exception if consensus records are provided" in { - val builder = new SamBuilder(readLength=100, sort=Some(SamOrder.TemplateCoordinate)) - AllPerReadTags.foreach { - tag => builder.addPair(contig=1, start1=1000, start2=1100, attrs=Map(RX -> "AAA-GGG", MI -> "1/A", tag -> 10)) - an[IllegalArgumentException] shouldBe thrownBy { exec(builder) } - } - } - "CollectDuplexSeqMetrics.updateUmiMetrics" should "not count duplex umis" in collector(duplex=false).foreach { c => val builder = new SamBuilder(readLength=10) builder.addPair(start1=100, start2=200, attrs=Map(RX -> "AAA-CCC", MI -> "1/A")) diff --git a/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala b/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala index 3d8c010b3..f9b1ddd3e 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala @@ -25,8 +25,10 @@ package com.fulcrumgenomics.umi -import com.fulcrumgenomics.bam.api.SamRecord +import com.fulcrumgenomics.bam.api.{SamOrder, SamRecord} import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} +import com.fulcrumgenomics.umi.ConsensusTags.PerBase.AbRawReadCount +import com.fulcrumgenomics.umi.ConsensusTags.PerRead.{BaRawReadCount, RawReadCount} import org.scalatest.OptionValues class UmisTest extends UnitSpec with OptionValues { @@ -111,4 +113,17 @@ class UmisTest extends UnitSpec with OptionValues { an[Exception] should be thrownBy copyUmiFromReadName(rec=rec("NAME:ACGT-CCKC")) an[Exception] should be thrownBy copyUmiFromReadName(rec=rec("NAME:CCKC-ACGT")) } + + "Umis.isConsensusRead" should "return false for reads without consensus tags" in { + val builder = new SamBuilder(sort=Some(SamOrder.Coordinate), readLength=10, baseQuality=20) + builder.addFrag(start=100).exists(Umis.isConsensusRead) shouldBe false + builder.addPair(start1=100, start2=100, unmapped2=true).exists(Umis.isConsensusRead) shouldBe false + } + + it should "return true for reads with consensus tags" in { + val builder = new SamBuilder(sort=Some(SamOrder.Coordinate), readLength=10, baseQuality=20) + builder.addFrag(start=10, attrs=Map(RawReadCount -> 10)).exists(Umis.isConsensusRead) shouldBe true + builder.addFrag(start=10, attrs=Map(AbRawReadCount -> 10, BaRawReadCount -> 10)) + .exists(Umis.isConsensusRead) shouldBe true + } } From 955b7594b48d4afc9ef54201c930168112998280 Mon Sep 17 00:00:00 2001 From: znorgaard Date: Tue, 23 Jul 2024 11:56:00 -0700 Subject: [PATCH 04/13] refactor: move isConsensus check --- .../umi/CollectDuplexSeqMetrics.scala | 5 +++++ .../umi/CollectDuplexSeqMetricsTest.scala | 15 +++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala b/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala index 5d18b6d98..4d32a53bf 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala @@ -307,6 +307,11 @@ class CollectDuplexSeqMetrics // Build the iterator we'll use based on whether or not we're restricting to a set of intervals val in = SamSource(input) val _filteredIterator = in.iterator.filter(r => r.paired && r.mapped && r.mateMapped && r.firstOfPair && !r.secondary && !r.supplementary) + .tapEach { + r => + if (Umis.isConsensusRead(r)) throw new IllegalArgumentException("Found consensus record. Expected UMI-grouped BAM") + else r + } val iterator = intervals match { case None => _filteredIterator case Some(path) => diff --git a/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala b/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala index f1939c68d..8c646a91a 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala @@ -26,13 +26,13 @@ package com.fulcrumgenomics.umi import java.nio.file.{Path, Paths} import java.util.Random - import com.fulcrumgenomics.FgBioDef._ import com.fulcrumgenomics.bam.api.SamOrder import com.fulcrumgenomics.commons.util.SimpleCounter import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus} import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} -import com.fulcrumgenomics.umi.ConsensusTags.PerRead.AllPerReadTags +import com.fulcrumgenomics.umi.ConsensusTags.PerBase.AbRawReadCount +import com.fulcrumgenomics.umi.ConsensusTags.PerRead.{AllPerReadTags, BaRawReadCount} import com.fulcrumgenomics.util.{Io, Metric, Rscript} import htsjdk.samtools.util.{Interval, IntervalList} import org.apache.commons.math3.distribution.NormalDistribution @@ -314,6 +314,17 @@ class CollectDuplexSeqMetricsTest extends UnitSpec { duplexFamilies.find(f => f.ab_size == 6 && f.ba_size == 0).get.count shouldBe 1 } + it should "raise an exception if duplex consensus records are provided" in { + val builder = new SamBuilder(readLength=100, sort=Some(SamOrder.TemplateCoordinate)) + builder.addPair( + contig=1, + start1=1000, + start2=1100, + attrs=Map(RX -> "AAA-GGG", MI -> "1/A", AbRawReadCount -> 10, BaRawReadCount -> 10) + ) + an[IllegalArgumentException] shouldBe thrownBy { exec(builder) } + } + "CollectDuplexSeqMetrics.updateUmiMetrics" should "not count duplex umis" in collector(duplex=false).foreach { c => val builder = new SamBuilder(readLength=10) builder.addPair(start1=100, start2=200, attrs=Map(RX -> "AAA-CCC", MI -> "1/A")) From 97c52ac12b7250b1e1fd958c1e434322d91c6eab Mon Sep 17 00:00:00 2001 From: znorgaard Date: Wed, 24 Jul 2024 08:24:06 -0700 Subject: [PATCH 05/13] fix: remove unused import --- src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala b/src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala index 061228806..cf05b54c6 100644 --- a/src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala +++ b/src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala @@ -27,7 +27,6 @@ package com.fulcrumgenomics.bam.api import com.fulcrumgenomics.FgBioDef._ import com.fulcrumgenomics.alignment.Cigar -import com.fulcrumgenomics.umi.ConsensusTags.PerRead.AllPerReadTags import htsjdk.samtools import htsjdk.samtools.SamPairUtil.PairOrientation import htsjdk.samtools._ From c015aef77cbf575c782ec4b9d52e25098af00682 Mon Sep 17 00:00:00 2001 From: znorgaard Date: Wed, 24 Jul 2024 08:31:56 -0700 Subject: [PATCH 06/13] refactor: only import ConsensusTags --- src/main/scala/com/fulcrumgenomics/umi/Umis.scala | 6 +++--- .../umi/CollectDuplexSeqMetricsTest.scala | 10 +++++++--- src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala | 10 ++++++---- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala index 772f67351..a024dfc2f 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala @@ -26,8 +26,7 @@ package com.fulcrumgenomics.umi import com.fulcrumgenomics.bam.api.SamRecord -import com.fulcrumgenomics.umi.ConsensusTags.PerBase.AbRawReadCount -import com.fulcrumgenomics.umi.ConsensusTags.PerRead.{BaRawReadCount, RawReadCount} +import com.fulcrumgenomics.umi.ConsensusTags import com.fulcrumgenomics.util.Sequences object Umis { @@ -136,6 +135,7 @@ object Umis { * @return boolean indicating if the record is a consensus or not */ def isConsensusRead(rec: SamRecord): Boolean = { - rec.contains(RawReadCount) | (rec.contains(AbRawReadCount) & rec.contains(BaRawReadCount)) + rec.contains(ConsensusTags.PerRead.RawReadCount) || + (rec.contains(ConsensusTags.PerRead.AbRawReadCount) && rec.contains(ConsensusTags.PerRead.BaRawReadCount)) } } diff --git a/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala b/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala index 8c646a91a..3624d357c 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala @@ -31,8 +31,7 @@ import com.fulcrumgenomics.bam.api.SamOrder import com.fulcrumgenomics.commons.util.SimpleCounter import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus} import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} -import com.fulcrumgenomics.umi.ConsensusTags.PerBase.AbRawReadCount -import com.fulcrumgenomics.umi.ConsensusTags.PerRead.{AllPerReadTags, BaRawReadCount} +import com.fulcrumgenomics.umi.ConsensusTags import com.fulcrumgenomics.util.{Io, Metric, Rscript} import htsjdk.samtools.util.{Interval, IntervalList} import org.apache.commons.math3.distribution.NormalDistribution @@ -320,7 +319,12 @@ class CollectDuplexSeqMetricsTest extends UnitSpec { contig=1, start1=1000, start2=1100, - attrs=Map(RX -> "AAA-GGG", MI -> "1/A", AbRawReadCount -> 10, BaRawReadCount -> 10) + attrs=Map( + RX -> "AAA-GGG", + MI -> "1/A", + ConsensusTags.PerRead.AbRawReadCount -> 10, + ConsensusTags.PerRead.BaRawReadCount -> 10 + ) ) an[IllegalArgumentException] shouldBe thrownBy { exec(builder) } } diff --git a/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala b/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala index f9b1ddd3e..427131982 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala @@ -27,8 +27,7 @@ package com.fulcrumgenomics.umi import com.fulcrumgenomics.bam.api.{SamOrder, SamRecord} import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} -import com.fulcrumgenomics.umi.ConsensusTags.PerBase.AbRawReadCount -import com.fulcrumgenomics.umi.ConsensusTags.PerRead.{BaRawReadCount, RawReadCount} +import com.fulcrumgenomics.umi.ConsensusTags import org.scalatest.OptionValues class UmisTest extends UnitSpec with OptionValues { @@ -122,8 +121,11 @@ class UmisTest extends UnitSpec with OptionValues { it should "return true for reads with consensus tags" in { val builder = new SamBuilder(sort=Some(SamOrder.Coordinate), readLength=10, baseQuality=20) - builder.addFrag(start=10, attrs=Map(RawReadCount -> 10)).exists(Umis.isConsensusRead) shouldBe true - builder.addFrag(start=10, attrs=Map(AbRawReadCount -> 10, BaRawReadCount -> 10)) + builder.addFrag(start=10, attrs=Map(ConsensusTags.PerRead.RawReadCount -> 10)) .exists(Umis.isConsensusRead) shouldBe true + builder.addFrag( + start=10, + attrs=Map(ConsensusTags.PerRead.AbRawReadCount -> 10, ConsensusTags.PerRead.BaRawReadCount -> 10) + ).exists(Umis.isConsensusRead) shouldBe true } } From d994edd041b46351c0dbd5c484e5f708c952f594 Mon Sep 17 00:00:00 2001 From: znorgaard Date: Wed, 24 Jul 2024 08:46:18 -0700 Subject: [PATCH 07/13] refactor: only check first record --- .../umi/CollectDuplexSeqMetrics.scala | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala b/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala index 4d32a53bf..db90460b0 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala @@ -307,11 +307,18 @@ class CollectDuplexSeqMetrics // Build the iterator we'll use based on whether or not we're restricting to a set of intervals val in = SamSource(input) val _filteredIterator = in.iterator.filter(r => r.paired && r.mapped && r.mateMapped && r.firstOfPair && !r.secondary && !r.supplementary) - .tapEach { - r => - if (Umis.isConsensusRead(r)) throw new IllegalArgumentException("Found consensus record. Expected UMI-grouped BAM") - else r - } + .bufferBetter + // Ensure the records are not consensus records + _filteredIterator.headOption.foreach { + r => + def exceptionString = s"Input BAM file to CollectDuplexSeqMetrics ($input) appears to contain consensus reads." + + "CollectDuplexSeqMetrics cannot run on consensus BAMs, and instead requires the UMI-grouped BAM generated " + + "prior to consensus calling. The UMI-grouped BAM is the output of running GroupReadsByUmi." + + s"\nFirst record in $input has consensus tags present:\n${_filteredIterator.head}" + + if (Umis.isConsensusRead(r)) throw new IllegalArgumentException(exceptionString) + else r + } val iterator = intervals match { case None => _filteredIterator case Some(path) => From d96173be77fc6e57ea3555fbcfb1f7d0205c95de Mon Sep 17 00:00:00 2001 From: znorgaard Date: Wed, 24 Jul 2024 08:52:14 -0700 Subject: [PATCH 08/13] chore: cleanup imports --- .../com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala b/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala index 3624d357c..27d30ba71 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala @@ -26,12 +26,12 @@ package com.fulcrumgenomics.umi import java.nio.file.{Path, Paths} import java.util.Random + import com.fulcrumgenomics.FgBioDef._ import com.fulcrumgenomics.bam.api.SamOrder import com.fulcrumgenomics.commons.util.SimpleCounter import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus} import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} -import com.fulcrumgenomics.umi.ConsensusTags import com.fulcrumgenomics.util.{Io, Metric, Rscript} import htsjdk.samtools.util.{Interval, IntervalList} import org.apache.commons.math3.distribution.NormalDistribution From e9b2dde0add227a090b9368cf43f9d297c54cb1f Mon Sep 17 00:00:00 2001 From: znorgaard Date: Wed, 24 Jul 2024 08:53:56 -0700 Subject: [PATCH 09/13] chore: cleanup import and unused val --- .../com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala | 1 - src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala | 1 - 2 files changed, 2 deletions(-) diff --git a/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala b/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala index 27d30ba71..0834c1532 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala @@ -41,7 +41,6 @@ import scala.math.{max, min} class CollectDuplexSeqMetricsTest extends UnitSpec { private val MI = "MI" private val RX = "RX" - private val aD = "aD" // Case class to hold pointers to all the outputs of CollectDuplexSeqMetrics private case class Outputs(families: Path, duplexFamilies: Path, umis: Path, duplexUmis: Path, yields: Path, plots: Path) { diff --git a/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala b/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala index 427131982..8ffeffcc6 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala @@ -27,7 +27,6 @@ package com.fulcrumgenomics.umi import com.fulcrumgenomics.bam.api.{SamOrder, SamRecord} import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} -import com.fulcrumgenomics.umi.ConsensusTags import org.scalatest.OptionValues class UmisTest extends UnitSpec with OptionValues { From 3d78b3abfeedf972eeecfb841ae7696213abee8c Mon Sep 17 00:00:00 2001 From: znorgaard Date: Wed, 24 Jul 2024 10:24:50 -0700 Subject: [PATCH 10/13] fix: cleanup error reporting --- .../umi/CollectDuplexSeqMetrics.scala | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala b/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala index db90460b0..b5ee6c127 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala @@ -306,18 +306,19 @@ class CollectDuplexSeqMetrics override def execute(): Unit = { // Build the iterator we'll use based on whether or not we're restricting to a set of intervals val in = SamSource(input) - val _filteredIterator = in.iterator.filter(r => r.paired && r.mapped && r.mateMapped && r.firstOfPair && !r.secondary && !r.supplementary) + val _filteredIterator = in + .iterator + .filter(r => r.paired && r.mapped && r.mateMapped && r.firstOfPair && !r.secondary && !r.supplementary) .bufferBetter // Ensure the records are not consensus records _filteredIterator.headOption.foreach { - r => - def exceptionString = s"Input BAM file to CollectDuplexSeqMetrics ($input) appears to contain consensus reads." + + rec => + def exceptionString = s"Input BAM file to CollectDuplexSeqMetrics ($input) appears to contain consensus sequences. " + "CollectDuplexSeqMetrics cannot run on consensus BAMs, and instead requires the UMI-grouped BAM generated " + "prior to consensus calling. The UMI-grouped BAM is the output of running GroupReadsByUmi." + - s"\nFirst record in $input has consensus tags present:\n${_filteredIterator.head}" + s"\nFirst record in $input has consensus SAM tags present:\n$rec" - if (Umis.isConsensusRead(r)) throw new IllegalArgumentException(exceptionString) - else r + if (Umis.isConsensusRead(rec)) throw new IllegalArgumentException(exceptionString) } val iterator = intervals match { case None => _filteredIterator From 04a51e97a3c2a811c78a9d61959fe044b83b322c Mon Sep 17 00:00:00 2001 From: znorgaard Date: Wed, 24 Jul 2024 10:34:07 -0700 Subject: [PATCH 11/13] fix: use val instead of def --- .../scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala b/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala index b5ee6c127..891e93d40 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala @@ -313,7 +313,7 @@ class CollectDuplexSeqMetrics // Ensure the records are not consensus records _filteredIterator.headOption.foreach { rec => - def exceptionString = s"Input BAM file to CollectDuplexSeqMetrics ($input) appears to contain consensus sequences. " + + val exceptionString = s"Input BAM file to CollectDuplexSeqMetrics ($input) appears to contain consensus sequences. " + "CollectDuplexSeqMetrics cannot run on consensus BAMs, and instead requires the UMI-grouped BAM generated " + "prior to consensus calling. The UMI-grouped BAM is the output of running GroupReadsByUmi." + s"\nFirst record in $input has consensus SAM tags present:\n$rec" From ea80c6c8e480f5217a57062d7a1020826a5de3d4 Mon Sep 17 00:00:00 2001 From: znorgaard Date: Wed, 24 Jul 2024 10:39:28 -0700 Subject: [PATCH 12/13] refactor: rename isConsensusRead to isFgbioStyleConsensus --- .../com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala | 2 +- src/main/scala/com/fulcrumgenomics/umi/Umis.scala | 2 +- src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala b/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala index 891e93d40..30d807a7c 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala @@ -318,7 +318,7 @@ class CollectDuplexSeqMetrics "prior to consensus calling. The UMI-grouped BAM is the output of running GroupReadsByUmi." + s"\nFirst record in $input has consensus SAM tags present:\n$rec" - if (Umis.isConsensusRead(rec)) throw new IllegalArgumentException(exceptionString) + if (Umis.isFgbioStyleConsensus(rec)) throw new IllegalArgumentException(exceptionString) } val iterator = intervals match { case None => _filteredIterator diff --git a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala index a024dfc2f..63cb4e3f7 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala @@ -134,7 +134,7 @@ object Umis { * @param rec the record to test * @return boolean indicating if the record is a consensus or not */ - def isConsensusRead(rec: SamRecord): Boolean = { + def isFgbioStyleConsensus(rec: SamRecord): Boolean = { rec.contains(ConsensusTags.PerRead.RawReadCount) || (rec.contains(ConsensusTags.PerRead.AbRawReadCount) && rec.contains(ConsensusTags.PerRead.BaRawReadCount)) } diff --git a/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala b/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala index 8ffeffcc6..d469aa168 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala @@ -114,17 +114,17 @@ class UmisTest extends UnitSpec with OptionValues { "Umis.isConsensusRead" should "return false for reads without consensus tags" in { val builder = new SamBuilder(sort=Some(SamOrder.Coordinate), readLength=10, baseQuality=20) - builder.addFrag(start=100).exists(Umis.isConsensusRead) shouldBe false - builder.addPair(start1=100, start2=100, unmapped2=true).exists(Umis.isConsensusRead) shouldBe false + builder.addFrag(start=100).exists(Umis.isFgbioStyleConsensus) shouldBe false + builder.addPair(start1=100, start2=100, unmapped2=true).exists(Umis.isFgbioStyleConsensus) shouldBe false } it should "return true for reads with consensus tags" in { val builder = new SamBuilder(sort=Some(SamOrder.Coordinate), readLength=10, baseQuality=20) builder.addFrag(start=10, attrs=Map(ConsensusTags.PerRead.RawReadCount -> 10)) - .exists(Umis.isConsensusRead) shouldBe true + .exists(Umis.isFgbioStyleConsensus) shouldBe true builder.addFrag( start=10, attrs=Map(ConsensusTags.PerRead.AbRawReadCount -> 10, ConsensusTags.PerRead.BaRawReadCount -> 10) - ).exists(Umis.isConsensusRead) shouldBe true + ).exists(Umis.isFgbioStyleConsensus) shouldBe true } } From c3aa60cb1c6a17c05053969454ea9ad671d397f2 Mon Sep 17 00:00:00 2001 From: znorgaard Date: Thu, 25 Jul 2024 08:12:28 -0700 Subject: [PATCH 13/13] docs: improve doc string accuracy --- .../com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala | 4 ++-- src/main/scala/com/fulcrumgenomics/umi/Umis.scala | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala b/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala index 30d807a7c..8e2faa94a 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala @@ -315,8 +315,8 @@ class CollectDuplexSeqMetrics rec => val exceptionString = s"Input BAM file to CollectDuplexSeqMetrics ($input) appears to contain consensus sequences. " + "CollectDuplexSeqMetrics cannot run on consensus BAMs, and instead requires the UMI-grouped BAM generated " + - "prior to consensus calling. The UMI-grouped BAM is the output of running GroupReadsByUmi." + - s"\nFirst record in $input has consensus SAM tags present:\n$rec" + "by GroupReadsByUmi which is run prior to consensus calling." + + "\nFirst record in $input has consensus SAM tags present:\n$rec" if (Umis.isFgbioStyleConsensus(rec)) throw new IllegalArgumentException(exceptionString) } diff --git a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala index 63cb4e3f7..ed5512567 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala @@ -129,9 +129,10 @@ object Umis { ch == 'A' || ch == 'C' || ch == 'G' || ch == 'T' || ch == 'N' || ch == '-' } - /** Tests if a record is a consensus or not + /** Returns True if the record appears to be a consensus read, + * typically produced by fgbio's CallMolecularConsensusReads or CallDuplexConsensusReads. * - * @param rec the record to test + * @param rec the record to check * @return boolean indicating if the record is a consensus or not */ def isFgbioStyleConsensus(rec: SamRecord): Boolean = {