From 3a74fd28ff630b417410953541ff107a1b7b0fb7 Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Fri, 9 Feb 2024 10:35:39 -0500 Subject: [PATCH] ZipperBams to produce mate score ("ms") for samtools markdup (#952) * ZipperBams to produce mate score ("ms") for samtools markdup Fixes: #951 --- .../scala/com/fulcrumgenomics/bam/Bams.scala | 7 ++- .../com/fulcrumgenomics/bam/BamsTest.scala | 44 ++++++++++++------- 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/bam/Bams.scala b/src/main/scala/com/fulcrumgenomics/bam/Bams.scala index 61d6d52aa..623b97d76 100644 --- a/src/main/scala/com/fulcrumgenomics/bam/Bams.scala +++ b/src/main/scala/com/fulcrumgenomics/bam/Bams.scala @@ -105,16 +105,21 @@ case class Template(r1: Option[SamRecord], Template(x1, x2) } - /** Fixes mate information and sets mate cigar on all primary and supplementary (but not secondary) records. */ + /** Fixes mate information and sets mate cigar and mate score on all primary and supplementary (but not secondary) records. */ def fixMateInfo(): Unit = { + // Developer note: the mate score ("ms") tag is used by samtools markdup for (primary <- r1; supp <- r2Supplementals) { SamPairUtil.setMateInformationOnSupplementalAlignment(supp.asSam, primary.asSam, true) + primary.get[Int]("AS").foreach(supp("ms") = _) } for (primary <- r2; supp <- r1Supplementals) { SamPairUtil.setMateInformationOnSupplementalAlignment(supp.asSam, primary.asSam, true) + primary.get[Int]("AS").foreach(supp("ms") = _) } for (first <- r1; second <- r2) { SamPairUtil.setMateInfo(first.asSam, second.asSam, true) + first.get[Int]("AS").foreach(second("ms") = _) + second.get[Int]("AS").foreach(first("ms") = _) } } diff --git a/src/test/scala/com/fulcrumgenomics/bam/BamsTest.scala b/src/test/scala/com/fulcrumgenomics/bam/BamsTest.scala index c6d1b699e..6e7a9e1ff 100644 --- a/src/test/scala/com/fulcrumgenomics/bam/BamsTest.scala +++ b/src/test/scala/com/fulcrumgenomics/bam/BamsTest.scala @@ -473,6 +473,7 @@ class BamsTest extends UnitSpec { builder.addPair(name="q1", contig=1, contig2=Some(2), start1=100, start2=200, cigar1="50M50S", cigar2="50S50M", mapq1=51, mapq2=52) builder.addPair(name="q1", contig=3, contig2=Some(4), start1=300, start2=400, cigar1="50S50M", cigar2="50M50S", mapq1=21, mapq2=22) .foreach(_.supplementary = true) + builder.addPair(name="q2", contig=1, contig2=Some(2), start1=100, start2=200, cigar1="50M50S", cigar2="50S50M", mapq1=51, mapq2=52, attrs=Map(("ms", 2))) val recs = builder.toIndexedSeq.tapEach { r => r.mateMapped = false @@ -482,21 +483,34 @@ class BamsTest extends UnitSpec { r.remove("MQ") } - val template = Template(recs.iterator) - template.fixMateInfo() - - template.allReads.foreach { r => - r.mateMapped shouldBe true - - if (r.firstOfPair) { - r.mateRefIndex shouldBe 2 - r.mateStart shouldBe 200 - r.mateCigar.value.toString() shouldBe "50S50M" - } - else { - r.mateRefIndex shouldBe 1 - r.mateStart shouldBe 100 - r.mateCigar.value.toString() shouldBe "50M50S" + val q1Template = Template(recs.iterator.filter(_.name == "q1")) + val q2Template = Template(recs.iterator.filter(_.name == "q2")) + val templates = Seq(q1Template, q2Template) + + templates.foreach { template => + template.fixMateInfo() + + template.allReads.foreach { r => + r.mateMapped shouldBe true + + if (r.firstOfPair) { + r.mateRefIndex shouldBe 2 + r.mateStart shouldBe 200 + r.mateCigar.value.toString() shouldBe "50S50M" + } + else { + r.mateRefIndex shouldBe 1 + r.mateStart shouldBe 100 + r.mateCigar.value.toString() shouldBe "50M50S" + } + + if (r.name == "q1") { + r.get[Int]("ms").isEmpty shouldBe true + } + else { + r.name shouldBe "q2" + r.get[Int]("ms").value shouldBe 2 + } } } }