Take 2 at fixing the problem.

fulcrumgenomics · Feb 17, 2022 · 771c87a · 771c87a
1 parent bcd8ff9
commit 771c87a
Show file tree

Hide file tree

Showing 3 changed files with 38 additions and 10 deletions.
diff --git a/src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala b/src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala
@@ -185,7 +185,7 @@ object SamOrder {
       val mid       = rec.get[String](ConsensusTags.MolecularId).map { m =>
         val index: Int = m.lastIndexOf('/')
         if (index >= 0) m.substring(0, index) else m
-      }.getOrElse(rec.get[String](ConsensusTags.UmiBases).getOrElse(""))
+      }.getOrElse("")
 
       if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) ||
            (readChrom == mateChrom && readPos == matePos && !readNeg)) {

diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
@@ -118,6 +118,9 @@ object GroupReadsByUmi {
     /** Returns true if the two UMIs are the same. */
     def isSameUmi(a: Umi, b: Umi): Boolean = a == b
 
+    /** Returns a canonical form of the UMI that is the same for all reads with the same UMI. */
+    def canonicalize(u: Umi): Umi = u
+
     /** Default implementation of a method to retrieve the next ID based on a counter. */
     protected def nextId: MoleculeId = this.counter.getAndIncrement().toString
 
@@ -272,6 +275,12 @@ object GroupReadsByUmi {
       }
     }
 
+    /** Returns the UMI with the lexically lower half first. */
+    override def canonicalize(u: Umi): Umi = {
+      val (a, b) = split(u)
+      if (a < b) u else s"${b}-${a}"
+    }
+
     /** Splits the paired UMI into it's two parts. */
     @inline private def split(umi: Umi): (Umi, Umi) = {
       val index = umi.indexOf('-')
@@ -460,15 +469,16 @@ class GroupReadsByUmi
 
   private val assigner = strategy.newStrategy(this.edits)
 
-  /** True if no differences in UMIs are tolerated and the UMI tag is RX, false otherwise. True here enables
+  /** True if no differences in UMIs are tolerated and the Molecular ID tag is MI, false otherwise. True here enables
     * an optimization where, when bringing groups of reads into memory, we can _also_ group by UMI thus
     * reducing the number in memory.  This is helpful since edits=0 is often used for data that has
     * high numbers of reads with the same start/stop coordinates.
+    * We do this by setting the MI tag to the canonicalized (and optionally truncated) UMI prior to sorting, so that
+    * reads with the same UMI are grouped together in the sorted stream of records.
     */
   private val canTakeNextGroupByUmi =
-    (this.rawTag == ConsensusTags.UmiBases) &&
-    (this.edits == 0 || this.strategy == Strategy.Identity) &&
-      this.minUmiLength.isEmpty
+    (this.assignTag == ConsensusTags.MolecularId) &&
+    (this.edits == 0 || this.strategy == Strategy.Identity)
 
   /** Checks that the read's mapq is over a minimum, and if the read is paired, that the mate mapq is also over the min. */
   private def mapqOk(rec: SamRecord, minMapQ: Int): Boolean = {
@@ -509,7 +519,24 @@ class GroupReadsByUmi
           }
         } || { filterUmisTooShort += 1; false}
       }
-      .foreach(r => { sorter += r; kept += 1; sortProgress.record(r) })
+      .foreach { r =>
+        // If we're able to also group by the UMI because edits aren't allowed, push the trimmed, canonicalized UMI
+        // into the assign tag (which must be MI if canTakeNextGroupByUmi is true), since that is used by the
+        // SamOrder to sort the reads _and_ we'll overwrite it on the way out!
+        if (this.canTakeNextGroupByUmi) {
+          val umi = this.assigner.canonicalize(r[String](rawTag).toUpperCase)
+          val truncated = this.minUmiLength match {
+            case None    => umi
+            case Some(n) => umi.substring(0, n)
+          }
+
+          r(this.assignTag) = truncated
+        }
+
+        sorter += r
+        kept += 1
+        sortProgress.record(r)
+      }
 
     logger.info(f"Accepted $kept%,d reads for grouping.")
     if (filteredNonPf > 0) logger.info(f"Filtered out $filteredNonPf%,d non-PF reads.")
@@ -562,14 +589,15 @@ class GroupReadsByUmi
   def takeNextGroup(iterator: BufferedIterator[Template]) : Seq[Template] = {
     val first     = iterator.next()
     val firstEnds = ReadInfo(first.r1.getOrElse(fail(s"R1 missing for template ${first.name}")))
-    val firstUmi  = first.r1.get.apply[String](this.rawTag)
+    val firstUmi  = first.r1.get.apply[String](this.assignTag)
     val builder   = IndexedSeq.newBuilder[Template]
     builder    += first
 
     while (
       iterator.hasNext &&
       firstEnds == ReadInfo(iterator.head.r1.get) &&
-      (!canTakeNextGroupByUmi || this.assigner.isSameUmi(firstUmi, iterator.head.r1.get.apply[String](rawTag)))
+      // This last condition only works because we put a canonicalized UMI into rec(assignTag) if canTakeNextGroupByUmi
+      (!canTakeNextGroupByUmi || firstUmi == iterator.head.r1.get.apply[String](this.assignTag))
     ) {
       builder += iterator.next()
     }

diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
@@ -348,7 +348,7 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
 
       val in   = builder.toTempFile()
       val out  = Files.createTempFile("umi_grouped.", ".sam")
-      new GroupReadsByUmi(input=in, output=out, familySizeHistogram=None, rawTag="RX", assignTag="MI", strategy=Strategy.Identity, edits=0, minUmiLength=Some(6)).execute()
+      new GroupReadsByUmi(input=in, output=out, familySizeHistogram=None, rawTag="RX", assignTag="MI", strategy=strategy, edits=0, minUmiLength=Some(6)).execute()
 
       val recs = readBamRecs(out)
       recs should have length 2
@@ -363,7 +363,7 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
 
       val in   = builder.toTempFile()
       val out  = Files.createTempFile("umi_grouped.", ".sam")
-      new GroupReadsByUmi(input=in, output=out, familySizeHistogram=None, rawTag="RX", assignTag="MI", strategy=Strategy.Identity, edits=0, minUmiLength=Some(5)).execute()
+      new GroupReadsByUmi(input=in, output=out, familySizeHistogram=None, rawTag="RX", assignTag="MI", strategy=strategy, edits=0, minUmiLength=Some(5)).execute()
 
       val recs = readBamRecs(out)
       recs should have length 4