From 33ff57b08b80fe0974ee0623c8a107c04b6f696a Mon Sep 17 00:00:00 2001
From: Nils Homer <nilshomer@gmail.com>
Date: Wed, 2 Mar 2022 20:16:57 -0700
Subject: [PATCH] fixes to the optimized path

---
 .../fulcrumgenomics/umi/GroupReadsByUmi.scala | 31 +++++++++++--------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
index 3fa2513b6..b3fa0a305 100644
--- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
+++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
@@ -473,16 +473,6 @@ class GroupReadsByUmi
 
   private val assigner = strategy.newStrategy(this.edits)
 
-  /** True if no differences in UMIs are tolerated and the Molecular ID tag is MI, false otherwise. True here enables
-    * an optimization where, when bringing groups of reads into memory, we can _also_ group by UMI thus
-    * reducing the number of reads in memory.  This is helpful since edits=0 is often used for data that has
-    * high numbers of reads with the same start/stop coordinates.
-    * We do this by setting the MI tag to the canonicalized (optionally truncated) UMI prior to sorting, so that
-    * reads with the same UMI are grouped together in the sorted stream of records.
-    */
-  private val canTakeNextGroupByUmi =
-    (this.assignTag == ConsensusTags.MolecularId) &&
-    (this.edits == 0 || this.strategy == Strategy.Identity)
 
   /** Checks that the read's mapq is over a minimum, and if the read is paired, that the mate mapq is also over the min. */
   private def mapqOk(rec: SamRecord, minMapQ: Int): Boolean = {
@@ -506,6 +496,21 @@ class GroupReadsByUmi
     val header      = in.header
     val skipSorting = SamOrder(header).contains(TemplateCoordinate)
 
+    // True if the input will be sorted and no differences in UMIs are tolerated and the Molecular ID tag is MI,
+    // false otherwise.  Pre-sorted (TemplateCoordinate sorted) input does not enable this optimization as we rely on
+    // copying the `RX` tag into the `MI` tag so that same raw UMIs are grouped together in the sorted stream of records.
+    //
+    // True here enables an optimization where, when bringing groups of reads into memory, we can _also_ group by UMI
+    // thus reducing the number of reads in memory.  This is helpful since edits=0 is often used for data that has
+    // high numbers of reads with the same start/stop coordinates.
+    // We do this by setting the MI tag to the canonicalized (optionally truncated) UMI prior to sorting, so that
+    // reads with the same UMI are grouped together in the sorted stream of records.
+    val canTakeNextGroupByUmi = {
+      !skipSorting &&
+      (this.assignTag == ConsensusTags.MolecularId) &&
+        (this.edits == 0 || this.strategy == Strategy.Identity)
+    }
+
     // Filter and sort the input BAM file
     logger.info("Filtering the input.")
     val filteredIterator = in.iterator
@@ -528,7 +533,7 @@ class GroupReadsByUmi
         // SamOrder to sort the reads _and_ we'll overwrite it on the way out!
         // Note that here we trim UMIs (if enabled) to the minimum UMI length for sorting, but that when doing the
         // actual grouping later we go back to the raw tag (RX) and use as much of the UMI as possible.
-        if (this.canTakeNextGroupByUmi) {
+        if (canTakeNextGroupByUmi) {
           val umi = this.assigner.canonicalize(r[String](rawTag).toUpperCase)
           val truncated = this.minUmiLength match {
             case None    => umi
@@ -571,7 +576,7 @@ class GroupReadsByUmi
     logger.info("Assigning reads to UMIs and outputting.")
     while (templateCoordinateIterator.hasNext) {
       // Take the next set of templates by position and assign UMIs
-      val templates = takeNextGroup(templateCoordinateIterator)
+      val templates = takeNextGroup(templateCoordinateIterator, canTakeNextGroupByUmi=canTakeNextGroupByUmi)
       assignUmiGroups(templates)
 
       // Then output the records in the right order (assigned tag, read name, r1, r2)
@@ -612,7 +617,7 @@ class GroupReadsByUmi
   }
 
   /** Consumes the next group of templates with all matching end positions and returns them. */
-  def takeNextGroup(iterator: BufferedIterator[Template]) : Seq[Template] = {
+  def takeNextGroup(iterator: BufferedIterator[Template], canTakeNextGroupByUmi: Boolean) : Seq[Template] = {
     val first     = iterator.next()
     val firstEnds = ReadInfo(first.r1.getOrElse(fail(s"R1 missing for template ${first.name}")))
     val firstUmi  = first.r1.get.apply[String](this.assignTag)