fulcrumgenomics · tfenne · Feb 20, 2022 · Feb 17, 2022 · Feb 17, 2022 · Feb 17, 2022
diff --git a/project/plugins.sbt b/project/plugins.sbt
@@ -4,7 +4,7 @@ addDependencyTreePlugin
 
 addSbtPlugin("com.typesafe.sbt"  % "sbt-git"       % "1.0.0")
 addSbtPlugin("com.github.gseitz" % "sbt-release"   % "1.0.10")
-addSbtPlugin("com.eed3si9n"      % "sbt-assembly"  % "0.15.0")
+addSbtPlugin("com.eed3si9n"      % "sbt-assembly"  % "1.2.0")
 addSbtPlugin("org.scoverage"     % "sbt-scoverage" % "1.6.1")
 addSbtPlugin("org.xerial.sbt"    % "sbt-sonatype"  % "3.9.4")
 addSbtPlugin("com.jsuereth"      % "sbt-pgp"       % "2.0.1")

diff --git a/src/main/scala/com/fulcrumgenomics/umi/CorrectUmis.scala b/src/main/scala/com/fulcrumgenomics/umi/CorrectUmis.scala
@@ -185,7 +185,10 @@ class CorrectUmis
         case Some(umi: String) =>
           val sequences = umi.split('-')
           if (sequences.exists(_.length != umiLength)) {
-            if (wrongLengthRecords == 0) logger.warning(s"Read (${rec.name}) detected with unexpected length UMI(s): ${sequences.mkString(" ")}")
+            if (wrongLengthRecords == 0) {
+              logger.warning(s"Read (${rec.name}) detected with unexpected length UMI(s): ${sequences.mkString(" ")}.")
+              logger.warning(s"Expected UMI length: ${umiLength}")
+            }
             wrongLengthRecords += 1
             rejectOut.foreach(w => w += rec)
           }

diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
@@ -115,6 +115,12 @@ object GroupReadsByUmi {
     /** Take in a sequence of UMIs and assign each UMI to a unique UMI group ID. */
     def assign(rawUmis: Seq[Umi]) : Map[Umi, MoleculeId]
 
+    /** Returns true if the two UMIs are the same. */
+    def isSameUmi(a: Umi, b: Umi): Boolean = a == b
+
+    /** Returns a canonical form of the UMI that is the same for all reads with the same UMI. */
+    def canonicalize(u: Umi): Umi = u
+
     /** Default implementation of a method to retrieve the next ID based on a counter. */
     protected def nextId: MoleculeId = this.counter.getAndIncrement().toString
 
@@ -259,13 +265,35 @@ object GroupReadsByUmi {
     /** String that is prefixed onto the UMI from the read with that maps to a higher coordinate in the genome.. */
     private[umi] val higherReadUmiPrefix: String = ("b" * (maxMismatches+1)) + ":"
 
+
+    /** Returns true if the two UMIs are the same. */
+    final override def isSameUmi(a: Umi, b: Umi): Boolean = {
+      if (a == b) true else {
+        val (a1, a2) = split(a)
+        val (b1, b2) = split(b)
+        a1 == b2 && a2 == b1
+      }
+    }
+
+    /** Returns the UMI with the lexically lower half first. */
+    override def canonicalize(u: Umi): Umi = {
+      val (a, b) = split(u)
+      if (a < b) u else s"${b}-${a}"
+    }
+
+    /** Splits the paired UMI into it's two parts. */
+    @inline private def split(umi: Umi): (Umi, Umi) = {
+      val index = umi.indexOf('-')
+      if (index == -1) throw new IllegalStateException(s"UMI $umi is not a paired UMI.")
+      val first  = umi.substring(0, index)
+      val second = umi.substring(index+1, umi.length)
+      (first, second)
+    }
+
     /** Takes a UMI of the form "A-B" and returns "B-A". */
-    def reverse(umi: Umi): Umi = umi.indexOf('-') match {
-      case -1 => throw new IllegalStateException(s"UMI $umi is not a paired UMI.")
-      case i  =>
-        val first  = umi.substring(0, i)
-        val second = umi.substring(i+1, umi.length)
-        second + '-' + first
+    def reverse(umi: Umi): Umi = {
+      val (first, second) = split(umi)
+      s"${second}-${first}"
     }
 
     /** Turns each UMI into the lexically earlier of A-B or B-A and then counts them. */
@@ -441,6 +469,17 @@ class GroupReadsByUmi
 
   private val assigner = strategy.newStrategy(this.edits)
 
+  /** True if no differences in UMIs are tolerated and the Molecular ID tag is MI, false otherwise. True here enables
+    * an optimization where, when bringing groups of reads into memory, we can _also_ group by UMI thus
+    * reducing the number in memory.  This is helpful since edits=0 is often used for data that has
+    * high numbers of reads with the same start/stop coordinates.
+    * We do this be setting the MI tag to the canonicalized, (optionally truncated) UMI prior to sorting, so that
+    * reads with the same UMI are grouped together in the sorted stream of records.
+    */
+  private val canTakeNextGroupByUmi =
+    (this.assignTag == ConsensusTags.MolecularId) &&
+    (this.edits == 0 || this.strategy == Strategy.Identity)
+
   /** Checks that the read's mapq is over a minimum, and if the read is paired, that the mate mapq is also over the min. */
   private def mapqOk(rec: SamRecord, minMapQ: Int): Boolean = {
     val mateMqOk = if (rec.unpaired) true else rec.get[Int](SAMTag.MQ.name()) match {
@@ -480,7 +519,24 @@ class GroupReadsByUmi
           }
         } || { filterUmisTooShort += 1; false}
       }
-      .foreach(r => { sorter += r; kept += 1; sortProgress.record(r) })
+      .foreach { r =>
+        // If we're able to also group by the UMI because edits aren't allowed, push the trimmed, canonicalized UMI
+        // into the assign tag (which must be MI if cantakeNextGroupByUmi is true), since that is used by the
+        // SamOrder to sort the reads _and_ we'll overwrite it on the way out!
+        if (this.canTakeNextGroupByUmi) {
+          val umi = this.assigner.canonicalize(r[String](rawTag).toUpperCase)
+          val truncated = this.minUmiLength match {
+            case None    => umi
+            case Some(n) => umi.substring(0, n)
+          }
+
+          r(this.assignTag) = truncated
+        }
+
+        sorter += r
+        kept += 1
+        sortProgress.record(r)
+      }
 
     logger.info(f"Accepted $kept%,d reads for grouping.")
     if (filteredNonPf > 0) logger.info(f"Filtered out $filteredNonPf%,d non-PF reads.")
@@ -531,11 +587,22 @@ class GroupReadsByUmi
 
   /** Consumes the next group of templates with all matching end positions and returns them. */
   def takeNextGroup(iterator: BufferedIterator[Template]) : Seq[Template] = {
-    val first = iterator.next()
+    val first     = iterator.next()
     val firstEnds = ReadInfo(first.r1.getOrElse(fail(s"R1 missing for template ${first.name}")))
-    val buffer = ListBuffer[Template]()
-    while (iterator.hasNext && firstEnds == ReadInfo(iterator.head.r1.get)) buffer += iterator.next()
-    first :: buffer.toList
+    val firstUmi  = first.r1.get.apply[String](this.assignTag)
+    val builder   = IndexedSeq.newBuilder[Template]
+    builder    += first
+
+    while (
+      iterator.hasNext &&
+      firstEnds == ReadInfo(iterator.head.r1.get) &&
+      // This last condition only works because we put a canonicalized UMI into rec(assignTag) if canTakeNextGroupByUmi
+      (!canTakeNextGroupByUmi || firstUmi == iterator.head.r1.get.apply[String](this.assignTag))
+    ) {
+      builder += iterator.next()
+    }
+
+    builder.result()
   }
 
   /**

diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
@@ -348,7 +348,7 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
 
       val in   = builder.toTempFile()
       val out  = Files.createTempFile("umi_grouped.", ".sam")
-      new GroupReadsByUmi(input=in, output=out, familySizeHistogram=None, rawTag="RX", assignTag="MI", strategy=Strategy.Identity, edits=0, minUmiLength=Some(6)).execute()
+      new GroupReadsByUmi(input=in, output=out, familySizeHistogram=None, rawTag="RX", assignTag="MI", strategy=strategy, edits=0, minUmiLength=Some(6)).execute()
 
       val recs = readBamRecs(out)
       recs should have length 2
@@ -363,7 +363,7 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
 
       val in   = builder.toTempFile()
       val out  = Files.createTempFile("umi_grouped.", ".sam")
-      new GroupReadsByUmi(input=in, output=out, familySizeHistogram=None, rawTag="RX", assignTag="MI", strategy=Strategy.Identity, edits=0, minUmiLength=Some(5)).execute()
+      new GroupReadsByUmi(input=in, output=out, familySizeHistogram=None, rawTag="RX", assignTag="MI", strategy=strategy, edits=0, minUmiLength=Some(5)).execute()
 
       val recs = readBamRecs(out)
       recs should have length 4