Make GroupReadsByUmi significantly more permissive:

- Make allowing inter-contig reads the default - Allow through read pairs where one end is mapped and the other is unmapped
fulcrumgenomics · Mar 1, 2022 · 0eb2490 · 0eb2490
1 parent 2278509
commit 0eb2490
Show file tree

Hide file tree

Showing 5 changed files with 120 additions and 58 deletions.
diff --git a/src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala b/src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala
@@ -65,6 +65,13 @@ class TransientAttrs(private val rec: SamRecord) {
     case null => default
     case value => value.asInstanceOf[A]
   }
+  def getOrElseUpdate[A](key: Any, default: => A): A = rec.asSam.getTransientAttribute(key) match {
+    case null  =>
+      val value: A = default
+      update(key, value)
+      value
+    case value => value.asInstanceOf[A]
+  }
 }
 
 /**

diff --git a/src/main/scala/com/fulcrumgenomics/umi/CallMolecularConsensusReads.scala b/src/main/scala/com/fulcrumgenomics/umi/CallMolecularConsensusReads.scala
@@ -27,7 +27,7 @@ package com.fulcrumgenomics.umi
 
 import com.fulcrumgenomics.FgBioDef._
 import com.fulcrumgenomics.bam.api.{SamOrder, SamSource, SamWriter}
-import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
+import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioMain, FgBioTool}
 import com.fulcrumgenomics.commons.io.Io
 import com.fulcrumgenomics.commons.util.{LazyLogging, LogLevel, Logger}
 import com.fulcrumgenomics.sopt._

diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
@@ -65,9 +65,22 @@ object GroupReadsByUmi {
   private val ReadInfoTempAttributeName = "__GRBU_ReadInfo"
 
   /** A case class to represent all the information we need to order reads for duplicate marking / grouping. */
-  case class ReadInfo(refIndex: Int, start1: Int, start2: Int, strand1: Boolean, strand2: Boolean, library: String)
+  case class ReadInfo(refIndex1: Int,
+                      start1: Int,
+                      strand1: Byte,
+                      refIndex2: Int,
+                      start2: Int,
+                      strand2: Byte,
+                      library: String)
 
   object ReadInfo {
+    // Use the Max possible value for ref/pos/strand when we have unmapped reads to mirror what's done
+    // in the TemplateCoordinate sort order where we sort by the _lower_ of the two mates' positions
+    // and want to use the mapped position when one mate is unmapped
+    private val UnknownRef    = Int.MaxValue
+    private val UnknownPos    = Int.MaxValue
+    private val UnknownStrand = Byte.MaxValue
+
     /** Looks in all the places the library name can be hiding. Returns the library name
       * if one is found, otherwise returns "unknown".
       */
@@ -76,36 +89,57 @@ object GroupReadsByUmi {
       if (rg != null && rg.getLibrary != null) rg.getLibrary else "unknown"
     }
 
-    /** Creates/retrieves a ReadEnds object from a SamRecord and stores it in a temporary attribute for later user. */
-    def apply(rec: SamRecord) : ReadInfo = {
-      val tmp = rec.transientAttrs[ReadInfo](GroupReadsByUmi.ReadInfoTempAttributeName)
-      if (tmp != null) {
-        tmp
-      }
-      else {
-        val lib       = library(rec)
-        val chrom     = rec.refIndex
-        val mateChrom = rec.mateRefIndex
-        val recNeg    = rec.negativeStrand
-        val recPos    = if (recNeg) rec.unclippedEnd else rec.unclippedStart
-
-        val (mateNeg, matePos) = if (!rec.paired) (false, Int.MaxValue) else {
-          val neg = rec.mateNegativeStrand
-          val pos = if (neg) SAMUtils.getMateUnclippedEnd(rec.asSam) else SAMUtils.getMateUnclippedStart(rec.asSam)
-          (neg, pos)
+    /** Extract a ReadInfo from a SamRecord; mate cigar must be present if a mate exists and is mapped. */
+    def apply(rec: SamRecord) : ReadInfo =
+      rec.transientAttrs.getOrElseUpdate[ReadInfo](GroupReadsByUmi.ReadInfoTempAttributeName, {
+        val lib = library(rec)
+        val (ref1, start1, strand1) = positionOf(rec)
+        val (ref2, start2, strand2) = if (rec.unpaired || rec.mateUnmapped) (UnknownRef, UnknownPos, UnknownStrand) else {
+          val start = (if (rec.matePositiveStrand) rec.mateUnclippedStart else rec.mateUnclippedEnd)
+            .getOrElse(throw new IllegalStateException(s"Missing mate cigar tag (MC) for read $rec."))
+          (rec.mateRefIndex, start, strandToByte(rec.matePositiveStrand))
         }
 
-        val result = if (chrom < mateChrom || (chrom == mateChrom && (recPos < matePos || (recPos == matePos && !recNeg)))) {
-          new ReadInfo(chrom, recPos, matePos, recNeg, mateNeg, lib)
-        }
-        else {
-          new ReadInfo(mateChrom, matePos, recPos, mateNeg, recNeg, lib)
-        }
+        val r1Earlier =
+          (ref1 < ref2) ||
+          (ref1 == ref2 && start1 < start2) ||
+          (ref1 == ref2 && start1 == start2 && strand1 < strand2)
+
+        if (r1Earlier) new ReadInfo(ref1, start1, strand1, ref2, start2, strand2, lib)
+        else           new ReadInfo(ref2, start2, strand2, ref1, start1, strand1, lib)
+      })
 
-        rec.transientAttrs(GroupReadsByUmi.ReadInfoTempAttributeName, result)
-        result
+
+    /** Extract a ReadInfo from a Template object.  R1 primary must be present. */
+    def apply(t: Template): ReadInfo = {
+      val r1 = t.r1.getOrElse(throw new IllegalStateException(s"${t.name} did not have a primary R1 record."))
+      val r2 = t.r2
+
+      r1.transientAttrs.getOrElseUpdate[ReadInfo](GroupReadsByUmi.ReadInfoTempAttributeName, {
+        val lib                     = library(r1)
+        val (ref1, start1, strand1) = positionOf(r1)
+        val (ref2, start2, strand2) = r2.map(positionOf).getOrElse((UnknownRef, UnknownPos, UnknownStrand))
+
+        val r1Earlier =
+          (ref1 < ref2) ||
+          (ref1 == ref2 && start1 < start2) ||
+          (ref1 == ref2 && start1 == start2 && strand1 < strand2)
+
+        if (r1Earlier) new ReadInfo(ref1, start1, strand1, ref2, start2, strand2, lib)
+        else           new ReadInfo(ref2, start2, strand2, ref1, start1, strand1, lib)
+      })
+    }
+
+    /** Extracts the refIndex, unclipped 5' end and strand of the read, or Unknown values for unmapped reads. */
+    private def positionOf(r: SamRecord): (Int, Int, Byte) = {
+      if (r.unmapped) (UnknownRef, UnknownPos, UnknownStrand) else {
+        val start = if (r.positiveStrand) r.unclippedStart else r.unclippedEnd
+        (r.refIndex, start, strandToByte(r.positiveStrand))
       }
     }
+
+    // Encodes a known strand into a byte value
+    @inline private def strandToByte(positive: Boolean): Byte = if (positive) 0 else 1
   }
 
   /** Trait that can be implemented to provide a UMI assignment strategy. */
@@ -405,20 +439,11 @@ object Strategy extends FgBioEnum[Strategy] {
     |   3. The assigned UMI tag
     |   4. Read Name
     |
-    |Reads are aggressively filtered out so that only high quality reads/mappings are taken forward. Single-end
-    |reads must have mapping quality >= `min-map-q`.  Paired-end reads must both have mapping quality >= `min-mapq`
-    |(Note: the `MQ` tag is required on reads with mapped mates). By default, paired-end reads must have both reads
-    |mapped to the same chromosome (to turn off this filter, use `--allow-inter-contig`).
-    |
-    |This is done with the expectation that the next step is building consensus reads, where
-    |it is undesirable to either:
-    |
-    |   1. Assign reads together that are really from different source molecules
-    |   2. Build two groups from reads that are really from the same molecule
+    |During grouping, reads are filtered out if a) all reads with the same queryname are unmapped, b) any primary
+    |read has mapping quality < `min-map-q` (default=1), or c) the primary mappings for R1 and R2 are on different
+    |chromosomes and `--allow-inter-contig` has been set to false.
     |
-    |Errors in mapping reads could lead to both and therefore are minimized.
-    |
-    |Grouping of UMIs is performed by one of three strategies:
+    |Grouping of UMIs is performed by one of four strategies:
     |
     |1. **identity**:  only reads with identical UMI sequences are grouped together. This strategy
     |                  may be useful for evaluating data, but should generally be avoided as it will
@@ -452,17 +477,18 @@ class GroupReadsByUmi
   @arg(flag='f', doc="Optional output of tag family size counts.") val familySizeHistogram: Option[FilePath] = None,
   @arg(flag='t', doc="The tag containing the raw UMI.")  val rawTag: String    = "RX",
   @arg(flag='T', doc="The output tag for UMI grouping.") val assignTag: String = "MI",
-  @arg(flag='m', doc="Minimum mapping quality.")         val minMapQ: Int      = 30,
+  @arg(flag='m', doc="Minimum mapping quality for mapped reads.")         val minMapQ: Int      = 1,
   @arg(flag='n', doc="Include non-PF reads.")            val includeNonPfReads: Boolean = false,
-  @arg(flag='s', doc="The UMI assignment strategy.") val strategy: Strategy,
+  @arg(flag='s', doc="The UMI assignment strategy.")     val strategy: Strategy,
   @arg(flag='e', doc="The allowable number of edits between UMIs.") val edits: Int = 1,
   @arg(flag='l', doc= """The minimum UMI length. If not specified then all UMIs must have the same length,
                        |otherwise discard reads with UMIs shorter than this length and allow for differing UMI lengths.
                        |""")
   val minUmiLength: Option[Int] = None,
-  @arg(flag='x', doc= """Allow read pairs with primary alignments on different contigs to be grouped when using the
-                       |paired assigner (otherwise filtered out).""")
-  val allowInterContig: Boolean = false
+  @arg(flag='x', doc= """
+                       |DEPRECATED: this option will be removed in future versions and inter-contig reads will be
+                       |automatically processed.""")
+  @deprecated val allowInterContig: Boolean = true
 )extends FgBioTool with LazyLogging {
   import GroupReadsByUmi._
 
@@ -483,12 +509,18 @@ class GroupReadsByUmi
 
   /** Checks that the read's mapq is over a minimum, and if the read is paired, that the mate mapq is also over the min. */
   private def mapqOk(rec: SamRecord, minMapQ: Int): Boolean = {
-    val mateMqOk = if (rec.unpaired) true else rec.get[Int](SAMTag.MQ.name()) match {
-      case None     => fail(s"Mate mapping quality (MQ) tag not present on read ${rec.name}.")
-      case Some(mq) => mq >= minMapQ
+    if (rec.mapped && rec.mapq < minMapQ) {
+      false
+    }
+    else if (rec.unpaired || rec.mateUnmapped) {
+      true
+    }
+    else {
+      rec.get[Int](SAMTag.MQ.name()) match {
+        case None     => fail(s"Mate mapping quality (MQ) tag not present on read ${rec.name}.")
+        case Some(mq) => mq >= minMapQ
+      }
     }
-
-    rec.mapq >= minMapQ && mateMqOk
   }
 
   override def execute(): Unit = {
@@ -509,7 +541,7 @@ class GroupReadsByUmi
     in.iterator
       .filter(r => !r.secondary && !r.supplementary)
       .filter(r => (includeNonPfReads || r.pf)                                      || { filteredNonPf += 1; false })
-      .filter(r => (r.mapped && (r.unpaired || r.mateMapped))                       || { filteredPoorAlignment += 1; false })
+      .filter(r => (r.mapped || (r.paired && r.mateMapped))                         || { filteredPoorAlignment += 1; false })
       .filter(r => (allowInterContig || r.unpaired || r.refIndex == r.mateRefIndex) || { filteredPoorAlignment += 1; false })
       .filter(r => mapqOk(r, this.minMapQ)                                          || { filteredPoorAlignment += 1; false })
       .filter(r => !r.get[String](rawTag).exists(_.contains('N'))                   || { filteredNsInUmi += 1; false })
@@ -591,14 +623,14 @@ class GroupReadsByUmi
   /** Consumes the next group of templates with all matching end positions and returns them. */
   def takeNextGroup(iterator: BufferedIterator[Template]) : Seq[Template] = {
     val first     = iterator.next()
-    val firstEnds = ReadInfo(first.r1.getOrElse(fail(s"R1 missing for template ${first.name}")))
+    val firstEnds = ReadInfo(first)
     val firstUmi  = first.r1.get.apply[String](this.assignTag)
     val builder   = IndexedSeq.newBuilder[Template]
     builder    += first
 
     while (
       iterator.hasNext &&
-      firstEnds == ReadInfo(iterator.head.r1.get) &&
+      firstEnds == ReadInfo(iterator.head) &&
       // This last condition only works because we put a canonicalized UMI into rec(assignTag) if canTakeNextGroupByUmi
       (!canTakeNextGroupByUmi || firstUmi == iterator.head.r1.get.apply[String](this.assignTag))
     ) {

diff --git a/src/main/scala/com/fulcrumgenomics/umi/UmiConsensusCaller.scala b/src/main/scala/com/fulcrumgenomics/umi/UmiConsensusCaller.scala
@@ -334,7 +334,7 @@ trait UmiConsensusCaller[ConsensusRead <: SimpleRead] {
     *
     * NOTE: filtered out reads are sent to the [[rejectRecords]] method and do not need further handling
     */
-  protected[umi] def filterToMostCommonAlignment(recs: Seq[SourceRead]): Seq[SourceRead] = {
+  protected[umi] def filterToMostCommonAlignment(recs: Seq[SourceRead]): Seq[SourceRead] = if (recs.size < 2) recs else {
     val groups = new ArrayBuffer[AlignmentGroup]
     val sorted = recs.sortBy(r => -r.length).toIndexedSeq
 

diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
@@ -199,28 +199,51 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
     templates.map(t => tool invokePrivate umiForRead(t)) should contain theSameElementsInOrderAs expected
   }
 
+  "GroupReads.ReadInfo" should "extract the same ReadEnds from a Template as from an R1 with mate cigar" in {
+    val builder = new SamBuilder(readLength=100, sort=None)
+    val Seq(r1, r2) = builder.addPair(contig=2, contig2=Some(1), start1=300, start2=400, cigar1="10S90M", cigar2="90M10S")
+    val template    = Template(r1=Some(r1), r2=Some(r2))
+
+    val tReadInfo = ReadInfo(template)
+    val rReadInfo = ReadInfo(r1)
+    rReadInfo shouldBe tReadInfo
+
+    tReadInfo.refIndex1 shouldBe 1
+    tReadInfo.start1    shouldBe 499
+    tReadInfo.refIndex2 shouldBe 2
+    tReadInfo.start2    shouldBe 290
+  }
+
   // Test for running the GroupReadsByUmi command line program with some sample input
   "GroupReadsByUmi" should "group reads correctly" in {
     val builder = new SamBuilder(readLength=100, sort=Some(SamOrder.Coordinate))
     builder.addPair(name="a01", start1=100, start2=300, attrs=Map("RX" -> "AAAAAAAA"))
     builder.addPair(name="a02", start1=100, start2=300, attrs=Map("RX" -> "AAAAgAAA"))
     builder.addPair(name="a03", start1=100, start2=300, attrs=Map("RX" -> "AAAAAAAA"))
     builder.addPair(name="a04", start1=100, start2=300, attrs=Map("RX" -> "AAAAAAAt"))
-    builder.addPair(name="a05", start1=100, start2=300, unmapped2=true, attrs=Map("RX" -> "AAAAAAAt"))
-    builder.addPair(name="a06", start1=100, start2=300, mapq1=5)
+    builder.addPair(name="b01", start1=100, start2=100, unmapped2=true, attrs=Map("RX" -> "AAAAAAAt"))
+    builder.addPair(name="c01", start1=100, start2=300, mapq1=5)
 
     val in  = builder.toTempFile()
     val out = Files.createTempFile("umi_grouped.", ".sam")
     val hist = Files.createTempFile("umi_grouped.", ".histogram.txt")
-    new GroupReadsByUmi(input=in, output=out, familySizeHistogram=Some(hist), rawTag="RX", assignTag="MI", strategy=Strategy.Edit, edits=1).execute()
+    new GroupReadsByUmi(input=in, output=out, familySizeHistogram=Some(hist), rawTag="RX", assignTag="MI", strategy=Strategy.Edit, edits=1, minMapQ=30).execute()
 
     val groups = readBamRecs(out).groupBy(_.name.charAt(0))
 
-    // Group A: Read 5 out for unmapped mate, 6 out for low mapq, 1-4 all passed through into one umi group
+    // Group A: 1-4 all passed through into one umi group
     groups('a') should have size 4*2
     groups('a').map(_.name).toSet shouldEqual Set("a01", "a02", "a03", "a04")
     groups('a').map(r => r[String]("MI")).toSet should have size 1
 
+    // 5 separated out into another group due to unmapped mate
+    groups('b') should have size 2
+    groups('b').map(r => r[String]("MI")).toSet should have size 1
+    groups('b').map(r => r[String]("MI")).head should not be groups('a').map(r => r[String]("MI")).head
+
+    // 6 out for low mapq,
+    groups.contains('c') shouldBe false
+
     hist.toFile.exists() shouldBe true
   }