Skip to content

Commit

Permalink
fixes to the optimized path
Browse files Browse the repository at this point in the history
  • Loading branch information
nh13 committed Mar 3, 2022
1 parent 8254206 commit 33ff57b
Showing 1 changed file with 18 additions and 13 deletions.
31 changes: 18 additions & 13 deletions src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
Original file line number Diff line number Diff line change
Expand Up @@ -473,16 +473,6 @@ class GroupReadsByUmi

private val assigner = strategy.newStrategy(this.edits)

/** True if no differences in UMIs are tolerated and the Molecular ID tag is MI, false otherwise. True here enables
* an optimization where, when bringing groups of reads into memory, we can _also_ group by UMI thus
* reducing the number of reads in memory. This is helpful since edits=0 is often used for data that has
* high numbers of reads with the same start/stop coordinates.
* We do this by setting the MI tag to the canonicalized (optionally truncated) UMI prior to sorting, so that
* reads with the same UMI are grouped together in the sorted stream of records.
*/
private val canTakeNextGroupByUmi =
(this.assignTag == ConsensusTags.MolecularId) &&
(this.edits == 0 || this.strategy == Strategy.Identity)

/** Checks that the read's mapq is over a minimum, and if the read is paired, that the mate mapq is also over the min. */
private def mapqOk(rec: SamRecord, minMapQ: Int): Boolean = {
Expand All @@ -506,6 +496,21 @@ class GroupReadsByUmi
val header = in.header
val skipSorting = SamOrder(header).contains(TemplateCoordinate)

// True if the input will be sorted and no differences in UMIs are tolerated and the Molecular ID tag is MI,
// false otherwise. Pre-sorted (TemplateCoordinate sorted) input does not enable this optimization as we rely on
// copying the `RX` tag into the `MI` tag so that same raw UMIs are grouped together in the sorted stream of records.
//
// True here enables an optimization where, when bringing groups of reads into memory, we can _also_ group by UMI
// thus reducing the number of reads in memory. This is helpful since edits=0 is often used for data that has
// high numbers of reads with the same start/stop coordinates.
// We do this by setting the MI tag to the canonicalized (optionally truncated) UMI prior to sorting, so that
// reads with the same UMI are grouped together in the sorted stream of records.
val canTakeNextGroupByUmi = {
!skipSorting &&
(this.assignTag == ConsensusTags.MolecularId) &&
(this.edits == 0 || this.strategy == Strategy.Identity)
}

// Filter and sort the input BAM file
logger.info("Filtering the input.")
val filteredIterator = in.iterator
Expand All @@ -528,7 +533,7 @@ class GroupReadsByUmi
// SamOrder to sort the reads _and_ we'll overwrite it on the way out!
// Note that here we trim UMIs (if enabled) to the minimum UMI length for sorting, but that when doing the
// actual grouping later we go back to the raw tag (RX) and use as much of the UMI as possible.
if (this.canTakeNextGroupByUmi) {
if (canTakeNextGroupByUmi) {
val umi = this.assigner.canonicalize(r[String](rawTag).toUpperCase)
val truncated = this.minUmiLength match {
case None => umi
Expand Down Expand Up @@ -571,7 +576,7 @@ class GroupReadsByUmi
logger.info("Assigning reads to UMIs and outputting.")
while (templateCoordinateIterator.hasNext) {
// Take the next set of templates by position and assign UMIs
val templates = takeNextGroup(templateCoordinateIterator)
val templates = takeNextGroup(templateCoordinateIterator, canTakeNextGroupByUmi=canTakeNextGroupByUmi)
assignUmiGroups(templates)

// Then output the records in the right order (assigned tag, read name, r1, r2)
Expand Down Expand Up @@ -612,7 +617,7 @@ class GroupReadsByUmi
}

/** Consumes the next group of templates with all matching end positions and returns them. */
def takeNextGroup(iterator: BufferedIterator[Template]) : Seq[Template] = {
def takeNextGroup(iterator: BufferedIterator[Template], canTakeNextGroupByUmi: Boolean) : Seq[Template] = {
val first = iterator.next()
val firstEnds = ReadInfo(first.r1.getOrElse(fail(s"R1 missing for template ${first.name}")))
val firstUmi = first.r1.get.apply[String](this.assignTag)
Expand Down

0 comments on commit 33ff57b

Please sign in to comment.