From 33ff57b08b80fe0974ee0623c8a107c04b6f696a Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Wed, 2 Mar 2022 20:16:57 -0700 Subject: [PATCH] fixes to the optimized path --- .../fulcrumgenomics/umi/GroupReadsByUmi.scala | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala index 3fa2513b6..b3fa0a305 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala @@ -473,16 +473,6 @@ class GroupReadsByUmi private val assigner = strategy.newStrategy(this.edits) - /** True if no differences in UMIs are tolerated and the Molecular ID tag is MI, false otherwise. True here enables - * an optimization where, when bringing groups of reads into memory, we can _also_ group by UMI thus - * reducing the number of reads in memory. This is helpful since edits=0 is often used for data that has - * high numbers of reads with the same start/stop coordinates. - * We do this by setting the MI tag to the canonicalized (optionally truncated) UMI prior to sorting, so that - * reads with the same UMI are grouped together in the sorted stream of records. - */ - private val canTakeNextGroupByUmi = - (this.assignTag == ConsensusTags.MolecularId) && - (this.edits == 0 || this.strategy == Strategy.Identity) /** Checks that the read's mapq is over a minimum, and if the read is paired, that the mate mapq is also over the min. */ private def mapqOk(rec: SamRecord, minMapQ: Int): Boolean = { @@ -506,6 +496,21 @@ class GroupReadsByUmi val header = in.header val skipSorting = SamOrder(header).contains(TemplateCoordinate) + // True if the input will be sorted and no differences in UMIs are tolerated and the Molecular ID tag is MI, + // false otherwise. Pre-sorted (TemplateCoordinate sorted) input does not enable this optimization as we rely on + // copying the `RX` tag into the `MI` tag so that same raw UMIs are grouped together in the sorted stream of records. + // + // True here enables an optimization where, when bringing groups of reads into memory, we can _also_ group by UMI + // thus reducing the number of reads in memory. This is helpful since edits=0 is often used for data that has + // high numbers of reads with the same start/stop coordinates. + // We do this by setting the MI tag to the canonicalized (optionally truncated) UMI prior to sorting, so that + // reads with the same UMI are grouped together in the sorted stream of records. + val canTakeNextGroupByUmi = { + !skipSorting && + (this.assignTag == ConsensusTags.MolecularId) && + (this.edits == 0 || this.strategy == Strategy.Identity) + } + // Filter and sort the input BAM file logger.info("Filtering the input.") val filteredIterator = in.iterator @@ -528,7 +533,7 @@ class GroupReadsByUmi // SamOrder to sort the reads _and_ we'll overwrite it on the way out! // Note that here we trim UMIs (if enabled) to the minimum UMI length for sorting, but that when doing the // actual grouping later we go back to the raw tag (RX) and use as much of the UMI as possible. - if (this.canTakeNextGroupByUmi) { + if (canTakeNextGroupByUmi) { val umi = this.assigner.canonicalize(r[String](rawTag).toUpperCase) val truncated = this.minUmiLength match { case None => umi @@ -571,7 +576,7 @@ class GroupReadsByUmi logger.info("Assigning reads to UMIs and outputting.") while (templateCoordinateIterator.hasNext) { // Take the next set of templates by position and assign UMIs - val templates = takeNextGroup(templateCoordinateIterator) + val templates = takeNextGroup(templateCoordinateIterator, canTakeNextGroupByUmi=canTakeNextGroupByUmi) assignUmiGroups(templates) // Then output the records in the right order (assigned tag, read name, r1, r2) @@ -612,7 +617,7 @@ class GroupReadsByUmi } /** Consumes the next group of templates with all matching end positions and returns them. */ - def takeNextGroup(iterator: BufferedIterator[Template]) : Seq[Template] = { + def takeNextGroup(iterator: BufferedIterator[Template], canTakeNextGroupByUmi: Boolean) : Seq[Template] = { val first = iterator.next() val firstEnds = ReadInfo(first.r1.getOrElse(fail(s"R1 missing for template ${first.name}"))) val firstUmi = first.r1.get.apply[String](this.assignTag)