Skip to content

Commit

Permalink
Take 2 at fixing the problem.
Browse files Browse the repository at this point in the history
  • Loading branch information
tfenne committed Feb 17, 2022
1 parent bcd8ff9 commit 771c87a
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 10 deletions.
2 changes: 1 addition & 1 deletion src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ object SamOrder {
val mid = rec.get[String](ConsensusTags.MolecularId).map { m =>
val index: Int = m.lastIndexOf('/')
if (index >= 0) m.substring(0, index) else m
}.getOrElse(rec.get[String](ConsensusTags.UmiBases).getOrElse(""))
}.getOrElse("")

if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) ||
(readChrom == mateChrom && readPos == matePos && !readNeg)) {
Expand Down
42 changes: 35 additions & 7 deletions src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,9 @@ object GroupReadsByUmi {
/** Returns true if the two UMIs are the same. */
def isSameUmi(a: Umi, b: Umi): Boolean = a == b

/** Returns a canonical form of the UMI that is the same for all reads with the same UMI. */
def canonicalize(u: Umi): Umi = u

/** Default implementation of a method to retrieve the next ID based on a counter. */
protected def nextId: MoleculeId = this.counter.getAndIncrement().toString

Expand Down Expand Up @@ -272,6 +275,12 @@ object GroupReadsByUmi {
}
}

/** Returns the UMI with the lexically lower half first. */
override def canonicalize(u: Umi): Umi = {
val (a, b) = split(u)
if (a < b) u else s"${b}-${a}"
}

/** Splits the paired UMI into it's two parts. */
@inline private def split(umi: Umi): (Umi, Umi) = {
val index = umi.indexOf('-')
Expand Down Expand Up @@ -460,15 +469,16 @@ class GroupReadsByUmi

private val assigner = strategy.newStrategy(this.edits)

/** True if no differences in UMIs are tolerated and the UMI tag is RX, false otherwise. True here enables
/** True if no differences in UMIs are tolerated and the Molecular ID tag is MI, false otherwise. True here enables
* an optimization where, when bringing groups of reads into memory, we can _also_ group by UMI thus
* reducing the number in memory. This is helpful since edits=0 is often used for data that has
* high numbers of reads with the same start/stop coordinates.
* We do this be setting the MI tag to the canonicalized, (optionally truncated) UMI prior to sorting, so that
* reads with the same UMI are grouped together in the sorted stream of records.
*/
private val canTakeNextGroupByUmi =
(this.rawTag == ConsensusTags.UmiBases) &&
(this.edits == 0 || this.strategy == Strategy.Identity) &&
this.minUmiLength.isEmpty
(this.assignTag == ConsensusTags.MolecularId) &&
(this.edits == 0 || this.strategy == Strategy.Identity)

/** Checks that the read's mapq is over a minimum, and if the read is paired, that the mate mapq is also over the min. */
private def mapqOk(rec: SamRecord, minMapQ: Int): Boolean = {
Expand Down Expand Up @@ -509,7 +519,24 @@ class GroupReadsByUmi
}
} || { filterUmisTooShort += 1; false}
}
.foreach(r => { sorter += r; kept += 1; sortProgress.record(r) })
.foreach { r =>
// If we're able to also group by the UMI because edits aren't allowed, push the trimmed, canonicalized UMI
// into the assign tag (which must be MI if cantakeNextGroupByUmi is true), since that is used by the
// SamOrder to sort the reads _and_ we'll overwrite it on the way out!
if (this.canTakeNextGroupByUmi) {
val umi = this.assigner.canonicalize(r[String](rawTag).toUpperCase)
val truncated = this.minUmiLength match {
case None => umi
case Some(n) => umi.substring(0, n)
}

r(this.assignTag) = truncated
}

sorter += r
kept += 1
sortProgress.record(r)
}

logger.info(f"Accepted $kept%,d reads for grouping.")
if (filteredNonPf > 0) logger.info(f"Filtered out $filteredNonPf%,d non-PF reads.")
Expand Down Expand Up @@ -562,14 +589,15 @@ class GroupReadsByUmi
def takeNextGroup(iterator: BufferedIterator[Template]) : Seq[Template] = {
val first = iterator.next()
val firstEnds = ReadInfo(first.r1.getOrElse(fail(s"R1 missing for template ${first.name}")))
val firstUmi = first.r1.get.apply[String](this.rawTag)
val firstUmi = first.r1.get.apply[String](this.assignTag)
val builder = IndexedSeq.newBuilder[Template]
builder += first

while (
iterator.hasNext &&
firstEnds == ReadInfo(iterator.head.r1.get) &&
(!canTakeNextGroupByUmi || this.assigner.isSameUmi(firstUmi, iterator.head.r1.get.apply[String](rawTag)))
// This last condition only works because we put a canonicalized UMI into rec(assignTag) if canTakeNextGroupByUmi
(!canTakeNextGroupByUmi || firstUmi == iterator.head.r1.get.apply[String](this.assignTag))
) {
builder += iterator.next()
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT

val in = builder.toTempFile()
val out = Files.createTempFile("umi_grouped.", ".sam")
new GroupReadsByUmi(input=in, output=out, familySizeHistogram=None, rawTag="RX", assignTag="MI", strategy=Strategy.Identity, edits=0, minUmiLength=Some(6)).execute()
new GroupReadsByUmi(input=in, output=out, familySizeHistogram=None, rawTag="RX", assignTag="MI", strategy=strategy, edits=0, minUmiLength=Some(6)).execute()

val recs = readBamRecs(out)
recs should have length 2
Expand All @@ -363,7 +363,7 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT

val in = builder.toTempFile()
val out = Files.createTempFile("umi_grouped.", ".sam")
new GroupReadsByUmi(input=in, output=out, familySizeHistogram=None, rawTag="RX", assignTag="MI", strategy=Strategy.Identity, edits=0, minUmiLength=Some(5)).execute()
new GroupReadsByUmi(input=in, output=out, familySizeHistogram=None, rawTag="RX", assignTag="MI", strategy=strategy, edits=0, minUmiLength=Some(5)).execute()

val recs = readBamRecs(out)
recs should have length 4
Expand Down

0 comments on commit 771c87a

Please sign in to comment.