Skip to content

Commit

Permalink
Make GroupReadsByUmi significantly more permissive:
Browse files Browse the repository at this point in the history
 - Make allowing inter-contig reads the default
 - Allow through read pairs where one end is mapped and the other is unmapped
  • Loading branch information
tfenne committed Mar 1, 2022
1 parent 2278509 commit 0eb2490
Show file tree
Hide file tree
Showing 5 changed files with 120 additions and 58 deletions.
7 changes: 7 additions & 0 deletions src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,13 @@ class TransientAttrs(private val rec: SamRecord) {
case null => default
case value => value.asInstanceOf[A]
}
def getOrElseUpdate[A](key: Any, default: => A): A = rec.asSam.getTransientAttribute(key) match {
case null =>
val value: A = default
update(key, value)
value
case value => value.asInstanceOf[A]
}
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ package com.fulcrumgenomics.umi

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.bam.api.{SamOrder, SamSource, SamWriter}
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioMain, FgBioTool}
import com.fulcrumgenomics.commons.io.Io
import com.fulcrumgenomics.commons.util.{LazyLogging, LogLevel, Logger}
import com.fulcrumgenomics.sopt._
Expand Down
136 changes: 84 additions & 52 deletions src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,22 @@ object GroupReadsByUmi {
private val ReadInfoTempAttributeName = "__GRBU_ReadInfo"

/** A case class to represent all the information we need to order reads for duplicate marking / grouping. */
case class ReadInfo(refIndex: Int, start1: Int, start2: Int, strand1: Boolean, strand2: Boolean, library: String)
case class ReadInfo(refIndex1: Int,
start1: Int,
strand1: Byte,
refIndex2: Int,
start2: Int,
strand2: Byte,
library: String)

object ReadInfo {
// Use the Max possible value for ref/pos/strand when we have unmapped reads to mirror what's done
// in the TemplateCoordinate sort order where we sort by the _lower_ of the two mates' positions
// and want to use the mapped position when one mate is unmapped
private val UnknownRef = Int.MaxValue
private val UnknownPos = Int.MaxValue
private val UnknownStrand = Byte.MaxValue

/** Looks in all the places the library name can be hiding. Returns the library name
* if one is found, otherwise returns "unknown".
*/
Expand All @@ -76,36 +89,57 @@ object GroupReadsByUmi {
if (rg != null && rg.getLibrary != null) rg.getLibrary else "unknown"
}

/** Creates/retrieves a ReadEnds object from a SamRecord and stores it in a temporary attribute for later user. */
def apply(rec: SamRecord) : ReadInfo = {
val tmp = rec.transientAttrs[ReadInfo](GroupReadsByUmi.ReadInfoTempAttributeName)
if (tmp != null) {
tmp
}
else {
val lib = library(rec)
val chrom = rec.refIndex
val mateChrom = rec.mateRefIndex
val recNeg = rec.negativeStrand
val recPos = if (recNeg) rec.unclippedEnd else rec.unclippedStart

val (mateNeg, matePos) = if (!rec.paired) (false, Int.MaxValue) else {
val neg = rec.mateNegativeStrand
val pos = if (neg) SAMUtils.getMateUnclippedEnd(rec.asSam) else SAMUtils.getMateUnclippedStart(rec.asSam)
(neg, pos)
/** Extract a ReadInfo from a SamRecord; mate cigar must be present if a mate exists and is mapped. */
def apply(rec: SamRecord) : ReadInfo =
rec.transientAttrs.getOrElseUpdate[ReadInfo](GroupReadsByUmi.ReadInfoTempAttributeName, {
val lib = library(rec)
val (ref1, start1, strand1) = positionOf(rec)
val (ref2, start2, strand2) = if (rec.unpaired || rec.mateUnmapped) (UnknownRef, UnknownPos, UnknownStrand) else {
val start = (if (rec.matePositiveStrand) rec.mateUnclippedStart else rec.mateUnclippedEnd)
.getOrElse(throw new IllegalStateException(s"Missing mate cigar tag (MC) for read $rec."))
(rec.mateRefIndex, start, strandToByte(rec.matePositiveStrand))
}

val result = if (chrom < mateChrom || (chrom == mateChrom && (recPos < matePos || (recPos == matePos && !recNeg)))) {
new ReadInfo(chrom, recPos, matePos, recNeg, mateNeg, lib)
}
else {
new ReadInfo(mateChrom, matePos, recPos, mateNeg, recNeg, lib)
}
val r1Earlier =
(ref1 < ref2) ||
(ref1 == ref2 && start1 < start2) ||
(ref1 == ref2 && start1 == start2 && strand1 < strand2)

if (r1Earlier) new ReadInfo(ref1, start1, strand1, ref2, start2, strand2, lib)
else new ReadInfo(ref2, start2, strand2, ref1, start1, strand1, lib)
})

rec.transientAttrs(GroupReadsByUmi.ReadInfoTempAttributeName, result)
result

/** Extract a ReadInfo from a Template object. R1 primary must be present. */
def apply(t: Template): ReadInfo = {
val r1 = t.r1.getOrElse(throw new IllegalStateException(s"${t.name} did not have a primary R1 record."))
val r2 = t.r2

r1.transientAttrs.getOrElseUpdate[ReadInfo](GroupReadsByUmi.ReadInfoTempAttributeName, {
val lib = library(r1)
val (ref1, start1, strand1) = positionOf(r1)
val (ref2, start2, strand2) = r2.map(positionOf).getOrElse((UnknownRef, UnknownPos, UnknownStrand))

val r1Earlier =
(ref1 < ref2) ||
(ref1 == ref2 && start1 < start2) ||
(ref1 == ref2 && start1 == start2 && strand1 < strand2)

if (r1Earlier) new ReadInfo(ref1, start1, strand1, ref2, start2, strand2, lib)
else new ReadInfo(ref2, start2, strand2, ref1, start1, strand1, lib)
})
}

/** Extracts the refIndex, unclipped 5' end and strand of the read, or Unknown values for unmapped reads. */
private def positionOf(r: SamRecord): (Int, Int, Byte) = {
if (r.unmapped) (UnknownRef, UnknownPos, UnknownStrand) else {
val start = if (r.positiveStrand) r.unclippedStart else r.unclippedEnd
(r.refIndex, start, strandToByte(r.positiveStrand))
}
}

// Encodes a known strand into a byte value
@inline private def strandToByte(positive: Boolean): Byte = if (positive) 0 else 1
}

/** Trait that can be implemented to provide a UMI assignment strategy. */
Expand Down Expand Up @@ -405,20 +439,11 @@ object Strategy extends FgBioEnum[Strategy] {
| 3. The assigned UMI tag
| 4. Read Name
|
|Reads are aggressively filtered out so that only high quality reads/mappings are taken forward. Single-end
|reads must have mapping quality >= `min-map-q`. Paired-end reads must both have mapping quality >= `min-mapq`
|(Note: the `MQ` tag is required on reads with mapped mates). By default, paired-end reads must have both reads
|mapped to the same chromosome (to turn off this filter, use `--allow-inter-contig`).
|
|This is done with the expectation that the next step is building consensus reads, where
|it is undesirable to either:
|
| 1. Assign reads together that are really from different source molecules
| 2. Build two groups from reads that are really from the same molecule
|During grouping, reads are filtered out if a) all reads with the same queryname are unmapped, b) any primary
|read has mapping quality < `min-map-q` (default=1), or c) the primary mappings for R1 and R2 are on different
|chromosomes and `--allow-inter-contig` has been set to false.
|
|Errors in mapping reads could lead to both and therefore are minimized.
|
|Grouping of UMIs is performed by one of three strategies:
|Grouping of UMIs is performed by one of four strategies:
|
|1. **identity**: only reads with identical UMI sequences are grouped together. This strategy
| may be useful for evaluating data, but should generally be avoided as it will
Expand Down Expand Up @@ -452,17 +477,18 @@ class GroupReadsByUmi
@arg(flag='f', doc="Optional output of tag family size counts.") val familySizeHistogram: Option[FilePath] = None,
@arg(flag='t', doc="The tag containing the raw UMI.") val rawTag: String = "RX",
@arg(flag='T', doc="The output tag for UMI grouping.") val assignTag: String = "MI",
@arg(flag='m', doc="Minimum mapping quality.") val minMapQ: Int = 30,
@arg(flag='m', doc="Minimum mapping quality for mapped reads.") val minMapQ: Int = 1,
@arg(flag='n', doc="Include non-PF reads.") val includeNonPfReads: Boolean = false,
@arg(flag='s', doc="The UMI assignment strategy.") val strategy: Strategy,
@arg(flag='s', doc="The UMI assignment strategy.") val strategy: Strategy,
@arg(flag='e', doc="The allowable number of edits between UMIs.") val edits: Int = 1,
@arg(flag='l', doc= """The minimum UMI length. If not specified then all UMIs must have the same length,
|otherwise discard reads with UMIs shorter than this length and allow for differing UMI lengths.
|""")
val minUmiLength: Option[Int] = None,
@arg(flag='x', doc= """Allow read pairs with primary alignments on different contigs to be grouped when using the
|paired assigner (otherwise filtered out).""")
val allowInterContig: Boolean = false
@arg(flag='x', doc= """
|DEPRECATED: this option will be removed in future versions and inter-contig reads will be
|automatically processed.""")
@deprecated val allowInterContig: Boolean = true
)extends FgBioTool with LazyLogging {
import GroupReadsByUmi._

Expand All @@ -483,12 +509,18 @@ class GroupReadsByUmi

/** Checks that the read's mapq is over a minimum, and if the read is paired, that the mate mapq is also over the min. */
private def mapqOk(rec: SamRecord, minMapQ: Int): Boolean = {
val mateMqOk = if (rec.unpaired) true else rec.get[Int](SAMTag.MQ.name()) match {
case None => fail(s"Mate mapping quality (MQ) tag not present on read ${rec.name}.")
case Some(mq) => mq >= minMapQ
if (rec.mapped && rec.mapq < minMapQ) {
false
}
else if (rec.unpaired || rec.mateUnmapped) {
true
}
else {
rec.get[Int](SAMTag.MQ.name()) match {
case None => fail(s"Mate mapping quality (MQ) tag not present on read ${rec.name}.")
case Some(mq) => mq >= minMapQ
}
}

rec.mapq >= minMapQ && mateMqOk
}

override def execute(): Unit = {
Expand All @@ -509,7 +541,7 @@ class GroupReadsByUmi
in.iterator
.filter(r => !r.secondary && !r.supplementary)
.filter(r => (includeNonPfReads || r.pf) || { filteredNonPf += 1; false })
.filter(r => (r.mapped && (r.unpaired || r.mateMapped)) || { filteredPoorAlignment += 1; false })
.filter(r => (r.mapped || (r.paired && r.mateMapped)) || { filteredPoorAlignment += 1; false })
.filter(r => (allowInterContig || r.unpaired || r.refIndex == r.mateRefIndex) || { filteredPoorAlignment += 1; false })
.filter(r => mapqOk(r, this.minMapQ) || { filteredPoorAlignment += 1; false })
.filter(r => !r.get[String](rawTag).exists(_.contains('N')) || { filteredNsInUmi += 1; false })
Expand Down Expand Up @@ -591,14 +623,14 @@ class GroupReadsByUmi
/** Consumes the next group of templates with all matching end positions and returns them. */
def takeNextGroup(iterator: BufferedIterator[Template]) : Seq[Template] = {
val first = iterator.next()
val firstEnds = ReadInfo(first.r1.getOrElse(fail(s"R1 missing for template ${first.name}")))
val firstEnds = ReadInfo(first)
val firstUmi = first.r1.get.apply[String](this.assignTag)
val builder = IndexedSeq.newBuilder[Template]
builder += first

while (
iterator.hasNext &&
firstEnds == ReadInfo(iterator.head.r1.get) &&
firstEnds == ReadInfo(iterator.head) &&
// This last condition only works because we put a canonicalized UMI into rec(assignTag) if canTakeNextGroupByUmi
(!canTakeNextGroupByUmi || firstUmi == iterator.head.r1.get.apply[String](this.assignTag))
) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@ trait UmiConsensusCaller[ConsensusRead <: SimpleRead] {
*
* NOTE: filtered out reads are sent to the [[rejectRecords]] method and do not need further handling
*/
protected[umi] def filterToMostCommonAlignment(recs: Seq[SourceRead]): Seq[SourceRead] = {
protected[umi] def filterToMostCommonAlignment(recs: Seq[SourceRead]): Seq[SourceRead] = if (recs.size < 2) recs else {
val groups = new ArrayBuffer[AlignmentGroup]
val sorted = recs.sortBy(r => -r.length).toIndexedSeq

Expand Down
31 changes: 27 additions & 4 deletions src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -199,28 +199,51 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
templates.map(t => tool invokePrivate umiForRead(t)) should contain theSameElementsInOrderAs expected
}

"GroupReads.ReadInfo" should "extract the same ReadEnds from a Template as from an R1 with mate cigar" in {
val builder = new SamBuilder(readLength=100, sort=None)
val Seq(r1, r2) = builder.addPair(contig=2, contig2=Some(1), start1=300, start2=400, cigar1="10S90M", cigar2="90M10S")
val template = Template(r1=Some(r1), r2=Some(r2))

val tReadInfo = ReadInfo(template)
val rReadInfo = ReadInfo(r1)
rReadInfo shouldBe tReadInfo

tReadInfo.refIndex1 shouldBe 1
tReadInfo.start1 shouldBe 499
tReadInfo.refIndex2 shouldBe 2
tReadInfo.start2 shouldBe 290
}

// Test for running the GroupReadsByUmi command line program with some sample input
"GroupReadsByUmi" should "group reads correctly" in {
val builder = new SamBuilder(readLength=100, sort=Some(SamOrder.Coordinate))
builder.addPair(name="a01", start1=100, start2=300, attrs=Map("RX" -> "AAAAAAAA"))
builder.addPair(name="a02", start1=100, start2=300, attrs=Map("RX" -> "AAAAgAAA"))
builder.addPair(name="a03", start1=100, start2=300, attrs=Map("RX" -> "AAAAAAAA"))
builder.addPair(name="a04", start1=100, start2=300, attrs=Map("RX" -> "AAAAAAAt"))
builder.addPair(name="a05", start1=100, start2=300, unmapped2=true, attrs=Map("RX" -> "AAAAAAAt"))
builder.addPair(name="a06", start1=100, start2=300, mapq1=5)
builder.addPair(name="b01", start1=100, start2=100, unmapped2=true, attrs=Map("RX" -> "AAAAAAAt"))
builder.addPair(name="c01", start1=100, start2=300, mapq1=5)

val in = builder.toTempFile()
val out = Files.createTempFile("umi_grouped.", ".sam")
val hist = Files.createTempFile("umi_grouped.", ".histogram.txt")
new GroupReadsByUmi(input=in, output=out, familySizeHistogram=Some(hist), rawTag="RX", assignTag="MI", strategy=Strategy.Edit, edits=1).execute()
new GroupReadsByUmi(input=in, output=out, familySizeHistogram=Some(hist), rawTag="RX", assignTag="MI", strategy=Strategy.Edit, edits=1, minMapQ=30).execute()

val groups = readBamRecs(out).groupBy(_.name.charAt(0))

// Group A: Read 5 out for unmapped mate, 6 out for low mapq, 1-4 all passed through into one umi group
// Group A: 1-4 all passed through into one umi group
groups('a') should have size 4*2
groups('a').map(_.name).toSet shouldEqual Set("a01", "a02", "a03", "a04")
groups('a').map(r => r[String]("MI")).toSet should have size 1

// 5 separated out into another group due to unmapped mate
groups('b') should have size 2
groups('b').map(r => r[String]("MI")).toSet should have size 1
groups('b').map(r => r[String]("MI")).head should not be groups('a').map(r => r[String]("MI")).head

// 6 out for low mapq,
groups.contains('c') shouldBe false

hist.toFile.exists() shouldBe true
}

Expand Down

0 comments on commit 0eb2490

Please sign in to comment.