diff --git a/src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala b/src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala index 856b3d1aa..cf05b54c6 100644 --- a/src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala +++ b/src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala @@ -65,6 +65,13 @@ class TransientAttrs(private val rec: SamRecord) { case null => default case value => value.asInstanceOf[A] } + def getOrElseUpdate[A](key: Any, default: => A): A = rec.asSam.getTransientAttribute(key) match { + case null => + val value: A = default + update(key, value) + value + case value => value.asInstanceOf[A] + } } /** diff --git a/src/main/scala/com/fulcrumgenomics/umi/CallMolecularConsensusReads.scala b/src/main/scala/com/fulcrumgenomics/umi/CallMolecularConsensusReads.scala index 8b1724046..2a50d6919 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/CallMolecularConsensusReads.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/CallMolecularConsensusReads.scala @@ -27,7 +27,7 @@ package com.fulcrumgenomics.umi import com.fulcrumgenomics.FgBioDef._ import com.fulcrumgenomics.bam.api.{SamOrder, SamSource, SamWriter} -import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool} +import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioMain, FgBioTool} import com.fulcrumgenomics.commons.io.Io import com.fulcrumgenomics.commons.util.{LazyLogging, LogLevel, Logger} import com.fulcrumgenomics.sopt._ diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala index e4ee90b2b..631fb8b35 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala @@ -65,9 +65,22 @@ object GroupReadsByUmi { private val ReadInfoTempAttributeName = "__GRBU_ReadInfo" /** A case class to represent all the information we need to order reads for duplicate marking / grouping. */ - case class ReadInfo(refIndex: Int, start1: Int, start2: Int, strand1: Boolean, strand2: Boolean, library: String) + case class ReadInfo(refIndex1: Int, + start1: Int, + strand1: Byte, + refIndex2: Int, + start2: Int, + strand2: Byte, + library: String) object ReadInfo { + // Use the Max possible value for ref/pos/strand when we have unmapped reads to mirror what's done + // in the TemplateCoordinate sort order where we sort by the _lower_ of the two mates' positions + // and want to use the mapped position when one mate is unmapped + private val UnknownRef = Int.MaxValue + private val UnknownPos = Int.MaxValue + private val UnknownStrand = Byte.MaxValue + /** Looks in all the places the library name can be hiding. Returns the library name * if one is found, otherwise returns "unknown". */ @@ -76,36 +89,57 @@ object GroupReadsByUmi { if (rg != null && rg.getLibrary != null) rg.getLibrary else "unknown" } - /** Creates/retrieves a ReadEnds object from a SamRecord and stores it in a temporary attribute for later user. */ - def apply(rec: SamRecord) : ReadInfo = { - val tmp = rec.transientAttrs[ReadInfo](GroupReadsByUmi.ReadInfoTempAttributeName) - if (tmp != null) { - tmp - } - else { - val lib = library(rec) - val chrom = rec.refIndex - val mateChrom = rec.mateRefIndex - val recNeg = rec.negativeStrand - val recPos = if (recNeg) rec.unclippedEnd else rec.unclippedStart - - val (mateNeg, matePos) = if (!rec.paired) (false, Int.MaxValue) else { - val neg = rec.mateNegativeStrand - val pos = if (neg) SAMUtils.getMateUnclippedEnd(rec.asSam) else SAMUtils.getMateUnclippedStart(rec.asSam) - (neg, pos) + /** Extract a ReadInfo from a SamRecord; mate cigar must be present if a mate exists and is mapped. */ + def apply(rec: SamRecord) : ReadInfo = + rec.transientAttrs.getOrElseUpdate[ReadInfo](GroupReadsByUmi.ReadInfoTempAttributeName, { + val lib = library(rec) + val (ref1, start1, strand1) = positionOf(rec) + val (ref2, start2, strand2) = if (rec.unpaired || rec.mateUnmapped) (UnknownRef, UnknownPos, UnknownStrand) else { + val start = (if (rec.matePositiveStrand) rec.mateUnclippedStart else rec.mateUnclippedEnd) + .getOrElse(throw new IllegalStateException(s"Missing mate cigar tag (MC) for read $rec.")) + (rec.mateRefIndex, start, strandToByte(rec.matePositiveStrand)) } - val result = if (chrom < mateChrom || (chrom == mateChrom && (recPos < matePos || (recPos == matePos && !recNeg)))) { - new ReadInfo(chrom, recPos, matePos, recNeg, mateNeg, lib) - } - else { - new ReadInfo(mateChrom, matePos, recPos, mateNeg, recNeg, lib) - } + val r1Earlier = + (ref1 < ref2) || + (ref1 == ref2 && start1 < start2) || + (ref1 == ref2 && start1 == start2 && strand1 < strand2) + + if (r1Earlier) new ReadInfo(ref1, start1, strand1, ref2, start2, strand2, lib) + else new ReadInfo(ref2, start2, strand2, ref1, start1, strand1, lib) + }) - rec.transientAttrs(GroupReadsByUmi.ReadInfoTempAttributeName, result) - result + + /** Extract a ReadInfo from a Template object. R1 primary must be present. */ + def apply(t: Template): ReadInfo = { + val r1 = t.r1.getOrElse(throw new IllegalStateException(s"${t.name} did not have a primary R1 record.")) + val r2 = t.r2 + + r1.transientAttrs.getOrElseUpdate[ReadInfo](GroupReadsByUmi.ReadInfoTempAttributeName, { + val lib = library(r1) + val (ref1, start1, strand1) = positionOf(r1) + val (ref2, start2, strand2) = r2.map(positionOf).getOrElse((UnknownRef, UnknownPos, UnknownStrand)) + + val r1Earlier = + (ref1 < ref2) || + (ref1 == ref2 && start1 < start2) || + (ref1 == ref2 && start1 == start2 && strand1 < strand2) + + if (r1Earlier) new ReadInfo(ref1, start1, strand1, ref2, start2, strand2, lib) + else new ReadInfo(ref2, start2, strand2, ref1, start1, strand1, lib) + }) + } + + /** Extracts the refIndex, unclipped 5' end and strand of the read, or Unknown values for unmapped reads. */ + private def positionOf(r: SamRecord): (Int, Int, Byte) = { + if (r.unmapped) (UnknownRef, UnknownPos, UnknownStrand) else { + val start = if (r.positiveStrand) r.unclippedStart else r.unclippedEnd + (r.refIndex, start, strandToByte(r.positiveStrand)) } } + + // Encodes a known strand into a byte value + @inline private def strandToByte(positive: Boolean): Byte = if (positive) 0 else 1 } /** Trait that can be implemented to provide a UMI assignment strategy. */ @@ -405,20 +439,11 @@ object Strategy extends FgBioEnum[Strategy] { | 3. The assigned UMI tag | 4. Read Name | - |Reads are aggressively filtered out so that only high quality reads/mappings are taken forward. Single-end - |reads must have mapping quality >= `min-map-q`. Paired-end reads must both have mapping quality >= `min-mapq` - |(Note: the `MQ` tag is required on reads with mapped mates). By default, paired-end reads must have both reads - |mapped to the same chromosome (to turn off this filter, use `--allow-inter-contig`). - | - |This is done with the expectation that the next step is building consensus reads, where - |it is undesirable to either: - | - | 1. Assign reads together that are really from different source molecules - | 2. Build two groups from reads that are really from the same molecule + |During grouping, reads are filtered out if a) all reads with the same queryname are unmapped, b) any primary + |read has mapping quality < `min-map-q` (default=1), or c) the primary mappings for R1 and R2 are on different + |chromosomes and `--allow-inter-contig` has been set to false. | - |Errors in mapping reads could lead to both and therefore are minimized. - | - |Grouping of UMIs is performed by one of three strategies: + |Grouping of UMIs is performed by one of four strategies: | |1. **identity**: only reads with identical UMI sequences are grouped together. This strategy | may be useful for evaluating data, but should generally be avoided as it will @@ -452,17 +477,18 @@ class GroupReadsByUmi @arg(flag='f', doc="Optional output of tag family size counts.") val familySizeHistogram: Option[FilePath] = None, @arg(flag='t', doc="The tag containing the raw UMI.") val rawTag: String = "RX", @arg(flag='T', doc="The output tag for UMI grouping.") val assignTag: String = "MI", - @arg(flag='m', doc="Minimum mapping quality.") val minMapQ: Int = 30, + @arg(flag='m', doc="Minimum mapping quality for mapped reads.") val minMapQ: Int = 1, @arg(flag='n', doc="Include non-PF reads.") val includeNonPfReads: Boolean = false, - @arg(flag='s', doc="The UMI assignment strategy.") val strategy: Strategy, + @arg(flag='s', doc="The UMI assignment strategy.") val strategy: Strategy, @arg(flag='e', doc="The allowable number of edits between UMIs.") val edits: Int = 1, @arg(flag='l', doc= """The minimum UMI length. If not specified then all UMIs must have the same length, |otherwise discard reads with UMIs shorter than this length and allow for differing UMI lengths. |""") val minUmiLength: Option[Int] = None, - @arg(flag='x', doc= """Allow read pairs with primary alignments on different contigs to be grouped when using the - |paired assigner (otherwise filtered out).""") - val allowInterContig: Boolean = false + @arg(flag='x', doc= """ + |DEPRECATED: this option will be removed in future versions and inter-contig reads will be + |automatically processed.""") + @deprecated val allowInterContig: Boolean = true )extends FgBioTool with LazyLogging { import GroupReadsByUmi._ @@ -483,12 +509,18 @@ class GroupReadsByUmi /** Checks that the read's mapq is over a minimum, and if the read is paired, that the mate mapq is also over the min. */ private def mapqOk(rec: SamRecord, minMapQ: Int): Boolean = { - val mateMqOk = if (rec.unpaired) true else rec.get[Int](SAMTag.MQ.name()) match { - case None => fail(s"Mate mapping quality (MQ) tag not present on read ${rec.name}.") - case Some(mq) => mq >= minMapQ + if (rec.mapped && rec.mapq < minMapQ) { + false + } + else if (rec.unpaired || rec.mateUnmapped) { + true + } + else { + rec.get[Int](SAMTag.MQ.name()) match { + case None => fail(s"Mate mapping quality (MQ) tag not present on read ${rec.name}.") + case Some(mq) => mq >= minMapQ + } } - - rec.mapq >= minMapQ && mateMqOk } override def execute(): Unit = { @@ -509,7 +541,7 @@ class GroupReadsByUmi in.iterator .filter(r => !r.secondary && !r.supplementary) .filter(r => (includeNonPfReads || r.pf) || { filteredNonPf += 1; false }) - .filter(r => (r.mapped && (r.unpaired || r.mateMapped)) || { filteredPoorAlignment += 1; false }) + .filter(r => (r.mapped || (r.paired && r.mateMapped)) || { filteredPoorAlignment += 1; false }) .filter(r => (allowInterContig || r.unpaired || r.refIndex == r.mateRefIndex) || { filteredPoorAlignment += 1; false }) .filter(r => mapqOk(r, this.minMapQ) || { filteredPoorAlignment += 1; false }) .filter(r => !r.get[String](rawTag).exists(_.contains('N')) || { filteredNsInUmi += 1; false }) @@ -591,14 +623,14 @@ class GroupReadsByUmi /** Consumes the next group of templates with all matching end positions and returns them. */ def takeNextGroup(iterator: BufferedIterator[Template]) : Seq[Template] = { val first = iterator.next() - val firstEnds = ReadInfo(first.r1.getOrElse(fail(s"R1 missing for template ${first.name}"))) + val firstEnds = ReadInfo(first) val firstUmi = first.r1.get.apply[String](this.assignTag) val builder = IndexedSeq.newBuilder[Template] builder += first while ( iterator.hasNext && - firstEnds == ReadInfo(iterator.head.r1.get) && + firstEnds == ReadInfo(iterator.head) && // This last condition only works because we put a canonicalized UMI into rec(assignTag) if canTakeNextGroupByUmi (!canTakeNextGroupByUmi || firstUmi == iterator.head.r1.get.apply[String](this.assignTag)) ) { diff --git a/src/main/scala/com/fulcrumgenomics/umi/UmiConsensusCaller.scala b/src/main/scala/com/fulcrumgenomics/umi/UmiConsensusCaller.scala index f3c04357b..55e51eb24 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/UmiConsensusCaller.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/UmiConsensusCaller.scala @@ -334,7 +334,7 @@ trait UmiConsensusCaller[ConsensusRead <: SimpleRead] { * * NOTE: filtered out reads are sent to the [[rejectRecords]] method and do not need further handling */ - protected[umi] def filterToMostCommonAlignment(recs: Seq[SourceRead]): Seq[SourceRead] = { + protected[umi] def filterToMostCommonAlignment(recs: Seq[SourceRead]): Seq[SourceRead] = if (recs.size < 2) recs else { val groups = new ArrayBuffer[AlignmentGroup] val sorted = recs.sortBy(r => -r.length).toIndexedSeq diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala index 29fdb88fe..f75fff4f0 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala @@ -199,6 +199,21 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT templates.map(t => tool invokePrivate umiForRead(t)) should contain theSameElementsInOrderAs expected } + "GroupReads.ReadInfo" should "extract the same ReadEnds from a Template as from an R1 with mate cigar" in { + val builder = new SamBuilder(readLength=100, sort=None) + val Seq(r1, r2) = builder.addPair(contig=2, contig2=Some(1), start1=300, start2=400, cigar1="10S90M", cigar2="90M10S") + val template = Template(r1=Some(r1), r2=Some(r2)) + + val tReadInfo = ReadInfo(template) + val rReadInfo = ReadInfo(r1) + rReadInfo shouldBe tReadInfo + + tReadInfo.refIndex1 shouldBe 1 + tReadInfo.start1 shouldBe 499 + tReadInfo.refIndex2 shouldBe 2 + tReadInfo.start2 shouldBe 290 + } + // Test for running the GroupReadsByUmi command line program with some sample input "GroupReadsByUmi" should "group reads correctly" in { val builder = new SamBuilder(readLength=100, sort=Some(SamOrder.Coordinate)) @@ -206,21 +221,29 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT builder.addPair(name="a02", start1=100, start2=300, attrs=Map("RX" -> "AAAAgAAA")) builder.addPair(name="a03", start1=100, start2=300, attrs=Map("RX" -> "AAAAAAAA")) builder.addPair(name="a04", start1=100, start2=300, attrs=Map("RX" -> "AAAAAAAt")) - builder.addPair(name="a05", start1=100, start2=300, unmapped2=true, attrs=Map("RX" -> "AAAAAAAt")) - builder.addPair(name="a06", start1=100, start2=300, mapq1=5) + builder.addPair(name="b01", start1=100, start2=100, unmapped2=true, attrs=Map("RX" -> "AAAAAAAt")) + builder.addPair(name="c01", start1=100, start2=300, mapq1=5) val in = builder.toTempFile() val out = Files.createTempFile("umi_grouped.", ".sam") val hist = Files.createTempFile("umi_grouped.", ".histogram.txt") - new GroupReadsByUmi(input=in, output=out, familySizeHistogram=Some(hist), rawTag="RX", assignTag="MI", strategy=Strategy.Edit, edits=1).execute() + new GroupReadsByUmi(input=in, output=out, familySizeHistogram=Some(hist), rawTag="RX", assignTag="MI", strategy=Strategy.Edit, edits=1, minMapQ=30).execute() val groups = readBamRecs(out).groupBy(_.name.charAt(0)) - // Group A: Read 5 out for unmapped mate, 6 out for low mapq, 1-4 all passed through into one umi group + // Group A: 1-4 all passed through into one umi group groups('a') should have size 4*2 groups('a').map(_.name).toSet shouldEqual Set("a01", "a02", "a03", "a04") groups('a').map(r => r[String]("MI")).toSet should have size 1 + // 5 separated out into another group due to unmapped mate + groups('b') should have size 2 + groups('b').map(r => r[String]("MI")).toSet should have size 1 + groups('b').map(r => r[String]("MI")).head should not be groups('a').map(r => r[String]("MI")).head + + // 6 out for low mapq, + groups.contains('c') shouldBe false + hist.toFile.exists() shouldBe true }