diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala index aece16860..faa563844 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala @@ -466,6 +466,20 @@ case class TagFamilySizeMetric(family_size: Int, var fraction: Proportion = 0, var fraction_gt_or_eq_family_size: Proportion = 0) extends Metric +/** + * Metrics produced by `GroupReadsByUmi` to describe reads passed through UMI grouping + * @param accepted_sam_records The number of SAM records accepted for grouping. + * @param discarded_non_pf The number of discarded non-PF SAM records. + * @param discarded_poor_alignment The number of SAM records discarded for poor alignment. + * @param discarded_ns_in_umi The number of SAM records discarded due to one or more Ns in the UMI. + * @param discarded_umis_to_short The number of SAM records discarded due to a shorter than expected UMI. + */ +case class UmiGroupingMetric(accepted_sam_records: Long, + discarded_non_pf: Long, + discarded_poor_alignment: Long, + discarded_ns_in_umi: Long, + discarded_umis_to_short: Long) extends Metric + /** The strategies implemented by [[GroupReadsByUmi]] to identify reads from the same source molecule.*/ sealed trait Strategy extends EnumEntry { def newStrategy(edits: Int, threads: Int): UmiAssigner @@ -568,6 +582,7 @@ class GroupReadsByUmi (@arg(flag='i', doc="The input BAM file.") val input: PathToBam = Io.StdIn, @arg(flag='o', doc="The output BAM file.") val output: PathToBam = Io.StdOut, @arg(flag='f', doc="Optional output of tag family size counts.") val familySizeHistogram: Option[FilePath] = None, + @arg(flag='g', doc="Optional output of UMI grouping metrics.") val groupingMetrics: Option[FilePath] = None, @arg(flag='t', doc="The tag containing the raw UMI.") val rawTag: String = ConsensusTags.UmiBases, @arg(flag='T', doc="The output tag for UMI grouping.") val assignTag: String = ConsensusTags.MolecularId, @arg(flag='d', doc="Turn on duplicate marking mode.") val markDuplicates: Boolean = false, @@ -742,14 +757,26 @@ class GroupReadsByUmi ms.tails.foreach { tail => tail.headOption.foreach(m => m.fraction_gt_or_eq_family_size = tail.map(_.fraction).sum) } Metric.write(p, ms) } + + // Write out UMI grouping metrics + this.groupingMetrics.foreach { path => + val groupingMetrics = UmiGroupingMetric( + accepted_sam_records = kept, + discarded_non_pf = filteredNonPf, + discarded_poor_alignment = filteredPoorAlignment, + discarded_ns_in_umi = filteredNsInUmi, + discarded_umis_to_short = filterUmisTooShort, + ) + Metric.write(path, groupingMetrics) + } } private def logStats(): Unit = { - logger.info(f"Accepted $kept%,d reads for grouping.") - if (filteredNonPf > 0) logger.info(f"Filtered out $filteredNonPf%,d non-PF reads.") - logger.info(f"Filtered out $filteredPoorAlignment%,d reads due to mapping issues.") - logger.info(f"Filtered out $filteredNsInUmi%,d reads that contained one or more Ns in their UMIs.") - this.minUmiLength.foreach { _ => logger.info(f"Filtered out $filterUmisTooShort%,d reads that contained UMIs that were too short.") } + logger.info(f"Accepted $kept%,d SAM records for grouping.") + if (filteredNonPf > 0) logger.info(f"Filtered out $filteredNonPf%,d non-PF SAM records.") + logger.info(f"Filtered out $filteredPoorAlignment%,d SAM records due to mapping issues.") + logger.info(f"Filtered out $filteredNsInUmi%,d SAM records that contained one or more Ns in their UMIs.") + this.minUmiLength.foreach { _ => logger.info(f"Filtered out $filterUmisTooShort%,d SAM records that contained UMIs that were too short.") } } /** Consumes the next group of templates with all matching end positions and returns them. */ diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala index 09c8ce0b6..2fb0526bf 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala @@ -33,6 +33,7 @@ import com.fulcrumgenomics.cmdline.FgBioMain.FailureException import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus} import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} import com.fulcrumgenomics.umi.GroupReadsByUmi._ +import com.fulcrumgenomics.util.Metric import org.scalatest.{OptionValues, PrivateMethodTester} import java.nio.file.Files @@ -247,7 +248,8 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT val in = builder.toTempFile() val out = Files.createTempFile("umi_grouped.", ".sam") val hist = Files.createTempFile("umi_grouped.", ".histogram.txt") - val tool = new GroupReadsByUmi(input=in, output=out, familySizeHistogram=Some(hist), rawTag="RX", assignTag="MI", strategy=Strategy.Edit, edits=1, minMapQ=Some(30)) + val metrics = Files.createTempFile("umi_grouped.", ".metrics.txt") + val tool = new GroupReadsByUmi(input=in, output=out, familySizeHistogram=Some(hist), groupingMetrics=Some(metrics), rawTag="RX", assignTag="MI", strategy=Strategy.Edit, edits=1, minMapQ=Some(30)) val logs = executeFgbioTool(tool) val groups = readBamRecs(out).groupBy(_.name.charAt(0)) @@ -267,6 +269,10 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT hist.toFile.exists() shouldBe true + // TODO: Consider creating more unit tests that vary the following metric fields + val expectedMetric = UmiGroupingMetric(accepted_sam_records = 10, discarded_non_pf = 0, discarded_poor_alignment = 2, discarded_ns_in_umi = 0, discarded_umis_to_short = 0) + Metric.read[UmiGroupingMetric](metrics) shouldEqual Seq(expectedMetric) + // Make sure that we skip sorting for TemplateCoordinate val sortMessage = "Sorting the input to TemplateCoordinate order" logs.exists(_.contains(sortMessage)) shouldBe (sortOrder != TemplateCoordinate) @@ -424,8 +430,12 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT builder.addPair(name="a03", start1=100, start2=300, strand1=Plus, strand2=Minus, attrs=Map("RX" -> "ACT-ANN")) val in = builder.toTempFile() + val metrics = Files.createTempFile("umi_grouped.", ".metrics.txt") val out = Files.createTempFile("umi_grouped.", ".bam") - new GroupReadsByUmi(input=in, output=out, rawTag="RX", assignTag="MI", strategy=Strategy.Paired, edits=2).execute() + new GroupReadsByUmi(input=in, output=out, groupingMetrics=Some(metrics), rawTag="RX", assignTag="MI", strategy=Strategy.Paired, edits=2).execute() + + val expectedMetric = UmiGroupingMetric(accepted_sam_records = 4, discarded_non_pf = 0, discarded_poor_alignment = 0, discarded_ns_in_umi = 2, discarded_umis_to_short = 0) + Metric.read[UmiGroupingMetric](metrics) shouldEqual Seq(expectedMetric) readBamRecs(out).map(_.name).distinct shouldBe Seq("a01", "a02") }