Skip to content

Commit

Permalink
fixups
Browse files Browse the repository at this point in the history
  • Loading branch information
nh13 committed Mar 2, 2022
1 parent 1129b9a commit 0cd6d67
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 47 deletions.
23 changes: 3 additions & 20 deletions src/main/scala/com/fulcrumgenomics/bam/Bams.scala
Original file line number Diff line number Diff line change
Expand Up @@ -409,34 +409,17 @@ object Bams extends LazyLogging {
* values are removed. If the read is mapped all three tags will have values regenerated.
*
* @param rec the SamRecord to update
* @param ref a reference sequence file walker to pull the reference information from
* @param ref a reference sequence if the record is mapped
*/
def regenerateNmUqMdTags(rec: SamRecord, ref: ReferenceSequenceFileWalker): SamRecord = {
if (rec.unmapped) regenerateNmUqMdTags(rec, Map.empty[Int, ReferenceSequence]) else {
val refSeq = ref.get(rec.refIndex)
regenerateNmUqMdTags(rec, Map(refSeq.getContigIndex -> refSeq))
}
rec
}

/**
* Ensures that any NM/UQ/MD tags on the read are accurate. If the read is unmapped, any existing
* values are removed. If the read is mapped all three tags will have values regenerated.
*
* @param rec the SamRecord to update
* @param ref a reference sequence file walker to pull the reference information from
*/
def regenerateNmUqMdTags(rec: SamRecord, ref: Map[Int, ReferenceSequence]): SamRecord = {
def regenerateNmUqMdTags(rec: SamRecord, ref: => ReferenceSequence): Unit = {
import SAMTag._
if (rec.unmapped) {
rec(NM.name()) = null
rec(UQ.name()) = null
rec(MD.name()) = null
}
else {
val refBases = ref.getOrElse(rec.refIndex, throw new IllegalArgumentException(
s"Record '${rec.name}' had contig index '${rec.refIndex}', but not found in the input reference map"
)).getBases
val refBases = ref.getBases
SequenceUtil.calculateMdAndNmTags(rec.asSam, refBases, true, true)
if (rec.quals != null && rec.quals.length != 0) {
rec(SAMTag.UQ.name) = SequenceUtil.sumQualitiesOfMismatches(rec.asSam, refBases, 0)
Expand Down
2 changes: 1 addition & 1 deletion src/main/scala/com/fulcrumgenomics/bam/ClipBam.scala
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ class ClipBam
val out = SamWriter(output, header, ref=Some(ref))

sorter.foreach { rec =>
Bams.regenerateNmUqMdTags(rec, walker)
Bams.regenerateNmUqMdTags(rec, walker.get(rec.refIndex))
out += rec
}

Expand Down
70 changes: 46 additions & 24 deletions src/main/scala/com/fulcrumgenomics/umi/FilterConsensusReads.scala
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@ import com.fulcrumgenomics.commons.util.LazyLogging
import com.fulcrumgenomics.fasta.ReferenceSequenceIterator
import com.fulcrumgenomics.sopt.{arg, clp}
import com.fulcrumgenomics.util.NumericTypes.PhredScore
import com.fulcrumgenomics.util.{Io, ProgressLogger}
import com.fulcrumgenomics.util.Io
import htsjdk.samtools.SAMFileHeader
import htsjdk.samtools.SAMFileHeader.GroupOrder
import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder}
import htsjdk.samtools.reference.ReferenceSequence
import htsjdk.samtools.util.SequenceUtil

Expand Down Expand Up @@ -119,7 +119,7 @@ class FilterConsensusReads
val minMeanBaseQuality: Option[PhredScore] = None,
@arg(flag='s', doc="Mask (make `N`) consensus bases where the AB and BA consensus reads disagree (for duplex-sequencing only).")
val requireSingleStrandAgreement: Boolean = false,
@arg(flag='S', doc="The sort order of the output. If not given, query grouped if the input is also query grouped, otherwise queryname.")
@arg(flag='S', doc="The sort order of the output. If not given, output will be in the same order as input if the input is query grouped, otherwise queryname order.")
val sortOrder: Option[SamOrder] = None
) extends FgBioTool with LazyLogging {
// Baseline input validation
Expand Down Expand Up @@ -175,13 +175,12 @@ class FilterConsensusReads
private val EmptyFilterResult = FilterResult(keepRead=true, maskedBases=0)

override def execute(): Unit = {
logger.info("Reading the reference into memory")
logger.info("Reading the reference fasta into memory")
val refMap = ReferenceSequenceIterator(ref, stripComments=true).map { ref => ref.getContigIndex -> ref}.toMap
logger.info(f"Read ${refMap.size}%,d contigs.")

val progress = ProgressLogger(logger, verb="Filtered and masked")
val in = SamSource(input)
val out = buildOutputWriter(in.header, refMap)
val in = SamSource(input)
val out = buildOutputWriter(in.header, refMap)

// Go through the reads by template and do the filtering
val templateIterator = Bams.templateIterator(in, maxInMemory=MaxRecordsInMemoryWhenSorting)
Expand Down Expand Up @@ -209,19 +208,16 @@ class FilterConsensusReads
totalBases += r1.length + template.r2.map(_.length).getOrElse(0)
maskedBases += r1Result.maskedBases + r2Result.maskedBases
out += r1
progress.record(r1)
template.r2.foreach { r => out += r; progress.record(r) }
template.r2.foreach { r => out += r }

template.allSupplementaryAndSecondary.foreach { r =>
val result = filterRecord(r)
if (result.keepRead) {
out += r
progress.record(r)
}
}
}
}
progress.logLast()

logger.info("Finalizing the output")
in.safelyClose()
Expand All @@ -241,28 +237,54 @@ class FilterConsensusReads
* Otherwise, we can skip sorting!
*
* */
private def buildOutputWriter(inHeader: SAMFileHeader, refMap: Map[Int, ReferenceSequence]): Closeable with Writer[SamRecord] = {
val outHeader = inHeader.clone()
private def buildOutputWriter(inHeader: SAMFileHeader, refMap: Map[Int, ReferenceSequence]): Writer[SamRecord] with Closeable = {
val outHeader = inHeader.clone()

// Check if the input will be re-sorted into QueryName, or if the input sort order will be kept
val orderAfterFiltering = SamOrder(inHeader) match {
case Some(order) if order == SamOrder.Queryname || order.groupOrder == GroupOrder.query => order
case _ => SamOrder.Queryname // input will we resorted to queryname
val inSortOrder = inHeader.getSortOrder
val inGroupOrder = inHeader.getGroupOrder
val inSubSort = Option(inHeader.getAttribute("SS"))

// Get the order after filtering
val (afterFilteringSortOrder, afterFilteringGroupOrder, afterFilteringSubSort) = {
if (inSortOrder == SortOrder.queryname || inGroupOrder == GroupOrder.query) { // no sorting occurred, so same as input
(inSortOrder, inGroupOrder, inSubSort)
}
else { // sorting occurred, so it's queryname
val order = SamOrder.Queryname
(order.sortOrder, order.groupOrder, order.subSort)
}
}

// Get the output order
val outputOrder = this.sortOrder
.getOrElse(orderAfterFiltering) // use the order after filtering, so no sort occurs
outputOrder.applyTo(outHeader) // remember to apply it
// Get the desired output order
val (outputSortOrder, outputGroupOrder, outputSubSort) = this.sortOrder match {
case None => (inSortOrder, inGroupOrder, inSubSort) // same as input
case Some(order) => (order.sortOrder, order.groupOrder, order.subSort) // specific output
}

// Build the writer
val sort = if (orderAfterFiltering == outputOrder) None else Some(outputOrder)
val sort: Option[SamOrder] = {
// if the order after filtering and the output order match, no need to re-sort the output
if (afterFilteringSortOrder == outputSortOrder && afterFilteringGroupOrder == outputGroupOrder && afterFilteringSubSort == outputSubSort) {
None
} else { // output order and order after filtering do not match, we need to re-sort the output
SamOrder.values.find { order =>
order.sortOrder == outputSortOrder && order.groupOrder == outputGroupOrder && order.subSort == outputSubSort
}.orElse {
// this can only happen if the input order is unrecognized
throw new IllegalArgumentException(
s"The input BAM had an unrecognized sort order (SO:$inSortOrder GO:$inGroupOrder SS: $inSubSort) in $input"
)
}
}
}
val writer = SamWriter(output, outHeader, sort=sort, maxRecordsInRam=MaxRecordsInMemoryWhenSorting)
sort.foreach(o => logger.info(f"Output will be sorted into $o order"))

// Create the final writer based on if the full reference has been loaded, or not
new Writer[SamRecord] with Closeable {
override def write(rec: SamRecord): Unit = writer += Bams.regenerateNmUqMdTags(rec, refMap)
override def write(rec: SamRecord): Unit = {
Bams.regenerateNmUqMdTags(rec, refMap(rec.refIndex))
writer += rec
}
def close(): Unit = writer.close()
}
}
Expand Down
4 changes: 2 additions & 2 deletions src/test/scala/com/fulcrumgenomics/bam/BamsTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ class BamsTest extends UnitSpec {
rec("NM") = 7
rec("MD") = "6A7C8T9G"
rec("UQ") = 237
Bams.regenerateNmUqMdTags(rec, DummyRefWalker)
Bams.regenerateNmUqMdTags(rec, DummyRefWalker.get(rec.refIndex))
rec.get[Int]("NM") shouldBe None
rec.get[String]("MD") shouldBe None
rec.get[Int]("UQ") shouldBe None
Expand All @@ -238,7 +238,7 @@ class BamsTest extends UnitSpec {
rec("MD") = "6A7C8T9G"
rec("UQ") = 237
rec.bases = "AAACAAAATA"
Bams.regenerateNmUqMdTags(rec, DummyRefWalker)
Bams.regenerateNmUqMdTags(rec, DummyRefWalker.get(rec.refIndex))
rec[Int]("NM") shouldBe 2
rec[String]("MD") shouldBe "3A4A1"
rec[Int]("UQ") shouldBe 40
Expand Down

0 comments on commit 0cd6d67

Please sign in to comment.