-
-
Notifications
You must be signed in to change notification settings - Fork 69
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Specify an output sort order in FilterConsensusReads #782
Changes from 5 commits
610b100
a5a150a
1129b9a
0cd6d67
d495fd4
7ccc2b7
0c8b5fb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,20 +24,24 @@ | |
|
||
package com.fulcrumgenomics.umi | ||
|
||
import java.lang.Math.{max, min} | ||
|
||
import com.fulcrumgenomics.FgBioDef._ | ||
import com.fulcrumgenomics.bam.Bams | ||
import com.fulcrumgenomics.bam.api.{SamOrder, SamRecord, SamSource, SamWriter} | ||
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool} | ||
import com.fulcrumgenomics.commons.io.Writer | ||
import com.fulcrumgenomics.commons.util.LazyLogging | ||
import com.fulcrumgenomics.fasta.ReferenceSequenceIterator | ||
import com.fulcrumgenomics.sopt.{arg, clp} | ||
import com.fulcrumgenomics.util.NumericTypes.PhredScore | ||
import com.fulcrumgenomics.util.{Io, ProgressLogger} | ||
import htsjdk.samtools.SAMFileHeader.SortOrder | ||
import htsjdk.samtools.reference.ReferenceSequenceFileWalker | ||
import com.fulcrumgenomics.util.Io | ||
import htsjdk.samtools.SAMFileHeader | ||
import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder} | ||
import htsjdk.samtools.reference.ReferenceSequence | ||
import htsjdk.samtools.util.SequenceUtil | ||
|
||
import java.io.Closeable | ||
import java.lang.Math.{max, min} | ||
|
||
/** Bundles the three thresholds used when filtering consensus reads: a minimum read count, a maximum read error rate and a maximum base error rate. */ | ||
private[umi] case class ConsensusReadFilter(minReads: Int, maxReadErrorRate: Double, maxBaseErrorRate: Double) | ||
|
||
|
@@ -114,7 +118,9 @@ class FilterConsensusReads | |
@arg(flag='q', doc="The minimum mean base quality across the consensus read.") | ||
val minMeanBaseQuality: Option[PhredScore] = None, | ||
@arg(flag='s', doc="Mask (make `N`) consensus bases where the AB and BA consensus reads disagree (for duplex-sequencing only).") | ||
val requireSingleStrandAgreement: Boolean = false | ||
val requireSingleStrandAgreement: Boolean = false, | ||
@arg(flag='S', doc="The sort order of the output. If not given, output will be in the same order as input if the input is query grouped, otherwise queryname order.") | ||
val sortOrder: Option[SamOrder] = None | ||
) extends FgBioTool with LazyLogging { | ||
// Baseline input validation | ||
Io.assertReadable(input) | ||
|
@@ -169,12 +175,12 @@ class FilterConsensusReads | |
private val EmptyFilterResult = FilterResult(keepRead=true, maskedBases=0) | ||
|
||
override def execute(): Unit = { | ||
val in = SamSource(input) | ||
val header = in.header.clone() | ||
header.setSortOrder(SortOrder.coordinate) | ||
val sorter = Bams.sorter(SamOrder.Coordinate, header, maxRecordsInRam=MaxRecordsInMemoryWhenSorting) | ||
val out = SamWriter(output, header) | ||
val progress1 = ProgressLogger(logger, verb="Filtered and masked") | ||
logger.info("Reading the reference fasta into memory") | ||
val refMap = ReferenceSequenceIterator(ref, stripComments=true).map { ref => ref.getContigIndex -> ref}.toMap | ||
logger.info(f"Read ${refMap.size}%,d contigs.") | ||
nh13 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
val in = SamSource(input) | ||
val out = buildOutputWriter(in.header, refMap) | ||
|
||
// Go through the reads by template and do the filtering | ||
val templateIterator = Bams.templateIterator(in, maxInMemory=MaxRecordsInMemoryWhenSorting) | ||
|
@@ -201,34 +207,87 @@ class FilterConsensusReads | |
keptReads += primaryReadCount | ||
totalBases += r1.length + template.r2.map(_.length).getOrElse(0) | ||
maskedBases += r1Result.maskedBases + r2Result.maskedBases | ||
sorter += r1 | ||
progress1.record(r1) | ||
template.r2.foreach { r => sorter += r; progress1.record(r) } | ||
out += r1 | ||
template.r2.foreach { r => out += r } | ||
|
||
template.allSupplementaryAndSecondary.foreach { r => | ||
val result = filterRecord(r) | ||
if (result.keepRead) { | ||
sorter += r | ||
progress1.record(r) | ||
out += r | ||
} | ||
} | ||
} | ||
} | ||
|
||
// Then iterate the reads in coordinate order and re-calculate key tags | ||
logger.info("Filtering complete; fixing tags and writing coordinate sorted reads.") | ||
val progress2 = new ProgressLogger(logger, verb="Wrote") | ||
val walker = new ReferenceSequenceFileWalker(ref.toFile) | ||
sorter.foreach { rec => | ||
Bams.regenerateNmUqMdTags(rec, walker) | ||
out += rec | ||
progress2.record(rec) | ||
} | ||
|
||
logger.info("Finalizing the output") | ||
in.safelyClose() | ||
out.close() | ||
logger.info(f"Output ${keptReads}%,d of ${totalReads}%,d primary consensus reads.") | ||
logger.info(f"Masked ${maskedBases}%,d of ${totalBases}%,d bases in retained primary consensus reads.") | ||
logger.info(f"Output $keptReads%,d of $totalReads%,d primary consensus reads.") | ||
logger.info(f"Masked $maskedBases%,d of $totalBases%,d bases in retained primary consensus reads.") | ||
Comment on lines
+227
to
+228
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why you remove my There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. don't need it, and IntelliJ is making it a game to get rid of the warnings There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So ... IntelliJ is not the boss of us. Put cursor into warning, option-enter, disable inspection. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I really do think that Intellij is wrong on this. Since braces become necessary the moment the expression becomes more than a simple variable reference (e.g. |
||
} | ||
|
||
/** Builds the writer to which filtered records should be written.
  *
  * If the input is [[SamOrder.Queryname]] sorted or query grouped, the filtered records are in the same order as the
  * input.  Otherwise the input records are re-sorted for filtering, so the filtered records are in
  * [[SamOrder.Queryname]] order.
  *
  * The output order is the order requested via `sortOrder` or, when no order was requested, the order of the records
  * after filtering.  Sorting therefore only occurs when an output order was requested AND it differs from the order
  * of the records after filtering.  Otherwise, we can skip sorting!
  *
  * @param inHeader the header of the input BAM; it is cloned and the clone is used for the output
  * @param refMap   the reference sequences, keyed by contig index
  * @return a writer that regenerates the NM/UQ/MD tags of each record before writing it; closing it closes the
  *         underlying BAM writer
  */
private def buildOutputWriter(inHeader: SAMFileHeader, refMap: Map[Int, ReferenceSequence]): Writer[SamRecord] with Closeable = {
  /** The three header values that together identify an ordering. */
  def orderKey(order: SamOrder) = (order.sortOrder, order.groupOrder, order.subSort)

  val outHeader    = inHeader.clone()
  val inSortOrder  = inHeader.getSortOrder
  val inGroupOrder = inHeader.getGroupOrder
  // NOTE(review): the raw SS header attribute is compared against `SamOrder.subSort` below -- confirm that both use
  // the same encoding (e.g. with or without a leading "<sort-order>:" prefix).
  val inSubSort    = Option(inHeader.getAttribute("SS"))

  // The order of the records once filtering is done: unchanged for queryname or query grouped input, otherwise
  // queryname since the records had to be re-sorted to be filtered.
  val inputIsGrouped      = inSortOrder == SortOrder.queryname || inGroupOrder == GroupOrder.query
  val orderAfterFiltering =
    if (inputIsGrouped) (inSortOrder, inGroupOrder, inSubSort)
    else orderKey(SamOrder.Queryname)

  // Only sort when an output order was explicitly requested and the filtered records are not already in that order.
  // When no order was requested the records are written in the order they have after filtering.
  val sort: Option[SamOrder] = this.sortOrder.filter(order => orderKey(order) != orderAfterFiltering)

  // Keep the output header truthful: it was cloned from the input, so it must be updated both when we sort on output
  // and when filtering re-sorted the records into queryname order.  For queryname/query grouped input that is not
  // re-sorted the cloned header is already correct.
  val headerOrder: Option[SamOrder] = sort.orElse(if (inputIsGrouped) None else Some(SamOrder.Queryname))
  headerOrder.foreach(_.applyTo(outHeader))

  val writer = SamWriter(output, outHeader, sort=sort, maxRecordsInRam=MaxRecordsInMemoryWhenSorting)
  sort.foreach(o => logger.info(f"Output will be sorted into $o order"))

  // Wrap the BAM writer so that the NM/UQ/MD tags are regenerated for every record just before it is written
  new Writer[SamRecord] with Closeable {
    override def write(rec: SamRecord): Unit = {
      // NOTE(review): `refMap(rec.refIndex)` throws if the record's reference index is not a key of the map, which
      // presumably includes unmapped records -- confirm that this is handled upstream.
      Bams.regenerateNmUqMdTags(rec, refMap(rec.refIndex))
      writer += rec
    }
    def close(): Unit = writer.close()
  }
}
|
||
/** | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -290,6 +290,67 @@ class FilterConsensusReadsTest extends UnitSpec { | |
} | ||
} | ||
|
||
// Result of a sort-order test: the read names from the input BAM (`in`) and from the filtered output BAM (`out`),
// each in the order in which the records were read back.
private case class ReadNames(in: Seq[String], out: Seq[String]) | ||
|
||
// Builds an input BAM containing two read pairs (`name1` and `name2`, placed at the given R1/R2 start positions)
// using a SamBuilder configured with `inOrder`, runs FilterConsensusReads on it with `outOrder` as the requested
// sort order (None by default), and checks that all four records are kept and that none contains an `N`.
// Returns the read names of the input and of the output so that callers can assert on the record order.
private def sortOrderTest(name1: String, start1R1: Int, start1R2: Int, name2: String, start2R1: Int, start2R2: Int, | ||
inOrder: SamOrder, outOrder: Option[SamOrder] = None): ReadNames = { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think that in order to test more combinations it would be better to take |
||
val builder = new SamBuilder(readLength=10, baseQuality=45, sort=Some(inOrder)) | ||
builder.addPair(name=name1, start1=start1R1, start2=start1R2).foreach(r => tag(r, minDepth=4, depth=4, readErr=0f, depths=arr(4, 10), errors=arr(0,10))) | ||
builder.addPair(name=name2, start1=start2R1, start2=start2R2).foreach(r => tag(r, minDepth=5, depth=5, readErr=0f, depths=arr(5, 10), errors=arr(0,10))) | ||
val in = builder.toTempFile() | ||
val out = makeTempFile("filtered.", ".bam") | ||
new FilterConsensusReads(input=in, output=out, ref=ref, reversePerBaseTags=false, | ||
minBaseQuality=45.toByte, minReads=Seq(3), maxReadErrorRate=Seq(0.025), maxBaseErrorRate=Seq(0.1), maxNoCallFraction=0.1, | ||
sortOrder=outOrder | ||
).execute() | ||
|
||
// Both pairs are expected to pass the filters untouched: four records out, no masked bases
val recs = SamSource(out).toSeq | ||
recs.size shouldBe 4 | ||
recs.exists(_.basesString.contains("N")) shouldBe false | ||
ReadNames(in=readBamRecs(in).map(_.name), out=recs.map(_.name)) | ||
} | ||
|
||
// No output order is requested, so queryname sorted input must be written back in the same order.
it should "output queryname sorted if the input is queryname sorted" in {
  val result = sortOrderTest(
    name1="q1", start1R1=101, start1R2=201,
    name2="q2", start2R1=100, start2R2=200,
    inOrder=SamOrder.Queryname
  )
  result.in should contain theSameElementsInOrderAs Seq("q1", "q1", "q2", "q2") // query name!
  result.out should contain theSameElementsInOrderAs Seq("q1", "q1", "q2", "q2") // query name!
}
|
||
// No output order is requested, so template-coordinate (query grouped) input must be written back in the same order.
it should "output query grouped sorted if the input is query grouped sorted" in {
  val result = sortOrderTest(
    name1="q2", start1R1=100, start1R2=200,
    name2="q1", start2R1=101, start2R2=201,
    inOrder=SamOrder.TemplateCoordinate
  )
  result.in should contain theSameElementsInOrderAs Seq("q2", "q2", "q1", "q1") // query grouped, but not query name
  result.out should contain theSameElementsInOrderAs Seq("q2", "q2", "q1", "q1") // query grouped, but not query name
}
|
||
// No output order is requested and the input is built as unsorted, so the output is expected in queryname order.
it should "output queryname sorted if the input is neither queryname nor query grouped sorted" in {
  val result = sortOrderTest(
    name1="q2", start1R1=100, start1R2=200,
    name2="q1", start2R1=101, start2R2=201,
    inOrder=SamOrder.Unsorted
  )
  result.in should contain theSameElementsInOrderAs Seq("q2", "q2", "q1", "q1") // unsorted: in the order added, not query name
  result.out should contain theSameElementsInOrderAs Seq("q1", "q1", "q2", "q2") // query name
}
|
||
// An explicitly requested coordinate order must win over the queryname order of the input.
it should "output coordinate sorted if the output order is coordinate" in {
  val result = sortOrderTest(
    name1="q1", start1R1=100, start1R2=200,
    name2="q2", start2R1=101, start2R2=201,
    inOrder=SamOrder.Queryname,
    outOrder=Some(SamOrder.Coordinate)
  )
  result.in should contain theSameElementsInOrderAs Seq("q1", "q1", "q2", "q2") // query name
  result.out should contain theSameElementsInOrderAs Seq("q1", "q2", "q1", "q2") // coordinate
}
|
||
////////////////////////////////////////////////////////////////////////////// | ||
// Below this line are tests for filtering of duplex consensus reads. | ||
////////////////////////////////////////////////////////////////////////////// | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I can't comment on the tool usage, but you seemed to miss my top-level review comment last time, so I'm replicating it here: