Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for clipping reads that extend past their mate #761

Merged
merged 9 commits into from
Mar 1, 2022
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 59 additions & 15 deletions src/main/scala/com/fulcrumgenomics/alignment/Alignment.scala
Original file line number Diff line number Diff line change
Expand Up @@ -91,33 +91,71 @@ object Cigar {
}

/** A data class for holding statistics about a [[Cigar]]. */
private[alignment] case class CigarStats(lengthOnQuery: Int, lengthOnTarget: Int, alignedBases: Int, clippedBases: Int)
private[alignment] case class CigarStats(lengthOnQuery: Int,
lengthOnTarget: Int,
alignedBases: Int,
clippedBases: Int,
softClippedFivePrimeBases: Int,
softClippedThreePrimeBases: Int)
nh13 marked this conversation as resolved.
Show resolved Hide resolved

/** Companion object for [[Cigar]]. */
private[alignment] object CigarStats {

/** Build a [[CigarStats]] from a [[Cigar]]. */
private[alignment] def apply(cigar: Cigar): CigarStats = {
var lengthOnQuery = 0
var lengthOnTarget = 0
var alignedBases = 0
var clippedBases = 0
var lengthOnQuery = 0
var lengthOnTarget = 0
var alignedBases = 0
var clippedBases = 0
var softClippedFivePrimeBases = 0
var softClippedThreePrimeBases = 0
var index = 0

// leading clipping
while (index < cigar.elems.length && cigar.elems(index).operator.isClipping) {
val elem = cigar.elems(index)
lengthOnQuery += elem.lengthOnQuery
lengthOnTarget += elem.lengthOnTarget
clippedBases += elem.length
if (elem.operator == CigarOperator.S) softClippedFivePrimeBases += elem.length
index += 1
}

forloop(0, cigar.elems.length) { index =>
// aligned bases
while (index < cigar.elems.length && !cigar.elems(index).operator.isClipping) {
val elem = cigar.elems(index)
lengthOnQuery += elem.lengthOnQuery
lengthOnTarget += elem.lengthOnTarget
if (elem.operator.isAlignment) alignedBases += elem.length
else if (elem.operator.isClipping) clippedBases += elem.length
if (elem.operator.isAlignment) alignedBases += elem.length
index += 1
}

// trailing clipping
while (index < cigar.elems.length && cigar.elems(index).operator.isClipping) {
val elem = cigar.elems(index)
lengthOnQuery += elem.lengthOnQuery
lengthOnTarget += elem.lengthOnTarget
clippedBases += elem.length
if (elem.operator == CigarOperator.S) softClippedThreePrimeBases += elem.length
index += 1
}

require(index == cigar.elems.length,
s"Invalid cigar, did not consume all the elements: ${Cigar(cigar.elems.take(index))}[${Cigar(cigar.elems.drop(index))}]"
)

CigarStats(
lengthOnQuery = lengthOnQuery,
lengthOnTarget = lengthOnTarget,
alignedBases = alignedBases,
clippedBases = clippedBases
lengthOnQuery = lengthOnQuery,
lengthOnTarget = lengthOnTarget,
alignedBases = alignedBases,
clippedBases = clippedBases,
softClippedFivePrimeBases = softClippedFivePrimeBases,
softClippedThreePrimeBases = softClippedThreePrimeBases
)
}

/** Build a [[CigarStats]] from a string representation of the cigar. */
private[alignment] def apply(cigar: String): CigarStats = this.apply(Cigar(cigar))
nh13 marked this conversation as resolved.
Show resolved Hide resolved
}

/**
Expand Down Expand Up @@ -163,6 +201,12 @@ case class Cigar(elems: IndexedSeq[CigarElem]) extends Iterable[CigarElem] {
/** Returns the number of bases that are clipped between the two sequences. */
def clippedBases: Int = stats.clippedBases

/** Returns the number of bases that are soft-clipped on the 5' end of this sequence. Ignores hard-clips. */
def softClippedFivePrimeBases: Int = stats.softClippedFivePrimeBases

/** Returns the number of bases that are soft-clipped on the 3' end of this sequence. Ignores hard-clips. */
def softClippedThreePrimeBases: Int = stats.softClippedThreePrimeBases
nh13 marked this conversation as resolved.
Show resolved Hide resolved

/** Truncates the cigar based on either query or target length cutoff. */
private def truncate(len: Int, shouldCount: CigarElem => Boolean): Cigar = {
var pos = 1
Expand Down Expand Up @@ -297,21 +341,21 @@ case class Alignment(query: Array[Byte],

cig.operator match {
case CigarOperator.INSERTION =>
forloop (from=0, until=len) { i =>
forloop (from=0, until=len) { _ =>
queryBuffer.append(this.query(qOffset).toChar)
alignBuffer.append(gapChar)
targetBuffer.append(padChar)
qOffset += 1
}
case CigarOperator.DELETION =>
forloop (from=0, until=len) { i =>
forloop (from=0, until=len) { _ =>
queryBuffer.append(padChar)
alignBuffer.append(gapChar)
targetBuffer.append(this.target(tOffset).toChar)
tOffset += 1
}
case op =>
forloop (from=0, until=len) { i =>
forloop (from=0, until=len) { _ =>
val q = this.query(qOffset).toChar
val t = this.target(tOffset).toChar
queryBuffer.append(q)
Expand Down
63 changes: 32 additions & 31 deletions src/main/scala/com/fulcrumgenomics/bam/ClipBam.scala
Original file line number Diff line number Diff line change
Expand Up @@ -78,13 +78,14 @@ class ClipBam
@arg( doc="Require at least this number of bases to be clipped on the 3' end of R1") val readOneThreePrime: Int = 0,
@arg( doc="Require at least this number of bases to be clipped on the 5' end of R2") val readTwoFivePrime: Int = 0,
@arg( doc="Require at least this number of bases to be clipped on the 3' end of R2") val readTwoThreePrime: Int = 0,
@arg( doc="Clip overlapping reads.") val clipOverlappingReads: Boolean = false
@arg( doc="Clip overlapping reads.", mutex=Array("clipBasesPastMate")) val clipOverlappingReads: Boolean = false,
@arg( doc="Clip reads who sequence past the start of their mate.", mutex=Array("clipOverlappingReads")) val clipBasesPastMate: Boolean = false
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel like it's awkward to make these mutex. Yes, one implies the other, but they don't contradict each other. E.g. I could totally see a pipeline that wants to always set --clip-bases-past-mate and optionally set --clip-overlapping-reads, and now it'll have to know to unset the first option based on the second, when it's not strictly necessary.

Copy link
Member Author

@nh13 nh13 Feb 26, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I prefer the mutex, since one is a subset of the other. If this was written from scratch, I'd prefer an enum that was "None", "PastMate", "Overlapping" for a single argument, but here we are. Perhaps we can deprecate clipOverlappingReads and make an Option[PairClipping] type, where PairClipping is PastMate or Overlapping. Does that work for you?

nh13 marked this conversation as resolved.
Show resolved Hide resolved
) extends FgBioTool with LazyLogging {
Io.assertReadable(input)
Io.assertReadable(ref)
Io.assertCanWriteFile(output)

validate(upgradeClipping || clipOverlappingReads || Seq(readOneFivePrime, readOneThreePrime, readTwoFivePrime, readTwoThreePrime).exists(_ != 0),
validate(upgradeClipping || clipOverlappingReads || clipBasesPastMate || Seq(readOneFivePrime, readOneThreePrime, readTwoFivePrime, readTwoThreePrime).exists(_ != 0),
"At least one clipping option is required")

private val clipper = new SamRecordClipper(mode=clippingMode, autoClipAttributes=autoClipAttributes)
Expand Down Expand Up @@ -164,7 +165,8 @@ class ClipBam
priorBasesClipped = priorBasesClipped,
numFivePrime = numFivePrime,
numThreePrime = numThreePrime,
numOverlappingBases = 0
numOverlappingBases = 0,
numExtendingBases = 0
)
}
}
Expand All @@ -182,9 +184,13 @@ class ClipBam
val numReadTwoThreePrime = this.clipper.clip3PrimeEndOfRead(r2, readTwoThreePrime)

val (numOverlappingBasesReadOne, numOverlappingBasesReadTwo) = {
if (clipOverlappingReads && r1.isFrPair) {
if (r1.positiveStrand) clipOverlappingReads(r1, r2) else clipOverlappingReads(r2, r1)
} else (0, 0)
if (clipOverlappingReads && r1.isFrPair) this.clipper.clipOverlappingReads(r1, r2)
else (0, 0)
}

val (numExtendingPastMateStartReadOne, numExtendingPastMateStartReadTwo) = {
if (clipBasesPastMate && r1.isFrPair) this.clipper.clipExtendingPastMateEnds(r1, r2)
else (0, 0)
}

r1Metric.foreach { m =>
Expand All @@ -193,7 +199,8 @@ class ClipBam
priorBasesClipped = priorBasesClippedReadOne,
numFivePrime = numReadOneFivePrime,
numThreePrime = numReadOneThreePrime,
numOverlappingBases = numOverlappingBasesReadOne
numOverlappingBases = numOverlappingBasesReadOne,
numExtendingBases = numExtendingPastMateStartReadOne
)
}

Expand All @@ -203,27 +210,11 @@ class ClipBam
priorBasesClipped = priorBasesClippedReadTwo,
numFivePrime = numReadTwoFivePrime,
numThreePrime = numReadTwoThreePrime,
numOverlappingBases = numOverlappingBasesReadTwo
numOverlappingBases = numOverlappingBasesReadTwo,
numExtendingBases = numExtendingPastMateStartReadTwo
)
}
}

/** Clips overlapping reads, where both ends of the read pair are mapped to the same chromosome, and in FR orientation. */
protected def clipOverlappingReads(f: SamRecord, r: SamRecord): (Int, Int) = {
var numOverlappingBasesReadOne: Int = 0
var numOverlappingBasesReadTwo: Int = 0
// What we really want is to trim by the number of _reference_ bases not read bases,
// in order to eliminate overlap. We could do something very complicated here, or
// we could just trim read bases in a loop until the overlap is eliminated!
while (f.end >= r.start && f.mapped && r.mapped) {
val lengthToClip = f.end - r.start + 1
val firstHalf = lengthToClip / 2
val secondHalf = lengthToClip - firstHalf // safe guard against rounding on odd lengths
numOverlappingBasesReadOne += this.clipper.clip3PrimeEndOfAlignment(f, firstHalf)
numOverlappingBasesReadTwo += this.clipper.clip3PrimeEndOfAlignment(r, secondHalf)
}
(numOverlappingBasesReadOne, numOverlappingBasesReadTwo)
}
}

sealed trait ReadType extends EnumEntry
Expand All @@ -246,13 +237,15 @@ object ReadType extends FgBioEnum[ReadType] {
* @param reads_clipped_five_prime The number of reads with the 5' end clipped.
* @param reads_clipped_three_prime The number of reads with the 3' end clipped.
* @param reads_clipped_overlapping The number of reads clipped due to overlapping reads.
* @param reads_clipped_extending The number of reads clipped due to a read extending past its mate.
* @param reads_unmapped The number of reads that became unmapped due to clipping.
* @param bases The number of aligned bases after clipping.
* @param bases_clipped_pre The number of bases clipped prior to clipping with [[ClipBam]].
* @param bases_clipped_post The number of bases clipped after clipping with [[ClipBam]], including bases from reads that became unmapped.
* @param bases_clipped_five_prime The number of bases clipped on the 5' end of the read.
* @param bases_clipped_three_prime The number of bases clipped on the 3 end of the read.
* @param bases_clipped_overlapping The number of bases clipped due to overlapping reads.
* @param bases_clipped_extending The number of bases clipped due to a read extending past its mate.
*/
case class ClippingMetrics
(read_type: ReadType,
Expand All @@ -263,14 +256,16 @@ case class ClippingMetrics
var reads_clipped_five_prime: Long = 0,
var reads_clipped_three_prime: Long = 0,
var reads_clipped_overlapping: Long = 0,
var reads_clipped_extending: Long = 0,
var bases: Long = 0,
var bases_clipped_pre: Long = 0,
var bases_clipped_post: Long = 0,
var bases_clipped_five_prime: Long = 0,
var bases_clipped_three_prime: Long = 0,
var bases_clipped_overlapping: Long = 0
var bases_clipped_overlapping: Long = 0,
var bases_clipped_extending: Long = 0,
) extends Metric {
def update(rec: SamRecord, priorBasesClipped: Int, numFivePrime: Int, numThreePrime: Int, numOverlappingBases: Int): Unit = {
def update(rec: SamRecord, priorBasesClipped: Int, numFivePrime: Int, numThreePrime: Int, numOverlappingBases: Int, numExtendingBases: Int): Unit = {
this.reads += 1
this.bases += rec.cigar.alignedBases
if (priorBasesClipped > 0) {
Expand All @@ -289,11 +284,15 @@ case class ClippingMetrics
this.reads_clipped_overlapping += 1
this.bases_clipped_overlapping += numOverlappingBases
}
val additionalClippedBases = numFivePrime + numThreePrime + numOverlappingBases
val totalClippedBasees = additionalClippedBases + priorBasesClipped
if (totalClippedBasees > 0) {
if (numExtendingBases > 0) {
this.reads_clipped_extending += 1
this.bases_clipped_extending += numExtendingBases
}
val additionalClippedBases = numFivePrime + numThreePrime + numOverlappingBases + numExtendingBases
val totalClippedBases = additionalClippedBases + priorBasesClipped
if (totalClippedBases > 0) {
this.reads_clipped_post += 1
this.bases_clipped_post += totalClippedBasees
this.bases_clipped_post += totalClippedBases
if (rec.unmapped && additionalClippedBases > 0) this.reads_unmapped += 1
}
}
Expand All @@ -306,11 +305,13 @@ case class ClippingMetrics
this.reads_clipped_five_prime += metric.sumBy(_.reads_clipped_five_prime)
this.reads_clipped_three_prime += metric.sumBy(_.reads_clipped_three_prime)
this.reads_clipped_overlapping += metric.sumBy(_.reads_clipped_overlapping)
this.reads_clipped_extending += metric.sumBy(_.reads_clipped_extending)
this.bases += metric.sumBy(_.bases)
this.bases_clipped_pre += metric.sumBy(_.bases_clipped_pre)
this.bases_clipped_post += metric.sumBy(_.bases_clipped_post)
this.bases_clipped_five_prime += metric.sumBy(_.bases_clipped_five_prime)
this.bases_clipped_three_prime += metric.sumBy(_.bases_clipped_three_prime)
this.bases_clipped_overlapping += metric.sumBy(_.bases_clipped_overlapping)
this.bases_clipped_extending += metric.sumBy(_.bases_clipped_extending)
}
}
Loading