Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for clipping reads that extend past their mate #761

Merged
merged 9 commits into from
Mar 1, 2022
Merged
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 75 additions & 15 deletions src/main/scala/com/fulcrumgenomics/alignment/Alignment.scala
Original file line number Diff line number Diff line change
@@ -91,33 +91,71 @@ object Cigar {
}

/** A data class for holding statistics about a [[Cigar]]. */
private[alignment] case class CigarStats(lengthOnQuery: Int, lengthOnTarget: Int, alignedBases: Int, clippedBases: Int)
private[alignment] case class CigarStats(lengthOnQuery: Int,
lengthOnTarget: Int,
alignedBases: Int,
clippedBases: Int,
leadingSoftClippedBases: Int,
trailingSoftClippedBases: Int)

/** Companion object for [[Cigar]]. */
private[alignment] object CigarStats {

/** Build a [[CigarStats]] from a [[Cigar]]. */
private[alignment] def apply(cigar: Cigar): CigarStats = {
var lengthOnQuery = 0
var lengthOnTarget = 0
var alignedBases = 0
var clippedBases = 0
var lengthOnQuery = 0
var lengthOnTarget = 0
var alignedBases = 0
var clippedBases = 0
var leadingSoftClippedBases = 0
var trailingSoftClippedBases = 0
var index = 0

// leading clipping
while (index < cigar.elems.length && cigar.elems(index).operator.isClipping) {
val elem = cigar.elems(index)
lengthOnQuery += elem.lengthOnQuery
lengthOnTarget += elem.lengthOnTarget
clippedBases += elem.length
if (elem.operator == CigarOperator.S) leadingSoftClippedBases += elem.length
index += 1
}

forloop(0, cigar.elems.length) { index =>
// aligned bases
while (index < cigar.elems.length && !cigar.elems(index).operator.isClipping) {
val elem = cigar.elems(index)
lengthOnQuery += elem.lengthOnQuery
lengthOnTarget += elem.lengthOnTarget
if (elem.operator.isAlignment) alignedBases += elem.length
else if (elem.operator.isClipping) clippedBases += elem.length
if (elem.operator.isAlignment) alignedBases += elem.length
index += 1
}

// trailing clipping
while (index < cigar.elems.length && cigar.elems(index).operator.isClipping) {
val elem = cigar.elems(index)
lengthOnQuery += elem.lengthOnQuery
lengthOnTarget += elem.lengthOnTarget
clippedBases += elem.length
if (elem.operator == CigarOperator.S) trailingSoftClippedBases += elem.length
index += 1
}

require(index == cigar.elems.length,
s"Invalid cigar, did not consume all the elements: ${Cigar(cigar.elems.take(index))}[${Cigar(cigar.elems.drop(index))}]"
)

CigarStats(
lengthOnQuery = lengthOnQuery,
lengthOnTarget = lengthOnTarget,
alignedBases = alignedBases,
clippedBases = clippedBases
lengthOnQuery = lengthOnQuery,
lengthOnTarget = lengthOnTarget,
alignedBases = alignedBases,
clippedBases = clippedBases,
leadingSoftClippedBases = leadingSoftClippedBases,
trailingSoftClippedBases = trailingSoftClippedBases
)
}

/** Build a [[CigarStats]] from a string representation of the cigar. */
private[alignment] def apply(cigar: String): CigarStats = this.apply(Cigar(cigar))
nh13 marked this conversation as resolved.
Show resolved Hide resolved
}

/**
@@ -163,6 +201,28 @@ case class Cigar(elems: IndexedSeq[CigarElem]) extends Iterable[CigarElem] {
/** Returns the number of bases that are clipped between the two sequences. */
def clippedBases: Int = stats.clippedBases

/** Returns the number of bases that are soft-clipped at the start of the sequence Ignores hard-clips. */
def leadingSoftClippedBases: Int = stats.leadingSoftClippedBases

/** Returns the number of bases that are soft-clipped at the end of this sequence. Ignores hard-clips. */
def trailingSoftClippedBases: Int = stats.trailingSoftClippedBases

/** Returns the number of bases that are hard-clipped at the start of the sequence. */
def leadingHardClippedBases = this.headOption.map { elem =>
if (elem.operator == CigarOperator.H) elem.length else 0
}.getOrElse(0)

/** Returns the number of bases that are clipped at the start of the sequence. */
def leadingClippedBases: Int = leadingHardClippedBases + leadingSoftClippedBases

/** Returns the number of bases that are hard-clipped at the end of the sequence. */
def trailingHardClippedBases = this.lastOption.map { elem =>
if (elem.operator == CigarOperator.H) elem.length else 0
}.getOrElse(0)

/** Returns the number of bases that are clipped at the end of the sequence. */
def trailingClippedBases: Int = trailingHardClippedBases + trailingSoftClippedBases

/** Truncates the cigar based on either query or target length cutoff. */
private def truncate(len: Int, shouldCount: CigarElem => Boolean): Cigar = {
var pos = 1
@@ -297,21 +357,21 @@ case class Alignment(query: Array[Byte],

cig.operator match {
case CigarOperator.INSERTION =>
forloop (from=0, until=len) { i =>
forloop (from=0, until=len) { _ =>
queryBuffer.append(this.query(qOffset).toChar)
alignBuffer.append(gapChar)
targetBuffer.append(padChar)
qOffset += 1
}
case CigarOperator.DELETION =>
forloop (from=0, until=len) { i =>
forloop (from=0, until=len) { _ =>
queryBuffer.append(padChar)
alignBuffer.append(gapChar)
targetBuffer.append(this.target(tOffset).toChar)
tOffset += 1
}
case op =>
forloop (from=0, until=len) { i =>
forloop (from=0, until=len) { _ =>
val q = this.query(qOffset).toChar
val t = this.target(tOffset).toChar
queryBuffer.append(q)
67 changes: 36 additions & 31 deletions src/main/scala/com/fulcrumgenomics/bam/ClipBam.scala
Original file line number Diff line number Diff line change
@@ -78,13 +78,14 @@ class ClipBam
@arg( doc="Require at least this number of bases to be clipped on the 3' end of R1") val readOneThreePrime: Int = 0,
@arg( doc="Require at least this number of bases to be clipped on the 5' end of R2") val readTwoFivePrime: Int = 0,
@arg( doc="Require at least this number of bases to be clipped on the 3' end of R2") val readTwoThreePrime: Int = 0,
@arg( doc="Clip overlapping reads.") val clipOverlappingReads: Boolean = false
@arg( doc="Clip overlapping reads.", mutex=Array("clipBasesPastMate")) val clipOverlappingReads: Boolean = false,
@arg( doc="Clip reads in FR pairs that sequence past the far end of their mate.", mutex=Array("clipOverlappingReads")) val clipBasesPastMate: Boolean = false
) extends FgBioTool with LazyLogging {
Io.assertReadable(input)
Io.assertReadable(ref)
Io.assertCanWriteFile(output)

validate(upgradeClipping || clipOverlappingReads || Seq(readOneFivePrime, readOneThreePrime, readTwoFivePrime, readTwoThreePrime).exists(_ != 0),
validate(upgradeClipping || clipOverlappingReads || clipBasesPastMate || Seq(readOneFivePrime, readOneThreePrime, readTwoFivePrime, readTwoThreePrime).exists(_ != 0),
"At least one clipping option is required")

private val clipper = new SamRecordClipper(mode=clippingMode, autoClipAttributes=autoClipAttributes)
@@ -164,7 +165,8 @@ class ClipBam
priorBasesClipped = priorBasesClipped,
numFivePrime = numFivePrime,
numThreePrime = numThreePrime,
numOverlappingBases = 0
numOverlappingBases = 0,
numExtendingBases = 0
)
}
}
@@ -182,9 +184,17 @@ class ClipBam
val numReadTwoThreePrime = this.clipper.clip3PrimeEndOfRead(r2, readTwoThreePrime)

val (numOverlappingBasesReadOne, numOverlappingBasesReadTwo) = {
if (clipOverlappingReads && r1.isFrPair) {
if (r1.positiveStrand) clipOverlappingReads(r1, r2) else clipOverlappingReads(r2, r1)
} else (0, 0)
if (clipOverlappingReads && r1.isFrPair) this.clipper.clipOverlappingReads(r1, r2)
else (0, 0)
}

val (numExtendingPastMateStartReadOne, numExtendingPastMateStartReadTwo) = {
if (clipBasesPastMate && r1.isFrPair) {
val clip1 = this.clipper.clipExtendingPastMateEnd(rec=r1, mateEnd=r2.end)
val clip2 = this.clipper.clipExtendingPastMateEnd(rec=r2, mateEnd=r1.end)
(clip1, clip2)
}
else (0, 0)
}

r1Metric.foreach { m =>
@@ -193,7 +203,8 @@ class ClipBam
priorBasesClipped = priorBasesClippedReadOne,
numFivePrime = numReadOneFivePrime,
numThreePrime = numReadOneThreePrime,
numOverlappingBases = numOverlappingBasesReadOne
numOverlappingBases = numOverlappingBasesReadOne,
numExtendingBases = numExtendingPastMateStartReadOne
)
}

@@ -203,27 +214,11 @@ class ClipBam
priorBasesClipped = priorBasesClippedReadTwo,
numFivePrime = numReadTwoFivePrime,
numThreePrime = numReadTwoThreePrime,
numOverlappingBases = numOverlappingBasesReadTwo
numOverlappingBases = numOverlappingBasesReadTwo,
numExtendingBases = numExtendingPastMateStartReadTwo
)
}
}

/** Clips overlapping reads, where both ends of the read pair are mapped to the same chromosome, and in FR orientation. */
protected def clipOverlappingReads(f: SamRecord, r: SamRecord): (Int, Int) = {
var numOverlappingBasesReadOne: Int = 0
var numOverlappingBasesReadTwo: Int = 0
// What we really want is to trim by the number of _reference_ bases not read bases,
// in order to eliminate overlap. We could do something very complicated here, or
// we could just trim read bases in a loop until the overlap is eliminated!
while (f.end >= r.start && f.mapped && r.mapped) {
val lengthToClip = f.end - r.start + 1
val firstHalf = lengthToClip / 2
val secondHalf = lengthToClip - firstHalf // safe guard against rounding on odd lengths
numOverlappingBasesReadOne += this.clipper.clip3PrimeEndOfAlignment(f, firstHalf)
numOverlappingBasesReadTwo += this.clipper.clip3PrimeEndOfAlignment(r, secondHalf)
}
(numOverlappingBasesReadOne, numOverlappingBasesReadTwo)
}
}

sealed trait ReadType extends EnumEntry
@@ -246,13 +241,15 @@ object ReadType extends FgBioEnum[ReadType] {
* @param reads_clipped_five_prime The number of reads with the 5' end clipped.
* @param reads_clipped_three_prime The number of reads with the 3' end clipped.
* @param reads_clipped_overlapping The number of reads clipped due to overlapping reads.
* @param reads_clipped_extending The number of reads clipped due to a read extending past its mate.
* @param reads_unmapped The number of reads that became unmapped due to clipping.
* @param bases The number of aligned bases after clipping.
* @param bases_clipped_pre The number of bases clipped prior to clipping with [[ClipBam]].
* @param bases_clipped_post The number of bases clipped after clipping with [[ClipBam]], including bases from reads that became unmapped.
* @param bases_clipped_five_prime The number of bases clipped on the 5' end of the read.
* @param bases_clipped_three_prime The number of bases clipped on the 3 end of the read.
* @param bases_clipped_overlapping The number of bases clipped due to overlapping reads.
* @param bases_clipped_extending The number of bases clipped due to a read extending past its mate.
*/
case class ClippingMetrics
(read_type: ReadType,
@@ -263,14 +260,16 @@ case class ClippingMetrics
var reads_clipped_five_prime: Long = 0,
var reads_clipped_three_prime: Long = 0,
var reads_clipped_overlapping: Long = 0,
var reads_clipped_extending: Long = 0,
var bases: Long = 0,
var bases_clipped_pre: Long = 0,
var bases_clipped_post: Long = 0,
var bases_clipped_five_prime: Long = 0,
var bases_clipped_three_prime: Long = 0,
var bases_clipped_overlapping: Long = 0
var bases_clipped_overlapping: Long = 0,
var bases_clipped_extending: Long = 0,
) extends Metric {
def update(rec: SamRecord, priorBasesClipped: Int, numFivePrime: Int, numThreePrime: Int, numOverlappingBases: Int): Unit = {
def update(rec: SamRecord, priorBasesClipped: Int, numFivePrime: Int, numThreePrime: Int, numOverlappingBases: Int, numExtendingBases: Int): Unit = {
this.reads += 1
this.bases += rec.cigar.alignedBases
if (priorBasesClipped > 0) {
@@ -289,11 +288,15 @@ case class ClippingMetrics
this.reads_clipped_overlapping += 1
this.bases_clipped_overlapping += numOverlappingBases
}
val additionalClippedBases = numFivePrime + numThreePrime + numOverlappingBases
val totalClippedBasees = additionalClippedBases + priorBasesClipped
if (totalClippedBasees > 0) {
if (numExtendingBases > 0) {
this.reads_clipped_extending += 1
this.bases_clipped_extending += numExtendingBases
}
val additionalClippedBases = numFivePrime + numThreePrime + numOverlappingBases + numExtendingBases
val totalClippedBases = additionalClippedBases + priorBasesClipped
if (totalClippedBases > 0) {
this.reads_clipped_post += 1
this.bases_clipped_post += totalClippedBasees
this.bases_clipped_post += totalClippedBases
if (rec.unmapped && additionalClippedBases > 0) this.reads_unmapped += 1
}
}
@@ -306,11 +309,13 @@ case class ClippingMetrics
this.reads_clipped_five_prime += metric.sumBy(_.reads_clipped_five_prime)
this.reads_clipped_three_prime += metric.sumBy(_.reads_clipped_three_prime)
this.reads_clipped_overlapping += metric.sumBy(_.reads_clipped_overlapping)
this.reads_clipped_extending += metric.sumBy(_.reads_clipped_extending)
this.bases += metric.sumBy(_.bases)
this.bases_clipped_pre += metric.sumBy(_.bases_clipped_pre)
this.bases_clipped_post += metric.sumBy(_.bases_clipped_post)
this.bases_clipped_five_prime += metric.sumBy(_.bases_clipped_five_prime)
this.bases_clipped_three_prime += metric.sumBy(_.bases_clipped_three_prime)
this.bases_clipped_overlapping += metric.sumBy(_.bases_clipped_overlapping)
this.bases_clipped_extending += metric.sumBy(_.bases_clipped_extending)
}
}
Loading