diff --git a/src/main/scala/com/fulcrumgenomics/alignment/Alignment.scala b/src/main/scala/com/fulcrumgenomics/alignment/Alignment.scala index c1276cd7b..04df916b6 100644 --- a/src/main/scala/com/fulcrumgenomics/alignment/Alignment.scala +++ b/src/main/scala/com/fulcrumgenomics/alignment/Alignment.scala @@ -91,33 +91,71 @@ object Cigar { } /** A data class for holding statistics about a [[Cigar]]. */ -private[alignment] case class CigarStats(lengthOnQuery: Int, lengthOnTarget: Int, alignedBases: Int, clippedBases: Int) +private[alignment] case class CigarStats(lengthOnQuery: Int, + lengthOnTarget: Int, + alignedBases: Int, + clippedBases: Int, + leadingSoftClippedBases: Int, + trailingSoftClippedBases: Int) /** Companion object for [[Cigar]]. */ private[alignment] object CigarStats { /** Build a [[CigarStats]] from a [[Cigar]]. */ private[alignment] def apply(cigar: Cigar): CigarStats = { - var lengthOnQuery = 0 - var lengthOnTarget = 0 - var alignedBases = 0 - var clippedBases = 0 + var lengthOnQuery = 0 + var lengthOnTarget = 0 + var alignedBases = 0 + var clippedBases = 0 + var leadingSoftClippedBases = 0 + var trailingSoftClippedBases = 0 + var index = 0 + + // leading clipping + while (index < cigar.elems.length && cigar.elems(index).operator.isClipping) { + val elem = cigar.elems(index) + lengthOnQuery += elem.lengthOnQuery + lengthOnTarget += elem.lengthOnTarget + clippedBases += elem.length + if (elem.operator == CigarOperator.S) leadingSoftClippedBases += elem.length + index += 1 + } - forloop(0, cigar.elems.length) { index => + // aligned bases + while (index < cigar.elems.length && !cigar.elems(index).operator.isClipping) { val elem = cigar.elems(index) lengthOnQuery += elem.lengthOnQuery lengthOnTarget += elem.lengthOnTarget - if (elem.operator.isAlignment) alignedBases += elem.length - else if (elem.operator.isClipping) clippedBases += elem.length + if (elem.operator.isAlignment) alignedBases += elem.length + index += 1 } + // trailing clipping + while (index < cigar.elems.length && cigar.elems(index).operator.isClipping) { + val elem = cigar.elems(index) + lengthOnQuery += elem.lengthOnQuery + lengthOnTarget += elem.lengthOnTarget + clippedBases += elem.length + if (elem.operator == CigarOperator.S) trailingSoftClippedBases += elem.length + index += 1 + } + + require(index == cigar.elems.length, + s"Invalid cigar, did not consume all the elements: ${Cigar(cigar.elems.take(index))}[${Cigar(cigar.elems.drop(index))}]" + ) + CigarStats( - lengthOnQuery = lengthOnQuery, - lengthOnTarget = lengthOnTarget, - alignedBases = alignedBases, - clippedBases = clippedBases + lengthOnQuery = lengthOnQuery, + lengthOnTarget = lengthOnTarget, + alignedBases = alignedBases, + clippedBases = clippedBases, + leadingSoftClippedBases = leadingSoftClippedBases, + trailingSoftClippedBases = trailingSoftClippedBases ) } + + /** Build a [[CigarStats]] from a string representation of the cigar. */ + private[alignment] def apply(cigar: String): CigarStats = this.apply(Cigar(cigar)) } /** @@ -163,6 +201,28 @@ case class Cigar(elems: IndexedSeq[CigarElem]) extends Iterable[CigarElem] { /** Returns the number of bases that are clipped between the two sequences. */ def clippedBases: Int = stats.clippedBases + /** Returns the number of bases that are soft-clipped at the start of the sequence Ignores hard-clips. */ + def leadingSoftClippedBases: Int = stats.leadingSoftClippedBases + + /** Returns the number of bases that are soft-clipped at the end of this sequence. Ignores hard-clips. */ + def trailingSoftClippedBases: Int = stats.trailingSoftClippedBases + + /** Returns the number of bases that are hard-clipped at the start of the sequence. */ + def leadingHardClippedBases = this.headOption.map { elem => + if (elem.operator == CigarOperator.H) elem.length else 0 + }.getOrElse(0) + + /** Returns the number of bases that are clipped at the start of the sequence. */ + def leadingClippedBases: Int = leadingHardClippedBases + leadingSoftClippedBases + + /** Returns the number of bases that are hard-clipped at the end of the sequence. */ + def trailingHardClippedBases = this.lastOption.map { elem => + if (elem.operator == CigarOperator.H) elem.length else 0 + }.getOrElse(0) + + /** Returns the number of bases that are clipped at the end of the sequence. */ + def trailingClippedBases: Int = trailingHardClippedBases + trailingSoftClippedBases + /** Truncates the cigar based on either query or target length cutoff. */ private def truncate(len: Int, shouldCount: CigarElem => Boolean): Cigar = { var pos = 1 @@ -297,21 +357,21 @@ case class Alignment(query: Array[Byte], cig.operator match { case CigarOperator.INSERTION => - forloop (from=0, until=len) { i => + forloop (from=0, until=len) { _ => queryBuffer.append(this.query(qOffset).toChar) alignBuffer.append(gapChar) targetBuffer.append(padChar) qOffset += 1 } case CigarOperator.DELETION => - forloop (from=0, until=len) { i => + forloop (from=0, until=len) { _ => queryBuffer.append(padChar) alignBuffer.append(gapChar) targetBuffer.append(this.target(tOffset).toChar) tOffset += 1 } case op => - forloop (from=0, until=len) { i => + forloop (from=0, until=len) { _ => val q = this.query(qOffset).toChar val t = this.target(tOffset).toChar queryBuffer.append(q) diff --git a/src/main/scala/com/fulcrumgenomics/bam/ClipBam.scala b/src/main/scala/com/fulcrumgenomics/bam/ClipBam.scala index f74998a5f..a667d7373 100644 --- a/src/main/scala/com/fulcrumgenomics/bam/ClipBam.scala +++ b/src/main/scala/com/fulcrumgenomics/bam/ClipBam.scala @@ -78,13 +78,14 @@ class ClipBam @arg( doc="Require at least this number of bases to be clipped on the 3' end of R1") val readOneThreePrime: Int = 0, @arg( doc="Require at least this number of bases to be clipped on the 5' end of R2") val readTwoFivePrime: Int = 0, @arg( doc="Require at least this number of bases to be clipped on the 3' end of R2") val readTwoThreePrime: Int = 0, - @arg( doc="Clip overlapping reads.") val clipOverlappingReads: Boolean = false + @arg( doc="Clip overlapping reads.", mutex=Array("clipBasesPastMate")) val clipOverlappingReads: Boolean = false, + @arg( doc="Clip reads in FR pairs that sequence past the far end of their mate.", mutex=Array("clipOverlappingReads")) val clipBasesPastMate: Boolean = false ) extends FgBioTool with LazyLogging { Io.assertReadable(input) Io.assertReadable(ref) Io.assertCanWriteFile(output) - validate(upgradeClipping || clipOverlappingReads || Seq(readOneFivePrime, readOneThreePrime, readTwoFivePrime, readTwoThreePrime).exists(_ != 0), + validate(upgradeClipping || clipOverlappingReads || clipBasesPastMate || Seq(readOneFivePrime, readOneThreePrime, readTwoFivePrime, readTwoThreePrime).exists(_ != 0), "At least one clipping option is required") private val clipper = new SamRecordClipper(mode=clippingMode, autoClipAttributes=autoClipAttributes) @@ -164,7 +165,8 @@ class ClipBam priorBasesClipped = priorBasesClipped, numFivePrime = numFivePrime, numThreePrime = numThreePrime, - numOverlappingBases = 0 + numOverlappingBases = 0, + numExtendingBases = 0 ) } } @@ -182,9 +184,17 @@ class ClipBam val numReadTwoThreePrime = this.clipper.clip3PrimeEndOfRead(r2, readTwoThreePrime) val (numOverlappingBasesReadOne, numOverlappingBasesReadTwo) = { - if (clipOverlappingReads && r1.isFrPair) { - if (r1.positiveStrand) clipOverlappingReads(r1, r2) else clipOverlappingReads(r2, r1) - } else (0, 0) + if (clipOverlappingReads && r1.isFrPair) this.clipper.clipOverlappingReads(r1, r2) + else (0, 0) + } + + val (numExtendingPastMateStartReadOne, numExtendingPastMateStartReadTwo) = { + if (clipBasesPastMate && r1.isFrPair) { + val clip1 = this.clipper.clipExtendingPastMateEnd(rec=r1, mateEnd=r2.end) + val clip2 = this.clipper.clipExtendingPastMateEnd(rec=r2, mateEnd=r1.end) + (clip1, clip2) + } + else (0, 0) } r1Metric.foreach { m => @@ -193,7 +203,8 @@ class ClipBam priorBasesClipped = priorBasesClippedReadOne, numFivePrime = numReadOneFivePrime, numThreePrime = numReadOneThreePrime, - numOverlappingBases = numOverlappingBasesReadOne + numOverlappingBases = numOverlappingBasesReadOne, + numExtendingBases = numExtendingPastMateStartReadOne ) } @@ -203,27 +214,11 @@ class ClipBam priorBasesClipped = priorBasesClippedReadTwo, numFivePrime = numReadTwoFivePrime, numThreePrime = numReadTwoThreePrime, - numOverlappingBases = numOverlappingBasesReadTwo + numOverlappingBases = numOverlappingBasesReadTwo, + numExtendingBases = numExtendingPastMateStartReadTwo ) } } - - /** Clips overlapping reads, where both ends of the read pair are mapped to the same chromosome, and in FR orientation. */ - protected def clipOverlappingReads(f: SamRecord, r: SamRecord): (Int, Int) = { - var numOverlappingBasesReadOne: Int = 0 - var numOverlappingBasesReadTwo: Int = 0 - // What we really want is to trim by the number of _reference_ bases not read bases, - // in order to eliminate overlap. We could do something very complicated here, or - // we could just trim read bases in a loop until the overlap is eliminated! - while (f.end >= r.start && f.mapped && r.mapped) { - val lengthToClip = f.end - r.start + 1 - val firstHalf = lengthToClip / 2 - val secondHalf = lengthToClip - firstHalf // safe guard against rounding on odd lengths - numOverlappingBasesReadOne += this.clipper.clip3PrimeEndOfAlignment(f, firstHalf) - numOverlappingBasesReadTwo += this.clipper.clip3PrimeEndOfAlignment(r, secondHalf) - } - (numOverlappingBasesReadOne, numOverlappingBasesReadTwo) - } } sealed trait ReadType extends EnumEntry @@ -246,6 +241,7 @@ object ReadType extends FgBioEnum[ReadType] { * @param reads_clipped_five_prime The number of reads with the 5' end clipped. * @param reads_clipped_three_prime The number of reads with the 3' end clipped. * @param reads_clipped_overlapping The number of reads clipped due to overlapping reads. + * @param reads_clipped_extending The number of reads clipped due to a read extending past its mate. * @param reads_unmapped The number of reads that became unmapped due to clipping. * @param bases The number of aligned bases after clipping. * @param bases_clipped_pre The number of bases clipped prior to clipping with [[ClipBam]]. @@ -253,6 +249,7 @@ object ReadType extends FgBioEnum[ReadType] { * @param bases_clipped_five_prime The number of bases clipped on the 5' end of the read. * @param bases_clipped_three_prime The number of bases clipped on the 3 end of the read. * @param bases_clipped_overlapping The number of bases clipped due to overlapping reads. + * @param bases_clipped_extending The number of bases clipped due to a read extending past its mate. */ case class ClippingMetrics (read_type: ReadType, @@ -263,14 +260,16 @@ case class ClippingMetrics var reads_clipped_five_prime: Long = 0, var reads_clipped_three_prime: Long = 0, var reads_clipped_overlapping: Long = 0, + var reads_clipped_extending: Long = 0, var bases: Long = 0, var bases_clipped_pre: Long = 0, var bases_clipped_post: Long = 0, var bases_clipped_five_prime: Long = 0, var bases_clipped_three_prime: Long = 0, - var bases_clipped_overlapping: Long = 0 + var bases_clipped_overlapping: Long = 0, + var bases_clipped_extending: Long = 0, ) extends Metric { - def update(rec: SamRecord, priorBasesClipped: Int, numFivePrime: Int, numThreePrime: Int, numOverlappingBases: Int): Unit = { + def update(rec: SamRecord, priorBasesClipped: Int, numFivePrime: Int, numThreePrime: Int, numOverlappingBases: Int, numExtendingBases: Int): Unit = { this.reads += 1 this.bases += rec.cigar.alignedBases if (priorBasesClipped > 0) { @@ -289,11 +288,15 @@ case class ClippingMetrics this.reads_clipped_overlapping += 1 this.bases_clipped_overlapping += numOverlappingBases } - val additionalClippedBases = numFivePrime + numThreePrime + numOverlappingBases - val totalClippedBasees = additionalClippedBases + priorBasesClipped - if (totalClippedBasees > 0) { + if (numExtendingBases > 0) { + this.reads_clipped_extending += 1 + this.bases_clipped_extending += numExtendingBases + } + val additionalClippedBases = numFivePrime + numThreePrime + numOverlappingBases + numExtendingBases + val totalClippedBases = additionalClippedBases + priorBasesClipped + if (totalClippedBases > 0) { this.reads_clipped_post += 1 - this.bases_clipped_post += totalClippedBasees + this.bases_clipped_post += totalClippedBases if (rec.unmapped && additionalClippedBases > 0) this.reads_unmapped += 1 } } @@ -306,11 +309,13 @@ case class ClippingMetrics this.reads_clipped_five_prime += metric.sumBy(_.reads_clipped_five_prime) this.reads_clipped_three_prime += metric.sumBy(_.reads_clipped_three_prime) this.reads_clipped_overlapping += metric.sumBy(_.reads_clipped_overlapping) + this.reads_clipped_extending += metric.sumBy(_.reads_clipped_extending) this.bases += metric.sumBy(_.bases) this.bases_clipped_pre += metric.sumBy(_.bases_clipped_pre) this.bases_clipped_post += metric.sumBy(_.bases_clipped_post) this.bases_clipped_five_prime += metric.sumBy(_.bases_clipped_five_prime) this.bases_clipped_three_prime += metric.sumBy(_.bases_clipped_three_prime) this.bases_clipped_overlapping += metric.sumBy(_.bases_clipped_overlapping) + this.bases_clipped_extending += metric.sumBy(_.bases_clipped_extending) } } diff --git a/src/main/scala/com/fulcrumgenomics/bam/SamRecordClipper.scala b/src/main/scala/com/fulcrumgenomics/bam/SamRecordClipper.scala index afeeb1d13..151d87ff0 100644 --- a/src/main/scala/com/fulcrumgenomics/bam/SamRecordClipper.scala +++ b/src/main/scala/com/fulcrumgenomics/bam/SamRecordClipper.scala @@ -31,6 +31,7 @@ import enumeratum.EnumEntry import htsjdk.samtools.{SAMUtils, CigarOperator => Op} import scala.collection.mutable.ArrayBuffer +import scala.math.abs /** The base trait for all clipping modes. */ sealed trait ClippingMode extends EnumEntry @@ -162,7 +163,7 @@ class SamRecordClipper(val mode: ClippingMode, val autoClipAttributes: Boolean) val oldElems = rec.cigar.elems.reverse val oldBases = rec.bases.reverse val oldQuals = rec.quals.reverse - val (newElems, readBasesClipped, refBasesClipped, bases, quals) = clip(oldElems, numberOfBasesToClip, oldBases, oldQuals) + val (newElems, readBasesClipped, _, bases, quals) = clip(oldElems, numberOfBasesToClip, oldBases, oldQuals) rec.cigar = Cigar(newElems.reverse) rec.bases = bases.reverse rec.quals = quals.reverse @@ -187,7 +188,7 @@ class SamRecordClipper(val mode: ClippingMode, val autoClipAttributes: Boolean) } /** - * Attempts to clip an additional numberOfBasesToClip from the 5' end of the read. For + * Attempts to clip an additional numberOfBasesToClip from the 3' end of the read. For * details see [[com.fulcrumgenomics.bam.SamRecordClipper.clipStartOfAlignment]] and * [[com.fulcrumgenomics.bam.SamRecordClipper.clipEndOfAlignment]]. * @@ -233,7 +234,7 @@ class SamRecordClipper(val mode: ClippingMode, val autoClipAttributes: Boolean) /** * Ensures that there are at least clipLength bases clipped at the 5' end * of the read, _including_ any existing soft and hard clipping. Calculates any - * additional clipping and delegates to [com.fulcrumgenomics.bam.SamRecordClipper.[clipStartOfAlignment]]. + * additional clipping and delegates to [[com.fulcrumgenomics.bam.SamRecordClipper.clipStartOfAlignment]]. * * @param rec the record to be clipped * @param numberOfBasesToClip the number of additional bases to be clipped @@ -289,6 +290,111 @@ class SamRecordClipper(val mode: ClippingMode, val autoClipAttributes: Boolean) } } + /** Clips overlapping read pairs, where both ends of the read pair are mapped to the same chromosome, and in FR orientation. + * + * If the reads do not overlap, or are not an FR pair, return (0, 0). + * + * Clips at the reference midpoint between the two reads. + * + * @param rec the read + * @param mate the mate + * @return the number of overlapping bases to that were clipped on the record and mate respectively (3' end in sequencing order) + */ + def clipOverlappingReads(rec: SamRecord, mate: SamRecord): (Int, Int) = { + if (rec.matesOverlap.contains(false)) (0, 0) // do not overlap, don't clip + else if (!rec.isFrPair) (0, 0) + else if (rec.negativeStrand) clipOverlappingReads(rec=mate, mate=rec).swap // don't for get to swap the results since we swapped inputs + else { + // Pick the mid point in the reference window over which the record and its mate overlap. The read ends at the + // mid point, or if the mid point is in a deletion, the base prior to the deletion. The mate ends at the mid + // point, or if the mid point is in a deletion, the base after the deletion. + val midPoint = (rec.start + mate.end) / 2 + val readEnd = rec.readPosAtRefPos(pos=midPoint, returnLastBaseIfDeleted=true) + val mateStart = { // NB: need to be careful if the midpoint falls in a deletion + val retval = mate.readPosAtRefPos(pos=midPoint + 1, returnLastBaseIfDeleted=false) + if (retval != 0) retval else mate.readPosAtRefPos(pos=midPoint + 1, returnLastBaseIfDeleted=true) + 1 + } + val numOverlappingBasesRead = this.clip3PrimeEndOfRead(rec, rec.cigar.trailingClippedBases + rec.length - readEnd) + val numOverlappingBasesMate = this.clip3PrimeEndOfRead(mate, mate.cigar.leadingClippedBases + mateStart - 1) + (numOverlappingBasesRead, numOverlappingBasesMate) + } + } + + /** Returns the number of bases extending past the mate end for FR pairs including any soft-clipped bases, zero otherwise. + * + * The largest mapped genomic coordinate of the mate is computed via the mate-cigar (MC) SAM tag if present, + * otherwise the reported insert size is used. + * + * @param rec the record to clip + */ + def numBasesExtendingPastMate(rec: SamRecord): Int = { + val mateEnd = rec.mateEnd.getOrElse(rec.start + abs(rec.insertSize) - 1) + numBasesExtendingPastMate(rec=rec, mateEnd=mateEnd) + } + + /** Returns the number of bases extending past the mate end for FR pairs including any soft-clipped bases, zero otherwise. + * + * @param rec the record to examine + * @param mateEnd the largest mapped genomic coordinate of the mate + */ + def numBasesExtendingPastMate(rec: SamRecord, mateEnd: Int): Int = { + if (!rec.isFrPair) 0 // not an FR pair + else { + if (rec.positiveStrand && rec.end >= mateEnd) { + // clip from where last read base of where the mate ends + Math.max(0, rec.length - rec.readPosAtRefPos(pos=mateEnd, returnLastBaseIfDeleted=false)) + } + else if (rec.negativeStrand && rec.start <= rec.mateStart) { + // clip up to and including one base before where the mate starts + Math.max(0, rec.readPosAtRefPos(pos=rec.mateStart, returnLastBaseIfDeleted=false) - 1) + } else { + // no bases extend past + 0 + } + } + } + + /** Clips the reads in FR read pairs whose alignments extend beyond the far end of their mate's alignment. + * + * @param rec the read + * @param mate the mate + * @return the additional number of bases clipped (3' end in sequencing order) for the read and mate respectively + */ + def clipExtendingPastMateEnds(rec: SamRecord, mate: SamRecord): (Int, Int) = { + val basesClipped1 = clipExtendingPastMateEnd(rec=rec, mateEnd=mate.end) + val basesClipped2 = clipExtendingPastMateEnd(rec=mate, mateEnd=rec.end) + (basesClipped1, basesClipped2) + } + + /** Clips the read in FR read pairs whose alignments extend beyond the far end of their mate's alignment. + * + * The mate end is computed via the mate-cigar (MC) SAM tag if present, otherwise the reported insert size is used. + * + * @param rec the record to clip + * @return the additional number of bases clipped (3' end in sequencing order) + */ + def clipExtendingPastMateEnd(rec: SamRecord): Int = { + val mateEnd = rec.mateEnd.getOrElse(rec.start + abs(rec.insertSize) - 1) + clipExtendingPastMateEnd(rec=rec, mateEnd=mateEnd) + } + + /** Clips the read in FR read pairs whose alignments extend beyond the far end of their mate's alignment. + * + * @param rec the record to clip + * @param mateEnd the end coordinate of the mate + * @return the additional number of bases clipped (3' end in sequencing order) + */ + def clipExtendingPastMateEnd(rec: SamRecord, mateEnd: Int): Int = { + if (!rec.isFrPair) 0 // do not overlap, don't clip + else { + val totalClippedBases = numBasesExtendingPastMate(rec=rec, mateEnd=mateEnd) + if (totalClippedBases == 0) 0 else { + if (rec.positiveStrand) this.clipEndOfRead(rec, totalClippedBases) + else this.clipStartOfRead(rec, totalClippedBases) + } + } + } + /** Computes the number of bases that are available to be clipped in a mapped SamRecord. */ private def numberOfClippableBases(rec: SamRecord): Int = { rec.cigar.filter(e => e.operator.isAlignment || e.operator == Op.INSERTION).map(_.length).sum @@ -440,7 +546,7 @@ class SamRecordClipper(val mode: ClippingMode, val autoClipAttributes: Boolean) unreachable(s"Cigar $es doesn't contain soft clipping at the start") } - elems = newElems.result + elems = newElems.result() } if (fromStart) { diff --git a/src/main/scala/com/fulcrumgenomics/umi/UmiConsensusCaller.scala b/src/main/scala/com/fulcrumgenomics/umi/UmiConsensusCaller.scala index 583f5a1db..f3c04357b 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/UmiConsensusCaller.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/UmiConsensusCaller.scala @@ -26,6 +26,7 @@ package com.fulcrumgenomics.umi import com.fulcrumgenomics.FgBioDef._ import com.fulcrumgenomics.alignment.{Cigar, CigarElem} +import com.fulcrumgenomics.bam.{ClippingMode, SamRecordClipper} import com.fulcrumgenomics.bam.api.{SamOrder, SamRecord} import com.fulcrumgenomics.commons.util.{Logger, SimpleCounter} import com.fulcrumgenomics.umi.UmiConsensusCaller._ @@ -36,7 +37,8 @@ import htsjdk.samtools.util.{Murmur3, SequenceUtil, TrimmingUtil} import scala.collection.mutable import scala.collection.mutable.ArrayBuffer -import scala.math.{abs, min} +import scala.math.min + /** * Contains shared types and functions used when writing UMI-driven consensus * callers that take in SamRecords and emit SamRecords. @@ -168,6 +170,9 @@ trait UmiConsensusCaller[ConsensusRead <: SimpleRead] { /** A consensus caller used to generate consensus UMI sequences */ private val consensusBuilder = new SimpleConsensusCaller() + /** Clipper utility used to _calculate_ clipping, but not do the actual clipping */ + private val clipper = new SamRecordClipper(mode=ClippingMode.Soft, autoClipAttributes=true) + /** Returns a clone of this consensus caller in a state where no previous reads were processed. I.e. all counters * are set to zero.*/ def emptyClone(): UmiConsensusCaller[ConsensusRead] @@ -216,7 +221,7 @@ trait UmiConsensusCaller[ConsensusRead <: SimpleRead] { /** * Converts from a SamRecord into a SourceRead. During conversion the record is end-trimmed * to remove Ns and bases below the `minBaseQuality`. Remaining bases that are below - * `minBaseQuality` are then masked to Ns. + * `minBaseQuality` are then masked to Ns. Also trims reads so that no mapped bases extend past their mate. * * @return Some(SourceRead) if there are any called bases with quality > minBaseQuality, else None */ @@ -241,16 +246,17 @@ trait UmiConsensusCaller[ConsensusRead <: SimpleRead] { } } - // Find the last non-N base of sufficient quality in the record, starting from either the - // end of the read, or the end of the insert, whichever is shorter! + // Get the length of the read based on trimming bases that are beyond the mate's end (FR only) and then any + // remaining trailing Ns. This includes both mapped bases and soft-clipped bases past the mate's end. val len = { var index = if (!rec.isFrPair) trimToLength - 1 else { - // Adjust the insert size based on clipping on the 5' end of the given read. - // This *does not* consider any clipping on the 5' end of its mate. - val fivePrimeClipping = cigar.takeWhile(_.operator.isClipping).filter(_.operator == CigarOperator.S).sumBy(_.length) - val adjustedInsertSize = abs(rec.insertSize) + fivePrimeClipping - min(adjustedInsertSize, trimToLength) - 1 + // Get the number of mapped bases to clip that maps beyond the mate's end, including any soft-clipped bases. Use + // that to compute where in the read to keep. + val clipPosition = rec.length - this.clipper.numBasesExtendingPastMate(rec=rec) + min(clipPosition, trimToLength) - 1 } + // Find the last non-N base of sufficient quality in the record, starting from either the + // end of the read, or the end of the insert, whichever is shorter! while (index >= 0 && (bases(index) == NoCall)) index -= 1 index + 1 } diff --git a/src/test/scala/com/fulcrumgenomics/alignment/AlignmentTest.scala b/src/test/scala/com/fulcrumgenomics/alignment/AlignmentTest.scala index c79a36df5..66b287ec7 100644 --- a/src/test/scala/com/fulcrumgenomics/alignment/AlignmentTest.scala +++ b/src/test/scala/com/fulcrumgenomics/alignment/AlignmentTest.scala @@ -281,3 +281,23 @@ class AlignmentTest extends UnitSpec { alignment.subByTarget(6, 11).cigar.toString() shouldBe "5D1=" } } + +class CigarStatsTest extends UnitSpec { + + "CigarStats" should "compute various stats about a cigar" in { + CigarStats("100M") shouldBe CigarStats(100, 100, 100, 0, 0, 0) + CigarStats("50M10I50M") shouldBe CigarStats(110, 100, 100, 0, 0, 0) + CigarStats("50M10D50M") shouldBe CigarStats(100, 110, 100, 0, 0, 0) + CigarStats("10H100M") shouldBe CigarStats(100, 100, 100, 10, 0, 0) + CigarStats("100M10H") shouldBe CigarStats(100, 100, 100, 10, 0, 0) + CigarStats("10S100M") shouldBe CigarStats(110, 100, 100, 10, 10, 0) + CigarStats("100M10S") shouldBe CigarStats(110, 100, 100, 10, 0, 10) + CigarStats("10H10S100M") shouldBe CigarStats(110, 100, 100, 20, 10, 0) + CigarStats("100M10S10H") shouldBe CigarStats(110, 100, 100, 20, 0, 10) + CigarStats("1H2S3M4D5I6M7S8H") shouldBe CigarStats(23, 13, 9, 18, 2, 7) + } + + it should "fail on an invalid cigar" in { + an[Exception] should be thrownBy CigarStats("100M10S100M") + } +} diff --git a/src/test/scala/com/fulcrumgenomics/bam/ClipBamTest.scala b/src/test/scala/com/fulcrumgenomics/bam/ClipBamTest.scala index 8aef92083..1aa4be757 100644 --- a/src/test/scala/com/fulcrumgenomics/bam/ClipBamTest.scala +++ b/src/test/scala/com/fulcrumgenomics/bam/ClipBamTest.scala @@ -220,6 +220,7 @@ class ClipBamTest extends UnitSpec with ErrorLogLevel with OptionValues { val prior = StartsAndEnds(r1, r2) r1.end shouldBe (r2.start + 3) // four bases overlap! clipper.clipPair(r1, r2) + r1.end shouldBe (r2.start - 1) prior.checkClipping(r1, r2, 0, 2, 0, 2) // clipping due to overlapping reads } @@ -240,10 +241,10 @@ class ClipBamTest extends UnitSpec with ErrorLogLevel with OptionValues { } "ClippingMetrics.add" should "add two metrics" in { - val fragment = ClippingMetrics(ReadType.Fragment, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13) - val readOne = ClippingMetrics(ReadType.ReadTwo, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14) - val readTwo = ClippingMetrics(ReadType.ReadTwo, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) - val all = ClippingMetrics(ReadType.All, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42) + val fragment = ClippingMetrics(ReadType.Fragment, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) + val readOne = ClippingMetrics(ReadType.ReadTwo, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16) + val readTwo = ClippingMetrics(ReadType.ReadTwo, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17) + val all = ClippingMetrics(ReadType.All, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48) // Check that adding fragment, readOne, and readTwo is correct val added = ClippingMetrics(ReadType.All) @@ -524,4 +525,16 @@ class ClipBamTest extends UnitSpec with ErrorLogLevel with OptionValues { } } } + + it should "clip FR reads that extend past the mate" in { + val builder = new SamBuilder(readLength=50) + val clipper = new ClipBam(input=dummyBam, output=dummyBam, ref=ref, clipBasesPastMate=true) + val Seq(r1, r2) = builder.addPair(start1=100, start2=90) + + r1.end shouldBe 149 + r2.end shouldBe 139 + clipper.clipPair(r1, r2) + r1.start shouldBe r2.start + r1.end shouldBe r2.end + } } diff --git a/src/test/scala/com/fulcrumgenomics/bam/SamRecordClipperTest.scala b/src/test/scala/com/fulcrumgenomics/bam/SamRecordClipperTest.scala index 027aa4f38..2d09980ea 100644 --- a/src/test/scala/com/fulcrumgenomics/bam/SamRecordClipperTest.scala +++ b/src/test/scala/com/fulcrumgenomics/bam/SamRecordClipperTest.scala @@ -31,13 +31,20 @@ import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus, Strand} import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} import com.fulcrumgenomics.util.NumericTypes.PhredScore import htsjdk.samtools.TextCigarCodec +import org.scalatest.OptionValues -class SamRecordClipperTest extends UnitSpec { +class SamRecordClipperTest extends UnitSpec with OptionValues { /** Returns a fragment SAM record with the start / cigar / strand requested. */ def r(start: Int, cigar: String, strand: Strand = Plus, attrs:Map[String,Any] = Map.empty): SamRecord = { - val cig = TextCigarCodec.decode(cigar) - val len = cig.getReadLength - new SamBuilder(readLength=len).addFrag(start=start, cigar = cigar, strand=strand, attrs=attrs).get + new SamBuilder(readLength=Cigar(cigar).lengthOnQuery) + .addFrag(start=start, cigar=cigar, strand=strand, attrs=attrs).value + } + + /** Returns a pair of SAM records (read pair) with the respective starts / cigars / strands requested. */ + def pair(start1: Int, cigar1: String, strand1: Strand = Plus, start2: Int, cigar2: String, strand2: Strand = Minus): (SamRecord, SamRecord) = { + val recs = new SamBuilder(readLength=Cigar(cigar1).lengthOnQuery) + .addPair(start1=start1, cigar1=cigar1, strand1=strand1, start2=start2, cigar2=cigar2, strand2=strand2) + recs match { case Seq(r1, r2) => (r1, r2) } } def clipper(mode: ClippingMode, autoClip: Boolean=false) = new SamRecordClipper(mode, autoClip) @@ -551,4 +558,204 @@ class SamRecordClipperTest extends UnitSpec { clipper(SoftWithMask).upgradeAllClipping(mapped) shouldBe (0, 0) clipper(Hard).upgradeAllClipping(unmapped) shouldBe (0, 0) } + + "SamRecordClipper.clipOverlappingReads" should "not clip if the reads are not overlapping" in { + val (rec, mate) = pair(1, "100M", Plus, 101, "100M", Minus) + clipper(Soft).clipOverlappingReads(rec=rec, mate=mate) shouldBe (0, 0) + rec.start shouldBe 1 + rec.cigar.toString shouldBe "100M" + mate.start shouldBe 101 + mate.cigar.toString shouldBe "100M" + } + + it should "clip reads that overlap by one base" in { + val (rec, mate) = pair(1, "100M", Plus, 100, "100M", Minus) + clipper(Soft).clipOverlappingReads(rec=rec, mate=mate) shouldBe (0, 1) + rec.start shouldBe 1 + rec.cigar.toString shouldBe "100M" + mate.start shouldBe 101 + mate.cigar.toString shouldBe "1S99M" + } + + it should "clip reads that overlap by two bases" in { + val (rec, mate) = pair(2, "100M", Plus, 100, "100M", Minus) + clipper(Soft).clipOverlappingReads(rec=rec, mate=mate) shouldBe (1, 1) + rec.start shouldBe 2 + rec.end shouldBe 100 + rec.cigar.toString shouldBe "99M1S" + mate.start shouldBe 101 + mate.end shouldBe 199 + mate.cigar.toString shouldBe "1S99M" + } + + it should "clip reads that overlap with one end having a deletion" in { + val (rec, mate) = pair(1, "60M10D40M", Plus, 50, "10M10D80M10D10M", Minus) + clipper(Soft).clipOverlappingReads(rec=rec, mate=mate) shouldBe (25, 26) + rec.start shouldBe 1 + rec.end shouldBe 85 + rec.cigar.toString shouldBe "60M10D15M25S" + mate.start shouldBe 86 + mate.cigar.toString shouldBe "26S64M10D10M" + } + + it should "clip reads that overlap with one end having a deletion with mismatching cigars" in { + val (rec, mate) = pair(1, "100M", Plus, 50, "10M10D80M10D10M", Minus) + clipper(Soft).clipOverlappingReads(rec=rec, mate=mate) shouldBe (15, 26) + rec.start shouldBe 1 + rec.end shouldBe 85 + rec.cigar.toString shouldBe "85M15S" + mate.start shouldBe 86 + mate.cigar.toString shouldBe "26S64M10D10M" + } + + it should "clip reads that fully overlap with both ends having deletions" in { + // NB: mid point occurs right in the middle of the deletion + val (rec, mate) = pair(1, "50M10D50M", Plus, 1, "50M10D50M", Minus) + clipper(Soft).clipOverlappingReads(rec=rec, mate=mate) shouldBe (50, 50) + rec.start shouldBe 1 + rec.end shouldBe 50 + rec.cigar.toString shouldBe "50M50S" + mate.start shouldBe 61 + mate.cigar.toString shouldBe "50S50M" + } + + it should "clip reads that overlap with both ends having deletions" in { + // NB: mid point occurs right in the middle of the deletion. Clipping loses the deletion! + val (rec, mate) = pair(1, "50M10D50M", Plus, 3, "47M10D53M", Minus) + clipper(Soft).clipOverlappingReads(rec=rec, mate=mate) shouldBe (50, 47) + rec.start shouldBe 1 + rec.end shouldBe 50 + rec.cigar.toString shouldBe "50M50S" + mate.start shouldBe 60 + mate.cigar.toString shouldBe "47S53M" + } + + it should "clip reads that fully overlap" in { + val (rec, mate) = pair(1, "100M", Plus, 1, "100M", Minus) + clipper(Soft).clipOverlappingReads(rec=rec, mate=mate) shouldBe (50, 50) + rec.start shouldBe 1 + rec.cigar.toString shouldBe "50M50S" + mate.start shouldBe 51 + mate.cigar.toString shouldBe "50S50M" + } + + it should "clip reads that extend past each other" in { + // R1 is 50-149, while R2 is 1-100, so the reference midpoint is (50 + 100) / 2 = 75 + val (rec, mate) = pair(50, "100M", Plus, 1, "100M", Minus) + clipper(Soft).clipOverlappingReads(rec=rec, mate=mate) shouldBe (74, 75) + rec.start shouldBe 50 + rec.end shouldBe 75 + rec.cigar.toString shouldBe "26M74S" + mate.start shouldBe 76 + mate.end shouldBe 100 + mate.cigar.toString shouldBe "75S25M" + } + + it should "clip reads that extend past each other with one read having deletions" in { + // R1 is 50-149, while R2 is 1-120, so the reference midpoint is (50 + 120) / 2 = 85 + val (rec, mate) = pair(50, "100M", Plus, 1, "10M10D80M10D10M", Minus) + clipper(Soft).clipOverlappingReads(rec=rec, mate=mate) shouldBe (64, 75) + rec.start shouldBe 50 + rec.end shouldBe 85 + rec.cigar.toString shouldBe "36M64S" + mate.start shouldBe 86 + mate.end shouldBe 120 + mate.cigar.toString shouldBe "75S15M10D10M" + } + + it should "clip reads that extend past each other with both read having deletions" in { + // R1 is 50-149, while R2 is 1-120, so the reference midpoint is (50 + 120) / 2 = 85 + val (rec, mate) = pair(50, "50M10D50M", Plus, 1, "10M10D80M10D10M", Minus) + clipper(Soft).clipOverlappingReads(rec=rec, mate=mate) shouldBe (64, 75) + rec.start shouldBe 50 + rec.end shouldBe 85 + rec.cigar.toString shouldBe "36M64S" + mate.start shouldBe 86 + mate.end shouldBe 120 + mate.cigar.toString shouldBe "75S15M10D10M" + } + + it should "not clip when the read pairs are mapped +/- with start(R1) > end(R2) but do not overlap" in { + val (rec, mate) = pair(1000, "100M", Plus, 1, "100M", Minus) + clipper(Soft).clipOverlappingReads(rec=rec, mate=mate) shouldBe (0, 0) + rec.start shouldBe 1000 + rec.cigar.toString shouldBe "100M" + mate.start shouldBe 1 + mate.cigar.toString shouldBe "100M" + } + + "SamRecordClipper.clipExtendingPastMateEnds" should "not clip reads that do not extend past each other" in { + Seq((Plus, Minus), (Minus, Plus)).foreach { case (strand1, strand2) => + val (rec, mate) = pair(1, "100M", strand1, 1, "100M", strand2) + clipper(Soft).clipExtendingPastMateEnds(rec=rec, mate=mate) shouldBe (0, 0) + rec.start shouldBe 1 + rec.cigar.toString shouldBe "100M" + mate.start shouldBe 1 + mate.cigar.toString shouldBe "100M" + } + } + + it should "clip reads that extend one base past their mate's start" in { + val (rec, mate) = pair(2, "100M", Plus, 1, "100M", Minus) + clipper(Soft).clipExtendingPastMateEnds(rec=rec, mate=mate) shouldBe (1, 1) + rec.start shouldBe 2 + rec.cigar.toString shouldBe "99M1S" + mate.start shouldBe 2 + mate.cigar.toString shouldBe "1S99M" + } + + it should "clip reads that extend two bases past their mate's start" in { + val (rec, mate) = pair(3, "100M", Plus, 1, "100M", Minus) + clipper(Soft).clipExtendingPastMateEnds(rec=rec, mate=mate) shouldBe (2, 2) + rec.start shouldBe 3 + rec.cigar.toString shouldBe "98M2S" + mate.start shouldBe 3 + mate.cigar.toString shouldBe "2S98M" + } + + it should "clip reads that where both ends extends their mate's start" in { + val (rec, mate) = pair(51, "100M", Plus, 1, "100M", Minus) + clipper(Soft).clipExtendingPastMateEnds(rec=rec, mate=mate) shouldBe (50, 50) + rec.start shouldBe 51 + rec.cigar.toString shouldBe "50M50S" + mate.start shouldBe 51 + mate.cigar.toString shouldBe "50S50M" + } + + it should "clip reads that where only one end extends their mate's start" in { + val (rec, mate) = pair(1, "100M", Plus, 1, "50S50M", Minus) + clipper(Soft).clipExtendingPastMateEnds(rec=rec, mate=mate) shouldBe (50, 0) + rec.start shouldBe 1 + rec.cigar.toString shouldBe "50M50S" // added clipping + mate.start shouldBe 1 + mate.cigar.toString shouldBe "50S50M" // clipping remains + } + + it should "clip reads that where only one end extends their mate's start that has insertions" in { + val (rec, mate) = pair(1, "40M10I50M", Plus, 1, "50S50M", Minus) + clipper(Soft).clipExtendingPastMateEnds(rec=rec, mate=mate) shouldBe (40, 0) + rec.start shouldBe 1 + rec.cigar.toString shouldBe "40M10I10M40S" // added clipping + mate.start shouldBe 1 + mate.cigar.toString shouldBe "50S50M" // clipping remains + } + + it should "not clip when the read pairs are mapped +/- with start(R1) > end(R2) but do not overlap" in { + val (rec, mate) = pair(1000, "100M", Plus, 1, "100M", Minus) + clipper(Soft).clipExtendingPastMateEnds(rec=rec, mate=mate) shouldBe (0, 0) + rec.start shouldBe 1000 + rec.cigar.toString shouldBe "100M" + mate.start shouldBe 1 + mate.cigar.toString shouldBe "100M" + } + + it should "not clip when the reads do not extend past each other with insertions" in { + val builder = new SamBuilder(readLength=100) + val (rec, mate) = pair(start1=1, start2=1, strand1=Plus, strand2=Minus, cigar1="40M20I40M", cigar2="40M20I40M") + clipper(Soft).clipExtendingPastMateEnds(rec=rec, mate=mate) shouldBe (0, 0) + rec.start shouldBe 1 + rec.cigar.toString() shouldBe "40M20I40M" + mate.start shouldBe 1 + mate.cigar.toString shouldBe "40M20I40M" + } } diff --git a/src/test/scala/com/fulcrumgenomics/umi/AnnotateBamWithUmisTest.scala b/src/test/scala/com/fulcrumgenomics/umi/AnnotateBamWithUmisTest.scala index 81246e2c1..29511bdd3 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/AnnotateBamWithUmisTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/AnnotateBamWithUmisTest.scala @@ -211,7 +211,6 @@ class AnnotateBamWithUmisTest extends UnitSpec { Io.writeLines(reverseFq, Io.readLines(fq).grouped(4).toSeq.reverse.flatten) val annotator = new AnnotateBamWithUmis(input=sam, fastq=Seq(fq, reverseFq), readStructure=Seq(ReadStructure("2M4B+M"), ReadStructure("1B+M")), output=out, attribute=umiTag, sorted=true) val result = intercept[Exception] { annotator.execute() } - println(result) result.getMessage should include ("out of sync") } } diff --git a/src/test/scala/com/fulcrumgenomics/umi/VanillaUmiConsensusCallerTest.scala b/src/test/scala/com/fulcrumgenomics/umi/VanillaUmiConsensusCallerTest.scala index a73973deb..b1b5dffb3 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/VanillaUmiConsensusCallerTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/VanillaUmiConsensusCallerTest.scala @@ -513,25 +513,39 @@ class VanillaUmiConsensusCallerTest extends UnitSpec with OptionValues { val s1 = cc().toSourceRead(r1, minBaseQuality=2.toByte, trim=false).value val s2 = cc().toSourceRead(r2, minBaseQuality=2.toByte, trim=false).value - s1.baseString shouldBe "A"*2 + "C"*38 // trimmed by ten bases at the 3' end + s1.baseString shouldBe "A"*2 + "C"*38 // trimmed by ten bases at the 3' end, five due to existing soft-clipping and the other due to past mate start s1.cigar.toString shouldBe "10S30M" - s2.baseString shouldBe "C"*2 + "G"*36 // trimmed by twelve bases at the 3' end + s2.baseString shouldBe "C"*2 + "G"*36 // trimmed by twelve bases at the 3' end (the "12S" in the cigar) s2.cigar.toString shouldBe "8S30M" // NB: cigar is reversed in toSourceRead } - it should "not to trim based on insert size in the presence of soft-clipping" in { + it should "not trim based on insert size in the presence of soft-clipping (-/+)" in { val builder = new SamBuilder(readLength=142) val Seq(r1, r2) = builder.addPair(start1=545, start2=493, strand1=Minus, strand2=Plus, cigar1="47S72M23S", cigar2="46S96M") .map { r => r.bases = "A"*142; r } val s1 = cc().toSourceRead(r1, minBaseQuality=2.toByte, trim=false).value val s2 = cc().toSourceRead(r2, minBaseQuality=2.toByte, trim=false).value - s1.baseString shouldBe "T"*142 + s1.baseString shouldBe "T"*142 // all the bases remain, no clipping s1.cigar.toString shouldBe "23S72M47S" // NB: cigar is reversed in toSourceRead - s2.baseString shouldBe "A"*142 + s2.baseString shouldBe "A"*142 // all the bases remain, no clipping s2.cigar.toString shouldBe "46S96M" } + it should "not trim based on insert size in the presence of soft-clipping (+/-)" in { + val builder = new SamBuilder(readLength=142) + val Seq(r1, r2) = builder.addPair(start2=545, start1=493, strand2=Minus, strand1=Plus, cigar2="47S72M23S", cigar1="46S96M") + .map { r => r.bases = "A"*142; r } + val s1 = cc().toSourceRead(r1, minBaseQuality=2.toByte, trim=false).value + val s2 = cc().toSourceRead(r2, minBaseQuality=2.toByte, trim=false).value + + // for R2, the leading 47S in the alignment is trimmed because + s1.baseString shouldBe "A"*142 + s1.cigar.toString shouldBe "46S96M" + s2.baseString shouldBe "T"*142 + s2.cigar.toString shouldBe "23S72M47S" // NB: cigar is reversed in toSourceRead + } + it should "not trim the source read based on insert size if the read is not an FR pair" in { val builder = new SamBuilder(readLength=50) val Seq(r1, r2) = builder.addPair(start1=11, start2=1, strand1=Plus, strand2=Plus).map { r => r.bases = "A"*50; r } @@ -560,4 +574,18 @@ class VanillaUmiConsensusCallerTest extends UnitSpec with OptionValues { source.cigar.toString() shouldBe "3M" source.sam shouldBe Some(rec) } + + it should "not trim reads with an insert size shorter than the read length when there's an insertion in the middle" in { + val builder = new SamBuilder(readLength=100) + val Seq(r1, r2) = builder.addPair( + start1=1, start2=1, strand1=Plus, strand2=Minus, cigar1="40M20I40M", cigar2="40M20I40M" + ).map { r => r.bases = "A"*100; r } + val s1 = cc().toSourceRead(r1, minBaseQuality=2.toByte, trim=false).value + val s2 = cc().toSourceRead(r2, minBaseQuality=2.toByte, trim=false).value + + s1.baseString shouldBe "A"*100 + s1.cigar.toString shouldBe "40M20I40M" + s2.baseString shouldBe "T"*100 + s2.cigar.toString shouldBe "40M20I40M" + } }