Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add --umi-prefix to CopyUmiFromReadName #958

Merged
merged 9 commits into from
Jul 15, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -36,17 +36,26 @@ import com.fulcrumgenomics.util.{Io, ProgressLogger}
"""
|Copies the UMI at the end of the BAM's read name to the RX tag.
|
|The read name is split on `:` characters with the last field is assumed to be the UMI sequence. The UMI
|The read name is split on `:` characters with the last field assumed to be the UMI sequence. The UMI
|will be copied to the `RX` tag as per the SAM specification. If any read does not have a UMI composed of
|valid bases (ACGTN), the program will report the error and fail.
|
|If a read name contains multiple UMIs they may be delimited by either hyphens (`-`) or pluses (`+`). The
|resulting UMI in the `RX` tag will always be hyphen delimited.
|If a read name contains multiple UMIs they may be delimited (typically by a hyphen (`-`) or plus (`+`)).
|The `--umi-delimiter` option specifies the delimiter on which to split. The resulting UMI in the `RX` tag
|will always be hyphen delimited.
|
|Some tools (e.g. BCL Convert) may reverse-complement UMIs on R2 and add an 'r' prefix to indicate that the sequence
|has been reverse-complemented. By default, the 'r' prefix is removed and the sequence is reverse-complemented
|back to the forward orientation. The `--override-reverse-complement-umis` disables the latter behavior, such that
|the 'r' prefix is removed but the UMI sequence is left as reverse-complemented.
""")
class CopyUmiFromReadName
( @arg(flag='i', doc="The input BAM file") input: PathToBam,
@arg(flag='o', doc="The output BAM file") output: PathToBam,
@arg(doc="Remove the UMI from the read name") removeUmi: Boolean = false
( @arg(flag='i', doc="The input BAM file.") input: PathToBam,
@arg(flag='o', doc="The output BAM file.") output: PathToBam,
@arg(doc="Remove the UMI from the read name.") removeUmi: Boolean = false,
@arg(doc="Delimiter between the read name and UMI.") fieldDelimiter: Char = ':',
@arg(doc="Delimiter between UMI sequences.") umiDelimiter: Char = '+',
@arg(doc="Do not reverse-complement UMIs prefixed with 'r'.") overrideReverseComplementUmis: Boolean = false,
) extends FgBioTool with LazyLogging {

nh13 marked this conversation as resolved.
Show resolved Hide resolved
Io.assertReadable(input)
Expand All @@ -58,7 +67,14 @@ class CopyUmiFromReadName
val progress = new ProgressLogger(logger)
source.foreach { rec =>
progress.record(rec)
writer += Umis.copyUmiFromReadName(rec=rec, removeUmi=removeUmi)

writer += Umis.copyUmiFromReadName(
rec = rec,
removeUmi = removeUmi,
fieldDelimiter = fieldDelimiter,
umiDelimiter = umiDelimiter,
reverseComplementPrefixedUmis = !overrideReverseComplementUmis,
)
}
progress.logLast()
source.safelyClose()
Expand Down
62 changes: 49 additions & 13 deletions src/main/scala/com/fulcrumgenomics/umi/Umis.scala
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@
package com.fulcrumgenomics.umi

import com.fulcrumgenomics.bam.api.SamRecord
import com.fulcrumgenomics.util.Sequences

object Umis {
val RevcompPrefix: String = "r"

/** Copies the UMI sequence from the read name.
*
Expand All @@ -38,17 +40,29 @@ object Umis {
*
* @param rec the record to modify
* @param removeUmi true to remove the UMI from the read name, otherwise only copy the UMI to the tag
* @param delimiter the delimiter of fields within the read name
* @param fieldDelimiter the delimiter of fields within the read name
* @param umiDelimiter the delimiter between sequences in the UMI string
* @param reverseComplementPrefixedUmis whether to reverse-compliment UMIs prefixed with 'r'
* @return the modified record
*/
def copyUmiFromReadName(rec: SamRecord, removeUmi: Boolean = false, delimiter: Char = ':'): SamRecord = {
def copyUmiFromReadName(rec: SamRecord,
removeUmi: Boolean = false,
fieldDelimiter: Char = ':',
umiDelimiter: Char = '+',
reverseComplementPrefixedUmis: Boolean = true): SamRecord = {
// Extract and set the UMI
val umi = extractUmisFromReadName(rec.name, delimiter, strict=false)
val umi = extractUmisFromReadName(
name = rec.name,
fieldDelimiter = fieldDelimiter,
strict = false,
umiDelimiter = umiDelimiter,
reverseComplementPrefixedUmis = reverseComplementPrefixedUmis,
)
require(umi.nonEmpty, f"No valid UMI found in: ${rec.name}")
umi.foreach(u => rec(ConsensusTags.UmiBases) = u)

// Remove the UMI from the read name if requested
if (removeUmi) rec.name = rec.name.substring(0, rec.name.lastIndexOf(delimiter))
if (removeUmi) rec.name = rec.name.substring(0, rec.name.lastIndexOf(fieldDelimiter))

rec
}
Expand All @@ -59,29 +73,51 @@ object Umis {
*
* See https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm
* The UMI field is optional, so read names may or may not contain it. Illumina also specifies that the UMI
* field may contain multiple UMIs, in which case they will delimit them with `+` characters. Pluses will be
* translated to hyphens before returning.
* field may contain multiple UMIs, in which case they will delimit them with `umiDelimiter` characters, which
* will be translated to hyphens before returning.
*
* If `strict` is true the name _must_ contain either 7 or 8 colon-separated segments,
with the UMI being the last in the case of 8 and `None` in the case of 7.
*
* If `strict` is false the last segment is returned so long as it appears to be a valid UMI.
*/
def extractUmisFromReadName(name: String, delimiter: Char = ':', strict: Boolean): Option[String] = {
def extractUmisFromReadName(name: String,
fieldDelimiter: Char = ':',
strict: Boolean,
umiDelimiter: Char = '+',
reverseComplementPrefixedUmis: Boolean = true): Option[String] = {
// If strict, check that the read name actually has eight parts, which is expected
val rawUmi = if (strict) {
val colons = name.count(_ == delimiter)
val colons = name.count(_ == fieldDelimiter)
if (colons == 6) None
else if (colons == 7) Some(name.substring(name.lastIndexOf(delimiter) + 1, name.length))
else if (colons == 7) Some(name.substring(name.lastIndexOf(fieldDelimiter) + 1, name.length))
else throw new IllegalArgumentException(s"Trying to extract UMI from read with ${colons + 1} parts (7-8 expected): ${name}")
} else {
val idx = name.lastIndexOf(delimiter)
require(idx != -1, s"Read did not have multiple '${delimiter}'-separated fields: ${name}")
val idx = name.lastIndexOf(fieldDelimiter)
require(idx != -1, s"Read did not have multiple '${fieldDelimiter}'-separated fields: ${name}")
Some(name.substring(idx + 1, name.length))
}

val umi = rawUmi.map(raw => (if (raw.indexOf('+') > 0) raw.replace('+', '-') else raw).toUpperCase)
val valid = umi.forall(u => u.forall(isValidUmiCharacter))
// Remove 'r' prefixes, optionally reverse-complementing the prefixed UMIs if
// reverseComplementPrefixedUmis = true, replace the delimiter (if any) with '-',
// and make sure the sequence is upper-case.
var umi = rawUmi.map(raw =>
(raw.indexOf(RevcompPrefix) >= 0, raw.indexOf(umiDelimiter) > 0) match {
case (true, true) if reverseComplementPrefixedUmis =>
raw.split(umiDelimiter).map(seq =>
if (seq.startsWith(RevcompPrefix)) Sequences.revcomp(seq.stripPrefix(RevcompPrefix))
else seq.stripPrefix(RevcompPrefix)
).mkString("-").toUpperCase
case (true, false) if reverseComplementPrefixedUmis =>
Sequences.revcomp(raw.stripPrefix(RevcompPrefix)).toUpperCase
case (true, true) => raw.replace(RevcompPrefix, "").replace(umiDelimiter, '-').toUpperCase
case (true, false) => raw.replace(RevcompPrefix, "").toUpperCase
case (false, true) => raw.replace(umiDelimiter, '-').toUpperCase
case (false, false) => raw.toUpperCase
}
)

val valid = umi.forall(u => u.forall(isValidUmiCharacter))

if (strict && !valid) throw new IllegalArgumentException(s"Invalid UMI '${umi.get}' extracted from name '${name}")
else if (!valid) None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,21 @@ class CopyUmiFromReadNameTest extends UnitSpec with OptionValues {
private case class Result(name: String, umi: String)

/** Runs CopyUmiFromReadName using the given read names returning the output read names and UMIs. */
private def run(names: Iterable[String], removeUmi: Boolean): IndexedSeq[Result] = {
private def run(names: Iterable[String],
removeUmi: Boolean,
overrideReverseComplementUmis: Boolean = false): IndexedSeq[Result] = {
// build the reads
val builder = new SamBuilder()
names.foreach { name => builder.addFrag(name=name, unmapped=true) }

// run the tool
val out = makeTempFile("test.", ".bam")
val tool = new CopyUmiFromReadName(input=builder.toTempFile(), output=out, removeUmi=removeUmi)
val tool = new CopyUmiFromReadName(
input = builder.toTempFile(),
output = out,
removeUmi = removeUmi,
overrideReverseComplementUmis = overrideReverseComplementUmis
)
executeFgbioTool(tool)

// slurp the results
Expand Down Expand Up @@ -69,4 +76,26 @@ class CopyUmiFromReadNameTest extends UnitSpec with OptionValues {
results.map(_.name) should contain theSameElementsInOrderAs Seq("1", "1:2", "1:2:3", "blah")
results.map(_.umi) should contain theSameElementsInOrderAs Seq("AAAA", "CCCC", "GGGG", "AAAA-CCCC")
}

it should "remove a reverse-complement prefix to the UMI" in {
val names = Seq("1:rAAAA", "1:2:rCCCC", "1:2:3:rGGGG", "blah:rAAAA+CCCC")
val results = run(
names = names,
removeUmi = true,
overrideReverseComplementUmis = true
)
results.map(_.name) should contain theSameElementsInOrderAs Seq("1", "1:2", "1:2:3", "blah")
results.map(_.umi) should contain theSameElementsInOrderAs Seq("AAAA", "CCCC", "GGGG", "AAAA-CCCC")
}

it should "remove a reverse-complement prefix to the UMI and reverse-complement the UMI" in {
val names = Seq("1:rAAAA", "1:2:rCCCC", "1:2:3:rGGGG", "blah:rAAAA+CCCC")
val results = run(
names = names,
removeUmi = true,
overrideReverseComplementUmis = false
)
results.map(_.name) should contain theSameElementsInOrderAs Seq("1", "1:2", "1:2:3", "blah")
results.map(_.umi) should contain theSameElementsInOrderAs Seq("TTTT", "GGGG", "CCCC", "TTTT-CCCC")
}
}
6 changes: 3 additions & 3 deletions src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,9 @@ class UmisTest extends UnitSpec with OptionValues {
}

it should "split on a different name delimiter if specified" in {
copyUmiFromReadName(rec=rec("UMI-A"), delimiter='-').nameAndUmi shouldBe ("UMI-A", "A")
copyUmiFromReadName(rec=rec("UMI-C-A"), delimiter='-').nameAndUmi shouldBe ("UMI-C-A", "A")
copyUmiFromReadName(rec=rec("UMI-C-ACC+GGT"), delimiter='-').nameAndUmi shouldBe ("UMI-C-ACC+GGT", "ACC-GGT")
copyUmiFromReadName(rec=rec("UMI-A"), fieldDelimiter='-').nameAndUmi shouldBe ("UMI-A", "A")
copyUmiFromReadName(rec=rec("UMI-C-A"), fieldDelimiter='-').nameAndUmi shouldBe ("UMI-C-A", "A")
copyUmiFromReadName(rec=rec("UMI-C-ACC+GGT"), fieldDelimiter='-').nameAndUmi shouldBe ("UMI-C-ACC+GGT", "ACC-GGT")
}

it should "change the UMI delimiter from + to -" in {
Expand Down
Loading