Skip to content

Commit

Permalink
Adding SortSequenceDictionary tool (#769)
Browse files Browse the repository at this point in the history
* Adding SortSequenceDictionary tool, which sorts a sequence dictionary file based on the order in another sequence dictionary
  • Loading branch information
NatPRoach authored Feb 10, 2022
1 parent 86a0db7 commit bb92249
Show file tree
Hide file tree
Showing 2 changed files with 321 additions and 0 deletions.
103 changes: 103 additions & 0 deletions src/main/scala/com/fulcrumgenomics/fasta/SortSequenceDictionary.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
/*
* The MIT License
*
* Copyright (c) 2022 Fulcrum Genomics LLC
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/

package com.fulcrumgenomics.fasta

import com.fulcrumgenomics.FgBioDef.PathToSequenceDictionary
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.commons.CommonsDef._
import com.fulcrumgenomics.commons.util.LazyLogging
import com.fulcrumgenomics.sopt._
import com.fulcrumgenomics.util.Io

import scala.collection.immutable.IndexedSeq
import scala.collection.mutable.{ListBuffer, Builder}

@clp(description =
"""
|Sorts a sequence dictionary file in the order of another sequence dictionary.
|
|The inputs are to two `*.dict` files. One to be sorted, and the other to provide the order for the sorting.
|
|If there is a contig in the input dictionary that is not in the sorting dictionary, that contig will be appended
|to the end of the sequence dictionary in the same relative order to other appended contigs as in the input dictionary.
|Missing contigs can be omitted by setting `--skip-missing-contigs` to true.
|
|If there is a contig in the sorting dictionary that is not in the input dictionary, that contig will be ignored.
|
|The output will be a sequence dictionary, containing the version header line and one
|line per contig. The fields of the entries in this dictionary will be the same as in input, but in the order of
|`--sort-dictionary`.
""",
group = ClpGroups.Fasta)
class SortSequenceDictionary
(@arg(flag='i', doc="Input sequence dictionary file to be sorted.") val input: PathToSequenceDictionary,
@arg(flag='d', doc="Input sequence dictionary file containing contigs in the desired sort order.") val sortDictionary: PathToSequenceDictionary,
@arg(flag='o', doc="Output sequence dictionary file.") val output: PathToSequenceDictionary,
@arg(doc="Skip input contigs that have no matching contig in the sort dictionary rather than appending to the end of the output dictionary.") val skipMissingContigs: Boolean = false,
) extends FgBioTool with LazyLogging {

Io.assertReadable(input)
Io.assertReadable(sortDictionary)
Io.assertCanWriteFile(output)

override def execute(): Unit = {
val inputDict = SequenceDictionary(input)
val sortOrderDict = SequenceDictionary(sortDictionary)

// Iterate through the sort dictionary collecting metas from the input that match by name
val metasBuilder = IndexedSeq.newBuilder[SequenceMetadata]
sortOrderDict.foreach { sortMeta =>
sortMeta.allNames.find { name => inputDict.contains(name) } match {
case Some(name) => metasBuilder += inputDict(name)
case None => logger.info(s"Contig '${sortMeta.name}' corresponded to no contig in input dictionary, skipping")
}
}

// build a dictionary from the input contigs found in the sort dictionary
val metasFoundInSortDictDict = {
val metadata = metasBuilder.result().zipWithIndex.map {
case (meta, index) => meta.copy(index=index)
}.toSeq
SequenceDictionary(metadata:_*)
}

// maybe append input contigs not found in the sort dictionary. Their index will be reset after aggregation.
inputDict.foreach { inMeta =>
if (!metasFoundInSortDictDict.contains(inMeta.name)) {
val skipBehavior = if (skipMissingContigs) "skipping." else "appending."
logger.warning(s"Contig '${inMeta.name}' was not found in sort order dictionary: $skipBehavior")
// Append if desired. The index will be reset later.
if (!skipMissingContigs) {
metasBuilder += inMeta.copy()
}
}
}
// Finally we have all the contigs, so reset the index and write out the dictionary.
val finalMetadataDict = metasBuilder.result().zipWithIndex.map {
case (meta, index) => meta.copy(index=index)
}.toSeq
SequenceDictionary(finalMetadataDict:_*).write(output)
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
/*
* The MIT License
*
* Copyright (c) 2022 Fulcrum Genomics LLC
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/

package com.fulcrumgenomics.fasta

import com.fulcrumgenomics.commons.io.PathUtil
import com.fulcrumgenomics.testing.UnitSpec

class SortSequenceDictionaryTest extends UnitSpec {
private val dir = PathUtil.pathTo("src/test/resources/com/fulcrumgenomics/fasta")
private val chr1 = SequenceMetadata(
name = "chr1",
length = 100,
aliases = Seq( "chr1_1", "chr1_2", "chr1_3"),
)

private val chr2 = SequenceMetadata(
name = "chr2",
length = 100,
aliases = Seq( "chr2_1", "chr2_2", "chr2_3"),
)
private val chrM = SequenceMetadata(
name = "chrM",
length = 100,
aliases = Seq( "chrM_1", "chrM_2", "chrM_3"),
)

private val chr1Alias = SequenceMetadata(
name = "chr1_1",
length = 100,
)
private val chr2Alias = SequenceMetadata(
name = "chr2_1",
length = 100,
)
private val chrMAlias = SequenceMetadata(
name = "chrM_1",
length = 100,
)

private val dictChr1Chr2ChrM = SequenceDictionary(chr1, chr2, chrM)

"SortSequenceDictionary" should "keep output identical to input if the sort order is the same" in {
val inputDictionaryPath = makeTempFile("test.", ".1.dict")
val sortDictionaryPath = makeTempFile("test.", ".2.dict")
val outputDictionaryPath = makeTempFile("test.", ".3.dict")

dictChr1Chr2ChrM.write(inputDictionaryPath)
dictChr1Chr2ChrM.write(sortDictionaryPath)

val tool = new SortSequenceDictionary(
input = inputDictionaryPath,
sortDictionary = sortDictionaryPath,
output = outputDictionaryPath,
)
executeFgbioTool(tool)

SequenceDictionary(outputDictionaryPath).sameAs(dictChr1Chr2ChrM) shouldBe true
}

it should "append metadata row if contig is present in input and not sortDictionary, and `skip-missing-contigs` is false" in {
val inputDictionaryPath = makeTempFile("test.", ".1.dict")
val sortDictionaryPath = makeTempFile("test.", ".2.dict")
val outputDictionaryPath = makeTempFile("test.", ".3.dict")

val subsetDictionary = SequenceDictionary(chr2, chrM)
val expectedDictionary = SequenceDictionary(chr2, chrM, chr1)

dictChr1Chr2ChrM.write(inputDictionaryPath)
subsetDictionary.write(sortDictionaryPath)

val tool = new SortSequenceDictionary(
input = inputDictionaryPath,
sortDictionary = sortDictionaryPath,
output = outputDictionaryPath,
skipMissingContigs = false,
)
executeFgbioTool(tool)

SequenceDictionary(outputDictionaryPath).sameAs(expectedDictionary) shouldBe true
}

it should "skip metadata row if contig is present in input and not sortDictionary, and `--skip-missing-contigs` is true" in {
val inputDictionaryPath = makeTempFile("test.", ".1.dict")
val sortDictionaryPath = makeTempFile("test.", ".2.dict")
val outputDictionaryPath = makeTempFile("test.", ".3.dict")

val subsetDictionary = SequenceDictionary(chr2, chrM)

dictChr1Chr2ChrM.write(inputDictionaryPath)
subsetDictionary.write(sortDictionaryPath)

val tool = new SortSequenceDictionary(
input = inputDictionaryPath,
sortDictionary = sortDictionaryPath,
output = outputDictionaryPath,
skipMissingContigs = true,
)
executeFgbioTool(tool)

SequenceDictionary(outputDictionaryPath).sameAs(subsetDictionary) shouldBe true
}

it should "skip metadata row if contig is present in sortDictionary and not input" in {
val inputDictionaryPath = makeTempFile("test.", ".1.dict")
val sortDictionaryPath = makeTempFile("test.", ".2.dict")
val outputDictionaryPath = makeTempFile("test.", ".3.dict")

val subsetDictionary = SequenceDictionary(chr2, chrM)

subsetDictionary.write(inputDictionaryPath)
dictChr1Chr2ChrM.write(sortDictionaryPath)

val tool = new SortSequenceDictionary(
input = inputDictionaryPath,
sortDictionary = sortDictionaryPath,
output = outputDictionaryPath,
)
executeFgbioTool(tool)

SequenceDictionary(outputDictionaryPath).sameAs(subsetDictionary) shouldBe true
}

it should "reorder contigs if sortDictionary is provided in a different order" in {
val inputDictionaryPath = makeTempFile("test.", ".1.dict")
val sortDictionaryPath = makeTempFile("test.", ".2.dict")
val outputDictionaryPath = makeTempFile("test.", ".3.dict")

val reorderedDictionary = SequenceDictionary(chrM, chr1, chr2)

dictChr1Chr2ChrM.write(inputDictionaryPath)
reorderedDictionary.write(sortDictionaryPath)

val tool = new SortSequenceDictionary(
input = inputDictionaryPath,
sortDictionary = sortDictionaryPath,
output = outputDictionaryPath,
)
executeFgbioTool(tool)

SequenceDictionary(outputDictionaryPath).sameAs(reorderedDictionary) shouldBe true
}

it should "reorder contigs even if contigs in input are an alias of contigs in sort dictionary" in {
val aliasDictionary = SequenceDictionary(chr1Alias, chr2Alias, chrMAlias)
val inputDictionaryPath = makeTempFile("test.", ".1.dict")
val sortDictionaryPath = makeTempFile("test.", ".2.dict")
val outputDictionaryPath = makeTempFile("test.", ".3.dict")

val reorderedDictionary = SequenceDictionary(chrM, chr1, chr2)
val expectedDictionary = SequenceDictionary(chrMAlias, chr1Alias, chr2Alias)


aliasDictionary.write(inputDictionaryPath)
reorderedDictionary.write(sortDictionaryPath)

val tool = new SortSequenceDictionary(
input = inputDictionaryPath,
sortDictionary = sortDictionaryPath,
output = outputDictionaryPath,
)
executeFgbioTool(tool)

SequenceDictionary(outputDictionaryPath).sameAs(expectedDictionary) shouldBe true
}

it should "raise Error over duplicate entries that point to the same contig in the input" in {
val inputDictionaryPath = makeTempFile("test.", ".1.dict")
val sortDictionaryPath = makeTempFile("test.", ".2.dict")
val outputDictionaryPath = makeTempFile("test.", ".3.dict")

val chr1IncorrectAlias = SequenceMetadata(
name = "chr1_2",
length = 100,
aliases = Seq("chr1_3"),
)

val chrMIncorrectAlias = SequenceMetadata(
name = "MT",
length = 100,
aliases = Seq( "chr1_1"),
)

val duplicateAliasDictionary = SequenceDictionary(chr1IncorrectAlias, chr2Alias, chrMIncorrectAlias)

dictChr1Chr2ChrM.write(inputDictionaryPath)
duplicateAliasDictionary.write(sortDictionaryPath)

val tool = new SortSequenceDictionary(
input = inputDictionaryPath,
sortDictionary = sortDictionaryPath,
output = outputDictionaryPath,
)
val ex = intercept[Exception] {executeFgbioTool(tool) }
ex.getMessage should include ("Found duplicate sequence name:")
}
}

0 comments on commit bb92249

Please sign in to comment.