-
-
Notifications
You must be signed in to change notification settings - Fork 69
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adding SortSequenceDictionary tool (#769)
* Adding SortSequenceDictionary tool, which sorts a sequence dictionary file based on the order in another sequence dictionary
- Loading branch information
Showing
2 changed files
with
321 additions
and
0 deletions.
There are no files selected for viewing
103 changes: 103 additions & 0 deletions
103
src/main/scala/com/fulcrumgenomics/fasta/SortSequenceDictionary.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
/* | ||
* The MIT License | ||
* | ||
* Copyright (c) 2022 Fulcrum Genomics LLC | ||
* | ||
* Permission is hereby granted, free of charge, to any person obtaining a copy | ||
* of this software and associated documentation files (the "Software"), to deal | ||
* in the Software without restriction, including without limitation the rights | ||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
* copies of the Software, and to permit persons to whom the Software is | ||
* furnished to do so, subject to the following conditions: | ||
* | ||
* The above copyright notice and this permission notice shall be included in | ||
* all copies or substantial portions of the Software. | ||
* | ||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
* THE SOFTWARE. | ||
*/ | ||
|
||
package com.fulcrumgenomics.fasta | ||
|
||
import com.fulcrumgenomics.FgBioDef.PathToSequenceDictionary | ||
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool} | ||
import com.fulcrumgenomics.commons.CommonsDef._ | ||
import com.fulcrumgenomics.commons.util.LazyLogging | ||
import com.fulcrumgenomics.sopt._ | ||
import com.fulcrumgenomics.util.Io | ||
|
||
import scala.collection.immutable.IndexedSeq | ||
import scala.collection.mutable.{ListBuffer, Builder} | ||
|
||
@clp(description = | ||
""" | ||
|Sorts a sequence dictionary file in the order of another sequence dictionary. | ||
| | ||
|The inputs are to two `*.dict` files. One to be sorted, and the other to provide the order for the sorting. | ||
| | ||
|If there is a contig in the input dictionary that is not in the sorting dictionary, that contig will be appended | ||
|to the end of the sequence dictionary in the same relative order to other appended contigs as in the input dictionary. | ||
|Missing contigs can be omitted by setting `--skip-missing-contigs` to true. | ||
| | ||
|If there is a contig in the sorting dictionary that is not in the input dictionary, that contig will be ignored. | ||
| | ||
|The output will be a sequence dictionary, containing the version header line and one | ||
|line per contig. The fields of the entries in this dictionary will be the same as in input, but in the order of | ||
|`--sort-dictionary`. | ||
""", | ||
group = ClpGroups.Fasta) | ||
class SortSequenceDictionary | ||
(@arg(flag='i', doc="Input sequence dictionary file to be sorted.") val input: PathToSequenceDictionary, | ||
@arg(flag='d', doc="Input sequence dictionary file containing contigs in the desired sort order.") val sortDictionary: PathToSequenceDictionary, | ||
@arg(flag='o', doc="Output sequence dictionary file.") val output: PathToSequenceDictionary, | ||
@arg(doc="Skip input contigs that have no matching contig in the sort dictionary rather than appending to the end of the output dictionary.") val skipMissingContigs: Boolean = false, | ||
) extends FgBioTool with LazyLogging { | ||
|
||
Io.assertReadable(input) | ||
Io.assertReadable(sortDictionary) | ||
Io.assertCanWriteFile(output) | ||
|
||
override def execute(): Unit = { | ||
val inputDict = SequenceDictionary(input) | ||
val sortOrderDict = SequenceDictionary(sortDictionary) | ||
|
||
// Iterate through the sort dictionary collecting metas from the input that match by name | ||
val metasBuilder = IndexedSeq.newBuilder[SequenceMetadata] | ||
sortOrderDict.foreach { sortMeta => | ||
sortMeta.allNames.find { name => inputDict.contains(name) } match { | ||
case Some(name) => metasBuilder += inputDict(name) | ||
case None => logger.info(s"Contig '${sortMeta.name}' corresponded to no contig in input dictionary, skipping") | ||
} | ||
} | ||
|
||
// build a dictionary from the input contigs found in the sort dictionary | ||
val metasFoundInSortDictDict = { | ||
val metadata = metasBuilder.result().zipWithIndex.map { | ||
case (meta, index) => meta.copy(index=index) | ||
}.toSeq | ||
SequenceDictionary(metadata:_*) | ||
} | ||
|
||
// maybe append input contigs not found in the sort dictionary. Their index will be reset after aggregation. | ||
inputDict.foreach { inMeta => | ||
if (!metasFoundInSortDictDict.contains(inMeta.name)) { | ||
val skipBehavior = if (skipMissingContigs) "skipping." else "appending." | ||
logger.warning(s"Contig '${inMeta.name}' was not found in sort order dictionary: $skipBehavior") | ||
// Append if desired. The index will be reset later. | ||
if (!skipMissingContigs) { | ||
metasBuilder += inMeta.copy() | ||
} | ||
} | ||
} | ||
// Finally we have all the contigs, so reset the index and write out the dictionary. | ||
val finalMetadataDict = metasBuilder.result().zipWithIndex.map { | ||
case (meta, index) => meta.copy(index=index) | ||
}.toSeq | ||
SequenceDictionary(finalMetadataDict:_*).write(output) | ||
} | ||
} |
218 changes: 218 additions & 0 deletions
218
src/test/scala/com/fulcrumgenomics/fasta/SortSequenceDictionaryTest.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,218 @@ | ||
/* | ||
* The MIT License | ||
* | ||
* Copyright (c) 2022 Fulcrum Genomics LLC | ||
* | ||
* Permission is hereby granted, free of charge, to any person obtaining a copy | ||
* of this software and associated documentation files (the "Software"), to deal | ||
* in the Software without restriction, including without limitation the rights | ||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
* copies of the Software, and to permit persons to whom the Software is | ||
* furnished to do so, subject to the following conditions: | ||
* | ||
* The above copyright notice and this permission notice shall be included in | ||
* all copies or substantial portions of the Software. | ||
* | ||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
* THE SOFTWARE. | ||
*/ | ||
|
||
package com.fulcrumgenomics.fasta | ||
|
||
import com.fulcrumgenomics.commons.io.PathUtil | ||
import com.fulcrumgenomics.testing.UnitSpec | ||
|
||
class SortSequenceDictionaryTest extends UnitSpec { | ||
private val dir = PathUtil.pathTo("src/test/resources/com/fulcrumgenomics/fasta") | ||
private val chr1 = SequenceMetadata( | ||
name = "chr1", | ||
length = 100, | ||
aliases = Seq( "chr1_1", "chr1_2", "chr1_3"), | ||
) | ||
|
||
private val chr2 = SequenceMetadata( | ||
name = "chr2", | ||
length = 100, | ||
aliases = Seq( "chr2_1", "chr2_2", "chr2_3"), | ||
) | ||
private val chrM = SequenceMetadata( | ||
name = "chrM", | ||
length = 100, | ||
aliases = Seq( "chrM_1", "chrM_2", "chrM_3"), | ||
) | ||
|
||
private val chr1Alias = SequenceMetadata( | ||
name = "chr1_1", | ||
length = 100, | ||
) | ||
private val chr2Alias = SequenceMetadata( | ||
name = "chr2_1", | ||
length = 100, | ||
) | ||
private val chrMAlias = SequenceMetadata( | ||
name = "chrM_1", | ||
length = 100, | ||
) | ||
|
||
private val dictChr1Chr2ChrM = SequenceDictionary(chr1, chr2, chrM) | ||
|
||
"SortSequenceDictionary" should "keep output identical to input if the sort order is the same" in { | ||
val inputDictionaryPath = makeTempFile("test.", ".1.dict") | ||
val sortDictionaryPath = makeTempFile("test.", ".2.dict") | ||
val outputDictionaryPath = makeTempFile("test.", ".3.dict") | ||
|
||
dictChr1Chr2ChrM.write(inputDictionaryPath) | ||
dictChr1Chr2ChrM.write(sortDictionaryPath) | ||
|
||
val tool = new SortSequenceDictionary( | ||
input = inputDictionaryPath, | ||
sortDictionary = sortDictionaryPath, | ||
output = outputDictionaryPath, | ||
) | ||
executeFgbioTool(tool) | ||
|
||
SequenceDictionary(outputDictionaryPath).sameAs(dictChr1Chr2ChrM) shouldBe true | ||
} | ||
|
||
it should "append metadata row if contig is present in input and not sortDictionary, and `skip-missing-contigs` is false" in { | ||
val inputDictionaryPath = makeTempFile("test.", ".1.dict") | ||
val sortDictionaryPath = makeTempFile("test.", ".2.dict") | ||
val outputDictionaryPath = makeTempFile("test.", ".3.dict") | ||
|
||
val subsetDictionary = SequenceDictionary(chr2, chrM) | ||
val expectedDictionary = SequenceDictionary(chr2, chrM, chr1) | ||
|
||
dictChr1Chr2ChrM.write(inputDictionaryPath) | ||
subsetDictionary.write(sortDictionaryPath) | ||
|
||
val tool = new SortSequenceDictionary( | ||
input = inputDictionaryPath, | ||
sortDictionary = sortDictionaryPath, | ||
output = outputDictionaryPath, | ||
skipMissingContigs = false, | ||
) | ||
executeFgbioTool(tool) | ||
|
||
SequenceDictionary(outputDictionaryPath).sameAs(expectedDictionary) shouldBe true | ||
} | ||
|
||
it should "skip metadata row if contig is present in input and not sortDictionary, and `--skip-missing-contigs` is true" in { | ||
val inputDictionaryPath = makeTempFile("test.", ".1.dict") | ||
val sortDictionaryPath = makeTempFile("test.", ".2.dict") | ||
val outputDictionaryPath = makeTempFile("test.", ".3.dict") | ||
|
||
val subsetDictionary = SequenceDictionary(chr2, chrM) | ||
|
||
dictChr1Chr2ChrM.write(inputDictionaryPath) | ||
subsetDictionary.write(sortDictionaryPath) | ||
|
||
val tool = new SortSequenceDictionary( | ||
input = inputDictionaryPath, | ||
sortDictionary = sortDictionaryPath, | ||
output = outputDictionaryPath, | ||
skipMissingContigs = true, | ||
) | ||
executeFgbioTool(tool) | ||
|
||
SequenceDictionary(outputDictionaryPath).sameAs(subsetDictionary) shouldBe true | ||
} | ||
|
||
it should "skip metadata row if contig is present in sortDictionary and not input" in { | ||
val inputDictionaryPath = makeTempFile("test.", ".1.dict") | ||
val sortDictionaryPath = makeTempFile("test.", ".2.dict") | ||
val outputDictionaryPath = makeTempFile("test.", ".3.dict") | ||
|
||
val subsetDictionary = SequenceDictionary(chr2, chrM) | ||
|
||
subsetDictionary.write(inputDictionaryPath) | ||
dictChr1Chr2ChrM.write(sortDictionaryPath) | ||
|
||
val tool = new SortSequenceDictionary( | ||
input = inputDictionaryPath, | ||
sortDictionary = sortDictionaryPath, | ||
output = outputDictionaryPath, | ||
) | ||
executeFgbioTool(tool) | ||
|
||
SequenceDictionary(outputDictionaryPath).sameAs(subsetDictionary) shouldBe true | ||
} | ||
|
||
it should "reorder contigs if sortDictionary is provided in a different order" in { | ||
val inputDictionaryPath = makeTempFile("test.", ".1.dict") | ||
val sortDictionaryPath = makeTempFile("test.", ".2.dict") | ||
val outputDictionaryPath = makeTempFile("test.", ".3.dict") | ||
|
||
val reorderedDictionary = SequenceDictionary(chrM, chr1, chr2) | ||
|
||
dictChr1Chr2ChrM.write(inputDictionaryPath) | ||
reorderedDictionary.write(sortDictionaryPath) | ||
|
||
val tool = new SortSequenceDictionary( | ||
input = inputDictionaryPath, | ||
sortDictionary = sortDictionaryPath, | ||
output = outputDictionaryPath, | ||
) | ||
executeFgbioTool(tool) | ||
|
||
SequenceDictionary(outputDictionaryPath).sameAs(reorderedDictionary) shouldBe true | ||
} | ||
|
||
it should "reorder contigs even if contigs in input are an alias of contigs in sort dictionary" in { | ||
val aliasDictionary = SequenceDictionary(chr1Alias, chr2Alias, chrMAlias) | ||
val inputDictionaryPath = makeTempFile("test.", ".1.dict") | ||
val sortDictionaryPath = makeTempFile("test.", ".2.dict") | ||
val outputDictionaryPath = makeTempFile("test.", ".3.dict") | ||
|
||
val reorderedDictionary = SequenceDictionary(chrM, chr1, chr2) | ||
val expectedDictionary = SequenceDictionary(chrMAlias, chr1Alias, chr2Alias) | ||
|
||
|
||
aliasDictionary.write(inputDictionaryPath) | ||
reorderedDictionary.write(sortDictionaryPath) | ||
|
||
val tool = new SortSequenceDictionary( | ||
input = inputDictionaryPath, | ||
sortDictionary = sortDictionaryPath, | ||
output = outputDictionaryPath, | ||
) | ||
executeFgbioTool(tool) | ||
|
||
SequenceDictionary(outputDictionaryPath).sameAs(expectedDictionary) shouldBe true | ||
} | ||
|
||
it should "raise Error over duplicate entries that point to the same contig in the input" in { | ||
val inputDictionaryPath = makeTempFile("test.", ".1.dict") | ||
val sortDictionaryPath = makeTempFile("test.", ".2.dict") | ||
val outputDictionaryPath = makeTempFile("test.", ".3.dict") | ||
|
||
val chr1IncorrectAlias = SequenceMetadata( | ||
name = "chr1_2", | ||
length = 100, | ||
aliases = Seq("chr1_3"), | ||
) | ||
|
||
val chrMIncorrectAlias = SequenceMetadata( | ||
name = "MT", | ||
length = 100, | ||
aliases = Seq( "chr1_1"), | ||
) | ||
|
||
val duplicateAliasDictionary = SequenceDictionary(chr1IncorrectAlias, chr2Alias, chrMIncorrectAlias) | ||
|
||
dictChr1Chr2ChrM.write(inputDictionaryPath) | ||
duplicateAliasDictionary.write(sortDictionaryPath) | ||
|
||
val tool = new SortSequenceDictionary( | ||
input = inputDictionaryPath, | ||
sortDictionary = sortDictionaryPath, | ||
output = outputDictionaryPath, | ||
) | ||
val ex = intercept[Exception] {executeFgbioTool(tool) } | ||
ex.getMessage should include ("Found duplicate sequence name:") | ||
} | ||
} |