Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New module taxonkit/reformat #52

Merged
merged 9 commits into from
Oct 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions modules/ebi-metagenomics/taxonkit/reformat/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
# Conda environment for the taxonkit/reformat module.
channels:
  - conda-forge
  - bioconda
dependencies:
  # Same taxonkit release as the container images referenced in main.nf.
  - "bioconda::taxonkit=0.17.0"
52 changes: 52 additions & 0 deletions modules/ebi-metagenomics/taxonkit/reformat/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// Runs `taxonkit reformat` on a tab-separated table and writes the result to
// `<prefix>.tsv`.
//
// Inputs:
//   meta  - Groovy map with sample information; `meta.id` is the default prefix.
//   tsv   - input table handed to taxonkit as the positional argument.
//   taxdb - path handed to `--data-dir` (the taxonomy dump location).
//
// Outputs:
//   reformat_tsv - tuple of meta and the produced `*.tsv` file.
//   versions     - versions.yml recording the taxonkit version.
//
// Extra command-line options are taken from `task.ext.args`; the output prefix
// can be overridden with `task.ext.prefix`.
process TAXONKIT_REFORMAT {
    tag "$meta.id"
    label 'process_single'

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/taxonkit:0.17.0--h9ee0642_1':
        'biocontainers/taxonkit:0.17.0--h9ee0642_1' }"

    input:
    tuple val(meta), path(tsv)
    path taxdb

    output:
    tuple val(meta), path("*.tsv"), emit: reformat_tsv
    path "versions.yml"           , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    // The output is written next to the staged input; refuse to run when both
    // would share one name, otherwise --out-file would overwrite the input.
    if ("${tsv}" == "${prefix}.tsv") {
        error "Input and output names are the same, set prefix in module configuration to disambiguate!"
    }

    """
    taxonkit \\
        reformat \\
        $args \\
        --threads $task.cpus \\
        --data-dir $taxdb \\
        --out-file ${prefix}.tsv \\
        $tsv

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        taxonkit: \$( taxonkit version | sed 's/.* v//' )
    END_VERSIONS
    """

    stub:
    def prefix = task.ext.prefix ?: "${meta.id}"
    // Same name-clash guard as in the script section, so a stub run fails with
    // the same explicit message instead of touching the staged input.
    if ("${tsv}" == "${prefix}.tsv") {
        error "Input and output names are the same, set prefix in module configuration to disambiguate!"
    }

    """
    touch ${prefix}.tsv

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        taxonkit: \$( taxonkit version | sed 's/.* v//' )
    END_VERSIONS
    """
}
51 changes: 51 additions & 0 deletions modules/ebi-metagenomics/taxonkit/reformat/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
# Module metadata for TAXONKIT_REFORMAT: describes the tool, the two input
# channels (meta + TSV, taxonomy dump directory) and the emitted outputs.
name: "taxonkit_reformat"
description: Reformat lineage in canonical ranks
keywords:
  - taxonomy
  - taxids
  - ncbi
  - lineage
tools:
  - "taxonkit":
      description: "A Cross-platform and Efficient NCBI Taxonomy Toolkit"
      homepage: "https://bioinf.shenwei.me/taxonkit/"
      documentation: "https://bioinf.shenwei.me/taxonkit/usage/#reformat"
      tool_dev_url: "https://github.com/shenwei356/taxonkit"
      doi: "10.1016/j.jgg.2021.03.006"
      licence: ["MIT"]
      identifier: biotools:taxonkit

input:
  - - meta:
        type: map
        description: |
          Groovy Map containing sample information
          e.g. `[ id:'sample1', single_end:false ]`
    - tsv:
        type: file
        description: Input TSV file
        pattern: "*.{tsv,tsv.gz}"
  # Passed to `--data-dir`, so this is a directory rather than a single file.
  - - taxdb:
        type: directory
        description: Directory with the taxonomy database unpacked from ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
output:
  - reformat_tsv:
      - meta:
          type: map
          description: |
            Groovy Map containing sample information
            e.g. `[ id:'sample1', single_end:false ]`
      - "*.tsv":
          type: file
          description: Reformatted output TSV file
          pattern: "*.tsv"
  - versions:
      - versions.yml:
          type: file
          description: File containing software versions
          pattern: "versions.yml"
authors:
  - "@arajkovic"
maintainers:
  - "@arajkovic"
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
accession accession.version taxid gi
MT192765 MT192765.1 2697049 1821109001
NZ_LS483480 NZ_LS483480.1 727 1409087034
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
2697049 | 2019-nCoV | | equivalent name |
2697049 | COVID-19 virus | | equivalent name |
2697049 | HCoV-19 | | equivalent name |
2697049 | Human coronavirus 2019 | | equivalent name |
2697049 | SARS-2 | | equivalent name |
2697049 | SARS2 | | equivalent name |
2697049 | SARS-CoV-2 | | acronym |
2697049 | SARS-CoV2 | | equivalent name |
2697049 | Severe acute respiratory syndrome coronavirus 2 | | scientific name |
727 | ATCC 33391 | ATCC 33391 <type strain> | type material |
727 | "Bacterium influenzae" Lehmann and Neumann 1896 | | authority |
727 | Bacterium influenzae | | synonym |
727 | CCUG 23945 | CCUG 23945 <type strain> | type material |
727 | CIP 102514 | CIP 102514 <type strain> | type material |
727 | "Coccobacillus pfeifferi" Neveu-Lemaire 1921 | | authority |
727 | Coccobacillus pfeifferi | | synonym |
727 | DSM 4690 | DSM 4690 <type strain> | type material |
727 | Haemophilus influenzae (Lehmann and Neumann 1896) Winslow et al. 1917 | | authority |
727 | Haemophilus influenzae | | scientific name |
727 | "Haemophilus meningitidis" (Martins) Hauduroy et al. 1937 | | authority |
727 | Haemophilus meningitidis | | synonym |
727 | "Influenza-bacillus" Pfeiffer 1892 | | authority |
727 | Influenza-bacillus | | synonym |
727 | "Mycobacterium influenzae" (Lehmann and Neumann 1896) Chester 1901 | | authority |
727 | Mycobacterium influenzae | | synonym |
727 | NCTC 8143 | NCTC 8143 <type strain> | type material |
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
1 | 1 | no rank | | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | |
10239 | 1 | superkingdom | | 9 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | |
2559587 | 10239 | clade | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | |
2732396 | 2559587 | kingdom | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | |
2732408 | 2732396 | phylum | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | |
2732506 | 2732408 | class | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | |
76804 | 2732506 | order | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | code compliant |
2499399 | 76804 | suborder | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | code compliant |
11118 | 2499399 | family | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | code compliant |
2501931 | 11118 | subfamily | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | code compliant |
694002 | 2501931 | genus | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | code compliant |
2509511 | 694002 | subgenus | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | code compliant |
694009 | 2509511 | species | SA | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | code compliant; specified |
2697049 | 694009 | no rank | | 9 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | |
131567 | 1 | no rank | | 8 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | |
2 | 131567 | superkingdom | | 0 | 0 | 11 | 0 | 0 | 0 | 0 | 0 | |
1224 | 2 | phylum | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |
1236 | 1224 | class | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | code compliant |
135625 | 1236 | order | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | code compliant |
712 | 135625 | family | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | code compliant |
724 | 712 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | code compliant |
727 | 724 | species | HI | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | code compliant; specified |
61 changes: 61 additions & 0 deletions modules/ebi-metagenomics/taxonkit/reformat/tests/main.nf.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// nf-test suite for TAXONKIT_REFORMAT: one real run and one stub run, both
// compared against the stored snapshot of the process outputs.
nextflow_process {

    name "Test Process TAXONKIT_REFORMAT"
    script "../main.nf"
    process "TAXONKIT_REFORMAT"

    tag "modules"
    tag "modules_ebimetagenomics"
    tag "taxonkit"
    tag "taxonkit/reformat"

    // Supplies the ext.args used by the process during the tests.
    config "./nextflow.config"

    // Real execution on the bundled fixtures.
    test("tsv with header") {

        when {
            process {
                """
                input[0] = [
                    [ id:'test', single_end:false ], // meta map
                    file("${moduleTestDir}/fixtures/nucl_gb.accession2taxid", checkIfExists: true)
                ]
                input[1] = file("${moduleTestDir}/fixtures/taxdump", checkIfExists: true)
                """
            }
        }

        then {
            assertAll(
                { assert process.success },
                { assert snapshot(process.out).match() }
            )
        }

    }

    // Same inputs, but run with -stub so only the stub section is exercised.
    test("tsv with header - stub") {

        options "-stub"

        when {
            process {
                """
                input[0] = [
                    [ id:'test', single_end:false ], // meta map
                    file("${moduleTestDir}/fixtures/nucl_gb.accession2taxid", checkIfExists: true)
                ]
                input[1] = file("${moduleTestDir}/fixtures/taxdump", checkIfExists: true)
                """
            }
        }

        then {
            assertAll(
                { assert process.success },
                { assert snapshot(process.out).match() }
            )
        }

    }

}
72 changes: 72 additions & 0 deletions modules/ebi-metagenomics/taxonkit/reformat/tests/main.nf.test.snap
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
{
"tsv with header": {
"content": [
{
"0": [
[
{
"id": "test",
"single_end": false
},
"test.tsv:md5,331e1290ca8ae7e3962b43e28d865bae"
]
],
"1": [
"versions.yml:md5,25e183506a3cbe3ddc98e6036b4a0525"
],
"reformat_tsv": [
[
{
"id": "test",
"single_end": false
},
"test.tsv:md5,331e1290ca8ae7e3962b43e28d865bae"
]
],
"versions": [
"versions.yml:md5,25e183506a3cbe3ddc98e6036b4a0525"
]
}
],
"meta": {
"nf-test": "0.9.0",
"nextflow": "24.04.3"
},
"timestamp": "2024-10-08T16:51:54.404921"
},
"tsv with header - stub": {
"content": [
{
"0": [
[
{
"id": "test",
"single_end": false
},
"test.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
]
],
"1": [
"versions.yml:md5,25e183506a3cbe3ddc98e6036b4a0525"
],
"reformat_tsv": [
[
{
"id": "test",
"single_end": false
},
"test.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
]
],
"versions": [
"versions.yml:md5,25e183506a3cbe3ddc98e6036b4a0525"
]
}
],
"meta": {
"nf-test": "0.9.0",
"nextflow": "24.04.3"
},
"timestamp": "2024-10-08T16:52:03.328778"
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
// Test-only options appended to the taxonkit command line through `$args`.
// The fixture table carries its taxid in the third column, hence --taxid-field 3.
process.ext.args = '-t -r Unassigned -R -1 --taxid-field 3'
2 changes: 2 additions & 0 deletions modules/ebi-metagenomics/taxonkit/reformat/tests/tags.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Maps the taxonkit/reformat tag to the path glob covering this module's files.
taxonkit/reformat:
  - 'modules/ebi-metagenomics/taxonkit/reformat/**'
Loading