From 0c6bbf8f5f4cdb2cda2186e9c4120f6395be0c4a Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 16 Jun 2023 18:19:04 +0200 Subject: [PATCH 1/6] Makes damage correction optional when running aDNA workflow --- conf/test_ancient_dna.config | 1 + docs/output.md | 2 +- modules.json | 9 ++--- nextflow.config | 3 +- nextflow_schema.json | 18 ++++++---- subworkflows/local/ancient_dna.nf | 55 +++++++++++++++++++------------ workflows/mag.nf | 2 +- 7 files changed, 55 insertions(+), 35 deletions(-) diff --git a/conf/test_ancient_dna.config b/conf/test_ancient_dna.config index dcb8f7c9..421eee20 100644 --- a/conf/test_ancient_dna.config +++ b/conf/test_ancient_dna.config @@ -29,6 +29,7 @@ params { busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" gtdb = false ancient_dna = true + run_ancient_damagecorrection = true binning_map_mode = 'own' skip_spades = false skip_spadeshybrid = true diff --git a/docs/output.md b/docs/output.md index 0d805bb8..44ad112d 100644 --- a/docs/output.md +++ b/docs/output.md @@ -641,7 +641,7 @@ Optional, only running when parameter `-profile ancient_dna` is specified. ### `variant_calling` -Because of aDNA damage, _de novo_ assemblers sometimes struggle to call a correct consensus on the contig sequence. To avoid this situation, the consensus is re-called with a variant calling software using the reads aligned back to the contigs +Because of aDNA damage, _de novo_ assemblers sometimes struggle to call a correct consensus on the contig sequence. To avoid this situation, the consensus is re-called with a variant calling software using the reads aligned back to the contigs, unless `--skip_ancient_damagecorrection` is supplied.
Output files diff --git a/modules.json b/modules.json index 46389ff2..add56e46 100644 --- a/modules.json +++ b/modules.json @@ -176,14 +176,15 @@ "git_sha": "371eff7748d769c2ddc8bd593773523a364a52fe", "installed_by": ["modules"] }, - "tiara/tiara": { - "branch": "master", - "git_sha": "d91e3d3d4806179065b087b91ff36c11976bf233" - }, "seqtk/mergepe": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", "installed_by": ["modules"] + }, + "tiara/tiara": { + "branch": "master", + "git_sha": "d91e3d3d4806179065b087b91ff36c11976bf233", + "installed_by": ["modules"] } } }, diff --git a/nextflow.config b/nextflow.config index d996ffb4..90aee053 100644 --- a/nextflow.config +++ b/nextflow.config @@ -61,13 +61,14 @@ params { // ancient DNA assembly validation options ancient_dna = false + pydamage_accuracy = 0.5 + run_ancient_damagecorrection = false freebayes_ploidy = 1 freebayes_min_basequality = 20 freebayes_minallelefreq = 0.33 bcftools_view_high_variant_quality = 30 bcftools_view_medium_variant_quality = 20 bcftools_view_minimal_allelesupport = 3 - pydamage_accuracy = 0.5 // taxonomy options centrifuge_db = null diff --git a/nextflow_schema.json b/nextflow_schema.json index fe35aed0..a33483a4 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -32,7 +32,7 @@ "format": "file-path", "description": "Additional input CSV samplesheet containing information about pre-computed assemblies. When set, both read pre-processing and assembly are skipped and the pipeline begins at the binning stage.", "help_text": "If you have pre-computed assemblies from another source, it is possible to jump straight to the binning stage of the pipeline by supplying these assemblies in a CSV file. This CSV file should have three columns and the following header: `id,group,assembler,fasta`. Short reads must still be supplied in to `--input` in CSV format. 
See [usage docs](https://nf-co.re/mag/usage#input-specifications) for further details.", - "default": null, + "default": "None", "fa_icon": "fas fa-file-csv" }, "outdir": { @@ -740,7 +740,7 @@ "type": "number", "default": 0.5, "description": "Specify single-copy gene score threshold for bin refinement.", - "help_text": "Score threshold for single-copy gene selection algorithm to keep selecting bins, with a value ranging from 0-1.\n\nFor description of scoring algorithm, see: Sieber, Christian M. K., et al. 2018. Nature Microbiology 3 (7): 836–43. https://doi.org/10.1038/s41564-018-0171-1.\n\n> Modifies DAS Tool parameter --score_threshold\n" + "help_text": "Score threshold for single-copy gene selection algorithm to keep selecting bins, with a value ranging from 0-1.\n\nFor description of scoring algorithm, see: Sieber, Christian M. K., et al. 2018. Nature Microbiology 3 (7): 836\u201343. https://doi.org/10.1038/s41564-018-0171-1.\n\n> Modifies DAS Tool parameter --score_threshold\n" }, "postbinning_input": { "type": "string", @@ -780,6 +780,15 @@ "type": "boolean", "description": "Turn on/off the ancient DNA subworfklow" }, + "pydamage_accuracy": { + "type": "number", + "default": 0.5, + "description": "PyDamage accuracy threshold" + }, + "run_ancient_damagecorrection": { + "type": "boolean", + "description": "activate damage correction of ancient contigs using variant and consensus calling" + }, "freebayes_ploidy": { "type": "integer", "default": 1, @@ -809,11 +818,6 @@ "type": "integer", "default": 3, "description": "minimum number of bases supporting the alternative allele" - }, - "pydamage_accuracy": { - "type": "number", - "default": 0.5, - "description": "PyDamage accuracy threshold" } } } diff --git a/subworkflows/local/ancient_dna.nf b/subworkflows/local/ancient_dna.nf index 442a8c1d..c0b2af4c 100644 --- a/subworkflows/local/ancient_dna.nf +++ b/subworkflows/local/ancient_dna.nf @@ -10,31 +10,44 @@ workflow ANCIENT_DNA_ASSEMBLY_VALIDATION { take: input 
//channel: [val(meta), path(contigs), path(bam), path(bam_index)] main: + ch_versions = Channel.empty() + PYDAMAGE_ANALYZE(input.map {item -> [item[0], item[2], item[3]]}) PYDAMAGE_FILTER(PYDAMAGE_ANALYZE.out.csv) - FAIDX(input.map { item -> [ item[0], item[1] ] }) - freebayes_input = input.join(FAIDX.out.fai) // [val(meta), path(contigs), path(bam), path(bam_index), path(fai)] - FREEBAYES (freebayes_input.map { item -> [item[0], item[2], item[3], [], [], []] }, - freebayes_input.map { item -> item[1] }, - freebayes_input.map { item -> item[4] }, - [], - [], - [] ) - - BCFTOOLS_INDEX_PRE(FREEBAYES.out.vcf) - BCFTOOLS_VIEW(FREEBAYES.out.vcf.join(BCFTOOLS_INDEX_PRE.out.tbi), [], [], []) - BCFTOOLS_INDEX_POST(BCFTOOLS_VIEW.out.vcf) - BCFTOOLS_CONSENSUS(BCFTOOLS_VIEW.out.vcf - .join(BCFTOOLS_INDEX_POST.out.tbi) - .join(input.map { item -> [ item[0], item[1] ] })) + ch_versions = ch_versions.mix(PYDAMAGE_ANALYZE.out.versions.first()) + + if ( !params.run_ancient_damagecorrection ) { + ch_corrected_contigs = Channel.empty() + } + + if ( params.run_ancient_damagecorrection ) { + FAIDX(input.map { item -> [ item[0], item[1] ] }) + freebayes_input = input.join(FAIDX.out.fai) // [val(meta), path(contigs), path(bam), path(bam_index), path(fai)] + FREEBAYES (freebayes_input.map { item -> [item[0], item[2], item[3], [], [], []] }, + freebayes_input.map { item -> item[1] }, + freebayes_input.map { item -> item[4] }, + [], + [], + [] ) + + BCFTOOLS_INDEX_PRE(FREEBAYES.out.vcf) + BCFTOOLS_VIEW(FREEBAYES.out.vcf.join(BCFTOOLS_INDEX_PRE.out.tbi), [], [], []) + BCFTOOLS_INDEX_POST(BCFTOOLS_VIEW.out.vcf) + BCFTOOLS_CONSENSUS(BCFTOOLS_VIEW.out.vcf + .join(BCFTOOLS_INDEX_POST.out.tbi) + .join(input.map { item -> [ item[0], item[1] ] })) + + ch_corrected_contigs = BCFTOOLS_CONSENSUS.out.fasta + + ch_versions = ch_versions.mix(FAIDX.out.versions.first()) + ch_versions = ch_versions.mix(FREEBAYES.out.versions.first()) + ch_versions = ch_versions.mix(BCFTOOLS_CONSENSUS.out.versions.first()) 
+ } + + - ch_versions = Channel.empty() - ch_versions = PYDAMAGE_ANALYZE.out.versions.first() - ch_versions = ch_versions.mix(FAIDX.out.versions.first()) - ch_versions = ch_versions.mix(FREEBAYES.out.versions.first()) - ch_versions = ch_versions.mix(BCFTOOLS_CONSENSUS.out.versions.first()) emit: - contigs_recalled = BCFTOOLS_CONSENSUS.out.fasta // channel: [ val(meta), path(fasta) ] + contigs_recalled = ch_corrected_contigs // channel: [ val(meta), path(fasta) ] pydamage_results = PYDAMAGE_ANALYZE.out.csv // channel: [ val(meta), path(csv) ] pydamage_filtered_results = PYDAMAGE_FILTER.out.csv // channel: [ val(meta), path(csv) ] versions = ch_versions // channel: [ versions.yml ] diff --git a/workflows/mag.nf b/workflows/mag.nf index d5ea3b1f..61ad3a84 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -647,7 +647,7 @@ workflow MAG { if (!params.skip_binning){ - if (params.ancient_dna) { + if (params.ancient_dna && !params.skip_ancient_damagecorrection) { BINNING ( BINNING_PREPARATION.out.grouped_mappings .join(ANCIENT_DNA_ASSEMBLY_VALIDATION.out.contigs_recalled) From 8d17991dc92f4b1cfce78e03d9c87adf389dc7b4 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 16 Jun 2023 18:23:16 +0200 Subject: [PATCH 2/6] Make to opt in --- conf/test_ancient_dna.config | 1 - nextflow_schema.json | 4 ++-- subworkflows/local/ancient_dna.nf | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/conf/test_ancient_dna.config b/conf/test_ancient_dna.config index 421eee20..dcb8f7c9 100644 --- a/conf/test_ancient_dna.config +++ b/conf/test_ancient_dna.config @@ -29,7 +29,6 @@ params { busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" gtdb = false ancient_dna = true - run_ancient_damagecorrection = true binning_map_mode = 'own' skip_spades = false skip_spadeshybrid = true diff --git a/nextflow_schema.json b/nextflow_schema.json index a33483a4..c0979e3d 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json
@@ -785,9 +785,9 @@ "default": 0.5, "description": "PyDamage accuracy threshold" }, - "run_ancient_damagecorrection": { + "skip_ancient_damagecorrection": { "type": "boolean", - "description": "activate damage correction of ancient contigs using variant and consensus calling" + "description": "deactivate damage correction of ancient contigs using variant and consensus calling" }, "freebayes_ploidy": { "type": "integer", diff --git a/subworkflows/local/ancient_dna.nf b/subworkflows/local/ancient_dna.nf index c0b2af4c..de47e49b 100644 --- a/subworkflows/local/ancient_dna.nf +++ b/subworkflows/local/ancient_dna.nf @@ -16,11 +16,11 @@ workflow ANCIENT_DNA_ASSEMBLY_VALIDATION { PYDAMAGE_FILTER(PYDAMAGE_ANALYZE.out.csv) ch_versions = ch_versions.mix(PYDAMAGE_ANALYZE.out.versions.first()) - if ( !params.run_ancient_damagecorrection ) { + if ( params.skip_ancient_damagecorrection ) { ch_corrected_contigs = Channel.empty() } - if ( params.run_ancient_damagecorrection ) { + if ( !params.skip_ancient_damagecorrection ) { FAIDX(input.map { item -> [ item[0], item[1] ] }) freebayes_input = input.join(FAIDX.out.fai) // [val(meta), path(contigs), path(bam), path(bam_index), path(fai)] FREEBAYES (freebayes_input.map { item -> [item[0], item[2], item[3], [], [], []] }, From c91af905b1d653408d16691ee03ad5eba17ab2a1 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 16 Jun 2023 18:25:21 +0200 Subject: [PATCH 3/6] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d4af5dea..af014c0b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#422](https://github.com/nf-core/mag/pull/422) - Adds support for normalization of read depth with BBNorm (added by @erikrikarddaniel and @fabianegli) - [#439](https://github.com/nf-core/mag/pull/439) - Adds ability to enter the pipeline at the binning stage by providing a CSV of pre-computed assemblies (by @prototaxites) +- [#459](https://github.com/nf-core/mag/pull/459) - Adds ability to skip damage correction step in the ancient DNA workflow and just run pyDamage (by @jfy133) ### `Changed` From 937941243a453cbb900082cede779b844fc060d7 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Fri, 16 Jun 2023 18:43:45 +0200 Subject: [PATCH 4/6] Update nextflow.config --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 90aee053..3db1d045 100644 --- a/nextflow.config +++ b/nextflow.config @@ -62,7 +62,7 @@ params { // ancient DNA assembly validation options ancient_dna = false pydamage_accuracy = 0.5 - run_ancient_damagecorrection = false + skip_ancient_damagecorrection = false freebayes_ploidy = 1 freebayes_min_basequality = 20 freebayes_minallelefreq = 0.33 From 9fe3478af3e703126112319175b63b8b0476dc97 Mon Sep 17 00:00:00 2001 From: "James A. 
Fellows Yates" Date: Wed, 21 Jun 2023 08:44:22 +0200 Subject: [PATCH 5/6] Ensure pydamage results go in unique directory --- conf/modules.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index c1d4569e..83711049 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -521,7 +521,7 @@ process { withName: PYDAMAGE_ANALYZE { ext.prefix = { "${meta.assembler}-${meta.id}" } publishDir = [ - path: {"${params.outdir}/Ancient_DNA/pydamage/analyze/" }, + path: {"${params.outdir}/Ancient_DNA/pydamage/analyze/"${meta.assembler}-${meta.id}" }, mode: params.publish_dir_mode ] } @@ -530,7 +530,7 @@ process { ext.prefix = { "${meta.assembler}-${meta.id}" } ext.args = "-t ${params.pydamage_accuracy}" publishDir = [ - path: {"${params.outdir}/Ancient_DNA/pydamage/filter/" }, + path: {"${params.outdir}/Ancient_DNA/pydamage/filter/"${meta.assembler}-${meta.id}" }, mode: params.publish_dir_mode ] } From 91439581f372b61f1a3a8270e3e14e8026da2219 Mon Sep 17 00:00:00 2001 From: "James A. 
Fellows Yates" Date: Wed, 21 Jun 2023 08:55:01 +0200 Subject: [PATCH 6/6] Apply suggestions from code review --- conf/modules.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 83711049..b61e95ca 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -521,7 +521,7 @@ process { withName: PYDAMAGE_ANALYZE { ext.prefix = { "${meta.assembler}-${meta.id}" } publishDir = [ - path: {"${params.outdir}/Ancient_DNA/pydamage/analyze/"${meta.assembler}-${meta.id}" }, + path: {"${params.outdir}/Ancient_DNA/pydamage/analyze/${meta.assembler}-${meta.id}" }, mode: params.publish_dir_mode ] } @@ -530,7 +530,7 @@ process { ext.prefix = { "${meta.assembler}-${meta.id}" } ext.args = "-t ${params.pydamage_accuracy}" publishDir = [ - path: {"${params.outdir}/Ancient_DNA/pydamage/filter/"${meta.assembler}-${meta.id}" }, + path: {"${params.outdir}/Ancient_DNA/pydamage/filter/${meta.assembler}-${meta.id}" }, mode: params.publish_dir_mode ] }