diff --git a/CHANGELOG.md b/CHANGELOG.md index 9252bbf2..ab810c4b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#366](https://github.com/nf-core/mag/pull/366) - Added CAT_SUMMARISE process and cat_official_taxonomy parameter (by @prototaxites) - [#372](https://github.com/nf-core/mag/pull/372) - Allow CAT_DB to take an extracted database as well as a tar.gz file. - [#380](https://github.com/nf-core/mag/pull/380) - Added support for saving processed reads (clipped, host removed etc.) to results directory (by @jfy133) +- [#394](https://github.com/nf-core/mag/pull/394) - Added GUNC for additional chimeric bin/contamination QC (by @jfy133) ### `Changed` diff --git a/CITATIONS.md b/CITATIONS.md index 2ff9d7cf..09d165b3 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -62,6 +62,10 @@ > Chaumeil, P. A., Mussig, A. J., Hugenholtz, P., & Parks, D. H. (2020). GTDB-Tk: a toolkit to classify genomes with the Genome Taxonomy Database. Bioinformatics , 36(6), 1925–1927. doi: 10.1093/bioinformatics/btz848. +- [GUNC](https://doi.org/10.1186/s13059-021-02393-0) + + > Orakov, A., Fullam, A., Coelho, L. P., Khedkar, S., Szklarczyk, D., Mende, D. R., Schmidt, T. S. B., and Bork, P. 2021. “GUNC: Detection of Chimerism and Contamination in Prokaryotic Genomes.” Genome Biology 22 (1): 178. doi: 10.1186/s13059-021-02393-0. + - [Kraken2](https://doi.org/10.1186/s13059-019-1891-0) > Wood, D et al., 2019. Improved metagenomic analysis with Kraken 2. Genome Biology volume 20, Article number: 257. doi: 10.1186/s13059-019-1891-0. 
diff --git a/README.md b/README.md index a8b4eef4..c851487d 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ The pipeline then: - performs assembly using [MEGAHIT](https://github.com/voutcn/megahit) and [SPAdes](http://cab.spbu.ru/software/spades/), and checks their quality using [Quast](http://quast.sourceforge.net/quast) - (optionally) performs ancient DNA assembly validation using [PyDamage](https://github.com/maxibor/pydamage) and contig consensus sequence recalling with [Freebayes](https://github.com/freebayes/freebayes) and [BCFtools](http://samtools.github.io/bcftools/bcftools.html) - predicts protein-coding genes for the assemblies using [Prodigal](https://github.com/hyattpd/Prodigal) -- performs metagenome binning using [MetaBAT2](https://bitbucket.org/berkeleylab/metabat/src/master/), [MaxBin2](https://sourceforge.net/projects/maxbin2/), and/or with [CONCOCT](https://github.com/BinPro/CONCOCT), and checks the quality of the genome bins using [Busco](https://busco.ezlab.org/) or [CheckM](https://ecogenomics.github.io/CheckM/). +- performs metagenome binning using [MetaBAT2](https://bitbucket.org/berkeleylab/metabat/src/master/), [MaxBin2](https://sourceforge.net/projects/maxbin2/), and/or with [CONCOCT](https://github.com/BinPro/CONCOCT), and checks the quality of the genome bins using [Busco](https://busco.ezlab.org/), or [CheckM](https://ecogenomics.github.io/CheckM/), and optionally [GUNC](https://grp-bork.embl-community.io/gunc/). 
- optionally refines bins with [DAS Tool](https://github.com/cmks/DAS_Tool) - assigns taxonomy to bins using [GTDB-Tk](https://github.com/Ecogenomics/GTDBTk) and/or [CAT](https://github.com/dutilh/CAT) diff --git a/conf/modules.config b/conf/modules.config index 420ee35a..487e7c83 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -366,6 +366,33 @@ process { ] } + withName: 'GUNC_DOWNLOADDB' { + publishDir = [ + path: { "${params.outdir}/GenomeBinning/QC/GUNC" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: params.gunc_save_db + ] + } + + // Make sure to keep directory in sync with gunc_qc.nf + withName: 'GUNC_RUN' { + publishDir = [ + path: { "${params.outdir}/GenomeBinning/QC/GUNC/raw/${meta.assembler}-${meta.binner}-${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + // Make sure to keep directory in sync with gunc_qc.nf + withName: 'GUNC_MERGECHECKM' { + publishDir = [ + path: { "${params.outdir}/GenomeBinning/QC/GUNC/checkmmerged/${meta.assembler}-${meta.binner}-${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + withName: CAT_DB_GENERATE { publishDir = [ path: { "${params.outdir}/Taxonomy/CAT" }, diff --git a/docs/images/mag_workflow.png b/docs/images/mag_workflow.png index fca7bfd9..f476287a 100644 Binary files a/docs/images/mag_workflow.png and b/docs/images/mag_workflow.png differ diff --git a/docs/images/mag_workflow.svg b/docs/images/mag_workflow.svg index f4d3fb60..f847cec3 100644 --- a/docs/images/mag_workflow.svg +++ b/docs/images/mag_workflow.svg @@ -5,7 +5,7 @@ viewBox="0 0 320.14583 150.81249" version="1.1" id="svg8" - inkscape:version="1.2.1 (1:1.2.1+202210291243+9c6d41e410)" + inkscape:version="1.2.2 (1:1.2.2+202212051552+b0a8486541)" sodipodi:docname="mag_workflow.svg" inkscape:export-filename="mag_workflow.png" inkscape:export-xdpi="289.40701" @@ -289,7 +289,7 @@ y2="103.70081" gradientUnits="userSpaceOnUse" /> + transform="translate(-78.042851,154.76453)"> + transform="translate(143.1274,-254.71088)"> @@ -1476,7 +1476,7 @@ inkscape:export-ydpi="289.40701" inkscape:export-xdpi="289.40701" ry="1.528429" - y="73.379021" + y="62.795689" x="150.20613" height="24.352978" width="37.306007" @@ -1484,7 +1484,7 @@ style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.340915;stroke-linecap:square;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:normal" /> + transform="translate(-40.31832,141.06376)"> @@ -1549,7 +1549,7 @@ + transform="translate(1.0583333,-24.341666)"> + transform="translate(31.0114,40.710298)"> MaxBin2 + transform="translate(110.93518,-155.93399)"> + + + CONCOCT + Evaluation BUSCO CheckM GUNC + + + + QUAST v2.3.0 + transform="translate(-33.816969,120.10905)"> diff --git a/docs/output.md b/docs/output.md index d1daba7e..dbbddcf5 100644 --- a/docs/output.md +++ b/docs/output.md @@ -416,7 +416,9 @@ For each bin or refined bin the median sequencing depth is computed based on the -### QC for metagenome assembled genomes with BUSCO +### QC for metagenome 
assembled genomes + +#### BUSCO [BUSCO](https://busco.ezlab.org/) is a tool used to assess the completeness of a genome assembly. It is run on all the genome bins and high quality contigs obtained by the applied binning and/or binning refinement methods (depending on the `--postbinning_input` parameter). By default, BUSCO is run in automated lineage selection mode in which it first tries to select the domain and then a more specific lineage based on phylogenetic placement. If available, result files for both the selected domain lineage and the selected more specific lineage are placed in the output directory. If a lineage dataset is specified already with `--busco_reference`, only results for this specific lineage will be generated. @@ -455,7 +457,7 @@ Besides the reference files or output files created by BUSCO, the following summ -### QC for metagenome assembled genomes with CheckM +#### CheckM [CheckM](https://ecogenomics.github.io/CheckM/) CheckM provides a set of tools for assessing the quality of genomes recovered from isolates, single cells, or metagenomes. It provides robust estimates of genome completeness and contamination by using collocated sets of genes that are ubiquitous and single-copy within a phylogenetic lineage @@ -483,6 +485,28 @@ If the parameter `--save_checkm_reference` is set, additionally the used the Che +#### GUNC + +[Genome UNClutterer (GUNC)](https://grp-bork.embl-community.io/gunc/index.html) is a tool for detection of chimerism and contamination in prokaryotic genomes resulting from mis-binning of genomic contigs from unrelated lineages. It does so by applying an entropy-based score on taxonomic assignment and contig location of all genes in a genome. It is generally considered as an additional complement to CheckM results. + +
+Output files + +- `GenomeBinning/QC/gunc_summary.tsv` +- `GenomeBinning/QC/gunc_checkm_summary.tsv` +- `[gunc-database].dmnd` +- `GUNC/` + - `raw/` + - `[assembler]-[binner]-[sample/group]/GUNC.progenomes_2.1.maxCSS_level.tsv`: Per sample GUNC [output](https://grp-bork.embl-community.io/gunc/output.html) containing taxonomic and completeness QC statistics. + - `checkmmerged/` + - `[assembler]-[binner]-[sample/group]/GUNC_checkM.merged.tsv`: Per sample GUNC output merged with output from [CheckM](#checkm) + +
+ +GUNC will be run if specified with `--run_gunc` as a standalone, unless CheckM is also activated via `--binqc_tool 'checkm'`, in which case GUNC output will be merged with the CheckM output using `gunc merge_checkm`. + +If `--gunc_save_db` is specified, the output directory will also contain the requested database (progenomes, or GTDB) in DIAMOND format. + ## Taxonomic classification of binned genomes ### CAT diff --git a/modules.json b/modules.json index ae10c58c..ede87e6b 100644 --- a/modules.json +++ b/modules.json @@ -96,6 +96,21 @@ "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", "installed_by": ["modules"] }, + "gunc/downloaddb": { + "branch": "master", + "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "installed_by": ["modules"] + }, + "gunc/mergecheckm": { + "branch": "master", + "git_sha": "93f8308f6c1ef35b6b8cd264cefd22853fc51526", + "installed_by": ["modules"] + }, + "gunc/run": { + "branch": "master", + "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "installed_by": ["modules"] + }, "gunzip": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", diff --git a/modules/nf-core/gunc/downloaddb/main.nf b/modules/nf-core/gunc/downloaddb/main.nf new file mode 100644 index 00000000..1e77a4c6 --- /dev/null +++ b/modules/nf-core/gunc/downloaddb/main.nf @@ -0,0 +1,30 @@ +process GUNC_DOWNLOADDB { + tag "$db_name" + label 'process_single' + + conda "bioconda::gunc=1.0.5" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gunc:1.0.5--pyhdfd78af_0' : + 'quay.io/biocontainers/gunc:1.0.5--pyhdfd78af_0' }" + + input: + val db_name + + output: + path "*.dmnd" , emit: db + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + gunc download_db . 
-db $db_name $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunc: \$( gunc --version ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/gunc/downloaddb/meta.yml b/modules/nf-core/gunc/downloaddb/meta.yml new file mode 100644 index 00000000..c36ff3f3 --- /dev/null +++ b/modules/nf-core/gunc/downloaddb/meta.yml @@ -0,0 +1,36 @@ +name: gunc_downloaddb +description: Download database for GUNC detection of Chimerism and Contamination in Prokaryotic Genomes +keywords: + - download + - prokaryote + - assembly + - genome + - quality control + - chimeras +tools: + - gunc: + description: Python package for detection of chimerism and contamination in prokaryotic genomes. + homepage: https://grp-bork.embl-community.io/gunc/ + documentation: https://grp-bork.embl-community.io/gunc/ + tool_dev_url: https://github.com/grp-bork/gunc + doi: "10.1186/s13059-021-02393-0" + licence: ["GNU General Public v3 or later (GPL v3+)"] + +input: + - db_name: + type: string + description: "Which database to download. Options: progenomes or gtdb" + pattern: "progenomes|gtdb" + +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - db: + type: file + description: GUNC database file + pattern: "*.dmnd" + +authors: + - "@jfy133" diff --git a/modules/nf-core/gunc/mergecheckm/main.nf b/modules/nf-core/gunc/mergecheckm/main.nf new file mode 100644 index 00000000..a5c46aca --- /dev/null +++ b/modules/nf-core/gunc/mergecheckm/main.nf @@ -0,0 +1,36 @@ +process GUNC_MERGECHECKM { + tag "$meta.id" + label 'process_single' + + conda "bioconda::gunc=1.0.5" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/gunc:1.0.5--pyhdfd78af_0' : + 'quay.io/biocontainers/gunc:1.0.5--pyhdfd78af_0' }" + + input: + tuple val(meta), path(gunc_file), path(checkm_file) + + output: + tuple val(meta), path("*.tsv"), emit: tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + gunc \\ + merge_checkm \\ + $args \\ + -g $gunc_file \\ + -c $checkm_file \\ + -o . + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunc: \$( gunc --version ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/gunc/mergecheckm/meta.yml b/modules/nf-core/gunc/mergecheckm/meta.yml new file mode 100644 index 00000000..a88298f7 --- /dev/null +++ b/modules/nf-core/gunc/mergecheckm/meta.yml @@ -0,0 +1,52 @@ +name: "gunc_mergecheckm" +description: Merging of CheckM and GUNC results in one summary table +keywords: + - gunc + - checkm + - summary + - prokaryote + - assembly + - genome + - quality control + - chimeras +tools: + - gunc: + description: Python package for detection of chimerism and contamination in prokaryotic genomes. + homepage: https://grp-bork.embl-community.io/gunc/ + documentation: https://grp-bork.embl-community.io/gunc/ + tool_dev_url: https://github.com/grp-bork/gunc + doi: "10.1186/s13059-021-02393-0" + licence: ["GNU General Public v3 or later (GPL v3+)"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - gunc_file: + type: file + description: Path of a gunc_scores.tsv file (mandatory) + pattern: "*.{bam,cram,sam}" + - checkm_file: + type: file + description: Output TSV from CheckM qa (ideally with -o 2 extended format) (mandatory) + pattern: "*.{bam,cram,sam}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - tsv: + type: file + description: Merged checkm/gunc results in TSV format + pattern: "*.tsv" + +authors: + - "@jfy133" diff --git a/modules/nf-core/gunc/run/main.nf b/modules/nf-core/gunc/run/main.nf new file mode 100644 index 00000000..07511c51 --- /dev/null +++ b/modules/nf-core/gunc/run/main.nf @@ -0,0 +1,38 @@ +process GUNC_RUN { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::gunc=1.0.5" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gunc:1.0.5--pyhdfd78af_0' : + 'quay.io/biocontainers/gunc:1.0.5--pyhdfd78af_0' }" + + input: + tuple val(meta), path(fasta) + path(db) + + output: + tuple val(meta), path("*maxCSS_level.tsv") , emit: maxcss_level_tsv + tuple val(meta), path("*all_levels.tsv") , optional: true, emit: all_levels_tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + gunc \\ + run \\ + --input_fasta $fasta \\ + --db_file $db \\ + --threads $task.cpus \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunc: \$( gunc --version ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/gunc/run/meta.yml b/modules/nf-core/gunc/run/meta.yml new file mode 100644 index 00000000..3a85e1fb --- /dev/null +++ b/modules/nf-core/gunc/run/meta.yml @@ -0,0 +1,53 @@ +name: gunc_run +description: Detection of Chimerism and Contamination in Prokaryotic Genomes +keywords: + - prokaryote + - assembly + - genome + - quality control + - chimeras +tools: + - gunc: + description: Python package for detection of chimerism and contamination in prokaryotic genomes. 
+ homepage: https://grp-bork.embl-community.io/gunc/ + documentation: https://grp-bork.embl-community.io/gunc/ + tool_dev_url: https://github.com/grp-bork/gunc + doi: "10.1186/s13059-021-02393-0" + licence: ["GNU General Public v3 or later (GPL v3+)"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: FASTA file containing contig (bins) + pattern: "*.fa" + - db: + type: file + description: GUNC database file + pattern: "*.dmnd" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - maxcss_levels_tsv: + type: file + description: Output file with scores for a taxonomic level with the highest CSS score + pattern: "*.tsv" + - all_levels_tsv: + type: file + description: Optional output file with results for each taxonomic level + pattern: "*.tsv" + +authors: + - "@jfy133" diff --git a/nextflow.config b/nextflow.config index a39efba5..11f4add0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -110,6 +110,10 @@ params { busco_clean = false checkm_db = null save_checkm_data = false + run_gunc = false + gunc_database_type = 'progenomes' + gunc_db = null + gunc_save_db = false // Reproducibility options megahit_fix_cpu_1 = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 72ebe3c5..1a6c2d45 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -483,7 +483,7 @@ }, "gtdbtk_min_completeness": { "type": "number", - "default": 50.0, + "default": 50, "description": "Min. bin completeness (in %) required to apply GTDB-tk classification.", "help_text": "Completeness assessed with BUSCO analysis (100% - %Missing). Must be greater than 0 (min. 0.01) to avoid GTDB-tk errors. 
If too low, GTDB-tk classification results can be impaired due to not enough marker genes!", "minimum": 0.01, @@ -491,7 +491,7 @@ }, "gtdbtk_max_contamination": { "type": "number", - "default": 10.0, + "default": 10, "description": "Max. bin contamination (in %) allowed to apply GTDB-tk classification.", "help_text": "Contamination approximated based on BUSCO analysis (%Complete and duplicated). If too high, GTDB-tk classification results can be impaired due to contamination!", "minimum": 0, @@ -499,7 +499,7 @@ }, "gtdbtk_min_perc_aa": { "type": "number", - "default": 10.0, + "default": 10, "description": "Min. fraction of AA (in %) in the MSA for bins to be kept.", "minimum": 0, "maximum": 100 @@ -513,7 +513,7 @@ }, "gtdbtk_pplacer_cpus": { "type": "number", - "default": 1.0, + "default": 1, "description": "Number of CPUs used for the by GTDB-Tk run tool pplacer.", "help_text": "A low number of CPUs helps to reduce the memory required/reported by GTDB-Tk. See also the [GTDB-Tk documentation](https://ecogenomics.github.io/GTDBTk/faq.html#gtdb-tk-reaches-the-memory-limit-pplacer-crashes)." 
}, @@ -699,6 +699,26 @@ "description": "Specify which binning output is sent for downstream annotation, taxonomic classification, bin quality control etc.", "help_text": "`raw_bins_only`: only bins (and unbinned contigs) from the binners.\n`refined_bins_only`: only bins (and unbinned contigs) from the bin refinement step .\n`both`: bins and unbinned contigs from both the binning and bin refinement steps.", "enum": ["raw_bins_only", "refined_bins_only", "both"] + }, + "run_gunc": { + "type": "boolean", + "description": "Turn on GUNC genome chimerism checks" + }, + "gunc_db": { + "type": "string", + "default": "None", + "description": "Specify a path to a pre-downloaded GUNC dmnd database file" + }, + "gunc_database_type": { + "type": "string", + "default": "progenomes", + "description": "Specify which database to auto-download if not supplying own", + "enum": ["progenomes", "gtdb"] + }, + "gunc_save_db": { + "type": "boolean", + "description": "Save the used GUNC reference files downloaded when not using --gunc_db parameter.", + "help_text": "If specified, the corresponding DIAMOND file downloaded from the GUNC server will be stored in your output directory alongside your GUNC results." 
} } }, diff --git a/subworkflows/local/checkm_qc.nf b/subworkflows/local/checkm_qc.nf index c7d57cb4..5fd3efe8 100644 --- a/subworkflows/local/checkm_qc.nf +++ b/subworkflows/local/checkm_qc.nf @@ -23,7 +23,7 @@ workflow CHECKM_QC { } CHECKM_LINEAGEWF ( ch_bins_for_checkmlineagewf.reads, ch_bins_for_checkmlineagewf.ext, checkm_db ) - ch_versions = ch_versions.mix(CHECKM_LINEAGEWF.out.versions) + ch_versions = ch_versions.mix(CHECKM_LINEAGEWF.out.versions.first()) ch_checkmqa_input = CHECKM_LINEAGEWF.out.checkm_output .join(CHECKM_LINEAGEWF.out.marker_file) @@ -33,7 +33,7 @@ workflow CHECKM_QC { } CHECKM_QA ( ch_checkmqa_input, [] ) - ch_versions = ch_versions.mix(CHECKM_QA.out.versions) + ch_versions = ch_versions.mix(CHECKM_QA.out.versions.first()) // TODO Check output files published correctly diff --git a/subworkflows/local/gunc_qc.nf b/subworkflows/local/gunc_qc.nf new file mode 100644 index 00000000..82d621bb --- /dev/null +++ b/subworkflows/local/gunc_qc.nf @@ -0,0 +1,51 @@ +/* + * GUNC: Detection and quantification of genome chimerism based on lineage homogeneity + */ + +include { GUNC_DOWNLOADDB } from '../../modules/nf-core/gunc/downloaddb/main' +include { GUNC_RUN } from '../../modules/nf-core/gunc/run/main' +include { GUNC_MERGECHECKM } from '../../modules/nf-core/gunc/mergecheckm/main' + +workflow GUNC_QC { + take: + ch_bins // [ [ meta] , fasta ], input bins (mandatory) + ch_gunc_db // [ db ], presupplied GUNC database (optional) + ch_checkm_table // [ [ meta ], checkm_qa_table ], extended checkm table from CHECKM_QA, (optional) + + main: + ch_versions = Channel.empty() + + if ( params.gunc_db ) { + ch_db_for_gunc = ch_gunc_db + } else { + ch_db_for_gunc = GUNC_DOWNLOADDB( params.gunc_database_type ).db + ch_versions = ch_versions.mix( GUNC_DOWNLOADDB.out.versions ) // mix() returns a new channel; without reassignment the versions are dropped + } + + + GUNC_RUN ( ch_bins, ch_db_for_gunc ) + ch_versions = ch_versions.mix( GUNC_RUN.out.versions.first() ) // 
first(): keep a single versions.yml, as done in checkm_qc.nf + + // Summary of all GUNC_RUN tables; keep directory in sync with conf/modules.config + GUNC_RUN.out.maxcss_level_tsv + .map{it[1]} + 
.collectFile(name: "gunc_summary.tsv", keepHeader: true, storeDir: "${params.outdir}/GenomeBinning/QC/") + + if ( params.binqc_tool == 'checkm' ) { + + ch_input_to_mergecheckm = GUNC_RUN.out.maxcss_level_tsv + .combine(ch_checkm_table, by: 0) + + GUNC_MERGECHECKM ( ch_input_to_mergecheckm ) + ch_versions = ch_versions.mix( GUNC_MERGECHECKM.out.versions.first() ) // reassign so the versions reach the emitted channel + + // Summary of all merged GUNC/CheckM tables; keep directory in sync with conf/modules.config + GUNC_MERGECHECKM.out.tsv + .map{it[1]} + .collectFile(name: "gunc_checkm_summary.tsv", keepHeader: true, storeDir: "${params.outdir}/GenomeBinning/QC/") + } + + emit: + versions = ch_versions + +} diff --git a/workflows/mag.nf b/workflows/mag.nf index 87a99405..ad9a01c6 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -97,6 +97,7 @@ include { BINNING } from '../subworkflows/local/binning' include { BINNING_REFINEMENT } from '../subworkflows/local/binning_refinement' include { BUSCO_QC } from '../subworkflows/local/busco_qc' include { CHECKM_QC } from '../subworkflows/local/checkm_qc' +include { GUNC_QC } from '../subworkflows/local/gunc_qc' include { GTDBTK } from '../subworkflows/local/gtdbtk' include { ANCIENT_DNA_ASSEMBLY_VALIDATION } from '../subworkflows/local/ancient_dna' @@ -154,6 +155,12 @@ if(params.checkm_db) { ch_checkm_db = file(params.checkm_db, checkIfExists: true) } +if (params.gunc_db) { + ch_gunc_db = file(params.gunc_db, checkIfExists: true) +} else { + ch_gunc_db = Channel.empty() +} + if(params.centrifuge_db){ ch_centrifuge_db_file = Channel .value(file( "${params.centrifuge_db}" )) @@ -599,7 +606,7 @@ workflow MAG { } /* - * Bin QC subworkflows: for checking bin completeness with either BUSCO or CHECKM + * Bin QC subworkflows: for checking bin completeness with either BUSCO, CHECKM, and/or GUNC */ // Results in: [ [meta], path_to_bin.fa ] @@ -635,11 +642,17 @@ workflow MAG { ch_checkm_summary = CHECKM_QC.out.summary // TODO custom 
output parsing? Add to MultiQC? 
- - ch_versions = ch_versions.mix(CHECKM_QC.out.versions.first()) + ch_versions = ch_versions.mix(CHECKM_QC.out.versions) } + if ( params.run_gunc && params.binqc_tool == 'checkm' ) { + GUNC_QC ( ch_input_bins_for_qc, ch_gunc_db, CHECKM_QC.out.checkm_tsv ) + ch_versions = ch_versions.mix( GUNC_QC.out.versions ) + } else if ( params.run_gunc ) { + GUNC_QC ( ch_input_bins_for_qc, ch_gunc_db, [] ) + ch_versions = ch_versions.mix( GUNC_QC.out.versions ) + } ch_quast_bins_summary = Channel.empty() if (!params.skip_quast){