From a4f42ef7032a36a337f7799986c501c87b7cf464 Mon Sep 17 00:00:00 2001
From: "Diego Alvarez S."
Date: Thu, 31 Oct 2024 21:08:59 -0300
Subject: [PATCH] Improve GUNC execution

Make COMBINE_TSV generic (input `tsv_files`, default prefix
`tsv_summary_combined`) and include it a second time as COMBINE_GUNC_TSV.

In BIN_QC, give every bin its own `id` before GUNC_RUN while keeping the
original id in `_id`, then restore the original id and group the GUNC
tables before merging them with the CheckM output.

---
Review notes (ignored by `git am`, not part of the commit message):

- Line breaks, diff markers and indentation were rebuilt from a flattened
  copy of this patch. Hunk line counts are consistent, but column alignment
  inside lines is a best guess: apply with `git apply --ignore-whitespace`
  or regenerate with `git format-patch`. The post-image blob id on the
  bin_qc.nf `index` line is stale because `+` lines were corrected.
- Groovy `Map.plus` gives precedence to the right-hand map, so the id
  overrides must be the right operand: `meta + [id: ..., _id: ...]` and
  `meta - meta.subMap(['_id']) + [id: meta._id]`.
- The GUNC/CheckM merge now reads COMBINE_GUNC_TSV.out.combined instead of
  COMBINE_BINQC_TSV.out.combined, and the result of `ch_versions.mix(...)`
  is assigned back to `ch_versions`.
- NOTE(review): COMBINE_TSV declares a bare `path` input and a bare
  `path("*.tsv")` output, while COMBINE_GUNC_TSV is fed `[meta, [tsv, ...]]`
  tuples and its output is combined with the CheckM output `by: 0`.
  Confirm how `meta` is meant to be carried through the module.

 modules/local/combine_tsv.nf |  6 ++---
 subworkflows/local/bin_qc.nf | 48 ++++++++++++++++++++++-------------
 2 files changed, 33 insertions(+), 21 deletions(-)

diff --git a/modules/local/combine_tsv.nf b/modules/local/combine_tsv.nf
index 1fe7ec1a..a719b909 100644
--- a/modules/local/combine_tsv.nf
+++ b/modules/local/combine_tsv.nf
@@ -7,16 +7,16 @@ process COMBINE_TSV {
         'biocontainers/bioawk:1.0--hed695b0_5' }"
 
     input:
-    path(bin_summaries, stageAs: "bin_summaries/*.tsv")
+    path(tsv_files, stageAs: "tsv_files/*.tsv")
 
     output:
     path("*.tsv")      , emit: combined
     path "versions.yml", emit: versions
 
     script:
-    def prefix = task.ext.prefix ?: "bin_depths_summary_combined"
+    def prefix = task.ext.prefix ?: "tsv_summary_combined"
     """
-    bioawk '(NR == 1) || (FNR > 1)' ${bin_summaries} > ${prefix}.tsv
+    bioawk '(NR == 1) || (FNR > 1)' ${tsv_files} > ${prefix}.tsv
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/subworkflows/local/bin_qc.nf b/subworkflows/local/bin_qc.nf
index b87e7998..88cb72b2 100644
--- a/subworkflows/local/bin_qc.nf
+++ b/subworkflows/local/bin_qc.nf
@@ -10,6 +10,7 @@ include { CHECKM_QA                        } from '../../modules/nf-core/checkm
 include { CHECKM_LINEAGEWF                 } from '../../modules/nf-core/checkm/lineagewf/main'
 include { CHECKM2_PREDICT                  } from '../../modules/nf-core/checkm2/predict/main'
 include { COMBINE_TSV as COMBINE_BINQC_TSV } from '../../modules/local/combine_tsv'
+include { COMBINE_TSV as COMBINE_GUNC_TSV  } from '../../modules/local/combine_tsv'
 include { GUNC_DOWNLOADDB                  } from '../../modules/nf-core/gunc/downloaddb/main'
 include { GUNC_RUN                         } from '../../modules/nf-core/gunc/run/main'
 include { GUNC_MERGECHECKM                 } from '../../modules/nf-core/gunc/mergecheckm/main'
@@ -17,11 +18,11 @@ include { GUNC_MERGECHECKM                 } from '../../modules/nf-core/gunc/m
 
 workflow BIN_QC {
     take:
-    ch_bins       // [ [ meta] , fasta           ], input bins (mandatory)
-    ch_checkm_db  // [ db                        ], presupplied CheckM database (optional)
-    ch_checkm2_db // [ [meta] , db               ], presupplied CheckM2 database (optional)
-    ch_busco_db   // [ db                        ], presupplied BUSCO database (optional)
-    ch_gunc_db    // [ db                        ], presupplied GUNC database (optional)
+    ch_bins       // [ [ meta] , fasta ], input bins (mandatory)
+    ch_checkm_db  // [ db              ], presupplied CheckM database (optional)
+    ch_checkm2_db // [ [meta] , db     ], presupplied CheckM2 database (optional)
+    ch_busco_db   // [ db              ], presupplied BUSCO database (optional)
+    ch_gunc_db    // [ db              ], presupplied GUNC database (optional)
 
     main:
     qc_summary = []
@@ -81,7 +82,7 @@ workflow BIN_QC {
         )
 
         ch_multiqc_files = ch_multiqc_files.mix(
-            BUSCO.out.summary_domain.mix(BUSCO.out.summary_specific).map{ it[1] }
+            BUSCO.out.summary_domain.mix(BUSCO.out.summary_specific).map { it[1] }
         )
         qc_summary = BUSCO_SUMMARY.out.summary
         ch_versions = ch_versions.mix(BUSCO.out.versions.first())
@@ -93,11 +94,12 @@ workflow BIN_QC {
         ch_bins_for_checkmlineagewf = ch_input_bins_for_qc
             .groupTuple()
             .filter { meta, _bins ->
-                    meta.domain != "eukarya"
-                }
+                meta.domain != "eukarya"
+            }
             .multiMap { meta, fa ->
                 reads: [meta, fa]
-                ext: fa.extension.unique().join("") // the pipeline ensures that all bins will have the same extension
+                // the pipeline ensures that all bins will have the same extension
+                ext: fa.extension.unique().join("")
             }
 
         CHECKM_LINEAGEWF(ch_bins_for_checkmlineagewf.reads, ch_bins_for_checkmlineagewf.ext, ch_checkm_db)
@@ -137,10 +139,11 @@ workflow BIN_QC {
                 meta.domain != "eukarya"
             }
             .flatMap { meta, bins ->
-                bins.collect { bin -> [meta, bin] }
+                // Set ID per bin, but save original ID for merging with CheckM output
+                bins.collect { bin -> [meta + [id: bin.baseName, _id: meta.id], bin] }
             }
 
-        if ( params.gunc_db ) {
+        if (params.gunc_db) {
             ch_db_for_gunc = ch_gunc_db
         }
         else {
@@ -153,23 +156,32 @@ workflow BIN_QC {
 
         // Make sure to keep directory in sync with modules.conf
         GUNC_RUN.out.maxcss_level_tsv
-            .map{it[1]}
+            .map { it[1] }
             .collectFile(name: "gunc_summary.tsv", keepHeader: true, storeDir: "${params.outdir}/GenomeBinning/QC/")
 
-        if ( params.binqc_tool == 'checkm' ) {
-            ch_input_to_mergecheckm = GUNC_RUN.out.maxcss_level_tsv.combine(CHECKM_QA.out.output, by: 0)
+        if (params.binqc_tool == 'checkm') {
+            ch_input_to_gunc_combine = GUNC_RUN.out.maxcss_level_tsv
+                .map { meta, output ->
+                    // restore original ID to combine with CheckM output
+                    [meta - meta.subMap(['_id']) + [id: meta._id], output]
+                }
+                .groupTuple()
+
+            COMBINE_GUNC_TSV(ch_input_to_gunc_combine)
 
-            GUNC_MERGECHECKM(ch_input_to_mergecheckm)
-            ch_versions.mix(GUNC_MERGECHECKM.out.versions)
+            ch_input_to_mergegunccheckm = COMBINE_GUNC_TSV.out.combined
+                .combine(CHECKM_QA.out.output, by: 0)
+
+            GUNC_MERGECHECKM(ch_input_to_mergegunccheckm)
+            ch_versions = ch_versions.mix(GUNC_MERGECHECKM.out.versions)
 
             // Make sure to keep directory in sync with modules.conf
             GUNC_MERGECHECKM.out.tsv
-                .map{it[1]}
+                .map { it[1] }
                 .collectFile(name: "gunc_checkm_summary.tsv", keepHeader: true, storeDir: "${params.outdir}/GenomeBinning/QC/")
         }
     }
 
-
     emit:
     qc_summary    = qc_summary
     multiqc_files = ch_multiqc_files