From 7508612bc1c1511244d7101d274b016abcd01b58 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Wed, 20 Sep 2023 13:44:55 +0200 Subject: [PATCH 1/4] fix doc samplesheet file extension --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 48f0d4a8..d4f57207 100644 --- a/README.md +++ b/README.md @@ -52,9 +52,9 @@ In all cases, the assembly is assessed using [QUAST](http://bioinf.spbau.ru/quas First, prepare a samplesheet with your input data that looks as follows: -`samplesheet.tsv`: +`samplesheet.csv`: -```tsv +```csv ID R1 R2 LongFastQ Fast5 GenomeSize shortreads ./data/S1_R1.fastq.gz ./data/S1_R2.fastq.gz NA NA NA longreads NA NA ./data/S1_long_fastq.gz ./data/FAST5 2.8m @@ -67,13 +67,13 @@ Each row represents a fastq file (single-end) or a pair of fastq files (paired e Default: Short read assembly with Unicycler, `--kraken2db` can be any [compressed database (`.tar.gz`/`.tgz`)](https://benlangmead.github.io/aws-indexes/k2): ```console - nextflow run nf-core/bacass -profile --input samplesheet.tsv --kraken2db "https://genome-idx.s3.amazonaws.com/kraken/k2_standard_8gb_20210517.tar.gz" + nextflow run nf-core/bacass -profile --input samplesheet.csv --kraken2db "https://genome-idx.s3.amazonaws.com/kraken/k2_standard_8gb_20210517.tar.gz" ``` Long read assembly with Miniasm: ```console - nextflow run nf-core/bacass -profile --input samplesheet.tsv --assembly_type 'long' --assembler 'miniasm' --kraken2db "https://genome-idx.s3.amazonaws.com/kraken/k2_standard_8gb_20210517.tar.gz" + nextflow run nf-core/bacass -profile --input samplesheet.csv --assembly_type 'long' --assembler 'miniasm' --kraken2db "https://genome-idx.s3.amazonaws.com/kraken/k2_standard_8gb_20210517.tar.gz" ``` @@ -81,7 +81,7 @@ Default: Short read assembly with Unicycler, `--kraken2db` can be any [compresse ```bash nextflow run nf-core/bacass \ -profile \ - --input samplesheet.tsv \ + --input samplesheet.csv \ --outdir ``` From 
d88c7e17dd925169d3c802dfa0100934843398cf Mon Sep 17 00:00:00 2001 From: Dani VM Date: Thu, 21 Sep 2023 11:33:47 +0200 Subject: [PATCH 2/4] update access to test datasets --- README.md | 18 +++++++++--------- conf/test.config | 2 +- conf/test_dfast.config | 2 +- conf/test_full.config | 2 +- conf/test_hybrid.config | 2 +- conf/test_long.config | 2 +- conf/test_long_miniasm.config | 2 +- 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index d4f57207..46b21c62 100644 --- a/README.md +++ b/README.md @@ -52,13 +52,13 @@ In all cases, the assembly is assessed using [QUAST](http://bioinf.spbau.ru/quas First, prepare a samplesheet with your input data that looks as follows: -`samplesheet.csv`: +`samplesheet.tsv`: -```csv -ID R1 R2 LongFastQ Fast5 GenomeSize -shortreads ./data/S1_R1.fastq.gz ./data/S1_R2.fastq.gz NA NA NA -longreads NA NA ./data/S1_long_fastq.gz ./data/FAST5 2.8m -shortNlong ./data/S1_R1.fastq.gz ./data/S1_R2.fastq.gz ./data/S1_long_fastq.gz ./data/FAST5 2.8m +```tsv +ID R1 R2 LongFastQ Fast5 GenomeSize +shortreads ./data/S1_R1.fastq.gz ./data/S1_R2.fastq.gz NA NA NA +longreads NA NA ./data/S1_long_fastq.gz ./data/FAST5 2.8m +shortNlong ./data/S1_R1.fastq.gz ./data/S1_R2.fastq.gz ./data/S1_long_fastq.gz ./data/FAST5 2.8m ``` @@ -67,13 +67,13 @@ Each row represents a fastq file (single-end) or a pair of fastq files (paired e Default: Short read assembly with Unicycler, `--kraken2db` can be any [compressed database (`.tar.gz`/`.tgz`)](https://benlangmead.github.io/aws-indexes/k2): ```console - nextflow run nf-core/bacass -profile --input samplesheet.csv --kraken2db "https://genome-idx.s3.amazonaws.com/kraken/k2_standard_8gb_20210517.tar.gz" + nextflow run nf-core/bacass -profile --input samplesheet.tsv --kraken2db "https://genome-idx.s3.amazonaws.com/kraken/k2_standard_8gb_20210517.tar.gz" ``` Long read assembly with Miniasm: ```console - nextflow run nf-core/bacass -profile --input samplesheet.csv --assembly_type 'long' 
--assembler 'miniasm' --kraken2db "https://genome-idx.s3.amazonaws.com/kraken/k2_standard_8gb_20210517.tar.gz" + nextflow run nf-core/bacass -profile --input samplesheet.tsv --assembly_type 'long' --assembler 'miniasm' --kraken2db "https://genome-idx.s3.amazonaws.com/kraken/k2_standard_8gb_20210517.tar.gz" ``` @@ -81,7 +81,7 @@ Default: Short read assembly with Unicycler, `--kraken2db` can be any [compresse ```bash nextflow run nf-core/bacass \ -profile \ - --input samplesheet.csv \ + --input samplesheet.tsv \ --outdir ``` diff --git a/conf/test.config b/conf/test.config index be18c00e..c827fd2d 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,7 +20,7 @@ params { max_time = '6.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_short.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_short.tsv' // some extra args to speed tests up unicycler_args="--no_correct --no_pilon" diff --git a/conf/test_dfast.config b/conf/test_dfast.config index 2ab10265..b1b02c4b 100644 --- a/conf/test_dfast.config +++ b/conf/test_dfast.config @@ -20,7 +20,7 @@ params { max_time = 6.h // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_short.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_short.tsv' // some extra args to speed tests up unicycler_args="--no_correct --no_pilon" diff --git a/conf/test_full.config b/conf/test_full.config index e10e6a13..9432d763 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -15,6 +15,6 @@ params { config_profile_description = 'Full test dataset to check pipeline function' // Input data for full size test - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_full.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_full.tsv' kraken2db = 'https://genome-idx.s3.amazonaws.com/kraken/k2_standard_8gb_20210517.tar.gz' 
} diff --git a/conf/test_hybrid.config b/conf/test_hybrid.config index cd93e699..c27563a8 100644 --- a/conf/test_hybrid.config +++ b/conf/test_hybrid.config @@ -20,7 +20,7 @@ params { max_time = 6.h // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_hybrid.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_hybrid.tsv' // some extra args to speed tests up assembly_type='hybrid' diff --git a/conf/test_long.config b/conf/test_long.config index be225894..e722aae8 100644 --- a/conf/test_long.config +++ b/conf/test_long.config @@ -20,7 +20,7 @@ params { max_time = 6.h // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_long_miniasm.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_long_miniasm.tsv' // some extra args to speed tests up prokka_args = " --fast" diff --git a/conf/test_long_miniasm.config b/conf/test_long_miniasm.config index a68d3124..07af1a2c 100644 --- a/conf/test_long_miniasm.config +++ b/conf/test_long_miniasm.config @@ -20,7 +20,7 @@ params { max_time = 6.h // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_long_miniasm.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_long_miniasm.tsv' // some extra args to speed tests up prokka_args = " --fast" From d512b458f769dbc96014778d487bf95723513455 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Fri, 22 Sep 2023 11:39:21 +0200 Subject: [PATCH 3/4] add nf-validation on samplesheet --- assets/schema_input.json | 72 ++++++++++++++++++++++---- nextflow_schema.json | 67 +++++++++++++++++------- subworkflows/local/input_check.nf | 86 ------------------------------- workflows/bacass.nf | 52 +++++++++++++------ 4 files changed, 146 insertions(+), 131 deletions(-) delete mode 100644 subworkflows/local/input_check.nf diff --git a/assets/schema_input.json b/assets/schema_input.json index 
9146feaa..a34ad666 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -7,22 +7,76 @@ "items": { "type": "object", "properties": { - "sample": { + "ID": { "type": "string", "pattern": "^\\S+$", - "errorMessage": "Sample name must be provided and cannot contain spaces" + "unique": true, + "errorMessage": "Sample name must be provided and cannot contain spaces", + "meta": ["id"] }, - "fastq_1": { - "type": "string", - "pattern": "^\\S+\\.f(ast)?q\\.gz$", - "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" + "R1": { + "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'", + "anyOf": [ + { + "type": ["string", "null"], + "exists": true, + "pattern": "^(\\S+\\.f(ast)?q\\.gz|NA)$" + }, + { + "type": "string", + "maxLength": 0 + } + ] }, - "fastq_2": { + "R2": { "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'", "anyOf": [ + { + "type": ["string", "null"], + "exists": true, + "pattern": "^(\\S+\\.f(ast)?q\\.gz|NA)$" + }, { "type": "string", - "pattern": "^\\S+\\.f(ast)?q\\.gz$" + "maxLength": 0 + } + ] + }, + "LongFastQ": { + "errorMessage": "FastQ file for long reads cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'", + "anyOf": [ + { + "type": ["string", "null"], + "exists": true, + "pattern": "^(\\S+\\.f(ast)?q\\.gz|NA)$" + }, + { + "type": "string", + "maxLength": 0 + } + ] + }, + "Fast5": { + "errorMessage": "A valid path to Fast5 files. Example: ./data/FAST5", + "anyOf": [ + { + "type": ["string", "null"], + "format": "directory-path", + "exists": true, + "pattern": "^(\\/[\\S\\s]*|NA)$" + }, + { + "type": "string", + "maxLength": 0 + } + ] + }, + "GenomeSize": { + "errorMessage": "A number (including decimals) ending with 'm', representing genome size. 
No spaces allowed.", + "anyOf": [ + { + "type": ["string", "null"], + "pattern": "(\\b\\d+\\.\\d+m\\b|NA)" }, { "type": "string", @@ -31,6 +85,6 @@ ] } }, - "required": ["sample", "fastq_1"] + "required": ["ID"] } } diff --git a/nextflow_schema.json b/nextflow_schema.json index 119f5403..45cf2d48 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -14,13 +14,11 @@ "properties": { "input": { "type": "string", - "format": "file-path", - "exists": true, - "mimetype": "text/csv", - "pattern": "^\\S+\\.csv$", - "description": "Path to comma-separated file containing information about the samples in the experiment.", - "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a tab-separated file with 6 columns, and a header row. See [usage docs](https://nf-co.re/bacass/usage#samplesheet-input).\n\nFor example:\n\n`--input 'design_hybrid.csv'`\n\nAn example of properly formatted input files can be found at the [nf-core/test-datasets](https://github.com/nf-core/test-datasets/tree/bacass). \n\nFor example, this is the input used for a hybrid assembly in testing:\nID R1 R2 LongFastQ Fast5 GenomeSize\nERR044595 https://github.com/nf-core/test-datasets/raw/bacass/ERR044595_1M_1.fastq.gz https://github.com/nf-core/test-datasets/raw/bacass/ERR044595_1M_2.fastq.gz https://github.com/nf-core/test-datasets/raw/bacass/nanopore/subset15000.fq.gz NA 2.8m\n\n* `ID`: The identifier to use for handling the dataset e.g. sample name\n* `R1`: The forward reads in case of available short-read data\n* `R2`: The reverse reads in case of available short-read data\n* `LongFastQ`: The long read FastQ file with reads in FASTQ format\n* `Fast5`: The folder containing the basecalled fast5 files\n* `GenomeSize`: The expected genome size of the assembly. Only used by the canu assembler.\n\nMissing values (e.g. 
Fast5 folder in case of short reads) can be omitted by using a `NA` in the TSV file. The pipeline will handle such cases appropriately then.", - "fa_icon": "fas fa-file-csv" + "mimetype": "text/tsv", + "fa_icon": "fas fa-dna", + "description": "Path to tab-separated sample sheet", + "help_text": "Path to sample sheet, either tab-separated (.tsv), comma-separated (.csv), or in YAML format (.yml/.yaml), that points to compressed fastq files.\n\nThe sample sheet must have two to four tab-separated columns/entries with the following headers: \n- `sampleID` (required): Unique sample IDs, must start with a letter, and can only contain letters, numbers or underscores\n- `forwardReads` (required): Paths to (forward) reads zipped FastQ files\n- `reverseReads` (optional): Paths to reverse reads zipped FastQ files, required if the data is paired-end\n- `run` (optional): If the data was produced by multiple sequencing runs, any string\n\nRelated parameters are:\n- `--pacbio` and `--iontorrent` if the sequencing data is PacBio data or IonTorrent data (default expected: paired-end Illumina data)\n- `--single_end` if the sequencing data is single-ended Illumina data (default expected: paired-end Illumina data)\n- `--dada_ref_taxonomy`, `--qiime_ref_taxonomy`, and/or `--sintax_ref_taxonomy` to choose an appropriate reference taxonomy for the type of amplicon (16S/18S/ITS/CO1) (default: DADA2 assignTaxonomy and 16S rRNA sequence database)", + "schema": "assets/schema_input.json" }, "outdir": { "type": "string", @@ -52,8 +50,14 @@ "type": "boolean", "description": "save all merged reads to the a file ending in `*.merged.fastq.gz`" }, - "skip_fastqc": { "type": "boolean", "description": "Skip FastQC" }, - "skip_fastp": { "type": "boolean", "description": "Skip FastP" } + "skip_fastqc": { + "type": "boolean", + "description": "Skip FastQC" + }, + "skip_fastp": { + "type": "boolean", + "description": "Skip FastP" + } } }, "contamination_screening": { @@ -177,7 +181,10 @@ "fa_icon": 
"fas fa-forward", "description": "Skip polishing the long-read assembly with fast5 input. Will not affect short/hybrid assemblies." }, - "skip_multiqc": { "type": "boolean", "description": "Skip MultiQC" } + "skip_multiqc": { + "type": "boolean", + "description": "Skip MultiQC" + } } }, "institutional_config_options": { @@ -387,15 +394,35 @@ } }, "allOf": [ - { "$ref": "#/definitions/input_output_options" }, - { "$ref": "#/definitions/qc_and_trim" }, - { "$ref": "#/definitions/contamination_screening" }, - { "$ref": "#/definitions/assembly_parameters" }, - { "$ref": "#/definitions/assembly_polishing" }, - { "$ref": "#/definitions/annotation" }, - { "$ref": "#/definitions/skipping_options" }, - { "$ref": "#/definitions/institutional_config_options" }, - { "$ref": "#/definitions/max_job_request_options" }, - { "$ref": "#/definitions/generic_options" } + { + "$ref": "#/definitions/input_output_options" + }, + { + "$ref": "#/definitions/qc_and_trim" + }, + { + "$ref": "#/definitions/contamination_screening" + }, + { + "$ref": "#/definitions/assembly_parameters" + }, + { + "$ref": "#/definitions/assembly_polishing" + }, + { + "$ref": "#/definitions/annotation" + }, + { + "$ref": "#/definitions/skipping_options" + }, + { + "$ref": "#/definitions/institutional_config_options" + }, + { + "$ref": "#/definitions/max_job_request_options" + }, + { + "$ref": "#/definitions/generic_options" + } ] } diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf deleted file mode 100644 index eec3f75a..00000000 --- a/subworkflows/local/input_check.nf +++ /dev/null @@ -1,86 +0,0 @@ -// -// Check input samplesheet and get read channels -// - -params.options = [:] - -workflow INPUT_CHECK { - take: - samplesheet // file: /path/to/samplesheet.csv - - main: - Channel - .fromPath( samplesheet ) - .ifEmpty {exit 1, "Cannot find path file $samplesheet"} - .splitCsv ( header:true, sep:'\t' ) - .map { create_fastq_channels(it) } - .set { reads } - - // reconfigure 
channels - reads - .map { meta, reads, long_fastq, fast5 -> [ meta, reads ] } - .filter{ meta, reads -> reads != 'NA' } - .filter{ meta, reads -> reads[0] != 'NA' && reads[1] != 'NA' } - .set { shortreads } - reads - .map { meta, reads, long_fastq, fast5 -> [ meta, long_fastq ] } - .filter{ meta, long_fastq -> long_fastq != 'NA' } - .set { longreads } - reads - .map { meta, reads, long_fastq, fast5 -> [ meta, fast5 ] } - .filter{ meta, fast5 -> fast5 != 'NA' } - .set { fast5 } - - emit: - reads // channel: [ val(meta), [ reads ], long_fastq, fast5 ] - shortreads // channel: [ val(meta), [ reads ] ] - longreads // channel: [ val(meta), long_fastq ] - fast5 // channel: [ val(meta), fast5 ] -} - -// Function to get list of [ meta, [ fastq_1, fastq_2 ], long_fastq, fast5 ] -def create_fastq_channels(LinkedHashMap row) { - def meta = [:] - meta.id = row.ID - meta.single_end = false - meta.genome_size = row.GenomeSize == null ? 'NA' : row.GenomeSize - - def array = [] - // check short reads - if ( !(row.R1 == 'NA') ) { - if ( !file(row.R1).exists() ) { - exit 1, "ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${row.R1}" - } - fastq_1 = file(row.R1) - } else { fastq_1 = 'NA' } - if ( !(row.R2 == 'NA') ) { - if ( !file(row.R2).exists() ) { - exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.R2}" - } - fastq_2 = file(row.R2) - } else { fastq_2 = 'NA' } - - // check long_fastq - if ( !(row.LongFastQ == 'NA') ) { - if ( !file(row.LongFastQ).exists() ) { - exit 1, "ERROR: Please check input samplesheet -> Long FastQ file does not exist!\n${row.R1}" - } - long_fastq = file(row.LongFastQ) - } else { long_fastq = 'NA' } - - // check long_fastq - if ( !(row.Fast5 == 'NA') ) { - if ( !file(row.Fast5).exists() ) { - exit 1, "ERROR: Please check input samplesheet -> Fast5 file does not exist!\n${row.R1}" - } - fast5 = file(row.Fast5) - } else { fast5 = 'NA' } - - // prepare output // currently does not allow single 
end data! - if ( meta.single_end ) { - array = [ meta, fastq_1 , long_fastq, fast5 ] - } else { - array = [ meta, [ fastq_1, fastq_2 ], long_fastq, fast5 ] - } - return array -} diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 5b6c5eb0..1ae3cb3e 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -4,7 +4,7 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { paramsSummaryLog; paramsSummaryMap } from 'plugin/nf-validation' +include { paramsSummaryLog; paramsSummaryMap; fromSamplesheet } from 'plugin/nf-validation' def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) def citation = '\n' + WorkflowMain.citation(workflow) + '\n' @@ -25,9 +25,6 @@ WorkflowBacass.initialise(params, log) def checkPathParamList = [ params.input, params.multiqc_config, params.kraken2db, params.dfast_config ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } -// Check mandatory parameters -if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } - // Check krakendb if(! params.skip_kraken2){ if(params.kraken2db){ @@ -67,7 +64,6 @@ include { DFAST } from '../modules/local/dfast' // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { INPUT_CHECK } from '../subworkflows/local/input_check' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -117,18 +113,42 @@ workflow BACASS { // // SUBWORKFLOW: Read in samplesheet, validate and stage input files // - INPUT_CHECK ( - file(params.input) - ) - // TODO: OPTIONAL, you can use nf-validation plugin to create an input channel from the samplesheet with Channel.fromSamplesheet("input") - // See the documentation https://nextflow-io.github.io/nf-validation/samplesheets/fromSamplesheet/ - // ! 
There is currently no tooling to help you write a sample sheet schema + if (params.input) { + def criteria = multiMapCriteria { + meta, fastq_1, fastq_2, long_fastq, fast5, genome_size -> + shortreads: fastq_1 != 'NA' ? tuple(tuple(meta, [fastq_1, fastq_2])) : null + longreads: long_fastq != 'NA' ? tuple(meta, long_fastq) : null + fast5: fast5 != 'NA' ? tuple(meta, fast5) : null + } + + // See the documentation https://nextflow-io.github.io/nf-validation/samplesheets/fromSamplesheet/ + Channel + .fromSamplesheet('input') + .multiMap (criteria) + .set { ch_input } + + // reconfigure channels + ch_input + .shortreads + .filter{ it != null } + .set { ch_shortreads } + ch_input + .longreads + .filter{ it != null } + .set { ch_longreads } + ch_input + .fast5 + .filter{ it != null } + .set { ch_fast5 } + } else { + exit 1, 'Input samplesheet not specified!' + } // // SUBWORKFLOW: Short reads QC and trim adapters // FASTQ_TRIM_FASTP_FASTQC ( - INPUT_CHECK.out.shortreads, + ch_shortreads, [], params.save_trimmed_fail, params.save_merged, @@ -141,7 +161,7 @@ workflow BACASS { // MODULE: Nanoplot, quality check for nanopore reads and Quality/Length Plots // NANOPLOT ( - INPUT_CHECK.out.longreads + ch_longreads ) ch_versions = ch_versions.mix(NANOPLOT.out.versions.ifEmpty(null)) @@ -151,7 +171,7 @@ workflow BACASS { // TODO: Couldn't be tested. No configuration test available (lack of fast5 file or params.skip_pycoqc=false). 
if ( !params.skip_pycoqc ) { PYCOQC ( - INPUT_CHECK.out.fast5.dump(tag: 'fast5') + ch_fast5.dump(tag: 'fast5') ) versions = ch_versions.mix(PYCOQC.out.versions.ifEmpty(null)) } @@ -161,7 +181,7 @@ workflow BACASS { // if ( params.assembly_type == 'hybrid' || params.assembly_type == 'long' && !('short' in params.assembly_type) ) { PORECHOP_PORECHOP ( - INPUT_CHECK.out.longreads.dump(tag: 'longreads') + ch_longreads.dump(tag: 'longreads') ) ch_versions = ch_versions.mix( PORECHOP_PORECHOP.out.versions.ifEmpty(null) ) } @@ -295,7 +315,7 @@ workflow BACASS { ch_for_polish // tuple val(meta), val(reads), file(longreads), file(assembly) .join( MINIMAP2_POLISH.out.bam ) // tuple val(meta), file(bam) .join( SAMTOOLS_INDEX.out.bai ) // tuple val(meta), file(bai) - .join( INPUT_CHECK.out.fast5 ) // tuple val(meta), file(fast5) + .join( ch_fast5 ) // tuple val(meta), file(fast5) .set { ch_for_nanopolish } // tuple val(meta), val(reads), file(longreads), file(assembly), file(bam), file(bai), file(fast5) // TODO: 'nanopolish index' couldn't be tested. No fast5 provided in test datasets. 
From 8ba1df5ecc39e49012338710022c433a276b8688 Mon Sep 17 00:00:00 2001 From: Dani VM Date: Fri, 22 Sep 2023 16:12:03 +0200 Subject: [PATCH 4/4] add reviewer suggestions --- nextflow_schema.json | 3 ++- workflows/bacass.nf | 52 ++++++++++++++++++++------------------------ 2 files changed, 25 insertions(+), 30 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 45cf2d48..a8ccce98 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -14,10 +14,11 @@ "properties": { "input": { "type": "string", + "exists": true, "mimetype": "text/tsv", "fa_icon": "fas fa-dna", "description": "Path to tab-separated sample sheet", - "help_text": "Path to sample sheet, either tab-separated (.tsv), comma-separated (.csv), or in YAML format (.yml/.yaml), that points to compressed fastq files.\n\nThe sample sheet must have two to four tab-separated columns/entries with the following headers: \n- `sampleID` (required): Unique sample IDs, must start with a letter, and can only contain letters, numbers or underscores\n- `forwardReads` (required): Paths to (forward) reads zipped FastQ files\n- `reverseReads` (optional): Paths to reverse reads zipped FastQ files, required if the data is paired-end\n- `run` (optional): If the data was produced by multiple sequencing runs, any string\n\nRelated parameters are:\n- `--pacbio` and `--iontorrent` if the sequencing data is PacBio data or IonTorrent data (default expected: paired-end Illumina data)\n- `--single_end` if the sequencing data is single-ended Illumina data (default expected: paired-end Illumina data)\n- `--dada_ref_taxonomy`, `--qiime_ref_taxonomy`, and/or `--sintax_ref_taxonomy` to choose an appropriate reference taxonomy for the type of amplicon (16S/18S/ITS/CO1) (default: DADA2 assignTaxonomy and 16S rRNA sequence database)", + "help_text": "Path to sample sheet, either tab-separated (.tsv), comma-separated (.csv), or in YAML format (.yml/.yaml), that points to compressed fastq files.\n\nThe sample sheet 
must have six tab-separated columns/entries with the following headers: \n- `ID` (required): Unique sample IDs, cannot contain spaces\n- `R1` (optional): Paths to (forward) reads zipped FastQ files\n- `R2` (optional): Paths to reverse reads zipped FastQ files, required if the data is paired-end\n- `LongFastQ` (optional): Paths to long reads zipped FastQ files\n- `Fast5` (optional): Paths to the directory containing FAST5 files\n- `GenomeSize` (optional): A number (including decimals) ending with 'm', representing genome size.\n\n Please be aware that files will be required based on the chosen assembly type specified with the '--assembly_type' option, which can be set to one of the following values: ['short', 'long', 'hybrid'].", "schema": "assets/schema_input.json" }, "outdir": { diff --git a/workflows/bacass.nf b/workflows/bacass.nf index 1ae3cb3e..34bdf48f 100644 --- a/workflows/bacass.nf +++ b/workflows/bacass.nf @@ -113,36 +113,30 @@ workflow BACASS { // // SUBWORKFLOW: Read in samplesheet, validate and stage input files // - if (params.input) { - def criteria = multiMapCriteria { - meta, fastq_1, fastq_2, long_fastq, fast5, genome_size -> - shortreads: fastq_1 != 'NA' ? tuple(tuple(meta, [fastq_1, fastq_2])) : null - longreads: long_fastq != 'NA' ? tuple(meta, long_fastq) : null - fast5: fast5 != 'NA' ? tuple(meta, fast5) : null - } - - // See the documentation https://nextflow-io.github.io/nf-validation/samplesheets/fromSamplesheet/ - Channel - .fromSamplesheet('input') - .multiMap (criteria) - .set { ch_input } - - // reconfigure channels - ch_input - .shortreads - .filter{ it != null } - .set { ch_shortreads } - ch_input - .longreads - .filter{ it != null } - .set { ch_longreads } - ch_input - .fast5 - .filter{ it != null } - .set { ch_fast5 } - } else { - exit 1, 'Input samplesheet not specified!' 
+ def criteria = multiMapCriteria { + meta, fastq_1, fastq_2, long_fastq, fast5, genome_size -> + shortreads: fastq_1 != 'NA' ? tuple(tuple(meta, [fastq_1, fastq_2])) : null + longreads: long_fastq != 'NA' ? tuple(meta, long_fastq) : null + fast5: fast5 != 'NA' ? tuple(meta, fast5) : null } + // See the documentation https://nextflow-io.github.io/nf-validation/samplesheets/fromSamplesheet/ + Channel + .fromSamplesheet('input') + .multiMap (criteria) + .set { ch_input } + // reconfigure channels + ch_input + .shortreads + .filter{ it != null } + .set { ch_shortreads } + ch_input + .longreads + .filter{ it != null } + .set { ch_longreads } + ch_input + .fast5 + .filter{ it != null } + .set { ch_fast5 } // // SUBWORKFLOW: Short reads QC and trim adapters