nf-core · Daniel-VM · Sep 23, 2023 · Sep 20, 2023 · Sep 21, 2023 · Sep 22, 2023
diff --git a/README.md b/README.md
@@ -55,10 +55,10 @@ First, prepare a samplesheet with your input data that looks as follows:
 `samplesheet.tsv`:
 
 ```tsv
-ID    R1                            R2                            LongFastQ                Fast5              GenomeSize
-shortreads    ./data/S1_R1.fastq.gz    ./data/S1_R2.fastq.gz    NA                    NA                    NA
-longreads      NA                        NA                        ./data/S1_long_fastq.gz    ./data/FAST5    2.8m
-shortNlong    ./data/S1_R1.fastq.gz    ./data/S1_R2.fastq.gz    ./data/S1_long_fastq.gz    ./data/FAST5    2.8m
+ID      R1                            R2                            LongFastQ                    Fast5    GenomeSize
+shortreads      ./data/S1_R1.fastq.gz       ./data/S1_R2.fastq.gz       NA                            NA      NA
+longreads       NA                          NA                          ./data/S1_long_fastq.gz      ./data/FAST5  2.8m
+shortNlong      ./data/S1_R1.fastq.gz       ./data/S1_R2.fastq.gz       ./data/S1_long_fastq.gz      ./data/FAST5  2.8m
 
 ```
 

diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -7,22 +7,76 @@
     "items": {
         "type": "object",
         "properties": {
-            "sample": {
+            "ID": {
                 "type": "string",
                 "pattern": "^\\S+$",
-                "errorMessage": "Sample name must be provided and cannot contain spaces"
+                "unique": true,
+                "errorMessage": "Sample name must be provided and cannot contain spaces",
+                "meta": ["id"]
             },
-            "fastq_1": {
-                "type": "string",
-                "pattern": "^\\S+\\.f(ast)?q\\.gz$",
-                "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
+            "R1": {
+                "errorMessage": "FastQ file for reads 1 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'",
+                "anyOf": [
+                    {
+                        "type": ["string", "null"],
+                        "exists": true,
+                        "pattern": "^(\\S+\\.f(ast)?q\\.gz|NA)$"
+                    },
+                    {
+                        "type": "string",
+                        "maxLength": 0
+                    }
+                ]
             },
-            "fastq_2": {
+            "R2": {
                 "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'",
                 "anyOf": [
+                    {
+                        "type": ["string", "null"],
+                        "exists": true,
+                        "pattern": "^(\\S+\\.f(ast)?q\\.gz|NA)$"
+                    },
                     {
                         "type": "string",
-                        "pattern": "^\\S+\\.f(ast)?q\\.gz$"
+                        "maxLength": 0
+                    }
+                ]
+            },
+            "LongFastQ": {
+                "errorMessage": "FastQ file for long reads cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'",
+                "anyOf": [
+                    {
+                        "type": ["string", "null"],
+                        "exists": true,
+                        "pattern": "^(\\S+\\.f(ast)?q\\.gz|NA)$"
+                    },
+                    {
+                        "type": "string",
+                        "maxLength": 0
+                    }
+                ]
+            },
+            "Fast5": {
+                "errorMessage": "A valid path to Fast5 files. Example: ./data/FAST5",
+                "anyOf": [
+                    {
+                        "type": ["string", "null"],
+                        "format": "directory-path",
+                        "exists": true,
+                        "pattern": "^(\\.{0,2}\\/[\\S\\s]*|NA)$"
+                    },
+                    {
+                        "type": "string",
+                        "maxLength": 0
+                    }
+                ]
+            },
+            "GenomeSize": {
+                "errorMessage": "A number (including decimals) ending with 'm', representing genome size. No spaces allowed.",
+                "anyOf": [
+                    {
+                        "type": ["string", "null"],
+                        "pattern": "^(\\d+(\\.\\d+)?m|NA)$"
                     },
                     {
                         "type": "string",
@@ -31,6 +85,6 @@
                 ]
             }
         },
-        "required": ["sample", "fastq_1"]
+        "required": ["ID"]
     }
 }
diff --git a/conf/test.config b/conf/test.config
@@ -20,7 +20,7 @@ params {
     max_time   = '6.h'
 
     // Input data
-    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_short.csv'
+    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_short.tsv'
 
     // some extra args to speed tests up
     unicycler_args="--no_correct --no_pilon"

diff --git a/conf/test_dfast.config b/conf/test_dfast.config
@@ -20,7 +20,7 @@ params {
     max_time   = 6.h
 
     // Input data
-    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_short.csv'
+    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_short.tsv'
 
     // some extra args to speed tests up
     unicycler_args="--no_correct --no_pilon"

diff --git a/conf/test_full.config b/conf/test_full.config
@@ -15,6 +15,6 @@ params {
     config_profile_description = 'Full test dataset to check pipeline function'
 
     // Input data for full size test
-    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_full.csv'
+    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_full.tsv'
     kraken2db = 'https://genome-idx.s3.amazonaws.com/kraken/k2_standard_8gb_20210517.tar.gz'
 }
diff --git a/conf/test_hybrid.config b/conf/test_hybrid.config
@@ -20,7 +20,7 @@ params {
     max_time   = 6.h
 
     // Input data
-    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_hybrid.csv'
+    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_hybrid.tsv'
 
     // some extra args to speed tests up
     assembly_type='hybrid'

diff --git a/conf/test_long.config b/conf/test_long.config
@@ -20,7 +20,7 @@ params {
     max_time   = 6.h
 
     // Input data
-    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_long_miniasm.csv'
+    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_long_miniasm.tsv'
 
     // some extra args to speed tests up
     prokka_args = " --fast"

diff --git a/conf/test_long_miniasm.config b/conf/test_long_miniasm.config
@@ -20,7 +20,7 @@ params {
     max_time   = 6.h
 
     // Input data
-    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_long_miniasm.csv'
+    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_long_miniasm.tsv'
 
     // some extra args to speed tests up
     prokka_args = " --fast"

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -14,13 +14,11 @@
             "properties": {
                 "input": {
                     "type": "string",
-                    "format": "file-path",
-                    "exists": true,
-                    "mimetype": "text/csv",
-                    "pattern": "^\\S+\\.csv$",
-                    "description": "Path to comma-separated file containing information about the samples in the experiment.",
-                    "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a tab-separated file with 6 columns, and a header row. See [usage docs](https://nf-co.re/bacass/usage#samplesheet-input).\n\nFor example:\n\n`--input 'design_hybrid.csv'`\n\nAn example of properly formatted input files can be found at the [nf-core/test-datasets](https://github.com/nf-core/test-datasets/tree/bacass). \n\nFor example, this is the input used for a hybrid assembly in testing:\nID R1 R2 LongFastQ Fast5 GenomeSize\nERR044595 https://github.com/nf-core/test-datasets/raw/bacass/ERR044595_1M_1.fastq.gz https://github.com/nf-core/test-datasets/raw/bacass/ERR044595_1M_2.fastq.gz https://github.com/nf-core/test-datasets/raw/bacass/nanopore/subset15000.fq.gz NA 2.8m\n\n* `ID`: The identifier to use for handling the dataset e.g. sample name\n* `R1`: The forward reads in case of available short-read data\n* `R2`: The reverse reads in case of available short-read data\n* `LongFastQ`: The long read FastQ file with reads in FASTQ format\n* `Fast5`: The folder containing the basecalled fast5 files\n* `GenomeSize`: The expected genome size of the assembly. Only used by the canu assembler.\n\nMissing values (e.g. Fast5 folder in case of short reads) can be omitted by using a `NA` in the TSV file. The pipeline will handle such cases appropriately then.",
-                    "fa_icon": "fas fa-file-csv"
+                    "mimetype": "text/tsv",
+                    "fa_icon": "fas fa-dna",
+                    "description": "Path to tab-separated sample sheet",
+                    "help_text": "You will need to create a samplesheet with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a tab-separated file with 6 columns, and a header row. See [usage docs](https://nf-co.re/bacass/usage#samplesheet-input).\n\nFor example:\n\n`--input 'design_hybrid.tsv'`\n\nAn example of properly formatted input files can be found at the [nf-core/test-datasets](https://github.com/nf-core/test-datasets/tree/bacass).\n\nFor example, this is the input used for a hybrid assembly in testing:\nID R1 R2 LongFastQ Fast5 GenomeSize\nERR044595 https://github.com/nf-core/test-datasets/raw/bacass/ERR044595_1M_1.fastq.gz https://github.com/nf-core/test-datasets/raw/bacass/ERR044595_1M_2.fastq.gz https://github.com/nf-core/test-datasets/raw/bacass/nanopore/subset15000.fq.gz NA 2.8m\n\n* `ID`: The identifier to use for handling the dataset e.g. sample name\n* `R1`: The forward reads in case of available short-read data\n* `R2`: The reverse reads in case of available short-read data\n* `LongFastQ`: The long read FastQ file with reads in FASTQ format\n* `Fast5`: The folder containing the basecalled fast5 files\n* `GenomeSize`: The expected genome size of the assembly. Only used by the canu assembler.\n\nMissing values (e.g. Fast5 folder in case of short reads) can be omitted by using a `NA` in the TSV file. The pipeline will handle such cases appropriately then.",
+                    "schema": "assets/schema_input.json"
                 },
                 "outdir": {
                     "type": "string",
@@ -52,8 +50,14 @@
                     "type": "boolean",
                     "description": "save all merged reads to the a file ending in `*.merged.fastq.gz`"
                 },
-                "skip_fastqc": { "type": "boolean", "description": "Skip FastQC" },
-                "skip_fastp": { "type": "boolean", "description": "Skip FastP" }
+                "skip_fastqc": {
+                    "type": "boolean",
+                    "description": "Skip FastQC"
+                },
+                "skip_fastp": {
+                    "type": "boolean",
+                    "description": "Skip FastP"
+                }
             }
         },
         "contamination_screening": {
@@ -177,7 +181,10 @@
                     "fa_icon": "fas fa-forward",
                     "description": "Skip polishing the long-read assembly with fast5 input. Will not affect short/hybrid assemblies."
                 },
-                "skip_multiqc": { "type": "boolean", "description": "Skip MultiQC" }
+                "skip_multiqc": {
+                    "type": "boolean",
+                    "description": "Skip MultiQC"
+                }
             }
         },
         "institutional_config_options": {
@@ -387,15 +394,35 @@
         }
     },
     "allOf": [
-        { "$ref": "#/definitions/input_output_options" },
-        { "$ref": "#/definitions/qc_and_trim" },
-        { "$ref": "#/definitions/contamination_screening" },
-        { "$ref": "#/definitions/assembly_parameters" },
-        { "$ref": "#/definitions/assembly_polishing" },
-        { "$ref": "#/definitions/annotation" },
-        { "$ref": "#/definitions/skipping_options" },
-        { "$ref": "#/definitions/institutional_config_options" },
-        { "$ref": "#/definitions/max_job_request_options" },
-        { "$ref": "#/definitions/generic_options" }
+        {
+            "$ref": "#/definitions/input_output_options"
+        },
+        {
+            "$ref": "#/definitions/qc_and_trim"
+        },
+        {
+            "$ref": "#/definitions/contamination_screening"
+        },
+        {
+            "$ref": "#/definitions/assembly_parameters"
+        },
+        {
+            "$ref": "#/definitions/assembly_polishing"
+        },
+        {
+            "$ref": "#/definitions/annotation"
+        },
+        {
+            "$ref": "#/definitions/skipping_options"
+        },
+        {
+            "$ref": "#/definitions/institutional_config_options"
+        },
+        {
+            "$ref": "#/definitions/max_job_request_options"
+        },
+        {
+            "$ref": "#/definitions/generic_options"
+        }
     ]
 }
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf