From bab8e0a2018692ceb5448ab428f86caeb889c67b Mon Sep 17 00:00:00 2001
From: Jonas Scheid <43858870+jonasscheid@users.noreply.github.com>
Date: Fri, 1 Dec 2023 16:22:55 +0100
Subject: [PATCH 01/13] allow vcfgz again

---
 bin/check_samplesheet.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index 97f66c6..614b807 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -23,7 +23,7 @@ class RowChecker:
 
     """
 
-    VALID_FORMATS = (".tsv", ".fasta", ".vcf", "GSvar")
+    VALID_FORMATS = (".tsv", ".fasta", ".vcf", ".vcf.gz" ".GSvar")
 
     def __init__(
         self,
@@ -134,7 +134,7 @@ def _validate_unique_sample(self):
 
 def get_file_type(file):
     """Read file extension and return file type"""
-    extension = file.split(".")[-1]
+    extension = ''.join(Path(file).suffixes
     # check input file is empty
     # it needs to be distinguished if there's a given local file or internet address
     if str(file).startswith("http"):
@@ -148,13 +148,13 @@ def get_file_type(file):
             raise AssertionError(f"Input file {file} is empty.")
 
     try:
-        if extension == "vcf.gz":
+        if ".vcf.gz" in extension:
             file_type = "compressed_variant"
-        elif extension == "vcf":
+        elif extension == ".vcf":
             file_type = "variant"
-        elif extension == "fasta":
+        elif extension == ".fasta":
             file_type = "protein"
-        elif extension in ["tsv", "GSvar"]:
+        elif extension in [".tsv", ".GSvar"]:
             # Check if the file is a variant annotation file or a peptide file
             header_columns = [col.strip() for col in file[0].split("\t")]
 

From 202fa4416e973e80a736ed9aeb906b0e5fef6e9e Mon Sep 17 00:00:00 2001
From: jonasscheid <jonas.scheid@uni-tuebingen.de>
Date: Mon, 4 Dec 2023 13:13:26 +0000
Subject: [PATCH 02/13] fix compressed vcf parsing, refactored allele checking

---
 bin/check_samplesheet.py | 65 ++++++++++++++++++----------------------
 1 file changed, 29 insertions(+), 36 deletions(-)

diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index 614b807..069ec61 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -23,7 +23,7 @@ class RowChecker:
 
     """
 
-    VALID_FORMATS = (".tsv", ".fasta", ".vcf", ".vcf.gz" ".GSvar")
+    VALID_FORMATS = (".tsv", ".fasta", ".vcf", ".vcf.gz", ".GSvar")
 
     def __init__(
         self,
@@ -63,8 +63,8 @@ def validate(self, row):
         """
         self._validate_row_length(row)
         self._validate_sample(row)
-        self._validate_allele(row)
         self._validate_mhc_class(row)
+        self._validate_allele(row)
         self._validate_file(row[self._filename_col])
         self._seen.add(
             (row[self._sample_col], row[self._alleles_col], row[self._mhc_class_col], row[self._filename_col])
@@ -80,26 +80,20 @@ def _validate_sample(self, row):
 
     def _validate_allele(self, row):
         """Assert that the alleles have the right format."""
-        valid_class1_loci = ["A*", "B*", "C*", "E*", "G*"]
-        valid_class2_loci = ["DR", "DP", "DQ"]
-
         if len(row[self._alleles_col]) <= 0:
             raise AssertionError(f"No alleles specified.\nLine: {row}")
-        if (
-            not os.path.isfile(row[self._alleles_col])
-            and (
-                row[self._mhc_class_col] == "I"
-                and any(substring in row[self._alleles_col] for substring in valid_class2_loci)
-            )
-            or (
-                row[self._mhc_class_col] == "II"
-                and any(substring in row[self._alleles_col] for substring in valid_class1_loci)
-            )
-        ):
-            raise AssertionError(
-                f"Samplesheet contains invalid mhc class and allele combination!\nLine: {row} \
-                                      \nValid loci: {valid_class1_loci if row[self._mhc_class_col] == 'I' else valid_class2_loci}"
-            )
+        if os.path.isfile(row[self._alleles_col]):
+            logging.info(f"Alleles file found: {row[self._alleles_col]}. Attempting to read file.")
+            try:
+                with open(row[self._alleles_col], "r") as f:
+                    alleles = f.readlines()[0]
+            except Exception as e:
+                raise AssertionError(f"Error with reading alleles file: {e}. Check correct format for input file {row[self._alleles_col]} in documentation.")
+        else:
+            alleles = row[self._alleles_col]
+
+        if not all([check_allele_nomenclature(allele, row[self._mhc_class_col]) for allele in alleles.split(";")]):
+                raise AssertionError(f"Alleles {alleles} of MHC-class {row[self._mhc_class_col]} don't have the right format. \nLine: {row}. See the documentation for more information.")
 
     def _validate_mhc_class(self, row):
         """Assert that the mhc_class has the right format."""
@@ -134,22 +128,10 @@ def _validate_unique_sample(self):
 
 def get_file_type(file):
     """Read file extension and return file type"""
-    extension = ''.join(Path(file).suffixes
-    # check input file is empty
-    # it needs to be distinguished if there's a given local file or internet address
-    if str(file).startswith("http"):
-        with urllib.request.urlopen(file) as response:
-            file = response.read().decode("utf-8").split("\n")
-            if len(file) == 0:
-                raise AssertionError(f"Input file {file} is empty.")
-    else:
-        file = open(file, "r").readlines()
-        if file == 0:
-            raise AssertionError(f"Input file {file} is empty.")
-
+    extension = ''.join(Path(file).suffixes)
     try:
         if ".vcf.gz" in extension:
-            file_type = "compressed_variant"
+            file_type = "variant_compressed"
         elif extension == ".vcf":
             file_type = "variant"
         elif extension == ".fasta":
@@ -203,8 +185,19 @@ def parse_args(argv=None):
     return parser.parse_args(argv)
 
 
-def check_allele_nomenclature(allele):
-    pattern = re.compile("(^[A-Z][\*][0-9][0-9][:][0-9][0-9])$")
+def check_allele_nomenclature(allele, mhc_class) -> bool:
+    allele = allele.replace('HLA-','')
+    if mhc_class == 'I':
+        pattern = re.compile("(^[A-E]{1}[\*][0-9]{2}[:][0-9]{2})$")
+    elif mhc_class == 'II':
+        # Check if allele is from two chains
+        if allele.contains("-"):
+            pattern = re.compile("(^(DR|DP|DQ){1}(A|B){1}[0-9]{1}[\*][0-9]{2}[:][0-9]{2}[-](DR|DP|DQ){1}(A|B){1}[0-9]{1}[\*][0-9]{2}[:][0-9]{2})$")
+        else:
+            pattern = re.compile("(^(DR|DP|DQ){1}(A|B){1}[0-9]{1}[\*][0-9]{2}[:][0-9]{2})$")
+    else: # Mouse
+        pattern = re.compile("(^[H]{1}[-][2]{1}[-][A-Za-z]{2,3})$")
+
     return pattern.match(allele) is not None
 
 

From 2dd79dd330e6b25bece7a85b456ecb0ce86fd844 Mon Sep 17 00:00:00 2001
From: jonasscheid <jonas.scheid@uni-tuebingen.de>
Date: Mon, 4 Dec 2023 14:16:26 +0000
Subject: [PATCH 03/13] bump python version of samplechecker

---
 modules/local/samplesheet_check.nf | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf
index ae63801..d1c48c3 100644
--- a/modules/local/samplesheet_check.nf
+++ b/modules/local/samplesheet_check.nf
@@ -2,10 +2,10 @@ process SAMPLESHEET_CHECK {
     tag "$samplesheet"
     label 'process_single'
 
-    conda "conda-forge::python=3.8.3"
+    conda "conda-forge::python=3.10.2"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/python:3.8.3' :
-        'biocontainers/python:3.8.3' }"
+        'https://depot.galaxyproject.org/singularity/python:3.10.2' :
+        'biocontainers/python:3.10.2' }"
 
     input:
     path samplesheet

From d04e77f6cbeb47f632d7ec8eff61248655f9a8b3 Mon Sep 17 00:00:00 2001
From: jonasscheid <jonas.scheid@uni-tuebingen.de>
Date: Mon, 4 Dec 2023 14:17:17 +0000
Subject: [PATCH 04/13] add staging in samplesheet checker to determine
 inputtype

---
 bin/check_samplesheet.py | 59 +++++++++++++++++++++-------------------
 1 file changed, 31 insertions(+), 28 deletions(-)

diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index 069ec61..d176a94 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -11,6 +11,7 @@
 import csv
 from pathlib import Path
 import urllib.request
+from urllib.parse import urlparse
 
 
 class RowChecker:
@@ -82,17 +83,8 @@ def _validate_allele(self, row):
         """Assert that the alleles have the right format."""
         if len(row[self._alleles_col]) <= 0:
             raise AssertionError(f"No alleles specified.\nLine: {row}")
-        if os.path.isfile(row[self._alleles_col]):
-            logging.info(f"Alleles file found: {row[self._alleles_col]}. Attempting to read file.")
-            try:
-                with open(row[self._alleles_col], "r") as f:
-                    alleles = f.readlines()[0]
-            except Exception as e:
-                raise AssertionError(f"Error with reading alleles file: {e}. Check correct format for input file {row[self._alleles_col]} in documentation.")
-        else:
-            alleles = row[self._alleles_col]
-
-        if not all([check_allele_nomenclature(allele, row[self._mhc_class_col]) for allele in alleles.split(";")]):
+        alleles = read_source(row[self._alleles_col])
+        if not all([self._check_allele_nomenclature(allele, row[self._mhc_class_col]) for allele in alleles.split(";")]):
                 raise AssertionError(f"Alleles {alleles} of MHC-class {row[self._mhc_class_col]} don't have the right format. \nLine: {row}. See the documentation for more information.")
 
     def _validate_mhc_class(self, row):
@@ -125,6 +117,32 @@ def _validate_unique_sample(self):
         if len(set(sample_names)) != len(sample_names):
             raise AssertionError(f"Duplicate sample name found: {self.rows[-1]}")
 
+    def _check_allele_nomenclature(self, allele, mhc_class) -> bool:
+        allele = allele.replace('HLA-','')
+        if mhc_class == 'I':
+            pattern = re.compile("(^[A-E]{1}[\*][0-9]{2}[:][0-9]{2})$")
+        elif mhc_class == 'II':
+            # Check if allele is from two chains
+            if allele.contains("-"):
+                pattern = re.compile("(^(DR|DP|DQ){1}(A|B){1}[0-9]{1}[\*][0-9]{2}[:][0-9]{2}[-](DR|DP|DQ){1}(A|B){1}[0-9]{1}[\*][0-9]{2}[:][0-9]{2})$")
+            else:
+                pattern = re.compile("(^(DR|DP|DQ){1}(A|B){1}[0-9]{1}[\*][0-9]{2}[:][0-9]{2})$")
+        else: # Mouse
+            pattern = re.compile("(^[H]{1}[-][2]{1}[-][A-Za-z]{2,3})$")
+
+        return pattern.match(allele) is not None
+
+
+def read_source(source):
+    """Read the alleles from a file/url or return the string."""
+    if urlparse(source).scheme in ('http', 'https'):
+        with urllib.request.urlopen(source) as response:
+            return response.read().decode('utf-8').splitlines()[0]
+    elif os.path.isfile(source):
+        with open(source, "r") as f:
+            return f.readlines()[0]
+    else:
+        return source
 
 def get_file_type(file):
     """Read file extension and return file type"""
@@ -138,7 +156,8 @@ def get_file_type(file):
             file_type = "protein"
         elif extension in [".tsv", ".GSvar"]:
             # Check if the file is a variant annotation file or a peptide file
-            header_columns = [col.strip() for col in file[0].split("\t")]
+            header = read_source(file)
+            header_columns = [col.strip() for col in header.split("\t")]
 
             required_variant_columns = ["#chr", "start", "end"]
 
@@ -185,22 +204,6 @@ def parse_args(argv=None):
     return parser.parse_args(argv)
 
 
-def check_allele_nomenclature(allele, mhc_class) -> bool:
-    allele = allele.replace('HLA-','')
-    if mhc_class == 'I':
-        pattern = re.compile("(^[A-E]{1}[\*][0-9]{2}[:][0-9]{2})$")
-    elif mhc_class == 'II':
-        # Check if allele is from two chains
-        if allele.contains("-"):
-            pattern = re.compile("(^(DR|DP|DQ){1}(A|B){1}[0-9]{1}[\*][0-9]{2}[:][0-9]{2}[-](DR|DP|DQ){1}(A|B){1}[0-9]{1}[\*][0-9]{2}[:][0-9]{2})$")
-        else:
-            pattern = re.compile("(^(DR|DP|DQ){1}(A|B){1}[0-9]{1}[\*][0-9]{2}[:][0-9]{2})$")
-    else: # Mouse
-        pattern = re.compile("(^[H]{1}[-][2]{1}[-][A-Za-z]{2,3})$")
-
-    return pattern.match(allele) is not None
-
-
 def make_dir(path):
     if len(path) > 0:
         try:

From 7868e5ce7e5813dc53261f44b3ef78bb49e0258d Mon Sep 17 00:00:00 2001
From: jonasscheid <jonas.scheid@uni-tuebingen.de>
Date: Tue, 5 Dec 2023 10:28:54 +0000
Subject: [PATCH 05/13] switch to nf-validation

---
 assets/schema_input.json           |  22 ++-
 bin/check_samplesheet.py           | 290 -----------------------------
 modules/local/samplesheet_check.nf |  31 ---
 nextflow_schema.json               |  14 +-
 subworkflows/local/input_check.nf  |  61 ------
 workflows/epitopeprediction.nf     |  58 +++---
 6 files changed, 57 insertions(+), 419 deletions(-)
 delete mode 100755 bin/check_samplesheet.py
 delete mode 100644 modules/local/samplesheet_check.nf
 delete mode 100644 subworkflows/local/input_check.nf

diff --git a/assets/schema_input.json b/assets/schema_input.json
index b564a5c..3c6bdc4 100644
--- a/assets/schema_input.json
+++ b/assets/schema_input.json
@@ -20,20 +20,32 @@
                     },
                     {
                         "type": "string",
-                        "pattern": "(^[A-Z][*][0-9][0-9][:][0-9][0-9])$"
+                        "pattern": "^([A-E]{1}[*][0-9]{2}[:][0-9]{2})(;[A-E]{1}[*][0-9]{2}[:][0-9]{2})*$"
+                    },
+                    {
+                        "type": "string",
+                        "pattern": "^((DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2})(;(DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2})*$"
+                    },
+                    {
+                        "type": "string",
+                        "pattern": "^((DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2}[-](DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2})(;((DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2}[-](DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2}))*$"
+                    },
+                    {
+                        "type":"string",
+                        "pattern": "^[H][-][2][-][A-Za-z]{2,3}$"
                     }
                 ],
-                "errorMessage": "Alleles must be provided as string or file with extension '.txt''"
+                "errorMessage": "Alleles must be provided as string or file with extension '.txt'. Please check the documentation for more information."
             },
             "mhc_class": {
                 "type": "string",
                 "pattern": "^(I|II|H-2)$",
-                "errorMessage": "The MHC class must be provided. Valid values: "
+                "errorMessage": "The MHC class must be provided. Valid values: 'I', 'II' or 'H-2'"
             },
             "filename": {
                 "type": "string",
-                "pattern": "^\\S+\\.(vcf|tsv|fasta|fa|txt)$",
-                "errorMessage": "Variants/proteins/peptides for sample must be provided and have one of the following extensions:  '.vcf', '.tsv', '.fasta', '.fa', '.txt'"
+                "pattern": "^\\S+\\.(vcf|vcf\\.gz|tsv|fasta|fa|txt)$",
+                "errorMessage": "Variants/proteins/peptides for sample must be provided and have one of the following extensions: '.vcf', '.vcf.gz', '.tsv', '.fasta', '.fa', '.txt'"
             }
         },
         "required": ["sample", "alleles", "mhc_class", "filename"]
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
deleted file mode 100755
index d176a94..0000000
--- a/bin/check_samplesheet.py
+++ /dev/null
@@ -1,290 +0,0 @@
-#!/usr/bin/env python
-# Written by Jonas Scheid, Christopher Mohr and released under the MIT license.
-
-import argparse
-import logging
-import os
-import re
-import sys
-import errno
-import re
-import csv
-from pathlib import Path
-import urllib.request
-from urllib.parse import urlparse
-
-
-class RowChecker:
-    """
-    Define a service that can validate and transform each given row.
-
-    Attributes:
-        rows (list): A list of dicts, where each dict corresponds to a previously
-            validated and transformed row. The order of rows is maintained.
-
-    """
-
-    VALID_FORMATS = (".tsv", ".fasta", ".vcf", ".vcf.gz", ".GSvar")
-
-    def __init__(
-        self,
-        sample_col=0,
-        alleles_col=1,
-        mhc_class_col=2,
-        filename_col=3,
-        **kwargs,
-    ):
-        """
-        Initialize the row checker with the expected column names.
-
-        Args:
-            sample_col (str): The name of the column that contains the sample name
-                (default "sample").
-            alleles_col (str): The name of the column that contains the MHC alleles.
-            mhc_class_col (str): The name of the column that contains the MHC class.
-            filename_col (str): The name of the column that contains the filename.
-
-        """
-        super().__init__(**kwargs)
-        self._sample_col = sample_col
-        self._alleles_col = alleles_col
-        self._mhc_class_col = mhc_class_col
-        self._filename_col = filename_col
-        self._seen = set()
-        self.rows = []
-
-    def validate(self, row):
-        """
-        Perform all validations on the given row.
-
-        Args:
-            row (dict): A mapping from column headers (keys) to elements of that row
-                (values).
-
-        """
-        self._validate_row_length(row)
-        self._validate_sample(row)
-        self._validate_mhc_class(row)
-        self._validate_allele(row)
-        self._validate_file(row[self._filename_col])
-        self._seen.add(
-            (row[self._sample_col], row[self._alleles_col], row[self._mhc_class_col], row[self._filename_col])
-        )
-        self.rows.append(row)
-        self._validate_unique_row()
-        self._validate_unique_sample()
-
-    def _validate_sample(self, row):
-        """Assert that the sample name exists and convert spaces to underscores."""
-        if len(row[self._sample_col]) <= 0:
-            raise AssertionError(f"Sample input is required.\nLine: {row}")
-
-    def _validate_allele(self, row):
-        """Assert that the alleles have the right format."""
-        if len(row[self._alleles_col]) <= 0:
-            raise AssertionError(f"No alleles specified.\nLine: {row}")
-        alleles = read_source(row[self._alleles_col])
-        if not all([self._check_allele_nomenclature(allele, row[self._mhc_class_col]) for allele in alleles.split(";")]):
-                raise AssertionError(f"Alleles {alleles} of MHC-class {row[self._mhc_class_col]} don't have the right format. \nLine: {row}. See the documentation for more information.")
-
-    def _validate_mhc_class(self, row):
-        """Assert that the mhc_class has the right format."""
-        valid_classes = ["I", "II", "H-2"]
-        if row[self._mhc_class_col] not in valid_classes:
-            raise AssertionError(f"MHC class must be one of: {valid_classes}\nLine: {row}")
-
-    def _validate_file(self, filename):
-        """Assert that a given filename has one of the expected FASTQ extensions."""
-        if not any(filename.endswith(extension) for extension in self.VALID_FORMATS):
-            raise AssertionError(
-                f"The input file has an unrecognized extension: {filename}\n"
-                f"It should be one of: {', '.join(self.VALID_FORMATS)}"
-            )
-
-    def _validate_row_length(self, row):
-        """Assert the row length."""
-        if len(row) != 4:
-            raise AssertionError(f"Invalid row length: {len(row)}\nLine: {row}.")
-
-    def _validate_unique_row(self):
-        """Assert that the combination of sample name, alleles, mhc_class and filename is unique."""
-        if len(self._seen) != len(self.rows) and len(self.rows) > 1:
-            raise AssertionError(f"Duplicate row found: {self.rows[-1]}")
-
-    def _validate_unique_sample(self):
-        """Assert that the combination sample names are unique."""
-        sample_names = [row[self._sample_col] for row in self.rows]
-        if len(set(sample_names)) != len(sample_names):
-            raise AssertionError(f"Duplicate sample name found: {self.rows[-1]}")
-
-    def _check_allele_nomenclature(self, allele, mhc_class) -> bool:
-        allele = allele.replace('HLA-','')
-        if mhc_class == 'I':
-            pattern = re.compile("(^[A-E]{1}[\*][0-9]{2}[:][0-9]{2})$")
-        elif mhc_class == 'II':
-            # Check if allele is from two chains
-            if allele.contains("-"):
-                pattern = re.compile("(^(DR|DP|DQ){1}(A|B){1}[0-9]{1}[\*][0-9]{2}[:][0-9]{2}[-](DR|DP|DQ){1}(A|B){1}[0-9]{1}[\*][0-9]{2}[:][0-9]{2})$")
-            else:
-                pattern = re.compile("(^(DR|DP|DQ){1}(A|B){1}[0-9]{1}[\*][0-9]{2}[:][0-9]{2})$")
-        else: # Mouse
-            pattern = re.compile("(^[H]{1}[-][2]{1}[-][A-Za-z]{2,3})$")
-
-        return pattern.match(allele) is not None
-
-
-def read_source(source):
-    """Read the alleles from a file/url or return the string."""
-    if urlparse(source).scheme in ('http', 'https'):
-        with urllib.request.urlopen(source) as response:
-            return response.read().decode('utf-8').splitlines()[0]
-    elif os.path.isfile(source):
-        with open(source, "r") as f:
-            return f.readlines()[0]
-    else:
-        return source
-
-def get_file_type(file):
-    """Read file extension and return file type"""
-    extension = ''.join(Path(file).suffixes)
-    try:
-        if ".vcf.gz" in extension:
-            file_type = "variant_compressed"
-        elif extension == ".vcf":
-            file_type = "variant"
-        elif extension == ".fasta":
-            file_type = "protein"
-        elif extension in [".tsv", ".GSvar"]:
-            # Check if the file is a variant annotation file or a peptide file
-            header = read_source(file)
-            header_columns = [col.strip() for col in header.split("\t")]
-
-            required_variant_columns = ["#chr", "start", "end"]
-
-            file_type = "peptide"
-
-            if all(col in header_columns for col in required_variant_columns):
-                file_type = "variant"
-            elif "sequence" not in header_columns:
-                raise AssertionError("Peptide input file does not contain mandatory column 'sequence'")
-
-        return file_type
-
-    except Exception as e:
-        raise AssertionError(
-            f"Error with checking samplesheet: {e}. Check correct format for input file {file} in documentation."
-        )
-
-
-def parse_args(argv=None):
-    """Define and immediately parse command line arguments."""
-    parser = argparse.ArgumentParser(
-        description="Validate and transform a tabular samplesheet.",
-        epilog="Example: python check_samplesheet.py samplesheet.csv samplesheet.valid.csv",
-    )
-    parser.add_argument(
-        "file_in",
-        metavar="FILE_IN",
-        type=Path,
-        help="Tabular input samplesheet in CSV or TSV format.",
-    )
-    parser.add_argument(
-        "file_out",
-        metavar="FILE_OUT",
-        type=Path,
-        help="Transformed output samplesheet in CSV format.",
-    )
-    parser.add_argument(
-        "-l",
-        "--log-level",
-        help="The desired log level (default WARNING).",
-        choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"),
-        default="WARNING",
-    )
-    return parser.parse_args(argv)
-
-
-def make_dir(path):
-    if len(path) > 0:
-        try:
-            os.makedirs(path)
-        except OSError as exception:
-            if exception.errno != errno.EEXIST:
-                raise exception
-
-
-def check_samplesheet(file_in, file_out):
-    """
-    sample,alleles,mhc_class,filename
-    GBM_1,A*01:01;A*02:01;B*07:02;B*24:02;C*03:01;C*04:01,I,gbm_1_anno.vcf|gbm_1_peps.tsv|gbm_1_prot.fasta
-    GBM_2,A*02:01;A*24:01;B*07:02;B*08:01;C*04:01;C*07:01,I,gbm_2_anno.vcf|gbm_2_peps.tsv|gbm_2_prot.fasta
-
-    or
-
-    sample,alleles,mhc_class,filename
-    GBM_1,gbm_1_alleles.txt,I,gbm_1_anno.vcf|gbm_1_peps.tsv|gbm_1_prot.fasta
-    GBM_2,gbm_2_alleles.txt,I,gbm_2_anno.vcf|gbm_2_peps.tsv|gbm_2_prot.fasta
-
-
-    where the FileName column contains EITHER a vcf/tsv file with genomic variants, a tsv file (peptides), or a fasta file (proteins)
-    and the Alleles column contains EITHER a string of alleles separated by semicolon or the path to a text file
-    containing one allele per line (no header)
-
-    Further examples:
-    - Class2 allele format => https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/alleles/alleles.DRB1_01_01.txt
-    - Mouse allele format => https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/alleles/alleles.H2.txt
-    - Peptide format => https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/peptides/peptides.tsv
-    - Variant TSV format => https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/variants/variants.tsv
-    - Variant VCF format => https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/variants/variants.vcf
-    """
-
-    with open(
-        file_in,
-        newline="",
-    ) as samplesheet:
-        reader = csv.reader(samplesheet)
-
-        ## Check header
-        valid_header = ["sample", "alleles", "mhc_class", "filename"]
-        header = [x.strip('"') for x in samplesheet.readline().strip().split(",")]
-        if len(header) != 4:
-            raise ValueError(
-                f"Invalid number of header columns! Make sure the samplesheet is properly comma-separated."
-            )
-        elif header != valid_header:
-            raise AssertionError(f"Invalid samplesheet header (valid = {valid_header})!")
-
-        ## Check samplesheet entries
-        checker = RowChecker()
-        rows = []
-        for i, row in enumerate(reader):
-            checker.validate(row)
-            # here an allele check with mhcgnomes would be suitable
-            row.append(get_file_type(row[3]))
-            rows.append(row)
-
-        if len(checker.rows) == 0:
-            raise AssertionError("Samplesheet contains no entries!")
-
-        ## Write validated samplesheet with appropriate columns
-        out_dir = os.path.dirname(file_out)
-        make_dir(out_dir)
-        with open(file_out, "w") as fout:
-            valid_header.append("inputtype")
-            fout.write(",".join(valid_header) + "\n")
-            for row in rows:
-                fout.write(",".join(row) + "\n")
-
-
-def main(argv=None):
-    """Coordinate argument parsing and program execution."""
-    args = parse_args(argv)
-    logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s")
-    if not args.file_in.is_file():
-        raise AssertionError(f"The given input file {args.file_in} does not exist!")
-    args.file_out.parent.mkdir(parents=True, exist_ok=True)
-    check_samplesheet(args.file_in, args.file_out)
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf
deleted file mode 100644
index d1c48c3..0000000
--- a/modules/local/samplesheet_check.nf
+++ /dev/null
@@ -1,31 +0,0 @@
-process SAMPLESHEET_CHECK {
-    tag "$samplesheet"
-    label 'process_single'
-
-    conda "conda-forge::python=3.10.2"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/python:3.10.2' :
-        'biocontainers/python:3.10.2' }"
-
-    input:
-    path samplesheet
-
-    output:
-    path '*.csv'       , emit: csv
-    path "versions.yml", emit: versions
-
-    when:
-    task.ext.when == null || task.ext.when
-
-    script: // This script is bundled with the pipeline, in nf-core/epitopeprediction/bin/
-    """
-    check_samplesheet.py \\
-        $samplesheet \\
-        samplesheet.valid.csv
-
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        python: \$(python --version | sed 's/Python //g')
-    END_VERSIONS
-    """
-}
diff --git a/nextflow_schema.json b/nextflow_schema.json
index a983071..c467dc9 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -18,6 +18,7 @@
                     "exists": true,
                     "mimetype": "text/csv",
                     "pattern": "^\\S+\\.csv$",
+                    "schema": "assets/schema_input.json",
                     "description": "Path to comma-separated file containing information about the samples in the experiment.",
                     "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/epitopeprediction/usage#samplesheet-input).",
                     "fa_icon": "fas fa-file-csv"
@@ -51,6 +52,7 @@
                 "genome_reference": {
                     "type": "string",
                     "default": "grch37",
+                    "enum":["grch37", "grch38"],
                     "help_text": "This defines against which human Ensembl genome reference the pipeline performs the analysis including the incorporation of genetic variants e.g.. If `grch37` or `grch38` are specified the most recent Ensembl Biomart version for genome versions will be used. Alternatively, an Ensembl Biomart (archive) version can be specified, e.g. http://jan2020.archive.ensembl.org/.",
                     "description": "Specifies the Ensembl genome reference version that will be used."
                 },
@@ -105,7 +107,7 @@
                 },
                 "max_peptide_length": {
                     "type": "integer",
-                    "default": 11,
+                    "default": 14,
                     "description": "Specifies the maximum peptide length.",
                     "help_text": "Specifies the maximum peptide length (not applied when `--peptides` is specified). Default: MHC class I: 11 aa, MHC class II: 16 aa"
                 },
@@ -117,17 +119,18 @@
                 },
                 "max_peptide_length_class2": {
                     "type": "integer",
-                    "default": 16,
+                    "default": 25,
                     "description": "Specifies the maximum peptide length for MHC class II peptides."
                 },
                 "min_peptide_length_class2": {
                     "type": "integer",
-                    "default": 15,
+                    "default": 9,
                     "description": "Specifies the minimum peptide length for MHC class II peptides."
                 },
                 "tools": {
                     "type": "string",
                     "default": "syfpeithi",
+                    "pattern": "^(syfpeithi|mhcnuggets-class-1|mhcnuggets-class-2|mhcflurry|netmhc-4.0|netmhcpan-4.0|netmhcpan-4.1|netmhciipan-4.1|netmhc_darwin-4.0|netmhcpan_darwin-4.0|netmhcpan_darwin-4.1|netmhciipan_darwin-4.1)(,(syfpeithi|mhcnuggets-class-1|mhcnuggets-class-2|mhcflurry|netmhc-4.0|netmhcpan-4.0|netmhcpan-4.1|netmhciipan-4.1|netmhc_darwin-4.0|netmhcpan_darwin-4.0|netmhcpan_darwin-4.1|netmhciipan_darwin-4.1))*$",
                     "help_text": "Specifies the tool(s) to use. Multiple tools can be combined in a list separated by comma.\nAvailable are: `syfpeithi`, `mhcflurry`, `mhcnuggets-class-1`, `mhcnuggets-class-2`,`netmhcpan-4.0`,`netmhcpan-4.1`,`netmhc-4.0`,`netmhciipan-4.1`.",
                     "description": "Specifies the prediction tool(s) to use."
                 },
@@ -207,26 +210,31 @@
                 "netmhc_system": {
                     "type": "string",
                     "default": "linux",
+                    "enum": ["linux", "darwin"],
                     "description": "Specifies the operating system in use (Linux or Darwin). This is only necessary if conda is used."
                 },
                 "netmhcpan_path": {
                     "type": "string",
                     "default": "None",
+                    "pattern": "^\\S+\\.tar\\.gz$",
                     "description": "To use the 'netmhcpan' tool, specify the path to the original software tarball for NetMHCpan 4.0 here."
                 },
                 "netmhc_path": {
                     "type": "string",
                     "default": "None",
+                    "pattern": "^\\S+\\.tar\\.gz$",
                     "description": "To use the 'netmhc' tool, specify the path to the original software tarball for NetMHC 4.0 here."
                 },
                 "netmhciipan_path": {
                     "type": "string",
                     "default": "None",
+                    "pattern": "^\\S+\\.tar\\.gz$",
                     "description": "To use the 'netmhciipan' tool, specify the path to the original software tarball for NetMHCIIpan 3.1 here."
                 },
                 "netmhcii_path": {
                     "type": "string",
                     "default": "None",
+                    "pattern": "^\\S+\\.tar\\.gz$",
                     "description": "To use the 'netmhcii' tool, specify the path to the original software tarball for NetMHCII 2.2 here."
                 }
             }
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
deleted file mode 100644
index af9f467..0000000
--- a/subworkflows/local/input_check.nf
+++ /dev/null
@@ -1,61 +0,0 @@
-//
-// Check input samplesheet and get read channels
-//
-
-include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check'
-
-workflow INPUT_CHECK {
-    take:
-    samplesheet // file: /path/to/samplesheet.csv
-
-    main:
-    SAMPLESHEET_CHECK ( samplesheet )
-        .csv
-        .splitCsv ( header:true )
-        .map { get_samplesheet_paths(it) }
-        .set { meta }
-
-    emit: meta                  // channel: [ val(meta), [ files ] ]
-    versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ]
-}
-
-// Function to get list of [ meta, filenames ]
-def get_samplesheet_paths(LinkedHashMap row) {
-    //---------
-    // Save sample, alleles, mhc_class and file_type in a dictionary (metadata)
-    // and return a list of meta and the filename.
-    //---------
-
-    def meta = [:]
-    meta.sample         = row.sample
-    meta.alleles        = row.alleles
-    meta.mhcclass       = row.mhc_class
-    meta.inputtype      = row.inputtype
-
-    def array = []
-    if (!file(row.filename).exists()) {
-        exit 1, "ERROR: Please check input samplesheet -> file does not exist!\n${row.Filename}"
-    } else {
-        array = [ meta, file(row.filename) ]
-    }
-    return array
-}
-
-def generate_allele_string(String alleles, String mhcclass) {
-    // Collect the allele information from the file
-    def allele_string
-    valid_class1_loci = ['A*','B*','C*','E*','G*']
-    valid_class2_loci = ['DR','DP','DQ']
-    if ( alleles.endsWith(".txt") || alleles.endsWith(".alleles") )  {
-        allele_string = file(alleles).readLines().join(';')
-        if ((mhcclass == 'I' & valid_class2_loci.any { allele_string.contains(it)}) |
-        (mhcclass == 'II' & valid_class1_loci.any { allele_string.contains(it)})) {
-            exit 1, "ERROR: Please check input samplesheet -> invalid mhc class and allele combination found!\n${row.Filename}"
-        }
-    }
-    // or assign the information to a new variable
-    else {
-        allele_string = alleles
-    }
-    return allele_string
-}
diff --git a/workflows/epitopeprediction.nf b/workflows/epitopeprediction.nf
index 849ad28..9dd5253 100644
--- a/workflows/epitopeprediction.nf
+++ b/workflows/epitopeprediction.nf
@@ -4,7 +4,7 @@
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
 
-include { paramsSummaryLog; paramsSummaryMap } from 'plugin/nf-validation'
+include { paramsSummaryLog; paramsSummaryMap; fromSamplesheet; validateParameters } from 'plugin/nf-validation'
 
 def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs)
 def citation = '\n' + WorkflowMain.citation(workflow) + '\n'
@@ -63,10 +63,6 @@ include { CSVTK_CONCAT }
 include { MERGE_JSON as MERGE_JSON_SINGLE }                                         from '../modules/local/merge_json'
 include { MERGE_JSON as MERGE_JSON_MULTI }                                          from '../modules/local/merge_json'
 
-//
-// SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
-
-include { INPUT_CHECK } from '../subworkflows/local/input_check'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -97,35 +93,39 @@ def external_tools_meta = jsonSlurper.parse(file(params.external_tools_meta, che
 
 workflow EPITOPEPREDICTION {
 
+    validateParameters()
+
     ch_versions = Channel.empty()
     ch_software_versions = Channel.empty()
     // Non-free prediction tools
     ch_nonfree_paths = Channel.empty()
 
-    //
-    // SUBWORKFLOW: Read in samplesheet, validate and stage input files
-    //
-    INPUT_CHECK (
-        file(params.input)
-    )
-    ch_versions = ch_versions.mix(INPUT_CHECK.out.versions)
-    // TODO: OPTIONAL, you can use nf-validation plugin to create an input channel from the samplesheet with Channel.fromSamplesheet("input")
-    // See the documentation https://nextflow-io.github.io/nf-validation/samplesheets/fromSamplesheet/
-    // ! There is currently no tooling to help you write a sample sheet schema
-
-    INPUT_CHECK.out.meta
-                .branch {
-                    meta_data, input_file ->
-                        variant_compressed : meta_data.inputtype == 'variant_compressed'
-                            return [ meta_data, input_file ]
-                        variant_uncompressed :  meta_data.inputtype == 'variant'
-                            return [ meta_data, input_file ]
-                        peptide :  meta_data.inputtype == 'peptide'
-                            return [ meta_data, input_file ]
-                        protein :  meta_data.inputtype == 'protein'
-                            return [ meta_data, input_file ]
-                    }
-                .set { ch_samples_from_sheet }
+    // Function to read the alleles from a file or stage it from url
+    def readAlleles = { input ->
+        if (input.endsWith(".txt")) {
+            def file = file(input)
+            // Alleles are listed in the first line of the file
+            return file.readLines().get(0)
+        } else {
+            // Not a file path, return the original string
+            return input
+        }
+    }
+
+    ch_input = Channel.fromSamplesheet("input")
+    ch_input
+        .branch {
+            sample, alleles, mhc_class, filename ->
+                def allele_list = readAlleles(alleles)
+                variant_compressed : filename.endsWith('.vcf.gz')
+                    return [[sample:sample, alleles:allele_list, mhc_class:mhc_class, inputtype:'variant_compressed'], filename ]
+                variant_uncompressed : filename.endsWith('.vcf') || filename.endsWith('.GSvar')
+                    return [[sample:sample, alleles:allele_list, mhc_class:mhc_class, inputtype:'variant'], filename ]
+                peptide : filename.endsWith('.tsv')
+                    return [[sample:sample, alleles:allele_list, mhc_class:mhc_class, inputtype:'peptide'], filename ]
+                protein : filename.endsWith('.fasta') || filename.endsWith('.fa')
+                    return [[sample:sample, alleles:allele_list, mhc_class:mhc_class, inputtype:'protein'], filename ]
+    } .set { ch_samples_from_sheet }
 
     // gunzip variant files
     GUNZIP_VCF (

From 73964727652af580f39931a6d8b2e5a94d21d005 Mon Sep 17 00:00:00 2001
From: jonasscheid <jonas.scheid@uni-tuebingen.de>
Date: Tue, 5 Dec 2023 11:07:53 +0000
Subject: [PATCH 06/13] remove conda check, threw warning

---
 conf/modules.config            |  8 ------
 nextflow.config                |  2 --
 workflows/epitopeprediction.nf | 45 ++++++++++++++++------------------
 3 files changed, 21 insertions(+), 34 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 9fc3089..2704650 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -20,14 +20,6 @@ process {
         saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
     ]
 
-    withName: SAMPLESHEET_CHECK {
-        publishDir = [
-            path: { "${params.outdir}/pipeline_info" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-        ]
-    }
-
     withName: CUSTOM_DUMPSOFTWAREVERSIONS {
         publishDir = [
             path: { "${params.outdir}/pipeline_info" },
diff --git a/nextflow.config b/nextflow.config
index fd47846..34282d0 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -94,8 +94,6 @@ params {
     validationShowHiddenParams       = false
     validate_params                  = true
 
-    conda.enabled                    = false
-
 }
 
 // Load base.config by default for all pipelines
diff --git a/workflows/epitopeprediction.nf b/workflows/epitopeprediction.nf
index 9dd5253..410a9bd 100644
--- a/workflows/epitopeprediction.nf
+++ b/workflows/epitopeprediction.nf
@@ -131,7 +131,7 @@ workflow EPITOPEPREDICTION {
     GUNZIP_VCF (
         ch_samples_from_sheet.variant_compressed
     )
-    ch_versions = ch_versions.mix(GUNZIP_VCF.out.versions.ifEmpty(null))
+    ch_versions = ch_versions.mix(GUNZIP_VCF.out.versions)
 
     ch_variants_uncompressed = GUNZIP_VCF.out.gunzip
         .mix(ch_samples_from_sheet.variant_uncompressed)
@@ -152,9 +152,6 @@ workflow EPITOPEPREDICTION {
 
     if (tools.isEmpty()) { exit 1, "No valid tools specified." }
 
-    if (params.conda.enabled && params.tools.contains("netmhc")) {
-            log.warn("Please note: if you want to use external prediction tools with conda it might be necessary to set --netmhc_system to darwin depending on your system.")
-    }
 
     c_purple = params.monochrome_logs ? '' : "\033[0;35m";
     c_reset = params.monochrome_logs ? '' : "\033[0m";
@@ -168,7 +165,7 @@ workflow EPITOPEPREDICTION {
 
     // get versions of all prediction tools
     GET_PREDICTION_VERSIONS(ch_external_versions.ifEmpty(""))
-    ch_prediction_tool_versions = GET_PREDICTION_VERSIONS.out.versions.ifEmpty(null)
+    ch_prediction_tool_versions = GET_PREDICTION_VERSIONS.out.versions
 
     // TODO I guess it would be better to have two subworkflows for the if else parts (CM)
     if (params.show_supported_models) {
@@ -179,7 +176,7 @@ workflow EPITOPEPREDICTION {
                 .combine(ch_prediction_tool_versions)
                 .first()
         )
-        ch_versions = ch_versions.mix(SHOW_SUPPORTED_MODELS.out.versions.ifEmpty(null))
+        ch_versions = ch_versions.mix(SHOW_SUPPORTED_MODELS.out.versions)
     }
 
     else {
@@ -195,14 +192,14 @@ workflow EPITOPEPREDICTION {
         ch_samples_uncompressed.variant,
         ch_prediction_tool_versions
     )
-    ch_versions = ch_versions.mix(EPYTOPE_CHECK_REQUESTED_MODELS.out.versions.ifEmpty(null))
+    ch_versions = ch_versions.mix(EPYTOPE_CHECK_REQUESTED_MODELS.out.versions)
 
     // perform the check requested models on the protein files
     EPYTOPE_CHECK_REQUESTED_MODELS_PROTEIN(
         ch_samples_uncompressed.protein,
         ch_prediction_tool_versions
     )
-    ch_versions = ch_versions.mix(EPYTOPE_CHECK_REQUESTED_MODELS_PROTEIN.out.versions.ifEmpty(null))
+    ch_versions = ch_versions.mix(EPYTOPE_CHECK_REQUESTED_MODELS_PROTEIN.out.versions)
     // perform the check requested models on the peptide file where we need the input itself to determine the given peptide lengths
     EPYTOPE_CHECK_REQUESTED_MODELS_PEP(
         ch_samples_uncompressed
@@ -210,7 +207,7 @@ workflow EPITOPEPREDICTION {
             .map { meta_data, input_file -> tuple( meta_data, input_file ) },
         ch_prediction_tool_versions
     )
-    ch_versions = ch_versions.mix(EPYTOPE_CHECK_REQUESTED_MODELS_PEP.out.versions.ifEmpty(null))
+    ch_versions = ch_versions.mix(EPYTOPE_CHECK_REQUESTED_MODELS_PEP.out.versions)
 
     // Return a warning if this is raised
     EPYTOPE_CHECK_REQUESTED_MODELS
@@ -273,7 +270,7 @@ workflow EPITOPEPREDICTION {
     EXTERNAL_TOOLS_IMPORT(
         ch_nonfree_paths
     )
-    ch_versions = ch_versions.mix(EXTERNAL_TOOLS_IMPORT.out.versions.ifEmpty(null))
+    ch_versions = ch_versions.mix(EXTERNAL_TOOLS_IMPORT.out.versions)
 
     /*
     ========================================================================================
@@ -299,7 +296,7 @@ workflow EPITOPEPREDICTION {
             ch_variants.vcf
         )
         .set { ch_split_variants }
-        ch_versions = ch_versions.mix( VARIANT_SPLIT.out.versions.ifEmpty(null) )
+        ch_versions = ch_versions.mix( VARIANT_SPLIT.out.versions )
 
     }
     else {
@@ -307,32 +304,32 @@ workflow EPITOPEPREDICTION {
             ch_variants.vcf
         )
         .set { ch_split_variants }
-        ch_versions = ch_versions.mix( SNPSIFT_SPLIT.out.versions.ifEmpty(null) )
+        ch_versions = ch_versions.mix( SNPSIFT_SPLIT.out.versions )
     }
     // include the csvtk_split function (only variant files with an tsv and GSvar executable)
     CSVTK_SPLIT(
         ch_variants.tab
     )
 
-    ch_versions = ch_versions.mix( CSVTK_SPLIT.out.versions.ifEmpty(null) )
+    ch_versions = ch_versions.mix( CSVTK_SPLIT.out.versions )
 
     // process FASTA file and generated peptides
     EPYTOPE_GENERATE_PEPTIDES(
         ch_samples_uncompressed.protein
     )
-    ch_versions = ch_versions.mix(EPYTOPE_GENERATE_PEPTIDES.out.versions.ifEmpty(null))
+    ch_versions = ch_versions.mix(EPYTOPE_GENERATE_PEPTIDES.out.versions)
 
 
     SPLIT_PEPTIDES_PROTEIN(
         EPYTOPE_GENERATE_PEPTIDES.out.splitted
     )
-    ch_versions = ch_versions.mix(SPLIT_PEPTIDES_PROTEIN.out.versions.ifEmpty(null))
+    ch_versions = ch_versions.mix(SPLIT_PEPTIDES_PROTEIN.out.versions)
 
     // split peptide data
     SPLIT_PEPTIDES_PEPTIDES(
         ch_samples_uncompressed.peptide
     )
-    ch_versions = ch_versions.mix( SPLIT_PEPTIDES_PEPTIDES.out.versions.ifEmpty(null) )
+    ch_versions = ch_versions.mix( SPLIT_PEPTIDES_PEPTIDES.out.versions )
 
     /*
     ========================================================================================
@@ -350,7 +347,7 @@ workflow EPITOPEPREDICTION {
             .transpose(),
             EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([])
     )
-    ch_versions = ch_versions.mix( EPYTOPE_PEPTIDE_PREDICTION_PROTEIN.out.versions.ifEmpty(null) )
+    ch_versions = ch_versions.mix( EPYTOPE_PEPTIDE_PREDICTION_PROTEIN.out.versions )
 
 
     // Run epitope prediction for peptides
@@ -362,7 +359,7 @@ workflow EPITOPEPREDICTION {
             .transpose(),
             EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([])
     )
-    ch_versions = ch_versions.mix( EPYTOPE_PEPTIDE_PREDICTION_PEP.out.versions.ifEmpty(null) )
+    ch_versions = ch_versions.mix( EPYTOPE_PEPTIDE_PREDICTION_PEP.out.versions )
 
 
     // Run epitope prediction for variants
@@ -375,7 +372,7 @@ workflow EPITOPEPREDICTION {
             .transpose(),
             EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([])
     )
-    ch_versions = ch_versions.mix( EPYTOPE_PEPTIDE_PREDICTION_VAR.out.versions.ifEmpty(null) )
+    ch_versions = ch_versions.mix( EPYTOPE_PEPTIDE_PREDICTION_VAR.out.versions )
 
     // Combine the predicted files and save them in a branch to make a distinction between samples with single and multi files
     EPYTOPE_PEPTIDE_PREDICTION_PEP
@@ -397,12 +394,12 @@ workflow EPITOPEPREDICTION {
     CAT_TSV(
         ch_predicted_peptides.single
     )
-    ch_versions = ch_versions.mix( CAT_TSV.out.versions.ifEmpty(null) )
+    ch_versions = ch_versions.mix( CAT_TSV.out.versions )
 
     CSVTK_CONCAT(
         ch_predicted_peptides.multi
     )
-    ch_versions = ch_versions.mix( CSVTK_CONCAT.out.versions.ifEmpty(null) )
+    ch_versions = ch_versions.mix( CSVTK_CONCAT.out.versions )
 
     // Combine protein sequences
     CAT_FASTA(
@@ -412,7 +409,7 @@ workflow EPITOPEPREDICTION {
             .mix( EPYTOPE_PEPTIDE_PREDICTION_VAR.out.fasta, EPYTOPE_PEPTIDE_PREDICTION_PROTEIN.out.fasta )
             .groupTuple()
     )
-    ch_versions = ch_versions.mix( CAT_FASTA.out.versions.ifEmpty(null) )
+    ch_versions = ch_versions.mix( CAT_FASTA.out.versions )
 
     EPYTOPE_PEPTIDE_PREDICTION_PEP
         .out
@@ -434,12 +431,12 @@ workflow EPITOPEPREDICTION {
     MERGE_JSON_SINGLE(
         ch_json_reports.single
     )
-    ch_versions = ch_versions.mix( MERGE_JSON_SINGLE.out.versions.ifEmpty(null) )
+    ch_versions = ch_versions.mix( MERGE_JSON_SINGLE.out.versions )
 
     MERGE_JSON_MULTI(
         ch_json_reports.multi
     )
-    ch_versions = ch_versions.mix( MERGE_JSON_MULTI.out.versions.ifEmpty(null) )
+    ch_versions = ch_versions.mix( MERGE_JSON_MULTI.out.versions )
 
     //
     // MODULE: Pipeline reporting

From 0b1358aba3c1cee168a3a1d1fa8ab5c27557cbee Mon Sep 17 00:00:00 2001
From: jonasscheid <jonas.scheid@uni-tuebingen.de>
Date: Tue, 5 Dec 2023 11:13:35 +0000
Subject: [PATCH 07/13] switch back to old len settings

---
 nextflow_schema.json | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/nextflow_schema.json b/nextflow_schema.json
index c467dc9..6cb51bf 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -107,7 +107,7 @@
                 },
                 "max_peptide_length": {
                     "type": "integer",
-                    "default": 14,
+                    "default": 11,
                     "description": "Specifies the maximum peptide length.",
                     "help_text": "Specifies the maximum peptide length (not applied when `--peptides` is specified). Default: MHC class I: 11 aa, MHC class II: 16 aa"
                 },
@@ -119,12 +119,12 @@
                 },
                 "max_peptide_length_class2": {
                     "type": "integer",
-                    "default": 25,
+                    "default": 16,
                     "description": "Specifies the maximum peptide length for MHC class II peptides."
                 },
                 "min_peptide_length_class2": {
                     "type": "integer",
-                    "default": 9,
+                    "default": 15,
                     "description": "Specifies the minimum peptide length for MHC class II peptides."
                 },
                 "tools": {

From 3f3bcf97cc2ef258bd78260d6622d7055d4a4d3f Mon Sep 17 00:00:00 2001
From: jonasscheid <jonas.scheid@uni-tuebingen.de>
Date: Tue, 5 Dec 2023 11:35:15 +0000
Subject: [PATCH 08/13] add changelog, fix lint

---
 CHANGELOG.md             |  1 +
 assets/schema_input.json |  4 ++--
 nextflow_schema.json     | 14 +++++++++-----
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index dcb7f43..866c94e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### `Changed`
 
+- [#220](https://github.com/nf-core/epitopeprediction/pull/220) - Switch to nf-validation to parse samplesheet
 - [#213](https://github.com/nf-core/epitopeprediction/pull/203) - Rename param `genome_version` to `genome_reference`, add functionality to handle BioMart archive urls
 - [#213](https://github.com/nf-core/epitopeprediction/pull/203) - Update to nf-core template `2.10`
 - [#203](https://github.com/nf-core/epitopeprediction/pull/203) - Update to nf-core template `2.9`
diff --git a/assets/schema_input.json b/assets/schema_input.json
index 3c6bdc4..df0fd34 100644
--- a/assets/schema_input.json
+++ b/assets/schema_input.json
@@ -28,10 +28,10 @@
                     },
                     {
                         "type": "string",
-                        "pattern": "^((DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2}[-](DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2})(;((DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2}[-](DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2}))*$",
+                        "pattern": "^((DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2}[-](DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2})(;((DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2}[-](DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2}))*$"
                     },
                     {
-                        "type":"string",
+                        "type": "string",
                         "pattern": "^[H][-][2][-][A-Za-z]{2,3}$"
                     }
                 ],
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 6cb51bf..4180cfd 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -52,7 +52,7 @@
                 "genome_reference": {
                     "type": "string",
                     "default": "grch37",
-                    "enum":["grch37", "grch38"],
+                    "enum": ["grch37", "grch38"],
                     "help_text": "This defines against which human Ensembl genome reference the pipeline performs the analysis including the incorporation of genetic variants e.g.. If `grch37` or `grch38` are specified the most recent Ensembl Biomart version for genome versions will be used. Alternatively, an Ensembl Biomart (archive) version can be specified, e.g. http://jan2020.archive.ensembl.org/.",
                     "description": "Specifies the Ensembl genome reference version that will be used."
                 },
@@ -215,25 +215,29 @@
                 },
                 "netmhcpan_path": {
                     "type": "string",
-                    "default": "None",
+                    "format": "file-path",
+                    "exists": true,
                     "pattern": "^\\S+\\.tar\\.gz$",
                     "description": "To use the 'netmhcpan' tool, specify the path to the original software tarball for NetMHCpan 4.0 here."
                 },
                 "netmhc_path": {
                     "type": "string",
-                    "default": "None",
+                    "format": "file-path",
+                    "exists": true,
                     "pattern": "^\\S+\\.tar\\.gz$",
                     "description": "To use the 'netmhc' tool, specify the path to the original software tarball for NetMHC 4.0 here."
                 },
                 "netmhciipan_path": {
                     "type": "string",
-                    "default": "None",
+                    "format": "file-path",
+                    "exists": true,
                     "pattern": "^\\S+\\.tar\\.gz$",
                     "description": "To use the 'netmhciipan' tool, specify the path to the original software tarball for NetMHCIIpan 3.1 here."
                 },
                 "netmhcii_path": {
                     "type": "string",
-                    "default": "None",
+                    "format": "file-path",
+                    "exists": true,
                     "pattern": "^\\S+\\.tar\\.gz$",
                     "description": "To use the 'netmhcii' tool, specify the path to the original software tarball for NetMHCII 2.2 here."
                 }

From 1ffdacb4e89cb7268bebc4a424989340f5dce0b0 Mon Sep 17 00:00:00 2001
From: jonasscheid <jonas.scheid@uni-tuebingen.de>
Date: Tue, 5 Dec 2023 13:05:53 +0000
Subject: [PATCH 09/13] change internal mhcclass to mhc_class for consistency

---
 bin/check_requested_models.py                   |  2 +-
 modules/local/epytope_check_requested_models.nf |  4 ++--
 modules/local/epytope_generate_peptides.nf      |  4 ++--
 modules/local/epytope_peptide_prediction.nf     | 10 +++++-----
 nextflow_schema.json                            |  1 -
 workflows/epitopeprediction.nf                  |  4 ++--
 6 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/bin/check_requested_models.py b/bin/check_requested_models.py
index bc7a485..342505f 100755
--- a/bin/check_requested_models.py
+++ b/bin/check_requested_models.py
@@ -54,7 +54,7 @@ def __main__():
         "Write out information about supported models by epytope for installed predictor tool versions."
     )
     parser.add_argument("-p", "--peptides", help="File with one peptide per line")
-    parser.add_argument("-c", "--mhcclass", default=1, help="MHC class I or II")
+    parser.add_argument("-c", "--mhc_class", default=1, help="MHC class I or II")
     parser.add_argument("-l", "--max_length", help="Maximum peptide length", type=int)
     parser.add_argument("-ml", "--min_length", help="Minimum peptide length", type=int)
     parser.add_argument("-a", "--alleles", help="<Required> MHC Alleles", required=True, type=str)
diff --git a/modules/local/epytope_check_requested_models.nf b/modules/local/epytope_check_requested_models.nf
index f78a77a..8438c4d 100644
--- a/modules/local/epytope_check_requested_models.nf
+++ b/modules/local/epytope_check_requested_models.nf
@@ -30,8 +30,8 @@ process EPYTOPE_CHECK_REQUESTED_MODELS {
     }
 
     def prefix = task.ext.suffix ? "${meta.sample}_${task.ext.suffix}" : "${meta.sample}_peptides"
-    def min_length = ("${meta.mhcclass}" == "I") ? params.min_peptide_length : params.min_peptide_length_class2
-    def max_length = ("${meta.mhcclass}" == "I") ? params.max_peptide_length : params.max_peptide_length_class2
+    def min_length = ("${meta.mhc_class}" == "I") ? params.min_peptide_length : params.min_peptide_length_class2
+    def max_length = ("${meta.mhc_class}" == "I") ? params.max_peptide_length : params.max_peptide_length_class2
 
     """
     check_requested_models.py ${argument} \
diff --git a/modules/local/epytope_generate_peptides.nf b/modules/local/epytope_generate_peptides.nf
index 401afa7..d79ce4a 100644
--- a/modules/local/epytope_generate_peptides.nf
+++ b/modules/local/epytope_generate_peptides.nf
@@ -19,8 +19,8 @@ process EPYTOPE_GENERATE_PEPTIDES {
 
     script:
     def prefix = task.ext.suffix ? "${meta.sample}_${task.ext.suffix}" : "${meta.sample}_peptides"
-    def min_length = (meta.mhcclass == "I") ? params.min_peptide_length : params.min_peptide_length_class2
-    def max_length = (meta.mhcclass == "I") ? params.max_peptide_length : params.max_peptide_length_class2
+    def min_length = (meta.mhc_class == "I") ? params.min_peptide_length : params.min_peptide_length_class2
+    def max_length = (meta.mhc_class == "I") ? params.max_peptide_length : params.max_peptide_length_class2
 
     """
     gen_peptides.py --input ${raw} \\
diff --git a/modules/local/epytope_peptide_prediction.nf b/modules/local/epytope_peptide_prediction.nf
index 8b8ca73..ef0c07a 100644
--- a/modules/local/epytope_peptide_prediction.nf
+++ b/modules/local/epytope_peptide_prediction.nf
@@ -47,14 +47,14 @@ process EPYTOPE_PEPTIDE_PREDICTION {
     def class1_tools = tools_split.findAll { ! it.matches('.*(?i)(class-2|ii).*') }
     def class2_tools = tools_split.findAll { it.matches('.*(?i)(syf|class-2|ii).*') }
 
-    if (((meta.mhcclass == "I") & class1_tools.empty) | ((meta.mhcclass == "II") & class2_tools.empty)) {
-        exit 1, "No tools specified for mhc class ${meta.mhcclass}"
+    if (((meta.mhc_class == "I") & class1_tools.empty) | ((meta.mhc_class == "II") & class2_tools.empty)) {
+        exit 1, "No tools specified for mhc class ${meta.mhc_class}"
     }
 
-    def min_length = (meta.mhcclass == "I") ? params.min_peptide_length : params.min_peptide_length_class2
-    def max_length = (meta.mhcclass == "I") ? params.max_peptide_length : params.max_peptide_length_class2
+    def min_length = (meta.mhc_class == "I") ? params.min_peptide_length : params.min_peptide_length_class2
+    def max_length = (meta.mhc_class == "I") ? params.max_peptide_length : params.max_peptide_length_class2
 
-    def tools_to_use = ((meta.mhcclass == "I") | (meta.mhcclass == "H-2")) ? class1_tools.join(',') : class2_tools.join(',')
+    def tools_to_use = ((meta.mhc_class == "I") | (meta.mhc_class == "H-2")) ? class1_tools.join(',') : class2_tools.join(',')
 
     """
     # create folder for MHCflurry downloads to avoid permission problems when running pipeline with docker profile and mhcflurry selected
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 4180cfd..e69966b 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -52,7 +52,6 @@
                 "genome_reference": {
                     "type": "string",
                     "default": "grch37",
-                    "enum": ["grch37", "grch38"],
                     "help_text": "This defines against which human Ensembl genome reference the pipeline performs the analysis including the incorporation of genetic variants e.g.. If `grch37` or `grch38` are specified the most recent Ensembl Biomart version for genome versions will be used. Alternatively, an Ensembl Biomart (archive) version can be specified, e.g. http://jan2020.archive.ensembl.org/.",
                     "description": "Specifies the Ensembl genome reference version that will be used."
                 },
diff --git a/workflows/epitopeprediction.nf b/workflows/epitopeprediction.nf
index 410a9bd..2d60755 100644
--- a/workflows/epitopeprediction.nf
+++ b/workflows/epitopeprediction.nf
@@ -283,9 +283,9 @@ workflow EPITOPEPREDICTION {
         .variant
         .branch {
             meta_data, input_file ->
-                vcf : input_file.extension == 'vcf' || input_file.extension == 'vcf.gz'
+                vcf : input_file.endsWith('.vcf') || input_file.endsWith('.vcf.gz')
                     return [ meta_data, input_file ]
-                tab :  input_file.extension == 'tsv' || input_file.extension == 'GSvar'
+                tab :  input_file.endsWith('.tsv') || input_file.endsWith('.GSvar')
                     return [ meta_data, input_file ]
         }
         .set { ch_variants }

From 400b831b83cab39867be7341f651ce35714ef0a3 Mon Sep 17 00:00:00 2001
From: jonasscheid <jonas.scheid@uni-tuebingen.de>
Date: Wed, 13 Dec 2023 12:33:30 +0000
Subject: [PATCH 10/13] Add check for mhc_class-allele combination

---
 workflows/epitopeprediction.nf | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/workflows/epitopeprediction.nf b/workflows/epitopeprediction.nf
index f3a8bf0..7b2cf8f 100644
--- a/workflows/epitopeprediction.nf
+++ b/workflows/epitopeprediction.nf
@@ -90,6 +90,17 @@ import groovy.json.JsonSlurper
 def jsonSlurper = new JsonSlurper()
 def external_tools_meta = jsonSlurper.parse(file(params.external_tools_meta, checkIfExists: true))
 
+// Function to check if the alleles are valid for the given mhc class
+def validate_alleles(String alleles, String mhc_class) {
+    valid_class1_loci = ['A*','B*','C*','E*','G*']
+    valid_class2_loci = ['DR','DP','DQ']
+    allele_list = alleles.split(';')
+    if (( mhc_class == 'I'  & allele_list.every { allele -> valid_class2_loci.any { allele.startsWith(it) }}) |
+        ( mhc_class == 'II' & allele_list.every { allele -> valid_class1_loci.any { allele.startsWith(it) }})) {
+        exit 1, "Please check input samplesheet -> Invalid mhc class ${mhc_class} and allele combination ${allele_list} found!"
+    }
+}
+
 workflow EPITOPEPREDICTION {
 
     validateParameters()
@@ -116,6 +127,7 @@ workflow EPITOPEPREDICTION {
         .branch {
             sample, alleles, mhc_class, filename ->
                 def allele_list = readAlleles(alleles)
+                validate_alleles(allele_list, mhc_class)
                 variant_compressed : filename.endsWith('.vcf.gz')
                     return [[sample:sample, alleles:allele_list, mhc_class:mhc_class, inputtype:'variant_compressed'], filename ]
                 variant_uncompressed : filename.endsWith('.vcf') || filename.endsWith('.GSvar')

From a34c450d8f1749092055928405ef321007e2baab Mon Sep 17 00:00:00 2001
From: Jonas Scheid <43858870+jonasscheid@users.noreply.github.com>
Date: Mon, 18 Dec 2023 10:39:04 +0100
Subject: [PATCH 11/13] Remove txt filename input from schema_input.json

---
 assets/schema_input.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/assets/schema_input.json b/assets/schema_input.json
index df0fd34..c0728a6 100644
--- a/assets/schema_input.json
+++ b/assets/schema_input.json
@@ -44,8 +44,8 @@
             },
             "filename": {
                 "type": "string",
-                "pattern": "^\\S+\\.(vcf|vcf.gz|tsv|fasta|fa|txt)$",
-                "errorMessage": "Variants/proteins/peptides for sample must be provided and have one of the following extensions: '.vcf', '.vcf.gz', '.tsv', '.fasta', '.fa', '.txt'"
+                "pattern": "^\\S+\\.(vcf|vcf.gz|tsv|fasta|fa)$",
+                "errorMessage": "Variants/proteins/peptides for sample must be provided and have one of the following extensions: '.vcf', '.vcf.gz', '.tsv', '.fasta', '.fa'"
             }
         },
         "required": ["sample", "alleles", "mhc_class", "filename"]

From 74da74b7edaf49ad609bc2c4a856a408586c554f Mon Sep 17 00:00:00 2001
From: Jonas Scheid <43858870+jonasscheid@users.noreply.github.com>
Date: Tue, 19 Dec 2023 12:26:26 +0100
Subject: [PATCH 12/13] Update workflows/epitopeprediction.nf

Co-authored-by: Christopher Mohr <christopher.mohr@qbic.uni-tuebingen.de>
---
 workflows/epitopeprediction.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflows/epitopeprediction.nf b/workflows/epitopeprediction.nf
index 7b2cf8f..3e03824 100644
--- a/workflows/epitopeprediction.nf
+++ b/workflows/epitopeprediction.nf
@@ -130,7 +130,7 @@ workflow EPITOPEPREDICTION {
                 validate_alleles(allele_list, mhc_class)
                 variant_compressed : filename.endsWith('.vcf.gz')
                     return [[sample:sample, alleles:allele_list, mhc_class:mhc_class, inputtype:'variant_compressed'], filename ]
-                variant_uncompressed : filename.endsWith('.vcf') || filename.endsWith('.GSvar')
+                variant_uncompressed : filename.endsWith('.vcf')
                     return [[sample:sample, alleles:allele_list, mhc_class:mhc_class, inputtype:'variant'], filename ]
                 peptide : filename.endsWith('.tsv')
                     return [[sample:sample, alleles:allele_list, mhc_class:mhc_class, inputtype:'peptide'], filename ]

From 693bd500fe3efb92c94aaa873cce2546d39d6a5b Mon Sep 17 00:00:00 2001
From: Jonas Scheid <43858870+jonasscheid@users.noreply.github.com>
Date: Tue, 19 Dec 2023 12:26:33 +0100
Subject: [PATCH 13/13] Update workflows/epitopeprediction.nf

Co-authored-by: Christopher Mohr <christopher.mohr@qbic.uni-tuebingen.de>
---
 workflows/epitopeprediction.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflows/epitopeprediction.nf b/workflows/epitopeprediction.nf
index 3e03824..00549dd 100644
--- a/workflows/epitopeprediction.nf
+++ b/workflows/epitopeprediction.nf
@@ -110,7 +110,7 @@ workflow EPITOPEPREDICTION {
     // Non-free prediction tools
     ch_nonfree_paths = Channel.empty()
 
-    // Function to read the alleles from a file or stage it from url
+    // Function to read the alleles from a file or use given string
     def readAlleles = { input ->
         if (input.endsWith(".txt")) {
             def file = file(input)