From bab8e0a2018692ceb5448ab428f86caeb889c67b Mon Sep 17 00:00:00 2001 From: Jonas Scheid <43858870+jonasscheid@users.noreply.github.com> Date: Fri, 1 Dec 2023 16:22:55 +0100 Subject: [PATCH 01/13] allow vcfgz again --- bin/check_samplesheet.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 97f66c6..614b807 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -23,7 +23,7 @@ class RowChecker: """ - VALID_FORMATS = (".tsv", ".fasta", ".vcf", "GSvar") + VALID_FORMATS = (".tsv", ".fasta", ".vcf", ".vcf.gz" ".GSvar") def __init__( self, @@ -134,7 +134,7 @@ def _validate_unique_sample(self): def get_file_type(file): """Read file extension and return file type""" - extension = file.split(".")[-1] + extension = ''.join(Path(file).suffixes # check input file is empty # it needs to be distinguished if there's a given local file or internet address if str(file).startswith("http"): @@ -148,13 +148,13 @@ def get_file_type(file): raise AssertionError(f"Input file {file} is empty.") try: - if extension == "vcf.gz": + if ".vcf.gz" in extension: file_type = "compressed_variant" - elif extension == "vcf": + elif extension == ".vcf": file_type = "variant" - elif extension == "fasta": + elif extension == ".fasta": file_type = "protein" - elif extension in ["tsv", "GSvar"]: + elif extension in [".tsv", ".GSvar"]: # Check if the file is a variant annotation file or a peptide file header_columns = [col.strip() for col in file[0].split("\t")] From 202fa4416e973e80a736ed9aeb906b0e5fef6e9e Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Mon, 4 Dec 2023 13:13:26 +0000 Subject: [PATCH 02/13] fix compressed vcf parsing, refactored allele checking --- bin/check_samplesheet.py | 65 ++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 36 deletions(-) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 614b807..069ec61 100755 --- 
a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -23,7 +23,7 @@ class RowChecker: """ - VALID_FORMATS = (".tsv", ".fasta", ".vcf", ".vcf.gz" ".GSvar") + VALID_FORMATS = (".tsv", ".fasta", ".vcf", ".vcf.gz", ".GSvar") def __init__( self, @@ -63,8 +63,8 @@ def validate(self, row): """ self._validate_row_length(row) self._validate_sample(row) - self._validate_allele(row) self._validate_mhc_class(row) + self._validate_allele(row) self._validate_file(row[self._filename_col]) self._seen.add( (row[self._sample_col], row[self._alleles_col], row[self._mhc_class_col], row[self._filename_col]) @@ -80,26 +80,20 @@ def _validate_sample(self, row): def _validate_allele(self, row): """Assert that the alleles have the right format.""" - valid_class1_loci = ["A*", "B*", "C*", "E*", "G*"] - valid_class2_loci = ["DR", "DP", "DQ"] - if len(row[self._alleles_col]) <= 0: raise AssertionError(f"No alleles specified.\nLine: {row}") - if ( - not os.path.isfile(row[self._alleles_col]) - and ( - row[self._mhc_class_col] == "I" - and any(substring in row[self._alleles_col] for substring in valid_class2_loci) - ) - or ( - row[self._mhc_class_col] == "II" - and any(substring in row[self._alleles_col] for substring in valid_class1_loci) - ) - ): - raise AssertionError( - f"Samplesheet contains invalid mhc class and allele combination!\nLine: {row} \ - \nValid loci: {valid_class1_loci if row[self._mhc_class_col] == 'I' else valid_class2_loci}" - ) + if os.path.isfile(row[self._alleles_col]): + logging.info(f"Alleles file found: {row[self._alleles_col]}. Attempting to read file.") + try: + with open(row[self._alleles_col], "r") as f: + alleles = f.readlines()[0] + except Exception as e: + raise AssertionError(f"Error with reading alleles file: {e}. 
Check correct format for input file {row[self._alleles_col]} in documentation.") + else: + alleles = row[self._alleles_col] + + if not all([check_allele_nomenclature(allele, row[self._mhc_class_col]) for allele in alleles.split(";")]): + raise AssertionError(f"Alleles {alleles} of MHC-class {row[self._mhc_class_col]} don't have the right format. \nLine: {row}. See the documentation for more information.") def _validate_mhc_class(self, row): """Assert that the mhc_class has the right format.""" @@ -134,22 +128,10 @@ def _validate_unique_sample(self): def get_file_type(file): """Read file extension and return file type""" - extension = ''.join(Path(file).suffixes - # check input file is empty - # it needs to be distinguished if there's a given local file or internet address - if str(file).startswith("http"): - with urllib.request.urlopen(file) as response: - file = response.read().decode("utf-8").split("\n") - if len(file) == 0: - raise AssertionError(f"Input file {file} is empty.") - else: - file = open(file, "r").readlines() - if file == 0: - raise AssertionError(f"Input file {file} is empty.") - + extension = ''.join(Path(file).suffixes) try: if ".vcf.gz" in extension: - file_type = "compressed_variant" + file_type = "variant_compressed" elif extension == ".vcf": file_type = "variant" elif extension == ".fasta": @@ -203,8 +185,19 @@ def parse_args(argv=None): return parser.parse_args(argv) -def check_allele_nomenclature(allele): - pattern = re.compile("(^[A-Z][\*][0-9][0-9][:][0-9][0-9])$") +def check_allele_nomenclature(allele, mhc_class) -> bool: + allele = allele.replace('HLA-','') + if mhc_class == 'I': + pattern = re.compile("(^[A-E]{1}[\*][0-9]{2}[:][0-9]{2})$") + elif mhc_class == 'II': + # Check if allele is from two chains + if allele.contains("-"): + pattern = re.compile("(^(DR|DP|DQ){1}(A|B){1}[0-9]{1}[\*][0-9]{2}[:][0-9]{2}[-](DR|DP|DQ){1}(A|B){1}[0-9]{1}[\*][0-9]{2}[:][0-9]{2})$") + else: + pattern = 
re.compile("(^(DR|DP|DQ){1}(A|B){1}[0-9]{1}[\*][0-9]{2}[:][0-9]{2})$") + else: # Mouse + pattern = re.compile("(^[H]{1}[-][2]{1}[-][A-Za-z]{2,3})$") + return pattern.match(allele) is not None From 2dd79dd330e6b25bece7a85b456ecb0ce86fd844 Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Mon, 4 Dec 2023 14:16:26 +0000 Subject: [PATCH 03/13] bump python version of samplechecker --- modules/local/samplesheet_check.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf index ae63801..d1c48c3 100644 --- a/modules/local/samplesheet_check.nf +++ b/modules/local/samplesheet_check.nf @@ -2,10 +2,10 @@ process SAMPLESHEET_CHECK { tag "$samplesheet" label 'process_single' - conda "conda-forge::python=3.8.3" + conda "conda-forge::python=3.10.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.8.3' : - 'biocontainers/python:3.8.3' }" + 'https://depot.galaxyproject.org/singularity/python:3.10.2' : + 'biocontainers/python:3.10.2' }" input: path samplesheet From d04e77f6cbeb47f632d7ec8eff61248655f9a8b3 Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Mon, 4 Dec 2023 14:17:17 +0000 Subject: [PATCH 04/13] add staging in samplesheet checker to determine inputtype --- bin/check_samplesheet.py | 59 +++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 069ec61..d176a94 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -11,6 +11,7 @@ import csv from pathlib import Path import urllib.request +from urllib.parse import urlparse class RowChecker: @@ -82,17 +83,8 @@ def _validate_allele(self, row): """Assert that the alleles have the right format.""" if len(row[self._alleles_col]) <= 0: raise AssertionError(f"No alleles specified.\nLine: {row}") - if 
os.path.isfile(row[self._alleles_col]): - logging.info(f"Alleles file found: {row[self._alleles_col]}. Attempting to read file.") - try: - with open(row[self._alleles_col], "r") as f: - alleles = f.readlines()[0] - except Exception as e: - raise AssertionError(f"Error with reading alleles file: {e}. Check correct format for input file {row[self._alleles_col]} in documentation.") - else: - alleles = row[self._alleles_col] - - if not all([check_allele_nomenclature(allele, row[self._mhc_class_col]) for allele in alleles.split(";")]): + alleles = read_source(row[self._alleles_col]) + if not all([self._check_allele_nomenclature(allele, row[self._mhc_class_col]) for allele in alleles.split(";")]): raise AssertionError(f"Alleles {alleles} of MHC-class {row[self._mhc_class_col]} don't have the right format. \nLine: {row}. See the documentation for more information.") def _validate_mhc_class(self, row): @@ -125,6 +117,32 @@ def _validate_unique_sample(self): if len(set(sample_names)) != len(sample_names): raise AssertionError(f"Duplicate sample name found: {self.rows[-1]}") + def _check_allele_nomenclature(self, allele, mhc_class) -> bool: + allele = allele.replace('HLA-','') + if mhc_class == 'I': + pattern = re.compile("(^[A-E]{1}[\*][0-9]{2}[:][0-9]{2})$") + elif mhc_class == 'II': + # Check if allele is from two chains + if allele.contains("-"): + pattern = re.compile("(^(DR|DP|DQ){1}(A|B){1}[0-9]{1}[\*][0-9]{2}[:][0-9]{2}[-](DR|DP|DQ){1}(A|B){1}[0-9]{1}[\*][0-9]{2}[:][0-9]{2})$") + else: + pattern = re.compile("(^(DR|DP|DQ){1}(A|B){1}[0-9]{1}[\*][0-9]{2}[:][0-9]{2})$") + else: # Mouse + pattern = re.compile("(^[H]{1}[-][2]{1}[-][A-Za-z]{2,3})$") + + return pattern.match(allele) is not None + + +def read_source(source): + """Read the alleles from a file/url or return the string.""" + if urlparse(source).scheme in ('http', 'https'): + with urllib.request.urlopen(source) as response: + return response.read().decode('utf-8').splitlines()[0] + elif os.path.isfile(source): + 
with open(source, "r") as f: + return f.readlines()[0] + else: + return source def get_file_type(file): """Read file extension and return file type""" @@ -138,7 +156,8 @@ def get_file_type(file): file_type = "protein" elif extension in [".tsv", ".GSvar"]: # Check if the file is a variant annotation file or a peptide file - header_columns = [col.strip() for col in file[0].split("\t")] + header = read_source(file) + header_columns = [col.strip() for col in header.split("\t")] required_variant_columns = ["#chr", "start", "end"] @@ -185,22 +204,6 @@ def parse_args(argv=None): return parser.parse_args(argv) -def check_allele_nomenclature(allele, mhc_class) -> bool: - allele = allele.replace('HLA-','') - if mhc_class == 'I': - pattern = re.compile("(^[A-E]{1}[\*][0-9]{2}[:][0-9]{2})$") - elif mhc_class == 'II': - # Check if allele is from two chains - if allele.contains("-"): - pattern = re.compile("(^(DR|DP|DQ){1}(A|B){1}[0-9]{1}[\*][0-9]{2}[:][0-9]{2}[-](DR|DP|DQ){1}(A|B){1}[0-9]{1}[\*][0-9]{2}[:][0-9]{2})$") - else: - pattern = re.compile("(^(DR|DP|DQ){1}(A|B){1}[0-9]{1}[\*][0-9]{2}[:][0-9]{2})$") - else: # Mouse - pattern = re.compile("(^[H]{1}[-][2]{1}[-][A-Za-z]{2,3})$") - - return pattern.match(allele) is not None - - def make_dir(path): if len(path) > 0: try: From 7868e5ce7e5813dc53261f44b3ef78bb49e0258d Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Tue, 5 Dec 2023 10:28:54 +0000 Subject: [PATCH 05/13] switch to nf-validation --- assets/schema_input.json | 22 ++- bin/check_samplesheet.py | 290 ----------------------------- modules/local/samplesheet_check.nf | 31 --- nextflow_schema.json | 14 +- subworkflows/local/input_check.nf | 61 ------ workflows/epitopeprediction.nf | 58 +++--- 6 files changed, 57 insertions(+), 419 deletions(-) delete mode 100755 bin/check_samplesheet.py delete mode 100644 modules/local/samplesheet_check.nf delete mode 100644 subworkflows/local/input_check.nf diff --git a/assets/schema_input.json b/assets/schema_input.json index 
b564a5c..3c6bdc4 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -20,20 +20,32 @@ }, { "type": "string", - "pattern": "(^[A-Z][*][0-9][0-9][:][0-9][0-9])$" + "pattern": "^([A-E]{1}[*][0-9]{2}[:][0-9]{2})(;[A-E]{1}[*][0-9]{2}[:][0-9]{2})*$" + }, + { + "type": "string", + "pattern": "^((DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2})(;(DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2})*$" + }, + { + "type": "string", + "pattern": "^((DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2}[-](DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2})(;((DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2}[-](DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2}))*$", + }, + { + "type":"string", + "pattern": "^[H][-][2][-][A-Za-z]{2,3}$" } ], - "errorMessage": "Alleles must be provided as string or file with extension '.txt''" + "errorMessage": "Alleles must be provided as string or file with extension '.txt'. Please check the documentation for more information." }, "mhc_class": { "type": "string", "pattern": "^(I|II|H-2)$", - "errorMessage": "The MHC class must be provided. Valid values: " + "errorMessage": "The MHC class must be provided. Valid values: 'I', 'II' or 'H-2'" }, "filename": { "type": "string", - "pattern": "^\\S+\\.(vcf|tsv|fasta|fa|txt)$", - "errorMessage": "Variants/proteins/peptides for sample must be provided and have one of the following extensions: '.vcf', '.tsv', '.fasta', '.fa', '.txt'" + "pattern": "^\\S+\\.(vcf|vcf.gz|tsv|fasta|fa|txt)$", + "errorMessage": "Variants/proteins/peptides for sample must be provided and have one of the following extensions: '.vcf', '.vcf.gz', '.tsv', '.fasta', '.fa', '.txt'" } }, "required": ["sample", "alleles", "mhc_class", "filename"] diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py deleted file mode 100755 index d176a94..0000000 --- a/bin/check_samplesheet.py +++ /dev/null @@ -1,290 +0,0 @@ -#!/usr/bin/env python -# Written by Jonas Scheid, Christopher Mohr and released under the MIT license. 
- -import argparse -import logging -import os -import re -import sys -import errno -import re -import csv -from pathlib import Path -import urllib.request -from urllib.parse import urlparse - - -class RowChecker: - """ - Define a service that can validate and transform each given row. - - Attributes: - rows (list): A list of dicts, where each dict corresponds to a previously - validated and transformed row. The order of rows is maintained. - - """ - - VALID_FORMATS = (".tsv", ".fasta", ".vcf", ".vcf.gz", ".GSvar") - - def __init__( - self, - sample_col=0, - alleles_col=1, - mhc_class_col=2, - filename_col=3, - **kwargs, - ): - """ - Initialize the row checker with the expected column names. - - Args: - sample_col (str): The name of the column that contains the sample name - (default "sample"). - alleles_col (str): The name of the column that contains the MHC alleles. - mhc_class_col (str): The name of the column that contains the MHC class. - filename_col (str): The name of the column that contains the filename. - - """ - super().__init__(**kwargs) - self._sample_col = sample_col - self._alleles_col = alleles_col - self._mhc_class_col = mhc_class_col - self._filename_col = filename_col - self._seen = set() - self.rows = [] - - def validate(self, row): - """ - Perform all validations on the given row. - - Args: - row (dict): A mapping from column headers (keys) to elements of that row - (values). 
- - """ - self._validate_row_length(row) - self._validate_sample(row) - self._validate_mhc_class(row) - self._validate_allele(row) - self._validate_file(row[self._filename_col]) - self._seen.add( - (row[self._sample_col], row[self._alleles_col], row[self._mhc_class_col], row[self._filename_col]) - ) - self.rows.append(row) - self._validate_unique_row() - self._validate_unique_sample() - - def _validate_sample(self, row): - """Assert that the sample name exists and convert spaces to underscores.""" - if len(row[self._sample_col]) <= 0: - raise AssertionError(f"Sample input is required.\nLine: {row}") - - def _validate_allele(self, row): - """Assert that the alleles have the right format.""" - if len(row[self._alleles_col]) <= 0: - raise AssertionError(f"No alleles specified.\nLine: {row}") - alleles = read_source(row[self._alleles_col]) - if not all([self._check_allele_nomenclature(allele, row[self._mhc_class_col]) for allele in alleles.split(";")]): - raise AssertionError(f"Alleles {alleles} of MHC-class {row[self._mhc_class_col]} don't have the right format. \nLine: {row}. 
See the documentation for more information.") - - def _validate_mhc_class(self, row): - """Assert that the mhc_class has the right format.""" - valid_classes = ["I", "II", "H-2"] - if row[self._mhc_class_col] not in valid_classes: - raise AssertionError(f"MHC class must be one of: {valid_classes}\nLine: {row}") - - def _validate_file(self, filename): - """Assert that a given filename has one of the expected FASTQ extensions.""" - if not any(filename.endswith(extension) for extension in self.VALID_FORMATS): - raise AssertionError( - f"The input file has an unrecognized extension: {filename}\n" - f"It should be one of: {', '.join(self.VALID_FORMATS)}" - ) - - def _validate_row_length(self, row): - """Assert the row length.""" - if len(row) != 4: - raise AssertionError(f"Invalid row length: {len(row)}\nLine: {row}.") - - def _validate_unique_row(self): - """Assert that the combination of sample name, alleles, mhc_class and filename is unique.""" - if len(self._seen) != len(self.rows) and len(self.rows) > 1: - raise AssertionError(f"Duplicate row found: {self.rows[-1]}") - - def _validate_unique_sample(self): - """Assert that the combination sample names are unique.""" - sample_names = [row[self._sample_col] for row in self.rows] - if len(set(sample_names)) != len(sample_names): - raise AssertionError(f"Duplicate sample name found: {self.rows[-1]}") - - def _check_allele_nomenclature(self, allele, mhc_class) -> bool: - allele = allele.replace('HLA-','') - if mhc_class == 'I': - pattern = re.compile("(^[A-E]{1}[\*][0-9]{2}[:][0-9]{2})$") - elif mhc_class == 'II': - # Check if allele is from two chains - if allele.contains("-"): - pattern = re.compile("(^(DR|DP|DQ){1}(A|B){1}[0-9]{1}[\*][0-9]{2}[:][0-9]{2}[-](DR|DP|DQ){1}(A|B){1}[0-9]{1}[\*][0-9]{2}[:][0-9]{2})$") - else: - pattern = re.compile("(^(DR|DP|DQ){1}(A|B){1}[0-9]{1}[\*][0-9]{2}[:][0-9]{2})$") - else: # Mouse - pattern = re.compile("(^[H]{1}[-][2]{1}[-][A-Za-z]{2,3})$") - - return pattern.match(allele) is not 
None - - -def read_source(source): - """Read the alleles from a file/url or return the string.""" - if urlparse(source).scheme in ('http', 'https'): - with urllib.request.urlopen(source) as response: - return response.read().decode('utf-8').splitlines()[0] - elif os.path.isfile(source): - with open(source, "r") as f: - return f.readlines()[0] - else: - return source - -def get_file_type(file): - """Read file extension and return file type""" - extension = ''.join(Path(file).suffixes) - try: - if ".vcf.gz" in extension: - file_type = "variant_compressed" - elif extension == ".vcf": - file_type = "variant" - elif extension == ".fasta": - file_type = "protein" - elif extension in [".tsv", ".GSvar"]: - # Check if the file is a variant annotation file or a peptide file - header = read_source(file) - header_columns = [col.strip() for col in header.split("\t")] - - required_variant_columns = ["#chr", "start", "end"] - - file_type = "peptide" - - if all(col in header_columns for col in required_variant_columns): - file_type = "variant" - elif "sequence" not in header_columns: - raise AssertionError("Peptide input file does not contain mandatory column 'sequence'") - - return file_type - - except Exception as e: - raise AssertionError( - f"Error with checking samplesheet: {e}. Check correct format for input file {file} in documentation." 
- ) - - -def parse_args(argv=None): - """Define and immediately parse command line arguments.""" - parser = argparse.ArgumentParser( - description="Validate and transform a tabular samplesheet.", - epilog="Example: python check_samplesheet.py samplesheet.csv samplesheet.valid.csv", - ) - parser.add_argument( - "file_in", - metavar="FILE_IN", - type=Path, - help="Tabular input samplesheet in CSV or TSV format.", - ) - parser.add_argument( - "file_out", - metavar="FILE_OUT", - type=Path, - help="Transformed output samplesheet in CSV format.", - ) - parser.add_argument( - "-l", - "--log-level", - help="The desired log level (default WARNING).", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), - default="WARNING", - ) - return parser.parse_args(argv) - - -def make_dir(path): - if len(path) > 0: - try: - os.makedirs(path) - except OSError as exception: - if exception.errno != errno.EEXIST: - raise exception - - -def check_samplesheet(file_in, file_out): - """ - sample,alleles,mhc_class,filename - GBM_1,A*01:01;A*02:01;B*07:02;B*24:02;C*03:01;C*04:01,I,gbm_1_anno.vcf|gbm_1_peps.tsv|gbm_1_prot.fasta - GBM_2,A*02:01;A*24:01;B*07:02;B*08:01;C*04:01;C*07:01,I,gbm_2_anno.vcf|gbm_2_peps.tsv|gbm_2_prot.fasta - - or - - sample,alleles,mhc_class,filename - GBM_1,gbm_1_alleles.txt,I,gbm_1_anno.vcf|gbm_1_peps.tsv|gbm_1_prot.fasta - GBM_2,gbm_2_alleles.txt,I,gbm_2_anno.vcf|gbm_2_peps.tsv|gbm_2_prot.fasta - - - where the FileName column contains EITHER a vcf/tsv file with genomic variants, a tsv file (peptides), or a fasta file (proteins) - and the Alleles column contains EITHER a string of alleles separated by semicolon or the path to a text file - containing one allele per line (no header) - - Further examples: - - Class2 allele format => https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/alleles/alleles.DRB1_01_01.txt - - Mouse allele format => 
https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/alleles/alleles.H2.txt - - Peptide format => https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/peptides/peptides.tsv - - Variant TSV format => https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/variants/variants.tsv - - Variant VCF format => https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/variants/variants.vcf - """ - - with open( - file_in, - newline="", - ) as samplesheet: - reader = csv.reader(samplesheet) - - ## Check header - valid_header = ["sample", "alleles", "mhc_class", "filename"] - header = [x.strip('"') for x in samplesheet.readline().strip().split(",")] - if len(header) != 4: - raise ValueError( - f"Invalid number of header columns! Make sure the samplesheet is properly comma-separated." - ) - elif header != valid_header: - raise AssertionError(f"Invalid samplesheet header (valid = {valid_header})!") - - ## Check samplesheet entries - checker = RowChecker() - rows = [] - for i, row in enumerate(reader): - checker.validate(row) - # here an allele check with mhcgnomes would be suitable - row.append(get_file_type(row[3])) - rows.append(row) - - if len(checker.rows) == 0: - raise AssertionError("Samplesheet contains no entries!") - - ## Write validated samplesheet with appropriate columns - out_dir = os.path.dirname(file_out) - make_dir(out_dir) - with open(file_out, "w") as fout: - valid_header.append("inputtype") - fout.write(",".join(valid_header) + "\n") - for row in rows: - fout.write(",".join(row) + "\n") - - -def main(argv=None): - """Coordinate argument parsing and program execution.""" - args = parse_args(argv) - logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") - if not args.file_in.is_file(): - raise AssertionError(f"The given input file {args.file_in} does not exist!") - args.file_out.parent.mkdir(parents=True, exist_ok=True) - 
check_samplesheet(args.file_in, args.file_out) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf deleted file mode 100644 index d1c48c3..0000000 --- a/modules/local/samplesheet_check.nf +++ /dev/null @@ -1,31 +0,0 @@ -process SAMPLESHEET_CHECK { - tag "$samplesheet" - label 'process_single' - - conda "conda-forge::python=3.10.2" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.10.2' : - 'biocontainers/python:3.10.2' }" - - input: - path samplesheet - - output: - path '*.csv' , emit: csv - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: // This script is bundled with the pipeline, in nf-core/epitopeprediction/bin/ - """ - check_samplesheet.py \\ - $samplesheet \\ - samplesheet.valid.csv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - END_VERSIONS - """ -} diff --git a/nextflow_schema.json b/nextflow_schema.json index a983071..c467dc9 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -18,6 +18,7 @@ "exists": true, "mimetype": "text/csv", "pattern": "^\\S+\\.csv$", + "schema": "assets/schema_input.json", "description": "Path to comma-separated file containing information about the samples in the experiment.", "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. 
See [usage docs](https://nf-co.re/epitopeprediction/usage#samplesheet-input).", "fa_icon": "fas fa-file-csv" @@ -51,6 +52,7 @@ "genome_reference": { "type": "string", "default": "grch37", + "enum":["grch37", "grch38"], "help_text": "This defines against which human Ensembl genome reference the pipeline performs the analysis including the incorporation of genetic variants e.g.. If `grch37` or `grch38` are specified the most recent Ensembl Biomart version for genome versions will be used. Alternatively, an Ensembl Biomart (archive) version can be specified, e.g. http://jan2020.archive.ensembl.org/.", "description": "Specifies the Ensembl genome reference version that will be used." }, @@ -105,7 +107,7 @@ }, "max_peptide_length": { "type": "integer", - "default": 11, + "default": 14, "description": "Specifies the maximum peptide length.", "help_text": "Specifies the maximum peptide length (not applied when `--peptides` is specified). Default: MHC class I: 11 aa, MHC class II: 16 aa" }, @@ -117,17 +119,18 @@ }, "max_peptide_length_class2": { "type": "integer", - "default": 16, + "default": 25, "description": "Specifies the maximum peptide length for MHC class II peptides." }, "min_peptide_length_class2": { "type": "integer", - "default": 15, + "default": 9, "description": "Specifies the minimum peptide length for MHC class II peptides." }, "tools": { "type": "string", "default": "syfpeithi", + "pattern": "^(syfpeithi|mhcnuggets-class-1|mhcnuggets-class-2|mhcflurry|netmhc-4.0|netmhcpan-4.0|netmhcpan-4.1|netmhciipan-4.1|netmhc_darwin-4.0|netmhcpan_darwin-4.0|netmhcpan_darwin-4.1|netmhciipan_darwin-4.1)(,(syfpeithi|mhcnuggets-class-1|mhcnuggets-class-2|mhcflurry|netmhc-4.0|netmhcpan-4.0|netmhcpan-4.1|netmhciipan-4.1|netmhc_darwin-4.0|netmhcpan_darwin-4.0|netmhcpan_darwin-4.1|netmhciipan_darwin-4.1))*$", "help_text": "Specifies the tool(s) to use. 
Multiple tools can be combined in a list separated by comma.\nAvailable are: `syfpeithi`, `mhcflurry`, `mhcnuggets-class-1`, `mhcnuggets-class-2`,`netmhcpan-4.0`,`netmhcpan-4.1`,`netmhc-4.0`,`netmhciipan-4.1`.", "description": "Specifies the prediction tool(s) to use." }, @@ -207,26 +210,31 @@ "netmhc_system": { "type": "string", "default": "linux", + "enum": ["linux", "darwin"], "description": "Specifies the operating system in use (Linux or Darwin). This is only necessary if conda is used." }, "netmhcpan_path": { "type": "string", "default": "None", + "pattern": "^\\S+\\.tar\\.gz$", "description": "To use the 'netmhcpan' tool, specify the path to the original software tarball for NetMHCpan 4.0 here." }, "netmhc_path": { "type": "string", "default": "None", + "pattern": "^\\S+\\.tar\\.gz$", "description": "To use the 'netmhc' tool, specify the path to the original software tarball for NetMHC 4.0 here." }, "netmhciipan_path": { "type": "string", "default": "None", + "pattern": "^\\S+\\.tar\\.gz$", "description": "To use the 'netmhciipan' tool, specify the path to the original software tarball for NetMHCIIpan 3.1 here." }, "netmhcii_path": { "type": "string", "default": "None", + "pattern": "^\\S+\\.tar\\.gz$", "description": "To use the 'netmhcii' tool, specify the path to the original software tarball for NetMHCII 2.2 here." 
} } diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf deleted file mode 100644 index af9f467..0000000 --- a/subworkflows/local/input_check.nf +++ /dev/null @@ -1,61 +0,0 @@ -// -// Check input samplesheet and get read channels -// - -include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' - -workflow INPUT_CHECK { - take: - samplesheet // file: /path/to/samplesheet.csv - - main: - SAMPLESHEET_CHECK ( samplesheet ) - .csv - .splitCsv ( header:true ) - .map { get_samplesheet_paths(it) } - .set { meta } - - emit: meta // channel: [ val(meta), [ files ] ] - versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] -} - -// Function to get list of [ meta, filenames ] -def get_samplesheet_paths(LinkedHashMap row) { - //--------- - // Save sample, alleles, mhc_class and file_type in a dictionary (metadata) - // and return a list of meta and the filename. - //--------- - - def meta = [:] - meta.sample = row.sample - meta.alleles = row.alleles - meta.mhcclass = row.mhc_class - meta.inputtype = row.inputtype - - def array = [] - if (!file(row.filename).exists()) { - exit 1, "ERROR: Please check input samplesheet -> file does not exist!\n${row.Filename}" - } else { - array = [ meta, file(row.filename) ] - } - return array -} - -def generate_allele_string(String alleles, String mhcclass) { - // Collect the allele information from the file - def allele_string - valid_class1_loci = ['A*','B*','C*','E*','G*'] - valid_class2_loci = ['DR','DP','DQ'] - if ( alleles.endsWith(".txt") || alleles.endsWith(".alleles") ) { - allele_string = file(alleles).readLines().join(';') - if ((mhcclass == 'I' & valid_class2_loci.any { allele_string.contains(it)}) | - (mhcclass == 'II' & valid_class1_loci.any { allele_string.contains(it)})) { - exit 1, "ERROR: Please check input samplesheet -> invalid mhc class and allele combination found!\n${row.Filename}" - } - } - // or assign the information to a new variable - else { - 
allele_string = alleles - } - return allele_string -} diff --git a/workflows/epitopeprediction.nf b/workflows/epitopeprediction.nf index 849ad28..9dd5253 100644 --- a/workflows/epitopeprediction.nf +++ b/workflows/epitopeprediction.nf @@ -4,7 +4,7 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { paramsSummaryLog; paramsSummaryMap } from 'plugin/nf-validation' +include { paramsSummaryLog; paramsSummaryMap; fromSamplesheet; validateParameters } from 'plugin/nf-validation' def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) def citation = '\n' + WorkflowMain.citation(workflow) + '\n' @@ -63,10 +63,6 @@ include { CSVTK_CONCAT } include { MERGE_JSON as MERGE_JSON_SINGLE } from '../modules/local/merge_json' include { MERGE_JSON as MERGE_JSON_MULTI } from '../modules/local/merge_json' -// -// SUBWORKFLOW: Consisting of a mix of local and nf-core/modules - -include { INPUT_CHECK } from '../subworkflows/local/input_check' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -97,35 +93,39 @@ def external_tools_meta = jsonSlurper.parse(file(params.external_tools_meta, che workflow EPITOPEPREDICTION { + validateParameters() + ch_versions = Channel.empty() ch_software_versions = Channel.empty() // Non-free prediction tools ch_nonfree_paths = Channel.empty() - // - // SUBWORKFLOW: Read in samplesheet, validate and stage input files - // - INPUT_CHECK ( - file(params.input) - ) - ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) - // TODO: OPTIONAL, you can use nf-validation plugin to create an input channel from the samplesheet with Channel.fromSamplesheet("input") - // See the documentation https://nextflow-io.github.io/nf-validation/samplesheets/fromSamplesheet/ - // ! 
There is currently no tooling to help you write a sample sheet schema - - INPUT_CHECK.out.meta - .branch { - meta_data, input_file -> - variant_compressed : meta_data.inputtype == 'variant_compressed' - return [ meta_data, input_file ] - variant_uncompressed : meta_data.inputtype == 'variant' - return [ meta_data, input_file ] - peptide : meta_data.inputtype == 'peptide' - return [ meta_data, input_file ] - protein : meta_data.inputtype == 'protein' - return [ meta_data, input_file ] - } - .set { ch_samples_from_sheet } + // Function to read the alleles from a file or stage it from url + def readAlleles = { input -> + if (input.endsWith(".txt")) { + def file = file(input) + // Alleles are listed in the first line of the file + return file.readLines().get(0) + } else { + // Not a file path, return the original string + return input + } + } + + ch_input = Channel.fromSamplesheet("input") + ch_input + .branch { + sample, alleles, mhc_class, filename -> + def allele_list = readAlleles(alleles) + variant_compressed : filename.endsWith('.vcf.gz') + return [[sample:sample, alleles:allele_list, mhc_class:mhc_class, inputtype:'variant_compressed'], filename ] + variant_uncompressed : filename.endsWith('.vcf') || filename.endsWith('.GSvar') + return [[sample:sample, alleles:allele_list, mhc_class:mhc_class, inputtype:'variant'], filename ] + peptide : filename.endsWith('.tsv') + return [[sample:sample, alleles:allele_list, mhc_class:mhc_class, inputtype:'peptide'], filename ] + protein : filename.endsWith('.fasta') || filename.endsWith('.fa') + return [[sample:sample, alleles:allele_list, mhc_class:mhc_class, inputtype:'protein'], filename ] + } .set { ch_samples_from_sheet } // gunzip variant files GUNZIP_VCF ( From 73964727652af580f39931a6d8b2e5a94d21d005 Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Tue, 5 Dec 2023 11:07:53 +0000 Subject: [PATCH 06/13] remove conda check, threw warning --- conf/modules.config | 8 ------ nextflow.config | 2 -- 
workflows/epitopeprediction.nf | 45 ++++++++++++++++------------------ 3 files changed, 21 insertions(+), 34 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 9fc3089..2704650 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -20,14 +20,6 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - withName: SAMPLESHEET_CHECK { - publishDir = [ - path: { "${params.outdir}/pipeline_info" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, diff --git a/nextflow.config b/nextflow.config index fd47846..34282d0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -94,8 +94,6 @@ params { validationShowHiddenParams = false validate_params = true - conda.enabled = false - } // Load base.config by default for all pipelines diff --git a/workflows/epitopeprediction.nf b/workflows/epitopeprediction.nf index 9dd5253..410a9bd 100644 --- a/workflows/epitopeprediction.nf +++ b/workflows/epitopeprediction.nf @@ -131,7 +131,7 @@ workflow EPITOPEPREDICTION { GUNZIP_VCF ( ch_samples_from_sheet.variant_compressed ) - ch_versions = ch_versions.mix(GUNZIP_VCF.out.versions.ifEmpty(null)) + ch_versions = ch_versions.mix(GUNZIP_VCF.out.versions) ch_variants_uncompressed = GUNZIP_VCF.out.gunzip .mix(ch_samples_from_sheet.variant_uncompressed) @@ -152,9 +152,6 @@ workflow EPITOPEPREDICTION { if (tools.isEmpty()) { exit 1, "No valid tools specified." } - if (params.conda.enabled && params.tools.contains("netmhc")) { - log.warn("Please note: if you want to use external prediction tools with conda it might be necessary to set --netmhc_system to darwin depending on your system.") - } c_purple = params.monochrome_logs ? '' : "\033[0;35m"; c_reset = params.monochrome_logs ? 
'' : "\033[0m"; @@ -168,7 +165,7 @@ workflow EPITOPEPREDICTION { // get versions of all prediction tools GET_PREDICTION_VERSIONS(ch_external_versions.ifEmpty("")) - ch_prediction_tool_versions = GET_PREDICTION_VERSIONS.out.versions.ifEmpty(null) + ch_prediction_tool_versions = GET_PREDICTION_VERSIONS.out.versions // TODO I guess it would be better to have two subworkflows for the if else parts (CM) if (params.show_supported_models) { @@ -179,7 +176,7 @@ workflow EPITOPEPREDICTION { .combine(ch_prediction_tool_versions) .first() ) - ch_versions = ch_versions.mix(SHOW_SUPPORTED_MODELS.out.versions.ifEmpty(null)) + ch_versions = ch_versions.mix(SHOW_SUPPORTED_MODELS.out.versions) } else { @@ -195,14 +192,14 @@ workflow EPITOPEPREDICTION { ch_samples_uncompressed.variant, ch_prediction_tool_versions ) - ch_versions = ch_versions.mix(EPYTOPE_CHECK_REQUESTED_MODELS.out.versions.ifEmpty(null)) + ch_versions = ch_versions.mix(EPYTOPE_CHECK_REQUESTED_MODELS.out.versions) // perform the check requested models on the protein files EPYTOPE_CHECK_REQUESTED_MODELS_PROTEIN( ch_samples_uncompressed.protein, ch_prediction_tool_versions ) - ch_versions = ch_versions.mix(EPYTOPE_CHECK_REQUESTED_MODELS_PROTEIN.out.versions.ifEmpty(null)) + ch_versions = ch_versions.mix(EPYTOPE_CHECK_REQUESTED_MODELS_PROTEIN.out.versions) // perform the check requested models on the peptide file where we need the input itself to determine the given peptide lengths EPYTOPE_CHECK_REQUESTED_MODELS_PEP( ch_samples_uncompressed @@ -210,7 +207,7 @@ workflow EPITOPEPREDICTION { .map { meta_data, input_file -> tuple( meta_data, input_file ) }, ch_prediction_tool_versions ) - ch_versions = ch_versions.mix(EPYTOPE_CHECK_REQUESTED_MODELS_PEP.out.versions.ifEmpty(null)) + ch_versions = ch_versions.mix(EPYTOPE_CHECK_REQUESTED_MODELS_PEP.out.versions) // Return a warning if this is raised EPYTOPE_CHECK_REQUESTED_MODELS @@ -273,7 +270,7 @@ workflow EPITOPEPREDICTION { EXTERNAL_TOOLS_IMPORT( ch_nonfree_paths ) - 
ch_versions = ch_versions.mix(EXTERNAL_TOOLS_IMPORT.out.versions.ifEmpty(null)) + ch_versions = ch_versions.mix(EXTERNAL_TOOLS_IMPORT.out.versions) /* ======================================================================================== @@ -299,7 +296,7 @@ workflow EPITOPEPREDICTION { ch_variants.vcf ) .set { ch_split_variants } - ch_versions = ch_versions.mix( VARIANT_SPLIT.out.versions.ifEmpty(null) ) + ch_versions = ch_versions.mix( VARIANT_SPLIT.out.versions ) } else { @@ -307,32 +304,32 @@ workflow EPITOPEPREDICTION { ch_variants.vcf ) .set { ch_split_variants } - ch_versions = ch_versions.mix( SNPSIFT_SPLIT.out.versions.ifEmpty(null) ) + ch_versions = ch_versions.mix( SNPSIFT_SPLIT.out.versions ) } // include the csvtk_split function (only variant files with an tsv and GSvar executable) CSVTK_SPLIT( ch_variants.tab ) - ch_versions = ch_versions.mix( CSVTK_SPLIT.out.versions.ifEmpty(null) ) + ch_versions = ch_versions.mix( CSVTK_SPLIT.out.versions ) // process FASTA file and generated peptides EPYTOPE_GENERATE_PEPTIDES( ch_samples_uncompressed.protein ) - ch_versions = ch_versions.mix(EPYTOPE_GENERATE_PEPTIDES.out.versions.ifEmpty(null)) + ch_versions = ch_versions.mix(EPYTOPE_GENERATE_PEPTIDES.out.versions) SPLIT_PEPTIDES_PROTEIN( EPYTOPE_GENERATE_PEPTIDES.out.splitted ) - ch_versions = ch_versions.mix(SPLIT_PEPTIDES_PROTEIN.out.versions.ifEmpty(null)) + ch_versions = ch_versions.mix(SPLIT_PEPTIDES_PROTEIN.out.versions) // split peptide data SPLIT_PEPTIDES_PEPTIDES( ch_samples_uncompressed.peptide ) - ch_versions = ch_versions.mix( SPLIT_PEPTIDES_PEPTIDES.out.versions.ifEmpty(null) ) + ch_versions = ch_versions.mix( SPLIT_PEPTIDES_PEPTIDES.out.versions ) /* ======================================================================================== @@ -350,7 +347,7 @@ workflow EPITOPEPREDICTION { .transpose(), EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]) ) - ch_versions = ch_versions.mix( 
EPYTOPE_PEPTIDE_PREDICTION_PROTEIN.out.versions.ifEmpty(null) ) + ch_versions = ch_versions.mix( EPYTOPE_PEPTIDE_PREDICTION_PROTEIN.out.versions ) // Run epitope prediction for peptides @@ -362,7 +359,7 @@ workflow EPITOPEPREDICTION { .transpose(), EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]) ) - ch_versions = ch_versions.mix( EPYTOPE_PEPTIDE_PREDICTION_PEP.out.versions.ifEmpty(null) ) + ch_versions = ch_versions.mix( EPYTOPE_PEPTIDE_PREDICTION_PEP.out.versions ) // Run epitope prediction for variants @@ -375,7 +372,7 @@ workflow EPITOPEPREDICTION { .transpose(), EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]) ) - ch_versions = ch_versions.mix( EPYTOPE_PEPTIDE_PREDICTION_VAR.out.versions.ifEmpty(null) ) + ch_versions = ch_versions.mix( EPYTOPE_PEPTIDE_PREDICTION_VAR.out.versions ) // Combine the predicted files and save them in a branch to make a distinction between samples with single and multi files EPYTOPE_PEPTIDE_PREDICTION_PEP @@ -397,12 +394,12 @@ workflow EPITOPEPREDICTION { CAT_TSV( ch_predicted_peptides.single ) - ch_versions = ch_versions.mix( CAT_TSV.out.versions.ifEmpty(null) ) + ch_versions = ch_versions.mix( CAT_TSV.out.versions ) CSVTK_CONCAT( ch_predicted_peptides.multi ) - ch_versions = ch_versions.mix( CSVTK_CONCAT.out.versions.ifEmpty(null) ) + ch_versions = ch_versions.mix( CSVTK_CONCAT.out.versions ) // Combine protein sequences CAT_FASTA( @@ -412,7 +409,7 @@ workflow EPITOPEPREDICTION { .mix( EPYTOPE_PEPTIDE_PREDICTION_VAR.out.fasta, EPYTOPE_PEPTIDE_PREDICTION_PROTEIN.out.fasta ) .groupTuple() ) - ch_versions = ch_versions.mix( CAT_FASTA.out.versions.ifEmpty(null) ) + ch_versions = ch_versions.mix( CAT_FASTA.out.versions ) EPYTOPE_PEPTIDE_PREDICTION_PEP .out @@ -434,12 +431,12 @@ workflow EPITOPEPREDICTION { MERGE_JSON_SINGLE( ch_json_reports.single ) - ch_versions = ch_versions.mix( MERGE_JSON_SINGLE.out.versions.ifEmpty(null) ) + ch_versions = ch_versions.mix( MERGE_JSON_SINGLE.out.versions ) MERGE_JSON_MULTI( 
ch_json_reports.multi ) - ch_versions = ch_versions.mix( MERGE_JSON_MULTI.out.versions.ifEmpty(null) ) + ch_versions = ch_versions.mix( MERGE_JSON_MULTI.out.versions ) // // MODULE: Pipeline reporting From 0b1358aba3c1cee168a3a1d1fa8ab5c27557cbee Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Tue, 5 Dec 2023 11:13:35 +0000 Subject: [PATCH 07/13] switch back to old len settings --- nextflow_schema.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index c467dc9..6cb51bf 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -107,7 +107,7 @@ }, "max_peptide_length": { "type": "integer", - "default": 14, + "default": 11, "description": "Specifies the maximum peptide length.", "help_text": "Specifies the maximum peptide length (not applied when `--peptides` is specified). Default: MHC class I: 11 aa, MHC class II: 16 aa" }, @@ -119,12 +119,12 @@ }, "max_peptide_length_class2": { "type": "integer", - "default": 25, + "default": 16, "description": "Specifies the maximum peptide length for MHC class II peptides." }, "min_peptide_length_class2": { "type": "integer", - "default": 9, + "default": 15, "description": "Specifies the minimum peptide length for MHC class II peptides."
}, "tools": { From 3f3bcf97cc2ef258bd78260d6622d7055d4a4d3f Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Tue, 5 Dec 2023 11:35:15 +0000 Subject: [PATCH 08/13] add changelog, fix lint --- CHANGELOG.md | 1 + assets/schema_input.json | 4 ++-- nextflow_schema.json | 14 +++++++++----- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dcb7f43..866c94e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Changed` +- [#220](https://github.com/nf-core/epitopeprediction/pull/220) - Switch to nf-validation to parse samplesheet - [#213](https://github.com/nf-core/epitopeprediction/pull/203) - Rename param `genome_version` to `genome_reference`, add functionality to handle BioMart archive urls - [#213](https://github.com/nf-core/epitopeprediction/pull/203) - Update to nf-core template `2.10` - [#203](https://github.com/nf-core/epitopeprediction/pull/203) - Update to nf-core template `2.9` diff --git a/assets/schema_input.json b/assets/schema_input.json index 3c6bdc4..df0fd34 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -28,10 +28,10 @@ }, { "type": "string", - "pattern": "^((DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2}[-](DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2})(;((DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2}[-](DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2}))*$", + "pattern": "^((DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2}[-](DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2})(;((DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2}[-](DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2}))*$" }, { - "type":"string", + "type": "string", "pattern": "^[H][-][2][-][A-Za-z]{2,3}$" } ], diff --git a/nextflow_schema.json b/nextflow_schema.json index 6cb51bf..4180cfd 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -52,7 +52,7 @@ "genome_reference": { "type": "string", "default": "grch37", - "enum":["grch37", "grch38"], + "enum": ["grch37", "grch38"], 
"help_text": "This defines against which human Ensembl genome reference the pipeline performs the analysis including the incorporation of genetic variants e.g.. If `grch37` or `grch38` are specified the most recent Ensembl Biomart version for genome versions will be used. Alternatively, an Ensembl Biomart (archive) version can be specified, e.g. http://jan2020.archive.ensembl.org/.", "description": "Specifies the Ensembl genome reference version that will be used." }, @@ -215,25 +215,29 @@ }, "netmhcpan_path": { "type": "string", - "default": "None", + "format": "file-path", + "exists": true, "pattern": "^\\S+\\.tar\\.gz$", "description": "To use the 'netmhcpan' tool, specify the path to the original software tarball for NetMHCpan 4.0 here." }, "netmhc_path": { "type": "string", - "default": "None", + "format": "file-path", + "exists": true, "pattern": "^\\S+\\.tar\\.gz$", "description": "To use the 'netmhc' tool, specify the path to the original software tarball for NetMHC 4.0 here." }, "netmhciipan_path": { "type": "string", - "default": "None", + "format": "file-path", + "exists": true, "pattern": "^\\S+\\.tar\\.gz$", "description": "To use the 'netmhciipan' tool, specify the path to the original software tarball for NetMHCIIpan 3.1 here." }, "netmhcii_path": { "type": "string", - "default": "None", + "format": "file-path", + "exists": true, "pattern": "^\\S+\\.tar\\.gz$", "description": "To use the 'netmhcii' tool, specify the path to the original software tarball for NetMHCII 2.2 here." 
} From 1ffdacb4e89cb7268bebc4a424989340f5dce0b0 Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Tue, 5 Dec 2023 13:05:53 +0000 Subject: [PATCH 09/13] change internal mhcclass to mhc_class for consistency --- bin/check_requested_models.py | 2 +- modules/local/epytope_check_requested_models.nf | 4 ++-- modules/local/epytope_generate_peptides.nf | 4 ++-- modules/local/epytope_peptide_prediction.nf | 10 +++++----- nextflow_schema.json | 1 - workflows/epitopeprediction.nf | 4 ++-- 6 files changed, 12 insertions(+), 13 deletions(-) diff --git a/bin/check_requested_models.py b/bin/check_requested_models.py index bc7a485..342505f 100755 --- a/bin/check_requested_models.py +++ b/bin/check_requested_models.py @@ -54,7 +54,7 @@ def __main__(): "Write out information about supported models by epytope for installed predictor tool versions." ) parser.add_argument("-p", "--peptides", help="File with one peptide per line") - parser.add_argument("-c", "--mhcclass", default=1, help="MHC class I or II") + parser.add_argument("-c", "--mhc_class", default=1, help="MHC class I or II") parser.add_argument("-l", "--max_length", help="Maximum peptide length", type=int) parser.add_argument("-ml", "--min_length", help="Minimum peptide length", type=int) parser.add_argument("-a", "--alleles", help=" MHC Alleles", required=True, type=str) diff --git a/modules/local/epytope_check_requested_models.nf b/modules/local/epytope_check_requested_models.nf index f78a77a..8438c4d 100644 --- a/modules/local/epytope_check_requested_models.nf +++ b/modules/local/epytope_check_requested_models.nf @@ -30,8 +30,8 @@ process EPYTOPE_CHECK_REQUESTED_MODELS { } def prefix = task.ext.suffix ? "${meta.sample}_${task.ext.suffix}" : "${meta.sample}_peptides" - def min_length = ("${meta.mhcclass}" == "I") ? params.min_peptide_length : params.min_peptide_length_class2 - def max_length = ("${meta.mhcclass}" == "I") ? 
params.max_peptide_length : params.max_peptide_length_class2 + def min_length = ("${meta.mhc_class}" == "I") ? params.min_peptide_length : params.min_peptide_length_class2 + def max_length = ("${meta.mhc_class}" == "I") ? params.max_peptide_length : params.max_peptide_length_class2 """ check_requested_models.py ${argument} \ diff --git a/modules/local/epytope_generate_peptides.nf b/modules/local/epytope_generate_peptides.nf index 401afa7..d79ce4a 100644 --- a/modules/local/epytope_generate_peptides.nf +++ b/modules/local/epytope_generate_peptides.nf @@ -19,8 +19,8 @@ process EPYTOPE_GENERATE_PEPTIDES { script: def prefix = task.ext.suffix ? "${meta.sample}_${task.ext.suffix}" : "${meta.sample}_peptides" - def min_length = (meta.mhcclass == "I") ? params.min_peptide_length : params.min_peptide_length_class2 - def max_length = (meta.mhcclass == "I") ? params.max_peptide_length : params.max_peptide_length_class2 + def min_length = (meta.mhc_class == "I") ? params.min_peptide_length : params.min_peptide_length_class2 + def max_length = (meta.mhc_class == "I") ? params.max_peptide_length : params.max_peptide_length_class2 """ gen_peptides.py --input ${raw} \\ diff --git a/modules/local/epytope_peptide_prediction.nf b/modules/local/epytope_peptide_prediction.nf index 8b8ca73..ef0c07a 100644 --- a/modules/local/epytope_peptide_prediction.nf +++ b/modules/local/epytope_peptide_prediction.nf @@ -47,14 +47,14 @@ process EPYTOPE_PEPTIDE_PREDICTION { def class1_tools = tools_split.findAll { ! 
it.matches('.*(?i)(class-2|ii).*') } def class2_tools = tools_split.findAll { it.matches('.*(?i)(syf|class-2|ii).*') } - if (((meta.mhcclass == "I") & class1_tools.empty) | ((meta.mhcclass == "II") & class2_tools.empty)) { - exit 1, "No tools specified for mhc class ${meta.mhcclass}" + if (((meta.mhc_class == "I") & class1_tools.empty) | ((meta.mhc_class == "II") & class2_tools.empty)) { + exit 1, "No tools specified for mhc class ${meta.mhc_class}" } - def min_length = (meta.mhcclass == "I") ? params.min_peptide_length : params.min_peptide_length_class2 - def max_length = (meta.mhcclass == "I") ? params.max_peptide_length : params.max_peptide_length_class2 + def min_length = (meta.mhc_class == "I") ? params.min_peptide_length : params.min_peptide_length_class2 + def max_length = (meta.mhc_class == "I") ? params.max_peptide_length : params.max_peptide_length_class2 - def tools_to_use = ((meta.mhcclass == "I") | (meta.mhcclass == "H-2")) ? class1_tools.join(',') : class2_tools.join(',') + def tools_to_use = ((meta.mhc_class == "I") | (meta.mhc_class == "H-2")) ? class1_tools.join(',') : class2_tools.join(',') """ # create folder for MHCflurry downloads to avoid permission problems when running pipeline with docker profile and mhcflurry selected diff --git a/nextflow_schema.json b/nextflow_schema.json index 4180cfd..e69966b 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -52,7 +52,6 @@ "genome_reference": { "type": "string", "default": "grch37", - "enum": ["grch37", "grch38"], "help_text": "This defines against which human Ensembl genome reference the pipeline performs the analysis including the incorporation of genetic variants e.g.. If `grch37` or `grch38` are specified the most recent Ensembl Biomart version for genome versions will be used. Alternatively, an Ensembl Biomart (archive) version can be specified, e.g. http://jan2020.archive.ensembl.org/.", "description": "Specifies the Ensembl genome reference version that will be used." 
}, diff --git a/workflows/epitopeprediction.nf b/workflows/epitopeprediction.nf index 410a9bd..2d60755 100644 --- a/workflows/epitopeprediction.nf +++ b/workflows/epitopeprediction.nf @@ -283,9 +283,9 @@ workflow EPITOPEPREDICTION { .variant .branch { meta_data, input_file -> - vcf : input_file.extension == 'vcf' || input_file.extension == 'vcf.gz' + vcf : input_file.endsWith('.vcf') || input_file.endsWith('.vcf.gz') return [ meta_data, input_file ] - tab : input_file.extension == 'tsv' || input_file.extension == 'GSvar' + tab : input_file.endsWith('.tsv') || input_file.endsWith('.GSvar') return [ meta_data, input_file ] } .set { ch_variants } From 400b831b83cab39867be7341f651ce35714ef0a3 Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Wed, 13 Dec 2023 12:33:30 +0000 Subject: [PATCH 10/13] Add check for mhc_class-allele combination --- workflows/epitopeprediction.nf | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/workflows/epitopeprediction.nf b/workflows/epitopeprediction.nf index f3a8bf0..7b2cf8f 100644 --- a/workflows/epitopeprediction.nf +++ b/workflows/epitopeprediction.nf @@ -90,6 +90,17 @@ import groovy.json.JsonSlurper def jsonSlurper = new JsonSlurper() def external_tools_meta = jsonSlurper.parse(file(params.external_tools_meta, checkIfExists: true)) +// Function to check if the alleles are valid for the given mhc class +def validate_alleles(String alleles, String mhc_class) { + valid_class1_loci = ['A*','B*','C*','E*','G*'] + valid_class2_loci = ['DR','DP','DQ'] + allele_list = alleles.split(';') + if (( mhc_class == 'I' & allele_list.every { allele -> valid_class2_loci.any { allele.startsWith(it) }}) | + ( mhc_class == 'II' & allele_list.every { allele -> valid_class1_loci.any { allele.startsWith(it) }})) { + exit 1, "Please check input samplesheet -> Invalid mhc class ${mhc_class} and allele combination ${allele_list} found!" 
+ } +} + workflow EPITOPEPREDICTION { validateParameters() @@ -116,6 +127,7 @@ workflow EPITOPEPREDICTION { .branch { sample, alleles, mhc_class, filename -> def allele_list = readAlleles(alleles) + validate_alleles(allele_list, mhc_class) variant_compressed : filename.endsWith('.vcf.gz') return [[sample:sample, alleles:allele_list, mhc_class:mhc_class, inputtype:'variant_compressed'], filename ] variant_uncompressed : filename.endsWith('.vcf') || filename.endsWith('.GSvar') From a34c450d8f1749092055928405ef321007e2baab Mon Sep 17 00:00:00 2001 From: Jonas Scheid <43858870+jonasscheid@users.noreply.github.com> Date: Mon, 18 Dec 2023 10:39:04 +0100 Subject: [PATCH 11/13] Remove txt filename input from schema_input.json --- assets/schema_input.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/assets/schema_input.json b/assets/schema_input.json index df0fd34..c0728a6 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -44,8 +44,8 @@ }, "filename": { "type": "string", - "pattern": "^\\S+\\.(vcf|vcf.gz|tsv|fasta|fa|txt)$", - "errorMessage": "Variants/proteins/peptides for sample must be provided and have one of the following extensions: '.vcf', '.vcf.gz', '.tsv', '.fasta', '.fa', '.txt'" + "pattern": "^\\S+\\.(vcf|vcf.gz|tsv|fasta|fa)$", + "errorMessage": "Variants/proteins/peptides for sample must be provided and have one of the following extensions: '.vcf', '.vcf.gz', '.tsv', '.fasta', '.fa'" } }, "required": ["sample", "alleles", "mhc_class", "filename"] From 74da74b7edaf49ad609bc2c4a856a408586c554f Mon Sep 17 00:00:00 2001 From: Jonas Scheid <43858870+jonasscheid@users.noreply.github.com> Date: Tue, 19 Dec 2023 12:26:26 +0100 Subject: [PATCH 12/13] Update workflows/epitopeprediction.nf Co-authored-by: Christopher Mohr --- workflows/epitopeprediction.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/epitopeprediction.nf b/workflows/epitopeprediction.nf index 7b2cf8f..3e03824 100644 --- 
a/workflows/epitopeprediction.nf +++ b/workflows/epitopeprediction.nf @@ -130,7 +130,7 @@ workflow EPITOPEPREDICTION { validate_alleles(allele_list, mhc_class) variant_compressed : filename.endsWith('.vcf.gz') return [[sample:sample, alleles:allele_list, mhc_class:mhc_class, inputtype:'variant_compressed'], filename ] - variant_uncompressed : filename.endsWith('.vcf') || filename.endsWith('.GSvar') + variant_uncompressed : filename.endsWith('.vcf') return [[sample:sample, alleles:allele_list, mhc_class:mhc_class, inputtype:'variant'], filename ] peptide : filename.endsWith('.tsv') return [[sample:sample, alleles:allele_list, mhc_class:mhc_class, inputtype:'peptide'], filename ] From 693bd500fe3efb92c94aaa873cce2546d39d6a5b Mon Sep 17 00:00:00 2001 From: Jonas Scheid <43858870+jonasscheid@users.noreply.github.com> Date: Tue, 19 Dec 2023 12:26:33 +0100 Subject: [PATCH 13/13] Update workflows/epitopeprediction.nf Co-authored-by: Christopher Mohr --- workflows/epitopeprediction.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/epitopeprediction.nf b/workflows/epitopeprediction.nf index 3e03824..00549dd 100644 --- a/workflows/epitopeprediction.nf +++ b/workflows/epitopeprediction.nf @@ -110,7 +110,7 @@ workflow EPITOPEPREDICTION { // Non-free prediction tools ch_nonfree_paths = Channel.empty() - // Function to read the alleles from a file or stage it from url + // Function to read the alleles from a file or use given string def readAlleles = { input -> if (input.endsWith(".txt")) { def file = file(input)