resolve conflicts

nf-core · Dec 13, 2023 · db037bd · db037bd
2 parents 1ffdacb + ee311d9
commit db037bd
Show file tree

Hide file tree

Showing 10 changed files with 18 additions and 263 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -55,15 +55,7 @@ jobs:
           - NXF_VER: ""
             NXF_EDGE: "1"
         tests:
-          [
-            "test_variant_tsv",
-            "test_grch38_variant_tsv",
-            "test_peptides",
-            "test_peptides_h2",
-            "test_proteins",
-            "test_mhcnuggets",
-            "test_mhcflurry",
-          ]
+          ["test_grch38", "test_peptides", "test_peptides_h2", "test_proteins", "test_mhcnuggets", "test_mhcflurry"]
     steps:
       - name: Check out pipeline code
         uses: actions/checkout@v2

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -18,6 +18,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - [#219](https://github.com/nf-core/epitopeprediction/pull/219) - Fix `EXTERNAL_TOOLS_IMPORT` container registry and bump version
 
+### `Removed`
+
+- [#221](https://github.com/nf-core/epitopeprediction/pull/221) - Remove support for `GSvar` and variant `tsv` input files
+
 ## v2.2.1 - WaldhaeuserOst Hotfix - 2023-03-16
 
 ### `Fixed`

diff --git a/bin/epaa.py b/bin/epaa.py
@@ -70,26 +70,6 @@ def get_epytope_annotation(vt, p, r, alt):
     return position, reference, alternative
 
 
-def check_min_req_GSvar(row):
-    """
-    checking the presence of mandatory columns
-    :param row: dictionary of a GSvar row
-    :return: boolean, True if min req met
-    """
-    if (
-        "#chr" in row.keys()
-        and "start" in row.keys()
-        and "end" in row.keys()
-        and "ref" in row.keys()
-        and "obs" in row.keys()
-        and (
-            "coding_and_splicing_details" in row.keys() or "coding" in row.keys() or "coding_and_splicing" in row.keys()
-        )
-    ):
-        return True
-    return False
-
-
 def determine_variant_type(record, alternative):
     vt = VariationType.UNKNOWN
     if record.is_snp:
@@ -129,152 +109,6 @@ def determine_zygosity(record):
     return isHomozygous
 
 
-def read_GSvar(filename, pass_only=True):
-    """
-    reads GSvar and tsv files (tab sep files in context of genetic variants), omitting and warning about rows missing
-    mandatory columns
-    :param filename: /path/to/file
-    :return: list epytope variants
-    """
-    global ID_SYSTEM_USED
-    RE = re.compile("(\w+):([\w.]+):([&\w]+):\w*:exon(\d+)\D*\d*:(c.\D*([_\d]+)\D*):(p.\D*(\d+)\w*)")
-
-    # list of mandatory (meta)data
-    exclusion_list = [
-        "start",
-        "end",
-        "#chr",
-        "ref",
-        "obs",
-        "gene",
-        "tumour_genotype",
-        "coding_and_splicing_details",
-        "variant_details",
-        "variant_type",
-        "coding_and_splicing",
-    ]
-
-    list_vars = list()
-    lines = list()
-    transcript_ids = []
-    dict_vars = {}
-
-    cases = 0
-
-    with open(filename, "rt") as tsvfile:
-        tsvreader = csv.DictReader((row for row in tsvfile if not row.startswith("##")), delimiter="\t")
-        for row in tsvreader:
-            if not check_min_req_GSvar(row):
-                logger.warning("read_GSvar: Omitted row! Mandatory columns not present in: \n" + str(row) + ".")
-                continue
-            lines.append(row)
-
-    # get list of additional metadata
-    metadata_list = set(tsvreader.fieldnames) - set(exclusion_list)
-
-    for mut_id, line in enumerate(lines):
-        if "filter" in line and pass_only and line["filter"].strip():
-            continue
-        genome_start = int(line["start"]) - 1
-        genome_stop = int(line["end"]) - 1
-        chrom = line["#chr"]
-        ref = line["ref"]
-        alt = line["obs"]
-        gene = line.get("gene", "")
-
-        isHomozygous = (
-            True
-            if (
-                ("tumour_genotype" in line)
-                and (line["tumour_genotype"].split("/")[0] == line["tumour_genotype"].split("/")[1])
-            )
-            else False
-        )
-
-        # old GSvar version
-        if "coding_and_splicing_details" in line:
-            mut_type = line.get("variant_details", "")
-            annots = RE.findall(line["coding_and_splicing_details"])
-        else:
-            mut_type = line.get("variant_type", "")
-            # Gene, transcript number, type, impact, exon/intron number, HGVS.c, HGVS.p, Pfam
-            annots = RE.findall(line["coding_and_splicing"])
-        isyn = mut_type == "synonymous_variant"
-
-        """
-        Enum for variation types:
-        type.SNP, type.DEL, type.INS, type.FSDEL, type.FSINS, type.UNKNOWN
-        """
-        vt = VariationType.UNKNOWN
-        if mut_type == "missense_variant" or "missense_variant" in mut_type:
-            vt = VariationType.SNP
-        elif mut_type == "frameshift_variant":
-            if (ref == "-") or (len(ref) < len(alt)):
-                vt = VariationType.FSINS
-            else:
-                vt = VariationType.FSDEL
-        elif mut_type == "inframe_deletion":
-            vt = VariationType.DEL
-        elif mut_type == "inframe_insertion":
-            vt = VariationType.INS
-
-        coding = dict()
-
-        for annot in annots:
-            a_gene, transcript_id, a_mut_type, exon, trans_coding, trans_pos, prot_coding, prot_start = annot
-            if "NM" in transcript_id:
-                ID_SYSTEM_USED = EIdentifierTypes.REFSEQ
-            if "stop_gained" not in mut_type:
-                if not gene:
-                    gene = a_gene
-                if not mut_type:
-                    mut_type = a_mut_type
-
-                # with the latest epytope release (3.3.1), we can now handle full transcript IDs
-                coding[transcript_id] = MutationSyntax(
-                    transcript_id, int(trans_pos.split("_")[0]) - 1, int(prot_start) - 1, trans_coding, prot_coding
-                )
-                transcript_ids.append(transcript_id)
-        if coding:
-            var = Variant(
-                mut_id,
-                vt,
-                chrom.strip("chr"),
-                int(genome_start),
-                ref.upper(),
-                alt.upper(),
-                coding,
-                isHomozygous,
-                isSynonymous=isyn,
-            )
-            var.gene = gene
-
-            # metadata logging
-            for meta_name in metadata_list:
-                var.log_metadata(meta_name, line.get(meta_name, ""))
-
-            dict_vars[var] = var
-            list_vars.append(var)
-
-    transToVar = {}
-
-    # fix because of memory/timing issues due to combinatorial explosion
-    for variant in list_vars:
-        for trans_id in variant.coding.keys():
-            transToVar.setdefault(trans_id, []).append(variant)
-
-    for tId, vs in transToVar.items():
-        if len(vs) > 10:
-            cases += 1
-            for v in vs:
-                vs_new = Variant(v.id, v.type, v.chrom, v.genomePos, v.ref, v.obs, v.coding, True, v.isSynonymous)
-                vs_new.gene = v.gene
-                for m in metadata_list:
-                    vs_new.log_metadata(m, v.get_metadata(m)[0])
-                dict_vars[v] = vs_new
-    return dict_vars.values(), transcript_ids, metadata_list
-
-
 def read_vcf(filename, pass_only=True):
     """
     reads vcf files
@@ -1224,18 +1058,19 @@ def __main__():
         logger.info("Running epaa for peptides...")
         peptides, metadata = read_peptide_input(args.peptides)
     else:
-        if args.somatic_mutations.endswith(".GSvar") or args.somatic_mutations.endswith(".tsv"):
-            logger.info("Running epaa for variants...")
-            variant_list, transcripts, metadata = read_GSvar(args.somatic_mutations)
-        elif args.somatic_mutations.endswith(".vcf"):
+        logger.info("Running epaa for variants...")
+        if args.somatic_mutations.endswith(".vcf"):
             variant_list, transcripts, metadata = read_vcf(args.somatic_mutations)
+        else:
+            raise ValueError("File is not in VCF format. Please provide a VCF file.")
 
         transcripts = list(set(transcripts))
 
         # use function provided by epytope to retrieve protein IDs (different systems) for transcript IDs
         transcriptProteinTable = ma.get_protein_ids_from_transcripts(transcripts, type=ID_SYSTEM_USED)
 
     # get the alleles
+    # TODO: remove this in PR of nf-validation
     if args.alleles.startswith("http"):
         alleles = [Allele(a) for a in urllib.request.urlopen(args.alleles).read().decode("utf-8").splitlines()]
     elif args.alleles.endswith(".txt"):

diff --git a/conf/modules.config b/conf/modules.config
@@ -140,13 +140,6 @@ process {
         ]
     }
 
-    withName: CSVTK_SPLIT {
-        publishDir = [
-            path: { "${params.outdir}/split_input/${meta.sample}" },
-            mode: params.publish_dir_mode
-        ]
-    }
-
     withName: GET_PREDICTION_VERSIONS {
         publishDir = [
             path: { "${params.outdir}/reports" },

diff --git a/conf/test_grch38_variant_tsv.config → conf/test_grch38.config b/conf/test_grch38_variant_tsv.config → conf/test_grch38.config
@@ -4,7 +4,7 @@
  * -------------------------------------------------
  * Defines bundled input files and everything required
  * to run a fast and simple test. Use as follows:
- *   nextflow run nf-core/epitopeprediction -profile test_grch38_variant_tsv,<docker/singularity> --outdir <OUTDIR>
+ *   nextflow run nf-core/epitopeprediction -profile test_grch38,<docker/singularity> --outdir <OUTDIR>
  */
 
 params {
@@ -13,6 +13,6 @@ params {
     max_time = 48.h
 
     // Input data
-    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/sample_sheets/sample_sheet_variants_tab.csv'
+    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/sample_sheets/sample_sheet_variants.csv'
     genome_version = 'GRCh38'
 }
diff --git a/conf/test_variant_tsv.config b/conf/test_variant_tsv.config
diff --git a/docs/usage.md b/docs/usage.md
@@ -83,7 +83,7 @@ GBM_2,alleles.txt,I,gbm_2_variants.vcf
 | `sample`    | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
 | `alleles`   | A string that consists of the patient's alleles (separated by ";"), or a full path to an allele ".txt" file where each allele is saved on a row.                                       |
 | `mhc_class` | Specifies the MHC class for which the prediction should be performed. Valid values are: `I`, `II` and `H-2` (mouse).                                                                   |
-| `filename`  | Full path to a variant/peptide or protein file (".vcf", ".vcf.gz", "tsv", "fasta", or "GSvar").                                                                                        |
+| `filename`  | Full path to a variant/peptide or protein file (".vcf", ".vcf.gz", "tsv" or "fasta").                                                                                                  |
 
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
 

diff --git a/modules/local/csvtk_split.nf b/modules/local/csvtk_split.nf
diff --git a/nextflow.config b/nextflow.config
@@ -204,8 +204,7 @@ profiles {
         executor.memory        = 8.GB
     }
     test { includeConfig 'conf/test.config' }
-    test_variant_tsv { includeConfig 'conf/test_variant_tsv.config' }
-    test_grch38_variant_tsv { includeConfig 'conf/test_grch38_variant_tsv.config' }
+    test_grch38 { includeConfig 'conf/test_grch38.config' }
     test_peptides { includeConfig 'conf/test_peptides.config' }
     test_peptides_h2 { includeConfig 'conf/test_peptides_h2.config' }
     test_proteins { includeConfig 'conf/test_proteins.config' }

diff --git a/workflows/epitopeprediction.nf b/workflows/epitopeprediction.nf
@@ -46,7 +46,6 @@ include { EPYTOPE_SHOW_SUPPORTED_MODELS }
 
 include { VARIANT_SPLIT}                                                            from '../modules/local/variant_split'
 include { SNPSIFT_SPLIT}                                                            from '../modules/local/snpsift_split'
-include { CSVTK_SPLIT}                                                              from '../modules/local/csvtk_split'
 
 include { EPYTOPE_GENERATE_PEPTIDES }                                               from '../modules/local/epytope_generate_peptides'
 include { SPLIT_PEPTIDES as SPLIT_PEPTIDES_PEPTIDES }                                                          from '../modules/local/split_peptides'
@@ -278,40 +277,22 @@ workflow EPITOPEPREDICTION {
     ========================================================================================
     */
 
-    // Make a division for the variant files and process them further accordingly
-    ch_samples_uncompressed
-        .variant
-        .branch {
-            meta_data, input_file ->
-                vcf : input_file.endsWith('.vcf') || input_file.endsWith('.vcf.gz')
-                    return [ meta_data, input_file ]
-                tab :  input_file.endsWith('.tsv') || input_file.endsWith('.GSvar')
-                    return [ meta_data, input_file ]
-        }
-        .set { ch_variants }
-
-    // decide between the split_by_variants and snpsift_split (by chromosome) function (only vcf and vcf.gz variant files)
+    // decide between the split_by_variants and snpsift_split (by chromosome) function
     if (params.split_by_variants) {
         VARIANT_SPLIT(
-            ch_variants.vcf
+            ch_samples_uncompressed.variant
         )
         .set { ch_split_variants }
         ch_versions = ch_versions.mix( VARIANT_SPLIT.out.versions )
 
     }
     else {
         SNPSIFT_SPLIT(
-            ch_variants.vcf
+            ch_samples_uncompressed.variant
         )
         .set { ch_split_variants }
         ch_versions = ch_versions.mix( SNPSIFT_SPLIT.out.versions )
     }
-    // include the csvtk_split function (only variant files with an tsv and GSvar executable)
-    CSVTK_SPLIT(
-        ch_variants.tab
-    )
-
-    ch_versions = ch_versions.mix( CSVTK_SPLIT.out.versions )
 
     // process FASTA file and generated peptides
     EPYTOPE_GENERATE_PEPTIDES(
@@ -364,10 +345,8 @@ workflow EPITOPEPREDICTION {
 
     // Run epitope prediction for variants
     EPYTOPE_PEPTIDE_PREDICTION_VAR(
-        CSVTK_SPLIT
-            .out
+        ch_split_variants
             .splitted
-            .mix( ch_split_variants.splitted )
             .combine( ch_prediction_tool_versions )
             .transpose(),
             EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([])