From 31b2ba56d935d99b15d3a8935565282f3f3aca64 Mon Sep 17 00:00:00 2001
From: Christopher Mohr <christopher.mohr@uni-tuebingen.de>
Date: Thu, 25 Aug 2022 13:48:59 +0200
Subject: [PATCH 1/6] add optional input for expression values and
 functionality to map values to results

---
 bin/check_samplesheet.py                    | 29 ++++++---
 bin/epaa.py                                 | 67 ++++++++++++---------
 modules/local/epytope_peptide_prediction.nf |  5 ++
 subworkflows/local/input_check.nf           |  6 +-
 workflows/epitopeprediction.nf              | 14 +++--
 5 files changed, 78 insertions(+), 43 deletions(-)

diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index 408ebfa..55de9f2 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -229,10 +229,17 @@ def check_samplesheet(file_in, file_out):
     GBM_1,gbm_1_alleles.txt,I,gbm_1_anno.vcf|gbm_1_peps.tsv|gbm_1_prot.fasta
     GBM_2,gbm_2_alleles.txt,I,gbm_2_anno.vcf|gbm_2_peps.tsv|gbm_2_prot.fasta
 
+    or with optional column(s)
+
+    sample,alleles,mhc_class,expression,filename
+    GBM_1,gbm_1_alleles.txt,I,expression_values_gbm1.tsv,gbm_1_anno.vcf|gbm_1_peps.tsv|gbm_1_prot.fasta
+    GBM_2,gbm_2_alleles.txt,I,expression_values_gbm2.tsv,gbm_2_anno.vcf|gbm_2_peps.tsv|gbm_2_prot.fasta
+
 
     where the FileName column contains EITHER a vcf/tsv file with genomic variants, a tsv file (peptides), or a fasta file (proteins)
     and the Alleles column contains EITHER a string of alleles separated by semicolon or the path to a text file
-    containing one allele per line (no header)
+    containing one allele per line (no header). The optional expression column contains the path to a tsv file with expression
+    values (on transcript or gene level) as generated by the nf-core/rnaseq pipeline.
 
     Further examples:
     - Class2 allele format => https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/alleles/alleles.DRB1_01_01.txt
@@ -246,12 +253,16 @@ def check_samplesheet(file_in, file_out):
     with open(file_in, "r") as fin:
 
         ## Check header
-        COL_NUM = 4
+        MIN_COL = 4 # expression optional
         HEADER = ["sample", "alleles", "mhc_class", "filename"]
         header = [x.strip('"') for x in fin.readline().strip().split(",")]
+        expression_available = "expression" in header
         valid_classes = "I,II,H-2"
         valid_class1_loci = ['A*','B*','C*','E*','G*']
         valid_class2_loci = ['DR','DP','DQ']
+
+        if expression_available:
+            HEADER.insert(3, "expression")
         if header[: len(HEADER)] != HEADER:
             print("ERROR: Please check samplesheet header -> {} != {}".format("\t".join(header), "\t".join(HEADER)))
             sys.exit(1)
@@ -267,15 +278,20 @@ def check_samplesheet(file_in, file_out):
                     line,
                 )
             num_cols = len([x for x in lspl if x])
-            if num_cols != COL_NUM:
+            if num_cols < MIN_COL:
                 print_error(
-                    "Invalid number of populated columns (valid = {})!".format(COL_NUM),
+                    "Invalid number of populated columns (valid >= {})!".format(MIN_COL),
                     "Line",
                     line,
                 )
 
             ## Check sample name entries
-            sample, alleles, mhcclass, filename = lspl[: len(HEADER)]
+            if expression_available:
+                sample, alleles, mhcclass, expression, filename = lspl[: len(HEADER)]
+                sample_info = [sample, alleles, mhcclass, expression, filename]
+            else:
+                sample, alleles, mhcclass, filename = lspl[: len(HEADER)]
+                sample_info = [sample, alleles, mhcclass, "", filename]
 
             ## Check given file types
             if not filename.lower().endswith((".vcf", ".vcf.gz", ".tsv", ".GSvar", ".fasta", ".txt")):
@@ -289,7 +305,6 @@ def check_samplesheet(file_in, file_out):
             if  not os.path.isfile(alleles) and mhcclass == 'I' and any(substring in alleles for substring in valid_class2_loci) or mhcclass == 'II' and any(substring in alleles for substring in valid_class1_loci):
                 print_error("Samplesheet contains invalid mhc class and allele combination!", "Line", line)
 
-            sample_info = [sample, alleles, mhcclass, filename]
             ## Create sample mapping dictionary
             if sample not in sample_run_dict:
                 sample_run_dict[sample] = [sample_info]
@@ -304,7 +319,7 @@ def check_samplesheet(file_in, file_out):
         out_dir = os.path.dirname(file_out)
         make_dir(out_dir)
         with open(file_out, "w") as fout:
-            fout.write(",".join(["sample", "alleles","mhc_class","filename"]) + "\n")
+            fout.write(",".join(["sample", "alleles","mhc_class", "expression", "filename"]) + "\n")
 
             for sample in sorted(sample_run_dict.keys()):
                 for val in sample_run_dict[sample]:
diff --git a/bin/epaa.py b/bin/epaa.py
index 92296b6..1d474e1 100755
--- a/bin/epaa.py
+++ b/bin/epaa.py
@@ -419,6 +419,9 @@ def read_protein_quant(filename):
                         intensities[p.split('|')[1]] = valuedict
     return intensities
 
+# parse rnaseq analysis results
+# data frame: gene/transcript -> count/TPM
+#def read_diff_expression_values(filename):
 
 # parse different expression analysis results (DESeq2), link log2fold changes to transcripts/genes
 def read_diff_expression_values(filename):
@@ -486,12 +489,6 @@ def create_mutationsyntax_genome_column_value(pep):
     return ','.join(set([y.cdsMutationSyntax for y in syntaxes]))
 
 
-def create_variationfilelinenumber_column_value(pep):
-    v = [x.vars.values() for x in pep.get_all_transcripts()]
-    vf = list(itertools.chain.from_iterable(v))
-    return ','.join([str(int(y.id.replace('line', ''))+1) for y in vf])
-
-
 def create_gene_column_value(pep):
     transcript_ids = [x.transcript_id for x in set(pep.get_all_transcripts())]
     variants = []
@@ -877,7 +874,7 @@ def make_predictions_from_variants(variants_all, methods, tool_thresholds, use_a
         df['length'] = df['sequence'].map(len)
         df['chr'] = df['sequence'].map(create_variant_chr_column_value)
         df['pos'] = df['sequence'].map(create_variant_pos_column_value)
-        df['gene'] = df['sequence'].map(create_gene_column_value)
+        df['gene_id'] = df['sequence'].map(create_gene_column_value)
         df['transcripts'] = df['sequence'].map(create_transcript_column_value)
         df['proteins'] = df['sequence'].map(create_protein_column_value)
         df['variant type'] = df['sequence'].map(
@@ -1049,8 +1046,8 @@ def __main__():
                         help="List of gene IDs for ID mapping.", required=False)
     parser.add_argument('-pq', "--protein_quantification",
                         help="File with protein quantification values")
-    parser.add_argument('-ge', "--gene_expression",
-                        help="File with expression analysis results")
+    parser.add_argument('-ge', "--expression",
+                        help="File with rnaseq analysis results", required=False)
     parser.add_argument('-de', "--diff_gene_expression",
                         help="File with differential expression analysis results (DESeq2)")
     parser.add_argument('-li', "--ligandomics_id",
@@ -1154,7 +1151,6 @@ def __main__():
     try:
         complete_df = pd.concat(pred_dataframes, sort=True)
         # replace method names with method names with version
-        # complete_df.replace({'method': methods}, inplace=True)
         complete_df['method'] = complete_df['method'].apply(
             lambda x: x.lower() + '-' + methods[x.lower()])
         predictions_available = True
@@ -1163,16 +1159,25 @@ def __main__():
         predictions_available = False
         logger.error("No predictions available.")
 
+    complete_df = complete_df.rename(columns={"gene": "gene_id"})  # normalise legacy 'gene' column label; no-op if absent
+
+    # get gene names from Ensembl and add them to the data frame
+    # we want to add gene names to our data frame in order to make the mapping easier
+    # we will use this when the next epytope release is ready where we already implemented the functionality
+    #mapping_gene_names_ids = ma.get_gene_name_from_id(complete_df['gene_id'].unique.to_list())
+    #mapping_gene_names_ids.columns = ["gene_name", "gene_id"]
+    #complete_df = complete_df.merge(mapping_gene_names_ids,on='gene_id',how="left")
+
     # include wild type sequences to dataframe if specified
     if args.wild_type:
         wt_sequences = generate_wt_seqs(all_peptides_filtered)
         complete_df['wt sequence'] = complete_df.apply(
             lambda row: create_wt_seq_column_value(row, wt_sequences), axis=1)
         columns_tiles = ['sequence', 'wt sequence', 'length', 'chr', 'pos',
-                         'gene', 'transcripts', 'proteins', 'variant type', 'method']
+                         'gene_id', 'transcripts', 'proteins', 'variant type', 'method']
     # Change the order (the index) of the columns
     else:
-        columns_tiles = ['sequence', 'length', 'chr', 'pos', 'gene',
+        columns_tiles = ['sequence', 'length', 'chr', 'pos', 'gene_id',
                          'transcripts', 'proteins', 'variant type', 'method']
     for c in complete_df.columns:
         if c not in columns_tiles:
@@ -1206,24 +1211,26 @@ def __main__():
         for k in first_entry.keys():
             complete_df['{} log2 protein LFQ intensity'.format(k)] = complete_df.apply(
                 lambda row: create_quant_column_value_for_result(row, protein_quant, transcriptSwissProtMap, k), axis=1)
-    # parse (differential) expression analysis results, annotate features (genes/transcripts)
-    if args.gene_expression is not None:
-        fold_changes = read_diff_expression_values(args.gene_expression)
-        gene_id_lengths = {}
-        col_name = 'RNA expression (RPKM)'
-
-        with open(args.gene_reference, 'r') as gene_list:
-            for l in gene_list:
-                ids = l.split('\t')
-                gene_id_in_df = complete_df.iloc[1]['gene']
-                if 'ENSG' in gene_id_in_df:
-                    gene_id_lengths[ids[0]] = float(ids[2].strip())
-                else:
-                    gene_id_lengths[ids[1]] = float(ids[2].strip())
-        deseq = False
-        # add column to result dataframe
-        complete_df[col_name] = complete_df.apply(lambda row: create_expression_column_value_for_result(
-            row, fold_changes, deseq, gene_id_lengths), axis=1)
+    # parse expression (nf-core/rnaseq) analysis results, annotate features (genes/transcripts)
+    if args.expression is not None:
+        rnaseq_results = pd.read_csv(args.expression, sep='\t', header=0)
+
+        measure = "count" if "count" in args.expression else "TPM"
+        transcript_features = "tx" in rnaseq_results.columns
+        #merge_on = "gene_name"
+        merge_on = "gene_id"
+
+        # we expect columns: tx gene_id samples
+        if transcript_features:
+            rnaseq_results.columns = ["{}{}".format(c, "" if c in ["tx", "gene_id"] else f"_{'transcript'}_{measure}") for c in rnaseq_results.columns]
+            merge_on = "tx"
+        # we expect columns: gene_id gene_name samples
+        else:
+            rnaseq_results.columns = ["{}{}".format(c, "" if c in ["gene_name", "gene_id"] else f"_{'gene'}_{measure}") for c in rnaseq_results.columns]
+
+        # add sample-specific expression values to data frame
+        complete_df = complete_df.merge(rnaseq_results,on=merge_on,how="left")
+
     if args.diff_gene_expression is not None:
         gene_id_lengths = {}
         fold_changes = read_diff_expression_values(args.diff_gene_expression)
diff --git a/modules/local/epytope_peptide_prediction.nf b/modules/local/epytope_peptide_prediction.nf
index b6a7eee..e68d582 100644
--- a/modules/local/epytope_peptide_prediction.nf
+++ b/modules/local/epytope_peptide_prediction.nf
@@ -9,6 +9,7 @@ process EPYTOPE_PEPTIDE_PREDICTION {
     input:
     tuple val(meta), path(splitted), path(software_versions)
     val netmhc_paths
+    path(expression)
 
     output:
     tuple val(meta), path("*.json"), emit: json
@@ -41,6 +42,10 @@ process EPYTOPE_PEPTIDE_PREDICTION {
         argument = "--use_affinity_thresholds " + argument
     }
 
+    if (expression) {
+        argument = "--expression ${expression} " + argument
+    }
+
     def netmhc_paths_string = netmhc_paths.join(",")
     def tools_split = params.tools.split(',')
     def class1_tools = tools_split.findAll { ! it.matches('.*(?i)(class-2|ii).*') }
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index 1913eea..ed7787e 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -30,12 +30,14 @@ def get_samplesheet_paths(LinkedHashMap row) {
     meta.alleles        = allele_string
     meta.mhcclass       = row.mhc_class
     meta.inputtype      = type
+    def expression = row.expression ? file(row.expression, checkIfExists: true) : [] // optional samplesheet column
 
     def array = []
     if (!file(row.filename).exists()) {
         exit 1, "ERROR: Please check input samplesheet -> file does not exist!\n${row.Filename}"
-    } else {
-        array = [ meta, file(row.filename) ]
+    }
+    else {
+        array = [meta, expression, file(row.filename)]
     }
     return array
 }
diff --git a/workflows/epitopeprediction.nf b/workflows/epitopeprediction.nf
index 64c59aa..f35e21f 100644
--- a/workflows/epitopeprediction.nf
+++ b/workflows/epitopeprediction.nf
@@ -112,7 +112,7 @@ workflow EPITOPEPREDICTION {
 
     INPUT_CHECK.out.reads
                 .branch {
-                    meta_data, input_file ->
+                    meta_data, expression, input_file ->
                         variant_compressed : meta_data.inputtype == 'variant_compressed'
                             return [ meta_data, input_file ]
                         variant_uncompressed :  meta_data.inputtype == 'variant'
@@ -124,6 +124,9 @@ workflow EPITOPEPREDICTION {
                     }
                 .set { ch_samples_from_sheet }
 
+    ch_expression = INPUT_CHECK.out.reads
+                        .map { meta_data, expression, input_file -> expression }
+
     // gunzip variant files
     GUNZIP_VCF (
         ch_samples_from_sheet.variant_compressed
@@ -342,7 +345,8 @@ workflow EPITOPEPREDICTION {
             .splitted
             .combine( ch_prediction_tool_versions )
             .transpose(),
-            EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([])
+            EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]),
+            ch_expression
     )
 
     // Run epitope prediction for peptides
@@ -352,7 +356,8 @@ workflow EPITOPEPREDICTION {
             .splitted
             .combine( ch_prediction_tool_versions )
             .transpose(),
-            EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([])
+            EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]),
+            ch_expression
     )
 
     // Run epitope prediction for variants
@@ -363,7 +368,8 @@ workflow EPITOPEPREDICTION {
             .mix( ch_split_variants.splitted )
             .combine( ch_prediction_tool_versions )
             .transpose(),
-            EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([])
+            EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]),
+            ch_expression
     )
 
     // collect prediction script versions

From 6cc654d196a6d22e1adbbdb1f230e630e501e889 Mon Sep 17 00:00:00 2001
From: Christopher Mohr <christopher.mohr@uni-tuebingen.de>
Date: Thu, 25 Aug 2022 13:50:52 +0200
Subject: [PATCH 2/6] add optional input to input schema

---
 assets/schema_input.json | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/assets/schema_input.json b/assets/schema_input.json
index b564a5c..e4654fc 100644
--- a/assets/schema_input.json
+++ b/assets/schema_input.json
@@ -30,6 +30,11 @@
                 "pattern": "^(I|II|H-2)$",
                 "errorMessage": "The MHC class must be provided. Valid values: "
             },
+            "expression": {
+                "type": "string",
+                "pattern": "^\\S+\\.(tsv)$",
+                "errorMessage": "Expression values must be provided as a file without spaces in its path and with the extension '.tsv'"
+            },
             "filename": {
                 "type": "string",
                 "pattern": "^\\S+\\.(vcf|tsv|fasta|fa|txt)$",

From 67cb5494ddaf09699ce522f35c4c79854772da63 Mon Sep 17 00:00:00 2001
From: Christopher Mohr <christopher.mohr@uni-tuebingen.de>
Date: Thu, 25 Aug 2022 16:38:24 +0200
Subject: [PATCH 3/6] update ensembl archive for grch38

---
 bin/epaa.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/epaa.py b/bin/epaa.py
index 1d474e1..32fd5d0 100755
--- a/bin/epaa.py
+++ b/bin/epaa.py
@@ -1068,7 +1068,7 @@ def __main__():
     metadata = []
     proteins = []
     references = {'GRCh37': 'http://feb2014.archive.ensembl.org',
-                  'GRCh38': 'http://mar2017.archive.ensembl.org'}
+                  'GRCh38': 'http://aug2017.archive.ensembl.org'}
     global transcriptProteinMap
     global transcriptSwissProtMap
 

From a2af74e85d3659a37862885ba699fad4ffa3e873 Mon Sep 17 00:00:00 2001
From: Christopher Mohr <christopher.mohr@uni-tuebingen.de>
Date: Thu, 25 Aug 2022 16:43:23 +0200
Subject: [PATCH 4/6] allow genome_version in lower case

---
 nextflow_schema.json | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/nextflow_schema.json b/nextflow_schema.json
index 10de656..990098c 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -10,7 +10,10 @@
             "type": "object",
             "fa_icon": "fas fa-terminal",
             "description": "Define where the pipeline should find input data and save output data.",
-            "required": ["input", "outdir"],
+            "required": [
+                "input",
+                "outdir"
+            ],
             "properties": {
                 "input": {
                     "type": "string",
@@ -52,7 +55,12 @@
                     "type": "string",
                     "default": "GRCh37",
                     "help_text": "This defines against which human reference genome the pipeline performs the analysis including the incorporation of genetic variants e.g..",
-                    "enum": ["GRCh37", "GRCh38"],
+                    "enum": [
+                        "GRCh37",
+                        "GRCh38",
+                        "grch37",
+                        "grch38"
+                    ],
                     "description": "Specifies  the human reference genome version."
                 },
                 "proteome": {
@@ -319,7 +327,14 @@
                     "description": "Method used to save pipeline results to output directory.",
                     "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.",
                     "fa_icon": "fas fa-copy",
-                    "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"],
+                    "enum": [
+                        "symlink",
+                        "rellink",
+                        "link",
+                        "copy",
+                        "copyNoFollow",
+                        "move"
+                    ],
                     "hidden": true
                 },
                 "email_on_fail": {

From 05eb2c91e0aa94eacb2ce428ee151fcecba65513 Mon Sep 17 00:00:00 2001
From: Christopher Mohr <christopher.mohr@uni-tuebingen.de>
Date: Thu, 25 Aug 2022 17:02:28 +0200
Subject: [PATCH 5/6] fix format

---
 nextflow_schema.json | 21 +++------------------
 1 file changed, 3 insertions(+), 18 deletions(-)

diff --git a/nextflow_schema.json b/nextflow_schema.json
index 990098c..610af22 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -10,10 +10,7 @@
             "type": "object",
             "fa_icon": "fas fa-terminal",
             "description": "Define where the pipeline should find input data and save output data.",
-            "required": [
-                "input",
-                "outdir"
-            ],
+            "required": ["input", "outdir"],
             "properties": {
                 "input": {
                     "type": "string",
@@ -55,12 +52,7 @@
                     "type": "string",
                     "default": "GRCh37",
                     "help_text": "This defines against which human reference genome the pipeline performs the analysis including the incorporation of genetic variants e.g..",
-                    "enum": [
-                        "GRCh37",
-                        "GRCh38",
-                        "grch37",
-                        "grch38"
-                    ],
+                    "enum": ["GRCh37", "GRCh38", "grch37", "grch38"],
                     "description": "Specifies  the human reference genome version."
                 },
                 "proteome": {
@@ -327,14 +319,7 @@
                     "description": "Method used to save pipeline results to output directory.",
                     "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.",
                     "fa_icon": "fas fa-copy",
-                    "enum": [
-                        "symlink",
-                        "rellink",
-                        "link",
-                        "copy",
-                        "copyNoFollow",
-                        "move"
-                    ],
+                    "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"],
                     "hidden": true
                 },
                 "email_on_fail": {

From 069a1450fd108e41940bd7c2f6f48532261ec92d Mon Sep 17 00:00:00 2001
From: Christopher Mohr <christopher.mohr@uni-tuebingen.de>
Date: Wed, 7 Sep 2022 11:18:50 +0200
Subject: [PATCH 6/6] run black formatting

---
 bin/check_samplesheet.py |  8 ++++----
 bin/epaa.py              | 28 +++++++++++++++++-----------
 2 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index 571bde8..cb76cf2 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -253,13 +253,13 @@ def check_samplesheet(file_in, file_out):
     with open(file_in, "r") as fin:
 
         ## Check header
-        MIN_COL = 4 # expression optional
+        MIN_COL = 4  # expression optional
         HEADER = ["sample", "alleles", "mhc_class", "filename"]
         header = [x.strip('"') for x in fin.readline().strip().split(",")]
         expression_available = "expression" in header
         valid_classes = "I,II,H-2"
-        valid_class1_loci = ['A*','B*','C*','E*','G*']
-        valid_class2_loci = ['DR','DP','DQ']
+        valid_class1_loci = ["A*", "B*", "C*", "E*", "G*"]
+        valid_class2_loci = ["DR", "DP", "DQ"]
 
         if expression_available:
             HEADER.insert(3, "expression")
@@ -326,7 +326,7 @@ def check_samplesheet(file_in, file_out):
         out_dir = os.path.dirname(file_out)
         make_dir(out_dir)
         with open(file_out, "w") as fout:
-            fout.write(",".join(["sample", "alleles","mhc_class", "expression", "filename"]) + "\n")
+            fout.write(",".join(["sample", "alleles", "mhc_class", "expression", "filename"]) + "\n")
             for sample in sorted(sample_run_dict.keys()):
                 for val in sample_run_dict[sample]:
                     fout.write(",".join(val) + "\n")
diff --git a/bin/epaa.py b/bin/epaa.py
index 30b03c7..8a8f6f6 100755
--- a/bin/epaa.py
+++ b/bin/epaa.py
@@ -466,9 +466,10 @@ def read_protein_quant(filename):
                         intensities[p.split("|")[1]] = valuedict
     return intensities
 
+
 # parse rnaseq analysis results
 # data frame: gene/transcript -> count/TPM
-#def read_diff_expression_values(filename):
+# def read_diff_expression_values(filename):
 
 # parse different expression analysis results (DESeq2), link log2fold changes to transcripts/genes
 def read_diff_expression_values(filename):
@@ -1212,8 +1213,7 @@ def __main__():
 
     metadata = []
     proteins = []
-    references = {'GRCh37': 'http://feb2014.archive.ensembl.org',
-                  'GRCh38': 'http://aug2017.archive.ensembl.org'}
+    references = {"GRCh37": "http://feb2014.archive.ensembl.org", "GRCh38": "http://aug2017.archive.ensembl.org"}
 
     global transcriptProteinMap
     global transcriptSwissProtMap
@@ -1330,9 +1330,9 @@ def __main__():
     # get gene names from Ensembl and add them to the data frame
     # we want to add gene names to our data frame in order to make the mapping easier
     # we will use this when the next epytope release is ready where we already implemented the functionality
-    #mapping_gene_names_ids = ma.get_gene_name_from_id(complete_df['gene_id'].unique.to_list())
-    #mapping_gene_names_ids.columns = ["gene_name", "gene_id"]
-    #complete_df = complete_df.merge(mapping_gene_names_ids,on='gene_id',how="left")
+    # mapping_gene_names_ids = ma.get_gene_name_from_id(complete_df['gene_id'].unique.to_list())
+    # mapping_gene_names_ids.columns = ["gene_name", "gene_id"]
+    # complete_df = complete_df.merge(mapping_gene_names_ids,on='gene_id',how="left")
 
     # include wild type sequences to dataframe if specified
     if args.wild_type:
@@ -1400,23 +1400,29 @@ def __main__():
             )
     # parse expression (nf-core/rnaseq) analysis results, annotate features (genes/transcripts)
     if args.expression is not None:
-        rnaseq_results = pd.read_csv(args.expression, sep='\t', header=0)
+        rnaseq_results = pd.read_csv(args.expression, sep="\t", header=0)
 
         measure = "count" if "count" in args.expression else "TPM"
         transcript_features = "tx" in rnaseq_results.columns
-        #merge_on = "gene_name"
+        # merge_on = "gene_name"
         merge_on = "gene_id"
 
         # we expect columns: tx gene_id samples
         if transcript_features:
-            rnaseq_results.columns = ["{}{}".format(c, "" if c in ["tx", "gene_id"] else f"_{'transcript'}_{measure}") for c in rnaseq_results.columns]
+            rnaseq_results.columns = [
+                "{}{}".format(c, "" if c in ["tx", "gene_id"] else f"_{'transcript'}_{measure}")
+                for c in rnaseq_results.columns
+            ]
             merge_on = "tx"
         # we expect columns: gene_id gene_name samples
         else:
-            rnaseq_results.columns = ["{}{}".format(c, "" if c in ["gene_name", "gene_id"] else f"_{'gene'}_{measure}") for c in rnaseq_results.columns]
+            rnaseq_results.columns = [
+                "{}{}".format(c, "" if c in ["gene_name", "gene_id"] else f"_{'gene'}_{measure}")
+                for c in rnaseq_results.columns
+            ]
 
         # add sample-specific expression values to data frame
-        complete_df = complete_df.merge(rnaseq_results,on=merge_on,how="left")
+        complete_df = complete_df.merge(rnaseq_results, on=merge_on, how="left")
 
     if args.diff_gene_expression is not None:
         gene_id_lengths = {}