nf-core · christopher-mohr · Aug 25, 2022 · Aug 25, 2022 · Aug 25, 2022 · Aug 25, 2022
diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -30,6 +30,11 @@
                 "pattern": "^(I|II|H-2)$",
                 "errorMessage": "The MHC class must be provided. Valid values: "
             },
+            "rnaseq": {
+                "type": "string",
+                "pattern": "^\\S+\\.(tsv)$",
+                "errorMessage": "RNAseq analysis results must have one of the following extensions:  ''.tsv''"
+            },
             "filename": {
                 "type": "string",
                 "pattern": "^\\S+\\.(vcf|tsv|fasta|fa|txt)$",

diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
@@ -229,10 +229,17 @@ def check_samplesheet(file_in, file_out):
     GBM_1,gbm_1_alleles.txt,I,gbm_1_anno.vcf|gbm_1_peps.tsv|gbm_1_prot.fasta
     GBM_2,gbm_2_alleles.txt,I,gbm_2_anno.vcf|gbm_2_peps.tsv|gbm_2_prot.fasta
 
+    or with optional column(s)
+
+    sample,alleles,mhc_class,expression,filename
+    GBM_1,gbm_1_alleles.txt,I,expression_values_gbm1.tsv,gbm_1_anno.vcf|gbm_1_peps.tsv|gbm_1_prot.fasta
+    GBM_2,gbm_2_alleles.txt,I,expression_values_gbm2.tsv,gbm_2_anno.vcf|gbm_2_peps.tsv|gbm_2_prot.fasta
+
 
     where the FileName column contains EITHER a vcf/tsv file with genomic variants, a tsv file (peptides), or a fasta file (proteins)
     and the Alleles column contains EITHER a string of alleles separated by semicolon or the path to a text file
-    containing one allele per line (no header)
+    containing one allele per line (no header). The optional expression column contains a tsv file with expression values (on transcript or
+    gene level) as generated by the rnaseq pipeline.
 
     Further examples:
     - Class2 allele format => https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/alleles/alleles.DRB1_01_01.txt
@@ -251,7 +258,14 @@ def check_samplesheet(file_in, file_out):
         ## Check header
         valid_header = ["sample", "alleles", "mhc_class", "filename"]
         header = [x.strip('"') for x in samplesheet.readline().strip().split(",")]
-        if len(header) != 4:
+        header_length = 4 # expression optional
+
+        expression_available = "expression" in header
+        if expression_available:
+            valid_header.insert(3, "expression")
+            header += 1
+
+        if len(header) != header_length:
             raise ValueError(
                 f"Invalid number of header columns! Make sure the samplesheet is properly comma-separated."
             )
@@ -279,7 +293,6 @@ def check_samplesheet(file_in, file_out):
             for row in rows:
                 fout.write(",".join(row) + "\n")
 
-
 def main(argv=None):
     """Coordinate argument parsing and program execution."""
     args = parse_args(argv)

diff --git a/bin/epaa.py b/bin/epaa.py
@@ -567,6 +567,10 @@ def read_protein_quant(filename):
     return intensities
 
 
+# parse rnaseq analysis results
+# data frame: gene/transcript -> count/TPM
+# def read_diff_expression_values(filename):
+
 # parse different expression analysis results (DESeq2), link log2fold changes to transcripts/genes
 def read_diff_expression_values(filename):
     # feature id: log2fold changes
@@ -625,15 +629,12 @@ def create_mutationsyntax_genome_column_value(pep, pep_dictionary):
             syntaxes.append(variant.coding[coding])
     return ",".join(set([mutationSyntax.cdsMutationSyntax for mutationSyntax in syntaxes]))
 
-
 def create_gene_column_value(pep, pep_dictionary):
     return ",".join(set([variant.gene for variant in set(pep_dictionary[pep])]))
 
-
 def create_variant_pos_column_value(pep, pep_dictionary):
     return ",".join(set(["{}".format(variant.genomePos) for variant in set(pep_dictionary[pep])]))
 
-
 def create_variant_chr_column_value(pep, pep_dictionary):
     return ",".join(set(["{}".format(variant.chrom) for variant in set(pep_dictionary[pep])]))
 
@@ -1184,7 +1185,7 @@ def __main__():
     parser.add_argument("-rp", "--reference_proteome", help="Reference proteome for self-filtering", required=False)
     parser.add_argument("-gr", "--gene_reference", help="List of gene IDs for ID mapping.", required=False)
     parser.add_argument("-pq", "--protein_quantification", help="File with protein quantification values")
-    parser.add_argument("-ge", "--gene_expression", help="File with expression analysis results")
+    parser.add_argument("-ge", "--expression", help="File with expression analysis results")
     parser.add_argument(
         "-de", "--diff_gene_expression", help="File with differential expression analysis results (DESeq2)"
     )
@@ -1328,6 +1329,15 @@ def __main__():
         predictions_available = False
         logger.error("No predictions available.")
 
+    complete_df.replace("gene", "gene_id")
+
+    # get gene names from Ensembl and add them to the data frame
+    # we want to add gene names to our data frame in order to make the mapping easier
+    # we will use this when the next epytope release is ready where we already implemented the functionality
+    # mapping_gene_names_ids = ma.get_gene_name_from_id(complete_df['gene_id'].unique.to_list())
+    # mapping_gene_names_ids.columns = ["gene_name", "gene_id"]
+    # complete_df = complete_df.merge(mapping_gene_names_ids,on='gene_id',how="left")
+
     # include wild type sequences to dataframe if specified
     if args.wild_type:
         if args.peptides:
@@ -1396,25 +1406,32 @@ def __main__():
             complete_df["{} log2 protein LFQ intensity".format(k)] = complete_df.apply(
                 lambda row: create_quant_column_value_for_result(row, protein_quant, transcriptSwissProtMap, k), axis=1
             )
-    # parse (differential) expression analysis results, annotate features (genes/transcripts)
-    if args.gene_expression is not None:
-        fold_changes = read_diff_expression_values(args.gene_expression)
-        gene_id_lengths = {}
-        col_name = "RNA expression (RPKM)"
-
-        with open(args.gene_reference, "r") as gene_list:
-            for l in gene_list:
-                ids = l.split("\t")
-                gene_id_in_df = complete_df.iloc[1]["gene"]
-                if "ENSG" in gene_id_in_df:
-                    gene_id_lengths[ids[0]] = float(ids[2].strip())
-                else:
-                    gene_id_lengths[ids[1]] = float(ids[2].strip())
-        deseq = False
-        # add column to result dataframe
-        complete_df[col_name] = complete_df.apply(
-            lambda row: create_expression_column_value_for_result(row, fold_changes, deseq, gene_id_lengths), axis=1
-        )
+    # parse expression (nf-core/rnaseq) analysis results, annotate features (genes/transcripts)
+    if args.expression is not None:
+        rnaseq_results = pd.read_csv(args.expression, sep="\t", header=0)
+
+        measure = "count" if "count" in args.expression else "TPM"
+        transcript_features = "tx" in rnaseq_results.columns
+        # merge_on = "gene_name"
+        merge_on = "gene_id"
+
+        # we expect columns: tx gene_id samples
+        if transcript_features:
+            rnaseq_results.columns = [
+                "{}{}".format(c, "" if c in ["tx", "gene_id"] else f"_{'transcript'}_{measure}")
+                for c in rnaseq_results.columns
+            ]
+            merge_on = "tx"
+        # we expect columns: gene_id gene_name samples
+        else:
+            rnaseq_results.columns = [
+                "{}{}".format(c, "" if c in ["gene_name", "gene_id"] else f"_{'gene'}_{measure}")
+                for c in rnaseq_results.columns
+            ]
+
+        # add sample-specific expression values to data frame
+        complete_df = complete_df.merge(rnaseq_results, on=merge_on, how="left")
+
     if args.diff_gene_expression is not None:
         gene_id_lengths = {}
         fold_changes = read_diff_expression_values(args.diff_gene_expression)

diff --git a/modules/local/epytope_peptide_prediction.nf b/modules/local/epytope_peptide_prediction.nf
@@ -7,6 +7,7 @@ process EPYTOPE_PEPTIDE_PREDICTION {
     input:
     tuple val(meta), path(splitted), path(software_versions)
     val netmhc_paths
+    path(expression)
 
     output:
     tuple val(meta), path("*.json"), emit: json
@@ -42,6 +43,10 @@ process EPYTOPE_PEPTIDE_PREDICTION {
         argument = "--use_affinity_thresholds " + argument
     }
 
+    if (expression) {
+        argument = "--expression ${expression} " + argument
+    }
+
     def netmhc_paths_string = netmhc_paths.join(",")
     def tools_split = params.tools.split(',')
     def class1_tools = tools_split.findAll { ! it.matches('.*(?i)(class-2|ii).*') }

diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
@@ -31,12 +31,14 @@ def get_samplesheet_paths(LinkedHashMap row) {
     meta.alleles        = row.alleles
     meta.mhcclass       = row.mhc_class
     meta.inputtype      = row.inputtype
+    expression = row.expression ? file(row.expression, checkIfExists: true) : []
 
     def array = []
     if (!file(row.filename).exists()) {
         exit 1, "ERROR: Please check input samplesheet -> file does not exist!\n${row.Filename}"
-    } else {
-        array = [ meta, file(row.filename) ]
+    }
+    else {
+        array = [meta, file(row.filename)]
     }
     return array
 }

diff --git a/workflows/epitopeprediction.nf b/workflows/epitopeprediction.nf
@@ -115,7 +115,7 @@ workflow EPITOPEPREDICTION {
 
     INPUT_CHECK.out.meta
                 .branch {
-                    meta_data, input_file ->
+                    meta_data, expression, input_file ->
                         variant_compressed : meta_data.inputtype == 'variant_compressed'
                             return [ meta_data, input_file ]
                         variant_uncompressed :  meta_data.inputtype == 'variant'
@@ -127,6 +127,9 @@ workflow EPITOPEPREDICTION {
                     }
                 .set { ch_samples_from_sheet }
 
+    ch_expression = INPUT_CHECK.out.reads
+                        .map { meta_data, expression, input_file -> expression }
+
     // gunzip variant files
     GUNZIP_VCF (
         ch_samples_from_sheet.variant_compressed
@@ -348,7 +351,8 @@ workflow EPITOPEPREDICTION {
             .splitted
             .combine( ch_prediction_tool_versions )
             .transpose(),
-            EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([])
+            EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]),
+            ch_expression
     )
     ch_versions = ch_versions.mix( EPYTOPE_PEPTIDE_PREDICTION_PROTEIN.out.versions.ifEmpty(null) )
 
@@ -360,7 +364,8 @@ workflow EPITOPEPREDICTION {
             .splitted
             .combine( ch_prediction_tool_versions )
             .transpose(),
-            EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([])
+            EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]),
+            ch_expression
     )
     ch_versions = ch_versions.mix( EPYTOPE_PEPTIDE_PREDICTION_PEP.out.versions.ifEmpty(null) )
 
@@ -373,7 +378,8 @@ workflow EPITOPEPREDICTION {
             .mix( ch_split_variants.splitted )
             .combine( ch_prediction_tool_versions )
             .transpose(),
-            EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([])
+            EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]),
+            ch_expression
     )
     ch_versions = ch_versions.mix( EPYTOPE_PEPTIDE_PREDICTION_VAR.out.versions.ifEmpty(null) )