diff --git a/assets/schema_input.json b/assets/schema_input.json index b564a5c..e4654fc 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -30,6 +30,11 @@ "pattern": "^(I|II|H-2)$", "errorMessage": "The MHC class must be provided. Valid values: " }, + "rnaseq": { + "type": "string", + "pattern": "^\\S+\\.(tsv)$", + "errorMessage": "RNAseq analysis results must have one of the following extensions: ''.tsv''" + }, "filename": { "type": "string", "pattern": "^\\S+\\.(vcf|tsv|fasta|fa|txt)$", diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 97f66c6..0dd3368 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -229,10 +229,17 @@ def check_samplesheet(file_in, file_out): GBM_1,gbm_1_alleles.txt,I,gbm_1_anno.vcf|gbm_1_peps.tsv|gbm_1_prot.fasta GBM_2,gbm_2_alleles.txt,I,gbm_2_anno.vcf|gbm_2_peps.tsv|gbm_2_prot.fasta + or with optional column(s) + + sample,alleles,mhc_class,expression,filename + GBM_1,gbm_1_alleles.txt,I,expression_values_gbm1.tsv,gbm_1_anno.vcf|gbm_1_peps.tsv|gbm_1_prot.fasta + GBM_2,gbm_2_alleles.txt,I,expression_values_gbm2.tsv,gbm_2_anno.vcf|gbm_2_peps.tsv|gbm_2_prot.fasta + where the FileName column contains EITHER a vcf/tsv file with genomic variants, a tsv file (peptides), or a fasta file (proteins) and the Alleles column contains EITHER a string of alleles separated by semicolon or the path to a text file - containing one allele per line (no header) + containing one allele per line (no header). The optional expression column contains a tsv file with expression values (on transcript or + gene level) as generated by the rnaseq pipeline. Further examples: - Class2 allele format => https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/alleles/alleles.DRB1_01_01.txt @@ -251,7 +258,14 @@ def check_samplesheet(file_in, file_out): ## Check header valid_header = ["sample", "alleles", "mhc_class", "filename"] header = [x.strip('"') for x in samplesheet.readline().strip().split(",")] - if len(header) != 4: + header_length = 4 # expression optional + + expression_available = "expression" in header + if expression_available: + valid_header.insert(3, "expression") + header += 1 + + if len(header) != header_length: raise ValueError( f"Invalid number of header columns! Make sure the samplesheet is properly comma-separated." ) @@ -279,7 +293,6 @@ def check_samplesheet(file_in, file_out): for row in rows: fout.write(",".join(row) + "\n") - def main(argv=None): """Coordinate argument parsing and program execution.""" args = parse_args(argv) diff --git a/bin/epaa.py b/bin/epaa.py index a538a61..864fc41 100755 --- a/bin/epaa.py +++ b/bin/epaa.py @@ -567,6 +567,10 @@ def read_protein_quant(filename): return intensities +# parse rnaseq analysis results +# data frame: gene/transcript -> count/TPM +# def read_diff_expression_values(filename): + # parse different expression analysis results (DESeq2), link log2fold changes to transcripts/genes def read_diff_expression_values(filename): # feature id: log2fold changes @@ -625,15 +629,12 @@ def create_mutationsyntax_genome_column_value(pep, pep_dictionary): syntaxes.append(variant.coding[coding]) return ",".join(set([mutationSyntax.cdsMutationSyntax for mutationSyntax in syntaxes])) - def create_gene_column_value(pep, pep_dictionary): return ",".join(set([variant.gene for variant in set(pep_dictionary[pep])])) - def create_variant_pos_column_value(pep, pep_dictionary): return ",".join(set(["{}".format(variant.genomePos) for variant in set(pep_dictionary[pep])])) - def create_variant_chr_column_value(pep, pep_dictionary): return ",".join(set(["{}".format(variant.chrom) for variant in set(pep_dictionary[pep])])) @@ -1184,7 +1185,7 @@ def __main__(): parser.add_argument("-rp", "--reference_proteome", help="Reference proteome for self-filtering", required=False) parser.add_argument("-gr", "--gene_reference", help="List of gene IDs for ID mapping.", required=False) parser.add_argument("-pq", "--protein_quantification", help="File with protein quantification values") - parser.add_argument("-ge", "--gene_expression", help="File with expression analysis results") + parser.add_argument("-ge", "--expression", help="File with expression analysis results") parser.add_argument( "-de", "--diff_gene_expression", help="File with differential expression analysis results (DESeq2)" ) @@ -1328,6 +1329,15 @@ def __main__(): predictions_available = False logger.error("No predictions available.") + complete_df.replace("gene", "gene_id") + + # get gene names from Ensembl and add them to the data frame + # we want to add gene names to our data frame in order to make the mapping easier + # we will use this when the next epytope release is ready where we already implemented the functionality + # mapping_gene_names_ids = ma.get_gene_name_from_id(complete_df['gene_id'].unique.to_list()) + # mapping_gene_names_ids.columns = ["gene_name", "gene_id"] + # complete_df = complete_df.merge(mapping_gene_names_ids,on='gene_id',how="left") + # include wild type sequences to dataframe if specified if args.wild_type: if args.peptides: @@ -1396,25 +1406,32 @@ def __main__(): complete_df["{} log2 protein LFQ intensity".format(k)] = complete_df.apply( lambda row: create_quant_column_value_for_result(row, protein_quant, transcriptSwissProtMap, k), axis=1 ) - # parse (differential) expression analysis results, annotate features (genes/transcripts) - if args.gene_expression is not None: - fold_changes = read_diff_expression_values(args.gene_expression) - gene_id_lengths = {} - col_name = "RNA expression (RPKM)" - - with open(args.gene_reference, "r") as gene_list: - for l in gene_list: - ids = l.split("\t") - gene_id_in_df = complete_df.iloc[1]["gene"] - if "ENSG" in gene_id_in_df: - gene_id_lengths[ids[0]] = float(ids[2].strip()) - else: - gene_id_lengths[ids[1]] = float(ids[2].strip()) - deseq = False - # add column to result dataframe - complete_df[col_name] = complete_df.apply( - lambda row: create_expression_column_value_for_result(row, fold_changes, deseq, gene_id_lengths), axis=1 - ) + # parse expression (nf-core/rnaseq) analysis results, annotate features (genes/transcripts) + if args.expression is not None: + rnaseq_results = pd.read_csv(args.expression, sep="\t", header=0) + + measure = "count" if "count" in args.expression else "TPM" + transcript_features = "tx" in rnaseq_results.columns + # merge_on = "gene_name" + merge_on = "gene_id" + + # we expect columns: tx gene_id samples + if transcript_features: + rnaseq_results.columns = [ + "{}{}".format(c, "" if c in ["tx", "gene_id"] else f"_{'transcript'}_{measure}") + for c in rnaseq_results.columns + ] + merge_on = "tx" + # we expect columns: gene_id gene_name samples + else: + rnaseq_results.columns = [ + "{}{}".format(c, "" if c in ["gene_name", "gene_id"] else f"_{'gene'}_{measure}") + for c in rnaseq_results.columns + ] + + # add sample-specific expression values to data frame + complete_df = complete_df.merge(rnaseq_results, on=merge_on, how="left") + if args.diff_gene_expression is not None: gene_id_lengths = {} fold_changes = read_diff_expression_values(args.diff_gene_expression) diff --git a/modules/local/epytope_peptide_prediction.nf b/modules/local/epytope_peptide_prediction.nf index 8b8ca73..f13a01a 100644 --- a/modules/local/epytope_peptide_prediction.nf +++ b/modules/local/epytope_peptide_prediction.nf @@ -7,6 +7,7 @@ process EPYTOPE_PEPTIDE_PREDICTION { input: tuple val(meta), path(splitted), path(software_versions) val netmhc_paths + path(expression) output: tuple val(meta), path("*.json"), emit: json @@ -42,6 +43,10 @@ process EPYTOPE_PEPTIDE_PREDICTION { argument = "--use_affinity_thresholds " + argument } + if (expression) { + argument = "--expression ${expression} " + argument + } + def netmhc_paths_string = netmhc_paths.join(",") def tools_split = params.tools.split(',') def class1_tools = tools_split.findAll { ! it.matches('.*(?i)(class-2|ii).*') } diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index af9f467..77edab9 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -31,12 +31,14 @@ def get_samplesheet_paths(LinkedHashMap row) { meta.alleles = row.alleles meta.mhcclass = row.mhc_class meta.inputtype = row.inputtype + expression = row.expression ? file(row.expression, checkIfExists: true) : [] def array = [] if (!file(row.filename).exists()) { exit 1, "ERROR: Please check input samplesheet -> file does not exist!\n${row.Filename}" - } else { - array = [ meta, file(row.filename) ] + } + else { + array = [meta, file(row.filename)] } return array } diff --git a/workflows/epitopeprediction.nf b/workflows/epitopeprediction.nf index 849ad28..f76f5f6 100644 --- a/workflows/epitopeprediction.nf +++ b/workflows/epitopeprediction.nf @@ -115,7 +115,7 @@ workflow EPITOPEPREDICTION { INPUT_CHECK.out.meta .branch { - meta_data, input_file -> + meta_data, expression, input_file -> variant_compressed : meta_data.inputtype == 'variant_compressed' return [ meta_data, input_file ] variant_uncompressed : meta_data.inputtype == 'variant' @@ -127,6 +127,9 @@ workflow EPITOPEPREDICTION { } .set { ch_samples_from_sheet } + ch_expression = INPUT_CHECK.out.reads + .map { meta_data, expression, input_file -> expression } + // gunzip variant files GUNZIP_VCF ( ch_samples_from_sheet.variant_compressed @@ -348,7 +351,8 @@ workflow EPITOPEPREDICTION { .splitted .combine( ch_prediction_tool_versions ) .transpose(), - EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]) + EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]), + ch_expression ) ch_versions = ch_versions.mix( EPYTOPE_PEPTIDE_PREDICTION_PROTEIN.out.versions.ifEmpty(null) ) @@ -360,7 +364,8 @@ workflow EPITOPEPREDICTION { .splitted .combine( ch_prediction_tool_versions ) .transpose(), - EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]) + EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]), + ch_expression ) ch_versions = ch_versions.mix( EPYTOPE_PEPTIDE_PREDICTION_PEP.out.versions.ifEmpty(null) ) @@ -373,7 +378,8 @@ workflow EPITOPEPREDICTION { .mix( ch_split_variants.splitted ) .combine( ch_prediction_tool_versions ) .transpose(), - EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]) + EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]), + ch_expression ) ch_versions = ch_versions.mix( EPYTOPE_PEPTIDE_PREDICTION_VAR.out.versions.ifEmpty(null) )