From 31b2ba56d935d99b15d3a8935565282f3f3aca64 Mon Sep 17 00:00:00 2001 From: Christopher Mohr Date: Thu, 25 Aug 2022 13:48:59 +0200 Subject: [PATCH 1/6] add optional input for expression values and functionality to map values to results --- bin/check_samplesheet.py | 29 ++++++--- bin/epaa.py | 67 ++++++++++++--------- modules/local/epytope_peptide_prediction.nf | 5 ++ subworkflows/local/input_check.nf | 6 +- workflows/epitopeprediction.nf | 14 +++-- 5 files changed, 78 insertions(+), 43 deletions(-) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 408ebfa..55de9f2 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -229,10 +229,17 @@ def check_samplesheet(file_in, file_out): GBM_1,gbm_1_alleles.txt,I,gbm_1_anno.vcf|gbm_1_peps.tsv|gbm_1_prot.fasta GBM_2,gbm_2_alleles.txt,I,gbm_2_anno.vcf|gbm_2_peps.tsv|gbm_2_prot.fasta + or with optional column(s) + + sample,alleles,mhc_class,expression,filename + GBM_1,gbm_1_alleles.txt,I,expression_values_gbm1.tsv,gbm_1_anno.vcf|gbm_1_peps.tsv|gbm_1_prot.fasta + GBM_2,gbm_2_alleles.txt,I,expression_values_gbm2.tsv,gbm_2_anno.vcf|gbm_2_peps.tsv|gbm_2_prot.fasta + where the FileName column contains EITHER a vcf/tsv file with genomic variants, a tsv file (peptides), or a fasta file (proteins) and the Alleles column contains EITHER a string of alleles separated by semicolon or the path to a text file - containing one allele per line (no header) + containing one allele per line (no header). The optional expression column contains a tsv file with expression values (on transcript or + gene level) as generated by the rnaseq pipeline. Further examples: - Class2 allele format => https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/alleles/alleles.DRB1_01_01.txt @@ -246,12 +253,16 @@ def check_samplesheet(file_in, file_out): with open(file_in, "r") as fin: ## Check header - COL_NUM = 4 + MIN_COL = 4 # expression optional HEADER = ["sample", "alleles", "mhc_class", "filename"] header = [x.strip('"') for x in fin.readline().strip().split(",")] + expression_available = "expression" in header valid_classes = "I,II,H-2" valid_class1_loci = ['A*','B*','C*','E*','G*'] valid_class2_loci = ['DR','DP','DQ'] + + if expression_available: + HEADER.insert(3, "expression") if header[: len(HEADER)] != HEADER: print("ERROR: Please check samplesheet header -> {} != {}".format("\t".join(header), "\t".join(HEADER))) sys.exit(1) @@ -267,15 +278,20 @@ def check_samplesheet(file_in, file_out): line, ) num_cols = len([x for x in lspl if x]) - if num_cols != COL_NUM: + if num_cols < MIN_COL: print_error( - "Invalid number of populated columns (valid = {})!".format(COL_NUM), + "Invalid number of populated columns (valid >= {})!".format(MIN_COL), "Line", line, ) ## Check sample name entries - sample, alleles, mhcclass, filename = lspl[: len(HEADER)] + if expression_available: + sample, alleles, mhcclass, expression, filename = lspl[: len(HEADER)] + sample_info = [sample, alleles, mhcclass, expression, filename] + else: + sample, alleles, mhcclass, filename = lspl[: len(HEADER)] + sample_info = [sample, alleles, mhcclass, "", filename] ## Check given file types if not filename.lower().endswith((".vcf", ".vcf.gz", ".tsv", ".GSvar", ".fasta", ".txt")): @@ -289,7 +305,6 @@ def check_samplesheet(file_in, file_out): if not os.path.isfile(alleles) and mhcclass == 'I' and any(substring in alleles for substring in valid_class2_loci) or mhcclass == 'II' and any(substring in alleles for substring in valid_class1_loci): print_error("Samplesheet contains invalid mhc class and allele combination!", "Line", line) - sample_info = [sample, alleles, mhcclass, filename] ## Create sample mapping dictionary if sample not in sample_run_dict: sample_run_dict[sample] = [sample_info] @@ -304,7 +319,7 @@ def check_samplesheet(file_in, file_out): out_dir = os.path.dirname(file_out) make_dir(out_dir) with open(file_out, "w") as fout: - fout.write(",".join(["sample", "alleles","mhc_class","filename"]) + "\n") + fout.write(",".join(["sample", "alleles","mhc_class", "expression", "filename"]) + "\n") for sample in sorted(sample_run_dict.keys()): for val in sample_run_dict[sample]: diff --git a/bin/epaa.py b/bin/epaa.py index 92296b6..1d474e1 100755 --- a/bin/epaa.py +++ b/bin/epaa.py @@ -419,6 +419,9 @@ def read_protein_quant(filename): intensities[p.split('|')[1]] = valuedict return intensities +# parse rnaseq analysis results +# data frame: gene/transcript -> count/TPM +#def read_diff_expression_values(filename): # parse different expression analysis results (DESeq2), link log2fold changes to transcripts/genes def read_diff_expression_values(filename): @@ -486,12 +489,6 @@ def create_mutationsyntax_genome_column_value(pep): return ','.join(set([y.cdsMutationSyntax for y in syntaxes])) -def create_variationfilelinenumber_column_value(pep): - v = [x.vars.values() for x in pep.get_all_transcripts()] - vf = list(itertools.chain.from_iterable(v)) - return ','.join([str(int(y.id.replace('line', ''))+1) for y in vf]) - - def create_gene_column_value(pep): transcript_ids = [x.transcript_id for x in set(pep.get_all_transcripts())] variants = [] @@ -877,7 +874,7 @@ def make_predictions_from_variants(variants_all, methods, tool_thresholds, use_a df['length'] = df['sequence'].map(len) df['chr'] = df['sequence'].map(create_variant_chr_column_value) df['pos'] = df['sequence'].map(create_variant_pos_column_value) - df['gene'] = df['sequence'].map(create_gene_column_value) + df['gene_id'] = df['sequence'].map(create_gene_column_value) df['transcripts'] = df['sequence'].map(create_transcript_column_value) df['proteins'] = df['sequence'].map(create_protein_column_value) df['variant type'] = df['sequence'].map( @@ -1049,8 +1046,8 @@ def __main__(): help="List of gene IDs for ID mapping.", required=False) parser.add_argument('-pq', "--protein_quantification", help="File with protein quantification values") - parser.add_argument('-ge', "--gene_expression", - help="File with expression analysis results") + parser.add_argument('-ge', "--expression", + help="File with rnaseq analysis results", required=False) parser.add_argument('-de', "--diff_gene_expression", help="File with differential expression analysis results (DESeq2)") parser.add_argument('-li', "--ligandomics_id", @@ -1154,7 +1151,6 @@ def __main__(): try: complete_df = pd.concat(pred_dataframes, sort=True) # replace method names with method names with version - # complete_df.replace({'method': methods}, inplace=True) complete_df['method'] = complete_df['method'].apply( lambda x: x.lower() + '-' + methods[x.lower()]) predictions_available = True @@ -1163,16 +1159,25 @@ def __main__(): predictions_available = False logger.error("No predictions available.") + complete_df.replace("gene", "gene_id") + + # get gene names from Ensembl and add them to the data frame + # we want to add gene names to our data frame in order to make the mapping easier + # we will use this when the next epytope release is ready where we already implemented the functionality + #mapping_gene_names_ids = ma.get_gene_name_from_id(complete_df['gene_id'].unique.to_list()) + #mapping_gene_names_ids.columns = ["gene_name", "gene_id"] + #complete_df = complete_df.merge(mapping_gene_names_ids,on='gene_id',how="left") + # include wild type sequences to dataframe if specified if args.wild_type: wt_sequences = generate_wt_seqs(all_peptides_filtered) complete_df['wt sequence'] = complete_df.apply( lambda row: create_wt_seq_column_value(row, wt_sequences), axis=1) columns_tiles = ['sequence', 'wt sequence', 'length', 'chr', 'pos', - 'gene', 'transcripts', 'proteins', 'variant type', 'method'] + 'gene_id', 'transcripts', 'proteins', 'variant type', 'method'] # Change the order (the index) of the columns else: - columns_tiles = ['sequence', 'length', 'chr', 'pos', 'gene', + columns_tiles = ['sequence', 'length', 'chr', 'pos', 'gene_id', 'transcripts', 'proteins', 'variant type', 'method'] for c in complete_df.columns: if c not in columns_tiles: @@ -1206,24 +1211,26 @@ def __main__(): for k in first_entry.keys(): complete_df['{} log2 protein LFQ intensity'.format(k)] = complete_df.apply( lambda row: create_quant_column_value_for_result(row, protein_quant, transcriptSwissProtMap, k), axis=1) - # parse (differential) expression analysis results, annotate features (genes/transcripts) - if args.gene_expression is not None: - fold_changes = read_diff_expression_values(args.gene_expression) - gene_id_lengths = {} - col_name = 'RNA expression (RPKM)' - - with open(args.gene_reference, 'r') as gene_list: - for l in gene_list: - ids = l.split('\t') - gene_id_in_df = complete_df.iloc[1]['gene'] - if 'ENSG' in gene_id_in_df: - gene_id_lengths[ids[0]] = float(ids[2].strip()) - else: - gene_id_lengths[ids[1]] = float(ids[2].strip()) - deseq = False - # add column to result dataframe - complete_df[col_name] = complete_df.apply(lambda row: create_expression_column_value_for_result( - row, fold_changes, deseq, gene_id_lengths), axis=1) + # parse expression (nf-core/rnaseq) analysis results, annotate features (genes/transcripts) + if args.expression is not None: + rnaseq_results = pd.read_csv(args.expression, sep='\t', header=0) + + measure = "count" if "count" in args.expression else "TPM" + transcript_features = "tx" in rnaseq_results.columns + #merge_on = "gene_name" + merge_on = "gene_id" + + # we expect columns: tx gene_id samples + if transcript_features: + rnaseq_results.columns = ["{}{}".format(c, "" if c in ["tx", "gene_id"] else f"_{'transcript'}_{measure}") for c in rnaseq_results.columns] + merge_on = "tx" + # we expect columns: gene_id gene_name samples + else: + rnaseq_results.columns = ["{}{}".format(c, "" if c in ["gene_name", "gene_id"] else f"_{'gene'}_{measure}") for c in rnaseq_results.columns] + + # add sample-specific expression values to data frame + complete_df = complete_df.merge(rnaseq_results,on=merge_on,how="left") + if args.diff_gene_expression is not None: gene_id_lengths = {} fold_changes = read_diff_expression_values(args.diff_gene_expression) diff --git a/modules/local/epytope_peptide_prediction.nf b/modules/local/epytope_peptide_prediction.nf index b6a7eee..e68d582 100644 --- a/modules/local/epytope_peptide_prediction.nf +++ b/modules/local/epytope_peptide_prediction.nf @@ -9,6 +9,7 @@ process EPYTOPE_PEPTIDE_PREDICTION { input: tuple val(meta), path(splitted), path(software_versions) val netmhc_paths + path(expression) output: tuple val(meta), path("*.json"), emit: json @@ -41,6 +42,10 @@ process EPYTOPE_PEPTIDE_PREDICTION { argument = "--use_affinity_thresholds " + argument } + if (expression) { + argument = "--expression ${expression} " + argument + } + def netmhc_paths_string = netmhc_paths.join(",") def tools_split = params.tools.split(',') def class1_tools = tools_split.findAll { ! it.matches('.*(?i)(class-2|ii).*') } diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 1913eea..ed7787e 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -30,12 +30,14 @@ def get_samplesheet_paths(LinkedHashMap row) { meta.alleles = allele_string meta.mhcclass = row.mhc_class meta.inputtype = type + expression = row.expression ? file(row.expression, checkIfExists: true) : [] def array = [] if (!file(row.filename).exists()) { exit 1, "ERROR: Please check input samplesheet -> file does not exist!\n${row.Filename}" - } else { - array = [ meta, file(row.filename) ] + } + else { + array = [meta, expression, file(row.filename)] } return array } diff --git a/workflows/epitopeprediction.nf b/workflows/epitopeprediction.nf index 64c59aa..f35e21f 100644 --- a/workflows/epitopeprediction.nf +++ b/workflows/epitopeprediction.nf @@ -112,7 +112,7 @@ workflow EPITOPEPREDICTION { INPUT_CHECK.out.reads .branch { - meta_data, input_file -> + meta_data, expression, input_file -> variant_compressed : meta_data.inputtype == 'variant_compressed' return [ meta_data, input_file ] variant_uncompressed : meta_data.inputtype == 'variant' @@ -124,6 +124,9 @@ workflow EPITOPEPREDICTION { } .set { ch_samples_from_sheet } + ch_expression = INPUT_CHECK.out.reads + .map { meta_data, expression, input_file -> expression } + // gunzip variant files GUNZIP_VCF ( ch_samples_from_sheet.variant_compressed @@ -342,7 +345,8 @@ workflow EPITOPEPREDICTION { .splitted .combine( ch_prediction_tool_versions ) .transpose(), - EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]) + EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]), + ch_expression ) // Run epitope prediction for peptides @@ -352,7 +356,8 @@ workflow EPITOPEPREDICTION { .splitted .combine( ch_prediction_tool_versions ) .transpose(), - EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]) + EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]), + ch_expression ) // Run epitope prediction for variants @@ -363,7 +368,8 @@ workflow EPITOPEPREDICTION { .mix( ch_split_variants.splitted ) .combine( ch_prediction_tool_versions ) .transpose(), - EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]) + EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]), + ch_expression ) // collect prediction script versions From 6cc654d196a6d22e1adbbdb1f230e630e501e889 Mon Sep 17 00:00:00 2001 From: Christopher Mohr Date: Thu, 25 Aug 2022 13:50:52 +0200 Subject: [PATCH 2/6] add optional input to input schema --- assets/schema_input.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/assets/schema_input.json b/assets/schema_input.json index b564a5c..e4654fc 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -30,6 +30,11 @@ "pattern": "^(I|II|H-2)$", "errorMessage": "The MHC class must be provided. Valid values: " }, + "rnaseq": { + "type": "string", + "pattern": "^\\S+\\.(tsv)$", + "errorMessage": "RNAseq analysis results must have one of the following extensions: ''.tsv''" + }, "filename": { "type": "string", "pattern": "^\\S+\\.(vcf|tsv|fasta|fa|txt)$", From 67cb5494ddaf09699ce522f35c4c79854772da63 Mon Sep 17 00:00:00 2001 From: Christopher Mohr Date: Thu, 25 Aug 2022 16:38:24 +0200 Subject: [PATCH 3/6] update ensembl archive for grch38 --- bin/epaa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/epaa.py b/bin/epaa.py index 1d474e1..32fd5d0 100755 --- a/bin/epaa.py +++ b/bin/epaa.py @@ -1068,7 +1068,7 @@ def __main__(): metadata = [] proteins = [] references = {'GRCh37': 'http://feb2014.archive.ensembl.org', - 'GRCh38': 'http://mar2017.archive.ensembl.org'} + 'GRCh38': 'http://aug2017.archive.ensembl.org'} global transcriptProteinMap global transcriptSwissProtMap From a2af74e85d3659a37862885ba699fad4ffa3e873 Mon Sep 17 00:00:00 2001 From: Christopher Mohr Date: Thu, 25 Aug 2022 16:43:23 +0200 Subject: [PATCH 4/6] allow genome_version in lower case --- nextflow_schema.json | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 10de656..990098c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,10 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": [ + "input", + "outdir" + ], "properties": { "input": { "type": "string", @@ -52,7 +55,12 @@ "type": "string", "default": "GRCh37", "help_text": "This defines against which human reference genome the pipeline performs the analysis including the incorporation of genetic variants e.g..", - "enum": ["GRCh37", "GRCh38"], + "enum": [ + "GRCh37", + "GRCh38", + "grch37", + "grch38" + ], "description": "Specifies the human reference genome version." }, "proteome": { @@ -319,7 +327,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { From 05eb2c91e0aa94eacb2ce428ee151fcecba65513 Mon Sep 17 00:00:00 2001 From: Christopher Mohr Date: Thu, 25 Aug 2022 17:02:28 +0200 Subject: [PATCH 5/6] fix format --- nextflow_schema.json | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 990098c..610af22 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,10 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -55,12 +52,7 @@ "type": "string", "default": "GRCh37", "help_text": "This defines against which human reference genome the pipeline performs the analysis including the incorporation of genetic variants e.g..", - "enum": [ - "GRCh37", - "GRCh38", - "grch37", - "grch38" - ], + "enum": ["GRCh37", "GRCh38", "grch37", "grch38"], "description": "Specifies the human reference genome version." }, "proteome": { @@ -327,14 +319,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { From 069a1450fd108e41940bd7c2f6f48532261ec92d Mon Sep 17 00:00:00 2001 From: Christopher Mohr Date: Wed, 7 Sep 2022 11:18:50 +0200 Subject: [PATCH 6/6] run black formatting --- bin/check_samplesheet.py | 8 ++++---- bin/epaa.py | 28 +++++++++++++++++----------- 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 571bde8..cb76cf2 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -253,13 +253,13 @@ def check_samplesheet(file_in, file_out): with open(file_in, "r") as fin: ## Check header - MIN_COL = 4 # expression optional + MIN_COL = 4 # expression optional HEADER = ["sample", "alleles", "mhc_class", "filename"] header = [x.strip('"') for x in fin.readline().strip().split(",")] expression_available = "expression" in header valid_classes = "I,II,H-2" - valid_class1_loci = ['A*','B*','C*','E*','G*'] - valid_class2_loci = ['DR','DP','DQ'] + valid_class1_loci = ["A*", "B*", "C*", "E*", "G*"] + valid_class2_loci = ["DR", "DP", "DQ"] if expression_available: HEADER.insert(3, "expression") @@ -326,7 +326,7 @@ def check_samplesheet(file_in, file_out): out_dir = os.path.dirname(file_out) make_dir(out_dir) with open(file_out, "w") as fout: - fout.write(",".join(["sample", "alleles","mhc_class", "expression", "filename"]) + "\n") + fout.write(",".join(["sample", "alleles", "mhc_class", "expression", "filename"]) + "\n") for sample in sorted(sample_run_dict.keys()): for val in sample_run_dict[sample]: fout.write(",".join(val) + "\n") diff --git a/bin/epaa.py b/bin/epaa.py index 30b03c7..8a8f6f6 100755 --- a/bin/epaa.py +++ b/bin/epaa.py @@ -466,9 +466,10 @@ def read_protein_quant(filename): intensities[p.split("|")[1]] = valuedict return intensities + # parse rnaseq analysis results # data frame: gene/transcript -> count/TPM -#def read_diff_expression_values(filename): +# def read_diff_expression_values(filename): # parse different expression analysis results (DESeq2), link log2fold changes to transcripts/genes def read_diff_expression_values(filename): @@ -1212,8 +1213,7 @@ def __main__(): metadata = [] proteins = [] - references = {'GRCh37': 'http://feb2014.archive.ensembl.org', - 'GRCh38': 'http://aug2017.archive.ensembl.org'} + references = {"GRCh37": "http://feb2014.archive.ensembl.org", "GRCh38": "http://aug2017.archive.ensembl.org"} global transcriptProteinMap global transcriptSwissProtMap @@ -1330,9 +1330,9 @@ def __main__(): # get gene names from Ensembl and add them to the data frame # we want to add gene names to our data frame in order to make the mapping easier # we will use this when the next epytope release is ready where we already implemented the functionality - #mapping_gene_names_ids = ma.get_gene_name_from_id(complete_df['gene_id'].unique.to_list()) - #mapping_gene_names_ids.columns = ["gene_name", "gene_id"] - #complete_df = complete_df.merge(mapping_gene_names_ids,on='gene_id',how="left") + # mapping_gene_names_ids = ma.get_gene_name_from_id(complete_df['gene_id'].unique.to_list()) + # mapping_gene_names_ids.columns = ["gene_name", "gene_id"] + # complete_df = complete_df.merge(mapping_gene_names_ids,on='gene_id',how="left") # include wild type sequences to dataframe if specified if args.wild_type: @@ -1400,23 +1400,29 @@ def __main__(): ) # parse expression (nf-core/rnaseq) analysis results, annotate features (genes/transcripts) if args.expression is not None: - rnaseq_results = pd.read_csv(args.expression, sep='\t', header=0) + rnaseq_results = pd.read_csv(args.expression, sep="\t", header=0) measure = "count" if "count" in args.expression else "TPM" transcript_features = "tx" in rnaseq_results.columns - #merge_on = "gene_name" + # merge_on = "gene_name" merge_on = "gene_id" # we expect columns: tx gene_id samples if transcript_features: - rnaseq_results.columns = ["{}{}".format(c, "" if c in ["tx", "gene_id"] else f"_{'transcript'}_{measure}") for c in rnaseq_results.columns] + rnaseq_results.columns = [ + "{}{}".format(c, "" if c in ["tx", "gene_id"] else f"_{'transcript'}_{measure}") + for c in rnaseq_results.columns + ] merge_on = "tx" # we expect columns: gene_id gene_name samples else: - rnaseq_results.columns = ["{}{}".format(c, "" if c in ["gene_name", "gene_id"] else f"_{'gene'}_{measure}") for c in rnaseq_results.columns] + rnaseq_results.columns = [ + "{}{}".format(c, "" if c in ["gene_name", "gene_id"] else f"_{'gene'}_{measure}") + for c in rnaseq_results.columns + ] # add sample-specific expression values to data frame - complete_df = complete_df.merge(rnaseq_results,on=merge_on,how="left") + complete_df = complete_df.merge(rnaseq_results, on=merge_on, how="left") if args.diff_gene_expression is not None: gene_id_lengths = {}