Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add rnaseq input #174

Open
wants to merge 8 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions assets/schema_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@
"pattern": "^(I|II|H-2)$",
"errorMessage": "The MHC class must be provided. Valid values: "
},
"rnaseq": {
"type": "string",
"pattern": "^\\S+\\.(tsv)$",
"errorMessage": "RNAseq analysis results must have one of the following extensions: ''.tsv''"
},
"filename": {
"type": "string",
"pattern": "^\\S+\\.(vcf|tsv|fasta|fa|txt)$",
Expand Down
19 changes: 16 additions & 3 deletions bin/check_samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,10 +229,17 @@ def check_samplesheet(file_in, file_out):
GBM_1,gbm_1_alleles.txt,I,gbm_1_anno.vcf|gbm_1_peps.tsv|gbm_1_prot.fasta
GBM_2,gbm_2_alleles.txt,I,gbm_2_anno.vcf|gbm_2_peps.tsv|gbm_2_prot.fasta

or with optional column(s)

sample,alleles,mhc_class,expression,filename
GBM_1,gbm_1_alleles.txt,I,expression_values_gbm1.tsv,gbm_1_anno.vcf|gbm_1_peps.tsv|gbm_1_prot.fasta
GBM_2,gbm_2_alleles.txt,I,expression_values_gbm2.tsv,gbm_2_anno.vcf|gbm_2_peps.tsv|gbm_2_prot.fasta


where the FileName column contains EITHER a vcf/tsv file with genomic variants, a tsv file (peptides), or a fasta file (proteins)
and the Alleles column contains EITHER a string of alleles separated by semicolon or the path to a text file
containing one allele per line (no header)
containing one allele per line (no header). The optional expression column contains a tsv file with expression values (on transcript or
gene level) as generated by the rnaseq pipeline.

Further examples:
- Class2 allele format => https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/alleles/alleles.DRB1_01_01.txt
Expand All @@ -251,7 +258,14 @@ def check_samplesheet(file_in, file_out):
## Check header
valid_header = ["sample", "alleles", "mhc_class", "filename"]
header = [x.strip('"') for x in samplesheet.readline().strip().split(",")]
if len(header) != 4:
header_length = 4 # expression optional

expression_available = "expression" in header
if expression_available:
valid_header.insert(3, "expression")
header += 1

if len(header) != header_length:
raise ValueError(
f"Invalid number of header columns! Make sure the samplesheet is properly comma-separated."
)
Expand Down Expand Up @@ -279,7 +293,6 @@ def check_samplesheet(file_in, file_out):
for row in rows:
fout.write(",".join(row) + "\n")


def main(argv=None):
"""Coordinate argument parsing and program execution."""
args = parse_args(argv)
Expand Down
63 changes: 40 additions & 23 deletions bin/epaa.py
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,10 @@ def read_protein_quant(filename):
return intensities


# parse rnaseq analysis results
# data frame: gene/transcript -> count/TPM
# def read_diff_expression_values(filename):

# parse different expression analysis results (DESeq2), link log2fold changes to transcripts/genes
def read_diff_expression_values(filename):
# feature id: log2fold changes
Expand Down Expand Up @@ -625,15 +629,12 @@ def create_mutationsyntax_genome_column_value(pep, pep_dictionary):
syntaxes.append(variant.coding[coding])
return ",".join(set([mutationSyntax.cdsMutationSyntax for mutationSyntax in syntaxes]))


def create_gene_column_value(pep, pep_dictionary):
return ",".join(set([variant.gene for variant in set(pep_dictionary[pep])]))


def create_variant_pos_column_value(pep, pep_dictionary):
return ",".join(set(["{}".format(variant.genomePos) for variant in set(pep_dictionary[pep])]))


def create_variant_chr_column_value(pep, pep_dictionary):
return ",".join(set(["{}".format(variant.chrom) for variant in set(pep_dictionary[pep])]))

Expand Down Expand Up @@ -1184,7 +1185,7 @@ def __main__():
parser.add_argument("-rp", "--reference_proteome", help="Reference proteome for self-filtering", required=False)
parser.add_argument("-gr", "--gene_reference", help="List of gene IDs for ID mapping.", required=False)
parser.add_argument("-pq", "--protein_quantification", help="File with protein quantification values")
parser.add_argument("-ge", "--gene_expression", help="File with expression analysis results")
parser.add_argument("-ge", "--expression", help="File with expression analysis results")
parser.add_argument(
"-de", "--diff_gene_expression", help="File with differential expression analysis results (DESeq2)"
)
Expand Down Expand Up @@ -1328,6 +1329,15 @@ def __main__():
predictions_available = False
logger.error("No predictions available.")

complete_df.replace("gene", "gene_id")

# get gene names from Ensembl and add them to the data frame
# we want to add gene names to our data frame in order to make the mapping easier
# we will use this when the next epytope release is ready where we already implemented the functionality
# mapping_gene_names_ids = ma.get_gene_name_from_id(complete_df['gene_id'].unique.to_list())
# mapping_gene_names_ids.columns = ["gene_name", "gene_id"]
# complete_df = complete_df.merge(mapping_gene_names_ids,on='gene_id',how="left")

# include wild type sequences to dataframe if specified
if args.wild_type:
if args.peptides:
Expand Down Expand Up @@ -1396,25 +1406,32 @@ def __main__():
complete_df["{} log2 protein LFQ intensity".format(k)] = complete_df.apply(
lambda row: create_quant_column_value_for_result(row, protein_quant, transcriptSwissProtMap, k), axis=1
)
# parse (differential) expression analysis results, annotate features (genes/transcripts)
if args.gene_expression is not None:
fold_changes = read_diff_expression_values(args.gene_expression)
gene_id_lengths = {}
col_name = "RNA expression (RPKM)"

with open(args.gene_reference, "r") as gene_list:
for l in gene_list:
ids = l.split("\t")
gene_id_in_df = complete_df.iloc[1]["gene"]
if "ENSG" in gene_id_in_df:
gene_id_lengths[ids[0]] = float(ids[2].strip())
else:
gene_id_lengths[ids[1]] = float(ids[2].strip())
deseq = False
# add column to result dataframe
complete_df[col_name] = complete_df.apply(
lambda row: create_expression_column_value_for_result(row, fold_changes, deseq, gene_id_lengths), axis=1
)
# parse expression (nf-core/rnaseq) analysis results, annotate features (genes/transcripts)
if args.expression is not None:
rnaseq_results = pd.read_csv(args.expression, sep="\t", header=0)

measure = "count" if "count" in args.expression else "TPM"
jonasscheid marked this conversation as resolved.
Show resolved Hide resolved
transcript_features = "tx" in rnaseq_results.columns
# merge_on = "gene_name"
merge_on = "gene_id"

# we expect columns: tx gene_id samples
if transcript_features:
rnaseq_results.columns = [
"{}{}".format(c, "" if c in ["tx", "gene_id"] else f"_{'transcript'}_{measure}")
for c in rnaseq_results.columns
]
merge_on = "tx"
# we expect columns: gene_id gene_name samples
else:
rnaseq_results.columns = [
"{}{}".format(c, "" if c in ["gene_name", "gene_id"] else f"_{'gene'}_{measure}")
for c in rnaseq_results.columns
]

# add sample-specific expression values to data frame
complete_df = complete_df.merge(rnaseq_results, on=merge_on, how="left")

if args.diff_gene_expression is not None:
gene_id_lengths = {}
fold_changes = read_diff_expression_values(args.diff_gene_expression)
Expand Down
5 changes: 5 additions & 0 deletions modules/local/epytope_peptide_prediction.nf
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ process EPYTOPE_PEPTIDE_PREDICTION {
input:
tuple val(meta), path(splitted), path(software_versions)
val netmhc_paths
path(expression)
jonasscheid marked this conversation as resolved.
Show resolved Hide resolved

output:
tuple val(meta), path("*.json"), emit: json
Expand Down Expand Up @@ -42,6 +43,10 @@ process EPYTOPE_PEPTIDE_PREDICTION {
argument = "--use_affinity_thresholds " + argument
}

if (expression) {
argument = "--expression ${expression} " + argument
}

def netmhc_paths_string = netmhc_paths.join(",")
def tools_split = params.tools.split(',')
def class1_tools = tools_split.findAll { ! it.matches('.*(?i)(class-2|ii).*') }
Expand Down
6 changes: 4 additions & 2 deletions subworkflows/local/input_check.nf
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,14 @@ def get_samplesheet_paths(LinkedHashMap row) {
meta.alleles = row.alleles
meta.mhcclass = row.mhc_class
meta.inputtype = row.inputtype
expression = row.expression ? file(row.expression, checkIfExists: true) : []

def array = []
if (!file(row.filename).exists()) {
exit 1, "ERROR: Please check input samplesheet -> file does not exist!\n${row.Filename}"
} else {
array = [ meta, file(row.filename) ]
}
else {
array = [meta, file(row.filename)]
}
return array
}
Expand Down
14 changes: 10 additions & 4 deletions workflows/epitopeprediction.nf
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ workflow EPITOPEPREDICTION {

INPUT_CHECK.out.meta
.branch {
meta_data, input_file ->
meta_data, expression, input_file ->
variant_compressed : meta_data.inputtype == 'variant_compressed'
return [ meta_data, input_file ]
variant_uncompressed : meta_data.inputtype == 'variant'
Expand All @@ -127,6 +127,9 @@ workflow EPITOPEPREDICTION {
}
.set { ch_samples_from_sheet }

ch_expression = INPUT_CHECK.out.reads
.map { meta_data, expression, input_file -> expression }

// gunzip variant files
GUNZIP_VCF (
ch_samples_from_sheet.variant_compressed
Expand Down Expand Up @@ -348,7 +351,8 @@ workflow EPITOPEPREDICTION {
.splitted
.combine( ch_prediction_tool_versions )
.transpose(),
EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([])
EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]),
ch_expression
)
ch_versions = ch_versions.mix( EPYTOPE_PEPTIDE_PREDICTION_PROTEIN.out.versions.ifEmpty(null) )

Expand All @@ -360,7 +364,8 @@ workflow EPITOPEPREDICTION {
.splitted
.combine( ch_prediction_tool_versions )
.transpose(),
EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([])
EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]),
ch_expression
)
ch_versions = ch_versions.mix( EPYTOPE_PEPTIDE_PREDICTION_PEP.out.versions.ifEmpty(null) )

Expand All @@ -373,7 +378,8 @@ workflow EPITOPEPREDICTION {
.mix( ch_split_variants.splitted )
.combine( ch_prediction_tool_versions )
.transpose(),
EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([])
EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]),
ch_expression
)
ch_versions = ch_versions.mix( EPYTOPE_PEPTIDE_PREDICTION_VAR.out.versions.ifEmpty(null) )

Expand Down
Loading