Skip to content

Commit

Permalink
resolve conflicts
Browse files Browse the repository at this point in the history
  • Loading branch information
jonasscheid committed Dec 13, 2023
2 parents 1ffdacb + ee311d9 commit db037bd
Show file tree
Hide file tree
Showing 10 changed files with 18 additions and 263 deletions.
10 changes: 1 addition & 9 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,7 @@ jobs:
- NXF_VER: ""
NXF_EDGE: "1"
tests:
[
"test_variant_tsv",
"test_grch38_variant_tsv",
"test_peptides",
"test_peptides_h2",
"test_proteins",
"test_mhcnuggets",
"test_mhcflurry",
]
["test_grch38", "test_peptides", "test_peptides_h2", "test_proteins", "test_mhcnuggets", "test_mhcflurry"]
steps:
- name: Check out pipeline code
uses: actions/checkout@v2
Expand Down
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- [#219](https://github.com/nf-core/epitopeprediction/pull/219) - Fix `EXTERNAL_TOOLS_IMPORT` container registry and bump version

### `Removed`

- [#221](https://github.com/nf-core/epitopeprediction/pull/221) - Remove support for `GSvar` and variant `tsv` input files

## v2.2.1 - WaldhaeuserOst Hotfix - 2023-03-16

### `Fixed`
Expand Down
175 changes: 5 additions & 170 deletions bin/epaa.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,26 +70,6 @@ def get_epytope_annotation(vt, p, r, alt):
return position, reference, alternative


def check_min_req_GSvar(row):
    """
    Check that a GSvar/tsv row provides all mandatory columns.

    A row qualifies when it has the positional columns "#chr", "start", "end",
    "ref" and "obs", plus at least one of the coding annotation columns
    "coding_and_splicing_details", "coding" or "coding_and_splicing".

    :param row: dictionary of a GSvar row (column name -> value)
    :return: boolean, True if min req met
    """
    columns = set(row.keys())
    mandatory = {"#chr", "start", "end", "ref", "obs"}
    # Only one of the annotation column spellings has to be present.
    coding_alternatives = {"coding_and_splicing_details", "coding", "coding_and_splicing"}
    return mandatory <= columns and not coding_alternatives.isdisjoint(columns)


def determine_variant_type(record, alternative):
vt = VariationType.UNKNOWN
if record.is_snp:
Expand Down Expand Up @@ -129,152 +109,6 @@ def determine_zygosity(record):
return isHomozygous


def read_GSvar(filename, pass_only=True):
    """
    Read a GSvar or tsv file (tab-separated variant table) into epytope variants.

    Lines starting with "##" are dropped before parsing. Rows that miss the
    mandatory columns (see check_min_req_GSvar) are omitted with a warning.
    As a side effect, the module-level ID_SYSTEM_USED is set to
    EIdentifierTypes.REFSEQ once a transcript ID containing "NM" is seen.

    :param filename: /path/to/file
    :param pass_only: if True, rows with a non-empty "filter" column are skipped
    :return: tuple of (variants, transcript_ids, metadata_list) where variants
        is a view of the created Variant objects, transcript_ids is the list of
        all annotated transcript IDs (may contain duplicates) and metadata_list
        is the set of column names not consumed as mandatory (meta)data
    """
    global ID_SYSTEM_USED
    # Raw string: the pattern is unchanged, but "\w", "\d" and "\D" are no longer
    # invalid string escapes. Groups: gene, transcript, type, exon, HGVS.c,
    # transcript position, HGVS.p, protein position.
    annotation_re = re.compile(
        r"(\w+):([\w.]+):([&\w]+):\w*:exon(\d+)\D*\d*:(c.\D*([_\d]+)\D*):(p.\D*(\d+)\w*)"
    )

    # list of mandatory (meta)data
    exclusion_list = [
        "start",
        "end",
        "#chr",
        "ref",
        "obs",
        "gene",
        "tumour_genotype",
        "coding_and_splicing_details",
        "variant_details",
        "variant_type",
        "coding_and_splicing",
    ]

    list_vars = []
    lines = []
    transcript_ids = []
    dict_vars = {}

    with open(filename, "rt") as tsvfile:
        tsvreader = csv.DictReader((row for row in tsvfile if not row.startswith("##")), delimiter="\t")
        for row in tsvreader:
            if not check_min_req_GSvar(row):
                logger.warning("read_GSvar: Omitted row! Mandatory columns not present in: \n%s.", row)
                continue
            lines.append(row)

    # get list of additional metadata; fieldnames is None when the file has no header line
    metadata_list = set(tsvreader.fieldnames or []) - set(exclusion_list)

    for mut_id, line in enumerate(lines):
        if "filter" in line and pass_only and line["filter"].strip():
            continue
        genome_start = int(line["start"]) - 1
        chrom = line["#chr"]
        ref = line["ref"]
        alt = line["obs"]
        gene = line.get("gene", "")

        # Homozygous only if both alleles of an "a/b" genotype are equal; a value
        # without "/" is treated as not homozygous instead of raising IndexError.
        genotype_alleles = (line.get("tumour_genotype") or "").split("/")
        isHomozygous = len(genotype_alleles) > 1 and genotype_alleles[0] == genotype_alleles[1]

        # old GSvar version
        if "coding_and_splicing_details" in line:
            mut_type = line.get("variant_details", "")
            annots = annotation_re.findall(line["coding_and_splicing_details"])
        else:
            mut_type = line.get("variant_type", "")
            # Gene, transcript number, type, impact, exon/intron number, HGVS.c, HGVS.p, Pfam
            annots = annotation_re.findall(line["coding_and_splicing"])
        isyn = mut_type == "synonymous_variant"

        # Enum for variation types:
        # type.SNP, type.DEL, type.INS, type.FSDEL, type.FSINS, type.UNKNOWN
        vt = VariationType.UNKNOWN
        if "missense_variant" in mut_type:
            vt = VariationType.SNP
        elif mut_type == "frameshift_variant":
            if (ref == "-") or (len(ref) < len(alt)):
                vt = VariationType.FSINS
            else:
                vt = VariationType.FSDEL
        elif mut_type == "inframe_deletion":
            vt = VariationType.DEL
        elif mut_type == "inframe_insertion":
            vt = VariationType.INS

        coding = {}

        for annot in annots:
            a_gene, transcript_id, a_mut_type, _exon, trans_coding, trans_pos, prot_coding, prot_start = annot
            if "NM" in transcript_id:
                ID_SYSTEM_USED = EIdentifierTypes.REFSEQ
            if "stop_gained" not in mut_type:
                # fall back to the annotation's gene / type when the row itself has none
                if not gene:
                    gene = a_gene
                if not mut_type:
                    mut_type = a_mut_type

                # with the latest epytope release (3.3.1), we can now handle full transcript IDs
                coding[transcript_id] = MutationSyntax(
                    transcript_id, int(trans_pos.split("_")[0]) - 1, int(prot_start) - 1, trans_coding, prot_coding
                )
                transcript_ids.append(transcript_id)

        if coding:
            # Remove only a leading "chr"; str.strip("chr") would strip any of the
            # characters c/h/r from both ends of the name.
            chromosome = chrom[3:] if chrom.startswith("chr") else chrom
            var = Variant(
                mut_id,
                vt,
                chromosome,
                genome_start,
                ref.upper(),
                alt.upper(),
                coding,
                isHomozygous,
                isSynonymous=isyn,
            )
            var.gene = gene

            # metadata logging
            for meta_name in metadata_list:
                var.log_metadata(meta_name, line.get(meta_name, ""))

            dict_vars[var] = var
            list_vars.append(var)

    transToVar = {}

    # fix because of memory/timing issues due to combinatorial explosion
    for variant in list_vars:
        for trans_id in variant.coding.keys():
            transToVar.setdefault(trans_id, []).append(variant)

    for shared_variants in transToVar.values():
        # Transcripts hit by more than 10 variants: rebuild each variant with the
        # homozygous flag forced to True, copying gene and metadata over.
        if len(shared_variants) > 10:
            for v in shared_variants:
                vs_new = Variant(v.id, v.type, v.chrom, v.genomePos, v.ref, v.obs, v.coding, True, v.isSynonymous)
                vs_new.gene = v.gene
                for m in metadata_list:
                    vs_new.log_metadata(m, v.get_metadata(m)[0])
                dict_vars[v] = vs_new
    return dict_vars.values(), transcript_ids, metadata_list


def read_vcf(filename, pass_only=True):
"""
reads vcf files
Expand Down Expand Up @@ -1224,18 +1058,19 @@ def __main__():
logger.info("Running epaa for peptides...")
peptides, metadata = read_peptide_input(args.peptides)
else:
if args.somatic_mutations.endswith(".GSvar") or args.somatic_mutations.endswith(".tsv"):
logger.info("Running epaa for variants...")
variant_list, transcripts, metadata = read_GSvar(args.somatic_mutations)
elif args.somatic_mutations.endswith(".vcf"):
logger.info("Running epaa for variants...")
if args.somatic_mutations.endswith(".vcf"):
variant_list, transcripts, metadata = read_vcf(args.somatic_mutations)
else:
raise ValueError("File is not in VCF format. Please provide a VCF file.")

transcripts = list(set(transcripts))

# use function provided by epytope to retrieve protein IDs (different systems) for transcript IDs
transcriptProteinTable = ma.get_protein_ids_from_transcripts(transcripts, type=ID_SYSTEM_USED)

# get the alleles
# TODO: remove this in PR of nf-validation
if args.alleles.startswith("http"):
alleles = [Allele(a) for a in urllib.request.urlopen(args.alleles).read().decode("utf-8").splitlines()]
elif args.alleles.endswith(".txt"):
Expand Down
7 changes: 0 additions & 7 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -140,13 +140,6 @@ process {
]
}

withName: CSVTK_SPLIT {
publishDir = [
path: { "${params.outdir}/split_input/${meta.sample}" },
mode: params.publish_dir_mode
]
}

withName: GET_PREDICTION_VERSIONS {
publishDir = [
path: { "${params.outdir}/reports" },
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
* -------------------------------------------------
* Defines bundled input files and everything required
* to run a fast and simple test. Use as follows:
* nextflow run nf-core/epitopeprediction -profile test_grch38_variant_tsv,<docker/singularity> --outdir <OUTDIR>
* nextflow run nf-core/epitopeprediction -profile test_grch38,<docker/singularity> --outdir <OUTDIR>
*/

params {
Expand All @@ -13,6 +13,6 @@ params {
max_time = 48.h

// Input data
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/sample_sheets/sample_sheet_variants_tab.csv'
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/sample_sheets/sample_sheet_variants.csv'
genome_version = 'GRCh38'
}
17 changes: 0 additions & 17 deletions conf/test_variant_tsv.config

This file was deleted.

2 changes: 1 addition & 1 deletion docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ GBM_2,alleles.txt,I,gbm_2_variants.vcf
| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
| `alleles` | A string that consists of the patient's alleles (separated by ";"), or a full path to an allele ".txt" file where each allele is saved on a row. |
| `mhc_class` | Specifies the MHC class for which the prediction should be performed. Valid values are: `I`, `II` and `H-2` (mouse). |
| `filename` | Full path to a variant/peptide or protein file (".vcf", ".vcf.gz", "tsv", "fasta", or "GSvar"). |
| `filename` | Full path to a variant/peptide or protein file (".vcf", ".vcf.gz", ".tsv" or ".fasta"). |

An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.

Expand Down
30 changes: 0 additions & 30 deletions modules/local/csvtk_split.nf

This file was deleted.

3 changes: 1 addition & 2 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -204,8 +204,7 @@ profiles {
executor.memory = 8.GB
}
test { includeConfig 'conf/test.config' }
test_variant_tsv { includeConfig 'conf/test_variant_tsv.config' }
test_grch38_variant_tsv { includeConfig 'conf/test_grch38_variant_tsv.config' }
test_grch38 { includeConfig 'conf/test_grch38.config' }
test_peptides { includeConfig 'conf/test_peptides.config' }
test_peptides_h2 { includeConfig 'conf/test_peptides_h2.config' }
test_proteins { includeConfig 'conf/test_proteins.config' }
Expand Down
29 changes: 4 additions & 25 deletions workflows/epitopeprediction.nf
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ include { EPYTOPE_SHOW_SUPPORTED_MODELS }

include { VARIANT_SPLIT} from '../modules/local/variant_split'
include { SNPSIFT_SPLIT} from '../modules/local/snpsift_split'
include { CSVTK_SPLIT} from '../modules/local/csvtk_split'

include { EPYTOPE_GENERATE_PEPTIDES } from '../modules/local/epytope_generate_peptides'
include { SPLIT_PEPTIDES as SPLIT_PEPTIDES_PEPTIDES } from '../modules/local/split_peptides'
Expand Down Expand Up @@ -278,40 +277,22 @@ workflow EPITOPEPREDICTION {
========================================================================================
*/

// Make a division for the variant files and process them further accordingly
ch_samples_uncompressed
.variant
.branch {
meta_data, input_file ->
vcf : input_file.endsWith('.vcf') || input_file.endsWith('.vcf.gz')
return [ meta_data, input_file ]
tab : input_file.endsWith('.tsv') || input_file.endsWith('.GSvar')
return [ meta_data, input_file ]
}
.set { ch_variants }

// decide between the split_by_variants and snpsift_split (by chromosome) function (only vcf and vcf.gz variant files)
// decide between the split_by_variants and snpsift_split (by chromosome) function
if (params.split_by_variants) {
VARIANT_SPLIT(
ch_variants.vcf
ch_samples_uncompressed.variant
)
.set { ch_split_variants }
ch_versions = ch_versions.mix( VARIANT_SPLIT.out.versions )

}
else {
SNPSIFT_SPLIT(
ch_variants.vcf
ch_samples_uncompressed.variant
)
.set { ch_split_variants }
ch_versions = ch_versions.mix( SNPSIFT_SPLIT.out.versions )
}
// include the csvtk_split function (only variant files with an tsv and GSvar executable)
CSVTK_SPLIT(
ch_variants.tab
)

ch_versions = ch_versions.mix( CSVTK_SPLIT.out.versions )

// process FASTA file and generated peptides
EPYTOPE_GENERATE_PEPTIDES(
Expand Down Expand Up @@ -364,10 +345,8 @@ workflow EPITOPEPREDICTION {

// Run epitope prediction for variants
EPYTOPE_PEPTIDE_PREDICTION_VAR(
CSVTK_SPLIT
.out
ch_split_variants
.splitted
.mix( ch_split_variants.splitted )
.combine( ch_prediction_tool_versions )
.transpose(),
EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([])
Expand Down

0 comments on commit db037bd

Please sign in to comment.