diff --git a/.DS_Store b/.DS_Store index c41164ec8..a286b3a78 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/BALSAMIC/assets/scripts/merge_mnp.py b/BALSAMIC/assets/scripts/merge_mnp.py new file mode 100755 index 000000000..c21d40816 --- /dev/null +++ b/BALSAMIC/assets/scripts/merge_mnp.py @@ -0,0 +1,537 @@ +#!/usr/bin/env python +""" + +Copyright (c) Sentieon Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Script to combine neighboring PASS variants on the same haplotype, +supporting user-defined merging strategy with the corresponding +arguments. +Original variants that are combined will be marked as "MERGED". +Usage: + python merge_mnp.py $VCF $FASTA +Usage with codon file (requires merge_by_codon.py strategy file): + python merge_mnp.py $VCF $FASTA merge_by_codon $CODON_FILE $IGNORE_INDELS + +Requirements: +This script requires access to the vcflib library contained in the Sentieon +software package located in $SENTIEON_INSTALL_DIR/lib/python/sentieon. +""" +from __future__ import print_function + +import vcflib +import copy +import click +import sys +import pyfaidx +from typing import List, Union, Optional + +global vcf, reference + + +def ifmerge( + variant1: vcflib.vcf.Variant, variant2: vcflib.vcf.Variant, max_distance: int +) -> bool: + """ + Determine whether two variants should be merged based on specific conditions. + + Parameters: + variant1 (vcflib.vcf.Variant): The first variant object. + variant2 (vcflib.vcf.Variant): The second variant object. + max_distance (int): The maximum allowable distance between two variants + for them to be considered for merging. + + Returns: + bool: True if the variants should be merged, False otherwise. + """ + # Check if either variant contains structural variation information + if "SVTYPE" in variant1.info or "SVTYPE" in variant2.info: + return False + + # Ensure variants are on the same chromosome and within the maximum distance + if ( + variant1.chrom != variant2.chrom + or variant2.pos - variant1.pos > max_distance + len(variant1.ref) - 1 + ): + return False + + # Extract phasing-related information from samples + pid1 = variant1.samples[0].get("PID", "") + pid2 = variant2.samples[0].get("PID", "") + pgt1 = variant1.samples[0].get("PGT", "") + pgt2 = variant2.samples[0].get("PGT", "") + ps1 = variant1.samples[0].get("PS", "") + ps2 = variant2.samples[0].get("PS", "") + + # Check if variants share the same PID and PGT, or the same PS + if (pid1 == pid2 and pgt1 == pgt2 and pid1 and pgt1) or (ps1 == ps2 and ps1): + return True + + return False + + +def distance( + v1: Union[List[vcflib.vcf.Variant], vcflib.vcf.Variant], v2: vcflib.vcf.Variant +) -> int: + """ + Calculate the distance between two VCF variants. If `v1` is a list of variants, + the function will calculate the distance to the last variant in the list. + + Parameters: + v1 (Union[List[vcflib.vcf.Variant], vcflib.VCF.Variant]): The first variant(s). + v2 (vcflib.vcf.Variant): The second variant. + + Returns: + int: The calculated distance between the two variants. Returns 1000 if variants + are on different chromosomes or if the distance cannot be determined. + """ + if isinstance(v1, list): + # If v1 is a list of variants, return the distance to the last variant in the list + for v in v1[::-1]: # Reverse list iteration + return v2.pos - v.pos - len(v.ref) + 1 + return 1000 + + if v1.chrom != v2.chrom: + # If variants are on different chromosomes, return a large distance + return 1000 + + # Calculate distance between two variants on the same chromosome + return v2.pos - v1.pos - len(v1.ref) + 1 + + +def merge( + variant_stack: List[vcflib.vcf.Variant], + max_distance: int, + preserve_filters: List[str], +) -> List[vcflib.vcf.Variant]: + """ + Merges a stack of overlapping variants into a single variant. + + This function takes a stack of variants that are close enough (within a specified max_distance) + and merges them into one variant, combining their reference and alternate alleles, quality scores, + and sample information. + + Parameters: + variant_stack (List[vcflib.vcf.Variant]): A list of variants to be merged. + max_distance (int): The maximum distance between variants for them to be considered for merging. + preserve_filters (str): Filters which when uniquely present (does not also contain PASS) in the same merged MNV is not set to conflicting. + + Returns: + List[vcflib.vcf.Variant]: A list of merged variants. + """ + + def _merge(vv: List[vcflib.vcf.Variant]) -> vcflib.vcf.Variant: + """ + Merges a list of variants into a single variant by combining their properties. + + This function merges reference, alternate alleles, quality, INFO fields, and sample information + of overlapping variants in the list. + + Parameters: + vv (List[vcflib.vcf.Variant]): The list of variants to be merged. + + Returns: + vcflib.vcf.Variant: The merged variant. + """ + len_vv = len(vv) + + # Check if variants overlap in positions, return None if not + for i in range(len_vv - 1): + if vv[i + 1].pos - vv[i].pos < len(vv[i].ref): + return None + + # Initialize the merged variant with the first variant in the list + v = copy.deepcopy(vv[0]) + v.id = "." # Clear the ID for merged variant + ref = vv[0].ref + alt = vv[0].alt + + # Combine REF and ALT sequences for all variants + for i in range(1, len_vv): + vi = vv[i] + ref_gap = "" + if vi.pos - (v.pos + len(ref)) > 0: + ref_gap = reference[v.chrom][v.pos + len(ref) : vi.pos].seq.upper() + ref = ref + ref_gap + vi.ref + alt = [alt[j] + ref_gap + vi.alt[j] for j in range(len(alt))] + + # Calculate the merged quality score + qual_list = [vi.qual for vi in vv] + v.qual = round(sum(qual_list) / len_vv, 4) if None not in qual_list else None + + # Merge INFO fields by averaging the values + for k in v.info.keys(): + vk = [vi.info.get(k) for vi in vv] + if None in vk: + v.info[k] = None + else: + try: + v.info[k] = round(sum(vk) / len_vv, 4) + except (TypeError, ZeroDivisionError): + v.info[k] = None + + # Remove common leading bases between REF and ALT alleles + i = 0 + while i < len(ref) - 1: + common = False not in [ + ref[i] == alti[i] if i < len(alti) - 1 else False for alti in alt + ] + if not common: + break + i += 1 + v.ref = ref[i:] + v.alt = [alti[i:] for alti in alt] + v.pos += i + + # Build TNSCOPE_MNV_FILTERS by joining filters for each variant in vv. + tnscope_mnv_filters = ["|".join(vi.filter) for vi in vv] + + # Map each variant's index to its filter list. + vi_filters = {i: vi.filter for i, vi in enumerate(vv)} + + # Collect all unique filters from all variants. + all_filters = {flt for vi in vv for flt in vi.filter} + + # Determine for each variant if it contains any preserve filter. + preserve_variants = [ + any(f in preserve_filters for f in filters) + for filters in vi_filters.values() + ] + + # Set the merged variant's filter based on the preserve logic. + if all(preserve_variants): + v.filter = all_filters + else: + v.filter = ( + ["MNV_CONFLICTING_FILTERS"] + if len(all_filters) > 1 + else [all_filters.pop()] + ) + + # Mark all constituent variants as "MERGED" + # Create list of all constituent SNV chrom,pos,ref,alts + tnscope_mnv_vars: list = [] + for vi in vv: + if "MERGED" not in vi.filter: + vi.filter.append("MERGED") + vi_alt = vi.alt[0] + tnscope_mnv_vars.append(f"{vi.chrom}_{vi.pos}_{vi.ref}_{vi_alt}") + + # Merge sample information (AF, AD, AFDP) + tnscope_mnv_afs = {} + tnscope_mnv_ads = {} + for i in range(len(vcf.samples)): + t = v.samples[i] + vv_samples = [vs.samples[i] for vs in vv] + sample_af = [vi.get("AF") for vi in vv_samples] + tnscope_mnv_afs[i] = sample_af + + # Handle AF (allele frequency) + if None not in sample_af: + if isinstance(vv_samples[0]["AF"], list): + t["AF"] = [ + sum([vsi["AF"][j] for vsi in vv_samples]) / len_vv + for j in range(len(alt)) + ] + else: + t["AF"] = sum(sample_af) / len_vv + + # Handle AD (allele depths) + ref_ads: list = [vi["AD"][0] for vi in vv_samples] + alt_ads: list = [vi["AD"][1] for vi in vv_samples] + tnscope_mnv_ads[i]: list = ( + f"{ref}|{alt}" for ref, alt in zip(ref_ads, alt_ads) + ) + + t["AD"] = ( + int(sum(ref_ads) / len_vv), + int(sum(alt_ads) / len_vv), + ) + + # Handle AFDP (allele frequency depth) + afdp_values = [vi.get("AFDP") for vi in vv_samples] + if None not in afdp_values: + t["AFDP"] = int(sum(afdp_values) / len_vv) + + v.info["TNSCOPE_MNV_FILTERS"] = ",".join(tnscope_mnv_filters) + v.info["TNSCOPE_MNV_TUMOR_AFs"] = ",".join(str(af) for af in tnscope_mnv_afs[0]) + if 1 in tnscope_mnv_afs: + v.info["TNSCOPE_MNV_NORMAL_AFs"] = ",".join( + str(af) for af in tnscope_mnv_afs[1] + ) + v.info["TNSCOPE_MNV_TUMOR_ADs"] = ",".join(tnscope_mnv_ads[0]) + if 1 in tnscope_mnv_ads: + v.info["TNSCOPE_MNV_NORMAL_ADs"] = ",".join(tnscope_mnv_ads[1]) + v.info["TNSCOPE_MNV_VARS"] = ",".join(tnscope_mnv_vars) + + # Format the final variant + _ = vcf.format(v) + for vi in vv: + _ = vcf.format(vi) + + return v + + vlist = [] # List to store the final merged variants + to_push = [] # Temporary list to store variants being merged + + # Iterate over the variant stack and merge overlapping variants + for i in range(len(variant_stack)): + vi: vcflib.vcf.Variant = variant_stack[i] + to_merge = [vi] + for j in range(i + 1, len(variant_stack)): + vj: vcflib.vcf.Variant = variant_stack[j] + # If variant has already been merged, ignore it + if "MERGED" in vj.filter: + continue + if ifmerge(to_merge[-1], vj, max_distance): + to_merge.append(vj) + + # If there are variants to merge, merge them + if len(to_merge) > 1: + merged = _merge(to_merge) + if merged: + to_push.append(merged) + + # Add merged variants to the final list if they are in order + while len(to_push): + if to_push[0].pos <= vi.pos: + vlist.append(to_push.pop(0)) + else: + break + vlist.append(vi) + + # Append any remaining merged variants + while len(to_push): + vlist.append(to_push.pop(0)) + + return vlist + + +def process( + vcf_file: str, + ref_file: str, + out_file: Optional[str], + max_distance: int, + preserve_filters: List[str], +) -> None: + """ + Processes a VCF file, merges neighboring variants into MNVs, and writes the result to an output file or stdout. + + Parameters: + vcf_file (str): Path to the input VCF file to process. + ref_file (str): Path to the reference genome file. + out_file (Optional[str]): Path to the output VCF file. If not specified, the result is printed to stdout. + max_distance (int): Maximum distance between two variants to be merged. + preserve_filters (str): Filters which when uniquely present (does not also contain PASS) in the same merged MNV is not set to conflicting. + + This function reads the input VCF file, processes each variant, and merges variants that are close + enough to each other based on the `max_distance` parameter. The result is written to the specified + output file or to stdout if no output file is provided. + """ + global vcf, reference + vcf = vcflib.VCF(vcf_file) + reference = pyfaidx.Fasta(ref_file) + + # Open output file (or use stdout if no file is specified) + out_fh = open(out_file, "w") if out_file else sys.stdout + + # Define and add the MERGED filter to the VCF header if not already present + new_filters = { + "MERGED": { + "Description": '"SNV Merged with neighboring variants"', + "ID": "MERGED", + }, + "MNV_CONFLICTING_FILTERS": { + "Description": '"Merged MNV contains SNVs with conflicting filters, such as triallelic_site and in_normal"', + "ID": "MNV_CONFLICTING_FILTERS", + }, + } + new_info_fieds = { + "TNSCOPE_MNV_FILTERS": { + "Description": '"A list of filters from each respective constituent SNV (separated by |)"', + "ID": "TNSCOPE_MNV_FILTERS", + "Number": ".", + "Type": "String", + }, + "TNSCOPE_MNV_TUMOR_AFs": { + "Description": '"A list of AFs from each respective constituent SNV in tumor sample"', + "ID": "TNSCOPE_MNV_TUMOR_AFs", + "Number": ".", + "Type": "String", + }, + "TNSCOPE_MNV_TUMOR_ADs": { + "Description": '"A list of ADs from each respective constituent SNV (ref, alt separated by |) in tumor sample"', + "ID": "TNSCOPE_MNV_TUMOR_ADs", + "Number": ".", + "Type": "String", + }, + "TNSCOPE_MNV_NORMAL_AFs": { + "Description": '"A list of AFs from each respective constituent SNV in normal sample"', + "ID": "TNSCOPE_MNV_NORMAL_AFs", + "Number": ".", + "Type": "String", + }, + "TNSCOPE_MNV_NORMAL_ADs": { + "Description": '"A list of ADs from each respective constituent SNV (ref, alt separated by |) in normal sample"', + "ID": "TNSCOPE_MNV_NORMAL_ADs", + "Number": ".", + "Type": "String", + }, + "TNSCOPE_MNV_VARS": { + "Description": '"A list of chrom_pos_ref_alt strings from each respective constituent SNV"', + "ID": "TNSCOPE_MNV_VARS", + "Number": ".", + "Type": "String", + }, + } + for new_filter_id, description_dict in new_filters.items(): + vcf.filters[new_filter_id] = description_dict + + filter_added, info_added = False, False + for header_line in vcf.headers: + if header_line.startswith("##FILTER") and not filter_added: + for new_filter_id, description_dict in new_filters.items(): + filter_id = description_dict["ID"] + description = description_dict["Description"] + print( + f"##FILTER=", + file=out_fh, + ) + filter_added = True + if header_line.startswith("##INFO") and not info_added: + for new_info_id, description_dict in new_info_fieds.items(): + info_id = description_dict["ID"] + description = description_dict["Description"] + number = description_dict["Number"] + info_type = description_dict["Type"] + print( + f"##INFO=", + file=out_fh, + ) + info_added = True + print(header_line, file=out_fh) + + # Process variants and merge them if they are within max_distance + variant_stack = [] + for variant in vcf: + if not variant_stack: + # If the stack is empty, add the variant or print if it's an SV + if "SVTYPE" in variant.info: + print(variant, file=out_fh) + else: + variant_stack.append(variant) + else: + # Check if the current variant is close enough to merge + if distance(variant_stack, variant) <= max_distance: + variant_stack.append(variant) + else: + # Merge variants in the stack and reset it + merged_variants = merge(variant_stack, max_distance, preserve_filters) + for merged_variant in merged_variants: + print(merged_variant, file=out_fh) + variant_stack = [] + # Print the current variant or add it to the stack + if "SVTYPE" in variant.info: + print(variant, file=out_fh) + else: + variant_stack.append(variant) + + # Process any remaining variants in the stack after the loop + merged_variants = merge(variant_stack, max_distance, preserve_filters) + for merged_variant in merged_variants: + print(merged_variant, file=out_fh) + + # Close the output file if it was specified + if out_file: + out_fh.close() + + +@click.command() +@click.argument("vcf_file", type=click.Path(exists=True)) +@click.argument("reference", type=click.Path(exists=True)) +@click.option( + "--out_file", + type=click.Path(), + default=None, + help="Output VCF file. If not specified, output will be written to stdout.", +) +@click.option( + "--max_distance", + type=int, + default=5, + show_default=True, + help="Maximum distance between two variants to be merged.", +) +@click.option( + "--preserve_filters", + type=str, + default="in_normal,germline_risk", + show_default=True, + help="Filters which when uniquely present (does not also contain PASS) in the same merged MNV is not set to conflicting, but is preserved as is.", +) +def main( + vcf_file: str, + reference: str, + out_file: Optional[str], + max_distance: int, + preserve_filters: str, +) -> None: + """ + Merge phased SNVs into MNVs and output the result to a VCF file or stdout. + + Parameters: + vcf_file (str): Path to the input VCF file to process. + reference (str): Path to the reference genome file. + out_file (Optional[str]): Path to the output VCF file. If not specified, the result will be printed to stdout. + max_distance (int): Maximum allowed distance between two variants to be merged. + preserve_filters (str): Comma separated list of filters to not automatically include in MNV_CONFLICTING_FILTERS + + This command processes the input VCF file and merges phased single nucleotide variants (SNVs) + into multi-nucleotide variants (MNVs) if they meet the criteria based on `max_distance`. + """ + # Call the process function to perform the actual merging + process(vcf_file, reference, out_file, max_distance, preserve_filters.split(",")) + + +if __name__ == "__main__": + main() + + +""" +NOTE: + +Modifications by Clinical Genomics Stockholm, in the context of: https://github.com/Clinical-Genomics/BALSAMIC + +- Filter PASS is no longer a requirement for MNVs to be merged + +This was added because we have additional filters, like triallelic_site, and also variants with normal/germline filters set that we want to merge into MNVs. + +- Additional INFO fields which saves AD, AF, FILTER and CHROM_POS_REF_ALT of the constituent SNVs/InDels + +This was added mainly for the ability of tracing back the filters of the constituent SNVs/InDels + +- New argument for preserve_filters + +This was added as an extra logic in the merging of filters in the creation of the MNV. +The filters listed in the preserve_filters argument are kept as filters as long as there are no other other filters in the same MNV (such as PASS / triallelic_site), at which point the filter will be set to MNV_CONFLICTING_FILTERS +""" diff --git a/BALSAMIC/constants/cluster_analysis.json b/BALSAMIC/constants/cluster_analysis.json index f9b63198c..941fb0858 100644 --- a/BALSAMIC/constants/cluster_analysis.json +++ b/BALSAMIC/constants/cluster_analysis.json @@ -388,7 +388,7 @@ "time": "01:00:00", "n": 4 }, - "modify_tnscope_infofield": { + "post_process_tnscope": { "time": "01:00:00", "n": 4 }, diff --git a/BALSAMIC/constants/rules.py b/BALSAMIC/constants/rules.py index 4c49e8f6c..cc8ee7824 100644 --- a/BALSAMIC/constants/rules.py +++ b/BALSAMIC/constants/rules.py @@ -200,8 +200,9 @@ "vep_annotate_germlineVAR_tumor", "vep_annotate_germlineVAR_normal", # SNVs - "modify_tnscope_infofield", - "modify_tnscope_infofield_umi", + "bcftools_split_tnscope_variants", + "sentieon_tnscope_umi", + "sentieon_tnscope_umi_tn", "gatk_update_vcf_sequence_dictionary", "bcftools_filter_tnscope_clinical_tumor_only", "bcftools_filter_tnscope_clinical_tumor_normal", diff --git a/BALSAMIC/constants/variant_filters.py b/BALSAMIC/constants/variant_filters.py index 4a873d751..c1fe7a541 100644 --- a/BALSAMIC/constants/variant_filters.py +++ b/BALSAMIC/constants/variant_filters.py @@ -329,6 +329,9 @@ class WgsSNVFilters(BaseSNVFilters): class TgaSNVFilters(BaseSNVFilters): research = [ + VCFFilter( + filter_name="MERGED", Description="SNV Merged with neighboring variants" + ), VCFFilter(tag_value=0.01, filter_name="SWEGENAF", field="INFO"), VCFFilter(tag_value=0.005, filter_name="balsamic_high_pop_freq", field="INFO"), ] diff --git a/BALSAMIC/constants/workflow_params.py b/BALSAMIC/constants/workflow_params.py index 943ee6f2c..836c57a1a 100644 --- a/BALSAMIC/constants/workflow_params.py +++ b/BALSAMIC/constants/workflow_params.py @@ -115,7 +115,7 @@ }, } -SLEEP_BEFORE_START = 600 +SLEEP_BEFORE_START = 800 WORKFLOW_PARAMS = { "bam_post_processing": { diff --git a/BALSAMIC/containers/varcall_py3/varcall_py3.yaml b/BALSAMIC/containers/varcall_py3/varcall_py3.yaml index 4f106103e..d59f8e6be 100644 --- a/BALSAMIC/containers/varcall_py3/varcall_py3.yaml +++ b/BALSAMIC/containers/varcall_py3/varcall_py3.yaml @@ -104,6 +104,7 @@ dependencies: - pthread-stubs=0.4 - pycosat=0.6.4 - pycparser=2.20 + - pyfaidx=0.8.1.3 - pyopenssl=20.0.1 - pysam=0.19.1 - pysocks=1.7.1 diff --git a/BALSAMIC/snakemake_rules/umi/modify_tnscope_infofield_umi.rule b/BALSAMIC/snakemake_rules/umi/modify_tnscope_infofield_umi.rule index 4e4371c3b..8f6727347 100644 --- a/BALSAMIC/snakemake_rules/umi/modify_tnscope_infofield_umi.rule +++ b/BALSAMIC/snakemake_rules/umi/modify_tnscope_infofield_umi.rule @@ -1,15 +1,14 @@ rule modify_tnscope_infofield_umi: input: - vcf_tnscope_umi = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope_umi.preprocess.vcf.gz", + vcf_tnscope_umi = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope_umi.vcf.gz", output: - vcf_tnscope_umi = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope_umi.vcf.gz", + vcf_tnscope_umi = vcf_dir + "sentieon_tnscope/SNV.somatic." + config["analysis"]["case_id"] + ".tnscope_umi.post_process.vcf.gz", benchmark: Path(benchmark_dir,'modify_tnscope_infofield_umi_' + config[ "analysis" ][ "case_id" ] + ".tsv").as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() params: - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, modify_tnscope_infofield = get_script_path("modify_tnscope_infofield.py"), tmpdir = tempfile.mkdtemp(prefix=tmp_dir), case_name = config["analysis"]["case_id"] diff --git a/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope.rule b/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope.rule index 5e9665299..4fd414c0b 100644 --- a/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope.rule +++ b/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope.rule @@ -12,12 +12,13 @@ rule sentieon_tnscope_umi: bed = config["panel"]["capture_kit"], dbsnp = config["reference"]["dbsnp"] output: - vcf_tnscope_umi = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope_umi.preprocess.vcf.gz", + vcf_tnscope_umi = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope_umi.vcf.gz", namemap = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope_umi.sample_name_map" benchmark: Path(benchmark_dir, "sentieon_tnscope_umi_" + config["analysis"]["case_id"] + ".tsv").as_posix() params: tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + housekeeper_id = {"id": config["analysis"]["case_id"],"tags": "research"}, sentieon_exec = config_model.sentieon.sentieon_exec, sentieon_lic = config_model.sentieon.sentieon_license, tumor_af = params.tnscope_umi.filter_tumor_af, diff --git a/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule b/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule index 863feb249..e400c73f2 100644 --- a/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule +++ b/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule @@ -12,12 +12,13 @@ rule sentieon_tnscope_umi_tn: bed = config["panel"]["capture_kit"], dbsnp = config["reference"]["dbsnp"] output: - vcf_tnscope_umi = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope_umi.preprocess.vcf.gz", + vcf_tnscope_umi = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope_umi.vcf.gz", namemap = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope_umi.sample_name_map" benchmark: Path(benchmark_dir, "sentieon_tnscope_umi_" + config["analysis"]["case_id"] + ".tsv").as_posix() params: tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + housekeeper_id = {"id": config["analysis"]["case_id"],"tags": "research"}, sentieon_exec = config_model.sentieon.sentieon_exec, sentieon_lic = config_model.sentieon.sentieon_license, tumor_af = params.tnscope_umi.filter_tumor_af, diff --git a/BALSAMIC/snakemake_rules/variant_calling/snv_quality_filter.rule b/BALSAMIC/snakemake_rules/variant_calling/snv_quality_filter.rule index 69c760191..2db2cff26 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/snv_quality_filter.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/snv_quality_filter.rule @@ -86,7 +86,7 @@ if config["analysis"]["sequencing_type"] == 'targeted' and config["analysis"]["a input: vcf = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.vcf.gz", output: - vcf_filtered = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.research.vcf.gz", + vcf_filtered = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.research.pre_process.vcf.gz", benchmark: Path(benchmark_dir,'bcftools_quality_filter_vardict_tumor_only_' + config["analysis"]["case_id"] + ".tsv").as_posix() singularity: @@ -155,7 +155,7 @@ elif config["analysis"]["sequencing_type"] == 'targeted' and config["analysis"][ input: vcf = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.vcf.gz", output: - vcf_filtered = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.research.vcf.gz", + vcf_filtered = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.research.pre_process.vcf.gz", benchmark: Path(benchmark_dir,'bcftools_quality_filter_vardict_tumor_normal_' + config["analysis"]["case_id"] + ".tsv").as_posix() singularity: @@ -223,7 +223,7 @@ elif config["analysis"]["sequencing_type"] == 'targeted' and config["analysis"][ if config_model.analysis.analysis_workflow == AnalysisWorkflow.BALSAMIC_UMI and config["analysis"]["analysis_type"] == 'paired': rule bcftools_quality_filter_TNscope_umi_tumor_normal: input: - vcf = vcf_dir + "SNV.somatic."+ config["analysis"]["case_id"] + ".tnscope_umi.vcf.gz", + vcf = vcf_dir + "sentieon_tnscope/SNV.somatic." + config["analysis"]["case_id"] + ".tnscope_umi.post_process.vcf.gz", output: vcf_filtered = vcf_dir + "SNV.somatic."+ config["analysis"]["case_id"] + ".tnscope_umi.research.vcf.gz", benchmark: @@ -251,9 +251,9 @@ if config_model.analysis.analysis_workflow == AnalysisWorkflow.BALSAMIC_UMI and elif config_model.analysis.analysis_workflow == AnalysisWorkflow.BALSAMIC_UMI and config["analysis"]["analysis_type"] == 'single': rule bcftools_quality_filter_TNscope_umi_tumor_only: input: - vcf=vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope_umi.vcf.gz", + vcf = vcf_dir + "sentieon_tnscope/SNV.somatic." + config["analysis"]["case_id"] + ".tnscope_umi.post_process.vcf.gz", output: - vcf_filtered=vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope_umi.research.vcf.gz" + vcf_filtered = vcf_dir + "SNV.somatic."+ config["analysis"]["case_id"] + ".tnscope_umi.research.vcf.gz" benchmark: Path(benchmark_dir,'bcftools_quality_filter_TNscope_umi_tumor_only' + config["analysis"][ "case_id"] + ".tsv").as_posix() diff --git a/BALSAMIC/snakemake_rules/variant_calling/tnscope_post_process.rule b/BALSAMIC/snakemake_rules/variant_calling/tnscope_post_process.rule index e8fc4ca04..e5a38ea6a 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/tnscope_post_process.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/tnscope_post_process.rule @@ -5,16 +5,16 @@ rule bcftools_split_tnscope_variants: input: - ref = config["reference"]["reference_genome"], vcf = vcf_dir + "sentieon_tnscope/ALL.somatic." + config["analysis"]["case_id"] + ".tnscope.vcf.gz", output: - vcf_tnscope = vcf_dir + "sentieon_tnscope/SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.preprocess.vcf", + vcf_tnscope = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.vcf.gz", vcf_tnscope_sv = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".tnscope.research.vcf.gz", benchmark: Path(benchmark_dir,'bcftools_split_tnscope_variants_' + config[ "analysis" ][ "case_id" ] + ".tsv").as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() params: + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, tmpdir = tempfile.mkdtemp(prefix=tmp_dir), case_name = config["analysis"]["case_id"] threads: @@ -26,40 +26,52 @@ rule bcftools_split_tnscope_variants: export TMPDIR={params.tmpdir}; mkdir -p {params.tmpdir}; -bcftools view --include 'INFO/SVTYPE=="."' -o {output.vcf_tnscope} {input.vcf} ; +bcftools view --include 'INFO/SVTYPE=="."' -O z -o {output.vcf_tnscope} {input.vcf} ; bcftools view --include 'INFO/SVTYPE!="."' -O z -o {output.vcf_tnscope_sv} {input.vcf}; tabix -p vcf -f {output.vcf_tnscope_sv}; +tabix -p vcf -f {output.vcf_tnscope}; """ -rule modify_tnscope_infofield: +rule post_process_tnscope: input: - vcf_tnscope = vcf_dir + "sentieon_tnscope/SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.preprocess.vcf", + vcf_tnscope = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.research.pre_process.vcf.gz", + ref = config["reference"]["reference_genome"], output: - vcf_tnscope = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.vcf.gz", + vcf_tnscope = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.research.vcf.gz", benchmark: - Path(benchmark_dir,'modify_tnscope_infofield_' + config[ "analysis" ][ "case_id" ] + ".tsv").as_posix() + Path(benchmark_dir,'post_process_tnscope_' + config[ "analysis" ][ "case_id" ] + ".tsv").as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() params: - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, + merge_mnvs = get_script_path("merge_mnp.py"), modify_tnscope_infofield = get_script_path("modify_tnscope_infofield.py"), + edit_vcf_script= get_script_path("edit_vcf_info.py"), tmpdir = tempfile.mkdtemp(prefix=tmp_dir), case_name = config["analysis"]["case_id"], - edit_vcf_script = get_script_path("edit_vcf_info.py"), + sentieon_exec = config_model.sentieon.sentieon_exec, + sentieon_lic = config_model.sentieon.sentieon_license, + matched_normal_filternames = ",".join(BaseSNVFilters.MATCHED_NORMAL_FILTER_NAMES), variant_caller= "tnscope" threads: - get_threads(cluster_config, 'modify_tnscope_infofield') + get_threads(cluster_config, 'post_process_tnscope') message: - "Add DP and AF tumor sample info and FOUND_IN to INFO field for case: {params.case_name}" + "Merge TNscope SNVs with same phaseID to MNVs." + "Add DP and AF tumor sample info and FOUND_IN to INFO field: {params.case_name}" shell: """ -export TMPDIR={params.tmpdir}; -mkdir -p {params.tmpdir}; +export SENTIEON_TMPDIR={params.tmpdir}; +export SENTIEON_LICENSE={params.sentieon_lic}; + +{params.sentieon_exec} pyexec {params.merge_mnvs} --preserve_filters {params.matched_normal_filternames} --max_distance 5 {input.vcf_tnscope} {input.ref} > {params.tmpdir}/tnscope.research.mnv.vcf ; + +python {params.modify_tnscope_infofield} {params.tmpdir}/tnscope.research.mnv.vcf {params.tmpdir}/tnscope.research.mnv.add_info_fields.vcf ; -python {params.modify_tnscope_infofield} {input.vcf_tnscope} {params.tmpdir}/vcf_tnscope_snvs_modified.vcf ; -python {params.edit_vcf_script} -i {params.tmpdir}/vcf_tnscope_snvs_modified.vcf -o {params.tmpdir}/vcf_tnscope_snvs_modified_found_in_added.vcf -c {params.variant_caller}; -bgzip {params.tmpdir}/vcf_tnscope_snvs_modified_found_in_added.vcf ; -mv {params.tmpdir}/vcf_tnscope_snvs_modified_found_in_added.vcf.gz {output.vcf_tnscope} ; +python {params.edit_vcf_script} -i {params.tmpdir}/tnscope.research.mnv.add_info_fields.vcf -o {params.tmpdir}/tnscope.research.mnv.add_info_fields.added_found_in.vcf -c {params.variant_caller}; + +bgzip {params.tmpdir}/tnscope.research.mnv.add_info_fields.added_found_in.vcf ; + +mv {params.tmpdir}/tnscope.research.mnv.add_info_fields.added_found_in.vcf.gz {output.vcf_tnscope} ; tabix -p vcf -f {output.vcf_tnscope} ; """ + diff --git a/BALSAMIC/workflows/balsamic.smk b/BALSAMIC/workflows/balsamic.smk index 1854b62cf..c4d0e6828 100644 --- a/BALSAMIC/workflows/balsamic.smk +++ b/BALSAMIC/workflows/balsamic.smk @@ -20,6 +20,7 @@ from BALSAMIC.constants.analysis import ( from BALSAMIC.constants.paths import BALSAMIC_DIR from BALSAMIC.constants.rules import SNAKEMAKE_RULES from BALSAMIC.constants.variant_filters import ( + BaseSNVFilters, SVDB_FILTER_SETTINGS, MANTA_FILTER_SETTINGS, WgsSNVFilters, diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b83f4a408..ca9e1ef01 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,12 +1,19 @@ [X.X.X] -------- +-------- Added: ^^^^^^ * Added option to disable hard filter of variants in matched normal https://github.com/Clinical-Genomics/BALSAMIC/pull/1509 +* Added check to verify sample sex for all workflows https://github.com/Clinical-Genomics/BALSAMIC/pull/1516 + Changed: ^^^^^^^^ +* Reworked bcftools filters https://github.com/Clinical-Genomics/BALSAMIC/pull/1509 +* Renamed high_normal_tumor_af_frac to in_normal https://github.com/Clinical-Genomics/BALSAMIC/pull/1509 +* check to verify sample sex for all workflows https://github.com/Clinical-Genomics/BALSAMIC/pull/1516 +* Merging SNVs into MNVs in TNscope TGA https://github.com/Clinical-Genomics/BALSAMIC/pull/1524 +* Change raw delivery SNV file for TGA to before any post-processing https://github.com/Clinical-Genomics/BALSAMIC/pull/1524 Removed: @@ -14,15 +21,12 @@ Removed: * Remove WGS-level GC-bias metric from TGA workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/1521 -Changed: -^^^^^^^^ -* Reworked bcftools filters https://github.com/Clinical-Genomics/BALSAMIC/pull/1509 -* Renamed high_normal_tumor_af_frac to in_normal https://github.com/Clinical-Genomics/BALSAMIC/pull/1509 -* check to verify sample sex for all workflows https://github.com/Clinical-Genomics/BALSAMIC/pull/1516 +Fixed: +^^^^^^ [16.0.0] -------- +-------- Added: ^^^^^^ diff --git a/docs/.DS_Store b/docs/.DS_Store deleted file mode 100644 index 5e25ade96..000000000 Binary files a/docs/.DS_Store and /dev/null differ diff --git a/docs/balsamic_filters.rst b/docs/balsamic_filters.rst index 6b27953f8..929078ab8 100644 --- a/docs/balsamic_filters.rst +++ b/docs/balsamic_filters.rst @@ -245,6 +245,7 @@ The `TNscope `_ algor *min_base_qual*: Minimal base quality to consider in calling :: + min_base_qual = 15 *min_tumor_allele_frac*: Set the minimum tumor AF to be considered as potential variant site. @@ -313,6 +314,33 @@ The `TNscope `_ algor marks variant with soft-filter `in_normal` variant if: AF(normal) / AF(tumor) > 0.3 +**Post-processing of TNscope variants** + +After quality-filtering TNscope variants and before merging with VarDict variants the phased SNVs and InDels from TNscope are merged together to MNVs using a slightly modified script from `Sentieon-scripts `_ which can be found in ``BALSAMIC/assets/scripts/merge_mnp.py`` + +This was done to avoid multiple representations of the same variant as VarDict already outputs these types of variants as MNVs, and because VEP isn't coded to handle phased SNVs in the interpretation of protein effect. + +In the merging of phased SNVs to MNV we need to handle how to consolidate information from multiple variants into a single metric, and importantly also for the FILTER column. + +An example is a MNV created by merging a phased germline SNV with a somatic SNV. This has been solved as follows: + +- `MNV_CONFLICTING_FILTERS`: Is a filter given to MNVs with constituent variants with different filters (such as `in_normal` and `PASS`) + +.. note:: + + However, as we may have multiple filters which means similar things, such as germline_risk and in_normal, MNVs constituted by variants with only these filters set aren't exactly "conflicting". + +Therefore the logic for setting `MNV_CONFLICTING_FILTERS` has been made a bit more complex, and in summary there are 3 possible outcomes for filters when merging SNVs/InDels into MNVs: + +1. Single filter such as PASS, when all constituting variants all have the same filter and no other. +2. Multiple filters, such as in_normal,germline_risk, when all constituting variants have at least 1 of the matched normal filters. +3. `MNV_CONFLICTING_FILTERS` when the merged variants have conflicting filters, and they don't all contain matched normal filters. + +.. note:: + + In addition to this a few more fields are added to the INFO field of the created MNVs containing comma-separated lists of AD, AF, and FILTER from its constituting variants. + + **Post-call Observation database Filters** diff --git a/docs/balsamic_pon.rst b/docs/balsamic_pon.rst index 201d7714e..7df2017ea 100644 --- a/docs/balsamic_pon.rst +++ b/docs/balsamic_pon.rst @@ -8,7 +8,7 @@ Currently two PON-methods are implemented in BALSAMIC to correct for biases and - To produce normalised CN-profiles for WGS cases visualised in ``GENS``. Sharing PON for publications -====================== +============================ If a PON has been used for the analysis of samples in a research project and a publication requires that the PON is uploaded to some database, a request can be made to Clinical Genomics, and depending on the status of the consent of the individuals from which the samples used in the construction of the PON has been derived it may or may not be possible to share the PON. diff --git a/docs/balsamic_sv_cnv.rst b/docs/balsamic_sv_cnv.rst index c2b17b893..e4f7d901d 100644 --- a/docs/balsamic_sv_cnv.rst +++ b/docs/balsamic_sv_cnv.rst @@ -58,7 +58,7 @@ It is mandatory to provide the gender of the sample from BALSAMIC version >= 10. Further details about a specific caller can be found in the links for the repositories containing the documentation for SV and CNV callers along with the links for the articles are listed in `bioinfo softwares `_. **Difficult to detect clinically relevant SVs** -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ **IGH::DUX4 rearrangements** diff --git a/docs/user_guide.rst b/docs/user_guide.rst index 00db4bf9d..157ba6f12 100644 --- a/docs/user_guide.rst +++ b/docs/user_guide.rst @@ -5,7 +5,7 @@ Short tutorial Here a short tutorial is provided for BALSAMIC (**version** = 16.0.0). Regarding fastq-inputs ---------------------- +--------------------------- Previous versions of BALSAMIC only accepted one fastq-pair per sample, which required concatenation of fastq-pairs if multiple existed.