diff --git a/repo_utils/answer_key/collapse/inputissue196_collapsed.vcf b/repo_utils/answer_key/collapse/inputissue196_collapsed.vcf new file mode 100644 index 00000000..f1c36b0b --- /dev/null +++ b/repo_utils/answer_key/collapse/inputissue196_collapsed.vcf @@ -0,0 +1,12 @@ +##fileformat=VCFv4.2 +##FILTER= +##contig= +##INFO= +##INFO= +##INFO= +##FORMAT= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 +chr21 14088112 chr21-14088113-DEL-223 A . . END=14088558;SVTYPE=DEL;SVLEN=-223;NumCollapsed=2;NumConsolidated=0;CollapseId=1.0 GT 1|0 diff --git a/repo_utils/answer_key/collapse/inputissue196_removed.vcf b/repo_utils/answer_key/collapse/inputissue196_removed.vcf new file mode 100644 index 00000000..a4bc8a7f --- /dev/null +++ b/repo_utils/answer_key/collapse/inputissue196_removed.vcf @@ -0,0 +1,20 @@ +##fileformat=VCFv4.2 +##FILTER= +##contig= +##INFO= +##INFO= +##INFO= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 +chr21 14088212 chr21-14088113-DEL-223 T . . END=14088828;SVTYPE=DEL;SVLEN=-393;PctSeqSimilarity=0;PctSizeSimilarity=0.5674;PctRecOverlap=0.5624;SizeDiff=-170;StartDistance=-100;EndDistance=-270;GTMatch=.;TruScore=37;MatchId=1.0 GT 1|0 +chr21 14088312 chr21-14088113-DEL-223 A . . 
END=14089203;SVTYPE=DEL;SVLEN=-668;PctSeqSimilarity=0;PctSizeSimilarity=0.5883;PctRecOverlap=0.5796;SizeDiff=275;StartDistance=100;EndDistance=375;GTMatch=.;TruScore=38;MatchId=1.0 GT 1|0 diff --git a/repo_utils/sub_tests/collapse.sh b/repo_utils/sub_tests/collapse.sh index d325421a..93220119 100644 --- a/repo_utils/sub_tests/collapse.sh +++ b/repo_utils/sub_tests/collapse.sh @@ -98,3 +98,13 @@ run test_collapse_intragth $truv collapse -i $INDIR/variants/bcftools_merged.vcf if [ $test_collapse_intragth ]; then collapse_assert intragth fi + + +run test_collapse_chain $truv collapse -i $INDIR/variants/issue196_chain.vcf.gz \ + -o $OD/inputissue196_collapsed.vcf \ + -c $OD/inputissue196_removed.vcf \ + --chain --pctseq 0 --pctsize 0.35 +if [ $test_collapse_chain ]; then + collapse_assert issue196 +fi + diff --git a/repo_utils/test_files/variants/issue196_chain.vcf.gz b/repo_utils/test_files/variants/issue196_chain.vcf.gz new file mode 100644 index 00000000..3bed91de Binary files /dev/null and b/repo_utils/test_files/variants/issue196_chain.vcf.gz differ diff --git a/repo_utils/test_files/variants/issue196_chain.vcf.gz.tbi b/repo_utils/test_files/variants/issue196_chain.vcf.gz.tbi new file mode 100644 index 00000000..fb7a7c6c Binary files /dev/null and b/repo_utils/test_files/variants/issue196_chain.vcf.gz.tbi differ diff --git a/truvari/bench.py b/truvari/bench.py index 53a06653..d62fe1cd 100644 --- a/truvari/bench.py +++ b/truvari/bench.py @@ -37,6 +37,8 @@ def parse_args(args): help="Output directory") parser.add_argument("-f", "--reference", type=str, default=None, help="Fasta used to call variants. Turns on reference context sequence comparison") + parser.add_argument("--short", action="store_true", + help="Short circuit comparisons. 
Faster, but fewer annotations") parser.add_argument("--debug", action="store_true", default=False, help="Verbose logging") @@ -751,7 +753,7 @@ def bench_main(cmdargs): matcher = truvari.Matcher(args) m_bench = Bench(matcher, args.base, args.comp, args.output, - args.includebed, args.extend, args.debug, True) + args.includebed, args.extend, args.debug, True, args.short) output = m_bench.run() logging.info("Stats: %s", json.dumps(output.stats_box, indent=4)) diff --git a/truvari/collapse.py b/truvari/collapse.py index 5fa3d299..3a271295 100644 --- a/truvari/collapse.py +++ b/truvari/collapse.py @@ -31,14 +31,7 @@ class CollapsedCalls(): match_id: str matches: list = field(default_factory=list) gt_consolidate_count: int = 0 - genotype_mask: str = "" # bad - - def combine(self, other): - """ - Put other's entries into this' collapsed_entries - """ - self.matches.append(other.entry) - self.matches.extend(other.matches) + genotype_mask: str = None # not actually a str def calc_median_sizepos(self): """ @@ -94,20 +87,33 @@ def gt_conflict(self, other, which_gt): self.genotype_mask |= o_mask return False + def combine(self, other): + """ + Add other's calls/matches to self's matches + """ + self.matches.extend(other.matches) + self.gt_consolidate_count += other.gt_consolidate_count + if self.genotype_mask is not None and other.genotype_mask is not None: + self.genotype_mask |= other.genotype_mask def chain_collapse(cur_collapse, all_collapse, matcher): """ Perform transitive matching of cur_collapse to all_collapse + Check the cur_collapse's entry to all other collapses' consolidated entries """ for m_collap in all_collapse: for other in m_collap.matches: mat = matcher.build_match(cur_collapse.entry, - other.base, + other.comp, m_collap.match_id, skip_gt=True, short_circuit=True) if mat.state: - m_collap.consolidate(cur_collapse) + # The other's representative entry will report its + # similarity to the matched call that pulled it in + mat.base, mat.comp = mat.comp, 
mat.base + m_collap.matches.append(mat) + m_collap.combine(cur_collapse) return True # you can just ignore it later return False # You'll have to add it to your set of collapsed calls @@ -143,6 +149,10 @@ def collapse_chunk(chunk, matcher): mat.state = False if mat.state: m_collap.matches.append(mat) + elif mat.sizesim is not None and mat.sizesim < matcher.params.pctsize: + # Can we do this? The sort tells us that we're going through most->least + # similar size. So the next one will only be worse... + break # Does this collap need to go into a previous collap? if not matcher.chain or not chain_collapse(m_collap, ret, matcher): diff --git a/truvari/matching.py b/truvari/matching.py index 9967a851..40eddf66 100644 --- a/truvari/matching.py +++ b/truvari/matching.py @@ -155,7 +155,7 @@ def filter_call(self, entry, base=False): Returns True if the call should be filtered Base has different filtering requirements, so let the method know """ - if self.params.check_monref and entry.alts is None: # ignore monomorphic reference + if self.params.check_monref and entry.alts in (None, '*'): # ignore monomorphic reference return True if self.params.check_multi and len(entry.alts) > 1: diff --git a/truvari/phab.py b/truvari/phab.py index 809c36f3..8d437b79 100644 --- a/truvari/phab.py +++ b/truvari/phab.py @@ -137,24 +137,25 @@ def make_consensus(data, ref_fn, passonly=True, max_size=50000): vcf_i = truvari.region_filter(vcf, tree, with_region=True) + # Don't make haplotypes of non-sequence resolved, non-pass (sometimes), too big variants + # pylint: disable=unnecessary-lambda-assignment + entry_filter = lambda entry: \ + entry.alts is not None \ + and (entry.alleles_variant_types[-1] in ['SNP', 'INDEL', 'OTHER']) \ + and (not passonly or not truvari.entry_is_filtered(entry)) \ + and (truvari.entry_size(entry) <= max_size) + # pylint: enable=unnecessary-lambda-assignment + cur_key = None cur_entries = [] - # Can do the entry filtering here - # Won't need to pass to 
make_haplotypes for entry, key in vcf_i: if cur_key is None: cur_key = key - if entry.alts is None \ - or (entry.alleles_variant_types[-1] not in ['SNP', 'INDEL']) \ - or (passonly and truvari.entry_is_filtered(entry)) \ - or truvari.entry_size(entry) > max_size: - continue - if key != cur_key: ref = f"{cur_key[0]}:{cur_key[1].begin}-{cur_key[1].end - 1}" ref_seq = reference.fetch(ref) - ret[ref] = make_haplotypes(ref_seq, cur_entries, o_samp, + ret[ref] = make_haplotypes(ref_seq, filter(entry_filter, cur_entries), o_samp, ref, cur_key[1].begin, sample) cur_key = key cur_entries = [] @@ -217,13 +218,14 @@ def collect_haplotypes(ref_haps_fn, hap_jobs, threads, passonly=True, max_size=5 """ all_haps = defaultdict(BytesIO) m_ref = pysam.FastaFile(ref_haps_fn) - with multiprocessing.Pool(threads, maxtasksperchild=1) as pool: + with multiprocessing.Pool(threads, maxtasksperchild=1000) as pool: to_call = partial(make_consensus, ref_fn=ref_haps_fn, passonly=passonly, max_size=max_size) + # Keep imap because determinism for pos, haplotypes in enumerate(pool.imap(to_call, hap_jobs)): for location, fasta_entry in haplotypes.items(): cur = all_haps[location] cur.write(fasta_entry) - if pos == len(hap_jobs) - 1: # ref/seek/read on last hap + if pos == len(hap_jobs) - 1: # Write reference for this location at the end cur.write(f">ref_{location}\n{m_ref[location]}\n".encode()) cur.seek(0) all_haps[location] = cur.read() @@ -371,7 +373,7 @@ def phab(var_regions, base_vcf, ref_fn, output_fn, bSamples=None, buffer=100, fout.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t") fout.write("\t".join(samp_names) + "\n") - with multiprocessing.Pool(threads, maxtasksperchild=1) as pool: + with multiprocessing.Pool(threads, maxtasksperchild=1000) as pool: for result in pool.imap_unordered(align_method, haplotypes): fout.write(result) pool.close()