Skip to content

Commit

Permalink
Merge branch 'develop' into feature/rust
Browse files Browse the repository at this point in the history
  • Loading branch information
ACEnglish committed Feb 14, 2024
2 parents 2359fd0 + 1cd03b2 commit 6c7eb04
Show file tree
Hide file tree
Showing 9 changed files with 80 additions and 24 deletions.
12 changes: 12 additions & 0 deletions repo_utils/answer_key/collapse/inputissue196_collapsed.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##contig=<ID=chr21,length=46709983,md5=2f45a3455007b7e271509161e52954a9>
##INFO=<ID=END,Number=1,Type=Integer,Description="SV END position">
##INFO=<ID=SVTYPE,Number=A,Type=String,Description="Variant type">
##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length of ref and alt alleles">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##INFO=<ID=NumCollapsed,Number=1,Type=Integer,Description="Number of calls collapsed into this call by truvari">
##INFO=<ID=CollapseId,Number=1,Type=String,Description="Truvari uid to help tie output.vcf and output.collapsed.vcf entries together">
##INFO=<ID=NumConsolidated,Number=1,Type=Integer,Description="Number of samples consolidated into this call by truvari">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878
chr21 14088112 chr21-14088113-DEL-223 A <DEL> . . END=14088558;SVTYPE=DEL;SVLEN=-223;NumCollapsed=2;NumConsolidated=0;CollapseId=1.0 GT 1|0
20 changes: 20 additions & 0 deletions repo_utils/answer_key/collapse/inputissue196_removed.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##contig=<ID=chr21,length=46709983,md5=2f45a3455007b7e271509161e52954a9>
##INFO=<ID=END,Number=1,Type=Integer,Description="SV END position">
##INFO=<ID=SVTYPE,Number=A,Type=String,Description="Variant type">
##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length of ref and alt alleles">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##INFO=<ID=TruScore,Number=1,Type=Integer,Description="Truvari score for similarity of match">
##INFO=<ID=PctSeqSimilarity,Number=1,Type=Float,Description="Pct sequence similarity between this variant and its closest match">
##INFO=<ID=PctSizeSimilarity,Number=1,Type=Float,Description="Pct size similarity between this variant and its closest match">
##INFO=<ID=PctRecOverlap,Number=1,Type=Float,Description="Percent reciprocal overlap percent of the two calls' coordinates">
##INFO=<ID=StartDistance,Number=1,Type=Integer,Description="Distance of the base call's end from comparison call's start">
##INFO=<ID=EndDistance,Number=1,Type=Integer,Description="Distance of the base call's end from comparison call's end">
##INFO=<ID=SizeDiff,Number=1,Type=Float,Description="Difference in size of base and comp calls">
##INFO=<ID=GTMatch,Number=1,Type=Integer,Description="Base/Comparison genotypes AC difference">
##INFO=<ID=MatchId,Number=.,Type=String,Description="Tuple of base and comparison call ids which were matched">
##INFO=<ID=Multi,Number=0,Type=Flag,Description="Call is false due to non-multimatching">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878
chr21 14088212 chr21-14088113-DEL-223 T <DEL> . . END=14088828;SVTYPE=DEL;SVLEN=-393;PctSeqSimilarity=0;PctSizeSimilarity=0.5674;PctRecOverlap=0.5624;SizeDiff=-170;StartDistance=-100;EndDistance=-270;GTMatch=.;TruScore=37;MatchId=1.0 GT 1|0
chr21 14088312 chr21-14088113-DEL-223 A <DEL> . . END=14089203;SVTYPE=DEL;SVLEN=-668;PctSeqSimilarity=0;PctSizeSimilarity=0.5883;PctRecOverlap=0.5796;SizeDiff=275;StartDistance=100;EndDistance=375;GTMatch=.;TruScore=38;MatchId=1.0 GT 1|0
10 changes: 10 additions & 0 deletions repo_utils/sub_tests/collapse.sh
Original file line number Diff line number Diff line change
Expand Up @@ -98,3 +98,13 @@ run test_collapse_intragth $truv collapse -i $INDIR/variants/bcftools_merged.vcf
if [ $test_collapse_intragth ]; then
collapse_assert intragth
fi


run test_collapse_chain $truv collapse -i $INDIR/variants/issue196_chain.vcf.gz \
-o $OD/inputissue196_collapsed.vcf \
-c $OD/inputissue196_removed.vcf \
--chain --pctseq 0 --pctsize 0.35
if [ $test_collapse_chain ]; then
collapse_assert issue196
fi

Binary file added repo_utils/test_files/variants/issue196_chain.vcf.gz
Binary file not shown.
Binary file not shown.
4 changes: 3 additions & 1 deletion truvari/bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ def parse_args(args):
help="Output directory")
parser.add_argument("-f", "--reference", type=str, default=None,
help="Fasta used to call variants. Turns on reference context sequence comparison")
parser.add_argument("--short", action="store_true",
help="Short circuit comparisions. Faster, but fewer annotations")
parser.add_argument("--debug", action="store_true", default=False,
help="Verbose logging")

Expand Down Expand Up @@ -751,7 +753,7 @@ def bench_main(cmdargs):
matcher = truvari.Matcher(args)

m_bench = Bench(matcher, args.base, args.comp, args.output,
args.includebed, args.extend, args.debug, True)
args.includebed, args.extend, args.debug, True, args.short)
output = m_bench.run()

logging.info("Stats: %s", json.dumps(output.stats_box, indent=4))
Expand Down
30 changes: 20 additions & 10 deletions truvari/collapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,7 @@ class CollapsedCalls():
match_id: str
matches: list = field(default_factory=list)
gt_consolidate_count: int = 0
genotype_mask: str = "" # bad

def combine(self, other):
"""
Put other's entries into this' collapsed_entries
"""
self.matches.append(other.entry)
self.matches.extend(other.matches)
genotype_mask: str = None # not actually a str

def calc_median_sizepos(self):
"""
Expand Down Expand Up @@ -94,20 +87,33 @@ def gt_conflict(self, other, which_gt):
self.genotype_mask |= o_mask
return False

def combine(self, other):
"""
Add other's calls/matches to self's matches
"""
self.matches.extend(other.matches)
self.gt_consolidate_count += other.gt_consolidate_count
if self.genotype_mask is not None and other.genotype_mask is not None:
self.genotype_mask |= other.genotype_mask

def chain_collapse(cur_collapse, all_collapse, matcher):
"""
Perform transitive matching of cur_collapse to all_collapse
Check the cur_collapse's entry to all other collapses' consolidated entries
"""
for m_collap in all_collapse:
for other in m_collap.matches:
mat = matcher.build_match(cur_collapse.entry,
other.base,
other.comp,
m_collap.match_id,
skip_gt=True,
short_circuit=True)
if mat.state:
m_collap.consolidate(cur_collapse)
# The other's representative entry will report its
# similarity to the matched call that pulled it in
mat.base, mat.comp = mat.comp, mat.base
m_collap.matches.append(mat)
m_collap.combine(cur_collapse)
return True # you can just ignore it later
return False # You'll have to add it to your set of collapsed calls

Expand Down Expand Up @@ -143,6 +149,10 @@ def collapse_chunk(chunk, matcher):
mat.state = False
if mat.state:
m_collap.matches.append(mat)
elif mat.sizesim is not None and mat.sizesim < matcher.params.pctsize:
# Can we do this? The sort tells us that we're going through most->least
# similar size. So the next one will only be worse...
break

# Does this collap need to go into a previous collap?
if not matcher.chain or not chain_collapse(m_collap, ret, matcher):
Expand Down
2 changes: 1 addition & 1 deletion truvari/matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ def filter_call(self, entry, base=False):
Returns True if the call should be filtered
Base has different filtering requirements, so let the method know
"""
if self.params.check_monref and entry.alts is None: # ignore monomorphic reference
if self.params.check_monref and entry.alts in (None, '*'): # ignore monomorphic reference
return True

if self.params.check_multi and len(entry.alts) > 1:
Expand Down
26 changes: 14 additions & 12 deletions truvari/phab.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,24 +137,25 @@ def make_consensus(data, ref_fn, passonly=True, max_size=50000):

vcf_i = truvari.region_filter(vcf, tree, with_region=True)

# Don't make haplotypes of non-sequence resolved, non-pass (sometimes), too big variants
# pylint: disable=unnecessary-lambda-assignment
entry_filter = lambda entry: \
entry.alts is not None \
and (entry.alleles_variant_types[-1] in ['SNP', 'INDEL', 'OTHER']) \
and (not passonly or not truvari.entry_is_filtered(entry)) \
and (truvari.entry_size(entry) <= max_size)
# pylint: enable=unnecessary-lambda-assignment

cur_key = None
cur_entries = []
# Can do the entry filtering here
# Won't need to pass to make_haplotypes
for entry, key in vcf_i:
if cur_key is None:
cur_key = key

if entry.alts is None \
or (entry.alleles_variant_types[-1] not in ['SNP', 'INDEL']) \
or (passonly and truvari.entry_is_filtered(entry)) \
or truvari.entry_size(entry) > max_size:
continue

if key != cur_key:
ref = f"{cur_key[0]}:{cur_key[1].begin}-{cur_key[1].end - 1}"
ref_seq = reference.fetch(ref)
ret[ref] = make_haplotypes(ref_seq, cur_entries, o_samp,
ret[ref] = make_haplotypes(ref_seq, filter(entry_filter, cur_entries), o_samp,
ref, cur_key[1].begin, sample)
cur_key = key
cur_entries = []
Expand Down Expand Up @@ -217,13 +218,14 @@ def collect_haplotypes(ref_haps_fn, hap_jobs, threads, passonly=True, max_size=5
"""
all_haps = defaultdict(BytesIO)
m_ref = pysam.FastaFile(ref_haps_fn)
with multiprocessing.Pool(threads, maxtasksperchild=1) as pool:
with multiprocessing.Pool(threads, maxtasksperchild=1000) as pool:
to_call = partial(make_consensus, ref_fn=ref_haps_fn, passonly=passonly, max_size=max_size)
# Keep imap because determinism
for pos, haplotypes in enumerate(pool.imap(to_call, hap_jobs)):
for location, fasta_entry in haplotypes.items():
cur = all_haps[location]
cur.write(fasta_entry)
if pos == len(hap_jobs) - 1: # ref/seek/read on last hap
if pos == len(hap_jobs) - 1: # Write reference for this location at the end
cur.write(f">ref_{location}\n{m_ref[location]}\n".encode())
cur.seek(0)
all_haps[location] = cur.read()
Expand Down Expand Up @@ -371,7 +373,7 @@ def phab(var_regions, base_vcf, ref_fn, output_fn, bSamples=None, buffer=100,
fout.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t")
fout.write("\t".join(samp_names) + "\n")

with multiprocessing.Pool(threads, maxtasksperchild=1) as pool:
with multiprocessing.Pool(threads, maxtasksperchild=1000) as pool:
for result in pool.imap_unordered(align_method, haplotypes):
fout.write(result)
pool.close()
Expand Down

0 comments on commit 6c7eb04

Please sign in to comment.