Skip to content

Commit

Permalink
Beginning seq context deprecation
Browse files Browse the repository at this point in the history
And replacing `--reference` functionality to fill in symbolic alts
  • Loading branch information
ACEnglish committed Nov 14, 2024
1 parent 38f4d8d commit 3a3ded4
Show file tree
Hide file tree
Showing 6 changed files with 64 additions and 111 deletions.
8 changes: 0 additions & 8 deletions docs/api/truvari.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,6 @@ entry_size_similarity
^^^^^^^^^^^^^^^^^^^^^
.. autofunction:: entry_size_similarity

entry_shared_ref_context
^^^^^^^^^^^^^^^^^^^^^^^^
.. autofunction:: entry_shared_ref_context

entry_to_hash
^^^^^^^^^^^^^
.. autofunction:: entry_to_hash
Expand Down Expand Up @@ -100,10 +96,6 @@ compress_index_vcf
^^^^^^^^^^^^^^^^^^
.. autofunction:: compress_index_vcf

create_pos_haplotype
^^^^^^^^^^^^^^^^^^^^
.. autofunction:: create_pos_haplotype

get_gt
^^^^^^
.. autofunction:: get_gt
Expand Down
4 changes: 0 additions & 4 deletions truvari/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
:meth:`entry_is_present`
:meth:`entry_reciprocal_overlap`
:meth:`entry_same_variant_type`
:meth:`entry_shared_ref_context`
:meth:`entry_seq_similarity`
:meth:`entry_size`
:meth:`entry_size_similarity`
Expand All @@ -30,7 +29,6 @@
:meth:`calc_af`
:meth:`calc_hwe`
:meth:`compress_index_vcf`
:meth:`create_pos_haplotype`
:meth:`get_gt`
:meth:`get_scalebin`
:meth:`get_sizebin`
Expand Down Expand Up @@ -105,15 +103,13 @@

from truvari.comparisons import (
coords_within,
create_pos_haplotype,
entry_boundaries,
entry_distance,
entry_gt_comp,
entry_is_filtered,
entry_is_present,
entry_reciprocal_overlap,
entry_same_variant_type,
entry_shared_ref_context,
entry_seq_similarity,
entry_size,
entry_size_similarity,
Expand Down
5 changes: 1 addition & 4 deletions truvari/bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def parse_args(args):
parser.add_argument("-o", "--output", type=str, required=True,
help="Output directory")
parser.add_argument("-f", "--reference", type=str, default=None,
help="Fasta used to call variants. Turns on reference context sequence comparison")
help="Fasta used to call variants. Only needed with symbolic variants.")
parser.add_argument("--short", action="store_true",
help="Short circuit comparisions. Faster, but fewer annotations")
parser.add_argument("--debug", action="store_true", default=False,
Expand All @@ -59,8 +59,6 @@ def parse_args(args):
help="Assume DUP svtypes are INS (%(default)s)")
thresg.add_argument("-C", "--chunksize", type=truvari.restricted_int, default=defaults.chunksize,
help="Max reference distance to compare calls (%(default)s)")
thresg.add_argument("-B", "--minhaplen", type=truvari.restricted_int, default=defaults.minhaplen,
help="Min haplotype sequence length to create (%(default)s)")

genoty = parser.add_argument_group("Genotype Comparison Arguments")
genoty.add_argument("--bSample", type=str, default=None,
Expand Down Expand Up @@ -144,7 +142,6 @@ def check_params(args):
logging.error("Include bed %s does not exist", args.includebed)
check_fail = True
if args.reference:
sys.stderr.write("WARN: `--reference` is no longer recommended and will be deprecated by v5\n")
if not os.path.exists(args.reference):
logging.error("Reference %s does not exist", args.reference)
check_fail = True
Expand Down
2 changes: 1 addition & 1 deletion truvari/collapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,7 @@ def parse_args(args):
parser.add_argument("-c", "--collapsed-output", type=str, default="collapsed.vcf",
help="Where collapsed variants are written (collapsed.vcf)")
parser.add_argument("-f", "--reference", type=str, default=None,
help="Indexed fasta used to call variants")
help="Indexed fasta used to call variants. Only needed with symbolic variants.")
parser.add_argument("-k", "--keep", choices=["first", "maxqual", "common"], default="first",
help="When collapsing calls, which one to keep (%(default)s)")
parser.add_argument("--bed", type=str, default=None,
Expand Down
77 changes: 1 addition & 76 deletions truvari/comparisons.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,42 +40,6 @@ def coords_within(qstart, qend, rstart, rend, end_within):
return qstart >= rstart and ending


def create_pos_haplotype(a1, a2, ref, min_len=0):
"""
Create haplotypes of two allele's regions that are assumed to be overlapping
:param `a1`: tuple of chrom, start, end, seq
:type `a1`: tuple
:param `a2`: tuple of chrom, start, end, seq
:type `a2`: tuple
:param `ref`: Reference genome
:type `ref`: :class:`pysam.FastaFile`
:param `min_len`: Minimum length of the haplotype sequence to create
:type `min_len`: int, optional
:return: allele haplotype sequences created
:rtupe: tuple (string, string)
"""
chrom, a1_start, a1_end, a1_seq = a1
chrom, a2_start, a2_end, a2_seq = a2
start = min(a1_start, a2_start)
end = max(a1_end, a2_end)

hap_len1 = (abs(a1_start - start) + len(a1_seq) + abs(a1_end - end))
hap_len2 = (abs(a2_start - start) + len(a2_seq) + abs(a2_end - end))
min_size = min(hap_len1, hap_len2)
if min_size < min_len:
start -= (min_len - min_size) // 2
end += (min_len + min_size) // 2
# no negative fetch
start = max(0, start)
hap1_seq = ref.fetch(chrom, start, a1_start) + \
a1_seq + ref.fetch(chrom, a1_end, end)
hap2_seq = ref.fetch(chrom, start, a2_start) + \
a2_seq + ref.fetch(chrom, a2_end, end)
return str(hap1_seq), str(hap2_seq)


def entry_boundaries(entry, ins_inflate=False):
"""
Get the start and end of an entry
Expand Down Expand Up @@ -204,7 +168,7 @@ def entry_is_present(entry, sample=None, allow_missing=True):
truvari.GT.HET, truvari.GT.HOM]


def entry_seq_similarity(entryA, entryB, ref=None, min_len=0):
def entry_seq_similarity(entryA, entryB):
"""
Calculate sequence similarity of two entries. If reference is not None,
compare their shared reference context. Otherwise, use the unroll technique.
Expand All @@ -213,10 +177,6 @@ def entry_seq_similarity(entryA, entryB, ref=None, min_len=0):
:type `entryA`: :class:`pysam.VariantRecord`
:param `entryB`: second entry
:type `entryB`: :class:`pysam.VariantRecord`
:param `ref`: Reference genome
:type `ref`: :class:`pysam.FastaFile`
:param `min_len`: Minimum length of reference context to generate
:type `min_len`: float, optional
:return: sequence similarity
:rtype: float
Expand All @@ -242,11 +202,6 @@ def entry_seq_similarity(entryA, entryB, ref=None, min_len=0):
if st_dist == 0 or ed_dist == 0:
return seqsim(a_seq, b_seq)

if ref is not None:
allele1, allele2 = entry_shared_ref_context(
entryA, entryB, ref, min_len=min_len)
return seqsim(allele1, allele2)

# Directionality of rolling makes a difference
if st_dist < 0:
st_dist *= -1
Expand Down Expand Up @@ -286,36 +241,6 @@ def entry_reciprocal_overlap(entry1, entry2, ins_inflate=True):
return reciprocal_overlap(astart, aend, bstart, bend)


def entry_shared_ref_context(entryA, entryB, ref, use_ref_seq=False, min_len=0):
"""
Get the shared reference context of two entries and create the haplotype
:param `entryA`: first entry
:type `entryA`: :class:`pysam.VariantRecord`
:param `entryB`: second entry
:type `entryB`: :class:`pysam.VariantRecord`
:param `ref`: Reference genome
:type `ref`: :class:`pysam.FastaFile`
:param `use_ref_seq`: If True, use the reference genome to get the variant sequence instead the entries
:type `use_ref_seq`: bool, optional
:param `min_len`: Minimum length of the reference context to create
:type `min_len`: int, optional
:return: sequences created
:rtype: tuple : (string, string)
"""
def get_props(entry):
"""
We compare the longer of the ref/alt sequence to increase comparability
"""
if use_ref_seq and (entry.alts[0] == "<DEL>" or len(entry.alts[0]) < len(entry.ref)):
return entry.chrom, entry.start, entry.stop, ref.fetch(entry.chrom, entry.start, entry.stop)
return entry.chrom, entry.start, entry.stop, entry.alts[0]
a1 = get_props(entryA)
a2 = get_props(entryB)
return create_pos_haplotype(a1, a2, ref, min_len=min_len)


def entry_same_variant_type(entryA, entryB, dup_to_ins=False):
"""
Check if entryA svtype == entryB svtype
Expand Down
79 changes: 61 additions & 18 deletions truvari/matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,6 @@ def make_match_params():
params.reference = None
params.refdist = 500
params.pctseq = 0.70
params.minhaplen = 50
params.pctsize = 0.70
params.pctovl = 0.0
params.typeignore = False
Expand Down Expand Up @@ -128,7 +127,6 @@ def make_match_params_from_args(args):
ret.reference = args.reference
ret.refdist = args.refdist
ret.pctseq = args.pctseq
ret.minhaplen = args.minhaplen
ret.pctsize = args.pctsize
ret.pctovl = args.pctovl
ret.typeignore = args.typeignore
Expand Down Expand Up @@ -156,7 +154,12 @@ def filter_call(self, entry, base=False):
return True

if self.params.check_multi and len(entry.alts) > 1:
raise ValueError(f"Cannot compare multi-allelic records. Please split\nline {str(entry)}")
raise ValueError(
f"Cannot compare multi-allelic records. Please split\nline {str(entry)}")

# Don't compare BNDs, ever
if entry.alleles_variant_types[1] == 'BND':
return True

if self.params.passonly and truvari.entry_is_filtered(entry):
return True
Expand All @@ -175,7 +178,6 @@ def filter_call(self, entry, base=False):

return False


def build_match(self, base, comp, matid=None, skip_gt=False, short_circuit=False):
"""
Build a MatchResult
Expand Down Expand Up @@ -231,8 +233,7 @@ def build_match(self, base, comp, matid=None, skip_gt=False, short_circuit=False
return ret

if self.params.pctseq > 0:
ret.seqsim = truvari.entry_seq_similarity(
base, comp, self.reference, self.params.minhaplen)
ret.seqsim = truvari.entry_seq_similarity(base, comp)
if ret.seqsim < self.params.pctseq:
logging.debug("%s and %s sequence similarity is too low (%.3ff)",
str(base), str(comp), ret.seqsim)
Expand Down Expand Up @@ -305,14 +306,24 @@ def chunker(matcher, *files):
cur_chunk = defaultdict(list)
unresolved_warned = False
for key, entry in file_zipper(*files):
if matcher.params.pctseq != 0 and (entry.alts and entry.alts[0].startswith('<')):
if not unresolved_warned:
logging.warning(
"Unresolved SVs (e.g. ALT=<DEL>) are filtered when `--pctseq != 0`")
unresolved_warned = True
if matcher.filter_call(entry, key == 'base'):
cur_chunk['__filtered'].append(entry)
call_counts['__filtered'] += 1
continue

# check symbolic, resolve if needed/possible
if matcher.params.pctseq != 0 and entry.alts[0].startswith('<') and matcher.params.reference:
was_resolved = resolve_sv(entry,
matcher.params.reference,
matcher.params.dup_to_ins)
if not was_resolved:
if not unresolved_warned:
logging.warning("Some symbolic SVs couldn't be resolved")
unresolved_warned = True
cur_chunk['__filtered'].append(entry)
call_counts['__filtered'] += 1
continue

new_chrom = cur_chrom and entry.chrom != cur_chrom
new_chunk = cur_end and cur_end + matcher.params.chunksize < entry.start
if new_chunk or new_chrom:
Expand All @@ -324,14 +335,46 @@ def chunker(matcher, *files):
cur_chunk = defaultdict(list)

cur_chrom = entry.chrom
if not matcher.filter_call(entry, key == 'base'):
cur_end = max(entry.stop, cur_end)
cur_chunk[key].append(entry)
call_counts[key] += 1
else:
cur_chunk['__filtered'].append(entry)
call_counts['__filtered'] += 1
cur_end = max(entry.stop, cur_end)
cur_chunk[key].append(entry)
call_counts[key] += 1

chunk_count += 1
logging.info("%d chunks of %d variants %s", chunk_count,
sum(call_counts.values()), call_counts)
yield cur_chunk, chunk_count


RC = str.maketrans("ATCG", "TAGC")


def do_rc(s):
"""
Reverse complement a sequence
"""
return s.translate(RC)[::-1]


def resolve_sv(entry, ref, dup_to_ins=False):
"""
Attempts to resolve an SV's REF/ALT sequences
"""
if ref is None or entry.alts[0] in ['<CNV>', '<INS>'] or entry.start > ref.get_reference_length(entry.chrom):
return False

seq = ref.fetch(entry.chrom, entry.start, entry.stop)
if entry.alts[0] == '<DEL>':
entry.ref = seq
entry.alts = [seq[0]]
elif entry.alts[0] == '<INV>':
entry.ref = seq
entry.alts = [do_rc(seq)]
elif entry.alts[0] == '<DUP>' and dup_to_ins:
entry.info['SVTYPE'] = 'INS'
entry.ref = seq[0]
entry.alts = [seq]
entry.stop = entry.start + 1
else:
return False

return True

0 comments on commit 3a3ded4

Please sign in to comment.