Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Local alleles updates #286

Merged
merged 3 commits into from
Oct 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 2 additions & 43 deletions bio2zarr/vcf2zarr/icf.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,7 +512,7 @@ def compute_laa_field(variant) -> np.ndarray:
The LAA field is a list of one-based indices into the ALT alleles
that indicates which alternate alleles are observed in the sample.

This method infers which alleles are observed from the GT, AD, and PL fields.
This method infers which alleles are observed from the GT field.
"""
sample_count = variant.num_called + variant.num_unknown
alt_allele_count = len(variant.ALT)
Expand All @@ -528,48 +528,6 @@ def compute_laa_field(variant) -> np.ndarray:
np.bincount, axis=1, arr=genotypes, minlength=allele_count
)
allele_counts += genotype_allele_counts
if "AD" in variant.FORMAT:
depths = variant.format("AD")
depths.clip(0, None, out=depths)

def bincount_nonzero(arr, *, minlength):
# nonzero returns the indices of the nonzero elements for each axis
return np.bincount(arr.nonzero()[0], minlength=minlength)

depths_allele_counts = np.apply_along_axis(
bincount_nonzero, axis=1, arr=depths, minlength=allele_count
)
allele_counts += depths_allele_counts
if "PL" in variant.FORMAT:
likelihoods = variant.format("PL")
likelihoods.clip(0, None, out=likelihoods)
# n is the indices of the nonzero likelihoods
n = np.tile(np.arange(likelihoods.shape[1]), (likelihoods.shape[0], 1))
assert n.shape == likelihoods.shape
n[likelihoods <= 0] = 0
ploidy = variant.ploidy

if ploidy == 1:
a = n
b = np.zeros_like(a)
elif ploidy == 2:
# We have n = b(b+1) / 2 + a
# We need to compute a and b
b = np.ceil(np.sqrt(2 * n + 9 / 4) - 3 / 2).astype(int)
a = (n - b * (b + 1) / 2).astype(int)
else:
# TODO: Handle all possible ploidy
raise ValueError(f"Cannot handle ploidy = {ploidy}")

a_counts = np.apply_along_axis(
np.bincount, axis=1, arr=a, minlength=allele_count
)
b_counts = np.apply_along_axis(
np.bincount, axis=1, arr=b, minlength=allele_count
)
assert a_counts.shape == b_counts.shape == allele_counts.shape
allele_counts += a_counts
allele_counts += b_counts

allele_counts[:, 0] = 0 # We don't count the reference allele
max_row_length = 1
Expand Down Expand Up @@ -640,6 +598,7 @@ def compute_lpl_field(variant, laa_val: np.ndarray) -> np.ndarray:
pl_val = np.broadcast_to(pl_val, n.shape)
row_index = np.arange(pl_val.shape[0]).reshape(-1, 1)
lpl_val = pl_val[row_index, n]
lpl_val[b == constants.INT_FILL] = constants.INT_FILL

return lpl_val

Expand Down
135 changes: 20 additions & 115 deletions tests/test_vcf_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,41 +446,41 @@ def ds(self, tmp_path_factory):

def test_call_LAA(self, ds):
call_LAA = [
[[1, -2, -2], [1, -2, -2]],
[[1, -2, -2], [1, -2, -2]],
[[1, 2, -2], [1, 2, -2]],
[[1, -2, -2], [-2, -2, -2]],
[[-2, -2, -2], [1, -2, -2]],
[[1, -2, -2], [2, -2, -2]],
[[1, 2, 3], [2, 3, -2]],
[[1, -2, -2], [1, -2, -2]],
[[2, -2, -2], [1, -2, -2]],
[[-2, -2, -2], [-2, -2, -2]],
[[-2, -2, -2], [1, -2, -2]],
[[1, -2, -2], [-1, -2, -2]],
[[2, -2, -2], [2, -2, -2]],
[[1, -2, -2], [-2, -2, -2]],
[[-2, -2, -2], [-2, -2, -2]],
[[1, -2, -2], [1, -2, -2]],
[[-2, -2, -2], [-2, -2, -2]],
[[-2, -2, -2], [1, -2, -2]],
]
nt.assert_array_equal(ds.call_LAA.values, call_LAA)

def test_call_LPL(self, ds):
call_LPL = [
[
[100, 0, 105, -2, -2, -2, -2, -2, -2, -2],
[0, 100, 200, -2, -2, -2, -2, -2, -2, -2],
[0, -2, -2, -2, -2, -2, -2, -2, -2, -2],
],
[
[0, 100, 200, -2, -2, -2, -2, -2, -2, -2],
[0, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[154, 22, 0, -2, -2, -2, -2, -2, -2, -2],
],
[
[1002, 55, 1002, 0, 55, 1002, -2, -2, -2, -2],
[154, 154, 0, 154, 102, 102, -2, -2, -2, -2],
[1002, 55, 1002, -2, -2, -2, -2, -2, -2, -2],
[154, 154, 102, -2, -2, -2, -2, -2, -2, -2],
],
[
[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
],
[
[30, 30, 30, -2, -2, -2, -2, -2, -2, -2],
[30, 60, 0, -2, -2, -2, -2, -2, -2, -2],
[30, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[30, -2, -2, -2, -2, -2, -2, -2, -2, -2],
],
[
[0, 30, -2, -2, -2, -2, -2, -2, -2, -2],
Expand All @@ -495,12 +495,12 @@ def test_call_LPL(self, ds):
[10, 40, 60, -2, -2, -2, -2, -2, -2, -2],
],
[
[30, 30, 30, -2, -2, -2, -2, -2, -2, -2],
[30, -1, 0, -2, -2, -2, -2, -2, -2, -2],
[30, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[30, -2, -2, -2, -2, -2, -2, -2, -2, -2],
],
[
[-1, -1, -1, -2, -2, -2, -2, -2, -2, -2],
[-1, -1, -1, -2, -2, -2, -2, -2, -2, -2],
[-1, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[-1, -2, -2, -2, -2, -2, -2, -2, -2, -2],
],
[
[-1, -1, -2, -2, -2, -2, -2, -2, -2, -2],
Expand Down Expand Up @@ -637,107 +637,12 @@ def test_call_AD(self, ds):
nt.assert_array_equal(ds.call_AD.values, call_AD)

def test_call_LAA(self, ds):
call_LAA = [
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
[[1, 2, -2, -2], [1, 2, -2, -2], [1, 2, -2, -2]],
[[1, 2, 3, -2], [1, 2, 3, -2], [1, 2, 3, -2]],
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
[[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
[[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
[[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
[[1, 2, 3, -2], [1, 2, 3, -2], [1, 2, 3, -2]],
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
[[1, 2, -2, -2], [1, 2, -2, -2], [1, 2, -2, -2]],
[[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]],
[[1, 2, -2, -2], [1, 2, -2, -2], [1, 2, -2, -2]],
[[1, 2, 3, -2], [1, 2, 3, -2], [1, 2, 3, -2]],
[[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
[[1, 2, 3, -2], [1, 2, 3, -2], [1, 2, 3, -2]],
]
# All the genotypes are 0/0
call_LAA = np.full((23, 3, 1), -2)
nt.assert_array_equal(ds.call_LAA.values, call_LAA)

def test_call_LPL(self, ds):
# fmt: off
call_LPL = [
[[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, 120, 1800, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, 120, 1800, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, 120, 1800, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2],
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2],
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2]], # noqa: E501
[[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800], # noqa: E501
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800], # noqa: E501
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800]], # noqa: E501
[[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800], # noqa: E501
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800], # noqa: E501
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800]], # noqa: E501
[[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800], # noqa: E501
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800], # noqa: E501
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800]], # noqa: E501
[[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2],
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2],
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2]], # noqa: E501
[[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, 120, 1800, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, 120, 1800, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, 120, 1800, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, 120, 1800, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, 120, 1800, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2],
[0, 120, 1800, 120, 1800, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2]],
[[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2],
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2],
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2]], # noqa: E501
[[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800], # noqa: E501
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800], # noqa: E501
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800]], # noqa: E501
[[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2],
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2],
[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2]]
]
# fmt: on
call_LPL = np.tile([0, -2, -2], (23, 3, 1))
nt.assert_array_equal(ds.call_LPL.values, call_LPL)

def test_call_PID(self, ds):
Expand Down
Loading