From 15d526b48598667761133227f856076891a5c6c0 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Wed, 23 Oct 2024 16:51:13 +0100 Subject: [PATCH 1/3] Remove check on PL values when computing local alleles --- bio2zarr/vcf2zarr/icf.py | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/bio2zarr/vcf2zarr/icf.py b/bio2zarr/vcf2zarr/icf.py index 7646826..697b0c7 100644 --- a/bio2zarr/vcf2zarr/icf.py +++ b/bio2zarr/vcf2zarr/icf.py @@ -540,36 +540,6 @@ def bincount_nonzero(arr, *, minlength): bincount_nonzero, axis=1, arr=depths, minlength=allele_count ) allele_counts += depths_allele_counts - if "PL" in variant.FORMAT: - likelihoods = variant.format("PL") - likelihoods.clip(0, None, out=likelihoods) - # n is the indices of the nonzero likelihoods - n = np.tile(np.arange(likelihoods.shape[1]), (likelihoods.shape[0], 1)) - assert n.shape == likelihoods.shape - n[likelihoods <= 0] = 0 - ploidy = variant.ploidy - - if ploidy == 1: - a = n - b = np.zeros_like(a) - elif ploidy == 2: - # We have n = b(b+1) / 2 + a - # We need to compute a and b - b = np.ceil(np.sqrt(2 * n + 9 / 4) - 3 / 2).astype(int) - a = (n - b * (b + 1) / 2).astype(int) - else: - # TODO: Handle all possible ploidy - raise ValueError(f"Cannot handle ploidy = {ploidy}") - - a_counts = np.apply_along_axis( - np.bincount, axis=1, arr=a, minlength=allele_count - ) - b_counts = np.apply_along_axis( - np.bincount, axis=1, arr=b, minlength=allele_count - ) - assert a_counts.shape == b_counts.shape == allele_counts.shape - allele_counts += a_counts - allele_counts += b_counts allele_counts[:, 0] = 0 # We don't count the reference allele max_row_length = 1 From 88798ce626b4fcd7a2a0b726a487c1623c683460 Mon Sep 17 00:00:00 2001 From: willtyler Date: Thu, 24 Oct 2024 17:08:31 +0000 Subject: [PATCH 2/3] Remove AD values check when computing LPL values --- bio2zarr/vcf2zarr/icf.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/bio2zarr/vcf2zarr/icf.py b/bio2zarr/vcf2zarr/icf.py index 697b0c7..e099c5c 100644 --- a/bio2zarr/vcf2zarr/icf.py +++ b/bio2zarr/vcf2zarr/icf.py @@ -512,7 +512,7 @@ def compute_laa_field(variant) -> np.ndarray: The LAA field is a list of one-based indices into the ALT alleles that indicates which alternate alleles are observed in the sample. - This method infers which alleles are observed from the GT, AD, and PL fields. + This method infers which alleles are observed from the GT field. """ sample_count = variant.num_called + variant.num_unknown alt_allele_count = len(variant.ALT) @@ -528,18 +528,6 @@ def compute_laa_field(variant) -> np.ndarray: np.bincount, axis=1, arr=genotypes, minlength=allele_count ) allele_counts += genotype_allele_counts - if "AD" in variant.FORMAT: - depths = variant.format("AD") - depths.clip(0, None, out=depths) - - def bincount_nonzero(arr, *, minlength): - # nonzero returns the indices of the nonzero elements for each axis - return np.bincount(arr.nonzero()[0], minlength=minlength) - - depths_allele_counts = np.apply_along_axis( - bincount_nonzero, axis=1, arr=depths, minlength=allele_count - ) - allele_counts += depths_allele_counts allele_counts[:, 0] = 0 # We don't count the reference allele max_row_length = 1 From 1aaf413cfaced644fd4e95c6b8aee68105c112fb Mon Sep 17 00:00:00 2001 From: willtyler Date: Thu, 24 Oct 2024 17:09:13 +0000 Subject: [PATCH 3/3] Update unit tests --- bio2zarr/vcf2zarr/icf.py | 1 + tests/test_vcf_examples.py | 135 ++++++------------------------------- 2 files changed, 21 insertions(+), 115 deletions(-) diff --git a/bio2zarr/vcf2zarr/icf.py b/bio2zarr/vcf2zarr/icf.py index e099c5c..23d7617 100644 --- a/bio2zarr/vcf2zarr/icf.py +++ b/bio2zarr/vcf2zarr/icf.py @@ -598,6 +598,7 @@ def compute_lpl_field(variant, laa_val: np.ndarray) -> np.ndarray: pl_val = np.broadcast_to(pl_val, n.shape) row_index = np.arange(pl_val.shape[0]).reshape(-1, 1) lpl_val = pl_val[row_index, n] + lpl_val[b == constants.INT_FILL] = constants.INT_FILL return lpl_val diff --git a/tests/test_vcf_examples.py b/tests/test_vcf_examples.py index 5e2dfb8..9c01fac 100644 --- a/tests/test_vcf_examples.py +++ b/tests/test_vcf_examples.py @@ -446,17 +446,17 @@ def ds(self, tmp_path_factory): def test_call_LAA(self, ds): call_LAA = [ - [[1, -2, -2], [1, -2, -2]], - [[1, -2, -2], [1, -2, -2]], - [[1, 2, -2], [1, 2, -2]], + [[1, -2, -2], [-2, -2, -2]], + [[-2, -2, -2], [1, -2, -2]], + [[1, -2, -2], [2, -2, -2]], [[1, 2, 3], [2, 3, -2]], - [[1, -2, -2], [1, -2, -2]], - [[2, -2, -2], [1, -2, -2]], + [[-2, -2, -2], [-2, -2, -2]], + [[-2, -2, -2], [1, -2, -2]], [[1, -2, -2], [-1, -2, -2]], [[2, -2, -2], [2, -2, -2]], - [[1, -2, -2], [-2, -2, -2]], [[-2, -2, -2], [-2, -2, -2]], - [[1, -2, -2], [1, -2, -2]], + [[-2, -2, -2], [-2, -2, -2]], + [[-2, -2, -2], [1, -2, -2]], ] nt.assert_array_equal(ds.call_LAA.values, call_LAA) @@ -464,23 +464,23 @@ def test_call_LPL(self, ds): call_LPL = [ [ [100, 0, 105, -2, -2, -2, -2, -2, -2, -2], - [0, 100, 200, -2, -2, -2, -2, -2, -2, -2], + [0, -2, -2, -2, -2, -2, -2, -2, -2, -2], ], [ - [0, 100, 200, -2, -2, -2, -2, -2, -2, -2], + [0, -2, -2, -2, -2, -2, -2, -2, -2, -2], [154, 22, 0, -2, -2, -2, -2, -2, -2, -2], ], [ - [1002, 55, 1002, 0, 55, 1002, -2, -2, -2, -2], - [154, 154, 0, 154, 102, 102, -2, -2, -2, -2], + [1002, 55, 1002, -2, -2, -2, -2, -2, -2, -2], + [154, 154, 102, -2, -2, -2, -2, -2, -2, -2], ], [ [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1], ], [ - [30, 30, 30, -2, -2, -2, -2, -2, -2, -2], - [30, 60, 0, -2, -2, -2, -2, -2, -2, -2], + [30, -2, -2, -2, -2, -2, -2, -2, -2, -2], + [30, -2, -2, -2, -2, -2, -2, -2, -2, -2], ], [ [0, 30, -2, -2, -2, -2, -2, -2, -2, -2], @@ -495,12 +495,12 @@ def test_call_LPL(self, ds): [10, 40, 60, -2, -2, -2, -2, -2, -2, -2], ], [ - [30, 30, 30, -2, -2, -2, -2, -2, -2, -2], - [30, -1, 0, -2, -2, -2, -2, -2, -2, -2], + [30, -2, -2, -2, -2, -2, -2, -2, -2, -2], + [30, -2, -2, -2, -2, -2, -2, -2, -2, -2], ], [ - [-1, -1, -1, -2, -2, -2, -2, -2, -2, -2], - [-1, -1, -1, -2, -2, -2, -2, -2, -2, -2], + [-1, -2, -2, -2, -2, -2, -2, -2, -2, -2], + [-1, -2, -2, -2, -2, -2, -2, -2, -2, -2], ], [ [-1, -1, -2, -2, -2, -2, -2, -2, -2, -2], @@ -637,107 +637,12 @@ def test_call_AD(self, ds): nt.assert_array_equal(ds.call_AD.values, call_AD) def test_call_LAA(self, ds): - call_LAA = [ - [[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]], - [[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]], - [[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]], - [[1, 2, -2, -2], [1, 2, -2, -2], [1, 2, -2, -2]], - [[1, 2, 3, -2], [1, 2, 3, -2], [1, 2, 3, -2]], - [[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]], - [[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]], - [[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]], - [[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]], - [[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]], - [[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]], - [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], - [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], - [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], - [[1, 2, 3, -2], [1, 2, 3, -2], [1, 2, 3, -2]], - [[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]], - [[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]], - [[1, 2, -2, -2], [1, 2, -2, -2], [1, 2, -2, -2]], - [[1, -2, -2, -2], [1, -2, -2, -2], [1, -2, -2, -2]], - [[1, 2, -2, -2], [1, 2, -2, -2], [1, 2, -2, -2]], - [[1, 2, 3, -2], [1, 2, 3, -2], [1, 2, 3, -2]], - [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], - [[1, 2, 3, -2], [1, 2, 3, -2], [1, 2, 3, -2]], - ] + # All the genotypes are 0/0 + call_LAA = np.full((23, 3, 1), -2) nt.assert_array_equal(ds.call_LAA.values, call_LAA) def test_call_LPL(self, ds): - # fmt: off - call_LPL = [ - [[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]], - [[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]], - [[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]], - [[0, 120, 1800, 120, 1800, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, 120, 1800, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, 120, 1800, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2]], - [[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2], - [0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2], - [0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2]], # noqa: E501 - [[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]], - [[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]], - [[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]], - [[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]], - [[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]], - [[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]], - [[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800], # noqa: E501 - [0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800], # noqa: E501 - [0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800]], # noqa: E501 - [[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800], # noqa: E501 - [0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800], # noqa: E501 - [0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800]], # noqa: E501 - [[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800], # noqa: E501 - [0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800], # noqa: E501 - [0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800]], # noqa: E501 - [[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2], - [0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2], - [0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2]], # noqa: E501 - [[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]], - [[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]], - [[0, 120, 1800, 120, 1800, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, 120, 1800, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, 120, 1800, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2]], - [[0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2]], - [[0, 120, 1800, 120, 1800, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, 120, 1800, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2], - [0, 120, 1800, 120, 1800, 1800, -2, -2, -2, -2, -2, -2, -2, -2, -2]], - [[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2], - [0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2], - [0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2]], # noqa: E501 - [[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800], # noqa: E501 - [0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800], # noqa: E501 - [0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, 120, 1800, 1800, 1800, 1800]], # noqa: E501 - [[0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2], - [0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2], - [0, 120, 1800, 120, 1800, 1800, 120, 1800, 1800, 1800, -2, -2, -2, -2, -2]] - ] - # fmt: on + call_LPL = np.tile([0, -2, -2], (23, 3, 1)) nt.assert_array_equal(ds.call_LPL.values, call_LPL) def test_call_PID(self, ds):