Skip to content

Commit

Permalink
Merge pull request #164 from jeromekelleher/reuse-numba-code-in-ts
Browse files Browse the repository at this point in the history
Reuse the numba classifier in ts_afdist
  • Loading branch information
jeromekelleher authored Sep 4, 2024
2 parents bfa1323 + edfd95b commit 69bf2c0
Showing 1 changed file with 45 additions and 25 deletions.
70 changes: 45 additions & 25 deletions src/ts_afdist.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,37 @@
import pandas as pd
import tensorstore as ts

from zarr_afdist import count_genotypes_chunk, GenotypeCounts


def ts_afdist(path):
path = str(path)
store = ts.open({
store = ts.open(
{
"driver": "zarr",
"kvstore": {
"driver": "file",
"path": path,
},
}, write=False).result()
"context": {
"cache_pool": {"total_bytes_limit": 0},
"data_copy_concurrency": {"limit": 1},
"file_io_concurrency": {"limit": 1},
},
},
write=False,
).result()

variant_count = store.shape[0]
sample_count = store.shape[1]
chunk_shape = store.chunk_layout.read_chunk.shape
variant_chunk_size = chunk_shape[0]
sample_chunk_size = chunk_shape[1]
bin_counts = np.zeros((11,), dtype=int)

het = np.zeros(variant_count, dtype=np.int32)
hom_alt = np.zeros(variant_count, dtype=np.int32)
hom_ref = np.zeros(variant_count, dtype=np.int32)
ref_count = np.zeros(variant_count, dtype=np.int32)

for variant_chunk_start in range(0, variant_count, variant_chunk_size):
variant_chunk_end = min(variant_count, variant_chunk_start + variant_chunk_size)
Expand All @@ -28,28 +44,32 @@ def ts_afdist(path):
for sample_chunk_start in range(0, sample_count, sample_chunk_size):
sample_chunk_end = min(sample_count, sample_chunk_start + sample_chunk_size)

chunk = store[variant_chunk_start:variant_chunk_end, sample_chunk_start:sample_chunk_end].read().result()
a = chunk[:, :, 0]
b = chunk[:, :, 1]

chunk_ref_counts = ((a == 0).astype(int) + (b == 0).astype(int)).sum(axis=1)
chunk_het_counts = (a != b).sum(axis=1)
chunk_hom_alt_counts = np.logical_and(a == b, a > 0).sum(axis=1)

np.add(ref_counts, chunk_ref_counts, out=ref_counts)
np.add(het_counts, chunk_het_counts, out=het_counts)
np.add(hom_alt_counts, chunk_hom_alt_counts, out=hom_alt_counts)
G = (
store[
variant_chunk_start:variant_chunk_end,
sample_chunk_start:sample_chunk_end,
]
.read()
.result()
)
count_genotypes_chunk(
variant_chunk_start, G, hom_ref, hom_alt, het, ref_count
)

alt_count = 2 * sample_count - ref_counts
alt_freq = alt_count / (2 * sample_count)
het_ref_freq = 2 * alt_freq * (1 - alt_freq)
hom_alt_freq = alt_freq * alt_freq
counts = GenotypeCounts(hom_ref, hom_alt, het, ref_count)

bins = np.linspace(0, 1.0, len(bin_counts))
bins[-1] += 0.0125
a = np.bincount(np.digitize(het_ref_freq, bins), weights=het_counts, minlength=len(bins)).astype(int)
b = np.bincount(np.digitize(hom_alt_freq, bins), weights=hom_alt_counts, minlength=len(bins)).astype(int)
np.add(bin_counts, a, out=bin_counts)
np.add(bin_counts, b, out=bin_counts)
num_bins = 10
n = sample_count
alt_count = 2 * n - counts.ref_count
af = alt_count / (n * 2)
bins = np.linspace(0, 1.0, num_bins + 1)
bins[-1] += 0.0125
pRA = 2 * af * (1 - af)
pAA = af * af
a = np.bincount(np.digitize(pRA, bins), weights=counts.het, minlength=num_bins + 1)
b = np.bincount(
np.digitize(pAA, bins), weights=counts.hom_alt, minlength=num_bins + 1
)
count = (a + b).astype(int)

return pd.DataFrame({"start": bins[:-1], "stop": bins[1:], "prob dist": bin_counts[1:]})
return pd.DataFrame({"start": bins[:-1], "stop": bins[1:], "prob_dist": count[1:]})

0 comments on commit 69bf2c0

Please sign in to comment.