-
Notifications
You must be signed in to change notification settings - Fork 33
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
PC Relate #133
PC Relate #133
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,134 @@ | ||
import dask.array as da | ||
import xarray as xr | ||
|
||
from sgkit.typing import ArrayLike | ||
|
||
|
||
def gramian(a: ArrayLike) -> ArrayLike:
    """Compute the Gram matrix ``A^T A`` of the given matrix ``a``."""
    return a.T @ a
|
||
|
||
def _impute_genotype_call_with_variant_mean(
    call_g: xr.DataArray, call_g_mask: xr.DataArray
) -> xr.DataArray:
    """Replace masked genotype calls with the mean of the observed calls
    for the same variant (mean taken across the ``samples`` dimension)."""
    present = ~call_g_mask  # type: ignore[operator]
    # Mean over samples, computed only from calls that are actually observed.
    per_variant_mean = call_g.where(present).mean(dim="samples")
    imputed: xr.DataArray = call_g.where(present, per_variant_mean)
    return imputed
|
||
|
||
def pc_relate(ds: xr.Dataset, maf: float = 0.01) -> xr.Dataset:
    """Compute PC-Relate as described in Conomos, et al. 2016 [1].

    Parameters
    ----------
    ds : `xr.Dataset`
        Dataset containing (S = num samples, V = num variants, D = ploidy, PC = num PC):
        * genotype calls: "call_genotype" (SxVxD)
        * genotype calls mask: "call_genotype_mask" (SxVxD)
        * sample PCs: "sample_pcs" (PCxS)
    maf : float
        individual minor allele frequency filter. If an individual's estimated
        individual-specific minor allele frequency at a SNP is less than this value,
        that SNP will be excluded from the analysis for that individual.
        The default value is 0.01. Must be between (0.0, 1.0).

    This method computes the kinship coefficient matrix. The kinship coefficient for
    a pair of individuals ``i`` and ``j`` is commonly defined to be the probability that
    a random allele selected from ``i`` and a random allele selected from ``j`` at
    a locus are IBD. Several of the most common family relationships and their
    corresponding kinship coefficient:

    +--------------------------------------------------+---------------------+
    | Relationship                                     | Kinship coefficient |
    +==================================================+=====================+
    | Individual-self                                  | 1/2                 |
    +--------------------------------------------------+---------------------+
    | full sister/full brother                         | 1/4                 |
    +--------------------------------------------------+---------------------+
    | mother/father/daughter/son                       | 1/4                 |
    +--------------------------------------------------+---------------------+
    | grandmother/grandfather/granddaughter/grandson   | 1/8                 |
    +--------------------------------------------------+---------------------+
    | aunt/uncle/niece/nephew                          | 1/8                 |
    +--------------------------------------------------+---------------------+
    | first cousin                                     | 1/16                |
    +--------------------------------------------------+---------------------+
    | half-sister/half-brother                         | 1/8                 |
    +--------------------------------------------------+---------------------+

    Warnings
    --------
    This function is only applicable to diploid, biallelic datasets.

    Returns
    -------
    Dataset
        Dataset containing (S = num samples):
        pc_relate_phi: (S,S) ArrayLike
        pairwise recent kinship coefficient matrix as float in [-0.5, 0.5].

    References
    ----------
    - [1] Conomos, Matthew P., Alexander P. Reiner, Bruce S. Weir, and Timothy A. Thornton. 2016.
        "Model-Free Estimation of Recent Genetic Relatedness."
        American Journal of Human Genetics 98 (1): 127-48.

    Raises
    ------
    ValueError
        * If ploidy of provided dataset != 2
        * If maximum number of alleles in provided dataset != 2
        * Input dataset is missing any of the required variables
        * If maf is not in (0.0, 1.0)
    """
    if maf <= 0.0 or maf >= 1.0:
        raise ValueError("MAF must be between (0.0, 1.0)")
    if "ploidy" in ds.dims and ds.dims["ploidy"] != 2:
        raise ValueError("PC Relate only works for diploid genotypes")
    if "alleles" in ds.dims and ds.dims["alleles"] != 2:
        raise ValueError("PC Relate only works for biallelic genotypes")
    if "call_genotype" not in ds:
        raise ValueError("Input dataset must contain call_genotype")
    if "call_genotype_mask" not in ds:
        raise ValueError("Input dataset must contain call_genotype_mask")
    if "sample_pcs" not in ds:
        raise ValueError("Input dataset must contain sample_pcs variable")

    # A call is considered missing if any of its ploidy entries is masked;
    # missing calls are flagged with -1 and then imputed with the variant mean.
    call_g_mask = ds["call_genotype_mask"].any(dim="ploidy")
    call_g = xr.where(call_g_mask, -1, ds["call_genotype"].sum(dim="ploidy"))  # type: ignore[no-untyped-call]
    imputed_call_g = _impute_genotype_call_with_variant_mean(call_g, call_g_mask)

    # E[gs|V] = 1*beta0 + V*beta, where 1 is a length s vector of 1s, and
    # beta = (beta1,...,betaD)^T is a length D vector of regression
    # coefficients for each of the PCs
    pcs = ds["sample_pcs"]
    # Prepend a row of ones for the intercept term of the regression.
    pcsi = da.concatenate([da.ones((1, pcs.shape[1]), dtype=pcs.dtype), pcs], axis=0)
    # Note: dask qr decomp requires no chunking in one dimension, and because number of
    # components should be smaller than number of samples in most cases, we disable
    # chunking on components
    pcsi = pcsi.T.rechunk((None, -1))

    q, r = da.linalg.qr(pcsi)
    # mu, eq: 3
    half_beta = da.linalg.inv(2 * r).dot(q.T).dot(imputed_call_g.T)
    mu = pcsi.dot(half_beta).T
    # phi, eq: 4
    # Exclude entries where the individual-specific MAF falls outside
    # (maf, 1 - maf), as well as originally-missing calls.
    mask = (mu <= maf) | (mu >= 1.0 - maf) | call_g_mask
    mu_mask = da.ma.masked_array(mu, mask=mask)
    variance = mu_mask * (1.0 - mu_mask)
    variance = da.ma.filled(variance, fill_value=0.0)
    stddev = da.sqrt(variance)
    centered_af = call_g / 2 - mu_mask
    centered_af = da.ma.filled(centered_af, fill_value=0.0)
    # NOTE: gramian could be a performance bottleneck, and we could explore
    # performance improvements like (or maybe sth else):
    # * calculating only the pairs we are interested in
    # * using an optimized einsum.
    assert centered_af.shape == call_g.shape
    assert stddev.shape == call_g.shape
    phi = gramian(centered_af) / gramian(stddev)
    # NOTE: phi is of shape (S x S), S = num samples
    assert phi.shape == (call_g.shape[1],) * 2
    return xr.Dataset({"pc_relate_phi": (("sample_x", "sample_y"), phi)})
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,6 +14,7 @@ def simulate_genotype_call_dataset( | |
n_allele: int = 2, | ||
n_contig: int = 1, | ||
seed: Optional[int] = None, | ||
missing_pct: Optional[float] = None, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 😁 |
||
) -> Dataset: | ||
"""Simulate genotype calls and variant/sample data. | ||
|
||
|
@@ -41,16 +42,25 @@ def simulate_genotype_call_dataset( | |
will all be 0 by default with `n_contig` == 1. | ||
seed : int, optional | ||
Seed for random number generation | ||
missing_pct: float, optional | ||
Percent of genotype calls to set as missing, must be within [0.0, 1.0] |  ||
|
||
Returns | ||
------- | ||
Dataset | ||
Dataset from `sgkit.create_genotype_call_dataset`. | ||
""" | ||
if missing_pct and (missing_pct < 0.0 or missing_pct > 1.0): | ||
raise ValueError("missing_pct must be within [0.0, 1.0]") | ||
rs = np.random.RandomState(seed=seed) | ||
call_genotype = rs.randint( | ||
0, n_allele, size=(n_variant, n_sample, n_ploidy), dtype=np.int8 | ||
) | ||
if missing_pct: | ||
call_genotype = np.where( | ||
rs.rand(*call_genotype.shape) < missing_pct, -1, call_genotype | ||
) | ||
|
||
contig_size = split_array_chunks(n_variant, n_contig) | ||
contig = np.repeat(np.arange(n_contig), contig_size) | ||
contig_names = np.unique(contig) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
from pathlib import Path | ||
|
||
import dask.array as da | ||
import numpy as np | ||
import pandas as pd | ||
import xarray as xr | ||
|
||
from sgkit.stats.pc_relate import pc_relate | ||
|
||
# TODO (rav): finish up tests/validation, clean and split | ||
|
||
|
||
def test_same_as_reference_implementation() -> None:
    """Validate pc_relate output against results produced by the reference
    R GENESIS implementation on the same fixture data."""
    d = Path(__file__).parent.joinpath("test_pc_relate")
    ds = xr.open_zarr(d.joinpath("zarr_data").as_posix())  # type: ignore[no-untyped-call]
    # Load precomputed sample PCs (CSV columns 1-2) and transpose to the
    # (components, samples) layout pc_relate expects.
    pcs = da.from_array(
        pd.read_csv(d.joinpath("pcs.csv").as_posix(), usecols=[1, 2]).to_numpy()
    ).T
    ds["sample_pcs"] = (("components", "samples"), pcs)
    phi = pc_relate(ds).compute()["pc_relate_phi"]

    assert isinstance(phi, xr.DataArray)
    # Fixture dataset has 1000 samples, so phi is the 1000x1000 kinship matrix.
    assert phi.shape == (1000, 1000)

    # Get genesis/reference results:
    genesis_phi = pd.read_csv(d.joinpath("kinbtwe.csv"))
    genesis_phi = genesis_phi[["ID1", "ID2", "kin"]]
    # GENESIS sample IDs are 1-based; shift to 0-based matrix indices.
    genesis_phi["ID1"], genesis_phi["ID2"] = genesis_phi.ID1 - 1, genesis_phi.ID2 - 1
    # Scatter the pairwise kinship values into a dense matrix via flat
    # (row * ncols + col) indices; unlisted pairs stay 0.
    indices = (genesis_phi["ID1"] * 1000 + genesis_phi["ID2"]).to_numpy()
    values = genesis_phi["kin"].to_numpy()
    genesis_phi_full = np.zeros((1000, 1000))
    np.put(genesis_phi_full, indices, values)

    # Compare with reference/GENESIS:
    # Only the strict upper triangle is compared (between-sample pairs;
    # the reference file contains no self-kinship entries).
    genesis_phi_s = genesis_phi_full[np.triu_indices_from(genesis_phi_full, 1)]
    phi_s = phi.data[np.triu_indices_from(phi.data, 1)]
    assert len(phi_s) == len(genesis_phi_s)
    assert np.allclose(phi_s, genesis_phi_s)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@eric-czech please ignore the test code for now, current tests is there to validate results against the reference implementation (but I need to make it smaller etc). But I'm curious to hear your opinion about the implementation itself, do you have any suggestions about xr/dask use?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Had a few minor suggestions, but looks good to me!
Some array shape checks would definitely help in thinking about the scalability though.