PC Relate #133

Closed · wants to merge 1 commit
134 changes: 134 additions & 0 deletions sgkit/stats/pc_relate.py
@@ -0,0 +1,134 @@
import dask.array as da
import xarray as xr

from sgkit.typing import ArrayLike


def gramian(a: ArrayLike) -> ArrayLike:
    """Returns the Gramian matrix ``a.T @ a`` of the given matrix."""
    return a.T.dot(a)
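As a quick illustration with toy values (not from this PR), the Gramian collects all pairwise column inner products:

    import numpy as np

    a = np.array([[1.0, 0.0],
                  [2.0, 1.0]])
    print(gramian(a))  # [[5., 2.], [2., 1.]]: dot products of a's columns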


def _impute_genotype_call_with_variant_mean(
    call_g: xr.DataArray, call_g_mask: xr.DataArray
) -> xr.DataArray:
    """Imputes masked genotype calls with the per-variant mean of the present calls."""
    call_g_present = ~call_g_mask  # type: ignore[operator]
    variant_mean = call_g.where(call_g_present).mean(dim="samples")
    imputed_call_g: xr.DataArray = call_g.where(call_g_present, variant_mean)
    return imputed_call_g
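For intuition, a minimal sketch of the helper above on toy data (values invented for illustration):

    import xarray as xr

    g = xr.DataArray([[0.0, 2.0, 1.0]], dims=("variants", "samples"))
    mask = xr.DataArray([[False, True, False]], dims=("variants", "samples"))
    # The masked call is replaced by the variant mean of the present calls:
    # (0.0 + 1.0) / 2 = 0.5, giving [[0.0, 0.5, 1.0]]
    print(_impute_genotype_call_with_variant_mean(g, mask))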


def pc_relate(ds: xr.Dataset, maf: float = 0.01) -> xr.Dataset:

[Collaborator Author] @eric-czech please ignore the test code for now; the current test is there to validate results against the reference implementation (but I need to make it smaller etc.). I'm curious to hear your opinion about the implementation itself, though. Do you have any suggestions about the xr/dask use?

[Collaborator] Had a few minor suggestions, but looks good to me! Some array shape checks would definitely help in thinking about the scalability, though.
"""Compute PC-Relate as described in Conomos, et al. 2016 [1].

Parameters
----------
ds : `xr.Dataset`
Dataset containing (S = num samples, V = num variants, D = ploidy, PC = num PC):
* genotype calls: "call_genotype" (SxVxD)
* genotype calls mask: "call_genotype_mask" (SxVxD)
* sample PCs: "sample_pcs" (PCxS)
maf : float
individual minor allele frequency filter. If an individual's estimated
individual-specific minor allele frequency at a SNP is less than this value,
that SNP will be excluded from the analysis for that individual.
The default value is 0.01. Must be between (0.0, 0.1).


This method computes the kinship coefficient matrix. The kinship coefficient for
a pair of individuals ``i`` and ``j`` is commonly defined to be the probability that
a random allele selected from ``i`` and a random allele selected from ``j`` at
a locus are IBD. Several of the most common family relationships and their
corresponding kinship coefficient:

+--------------------------------------------------+---------------------+
| Relationship | Kinship coefficient |
+==================================================+=====================+
| Individual-self | 1/2 |
+--------------------------------------------------+---------------------+
| full sister/full brother | 1/4 |
+--------------------------------------------------+---------------------+
| mother/father/daughter/son | 1/4 |
+--------------------------------------------------+---------------------+
| grandmother/grandfather/granddaughter/grandson | 1/8 |
+--------------------------------------------------+---------------------+
| aunt/uncle/niece/nephew | 1/8 |
+--------------------------------------------------+---------------------+
| first cousin | 1/16 |
+--------------------------------------------------+---------------------+
| half-sister/half-brother | 1/8 |
+--------------------------------------------------+---------------------+

Warnings
--------
This function is only applicable to diploid, biallelic datasets.

Returns
-------
Dataset
Dataset containing (S = num samples):
pc_relate_phi: (S,S) ArrayLike
pairwise recent kinship coefficient matrix as float in [-0.5, 0.5].

References
----------
- [1] Conomos, Matthew P., Alexander P. Reiner, Bruce S. Weir, and Timothy A. Thornton. 2016.
"Model-Free Estimation of Recent Genetic Relatedness."
American Journal of Human Genetics 98 (1): 127–48.

Raises
------
ValueError
* If ploidy of provided dataset != 2
* If maximum number of alleles in provided dataset != 2
* Input dataset is missing any of the required variables
* If maf is not in (0.0, 1.0)
"""
    if maf <= 0.0 or maf >= 1.0:
        raise ValueError("maf must be within (0.0, 1.0)")
    if "ploidy" in ds.dims and ds.dims["ploidy"] != 2:
        raise ValueError("PC Relate only works for diploid genotypes")
    if "alleles" in ds.dims and ds.dims["alleles"] != 2:
        raise ValueError("PC Relate only works for biallelic genotypes")
    if "call_genotype" not in ds:
        raise ValueError("Input dataset must contain call_genotype")
    if "call_genotype_mask" not in ds:
        raise ValueError("Input dataset must contain call_genotype_mask")
    if "sample_pcs" not in ds:
        raise ValueError("Input dataset must contain sample_pcs")

    call_g_mask = ds["call_genotype_mask"].any(dim="ploidy")
    call_g = xr.where(call_g_mask, -1, ds["call_genotype"].sum(dim="ploidy"))  # type: ignore[no-untyped-call]
    imputed_call_g = _impute_genotype_call_with_variant_mean(call_g, call_g_mask)

    # 𝔼[gs|V] = 1β0 + Vβ, where 1 is a length S vector of ones, and β = (β1,...,βD)^T
    # is a length D vector of regression coefficients for each of the PCs
    pcs = ds["sample_pcs"]
    pcsi = da.concatenate([da.ones((1, pcs.shape[1]), dtype=pcs.dtype), pcs], axis=0)
    # Note: dask QR decomposition requires no chunking in one dimension, and because
    # the number of components should be smaller than the number of samples in most
    # cases, we disable chunking on components
    pcsi = pcsi.T.rechunk((None, -1))
[Collaborator] 👍

    q, r = da.linalg.qr(pcsi)
[Collaborator] Remind me again how many PCs are typically provided? If we need to, there's probably a way to do this without the unit-chunking requirement, but I don't remember if the number of PCs is ever big enough to matter.

[Collaborator Author, ravwojdyla, Aug 24, 2020] I believe the number of PCs should be relatively low. From UKBB:

    Principal components should ideally be computed using a subset of high quality, unrelated samples. However, the metrics used to find related samples, as well as poorer quality samples themselves require information about population structure (see Sections S 3.5 and S 3.7). We therefore conducted an initial round of PCA, computing just the top 8 PCs, using a set of unrelated samples based on an initial round of kinship estimation. We used the results of this analysis to compute PC adjusted heterozygosity as well as refine the relatedness inference. Having then identified a set of high quality, unrelated samples, we conducted a second round of PCA, computing the first 40 PCs. Results of the second round are made available to researchers and visualised in Figure 3A, Figure S6-S7, Extended Data Figure 3, and discussed in the main text.

Moreover, for PC-Relate, as far as I understand, the number can be limited to only the components that best capture the populations. From Conomos, 2016, "Model-Free Estimation of Recent Genetic Relatedness":

    The appropriate number of PCs that should be used to adjust for population structure in a PC-Relate analysis will depend on the sample structure. A reasonable set of PCs can often be selected by examining scatter plots and parallel coordinates plots of the top PCs to identify which ones appear to reflect population structure, and by examining a scree plot of the eigenvalues to identify a point of separation between PCs that explain a significant proportion of the total variation in the data and those that explain little variation. However, making the appropriate choice can be challenging, so we investigated the sensitivity of relatedness inference with PC-Relate to the number of PCs used in the analysis. Consider population structure II, where there are only two dimensions of population structure (i.e., ancestry contributed by three subpopulations). Figure S12 displays the kinship coefficient estimates from PC-Relate using varying numbers of PCs. Relatedness estimates with PC-Relate were nearly identical when using the top 2, 5, 10, or 20 PCs. However, including the top 100 PCs, which is 50 times more PCs than are required to explain the population structure in the sample, resulted in a substantial increase in variability. In this particular setting, choosing the number of PCs to be within a factor of 10 of the appropriate number allowed for accurate relatedness inference with PC-Relate. These results suggest that PC-Relate is quite robust to the choice of PCs, provided that there are a sufficient number of PCs included in the relatedness analysis to fully capture the population structure in the sample.

[Collaborator] Ah, perfect, no need to worry about that then! Thanks for pulling those references.
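For context on the exchange above: dask's QR implementation requires the input to be chunked along a single dimension, which is what the rechunk((None, -1)) call guarantees. A minimal sketch, with shapes invented for illustration:

    import dask.array as da

    # Tall-and-skinny: many row chunks, a single chunk along columns.
    x = da.random.random((100_000, 16), chunks=(10_000, 16))
    q, r = da.linalg.qr(x)  # fine
    # An array chunked along both dimensions, e.g. chunks=(10_000, 8),
    # would make da.linalg.qr raise, hence pcsi.T.rechunk((None, -1)) above.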

    # mu, eq: 3
    half_beta = da.linalg.inv(2 * r).dot(q.T).dot(imputed_call_g.T)
    mu = pcsi.dot(half_beta).T
    # phi, eq: 4
    mask = (mu <= maf) | (mu >= 1.0 - maf) | call_g_mask
    mu_mask = da.ma.masked_array(mu, mask=mask)
    variance = mu_mask * (1.0 - mu_mask)
    variance = da.ma.filled(variance, fill_value=0.0)
    stddev = da.sqrt(variance)
    centered_af = call_g / 2 - mu_mask
    centered_af = da.ma.filled(centered_af, fill_value=0.0)
    # NOTE: gramian could be a performance bottleneck, and we could explore
    # performance improvements like (or maybe something else):
    # * calculating only the pairs we are interested in
    # * using an optimized einsum

[Collaborator] I think some shape assertions along the way could be really helpful to spot why this is.

    assert centered_af.shape == call_g.shape
    assert stddev.shape == call_g.shape
    phi = gramian(centered_af) / gramian(stddev)
    # NOTE: phi is of shape (S x S), S = num samples
    assert phi.shape == (call_g.shape[1],) * 2
    return xr.Dataset({"pc_relate_phi": (("sample_x", "sample_y"), phi)})
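To make the intended wiring concrete, a hypothetical end-to-end run; the simulated dataset and the random placeholder PCs are assumptions for illustration, not part of this diff (real use would derive sample_pcs from a PCA over unrelated samples):

    import dask.array as da

    from sgkit.stats.pc_relate import pc_relate
    from sgkit.testing import simulate_genotype_call_dataset

    ds = simulate_genotype_call_dataset(n_variant=100, n_sample=20, missing_pct=0.1, seed=0)
    # Random placeholder PCs with shape (components, samples).
    ds["sample_pcs"] = (("components", "samples"), da.random.random((2, 20)))
    phi = pc_relate(ds, maf=0.01).compute()["pc_relate_phi"]
    assert phi.shape == (20, 20)  # pairwise kinship coefficient matrix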
10 changes: 10 additions & 0 deletions sgkit/testing.py
@@ -14,6 +14,7 @@ def simulate_genotype_call_dataset(
    n_allele: int = 2,
    n_contig: int = 1,
    seed: Optional[int] = None,
    missing_pct: Optional[float] = None,
[Collaborator] 😁

) -> Dataset:
    """Simulate genotype calls and variant/sample data.

@@ -41,16 +42,25 @@
        will all be 0 by default with `n_contig` == 1.
    seed : int, optional
        Seed for random number generation
    missing_pct : float, optional
        Proportion of missing calls; must be within [0.0, 1.0]

    Returns
    -------
    Dataset
        Dataset from `sgkit.create_genotype_call_dataset`.
    """
    if missing_pct and (missing_pct < 0.0 or missing_pct > 1.0):
        raise ValueError("missing_pct must be within [0.0, 1.0]")
    rs = np.random.RandomState(seed=seed)
    call_genotype = rs.randint(
        0, n_allele, size=(n_variant, n_sample, n_ploidy), dtype=np.int8
    )
    if missing_pct:
        call_genotype = np.where(
            rs.rand(*call_genotype.shape) < missing_pct, -1, call_genotype
        )

    contig_size = split_array_chunks(n_variant, n_contig)
    contig = np.repeat(np.arange(n_contig), contig_size)
    contig_names = np.unique(contig)
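As a usage sketch for the new parameter (assuming, per sgkit conventions, that -1 marks a missing call and is reflected in call_genotype_mask):

    from sgkit.testing import simulate_genotype_call_dataset

    ds = simulate_genotype_call_dataset(n_variant=1000, n_sample=10, missing_pct=0.2, seed=0)
    # Each call is dropped independently with probability 0.2, so the observed
    # missing fraction should be close to 0.2.
    print(float(ds["call_genotype_mask"].mean()))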
38 changes: 38 additions & 0 deletions sgkit/tests/test_pc_relate.py
@@ -0,0 +1,38 @@
from pathlib import Path

import dask.array as da
import numpy as np
import pandas as pd
import xarray as xr

from sgkit.stats.pc_relate import pc_relate

# TODO (rav): finish up tests/validation, clean and split


def test_same_as_reference_implementation() -> None:
    d = Path(__file__).parent.joinpath("test_pc_relate")
    ds = xr.open_zarr(d.joinpath("zarr_data").as_posix())  # type: ignore[no-untyped-call]
    pcs = da.from_array(
        pd.read_csv(d.joinpath("pcs.csv").as_posix(), usecols=[1, 2]).to_numpy()
    ).T
    ds["sample_pcs"] = (("components", "samples"), pcs)
    phi = pc_relate(ds).compute()["pc_relate_phi"]

    assert isinstance(phi, xr.DataArray)
    assert phi.shape == (1000, 1000)

    # Get genesis/reference results:
    genesis_phi = pd.read_csv(d.joinpath("kinbtwe.csv"))
    genesis_phi = genesis_phi[["ID1", "ID2", "kin"]]
    genesis_phi["ID1"], genesis_phi["ID2"] = genesis_phi.ID1 - 1, genesis_phi.ID2 - 1
    indices = (genesis_phi["ID1"] * 1000 + genesis_phi["ID2"]).to_numpy()
    values = genesis_phi["kin"].to_numpy()
    genesis_phi_full = np.zeros((1000, 1000))
    np.put(genesis_phi_full, indices, values)

    # Compare with reference/GENESIS:
    genesis_phi_s = genesis_phi_full[np.triu_indices_from(genesis_phi_full, 1)]
    phi_s = phi.data[np.triu_indices_from(phi.data, 1)]
    assert len(phi_s) == len(genesis_phi_s)
    assert np.allclose(phi_s, genesis_phi_s)
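One detail worth noting in the test above: np.put uses row-major flat indices, so index ID1 * 1000 + ID2 addresses element (ID1, ID2) of the 1000x1000 matrix. A toy version of the same trick:

    import numpy as np

    full = np.zeros((3, 3))
    np.put(full, [0 * 3 + 2], [0.25])  # pair (0, 2) with kinship 0.25
    assert full[0, 2] == 0.25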