PC Relate #133

Closed · wants to merge 1 commit
134 changes: 134 additions & 0 deletions sgkit/stats/pc_relate.py
@@ -0,0 +1,134 @@
import dask.array as da
import xarray as xr

from sgkit.typing import ArrayLike


def gramian(a: ArrayLike) -> ArrayLike:
    """Returns the Gramian matrix ``a.T @ a`` of the given matrix."""
    return a.T.dot(a)
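As a quick illustration with toy values (not from this PR), the Gramian collects all pairwise column inner products:

    import numpy as np

    a = np.array([[1.0, 0.0],
                  [2.0, 1.0]])
    print(gramian(a))  # [[5., 2.], [2., 1.]]: dot products of a's columns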


def _impute_genotype_call_with_variant_mean(
    call_g: xr.DataArray, call_g_mask: xr.DataArray
) -> xr.DataArray:
    """Imputes masked genotype calls with the per-variant mean of the present calls."""
    call_g_present = ~call_g_mask  # type: ignore[operator]
    variant_mean = call_g.where(call_g_present).mean(dim="samples")
    imputed_call_g: xr.DataArray = call_g.where(call_g_present, variant_mean)
    return imputed_call_g
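For intuition, a minimal sketch of the helper above on toy data (values invented for illustration):

    import xarray as xr

    g = xr.DataArray([[0.0, 2.0, 1.0]], dims=("variants", "samples"))
    mask = xr.DataArray([[False, True, False]], dims=("variants", "samples"))
    # The masked call is replaced by the variant mean of the present calls:
    # (0.0 + 1.0) / 2 = 0.5, giving [[0.0, 0.5, 1.0]]
    print(_impute_genotype_call_with_variant_mean(g, mask))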


def pc_relate(ds: xr.Dataset, maf: float = 0.01) -> xr.Dataset:

[Collaborator Author] @eric-czech please ignore the test code for now; the current test is there to validate results against the reference implementation (but I need to make it smaller etc.). I'm curious to hear your opinion about the implementation itself, though. Do you have any suggestions about the xr/dask use?

[Collaborator] Had a few minor suggestions, but looks good to me! Some array shape checks would definitely help in thinking about the scalability, though.
"""Compute PC-Relate as described in Conomos, et al. 2016 [1].

Parameters
----------
ds : `xr.Dataset`
Dataset containing (S = num samples, V = num variants, D = ploidy, PC = num PC):
* genotype calls: "call_genotype" (SxVxD)
* genotype calls mask: "call_genotype_mask" (SxVxD)
* sample PCs: "sample_pcs" (PCxS)
maf : float
individual minor allele frequency filter. If an individual's estimated
individual-specific minor allele frequency at a SNP is less than this value,
that SNP will be excluded from the analysis for that individual.
The default value is 0.01. Must be between (0.0, 0.1).


This method computes the kinship coefficient matrix. The kinship coefficient for
a pair of individuals ``i`` and ``j`` is commonly defined to be the probability that
a random allele selected from ``i`` and a random allele selected from ``j`` at
a locus are IBD. Several of the most common family relationships and their
corresponding kinship coefficient:

+--------------------------------------------------+---------------------+
| Relationship | Kinship coefficient |
+==================================================+=====================+
| Individual-self | 1/2 |
+--------------------------------------------------+---------------------+
| full sister/full brother | 1/4 |
+--------------------------------------------------+---------------------+
| mother/father/daughter/son | 1/4 |
+--------------------------------------------------+---------------------+
| grandmother/grandfather/granddaughter/grandson | 1/8 |
+--------------------------------------------------+---------------------+
| aunt/uncle/niece/nephew | 1/8 |
+--------------------------------------------------+---------------------+
| first cousin | 1/16 |
+--------------------------------------------------+---------------------+
| half-sister/half-brother | 1/8 |
+--------------------------------------------------+---------------------+

Warnings
--------
This function is only applicable to diploid, biallelic datasets.

Returns
-------
Dataset
Dataset containing (S = num samples):
pc_relate_phi: (S,S) ArrayLike
pairwise recent kinship coefficient matrix as float in [-0.5, 0.5].

References
----------
- [1] Conomos, Matthew P., Alexander P. Reiner, Bruce S. Weir, and Timothy A. Thornton. 2016.
"Model-Free Estimation of Recent Genetic Relatedness."
American Journal of Human Genetics 98 (1): 127–48.

Raises
------
ValueError
* If ploidy of provided dataset != 2
* If maximum number of alleles in provided dataset != 2
* Input dataset is missing any of the required variables
* If maf is not in (0.0, 1.0)
"""
    if maf <= 0.0 or maf >= 1.0:
        raise ValueError("maf must be within (0.0, 1.0)")
    if "ploidy" in ds.dims and ds.dims["ploidy"] != 2:
        raise ValueError("PC Relate only works for diploid genotypes")
    if "alleles" in ds.dims and ds.dims["alleles"] != 2:
        raise ValueError("PC Relate only works for biallelic genotypes")
    if "call_genotype" not in ds:
        raise ValueError("Input dataset must contain call_genotype")
    if "call_genotype_mask" not in ds:
        raise ValueError("Input dataset must contain call_genotype_mask")
    if "sample_pcs" not in ds:
        raise ValueError("Input dataset must contain sample_pcs")

    call_g_mask = ds["call_genotype_mask"].any(dim="ploidy")
    call_g = xr.where(call_g_mask, -1, ds["call_genotype"].sum(dim="ploidy"))  # type: ignore[no-untyped-call]
    imputed_call_g = _impute_genotype_call_with_variant_mean(call_g, call_g_mask)

    # 𝔼[gs|V] = 1β0 + Vβ, where 1 is a length S vector of ones, and β = (β1,...,βD)^T
    # is a length D vector of regression coefficients for each of the PCs
    pcs = ds["sample_pcs"]
    pcsi = da.concatenate([da.ones((1, pcs.shape[1]), dtype=pcs.dtype), pcs], axis=0)
    # Note: dask QR decomposition requires no chunking in one dimension, and because
    # the number of components should be smaller than the number of samples in most
    # cases, we disable chunking on components
    pcsi = pcsi.T.rechunk((None, -1))
[Collaborator] 👍

    q, r = da.linalg.qr(pcsi)
[Collaborator] Remind me again how many PCs are typically provided? If we need to, there's probably a way to do this without the unit-chunking requirement, but I don't remember if the number of PCs is ever big enough to matter.

[Collaborator Author, ravwojdyla, Aug 24, 2020] I believe the number of PCs should be relatively low. From UKBB:

    Principal components should ideally be computed using a subset of high quality, unrelated samples. However, the metrics used to find related samples, as well as poorer quality samples themselves require information about population structure (see Sections S 3.5 and S 3.7). We therefore conducted an initial round of PCA, computing just the top 8 PCs, using a set of unrelated samples based on an initial round of kinship estimation. We used the results of this analysis to compute PC adjusted heterozygosity as well as refine the relatedness inference. Having then identified a set of high quality, unrelated samples, we conducted a second round of PCA, computing the first 40 PCs. Results of the second round are made available to researchers and visualised in Figure 3A, Figure S6-S7, Extended Data Figure 3, and discussed in the main text.

Moreover, for PC-Relate, as far as I understand, the number can be limited to only the components that best capture the populations. From Conomos, 2016, "Model-Free Estimation of Recent Genetic Relatedness":

    The appropriate number of PCs that should be used to adjust for population structure in a PC-Relate analysis will depend on the sample structure. A reasonable set of PCs can often be selected by examining scatter plots and parallel coordinates plots of the top PCs to identify which ones appear to reflect population structure, and by examining a scree plot of the eigenvalues to identify a point of separation between PCs that explain a significant proportion of the total variation in the data and those that explain little variation. However, making the appropriate choice can be challenging, so we investigated the sensitivity of relatedness inference with PC-Relate to the number of PCs used in the analysis. Consider population structure II, where there are only two dimensions of population structure (i.e., ancestry contributed by three subpopulations). Figure S12 displays the kinship coefficient estimates from PC-Relate using varying numbers of PCs. Relatedness estimates with PC-Relate were nearly identical when using the top 2, 5, 10, or 20 PCs. However, including the top 100 PCs, which is 50 times more PCs than are required to explain the population structure in the sample, resulted in a substantial increase in variability. In this particular setting, choosing the number of PCs to be within a factor of 10 of the appropriate number allowed for accurate relatedness inference with PC-Relate. These results suggest that PC-Relate is quite robust to the choice of PCs, provided that there are a sufficient number of PCs included in the relatedness analysis to fully capture the population structure in the sample.

[Collaborator] Ah, perfect, no need to worry about that then! Thanks for pulling those references.
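For context on the exchange above: dask's QR implementation requires the input to be chunked along a single dimension, which is what the rechunk((None, -1)) call guarantees. A minimal sketch, with shapes invented for illustration:

    import dask.array as da

    # Tall-and-skinny: many row chunks, a single chunk along columns.
    x = da.random.random((100_000, 16), chunks=(10_000, 16))
    q, r = da.linalg.qr(x)  # fine
    # An array chunked along both dimensions, e.g. chunks=(10_000, 8),
    # would make da.linalg.qr raise, hence pcsi.T.rechunk((None, -1)) above.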

    # mu, eq: 3
    half_beta = da.linalg.inv(2 * r).dot(q.T).dot(imputed_call_g.T)
    mu = pcsi.dot(half_beta).T
    # phi, eq: 4
    mask = (mu <= maf) | (mu >= 1.0 - maf) | call_g_mask
    mu_mask = da.ma.masked_array(mu, mask=mask)
    variance = mu_mask * (1.0 - mu_mask)
    variance = da.ma.filled(variance, fill_value=0.0)
    stddev = da.sqrt(variance)
    centered_af = call_g / 2 - mu_mask
    centered_af = da.ma.filled(centered_af, fill_value=0.0)
    # NOTE: gramian could be a performance bottleneck, and we could explore
    # performance improvements like (or maybe something else):
    # * calculating only the pairs we are interested in
    # * using an optimized einsum

[Collaborator] I think some shape assertions along the way could be really helpful to spot why this is.

    assert centered_af.shape == call_g.shape
    assert stddev.shape == call_g.shape
    phi = gramian(centered_af) / gramian(stddev)
    # NOTE: phi is of shape (S x S), S = num samples
    assert phi.shape == (call_g.shape[1],) * 2
    return xr.Dataset({"pc_relate_phi": (("sample_x", "sample_y"), phi)})
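To make the intended wiring concrete, a hypothetical end-to-end run; the simulated dataset and the random placeholder PCs are assumptions for illustration, not part of this diff (real use would derive sample_pcs from a PCA over unrelated samples):

    import dask.array as da

    from sgkit.stats.pc_relate import pc_relate
    from sgkit.testing import simulate_genotype_call_dataset

    ds = simulate_genotype_call_dataset(n_variant=100, n_sample=20, missing_pct=0.1, seed=0)
    # Random placeholder PCs with shape (components, samples).
    ds["sample_pcs"] = (("components", "samples"), da.random.random((2, 20)))
    phi = pc_relate(ds, maf=0.01).compute()["pc_relate_phi"]
    assert phi.shape == (20, 20)  # pairwise kinship coefficient matrix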
10 changes: 10 additions & 0 deletions sgkit/testing.py
@@ -14,6 +14,7 @@ def simulate_genotype_call_dataset(
    n_allele: int = 2,
    n_contig: int = 1,
    seed: Optional[int] = None,
    missing_pct: Optional[float] = None,
[Collaborator] 😁

) -> Dataset:
    """Simulate genotype calls and variant/sample data.

@@ -41,16 +42,25 @@
        will all be 0 by default with `n_contig` == 1.
    seed : int, optional
        Seed for random number generation
    missing_pct : float, optional
        Proportion of missing calls; must be within [0.0, 1.0]

    Returns
    -------
    Dataset
        Dataset from `sgkit.create_genotype_call_dataset`.
    """
    if missing_pct and (missing_pct < 0.0 or missing_pct > 1.0):
        raise ValueError("missing_pct must be within [0.0, 1.0]")
    rs = np.random.RandomState(seed=seed)
    call_genotype = rs.randint(
        0, n_allele, size=(n_variant, n_sample, n_ploidy), dtype=np.int8
    )
    if missing_pct:
        call_genotype = np.where(
            rs.rand(*call_genotype.shape) < missing_pct, -1, call_genotype
        )

    contig_size = split_array_chunks(n_variant, n_contig)
    contig = np.repeat(np.arange(n_contig), contig_size)
    contig_names = np.unique(contig)
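As a usage sketch for the new parameter (assuming, per sgkit conventions, that -1 marks a missing call and is reflected in call_genotype_mask):

    from sgkit.testing import simulate_genotype_call_dataset

    ds = simulate_genotype_call_dataset(n_variant=1000, n_sample=10, missing_pct=0.2, seed=0)
    # Each call is dropped independently with probability 0.2, so the observed
    # missing fraction should be close to 0.2.
    print(float(ds["call_genotype_mask"].mean()))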
38 changes: 38 additions & 0 deletions sgkit/tests/test_pc_relate.py
@@ -0,0 +1,38 @@
from pathlib import Path

import dask.array as da
import numpy as np
import pandas as pd
import xarray as xr

from sgkit.stats.pc_relate import pc_relate

# TODO (rav): finish up tests/validation, clean and split


def test_same_as_reference_implementation() -> None:
    d = Path(__file__).parent.joinpath("test_pc_relate")
    ds = xr.open_zarr(d.joinpath("zarr_data").as_posix())  # type: ignore[no-untyped-call]
    pcs = da.from_array(
        pd.read_csv(d.joinpath("pcs.csv").as_posix(), usecols=[1, 2]).to_numpy()
    ).T
    ds["sample_pcs"] = (("components", "samples"), pcs)
    phi = pc_relate(ds).compute()["pc_relate_phi"]

    assert isinstance(phi, xr.DataArray)
    assert phi.shape == (1000, 1000)

    # Get genesis/reference results:
    genesis_phi = pd.read_csv(d.joinpath("kinbtwe.csv"))
    genesis_phi = genesis_phi[["ID1", "ID2", "kin"]]
    genesis_phi["ID1"], genesis_phi["ID2"] = genesis_phi.ID1 - 1, genesis_phi.ID2 - 1
    indices = (genesis_phi["ID1"] * 1000 + genesis_phi["ID2"]).to_numpy()
    values = genesis_phi["kin"].to_numpy()
    genesis_phi_full = np.zeros((1000, 1000))
    np.put(genesis_phi_full, indices, values)

    # Compare with reference/GENESIS:
    genesis_phi_s = genesis_phi_full[np.triu_indices_from(genesis_phi_full, 1)]
    phi_s = phi.data[np.triu_indices_from(phi.data, 1)]
    assert len(phi_s) == len(genesis_phi_s)
    assert np.allclose(phi_s, genesis_phi_s)
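One detail worth noting in the test above: np.put uses row-major flat indices, so index ID1 * 1000 + ID2 addresses element (ID1, ID2) of the 1000x1000 matrix. A toy version of the same trick:

    import numpy as np

    full = np.zeros((3, 3))
    np.put(full, [0 * 3 + 2], [0.25])  # pair (0, 2) with kinship 0.25
    assert full[0, 2] == 0.25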