diff --git a/gneiss/__init__.py b/gneiss/__init__.py
index f7feef1..a17a0a9 100644
--- a/gneiss/__init__.py
+++ b/gneiss/__init__.py
@@ -8,8 +8,8 @@
 
 from __future__ import absolute_import, division, print_function
 
-from gneiss._formula import ols
+from gneiss._formula import mixedlm, ols
 
-__all__ = ['ols']
+__all__ = ['ols', 'mixedlm']
 
 __version__ = "0.0.2"
diff --git a/gneiss/_formula.py b/gneiss/_formula.py
index 8877ec1..a4a87f8 100644
--- a/gneiss/_formula.py
+++ b/gneiss/_formula.py
@@ -221,7 +221,127 @@ def ols(formula, table, metadata, tree, **kwargs):
         # http://stackoverflow.com/a/22439820/1167475
         stats_formula = '%s ~ %s' % (b, formula)
 
-        mdf = smf.ols(stats_formula, data=data).fit()
+        mdf = smf.ols(stats_formula, data=data, **kwargs).fit()
         fits.append(mdf)
     return RegressionResults(fits, basis=basis,
                              feature_names=table.columns)
+
+
+def mixedlm(formula, table, metadata, tree, groups, **kwargs):
+    """ Linear Mixed Effects Models applied to balances.
+
+    Parameters
+    ----------
+    formula : str
+        Formula representing the statistical equation to be evaluated.
+        These strings are similar to how equations are handled in R.
+        Note that the dependent variable in this string should not be
+        specified, since this method will be run on each of the individual
+        balances. See `patsy` for more details.
+    table : pd.DataFrame
+        Contingency table where samples correspond to rows and
+        features correspond to columns.
+    metadata : pd.DataFrame
+        Metadata table that contains information about the samples contained
+        in the `table` object. Samples correspond to rows and covariates
+        correspond to columns.
+    tree : skbio.TreeNode
+        Tree object where the leaves correspond to the columns contained in
+        the table.
+    groups : str
+        Column name in `metadata` that specifies the groups. These groups
+        are often associated with individuals repeatedly sampled, typically
+        longitudinally.
+    **kwargs : dict
+        Other arguments accepted by
+        `statsmodels.regression.linear_model.MixedLM`.
+
+    Returns
+    -------
+    RegressionResults
+        Container object that holds information about the overall fit.
+
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> import numpy as np
+    >>> from skbio.stats.composition import ilr_inv
+    >>> from skbio import TreeNode
+    >>> from gneiss import mixedlm
+
+    Here, we will define a table of proportions with 3 features
+    `a`, `b`, and `c` across 12 samples.
+
+    >>> table = pd.DataFrame({
+    ...    'x1': ilr_inv(np.array([1.1, 1.1])),
+    ...    'x2': ilr_inv(np.array([1., 2.])),
+    ...    'x3': ilr_inv(np.array([1.1, 3.])),
+    ...    'y1': ilr_inv(np.array([1., 2.1])),
+    ...    'y2': ilr_inv(np.array([1., 3.1])),
+    ...    'y3': ilr_inv(np.array([1., 4.])),
+    ...    'z1': ilr_inv(np.array([1.1, 5.])),
+    ...    'z2': ilr_inv(np.array([1., 6.1])),
+    ...    'z3': ilr_inv(np.array([1.1, 7.])),
+    ...    'u1': ilr_inv(np.array([1., 6.1])),
+    ...    'u2': ilr_inv(np.array([1., 7.])),
+    ...    'u3': ilr_inv(np.array([1.1, 8.1]))},
+    ...    index=['a', 'b', 'c']).T
+
+    Now we will define the external variables to test for in the model.
+    Here we simulate a hypothetical longitudinal study across 3 time points,
+    with 4 patients `x`, `y`, `z` and `u`, where `x` and `y` were given
+    treatment `1` and `z` and `u` were given treatment `2`.
+
+    >>> metadata = pd.DataFrame({
+    ...    'patient': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4],
+    ...    'treatment': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2],
+    ...    'time': [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]
+    ... }, index=['x1', 'x2', 'x3', 'y1', 'y2', 'y3',
+    ...           'z1', 'z2', 'z3', 'u1', 'u2', 'u3'])
+
+    Finally, we need to define a bifurcating tree used to convert the
+    proportions to balances. If the internal nodes aren't labeled,
+    default labels will be applied (i.e. `y1`, `y2`, ...).
+
+    >>> tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])
+    >>> print(tree.ascii_art())
+              /-c
+    -Y1------|
+             |          /-b
+              \Y2------|
+                        \-a
+
+    Now we can run the linear mixed effects model on the proportions.
+    Under the hood, the proportions will be transformed into balances,
+    so that the linear mixed effects models can be run directly on balances.
+    Since each patient was sampled repeatedly, we'll pass the `patient`
+    column as the groups. In the linear mixed effects model, `time` and
+    `treatment` will be tested simultaneously with respect to the balances.
+
+    >>> res = mixedlm('time + treatment', table, metadata, tree,
+    ...               groups='patient')
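+
+    The fitted parameters are stored in a `RegressionResults` container.
+    For instance, `res.pvalues` collects one row of p-values per balance,
+    and `res.coefficients()` returns the corresponding coefficient table
+    (see the tests below for concrete values).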
+
+    See Also
+    --------
+    statsmodels.regression.linear_model.MixedLM
+    ols
+
+    """
+    table, metadata, tree = _intersect_of_table_metadata_tree(table,
+                                                              metadata,
+                                                              tree)
+    ilr_table, basis = _to_balances(table, tree)
+    data = pd.merge(ilr_table, metadata, left_index=True, right_index=True)
+
+    fits = []
+    for b in ilr_table.columns:
+        # mixed effects code is obtained here:
+        # http://stackoverflow.com/a/22439820/1167475
+        stats_formula = '%s ~ %s' % (b, formula)
+        mdf = smf.mixedlm(stats_formula, data=data,
+                          groups=data[groups],
+                          **kwargs).fit()
+        fits.append(mdf)
+    return RegressionResults(fits, basis=basis,
+                             feature_names=table.columns)
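For orientation (this note is not part of the patch): each pass through the
loop above amounts to a single univariate statsmodels fit on one balance
column. A minimal standalone sketch, using a made-up stand-in for the merged
balance/metadata frame built by `pd.merge` above:

    import pandas as pd
    import statsmodels.formula.api as smf

    # toy stand-in for `data`: one balance column 'Y1', two covariates,
    # and a grouping column (all values fabricated for illustration)
    data = pd.DataFrame({
        'Y1': [1.0, 1.5, 2.2, 0.9, 1.4, 2.1, 1.1, 1.6, 2.3, 1.0, 1.4, 2.2],
        'time': [1, 2, 3] * 4,
        'treatment': [1] * 6 + [2] * 6,
        'patient': [1] * 3 + [2] * 3 + [3] * 3 + [4] * 3})

    # one random intercept per patient -- mirrors groups=data[groups] above
    result = smf.mixedlm('Y1 ~ time + treatment', data=data,
                         groups=data['patient']).fit()
    print(result.pvalues)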
diff --git a/gneiss/_summary.py b/gneiss/_summary.py
index 2bbf326..d5a4f35 100644
--- a/gneiss/_summary.py
+++ b/gneiss/_summary.py
@@ -1,5 +1,3 @@
-#!/usr/bin/env python
-
 # ----------------------------------------------------------------------------
 # Copyright (c) 2016--, gneiss development team.
 #
@@ -39,26 +37,29 @@ def __init__(self,
         self.basis = basis
         self.results = stat_results
 
-        # sum of squares error. Also referred to as sum of squares residuals
-        sse = 0
-        # sum of squares regression. Also referred to as
-        # explained sum of squares.
-        ssr = 0
-        # See `statsmodels.regression.linear_model.RegressionResults`
-        # for more explanation on `ess` and `ssr`.
-
         # obtain pvalues
         self.pvalues = pd.DataFrame()
         for r in self.results:
             p = r.pvalues
             p.name = r.model.endog_names
             self.pvalues = self.pvalues.append(p)
-            sse += r.ssr
-            ssr += r.ess
+
+    @property
+    def r2(self):
+        """ Calculates the coefficient of determination """
+        # This is computed as a property rather than in __init__ because
+        # not all types of statsmodels regressions have these attributes.
+
+        # See `statsmodels.regression.linear_model.RegressionResults`
+        # for more explanation on `ess` and `ssr`.
+        # sum of squares regression. Also referred to as
+        # explained sum of squares.
+        ssr = sum([r.ess for r in self.results])
+        # sum of squares error. Also referred to as sum of squares residuals
+        sse = sum([r.ssr for r in self.results])
         # calculate the overall coefficient of determination (i.e. R2)
         sst = sse + ssr
-        self.r2 = 1 - sse / sst
+        return 1 - sse / sst
 
     def _check_projection(self, project):
         """
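A quick numeric check on the relocated `r2` property (not part of the patch):
pooling the explained (`ess`) and residual (`ssr`) sums of squares across all
balance regressions gives an overall coefficient of determination
R2 = 1 - SSE / (SSE + SSR). For instance, if two balance fits report `ess`
values of 3 and 5 and `ssr` values of 1 and 1, then R2 = 1 - 2 / 10 = 0.8.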
diff --git a/gneiss/tests/test_formula.py b/gneiss/tests/test_formula.py
index c45259f..c26bc91 100644
--- a/gneiss/tests/test_formula.py
+++ b/gneiss/tests/test_formula.py
@@ -11,7 +11,9 @@
 import unittest
 from skbio.stats.composition import ilr_inv
 from skbio import TreeNode
-from gneiss._formula import ols
+from gneiss._formula import ols, mixedlm
+import statsmodels.formula.api as smf
+import numpy.testing as npt
 
 
 class TestOLS(unittest.TestCase):
@@ -150,5 +152,216 @@ def test_ols_empty_metadata(self):
             ols('real + lame', table, metadata, tree)
 
 
+class TestMixedLM(unittest.TestCase):
+
+    def setUp(self):
+        A = np.array  # aliasing for the sake of pep8
+        self.table = pd.DataFrame({
+            'x1': ilr_inv(A([1., 1.])),
+            'x2': ilr_inv(A([1., 2.])),
+            'x3': ilr_inv(A([1., 3.])),
+            'y1': ilr_inv(A([1., 2.])),
+            'y2': ilr_inv(A([1., 3.])),
+            'y3': ilr_inv(A([1., 4.])),
+            'z1': ilr_inv(A([1., 5.])),
+            'z2': ilr_inv(A([1., 6.])),
+            'z3': ilr_inv(A([1., 7.])),
+            'u1': ilr_inv(A([1., 6.])),
+            'u2': ilr_inv(A([1., 7.])),
+            'u3': ilr_inv(A([1., 8.]))},
+            index=['a', 'b', 'c']).T
+        self.tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])
+        self.metadata = pd.DataFrame({
+            'patient': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4],
+            'treatment': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2],
+            'time': [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]
+        }, index=['x1', 'x2', 'x3', 'y1', 'y2', 'y3',
+                  'z1', 'z2', 'z3', 'u1', 'u2', 'u3'])
+
+    # test case borrowed from statsmodels
+    # https://github.com/statsmodels/statsmodels/blob/master/statsmodels
+    # /regression/tests/test_lme.py#L254
+    def test_mixedlm_univariate(self):
+
+        np.random.seed(6241)
+        n = 1600
+        exog = np.random.normal(size=(n, 2))
+        groups = np.kron(np.arange(n // 16), np.ones(16))
+
+        # Build up the random error vector
+        errors = 0
+
+        # The random effects
+        exog_re = np.random.normal(size=(n, 2))
+        slopes = np.random.normal(size=(n // 16, 2))
+        slopes = np.kron(slopes, np.ones((16, 1))) * exog_re
+        errors += slopes.sum(1)
+
+        # First variance component
+        errors += np.kron(2 * np.random.normal(size=n // 4), np.ones(4))
+
+        # Second variance component
+        errors += np.kron(2 * np.random.normal(size=n // 2), np.ones(2))
+
+        # iid errors
+        errors += np.random.normal(size=n)
+
+        endog = exog.sum(1) + errors
+
+        df = pd.DataFrame(index=range(n))
+        df["y"] = endog
+        df["groups"] = groups
+        df["x1"] = exog[:, 0]
+        df["x2"] = exog[:, 1]
+
+        # Equivalent model in R:
+        # df.to_csv("tst.csv")
+        # model = lmer(y ~ x1 + x2 + (0 + z1 + z2 | groups) + (1 | v1) + (1 |
+        # v2), df)
+
+        model1 = smf.mixedlm("y ~ x1 + x2", groups=groups,
+                             data=df)
+        result1 = model1.fit()
+
+        # Compare to R
+        npt.assert_allclose(result1.fe_params, [
+            0.211451, 1.022008, 0.924873], rtol=1e-2)
+        npt.assert_allclose(result1.cov_re, [[0.987738]], rtol=1e-3)
+
+        npt.assert_allclose(result1.bse.iloc[0:3], [
+            0.128377, 0.082644, 0.081031], rtol=1e-3)
+
+    def test_mixedlm_balances(self):
+        np.random.seed(6241)
+        n = 1600
+        exog = np.random.normal(size=(n, 2))
+        groups = np.kron(np.arange(n // 16), np.ones(16))
+
+        # Build up the random error vector
+        errors = 0
+
+        # The random effects
+        exog_re = np.random.normal(size=(n, 2))
+        slopes = np.random.normal(size=(n // 16, 2))
+        slopes = np.kron(slopes, np.ones((16, 1))) * exog_re
+        errors += slopes.sum(1)
+
+        # First variance component
+        errors += np.kron(2 * np.random.normal(size=n // 4), np.ones(4))
+
+        # Second variance component
+        errors += np.kron(2 * np.random.normal(size=n // 2), np.ones(2))
+
+        # iid errors
+        errors += np.random.normal(size=n)
+
+        endog = exog.sum(1) + errors
+
+        df = pd.DataFrame(index=range(n))
+        df["y1"] = endog
+        df["y2"] = endog + 2 * 2
+        df["groups"] = groups
+        df["x1"] = exog[:, 0]
+        df["x2"] = exog[:, 1]
+
+        tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])
+        iv = ilr_inv(df[["y1", "y2"]].values)
+        table = pd.DataFrame(iv, columns=['a', 'b', 'c'])
+        metadata = df[['x1', 'x2', 'groups']]
+
+        res = mixedlm("x1 + x2", table, metadata, tree, groups="groups")
+        exp_pvalues = pd.DataFrame(
+            [[4.923122e-236, 3.180390e-40, 3.972325e-35, 3.568599e-30],
+             [9.953418e-02, 3.180390e-40, 3.972325e-35, 3.568599e-30]],
+            index=['Y1', 'Y2'],
+            columns=['Intercept', 'Intercept RE', 'x1', 'x2'])
+
+        pdt.assert_frame_equal(res.pvalues, exp_pvalues,
+                               check_less_precise=True)
+
+        exp_coefficients = pd.DataFrame(
+            [[4.211451, -0.305906, 1.022008, 0.924873],
+             [0.211451, -0.305906, 1.022008, 0.924873]],
+            columns=['Intercept', 'Intercept RE', 'x1', 'x2'],
+            index=['Y1', 'Y2'])
+
+        pdt.assert_frame_equal(res.coefficients(), exp_coefficients,
+                               check_less_precise=True)
+
+    def test_mixedlm_balances_vcf(self):
+        np.random.seed(6241)
+        n = 1600
+        exog = np.random.normal(size=(n, 2))
+        groups = np.kron(np.arange(n // 16), np.ones(16))
+
+        # Build up the random error vector
+        errors = 0
+
+        # The random effects
+        exog_re = np.random.normal(size=(n, 2))
+        slopes = np.random.normal(size=(n // 16, 2))
+        slopes = np.kron(slopes, np.ones((16, 1))) * exog_re
+        errors += slopes.sum(1)
+
+        # First variance component
+        subgroups1 = np.kron(np.arange(n // 4), np.ones(4))
+        errors += np.kron(2 * np.random.normal(size=n // 4), np.ones(4))
+
+        # Second variance component
+        subgroups2 = np.kron(np.arange(n // 2), np.ones(2))
+        errors += np.kron(2 * np.random.normal(size=n // 2), np.ones(2))
+
+        # iid errors
+        errors += np.random.normal(size=n)
+
+        endog = exog.sum(1) + errors
+
+        df = pd.DataFrame(index=range(n))
+        df["y1"] = endog
+        df["y2"] = endog + 2 * 2
+        df["groups"] = groups
+        df["x1"] = exog[:, 0]
+        df["x2"] = exog[:, 1]
+        df["z1"] = exog_re[:, 0]
+        df["z2"] = exog_re[:, 1]
+        df["v1"] = subgroups1
+        df["v2"] = subgroups2
+
+        tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])
+        iv = ilr_inv(df[["y1", "y2"]].values)
+        table = pd.DataFrame(iv, columns=['a', 'b', 'c'])
+        metadata = df[['x1', 'x2', 'z1', 'z2', 'v1', 'v2', 'groups']]
+
+        res = mixedlm("x1 + x2", table, metadata, tree, groups="groups",
+                      re_formula="0+z1+z2")
+
+        exp_pvalues = pd.DataFrame([
+            [0.000000, 3.858750e-39, 2.245068e-33,
+             2.434437e-35, 0.776775, 6.645741e-34],
+            [0.038015, 3.858750e-39, 2.245068e-33,
+             2.434437e-35, 0.776775, 6.645741e-34]],
+            columns=['Intercept', 'x1', 'x2', 'z1 RE',
+                     'z1 RE x z2 RE', 'z2 RE'],
+            index=['Y1', 'Y2'])
+        exp_coefficients = pd.DataFrame(
+            [[4.163141, 1.030013, 0.935514, 0.339239, -0.005792, 0.38456],
+             [0.163141, 1.030013, 0.935514, 0.339239, -0.005792, 0.38456]],
+            columns=['Intercept', 'x1', 'x2', 'z1 RE',
+                     'z1 RE x z2 RE', 'z2 RE'],
+            index=['Y1', 'Y2'])
+
+        pdt.assert_frame_equal(res.pvalues, exp_pvalues,
+                               check_less_precise=True)
+
+        pdt.assert_frame_equal(res.coefficients(), exp_coefficients,
+                               check_less_precise=True)
+
+
 if __name__ == '__main__':
     unittest.main()
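One note on the last test (not part of the patch): the `re_formula="0+z1+z2"`
keyword is not consumed by gneiss itself; it is forwarded through `**kwargs`
to statsmodels' `MixedLM`, where it specifies random slopes for `z1` and `z2`
within each group, with the `0+` suppressing the default random intercept.
That is why the expected output columns switch from `Intercept RE` to
`z1 RE`, `z2 RE`, and their covariance term `z1 RE x z2 RE`.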
diff --git a/ipynb/balance_trees.ipynb b/ipynb/balance_trees.ipynb
index 16fcd27..bf1f937 100644
--- a/ipynb/balance_trees.ipynb
+++ b/ipynb/balance_trees.ipynb
@@ -45,7 +45,7 @@
     "$$\n",
     "b_i = \\sqrt{\\frac{l_i r_i}{l_i + r_i}} \n",
     "    \\log \\big( \\frac{(\\prod_{x_n \\in L}{x_n})^{1/l_i}}\n",
-    "            {(\\prod_{x_m \\in L}{x_m})^{1/r_i}} \\big)\n",
+    "            {(\\prod_{x_m \\in R}{x_m})^{1/r_i}} \\big)\n",
     "$$\n",
     "\n",
     "where $b_i$ is the $i$th balance, $l_i$ is the number of leaves in the left subtree, $r_i$ is the number of leaves in the right subtree, $x_n$ are the abundances of the species in the left subtree, and $x_m$ are the abundances of species in the right subtree.\n"
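As a sanity check on the corrected formula, here is a small self-contained
sketch (not part of the diff) that evaluates the two balances of the example
tree `(c, (b,a)Y2)Y1;` directly from the definition. The proportions are made
up, and the left/right ordering of each split is an assumption that may
differ from gneiss's internal basis convention:

    import numpy as np

    def balance(left, right):
        """b_i = sqrt(l*r / (l+r)) * log(gmean(left) / gmean(right))."""
        l, r = len(left), len(right)

        def gmean(v):
            # geometric mean of the proportions in one subtree
            return np.exp(np.mean(np.log(v)))

        return np.sqrt(l * r / (l + r)) * np.log(gmean(left) / gmean(right))

    # hypothetical proportions for leaves a, b, c
    x = {'a': 0.2, 'b': 0.3, 'c': 0.5}
    b_Y1 = balance([x['c']], [x['b'], x['a']])  # root split: c vs (b, a)
    b_Y2 = balance([x['b']], [x['a']])          # Y2 split: b vs a
    print(b_Y1, b_Y2)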