Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lme2 #38

Merged
merged 14 commits into from
Aug 18, 2016
4 changes: 2 additions & 2 deletions gneiss/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@

from __future__ import absolute_import, division, print_function

from gneiss._formula import ols
from gneiss._formula import mixedlm, ols

__all__ = ['ols']
__all__ = ['ols', 'mixedlm']

__version__ = "0.0.2"
122 changes: 121 additions & 1 deletion gneiss/_formula.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def _intersect_of_table_metadata_tree(table, metadata, tree):

def _to_balances(table, tree):
""" Converts a table of abundances to balances given a tree.

Parameters
----------
table : pd.DataFrame
Expand Down Expand Up @@ -197,7 +198,126 @@ def ols(formula, table, metadata, tree, **kwargs):
# http://stackoverflow.com/a/22439820/1167475
stats_formula = '%s ~ %s' % (b, formula)

mdf = smf.ols(stats_formula, data=data).fit()
mdf = smf.ols(stats_formula, data=data, **kwargs).fit()
fits.append(mdf)
return RegressionResults(fits, basis=basis,
feature_names=table.columns)


def mixedlm(formula, table, metadata, tree, groups, **kwargs):
    """ Linear Mixed Effects Models applied to balances.

    Parameters
    ----------
    formula : str
        Formula representing the statistical equation to be evaluated.
        These strings are similar to how equations are handled in R.
        Note that the dependent variable in this string should not be
        specified, since this method will be run on each of the individual
        balances. See `patsy` for more details.
    table : pd.DataFrame
        Contingency table where samples correspond to rows and
        features correspond to columns.
    metadata : pd.DataFrame
        Metadata table that contains information about the samples contained
        in the `table` object. Samples correspond to rows and covariates
        correspond to columns.
    tree : skbio.TreeNode
        Tree object where the leaves correspond to the columns contained in
        the table.
    groups : str
        Column name in `metadata` that specifies the groups. These groups are
        often associated with individuals repeatedly sampled, typically
        longitudinally.
    **kwargs : dict
        Other arguments accepted into
        `statsmodels.regression.linear_model.MixedLM`.

    Returns
    -------
    RegressionResults
        Container object that holds information about the overall fit.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from skbio.stats.composition import ilr_inv
    >>> from skbio import TreeNode
    >>> from gneiss import mixedlm

    Here, we will define a table of proportions with 3 features
    `a`, `b`, and `c` across 12 samples.

    >>> table = pd.DataFrame({
    ...     'x1': ilr_inv(np.array([1.1, 1.1])),
    ...     'x2': ilr_inv(np.array([1., 2.])),
    ...     'x3': ilr_inv(np.array([1.1, 3.])),
    ...     'y1': ilr_inv(np.array([1., 2.1])),
    ...     'y2': ilr_inv(np.array([1., 3.1])),
    ...     'y3': ilr_inv(np.array([1., 4.])),
    ...     'z1': ilr_inv(np.array([1.1, 5.])),
    ...     'z2': ilr_inv(np.array([1., 6.1])),
    ...     'z3': ilr_inv(np.array([1.1, 7.])),
    ...     'u1': ilr_inv(np.array([1., 6.1])),
    ...     'u2': ilr_inv(np.array([1., 7.])),
    ...     'u3': ilr_inv(np.array([1.1, 8.1]))},
    ...     index=['a', 'b', 'c']).T

    Now we are going to define some of the external variables to
    test for in the model. Here we will be testing a hypothetical
    longitudinal study across 3 time points, with 4 patients
    `x`, `y`, `z` and `u`, where `x` and `y` were given treatment `1`
    and `z` and `u` were given treatment `2`.

    >>> metadata = pd.DataFrame({
    ...     'patient': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4],
    ...     'treatment': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2],
    ...     'time': [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]
    ... }, index=['x1', 'x2', 'x3', 'y1', 'y2', 'y3',
    ...           'z1', 'z2', 'z3', 'u1', 'u2', 'u3'])

    Finally, we need to define a bifurcating tree used to convert the
    proportions to balances. If the internal nodes aren't labelled,
    a default labeling will be applied (i.e. `y1`, `y2`, ...)

    >>> tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])
    >>> print(tree.ascii_art())
              /-c
    -Y1------|
             |          /-b
              \\Y2------|
                        \\-a

    Now we can run the linear mixed effects model on the proportions.
    Underneath the hood, the proportions will be transformed into balances,
    so that the linear mixed effects models can be run directly on balances.
    Since each patient was sampled repeatedly, we'll specify them separately
    in the groups. In the linear mixed effects model `time` and `treatment`
    will be simultaneously tested for with respect to the balances.

    >>> res = mixedlm('time + treatment', table, metadata, tree,
    ...               groups='patient')

    See Also
    --------
    statsmodels.regression.linear_model.MixedLM
    ols

    """
    # Restrict the analysis to the samples/features shared by all inputs.
    table, metadata, tree = _intersect_of_table_metadata_tree(table,
                                                              metadata,
                                                              tree)
    # Convert proportions to balances via the ILR transform induced by `tree`.
    ilr_table, basis = _to_balances(table, tree)
    data = pd.merge(ilr_table, metadata, left_index=True, right_index=True)

    # Fit one mixed effects model per balance; the balance is the response.
    fits = []
    for b in ilr_table.columns:
        # mixed effects code is obtained here:
        # http://stackoverflow.com/a/22439820/1167475
        stats_formula = '%s ~ %s' % (b, formula)
        mdf = smf.mixedlm(stats_formula, data=data,
                          groups=data[groups],
                          **kwargs).fit()
        fits.append(mdf)
    return RegressionResults(fits, basis=basis,
                             feature_names=table.columns)
27 changes: 14 additions & 13 deletions gneiss/_summary.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
#!/usr/bin/env python

# ----------------------------------------------------------------------------
# Copyright (c) 2016--, gneiss development team.
#
Expand Down Expand Up @@ -39,26 +37,29 @@ def __init__(self,
self.basis = basis
self.results = stat_results

# sum of squares error. Also referred to as sum of squares residuals
sse = 0
# sum of squares regression. Also referred to as
# explained sum of squares.
ssr = 0
# See `statsmodels.regression.linear_model.RegressionResults`
# for more explanation on `ess` and `ssr`.

# obtain pvalues
self.pvalues = pd.DataFrame()
for r in self.results:
p = r.pvalues
p.name = r.model.endog_names
self.pvalues = self.pvalues.append(p)
sse += r.ssr
ssr += r.ess

@property
def r2(self):
    """ Calculates the coefficient of determination (R^2) across all fits.

    Returns
    -------
    float
        The overall coefficient of determination, pooled over every
        per-balance regression result stored in ``self.results``.
    """
    # Reason why we wanted to move this out of the constructor was because
    # not all types of statsmodels regressions have this property
    # (e.g. linear mixed effects models don't have an r2).

    # See `statsmodels.regression.linear_model.RegressionResults`
    # for more explanation on `ess` and `ssr`.
    # sum of squares regression. Also referred to as
    # explained sum of squares.
    ssr = sum([r.ess for r in self.results])
    # sum of squares error. Also referred to as sum of squares residuals
    sse = sum([r.ssr for r in self.results])
    # calculate the overall coefficient of determination (i.e. R2).
    # NOTE: the leftover `self.r2 = ...` assignment was removed — assigning
    # to a read-only property raises AttributeError; a property must only
    # return the computed value.
    sst = sse + ssr
    return 1 - sse / sst
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are there any tests that need to be modified/created?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nope. This is handled within the ols results. Note that I wanted to move this outside of the constructor, because linear mixed effects models don't have an r2.


def _check_projection(self, project):
"""
Expand Down
132 changes: 131 additions & 1 deletion gneiss/tests/test_formula.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
import unittest
from skbio.stats.composition import ilr_inv
from skbio import TreeNode
from gneiss._formula import ols
from gneiss._formula import ols, mixedlm
import statsmodels.formula.api as smf
import numpy.testing as npt


class TestOLS(unittest.TestCase):
Expand Down Expand Up @@ -150,5 +152,133 @@ def test_ols_empty_metadata(self):
ols('real + lame', table, metadata, tree)


class TestMixedLM(unittest.TestCase):
    """ Tests for `gneiss.mixedlm`, both against statsmodels directly and
    through the balance-transforming wrapper. """

    def setUp(self):
        # Simulated longitudinal design: 4 subjects (x, y, z, u), 3 time
        # points each, proportions over features a, b, c derived from the
        # inverse ILR of hand-picked balances.
        A = np.array  # aliasing for the sake of pep8
        self.table = pd.DataFrame({
            'x1': ilr_inv(A([1., 1.])),
            'x2': ilr_inv(A([1., 2.])),
            'x3': ilr_inv(A([1., 3.])),
            'y1': ilr_inv(A([1., 2.])),
            'y2': ilr_inv(A([1., 3.])),
            'y3': ilr_inv(A([1., 4.])),
            'z1': ilr_inv(A([1., 5.])),
            'z2': ilr_inv(A([1., 6.])),
            'z3': ilr_inv(A([1., 7.])),
            'u1': ilr_inv(A([1., 6.])),
            'u2': ilr_inv(A([1., 7.])),
            'u3': ilr_inv(A([1., 8.]))},
            index=['a', 'b', 'c']).T
        self.tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])
        self.metadata = pd.DataFrame({
            'patient': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4],
            'treatment': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2],
            'time': [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]
        }, index=['x1', 'x2', 'x3', 'y1', 'y2', 'y3',
                  'z1', 'z2', 'z3', 'u1', 'u2', 'u3'])

    # test case borrowed from statsmodels
    # https://github.com/statsmodels/statsmodels/blob/master/statsmodels
    # /regression/tests/test_lme.py#L254
    def test_mixedlm_univariate(self):

        np.random.seed(6241)
        n = 1600
        exog = np.random.normal(size=(n, 2))
        # NOTE: `n // 16` (integer division) is required — `size=` and
        # `arange` with a float raise TypeError on modern numpy.
        groups = np.kron(np.arange(n // 16), np.ones(16))

        # Build up the random error vector
        errors = 0

        # The random effects
        exog_re = np.random.normal(size=(n, 2))
        slopes = np.random.normal(size=(n // 16, 2))
        slopes = np.kron(slopes, np.ones((16, 1))) * exog_re
        errors += slopes.sum(1)

        # First variance component
        errors += np.kron(2 * np.random.normal(size=n // 4), np.ones(4))

        # Second variance component
        errors += np.kron(2 * np.random.normal(size=n // 2), np.ones(2))

        # iid errors
        errors += np.random.normal(size=n)

        endog = exog.sum(1) + errors

        df = pd.DataFrame(index=range(n))
        df["y"] = endog
        df["groups"] = groups
        df["x1"] = exog[:, 0]
        df["x2"] = exog[:, 1]

        # Equivalent model in R:
        # df.to_csv("tst.csv")
        # model = lmer(y ~ x1 + x2 + (0 + z1 + z2 | groups) + (1 | v1) + (1 |
        # v2), df)

        model1 = smf.mixedlm("y ~ x1 + x2", groups=groups,
                             data=df)
        result1 = model1.fit()

        # Compare to R
        npt.assert_allclose(result1.fe_params, [
            0.211451, 1.022008, 0.924873], rtol=1e-2)
        npt.assert_allclose(result1.cov_re, [[0.987738]], rtol=1e-3)

        npt.assert_allclose(result1.bse.iloc[0:3], [
            0.128377, 0.082644, 0.081031], rtol=1e-3)

    def test_mixedlm_balances(self):
        # Same simulated design as above, but run through the gneiss
        # `mixedlm` wrapper on a 2-balance (3-feature) table.
        np.random.seed(6241)
        n = 1600
        exog = np.random.normal(size=(n, 2))
        # Integer division again: float sizes raise TypeError on modern numpy.
        groups = np.kron(np.arange(n // 16), np.ones(16))

        # Build up the random error vector
        errors = 0

        # The random effects
        exog_re = np.random.normal(size=(n, 2))
        slopes = np.random.normal(size=(n // 16, 2))
        slopes = np.kron(slopes, np.ones((16, 1))) * exog_re
        errors += slopes.sum(1)

        # First variance component
        errors += np.kron(2 * np.random.normal(size=n // 4), np.ones(4))

        # Second variance component
        errors += np.kron(2 * np.random.normal(size=n // 2), np.ones(2))

        # iid errors
        errors += np.random.normal(size=n)

        endog = exog.sum(1) + errors

        df = pd.DataFrame(index=range(n))
        df["y1"] = endog
        df["y2"] = endog + 2 * 2
        df["groups"] = groups
        df["x1"] = exog[:, 0]
        df["x2"] = exog[:, 1]

        tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])
        iv = ilr_inv(df[["y1", "y2"]].values)
        table = pd.DataFrame(iv, columns=['a', 'b', 'c'])
        metadata = df[['x1', 'x2', 'groups']]

        res = mixedlm("x1 + x2", table, metadata, tree, groups="groups")
        exp_pvalues = pd.DataFrame(
            [[4.923122e-236, 3.180390e-40, 3.972325e-35, 3.568599e-30],
             [9.953418e-02, 3.180390e-40, 3.972325e-35, 3.568599e-30]],
            index=['Y1', 'Y2'],
            columns=['Intercept', 'Intercept RE', 'x1', 'x2'])

        pdt.assert_frame_equal(res.pvalues, exp_pvalues,
                               check_less_precise=True)


# Allow the tests to be run directly (``python test_formula.py``) as well as
# via a test runner.
if __name__ == '__main__':
    unittest.main()
2 changes: 1 addition & 1 deletion ipynb/balance_trees.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
"$$\n",
"b_i = \\sqrt{\\frac{l_i r_i}{l_i + r_i}} \n",
" \\log \\big( \\frac{(\\prod_{x_n \\in L}{x_n})^{1/l_i}}\n",
" {(\\prod_{x_m \\in L}{x_m})^{1/r_i}} \\big)\n",
" {(\\prod_{x_m \\in R}{x_m})^{1/r_i}} \\big)\n",
"$$\n",
"\n",
"where $b_i$ is the $i$th balance, $l_i$ is the number of leaves in the left subtree, $r_i$ is the number of leaves in the right subtree $x_n$ are the abundances of the species in the left subtree, and $x_m$ are the abundances of species in the right subtree.\n"
Expand Down