
Commit

Merge pull request #38 from mortonjt/lme2
Lme2
ElDeveloper authored Aug 18, 2016
2 parents eb871ea + 9922347 commit 47f577f
Showing 5 changed files with 352 additions and 18 deletions.
4 changes: 2 additions & 2 deletions gneiss/__init__.py
@@ -8,8 +8,8 @@

from __future__ import absolute_import, division, print_function

from gneiss._formula import ols
from gneiss._formula import mixedlm, ols

__all__ = ['ols']
__all__ = ['ols', 'mixedlm']

__version__ = "0.0.2"
122 changes: 121 additions & 1 deletion gneiss/_formula.py
@@ -221,7 +221,127 @@ def ols(formula, table, metadata, tree, **kwargs):
# http://stackoverflow.com/a/22439820/1167475
stats_formula = '%s ~ %s' % (b, formula)

mdf = smf.ols(stats_formula, data=data).fit()
mdf = smf.ols(stats_formula, data=data, **kwargs).fit()
fits.append(mdf)
return RegressionResults(fits, basis=basis,
feature_names=table.columns)


def mixedlm(formula, table, metadata, tree, groups, **kwargs):
""" Linear Mixed Effects Models applied to balances.
Parameters
----------
formula : str
Formula representing the statistical equation to be evaluated.
These strings are similar to how equations are handled in R.
Note that the dependent variable in this string should not be
specified, since this method will be run on each of the individual
balances. See `patsy` for more details.
table : pd.DataFrame
Contingency table where samples correspond to rows and
features correspond to columns.
metadata : pd.DataFrame
Metadata table that contains information about the samples contained
in the `table` object. Samples correspond to rows and covariates
correspond to columns.
tree : skbio.TreeNode
Tree object where the leaves correspond to the columns contained in
the table.
groups : str
Column name in `metadata` that specifies the groups. These groups are
often associated with individuals that were sampled repeatedly, typically
longitudinally.
**kwargs : dict
Other arguments accepted by
`statsmodels.regression.linear_model.MixedLM`.

Returns
-------
RegressionResults
Container object that holds information about the overall fit.

Examples
--------
>>> import pandas as pd
>>> import numpy as np
>>> from skbio.stats.composition import ilr_inv
>>> from skbio import TreeNode
>>> from gneiss import mixedlm

Here, we will define a table of proportions with 3 features
`a`, `b`, and `c` across 12 samples.

>>> table = pd.DataFrame({
... 'x1': ilr_inv(np.array([1.1, 1.1])),
... 'x2': ilr_inv(np.array([1., 2.])),
... 'x3': ilr_inv(np.array([1.1, 3.])),
... 'y1': ilr_inv(np.array([1., 2.1])),
... 'y2': ilr_inv(np.array([1., 3.1])),
... 'y3': ilr_inv(np.array([1., 4.])),
... 'z1': ilr_inv(np.array([1.1, 5.])),
... 'z2': ilr_inv(np.array([1., 6.1])),
... 'z3': ilr_inv(np.array([1.1, 7.])),
... 'u1': ilr_inv(np.array([1., 6.1])),
... 'u2': ilr_inv(np.array([1., 7.])),
... 'u3': ilr_inv(np.array([1.1, 8.1]))},
... index=['a', 'b', 'c']).T

Now we are going to define some of the external variables to
test for in the model. Here we will be testing a hypothetical
longitudinal study across 3 time points, with 4 patients
`x`, `y`, `z` and `u`, where `x` and `y` were given treatment `1`
and `z` and `u` were given treatment `2`.

>>> metadata = pd.DataFrame({
... 'patient': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4],
... 'treatment': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2],
... 'time': [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]
... }, index=['x1', 'x2', 'x3', 'y1', 'y2', 'y3',
... 'z1', 'z2', 'z3', 'u1', 'u2', 'u3'])

Finally, we need to define a bifurcating tree used to convert the
proportions to balances. If the internal nodes aren't labelled,
a default labeling will be applied (i.e. `y1`, `y2`, ...).

>>> tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])
>>> print(tree.ascii_art())
          /-c
-Y1------|
         |          /-b
          \Y2------|
                    \-a

Now we can run the linear mixed effects model on the proportions.
Under the hood, the proportions will be transformed into balances,
so that the linear mixed effects models can be run directly on balances.
Since each patient was sampled repeatedly, we'll pass the patient
identifiers as the groups. In the linear mixed effects model, `time` and
`treatment` will be tested simultaneously with respect to the balances.

>>> res = mixedlm('time + treatment', table, metadata, tree,
...               groups='patient')

See Also
--------
statsmodels.regression.linear_model.MixedLM
ols
"""
table, metadata, tree = _intersect_of_table_metadata_tree(table,
metadata,
tree)
ilr_table, basis = _to_balances(table, tree)
data = pd.merge(ilr_table, metadata, left_index=True, right_index=True)

fits = []
for b in ilr_table.columns:
# mixed effects code is obtained here:
# http://stackoverflow.com/a/22439820/1167475
stats_formula = '%s ~ %s' % (b, formula)
mdf = smf.mixedlm(stats_formula, data=data,
groups=data[groups],
**kwargs).fit()
fits.append(mdf)
return RegressionResults(fits, basis=basis,
feature_names=table.columns)
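
Both wrappers now forward **kwargs to the underlying statsmodels call (smf.ols above, smf.mixedlm here), so statsmodels options such as re_formula can be passed straight through. Underneath, mixedlm ILR-transforms the table and fits one mixed-effects model per balance. The following is a minimal, self-contained sketch of that per-balance loop using plain statsmodels on a made-up dataset; the balance labels Y1/Y2 are placeholders for the internal node names the wrapper would take from the tree:

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from skbio.stats.composition import ilr

# Made-up proportions: 12 samples x 3 features, ILR-transformed to 2 balances.
rng = np.random.RandomState(0)
proportions = rng.dirichlet([2, 2, 2], size=12)
balances = pd.DataFrame(ilr(proportions), columns=['Y1', 'Y2'],
                        index=['s%d' % i for i in range(12)])

# Per-sample covariates; `patient` marks the repeatedly sampled unit.
metadata = pd.DataFrame({
    'time': [1, 2, 3] * 4,
    'treatment': [1] * 6 + [2] * 6,
    'patient': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4]},
    index=balances.index)
data = pd.merge(balances, metadata, left_index=True, right_index=True)

# One mixed-effects model per balance, grouped by patient -- the same loop
# the wrapper runs over `ilr_table.columns`.
fits = {}
for b in balances.columns:
    model = smf.mixedlm('%s ~ time + treatment' % b, data=data,
                        groups=data['patient'])
    fits[b] = model.fit()

print(fits['Y1'].pvalues)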
27 changes: 14 additions & 13 deletions gneiss/_summary.py
@@ -1,5 +1,3 @@
#!/usr/bin/env python

# ----------------------------------------------------------------------------
# Copyright (c) 2016--, gneiss development team.
#
@@ -39,26 +37,29 @@ def __init__(self,
self.basis = basis
self.results = stat_results

# sum of squares error. Also referred to as sum of squares residuals
sse = 0
# sum of squares regression. Also referred to as
# explained sum of squares.
ssr = 0
# See `statsmodels.regression.linear_model.RegressionResults`
# for more explanation on `ess` and `ssr`.

# obtain pvalues
self.pvalues = pd.DataFrame()
for r in self.results:
p = r.pvalues
p.name = r.model.endog_names
self.pvalues = self.pvalues.append(p)
sse += r.ssr
ssr += r.ess

@property
def r2(self):
""" Calculates the coefficients of determination """
# Reason why we wanted to move this out was because not
# all types of statsmodels regressions have this property.

# See `statsmodels.regression.linear_model.RegressionResults`
# for more explanation on `ess` and `ssr`.
# sum of squares regression. Also referred to as
# explained sum of squares.
ssr = sum([r.ess for r in self.results])
# sum of squares error. Also referred to as sum of squares residuals
sse = sum([r.ssr for r in self.results])
# calculate the overall coefficient of determination (i.e. R2)
sst = sse + ssr
self.r2 = 1 - sse / sst
return 1 - sse / sst

def _check_projection(self, project):
"""
