
Commit

Merge pull request #38 from mortonjt/lme2
Lme2
ElDeveloper authored Aug 18, 2016
2 parents eb871ea + 9922347 commit 47f577f
Showing 5 changed files with 352 additions and 18 deletions.
4 changes: 2 additions & 2 deletions gneiss/__init__.py
@@ -8,8 +8,8 @@

from __future__ import absolute_import, division, print_function

from gneiss._formula import ols
from gneiss._formula import mixedlm, ols

__all__ = ['ols']
__all__ = ['ols', 'mixedlm']

__version__ = "0.0.2"
122 changes: 121 additions & 1 deletion gneiss/_formula.py
@@ -221,7 +221,127 @@ def ols(formula, table, metadata, tree, **kwargs):
# http://stackoverflow.com/a/22439820/1167475
stats_formula = '%s ~ %s' % (b, formula)

mdf = smf.ols(stats_formula, data=data).fit()
mdf = smf.ols(stats_formula, data=data, **kwargs).fit()
fits.append(mdf)
return RegressionResults(fits, basis=basis,
feature_names=table.columns)


def mixedlm(formula, table, metadata, tree, groups, **kwargs):
""" Linear Mixed Effects Models applied to balances.
Parameters
----------
formula : str
Formula representing the statistical equation to be evaluated.
These strings are similar to how equations are handled in R.
Note that the dependent variable in this string should not be
specified, since this method will be run on each of the individual
balances. See `patsy` for more details.
table : pd.DataFrame
Contingency table where samples correspond to rows and
features correspond to columns.
metadata : pd.DataFrame
Metadata table that contains information about the samples contained
in the `table` object. Samples correspond to rows and covariates
correspond to columns.
tree : skbio.TreeNode
Tree object where the leaves correspond to the columns contained in
the table.
groups : str
Column name in `metadata` that specifies the groups. These groups are
often associated with individuals that were sampled repeatedly, typically
longitudinally.
**kwargs : dict
Other arguments accepted by
`statsmodels.regression.linear_model.MixedLM`.

Returns
-------
RegressionResults
Container object that holds information about the overall fit.

Examples
--------
>>> import pandas as pd
>>> import numpy as np
>>> from skbio.stats.composition import ilr_inv
>>> from skbio import TreeNode
>>> from gneiss import mixedlm

Here, we will define a table of proportions with 3 features
`a`, `b`, and `c` across 12 samples.

>>> table = pd.DataFrame({
... 'x1': ilr_inv(np.array([1.1, 1.1])),
... 'x2': ilr_inv(np.array([1., 2.])),
... 'x3': ilr_inv(np.array([1.1, 3.])),
... 'y1': ilr_inv(np.array([1., 2.1])),
... 'y2': ilr_inv(np.array([1., 3.1])),
... 'y3': ilr_inv(np.array([1., 4.])),
... 'z1': ilr_inv(np.array([1.1, 5.])),
... 'z2': ilr_inv(np.array([1., 6.1])),
... 'z3': ilr_inv(np.array([1.1, 7.])),
... 'u1': ilr_inv(np.array([1., 6.1])),
... 'u2': ilr_inv(np.array([1., 7.])),
... 'u3': ilr_inv(np.array([1.1, 8.1]))},
... index=['a', 'b', 'c']).T

Now we are going to define some of the external variables to
test for in the model. Here we will be testing a hypothetical
longitudinal study across 3 time points, with 4 patients
`x`, `y`, `z` and `u`, where `x` and `y` were given treatment `1`
and `z` and `u` were given treatment `2`.

>>> metadata = pd.DataFrame({
... 'patient': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4],
... 'treatment': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2],
... 'time': [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]
... }, index=['x1', 'x2', 'x3', 'y1', 'y2', 'y3',
... 'z1', 'z2', 'z3', 'u1', 'u2', 'u3'])

Finally, we need to define a bifurcating tree used to convert the
proportions to balances. If the internal nodes aren't labelled,
a default labeling will be applied (i.e. `y1`, `y2`, ...).

>>> tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])
>>> print(tree.ascii_art())
          /-c
-Y1------|
         |          /-b
          \Y2------|
                    \-a

Now we can run the linear mixed effects model on the proportions.
Under the hood, the proportions will be transformed into balances,
so that the linear mixed effects models can be run directly on balances.
Since each patient was sampled repeatedly, we'll pass the patient
identifiers as the groups. In the linear mixed effects model, `time` and
`treatment` will be tested simultaneously with respect to the balances.

>>> res = mixedlm('time + treatment', table, metadata, tree,
...               groups='patient')

See Also
--------
statsmodels.regression.linear_model.MixedLM
ols
"""
table, metadata, tree = _intersect_of_table_metadata_tree(table,
metadata,
tree)
ilr_table, basis = _to_balances(table, tree)
data = pd.merge(ilr_table, metadata, left_index=True, right_index=True)

fits = []
for b in ilr_table.columns:
# mixed effects code is obtained here:
# http://stackoverflow.com/a/22439820/1167475
stats_formula = '%s ~ %s' % (b, formula)
mdf = smf.mixedlm(stats_formula, data=data,
groups=data[groups],
**kwargs).fit()
fits.append(mdf)
return RegressionResults(fits, basis=basis,
feature_names=table.columns)
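
Both wrappers now forward **kwargs to the underlying statsmodels call (smf.ols above, smf.mixedlm here), so statsmodels options such as re_formula can be passed straight through. Underneath, mixedlm ILR-transforms the table and fits one mixed-effects model per balance. The following is a minimal, self-contained sketch of that per-balance loop using plain statsmodels on a made-up dataset; the balance labels Y1/Y2 are placeholders for the internal node names the wrapper would take from the tree:

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from skbio.stats.composition import ilr

# Made-up proportions: 12 samples x 3 features, ILR-transformed to 2 balances.
rng = np.random.RandomState(0)
proportions = rng.dirichlet([2, 2, 2], size=12)
balances = pd.DataFrame(ilr(proportions), columns=['Y1', 'Y2'],
                        index=['s%d' % i for i in range(12)])

# Per-sample covariates; `patient` marks the repeatedly sampled unit.
metadata = pd.DataFrame({
    'time': [1, 2, 3] * 4,
    'treatment': [1] * 6 + [2] * 6,
    'patient': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4]},
    index=balances.index)
data = pd.merge(balances, metadata, left_index=True, right_index=True)

# One mixed-effects model per balance, grouped by patient -- the same loop
# the wrapper runs over `ilr_table.columns`.
fits = {}
for b in balances.columns:
    model = smf.mixedlm('%s ~ time + treatment' % b, data=data,
                        groups=data['patient'])
    fits[b] = model.fit()

print(fits['Y1'].pvalues)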
27 changes: 14 additions & 13 deletions gneiss/_summary.py
@@ -1,5 +1,3 @@
#!/usr/bin/env python

# ----------------------------------------------------------------------------
# Copyright (c) 2016--, gneiss development team.
#
@@ -39,26 +37,29 @@ def __init__(self,
self.basis = basis
self.results = stat_results

# sum of squares error. Also referred to as sum of squares residuals
sse = 0
# sum of squares regression. Also referred to as
# explained sum of squares.
ssr = 0
# See `statsmodels.regression.linear_model.RegressionResults`
# for more explanation on `ess` and `ssr`.

# obtain pvalues
self.pvalues = pd.DataFrame()
for r in self.results:
p = r.pvalues
p.name = r.model.endog_names
self.pvalues = self.pvalues.append(p)
sse += r.ssr
ssr += r.ess

@property
def r2(self):
""" Calculates the coefficients of determination """
# Reason why we wanted to move this out was because not
# all types of statsmodels regressions have this property.

# See `statsmodels.regression.linear_model.RegressionResults`
# for more explanation on `ess` and `ssr`.
# sum of squares regression. Also referred to as
# explained sum of squares.
ssr = sum([r.ess for r in self.results])
# sum of squares error. Also referred to as sum of squares residuals
sse = sum([r.ssr for r in self.results])
# calculate the overall coefficient of determination (i.e. R2)
sst = sse + ssr
self.r2 = 1 - sse / sst
return 1 - sse / sst

def _check_projection(self, project):
"""
