Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lme2 #38

Merged
merged 14 commits into from
Aug 18, 2016
4 changes: 2 additions & 2 deletions gneiss/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@

from __future__ import absolute_import, division, print_function

from gneiss._formula import ols
from gneiss._formula import mixedlm, ols

__all__ = ['ols']
__all__ = ['ols', 'mixedlm']

__version__ = "0.0.2"
122 changes: 121 additions & 1 deletion gneiss/_formula.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def _intersect_of_table_metadata_tree(table, metadata, tree):

def _to_balances(table, tree):
""" Converts a table of abundances to balances given a tree.

Parameters
----------
table : pd.DataFrame
Expand Down Expand Up @@ -197,7 +198,126 @@ def ols(formula, table, metadata, tree, **kwargs):
# http://stackoverflow.com/a/22439820/1167475
stats_formula = '%s ~ %s' % (b, formula)

mdf = smf.ols(stats_formula, data=data).fit()
mdf = smf.ols(stats_formula, data=data, **kwargs).fit()
fits.append(mdf)
return RegressionResults(fits, basis=basis,
feature_names=table.columns)


def mixedlm(formula, table, metadata, tree, groups, **kwargs):
    """ Linear Mixed Effects Models applied to balances.

    Parameters
    ----------
    formula : str
        Formula representing the statistical equation to be evaluated.
        These strings are similar to how equations are handled in R.
        Note that the dependent variable in this string should not be
        specified, since this method will be run on each of the individual
        balances. See `patsy` for more details.
    table : pd.DataFrame
        Contingency table where samples correspond to rows and
        features correspond to columns.
    metadata : pd.DataFrame
        Metadata table that contains information about the samples contained
        in the `table` object. Samples correspond to rows and covariates
        correspond to columns.
    tree : skbio.TreeNode
        Tree object where the leaves correspond to the columns contained in
        the table.
    groups : str
        Column name in `metadata` that specifies the groups. These groups are
        often associated with individuals repeatedly sampled, typically
        longitudinally.
    **kwargs : dict
        Other arguments accepted into
        `statsmodels.regression.linear_model.MixedLM`.

    Returns
    -------
    RegressionResults
        Container object that holds information about the overall fit.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from skbio.stats.composition import ilr_inv
    >>> from skbio import TreeNode
    >>> from gneiss import mixedlm

    Here, we will define a table of proportions with 3 features
    `a`, `b`, and `c` across 12 samples.

    >>> table = pd.DataFrame({
    ...     'x1': ilr_inv(np.array([1.1, 1.1])),
    ...     'x2': ilr_inv(np.array([1., 2.])),
    ...     'x3': ilr_inv(np.array([1.1, 3.])),
    ...     'y1': ilr_inv(np.array([1., 2.1])),
    ...     'y2': ilr_inv(np.array([1., 3.1])),
    ...     'y3': ilr_inv(np.array([1., 4.])),
    ...     'z1': ilr_inv(np.array([1.1, 5.])),
    ...     'z2': ilr_inv(np.array([1., 6.1])),
    ...     'z3': ilr_inv(np.array([1.1, 7.])),
    ...     'u1': ilr_inv(np.array([1., 6.1])),
    ...     'u2': ilr_inv(np.array([1., 7.])),
    ...     'u3': ilr_inv(np.array([1.1, 8.1]))},
    ...     index=['a', 'b', 'c']).T

    Now we are going to define some of the external variables to
    test for in the model. Here we will be testing a hypothetical
    longitudinal study across 3 time points, with 4 patients
    `x`, `y`, `z` and `u`, where `x` and `y` were given treatment `1`
    and `z` and `u` were given treatment `2`.

    >>> metadata = pd.DataFrame({
    ...     'patient': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4],
    ...     'treatment': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2],
    ...     'time': [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]
    ... }, index=['x1', 'x2', 'x3', 'y1', 'y2', 'y3',
    ...           'z1', 'z2', 'z3', 'u1', 'u2', 'u3'])

    Finally, we need to define a bifurcating tree used to convert the
    proportions to balances. If the internal nodes aren't labelled,
    a default labeling will be applied (i.e. `y1`, `y2`, ...)

    >>> tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])
    >>> print(tree.ascii_art())
              /-c
    -Y1------|
             |          /-b
              \\Y2------|
                        \\-a

    Now we can run the linear mixed effects model on the proportions.
    Underneath the hood, the proportions will be transformed into balances,
    so that the linear mixed effects models can be run directly on balances.
    Since each patient was sampled repeatedly, we'll specify them separately
    in the groups. In the linear mixed effects model `time` and `treatment`
    will be simultaneously tested for with respect to the balances.

    >>> res = mixedlm('time + treatment', table, metadata, tree,
    ...               groups='patient')

    See Also
    --------
    statsmodels.regression.linear_model.MixedLM
    ols

    """
    # Restrict the analysis to the samples/features shared by all inputs.
    table, metadata, tree = _intersect_of_table_metadata_tree(table,
                                                              metadata,
                                                              tree)
    # Convert proportions to balances via the ILR transform induced by `tree`.
    ilr_table, basis = _to_balances(table, tree)
    data = pd.merge(ilr_table, metadata, left_index=True, right_index=True)

    # Fit one mixed effects model per balance; the balance is the response.
    fits = []
    for b in ilr_table.columns:
        # mixed effects code is obtained here:
        # http://stackoverflow.com/a/22439820/1167475
        stats_formula = '%s ~ %s' % (b, formula)
        mdf = smf.mixedlm(stats_formula, data=data,
                          groups=data[groups],
                          **kwargs).fit()
        fits.append(mdf)
    return RegressionResults(fits, basis=basis,
                             feature_names=table.columns)
27 changes: 14 additions & 13 deletions gneiss/_summary.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
#!/usr/bin/env python

# ----------------------------------------------------------------------------
# Copyright (c) 2016--, gneiss development team.
#
Expand Down Expand Up @@ -39,26 +37,29 @@ def __init__(self,
self.basis = basis
self.results = stat_results

# sum of squares error. Also referred to as sum of squares residuals
sse = 0
# sum of squares regression. Also referred to as
# explained sum of squares.
ssr = 0
# See `statsmodels.regression.linear_model.RegressionResults`
# for more explanation on `ess` and `ssr`.

# obtain pvalues
self.pvalues = pd.DataFrame()
for r in self.results:
p = r.pvalues
p.name = r.model.endog_names
self.pvalues = self.pvalues.append(p)
sse += r.ssr
ssr += r.ess

@property
def r2(self):
    """ Calculates the coefficient of determination (R^2) across all fits.

    Returns
    -------
    float
        The overall coefficient of determination, pooled over every
        per-balance regression result stored in ``self.results``.
    """
    # Reason why we wanted to move this out of the constructor was because
    # not all types of statsmodels regressions have this property
    # (e.g. linear mixed effects models don't have an r2).

    # See `statsmodels.regression.linear_model.RegressionResults`
    # for more explanation on `ess` and `ssr`.
    # sum of squares regression. Also referred to as
    # explained sum of squares.
    ssr = sum([r.ess for r in self.results])
    # sum of squares error. Also referred to as sum of squares residuals
    sse = sum([r.ssr for r in self.results])
    # calculate the overall coefficient of determination (i.e. R2).
    # NOTE: the leftover `self.r2 = ...` assignment was removed — assigning
    # to a read-only property raises AttributeError; a property must only
    # return the computed value.
    sst = sse + ssr
    return 1 - sse / sst
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are there any tests that need to be modified/created?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nope. This is handled within the ols results. Note that I wanted to move this outside of the constructor, because linear mixed effects models don't have an r2.


def _check_projection(self, project):
"""
Expand Down
132 changes: 131 additions & 1 deletion gneiss/tests/test_formula.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
import unittest
from skbio.stats.composition import ilr_inv
from skbio import TreeNode
from gneiss._formula import ols
from gneiss._formula import ols, mixedlm
import statsmodels.formula.api as smf
import numpy.testing as npt


class TestOLS(unittest.TestCase):
Expand Down Expand Up @@ -150,5 +152,133 @@ def test_ols_empty_metadata(self):
ols('real + lame', table, metadata, tree)


class TestMixedLM(unittest.TestCase):
    """ Tests for `gneiss.mixedlm`, both against statsmodels directly and
    through the balance-transforming wrapper. """

    def setUp(self):
        # Simulated longitudinal design: 4 subjects (x, y, z, u), 3 time
        # points each, proportions over features a, b, c derived from the
        # inverse ILR of hand-picked balances.
        A = np.array  # aliasing for the sake of pep8
        self.table = pd.DataFrame({
            'x1': ilr_inv(A([1., 1.])),
            'x2': ilr_inv(A([1., 2.])),
            'x3': ilr_inv(A([1., 3.])),
            'y1': ilr_inv(A([1., 2.])),
            'y2': ilr_inv(A([1., 3.])),
            'y3': ilr_inv(A([1., 4.])),
            'z1': ilr_inv(A([1., 5.])),
            'z2': ilr_inv(A([1., 6.])),
            'z3': ilr_inv(A([1., 7.])),
            'u1': ilr_inv(A([1., 6.])),
            'u2': ilr_inv(A([1., 7.])),
            'u3': ilr_inv(A([1., 8.]))},
            index=['a', 'b', 'c']).T
        self.tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])
        self.metadata = pd.DataFrame({
            'patient': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4],
            'treatment': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2],
            'time': [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]
        }, index=['x1', 'x2', 'x3', 'y1', 'y2', 'y3',
                  'z1', 'z2', 'z3', 'u1', 'u2', 'u3'])

    # test case borrowed from statsmodels
    # https://github.com/statsmodels/statsmodels/blob/master/statsmodels
    # /regression/tests/test_lme.py#L254
    def test_mixedlm_univariate(self):

        np.random.seed(6241)
        n = 1600
        exog = np.random.normal(size=(n, 2))
        # NOTE: `n // 16` (integer division) is required — `size=` and
        # `arange` with a float raise TypeError on modern numpy.
        groups = np.kron(np.arange(n // 16), np.ones(16))

        # Build up the random error vector
        errors = 0

        # The random effects
        exog_re = np.random.normal(size=(n, 2))
        slopes = np.random.normal(size=(n // 16, 2))
        slopes = np.kron(slopes, np.ones((16, 1))) * exog_re
        errors += slopes.sum(1)

        # First variance component
        errors += np.kron(2 * np.random.normal(size=n // 4), np.ones(4))

        # Second variance component
        errors += np.kron(2 * np.random.normal(size=n // 2), np.ones(2))

        # iid errors
        errors += np.random.normal(size=n)

        endog = exog.sum(1) + errors

        df = pd.DataFrame(index=range(n))
        df["y"] = endog
        df["groups"] = groups
        df["x1"] = exog[:, 0]
        df["x2"] = exog[:, 1]

        # Equivalent model in R:
        # df.to_csv("tst.csv")
        # model = lmer(y ~ x1 + x2 + (0 + z1 + z2 | groups) + (1 | v1) + (1 |
        # v2), df)

        model1 = smf.mixedlm("y ~ x1 + x2", groups=groups,
                             data=df)
        result1 = model1.fit()

        # Compare to R
        npt.assert_allclose(result1.fe_params, [
            0.211451, 1.022008, 0.924873], rtol=1e-2)
        npt.assert_allclose(result1.cov_re, [[0.987738]], rtol=1e-3)

        npt.assert_allclose(result1.bse.iloc[0:3], [
            0.128377, 0.082644, 0.081031], rtol=1e-3)

    def test_mixedlm_balances(self):
        # Same simulated design as above, but run through the gneiss
        # `mixedlm` wrapper on a 2-balance (3-feature) table.
        np.random.seed(6241)
        n = 1600
        exog = np.random.normal(size=(n, 2))
        # Integer division again: float sizes raise TypeError on modern numpy.
        groups = np.kron(np.arange(n // 16), np.ones(16))

        # Build up the random error vector
        errors = 0

        # The random effects
        exog_re = np.random.normal(size=(n, 2))
        slopes = np.random.normal(size=(n // 16, 2))
        slopes = np.kron(slopes, np.ones((16, 1))) * exog_re
        errors += slopes.sum(1)

        # First variance component
        errors += np.kron(2 * np.random.normal(size=n // 4), np.ones(4))

        # Second variance component
        errors += np.kron(2 * np.random.normal(size=n // 2), np.ones(2))

        # iid errors
        errors += np.random.normal(size=n)

        endog = exog.sum(1) + errors

        df = pd.DataFrame(index=range(n))
        df["y1"] = endog
        df["y2"] = endog + 2 * 2
        df["groups"] = groups
        df["x1"] = exog[:, 0]
        df["x2"] = exog[:, 1]

        tree = TreeNode.read(['(c, (b,a)Y2)Y1;'])
        iv = ilr_inv(df[["y1", "y2"]].values)
        table = pd.DataFrame(iv, columns=['a', 'b', 'c'])
        metadata = df[['x1', 'x2', 'groups']]

        res = mixedlm("x1 + x2", table, metadata, tree, groups="groups")
        exp_pvalues = pd.DataFrame(
            [[4.923122e-236, 3.180390e-40, 3.972325e-35, 3.568599e-30],
             [9.953418e-02, 3.180390e-40, 3.972325e-35, 3.568599e-30]],
            index=['Y1', 'Y2'],
            columns=['Intercept', 'Intercept RE', 'x1', 'x2'])

        pdt.assert_frame_equal(res.pvalues, exp_pvalues,
                               check_less_precise=True)


# Allow the tests to be run directly (``python test_formula.py``) as well as
# via a test runner.
if __name__ == '__main__':
    unittest.main()
2 changes: 1 addition & 1 deletion ipynb/balance_trees.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
"$$\n",
"b_i = \\sqrt{\\frac{l_i r_i}{l_i + r_i}} \n",
" \\log \\big( \\frac{(\\prod_{x_n \\in L}{x_n})^{1/l_i}}\n",
" {(\\prod_{x_m \\in L}{x_m})^{1/r_i}} \\big)\n",
" {(\\prod_{x_m \\in R}{x_m})^{1/r_i}} \\big)\n",
"$$\n",
"\n",
"where $b_i$ is the $i$th balance, $l_i$ is the number of leaves in the left subtree, $r_i$ is the number of leaves in the right subtree $x_n$ are the abundances of the species in the left subtree, and $x_m$ are the abundances of species in the right subtree.\n"
Expand Down