From 6a247a71fa2df4106391e17fc83c0a36566bd341 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Fri, 29 Jul 2016 11:18:31 -0700 Subject: [PATCH 01/12] Typo in ipython notebook --- ipynb/balance_trees.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ipynb/balance_trees.ipynb b/ipynb/balance_trees.ipynb index 13d41be..b9f039a 100644 --- a/ipynb/balance_trees.ipynb +++ b/ipynb/balance_trees.ipynb @@ -45,7 +45,7 @@ "$$\n", "b_i = \\sqrt{\\frac{l_i r_i}{l_i + r_i}} \n", " \\log \\big( \\frac{(\\prod_{x_n \\in L}{x_n})^{1/l_i}}\n", - " {(\\prod_{x_m \\in L}{x_m})^{1/r_i}} \\big)\n", + " {(\\prod_{x_m \\inRL}{x_m})^{1/r_i}} \\big)\n", "$$\n", "\n", "where $b_i$ is the $i$th balance, $l_i$ is the number of leaves in the left subtree, $r_i$ is the number of leaves in the right subtree $x_n$ are the abundances of the species in the left subtree, and $x_m$ are the abundances of species in the right subtree.\n" From f13954dd156fdd10d2ce8b625045953ac82dd7b1 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Fri, 29 Jul 2016 11:19:12 -0700 Subject: [PATCH 02/12] Fixing typo in ipython notebook --- ipynb/balance_trees.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ipynb/balance_trees.ipynb b/ipynb/balance_trees.ipynb index b9f039a..96f22b9 100644 --- a/ipynb/balance_trees.ipynb +++ b/ipynb/balance_trees.ipynb @@ -45,7 +45,7 @@ "$$\n", "b_i = \\sqrt{\\frac{l_i r_i}{l_i + r_i}} \n", " \\log \\big( \\frac{(\\prod_{x_n \\in L}{x_n})^{1/l_i}}\n", - " {(\\prod_{x_m \\inRL}{x_m})^{1/r_i}} \\big)\n", + " {(\\prod_{x_m \\in R}{x_m})^{1/r_i}} \\big)\n", "$$\n", "\n", "where $b_i$ is the $i$th balance, $l_i$ is the number of leaves in the left subtree, $r_i$ is the number of leaves in the right subtree $x_n$ are the abundances of the species in the left subtree, and $x_m$ are the abundances of species in the right subtree.\n" From 5a7ad7ad143d59dbff43eccc73116c643834b520 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Sat, 30 Jul 2016 19:10:16 -0700 Subject: [PATCH 03/12] Adding linear mixed effects models --- gneiss/_formula.py | 76 ++++++++++++++++++++++++++++++------ gneiss/tests/test_formula.py | 35 +++++++++++++++-- 2 files changed, 96 insertions(+), 15 deletions(-) diff --git a/gneiss/_formula.py b/gneiss/_formula.py index 22fa7e3..93a38e7 100644 --- a/gneiss/_formula.py +++ b/gneiss/_formula.py @@ -8,7 +8,7 @@ import pandas as pd import statsmodels.formula.api as smf from skbio.stats.composition import ilr -from gneiss.util import match, match_tips +from gneiss.util import match, match_tips, rename_internal_nodes from gneiss._summary import RegressionResults from gneiss.balances import balance_basis @@ -28,10 +28,21 @@ def _process(table, metadata, tree): return _table, _metadata, _tree +def _transform(table, tree): + """ Performs ILR transform on table""" + basis, _ = balance_basis(tree) + non_tips = [n.name for n in tree.levelorder() if not n.is_tip()] + mat = ilr(table.values, basis=basis) + ilr_table = pd.DataFrame(mat, + columns=non_tips, + index=table.index) + return ilr_table, basis + + def ols(formula, table, metadata, tree, **kwargs): """ Ordinary Least Squares applied to balances. - A ordinary least square regression is applied to each balance. + An ordinary least square regression is applied to each balance. Parameters ---------- @@ -64,13 +75,7 @@ def ols(formula, table, metadata, tree, **kwargs): statsmodels.regression.linear_model.OLS """ _table, _metadata, _tree = _process(table, metadata, tree) - basis, _ = balance_basis(_tree) - non_tips = [n.name for n in _tree.levelorder() if not n.is_tip()] - - mat = ilr(_table.values, basis=basis) - ilr_table = pd.DataFrame(mat, - columns=non_tips, - index=table.index) + basis, ilr_table = _transform(_table, _tree) data = pd.merge(ilr_table, _metadata, left_index=True, right_index=True) @@ -80,7 +85,7 @@ def ols(formula, table, metadata, tree, **kwargs): # http://stackoverflow.com/a/22439820/1167475 stats_formula = '%s ~ %s' % (b, formula) - mdf = smf.ols(stats_formula, data=data).fit() + mdf = smf.ols(stats_formula, data=data, **kwargs).fit() fits.append(mdf) return RegressionResults(fits, basis=basis, feature_names=table.columns) @@ -96,12 +101,59 @@ def glm(formula, table, metadata, tree, **kwargs): def mixedlm(formula, table, metadata, tree, **kwargs): - """ Linear Mixed Models applied to balances. + """ Linear Mixed Effects Models applied to balances. + + An Linear Mixed Effects model is applied to each balance. Parameters ---------- + formula : str + Formula representing the statistical equation to be evaluated. + These strings are similar to how equations are handled in R. + Note that the dependent variable in this string should not be + specified, since this method will be run on each of the individual + balances. See `patsy` for more details. + table : pd.DataFrame + Contingency table where samples correspond to rows and + features correspond to columns. + metadata: pd.DataFrame + Metadata table that contains information about the samples contained + in the `table` object. Samples correspond to rows and covariates + correspond to columns. + tree : skbio.TreeNode + Tree object where the leaves correspond to the columns contained in + the table. + groups : str + Variable in `metadata` that specifies the groups. Data from + different groups are different. + **kwargs : dict + Other arguments accepted into `statsmodels.regression.linear_model.OLS` + + Returns + ------- + RegressionResults + Container object that holds information about the overall fit. + + See Also + -------- + statsmodels.regression.linear_model.MixedLM """ - pass + _table, _metadata, _tree = _process(table, metadata, tree) + ilr_table, basis = _transform(_table, _tree) + data = pd.merge(ilr_table, _metadata, left_index=True, right_index=True) + + fits = [] + for b in ilr_table.columns: + # mixed effects code is obtained here: + # http://stackoverflow.com/a/22439820/1167475 + stats_formula = '%s ~ %s' % (b, formula) + + mdf = smf.mixedlm(stats_formula, data=data, + groups=data[groups], + **kwargs).fit() + fits.append(mdf) + return RegressionResults(fits, basis=basis, + feature_names=table.columns) def gee(formula, table, metadata, tree, **kwargs): diff --git a/gneiss/tests/test_formula.py b/gneiss/tests/test_formula.py index d6816eb..f2b9cc6 100644 --- a/gneiss/tests/test_formula.py +++ b/gneiss/tests/test_formula.py @@ -10,10 +10,8 @@ import numpy as np import pandas as pd import pandas.util.testing as pdt -import statsmodels.formula.api as smf import unittest -from gneiss._summary import RegressionResults -from skbio.stats.composition import _gram_schmidt_basis, ilr_inv +from skbio.stats.composition import ilr_inv from skbio import TreeNode from gneiss._formula import ols @@ -60,5 +58,36 @@ def test_ols(self): pdt.assert_frame_equal(exp_resid, res.residuals()) +class TestMixedLM(unittest.TestCase): + + def setUp(self): + A = np.array # aliasing for the sake of pep8 + self.table = pd.DataFrame({ + 'x1': ilr_inv(A([1., 1.])), + 'x2': ilr_inv(A([1., 2.])), + 'x3': ilr_inv(A([1., 3.])), + 'y1': ilr_inv(A([1., 2.])), + 'y2': ilr_inv(A([1., 3.])), + 'y3': ilr_inv(A([1., 4.])), + 'z1': ilr_inv(A([1., 5.])), + 'z2': ilr_inv(A([1., 6.])), + 'z3': ilr_inv(A([1., 7.]))}, + 'u1': ilr_inv(A([1., 6.])), + 'u2': ilr_inv(A([1., 7.])), + 'u3': ilr_inv(A([1., 8.]))}, + index=['a', 'b', 'c']).T + self.tree = TreeNode.read(['(c, (b,a)Y2)Y1;']) + self.metadata = pd.DataFrame({ + 'patient': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], + 'treatment': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2], + 'time': [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3] + }, index=['x1', 'x2', 'x3', 'y1', 'y2', 'y3', + 'z1', 'z2', 'z3', 'u1', 'u2', 'u3']) + + def test_mixedlm(self): + res = mixedlm('time', self.table, self.metadata, self.tree, groups=) + + pass + if __name__ == '__main__': unittest.main() From 241f2a2ca535b2405e10c6f6132998f0dac8f71f Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Tue, 2 Aug 2016 12:39:20 -0400 Subject: [PATCH 04/12] Adding in mixedlm --- gneiss/_formula.py | 31 ++++++++++++++++++++++++++++++- gneiss/_summary.py | 8 +++++--- gneiss/tests/test_formula.py | 6 +++--- 3 files changed, 38 insertions(+), 7 deletions(-) diff --git a/gneiss/_formula.py b/gneiss/_formula.py index 93a38e7..0a65bcd 100644 --- a/gneiss/_formula.py +++ b/gneiss/_formula.py @@ -100,7 +100,7 @@ def glm(formula, table, metadata, tree, **kwargs): pass -def mixedlm(formula, table, metadata, tree, **kwargs): +def mixedlm(formula, table, metadata, tree, groups, **kwargs): """ Linear Mixed Effects Models applied to balances. An Linear Mixed Effects model is applied to each balance. @@ -134,6 +134,35 @@ def mixedlm(formula, table, metadata, tree, **kwargs): RegressionResults Container object that holds information about the overall fit. + Examples + -------- + >>> import pandas as pd + >>> import numpy as np + >>> from skbio.stats.composition import ilr_inv + >>> from skbio import TreeNode + >>> from gneiss._formula import mixedlm + >>> table = pd.DataFrame({ + ... 'x1': ilr_inv(np.array([1.1, 1.1])), + ... 'x2': ilr_inv(np.array([1., 2.])), + ... 'x3': ilr_inv(np.array([1.1, 3.])), + ... 'y1': ilr_inv(np.array([1., 2.1])), + ... 'y2': ilr_inv(np.array([1., 3.1])), + ... 'y3': ilr_inv(np.array([1., 4.])), + ... 'z1': ilr_inv(np.array([1.1, 5.])), + ... 'z2': ilr_inv(np.array([1., 6.1])), + ... 'z3': ilr_inv(np.array([1.1, 7.])), + ... 'u1': ilr_inv(np.array([1., 6.1])), + ... 'u2': ilr_inv(np.array([1., 7.])), + ... 'u3': ilr_inv(np.array([1.1, 8.1]))}, + ... index=['a', 'b', 'c']).T + >>> tree = TreeNode.read(['(c, (b,a)Y2)Y1;']) + >>> metadata = pd.DataFrame({ + ... 'patient': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], + ... 'treatment': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2], + ... 'time': [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3] + ... }, index=['x1', 'x2', 'x3', 'y1', 'y2', 'y3', + ... 'z1', 'z2', 'z3', 'u1', 'u2', 'u3']) + See Also -------- statsmodels.regression.linear_model.MixedLM diff --git a/gneiss/_summary.py b/gneiss/_summary.py index 89bfd3d..0d795fb 100644 --- a/gneiss/_summary.py +++ b/gneiss/_summary.py @@ -53,12 +53,14 @@ def __init__(self, p = r.pvalues p.name = r.model.endog_names self.pvalues = self.pvalues.append(p) - sse += r.ssr - ssr += r.ess + @property + def r2(self): + ssr = sum([r.ssr for r in self.results]) + sse = sum([r.sse for r in self.results]) # calculate the overall coefficient of determination (i.e. R2) sst = sse + ssr - self.r2 = 1 - sse / sst + return 1 - sse / sst def _check_projection(self, project): """ diff --git a/gneiss/tests/test_formula.py b/gneiss/tests/test_formula.py index f2b9cc6..453341b 100644 --- a/gneiss/tests/test_formula.py +++ b/gneiss/tests/test_formula.py @@ -13,7 +13,7 @@ import unittest from skbio.stats.composition import ilr_inv from skbio import TreeNode -from gneiss._formula import ols +from gneiss._formula import ols, mixedlm class TestOLS(unittest.TestCase): @@ -71,7 +71,7 @@ def setUp(self): 'y3': ilr_inv(A([1., 4.])), 'z1': ilr_inv(A([1., 5.])), 'z2': ilr_inv(A([1., 6.])), - 'z3': ilr_inv(A([1., 7.]))}, + 'z3': ilr_inv(A([1., 7.])), 'u1': ilr_inv(A([1., 6.])), 'u2': ilr_inv(A([1., 7.])), 'u3': ilr_inv(A([1., 8.]))}, @@ -85,7 +85,7 @@ def setUp(self): 'z1', 'z2', 'z3', 'u1', 'u2', 'u3']) def test_mixedlm(self): - res = mixedlm('time', self.table, self.metadata, self.tree, groups=) + res = mixedlm('time', self.table, self.metadata, self.tree, groups=patient) pass From 5a534090672699dff684c0d0016172bfe68793a8 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Wed, 17 Aug 2016 23:34:23 -0700 Subject: [PATCH 05/12] prototype of linear mixed effects model --- gneiss/__init__.py | 2 ++ gneiss/_formula.py | 41 ++++++++++++++++++++++++++++++++---- gneiss/tests/test_formula.py | 23 +++++++++++++++++--- 3 files changed, 59 insertions(+), 7 deletions(-) diff --git a/gneiss/__init__.py b/gneiss/__init__.py index a27c03f..afe4610 100644 --- a/gneiss/__init__.py +++ b/gneiss/__init__.py @@ -8,5 +8,7 @@ from __future__ import absolute_import, division, print_function +from gneiss._formula import mixedlm +__all__ = ['mixedlm'] __version__ = "0.0.2" diff --git a/gneiss/_formula.py b/gneiss/_formula.py index 0a65bcd..98ae63c 100644 --- a/gneiss/_formula.py +++ b/gneiss/_formula.py @@ -124,8 +124,10 @@ def mixedlm(formula, table, metadata, tree, groups, **kwargs): Tree object where the leaves correspond to the columns contained in the table. groups : str - Variable in `metadata` that specifies the groups. Data from - different groups are different. + Variable in `metadata` that specifies the groups. These groups are + often associated with individuals repeatedly sampled, typically + longitudinally. + **kwargs : dict Other arguments accepted into `statsmodels.regression.linear_model.OLS` @@ -140,7 +142,11 @@ def mixedlm(formula, table, metadata, tree, groups, **kwargs): >>> import numpy as np >>> from skbio.stats.composition import ilr_inv >>> from skbio import TreeNode - >>> from gneiss._formula import mixedlm + >>> from gneiss import mixedlm + + Here, we will define a table of proportions with 3 features + `a`, `b`, and `c` across 12 samples. + >>> table = pd.DataFrame({ ... 'x1': ilr_inv(np.array([1.1, 1.1])), ... 'x2': ilr_inv(np.array([1., 2.])), @@ -155,7 +161,13 @@ def mixedlm(formula, table, metadata, tree, groups, **kwargs): ... 'u2': ilr_inv(np.array([1., 7.])), ... 'u3': ilr_inv(np.array([1.1, 8.1]))}, ... index=['a', 'b', 'c']).T - >>> tree = TreeNode.read(['(c, (b,a)Y2)Y1;']) + + Now we are going to define some of the external variables to + test for in the model. Here we will be testing a hypothetical + longitudinal study across 3 time points, with 4 patients + `x`, `y`, `z` and `u`, where `x` and `y` were given treatment `1` + and `z` and `u` were given treatment `2`. + >>> metadata = pd.DataFrame({ ... 'patient': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], ... 'treatment': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2], @@ -163,6 +175,27 @@ def mixedlm(formula, table, metadata, tree, groups, **kwargs): ... }, index=['x1', 'x2', 'x3', 'y1', 'y2', 'y3', ... 'z1', 'z2', 'z3', 'u1', 'u2', 'u3']) + Finally, we need to define a bifurcating tree used to convert the + proportions to balances. If the internal nodes aren't labels, + a default labeling will be applied (i.e. `y1`, `y2`, ...) + + >>> tree = TreeNode.read(['(c, (b,a)Y2)Y1;']) + >>> print(tree.ascii_art()) + /-c + -Y1------| + | /-b + \Y2------| + \-a + + Now we can run the linear mixed effects model on the proportions. + Underneath the hood, the proportions will be transformed into balance, + so that the linear mixed effects models can be run directly on balances. + Since each patient was sampled repeatedly, we'll specify them separately + in the groups. In the linear mixed effects model `time` and `treatment` + will be simultaneously tested for with respect to the balances. + + >>> res = mixedlm('time + treatment', table, metadata, tree, groups='patient') + See Also -------- statsmodels.regression.linear_model.MixedLM diff --git a/gneiss/tests/test_formula.py b/gneiss/tests/test_formula.py index 453341b..7493fb6 100644 --- a/gneiss/tests/test_formula.py +++ b/gneiss/tests/test_formula.py @@ -14,7 +14,7 @@ from skbio.stats.composition import ilr_inv from skbio import TreeNode from gneiss._formula import ols, mixedlm - +from random import seed class TestOLS(unittest.TestCase): @@ -85,9 +85,26 @@ def setUp(self): 'z1', 'z2', 'z3', 'u1', 'u2', 'u3']) def test_mixedlm(self): - res = mixedlm('time', self.table, self.metadata, self.tree, groups=patient) + np.random.seed(0) + seed(0) + res = mixedlm('time', self.table, self.metadata, self.tree, groups='patient') + + exp_pvals = pd.DataFrame([[0.015293, 0.193931, 0.012249], + [0.000000, 0.579045, 0.909983]], + index=['Y1', 'Y2'], + columns=['Intercept', 'Intercept RE', 'time']) + + exp_coefs = pd.DataFrame([[2.600000, 42.868869, 0.9750], + [1.016667, 0.216595, 0.0125]], + index=['Y1', 'Y2'], + columns=['Intercept', 'Intercept RE', 'time']) + + pdt.assert_index_equal(exp_pvals.index, res.pvalues.index) + pdt.assert_index_equal(exp_pvals.columns, res.pvalues.columns) + + pdt.assert_index_equal(exp_coefs.index, res.coefficients().index) + pdt.assert_index_equal(exp_coefs.columns, res.coefficients().columns) - pass if __name__ == '__main__': unittest.main() From 66e146ddbb9ed9c9eb3f3dffc1e58b813890c490 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Wed, 17 Aug 2016 23:40:12 -0700 Subject: [PATCH 06/12] clean up --- gneiss/_formula.py | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/gneiss/_formula.py b/gneiss/_formula.py index 874d6b4..4696d64 100644 --- a/gneiss/_formula.py +++ b/gneiss/_formula.py @@ -37,6 +37,7 @@ def _intersect_of_table_metadata_tree(table, metadata, tree): def _to_balances(table, tree): """ Converts a table of abundances to balances given a tree. + Parameters ---------- table : pd.DataFrame @@ -203,20 +204,9 @@ def ols(formula, table, metadata, tree, **kwargs): feature_names=table.columns) -def glm(formula, table, metadata, tree, **kwargs): - """ Generalized Linear Models applied to balances. - - Parameters - ---------- - """ - pass - - def mixedlm(formula, table, metadata, tree, groups, **kwargs): """ Linear Mixed Effects Models applied to balances. - An Linear Mixed Effects model is applied to each balance. - Parameters ---------- formula : str @@ -239,7 +229,6 @@ def mixedlm(formula, table, metadata, tree, groups, **kwargs): Variable in `metadata` that specifies the groups. These groups are often associated with individuals repeatedly sampled, typically longitudinally. - **kwargs : dict Other arguments accepted into `statsmodels.regression.linear_model.OLS` @@ -312,8 +301,10 @@ def mixedlm(formula, table, metadata, tree, groups, **kwargs): -------- statsmodels.regression.linear_model.MixedLM """ - _table, _metadata, _tree = _process(table, metadata, tree) - ilr_table, basis = _transform(_table, _tree) + _table, _metadata, _tree = _intersect_of_table_metadata_tree(table, + metadata, + tree) + ilr_table, basis = _to_balances(_table, _tree) data = pd.merge(ilr_table, _metadata, left_index=True, right_index=True) fits = [] @@ -330,10 +321,3 @@ def mixedlm(formula, table, metadata, tree, groups, **kwargs): feature_names=table.columns) -def gee(formula, table, metadata, tree, **kwargs): - """ Generalized Estimating Equations applied to balances. - - Parameters - ---------- - """ - pass From ea3669c1c28414409d62bc783414bbd487274b5e Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Thu, 18 Aug 2016 09:38:59 -0700 Subject: [PATCH 07/12] Removing shebang --- gneiss/_formula.py | 3 ++- gneiss/_summary.py | 2 -- gneiss/tests/test_formula.py | 3 ++- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/gneiss/_formula.py b/gneiss/_formula.py index 4696d64..8e843a6 100644 --- a/gneiss/_formula.py +++ b/gneiss/_formula.py @@ -11,6 +11,7 @@ from gneiss.util import match, match_tips, rename_internal_nodes from gneiss._summary import RegressionResults from gneiss.balances import balance_basis +import numpy as np def _intersect_of_table_metadata_tree(table, metadata, tree): @@ -312,7 +313,7 @@ def mixedlm(formula, table, metadata, tree, groups, **kwargs): # mixed effects code is obtained here: # http://stackoverflow.com/a/22439820/1167475 stats_formula = '%s ~ %s' % (b, formula) - + np.random.seed(0) mdf = smf.mixedlm(stats_formula, data=data, groups=data[groups], **kwargs).fit() diff --git a/gneiss/_summary.py b/gneiss/_summary.py index 55dd6ef..d48dee4 100644 --- a/gneiss/_summary.py +++ b/gneiss/_summary.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python - # ---------------------------------------------------------------------------- # Copyright (c) 2016--, gneiss development team. # diff --git a/gneiss/tests/test_formula.py b/gneiss/tests/test_formula.py index ac1d331..7421147 100644 --- a/gneiss/tests/test_formula.py +++ b/gneiss/tests/test_formula.py @@ -14,6 +14,7 @@ from gneiss._formula import ols, mixedlm from random import seed + class TestOLS(unittest.TestCase): def setUp(self): @@ -149,7 +150,6 @@ def test_ols_empty_metadata(self): with self.assertRaises(ValueError): ols('real + lame', table, metadata, tree) - class TestMixedLM(unittest.TestCase): def setUp(self): @@ -181,6 +181,7 @@ def test_mixedlm(self): seed(0) res = mixedlm('time', self.table, self.metadata, self.tree, groups='patient') + exp_pvals = pd.DataFrame([[0.015293, 0.193931, 0.012249], [0.000000, 0.579045, 0.909983]], index=['Y1', 'Y2'], From ec77bfaed6e31f409ae81a24f81387f0bed32996 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Thu, 18 Aug 2016 10:30:03 -0700 Subject: [PATCH 08/12] adding lme --- gneiss/_formula.py | 1 + gneiss/tests/test_formula.py | 39 +++++++++++++++++++++++++++--------- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/gneiss/_formula.py b/gneiss/_formula.py index 8e843a6..f49f231 100644 --- a/gneiss/_formula.py +++ b/gneiss/_formula.py @@ -317,6 +317,7 @@ def mixedlm(formula, table, metadata, tree, groups, **kwargs): mdf = smf.mixedlm(stats_formula, data=data, groups=data[groups], **kwargs).fit() + print(mdf.pvalues) fits.append(mdf) return RegressionResults(fits, basis=basis, feature_names=table.columns) diff --git a/gneiss/tests/test_formula.py b/gneiss/tests/test_formula.py index 7421147..7e2154c 100644 --- a/gneiss/tests/test_formula.py +++ b/gneiss/tests/test_formula.py @@ -12,7 +12,6 @@ from skbio.stats.composition import ilr_inv from skbio import TreeNode from gneiss._formula import ols, mixedlm -from random import seed class TestOLS(unittest.TestCase): @@ -178,26 +177,48 @@ def setUp(self): def test_mixedlm(self): np.random.seed(0) - seed(0) - res = mixedlm('time', self.table, self.metadata, self.tree, groups='patient') + res = mixedlm('time', self.table, self.metadata, self.tree, groups='patient') exp_pvals = pd.DataFrame([[0.015293, 0.193931, 0.012249], [0.000000, 0.579045, 0.909983]], index=['Y1', 'Y2'], columns=['Intercept', 'Intercept RE', 'time']) - exp_coefs = pd.DataFrame([[2.600000, 42.868869, 0.9750], - [1.016667, 0.216595, 0.0125]], + exp_coefs = pd.DataFrame([[2.5, 4.205488e+07, 1.000000e+00], + [1.000000, 1.000000e+00, 0]], index=['Y1', 'Y2'], columns=['Intercept', 'Intercept RE', 'time']) - pdt.assert_index_equal(exp_pvals.index, res.pvalues.index) pdt.assert_index_equal(exp_pvals.columns, res.pvalues.columns) - - pdt.assert_index_equal(exp_coefs.index, res.coefficients().index) - pdt.assert_index_equal(exp_coefs.columns, res.coefficients().columns) + np.random.seed(0) + pdt.assert_frame_equal(exp_coefs, res.coefficients(), check_less_precise=True) if __name__ == '__main__': unittest.main() + + + +table = pd.DataFrame({ + 'x1': [1.], + 'x2': [2.], + 'x3': [3.], + 'y1': [2.], + 'y2': [3.], + 'y3': [4.], + 'z1': [5.], + 'z2': [6.], + 'z3': [7.], + 'u1': [6.], + 'u2': [7.], + 'u3': [8.]}, + index=['a', 'b', 'c']).T + +metadata = pd.DataFrame({ + 'patient': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], + 'treatment': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2], + 'time': [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3] +}, index=['x1', 'x2', 'x3', 'y1', 'y2', 'y3', + 'z1', 'z2', 'z3', 'u1', 'u2', 'u3']) + From af5f730930e3837d3386ff8c3589201aafbff5ce Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Thu, 18 Aug 2016 11:17:20 -0700 Subject: [PATCH 09/12] Adding more robust tests into mixedlm --- gneiss/_formula.py | 24 +++---- gneiss/_summary.py | 21 +++--- gneiss/tests/test_formula.py | 136 +++++++++++++++++++++++++---------- 3 files changed, 120 insertions(+), 61 deletions(-) diff --git a/gneiss/_formula.py b/gneiss/_formula.py index f49f231..158b97e 100644 --- a/gneiss/_formula.py +++ b/gneiss/_formula.py @@ -11,7 +11,6 @@ from gneiss.util import match, match_tips, rename_internal_nodes from gneiss._summary import RegressionResults from gneiss.balances import balance_basis -import numpy as np def _intersect_of_table_metadata_tree(table, metadata, tree): @@ -227,7 +226,7 @@ def mixedlm(formula, table, metadata, tree, groups, **kwargs): Tree object where the leaves correspond to the columns contained in the table. groups : str - Variable in `metadata` that specifies the groups. These groups are + Column names in `metadata` that specifies the groups. These groups are often associated with individuals repeatedly sampled, typically longitudinally. **kwargs : dict @@ -290,36 +289,35 @@ def mixedlm(formula, table, metadata, tree, groups, **kwargs): \-a Now we can run the linear mixed effects model on the proportions. - Underneath the hood, the proportions will be transformed into balance, + Underneath the hood, the proportions will be transformed into balances, so that the linear mixed effects models can be run directly on balances. Since each patient was sampled repeatedly, we'll specify them separately in the groups. In the linear mixed effects model `time` and `treatment` will be simultaneously tested for with respect to the balances. - >>> res = mixedlm('time + treatment', table, metadata, tree, groups='patient') + >>> res = mixedlm('time + treatment', table, metadata, tree, + ... groups='patient') See Also -------- statsmodels.regression.linear_model.MixedLM + ols + """ - _table, _metadata, _tree = _intersect_of_table_metadata_tree(table, - metadata, - tree) - ilr_table, basis = _to_balances(_table, _tree) - data = pd.merge(ilr_table, _metadata, left_index=True, right_index=True) + table, metadata, tree = _intersect_of_table_metadata_tree(table, + metadata, + tree) + ilr_table, basis = _to_balances(table, tree) + data = pd.merge(ilr_table, metadata, left_index=True, right_index=True) fits = [] for b in ilr_table.columns: # mixed effects code is obtained here: # http://stackoverflow.com/a/22439820/1167475 stats_formula = '%s ~ %s' % (b, formula) - np.random.seed(0) mdf = smf.mixedlm(stats_formula, data=data, groups=data[groups], **kwargs).fit() - print(mdf.pvalues) fits.append(mdf) return RegressionResults(fits, basis=basis, feature_names=table.columns) - - diff --git a/gneiss/_summary.py b/gneiss/_summary.py index d48dee4..d5a4f35 100644 --- a/gneiss/_summary.py +++ b/gneiss/_summary.py @@ -37,14 +37,6 @@ def __init__(self, self.basis = basis self.results = stat_results - # sum of squares error. Also referred to as sum of squares residuals - sse = 0 - # sum of squares regression. Also referred to as - # explained sum of squares. - ssr = 0 - # See `statsmodels.regression.linear_model.RegressionResults` - # for more explanation on `ess` and `ssr`. - # obtain pvalues self.pvalues = pd.DataFrame() for r in self.results: @@ -54,8 +46,17 @@ def __init__(self, @property def r2(self): - ssr = sum([r.ssr for r in self.results]) - sse = sum([r.sse for r in self.results]) + """ Calculates the coefficients of determination """ + # Reason why we wanted to move this out was because not + # all types of statsmodels regressions have this property. + + # See `statsmodels.regression.linear_model.RegressionResults` + # for more explanation on `ess` and `ssr`. + # sum of squares regression. Also referred to as + # explained sum of squares. + ssr = sum([r.ess for r in self.results]) + # sum of squares error. Also referred to as sum of squares residuals + sse = sum([r.ssr for r in self.results]) # calculate the overall coefficient of determination (i.e. R2) sst = sse + ssr return 1 - sse / sst diff --git a/gneiss/tests/test_formula.py b/gneiss/tests/test_formula.py index 7e2154c..0cf5ada 100644 --- a/gneiss/tests/test_formula.py +++ b/gneiss/tests/test_formula.py @@ -12,6 +12,8 @@ from skbio.stats.composition import ilr_inv from skbio import TreeNode from gneiss._formula import ols, mixedlm +import statsmodels.formula.api as smf +import numpy.testing as npt class TestOLS(unittest.TestCase): @@ -149,6 +151,7 @@ def test_ols_empty_metadata(self): with self.assertRaises(ValueError): ols('real + lame', table, metadata, tree) + class TestMixedLM(unittest.TestCase): def setUp(self): @@ -175,50 +178,107 @@ def setUp(self): }, index=['x1', 'x2', 'x3', 'y1', 'y2', 'y3', 'z1', 'z2', 'z3', 'u1', 'u2', 'u3']) - def test_mixedlm(self): - np.random.seed(0) + # test case borrowed from statsmodels + # https://github.com/statsmodels/statsmodels/blob/master/statsmodels + # /regression/tests/test_lme.py#L254 + def test_mixedlm_univariate(self): - res = mixedlm('time', self.table, self.metadata, self.tree, groups='patient') + np.random.seed(6241) + n = 1600 + exog = np.random.normal(size=(n, 2)) + groups = np.kron(np.arange(n / 16), np.ones(16)) - exp_pvals = pd.DataFrame([[0.015293, 0.193931, 0.012249], - [0.000000, 0.579045, 0.909983]], - index=['Y1', 'Y2'], - columns=['Intercept', 'Intercept RE', 'time']) + # Build up the random error vector + errors = 0 - exp_coefs = pd.DataFrame([[2.5, 4.205488e+07, 1.000000e+00], - [1.000000, 1.000000e+00, 0]], - index=['Y1', 'Y2'], - columns=['Intercept', 'Intercept RE', 'time']) - pdt.assert_index_equal(exp_pvals.index, res.pvalues.index) - pdt.assert_index_equal(exp_pvals.columns, res.pvalues.columns) - np.random.seed(0) - pdt.assert_frame_equal(exp_coefs, res.coefficients(), check_less_precise=True) + # The random effects + exog_re = np.random.normal(size=(n, 2)) + slopes = np.random.normal(size=(n / 16, 2)) + slopes = np.kron(slopes, np.ones((16, 1))) * exog_re + errors += slopes.sum(1) + # First variance component + errors += np.kron(2 * np.random.normal(size=n // 4), np.ones(4)) -if __name__ == '__main__': - unittest.main() + # Second variance component + errors += np.kron(2 * np.random.normal(size=n // 2), np.ones(2)) + + # iid errors + errors += np.random.normal(size=n) + + endog = exog.sum(1) + errors + + df = pd.DataFrame(index=range(n)) + df["y"] = endog + df["groups"] = groups + df["x1"] = exog[:, 0] + df["x2"] = exog[:, 1] + + # Equivalent model in R: + # df.to_csv("tst.csv") + # model = lmer(y ~ x1 + x2 + (0 + z1 + z2 | groups) + (1 | v1) + (1 | + # v2), df) + + model1 = smf.mixedlm("y ~ x1 + x2", groups=groups, + data=df) + result1 = model1.fit() + + # Compare to R + npt.assert_allclose(result1.fe_params, [ + 0.211451, 1.022008, 0.924873], rtol=1e-2) + npt.assert_allclose(result1.cov_re, [[0.987738]], rtol=1e-3) + + npt.assert_allclose(result1.bse.iloc[0:3], [ + 0.128377, 0.082644, 0.081031], rtol=1e-3) + + def test_mixedlm_balances(self): + np.random.seed(6241) + n = 1600 + exog = np.random.normal(size=(n, 2)) + groups = np.kron(np.arange(n / 16), np.ones(16)) + # Build up the random error vector + errors = 0 + # The random effects + exog_re = np.random.normal(size=(n, 2)) + slopes = np.random.normal(size=(n / 16, 2)) + slopes = np.kron(slopes, np.ones((16, 1))) * exog_re + errors += slopes.sum(1) -table = pd.DataFrame({ - 'x1': [1.], - 'x2': [2.], - 'x3': [3.], - 'y1': [2.], - 'y2': [3.], - 'y3': [4.], - 'z1': [5.], - 'z2': [6.], - 'z3': [7.], - 'u1': [6.], - 'u2': [7.], - 'u3': [8.]}, - index=['a', 'b', 'c']).T - -metadata = pd.DataFrame({ - 'patient': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], - 'treatment': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2], - 'time': [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3] -}, index=['x1', 'x2', 'x3', 'y1', 'y2', 'y3', - 'z1', 'z2', 'z3', 'u1', 'u2', 'u3']) + # First variance component + errors += np.kron(2 * np.random.normal(size=n // 4), np.ones(4)) + # Second variance component + errors += np.kron(2 * np.random.normal(size=n // 2), np.ones(2)) + + # iid errors + errors += np.random.normal(size=n) + + endog = exog.sum(1) + errors + + df = pd.DataFrame(index=range(n)) + df["y1"] = endog + df["y2"] = endog + 2 * 2 + df["groups"] = groups + df["x1"] = exog[:, 0] + df["x2"] = exog[:, 1] + + tree = TreeNode.read(['(c, (b,a)Y2)Y1;']) + iv = ilr_inv(df[["y1", "y2"]].values) + table = pd.DataFrame(iv, columns=['a', 'b', 'c']) + metadata = df[['x1', 'x2', 'groups']] + + res = mixedlm("x1 + x2", table, metadata, tree, groups="groups") + exp_pvalues = pd.DataFrame( + [[4.923122e-236, 3.180390e-40, 3.972325e-35, 3.568599e-30], + [9.953418e-02, 3.180390e-40, 3.972325e-35, 3.568599e-30]], + index=['Y1', 'Y2'], + columns=['Intercept', 'Intercept RE', 'x1', 'x2']) + + pdt.assert_frame_equal(res.pvalues, exp_pvalues, + check_less_precise=True) + + +if __name__ == '__main__': + unittest.main() From eecf811ee7b0700169978af80f1c6cf297d0f1e0 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Thu, 18 Aug 2016 11:45:10 -0700 Subject: [PATCH 10/12] Addressing @Eldeveloper's comments and adding tests for kwargs --- gneiss/tests/test_formula.py | 83 ++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/gneiss/tests/test_formula.py b/gneiss/tests/test_formula.py index 0cf5ada..c26bc91 100644 --- a/gneiss/tests/test_formula.py +++ b/gneiss/tests/test_formula.py @@ -279,6 +279,89 @@ def test_mixedlm_balances(self): pdt.assert_frame_equal(res.pvalues, exp_pvalues, check_less_precise=True) + exp_coefficients = pd.DataFrame( + [[4.211451, -0.305906, 1.022008, 0.924873], + [0.211451, -0.305906, 1.022008, 0.924873]], + columns=['Intercept', 'Intercept RE', 'x1', 'x2'], + index=['Y1', 'Y2']) + + pdt.assert_frame_equal(res.coefficients(), exp_coefficients, + check_less_precise=True) + + def test_mixedlm_balances_vcf(self): + np.random.seed(6241) + n = 1600 + exog = np.random.normal(size=(n, 2)) + groups = np.kron(np.arange(n / 16), np.ones(16)) + + # Build up the random error vector + errors = 0 + + # The random effects + exog_re = np.random.normal(size=(n, 2)) + slopes = np.random.normal(size=(n / 16, 2)) + slopes = np.kron(slopes, np.ones((16, 1))) * exog_re + errors += slopes.sum(1) + + # First variance component + subgroups1 = np.kron(np.arange(n / 4), np.ones(4)) + errors += np.kron(2 * np.random.normal(size=n // 4), np.ones(4)) + + # Second variance component + subgroups2 = np.kron(np.arange(n / 2), np.ones(2)) + errors += np.kron(2 * np.random.normal(size=n // 2), np.ones(2)) + + # iid errors + errors += np.random.normal(size=n) + + endog = exog.sum(1) + errors + + df = pd.DataFrame(index=range(n)) + df["y1"] = endog + df["y2"] = endog + 2 * 2 + df["groups"] = groups + df["x1"] = exog[:, 0] + df["x2"] = exog[:, 1] + df["z1"] = exog_re[:, 0] + df["z2"] = exog_re[:, 1] + df["v1"] = subgroups1 + df["v2"] = subgroups2 + + tree = TreeNode.read(['(c, (b,a)Y2)Y1;']) + iv = ilr_inv(df[["y1", "y2"]].values) + table = pd.DataFrame(iv, columns=['a', 'b', 'c']) + metadata = df[['x1', 'x2', 'z1', 'z2', 'v1', 'v2', 'groups']] + + res = mixedlm("x1 + x2", table, metadata, tree, groups="groups", + re_formula="0+z1+z2") + + exp_pvalues = pd.DataFrame( + [[4.923122e-236, 3.180390e-40, 3.972325e-35, 3.568599e-30], + [9.953418e-02, 3.180390e-40, 3.972325e-35, 3.568599e-30]], + index=['Y1', 'Y2'], + columns=['Intercept', 'Intercept RE', 'x1', 'x2']) + + exp_pvalues = pd.DataFrame([ + [0.000000, 3.858750e-39, 2.245068e-33, + 2.434437e-35, 0.776775, 6.645741e-34], + [0.038015, 3.858750e-39, 2.245068e-33, + 2.434437e-35, 0.776775, 6.645741e-34]], + columns=['Intercept', 'x1', 'x2', 'z1 RE', + 'z1 RE x z2 RE', 'z2 RE'], + index=['Y1', 'Y2']) + exp_coefficients = pd.DataFrame( + [[4.163141, 1.030013, 0.935514, 0.339239, -0.005792, 0.38456], + [0.163141, 1.030013, 0.935514, 0.339239, -0.005792, 0.38456]], + columns=['Intercept', 'x1', 'x2', 'z1 RE', + 'z1 RE x z2 RE', 'z2 RE'], + index=['Y1', 'Y2']) + + pdt.assert_frame_equal(res.pvalues, exp_pvalues, + check_less_precise=True) + + pdt.assert_frame_equal(res.coefficients(), exp_coefficients, + check_less_precise=True) + if __name__ == '__main__': unittest.main() From a78b49ff5e95b77f47b141d8fff009a26e64263d Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Thu, 18 Aug 2016 11:47:17 -0700 Subject: [PATCH 11/12] Addressing @wasade's comments --- gneiss/_formula.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gneiss/_formula.py b/gneiss/_formula.py index 158b97e..5050c98 100644 --- a/gneiss/_formula.py +++ b/gneiss/_formula.py @@ -230,7 +230,7 @@ def mixedlm(formula, table, metadata, tree, groups, **kwargs): often associated with individuals repeatedly sampled, typically longitudinally. **kwargs : dict - Other arguments accepted into `statsmodels.regression.linear_model.OLS` + Other arguments accepted into `statsmodels.regression.linear_model.MixedLM` Returns ------- @@ -291,8 +291,8 @@ def mixedlm(formula, table, metadata, tree, groups, **kwargs): Now we can run the linear mixed effects model on the proportions. Underneath the hood, the proportions will be transformed into balances, so that the linear mixed effects models can be run directly on balances. - Since each patient was sampled repeatedly, we'll specify them separately - in the groups. In the linear mixed effects model `time` and `treatment` + Since each patient was sampled repeatedly, we'll specify them separately + in the groups. In the linear mixed effects model `time` and `treatment` will be simultaneously tested for with respect to the balances. >>> res = mixedlm('time + treatment', table, metadata, tree, From 992234769986faccefb21b6cc6c18565d83b83c0 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Thu, 18 Aug 2016 11:54:47 -0700 Subject: [PATCH 12/12] pep8 --- gneiss/_formula.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gneiss/_formula.py b/gneiss/_formula.py index 5050c98..7937f89 100644 --- a/gneiss/_formula.py +++ b/gneiss/_formula.py @@ -230,7 +230,8 @@ def mixedlm(formula, table, metadata, tree, groups, **kwargs): often associated with individuals repeatedly sampled, typically longitudinally. **kwargs : dict - Other arguments accepted into `statsmodels.regression.linear_model.MixedLM` + Other arguments accepted into + `statsmodels.regression.linear_model.MixedLM` Returns -------