From 10a28aa3a88950c4ac7ba786862b91e330eb97b7 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Thu, 28 Jul 2016 09:27:25 -0700 Subject: [PATCH 1/7] ENH: Adding in first draft of the RegressionResults object --- gneiss/_summary.py | 53 +++++++++++++++++++++++++++++++ gneiss/tests/test_summary.py | 60 ++++++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 gneiss/_summary.py create mode 100644 gneiss/tests/test_summary.py diff --git a/gneiss/_summary.py b/gneiss/_summary.py new file mode 100644 index 0000000..785ba29 --- /dev/null +++ b/gneiss/_summary.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python + +# ---------------------------------------------------------------------------- +# Copyright (c) 2016--, gneiss development team. +# +# Distributed under the terms of the GPLv3 License. +# +# The full license is in the file COPYING.txt, distributed with this software. +# ---------------------------------------------------------------------------- +import pandas as pd +from skbio.stats.composition import ilr_inv + + +class RegressionResults(): + """ + Summary object for storing regression results. + """ + def __init__(self, stat_results, + feature_names=None, + basis=None): + """ Reorganizes statsmodels regression modules. + + Accepts a list of statsmodels RegressionResults objects + and performs some addition summary statistics. + + Parameters + ---------- + stat_results : list, sm.RegressionResults + List of RegressionResults objects. + feature_names : array_list, str + List of original names for features. + basis : np.array, optional + Orthonormal basis in the Aitchison simplex. + If this is not specified, then `project` cannot + be enabled in `coefficients` or `predict`. + """ + self.feature_names = feature_names + self.basis = basis + self.results = stat_results + + # obtain pvalues + pvals = pd.DataFrame() + for i in range(len(self.results)): + p = self.results[i].pvalues + p.name = self.results[i].model.endog_names + pvals = pvals.append(p) + self.pvalues = pvals + + # calculate the overall R2 value + sse = sum([r.ssr for r in self.results]) + ssr = sum([r.ess for r in self.results]) + sst = sse + ssr + self.r2 = 1 - sse / sst diff --git a/gneiss/tests/test_summary.py b/gneiss/tests/test_summary.py new file mode 100644 index 0000000..70a2931 --- /dev/null +++ b/gneiss/tests/test_summary.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python + +# ---------------------------------------------------------------------------- +# Copyright (c) 2016--, gneiss development team. +# +# Distributed under the terms of the GPLv3 License. +# +# The full license is in the file COPYING.txt, distributed with this software. +# ---------------------------------------------------------------------------- +import pandas as pd +import pandas.util.testing as pdt +import numpy as np +import statsmodels.formula.api as smf +import unittest +from gneiss._summary import RegressionResults +from skbio.stats.composition import (_gram_schmidt_basis, ilr_inv) + + +class TestRegressionResults(unittest.TestCase): + + def setUp(self): + self.data = pd.DataFrame([[1, 3, 4, 5, 2, 3, 4], + list(range(1, 8)), + [1, 3, 2, 4, 3, 5, 4]], + columns=['s1', 's2', 's3', 's4', 's5', 's6', 's7'], + index=['Y1', 'Y2', 'X']).T + model1 = smf.ols(formula="Y1 ~ X", data=self.data) + model2 = smf.ols(formula="Y2 ~ X", data=self.data) + self.results = [model1.fit(), model2.fit()] + + def test_r2(self): + fittedvalues = pd.DataFrame({'s1': [1.986842, 1.236842], + 's2': [3.065789, 3.815789], + 's3': [2.526316, 2.526316], + 's4': [3.605263, 5.105263], + 's5': [3.065789, 3.815789], + 's6': [4.144737, 6.394737], + 's7': [3.605263, 5.105263]}, + index=['Y1','Y2']).T + m = self.data.mean(axis=0) + sse = ((fittedvalues - self.data.iloc[:, :2])**2).sum().sum() + ssr = ((fittedvalues - m)**2).sum().sum() + sst = ((m - self.data.iloc[:, :2])**2).sum().sum() + exp_r2 = 1 - (sse / sst) + + res = RegressionResults(self.results) + self.assertAlmostEqual(exp_r2, res.r2) + + def test_regression_results_pvalues(self): + # checks to see if pvalues are calculated correctly. + res = RegressionResults(self.results) + exp = pd.DataFrame({'Intercept':[0.307081, 0.972395], + 'X': [0.211391, 0.029677]}, + index=['Y1', 'Y2']) + pdt.assert_frame_equal(res.pvalues, exp, + check_exact=False, + check_less_precise=True) + +if __name__ == "__main__": + unittest.main() From 0043f05fcd9cbfadc92d88ad78812aa0a0cab863 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Thu, 28 Jul 2016 09:37:20 -0700 Subject: [PATCH 2/7] fixing array_like typo --- gneiss/_summary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gneiss/_summary.py b/gneiss/_summary.py index 785ba29..345e86f 100644 --- a/gneiss/_summary.py +++ b/gneiss/_summary.py @@ -27,7 +27,7 @@ def __init__(self, stat_results, ---------- stat_results : list, sm.RegressionResults List of RegressionResults objects. - feature_names : array_list, str + feature_names : array_like, str List of original names for features. basis : np.array, optional Orthonormal basis in the Aitchison simplex. From 1af836541f08cdc267b03b8ae7f6996d32af9f5a Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Thu, 28 Jul 2016 09:39:54 -0700 Subject: [PATCH 3/7] pep8ing away. Will add imports in next PR --- gneiss/_summary.py | 1 - gneiss/tests/test_summary.py | 13 ++++++------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/gneiss/_summary.py b/gneiss/_summary.py index 345e86f..0af6ad2 100644 --- a/gneiss/_summary.py +++ b/gneiss/_summary.py @@ -8,7 +8,6 @@ # The full license is in the file COPYING.txt, distributed with this software. # ---------------------------------------------------------------------------- import pandas as pd -from skbio.stats.composition import ilr_inv class RegressionResults(): diff --git a/gneiss/tests/test_summary.py b/gneiss/tests/test_summary.py index 70a2931..dbd2cdf 100644 --- a/gneiss/tests/test_summary.py +++ b/gneiss/tests/test_summary.py @@ -9,11 +9,9 @@ # ---------------------------------------------------------------------------- import pandas as pd import pandas.util.testing as pdt -import numpy as np import statsmodels.formula.api as smf import unittest from gneiss._summary import RegressionResults -from skbio.stats.composition import (_gram_schmidt_basis, ilr_inv) class TestRegressionResults(unittest.TestCase): @@ -22,7 +20,8 @@ def setUp(self): self.data = pd.DataFrame([[1, 3, 4, 5, 2, 3, 4], list(range(1, 8)), [1, 3, 2, 4, 3, 5, 4]], - columns=['s1', 's2', 's3', 's4', 's5', 's6', 's7'], + columns=['s1', 's2', 's3', 's4', + 's5', 's6', 's7'], index=['Y1', 'Y2', 'X']).T model1 = smf.ols(formula="Y1 ~ X", data=self.data) model2 = smf.ols(formula="Y2 ~ X", data=self.data) @@ -36,10 +35,10 @@ def test_r2(self): 's5': [3.065789, 3.815789], 's6': [4.144737, 6.394737], 's7': [3.605263, 5.105263]}, - index=['Y1','Y2']).T + index=['Y1', 'Y2']).T m = self.data.mean(axis=0) sse = ((fittedvalues - self.data.iloc[:, :2])**2).sum().sum() - ssr = ((fittedvalues - m)**2).sum().sum() + # ssr = ((fittedvalues - m)**2).sum().sum() sst = ((m - self.data.iloc[:, :2])**2).sum().sum() exp_r2 = 1 - (sse / sst) @@ -49,9 +48,9 @@ def test_r2(self): def test_regression_results_pvalues(self): # checks to see if pvalues are calculated correctly. res = RegressionResults(self.results) - exp = pd.DataFrame({'Intercept':[0.307081, 0.972395], + exp = pd.DataFrame({'Intercept': [0.307081, 0.972395], 'X': [0.211391, 0.029677]}, - index=['Y1', 'Y2']) + index=['Y1', 'Y2']) pdt.assert_frame_equal(res.pvalues, exp, check_exact=False, check_less_precise=True) From cacb9f394468ee0ba77ec127461bbe227f3f6a7e Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Thu, 28 Jul 2016 09:56:29 -0700 Subject: [PATCH 4/7] BLD: Adding in statsmodels dependency --- ci/pip_requirements.txt | 1 + setup.py | 1 + 2 files changed, 2 insertions(+) diff --git a/ci/pip_requirements.txt b/ci/pip_requirements.txt index 72321f8..41d47e9 100644 --- a/ci/pip_requirements.txt +++ b/ci/pip_requirements.txt @@ -1,2 +1,3 @@ coveralls ete3 +statsmodels diff --git a/setup.py b/setup.py index 28a305e..e905f5a 100644 --- a/setup.py +++ b/setup.py @@ -83,6 +83,7 @@ def finalize_options(self): 'scipy >= 0.15.1', 'nose >= 1.3.7', 'scikit-bio>=0.4.2', + 'statsmodels', 'ete3', ], classifiers=classifiers, From ab57286dedb4ef8996effc506d2b56b8dc9bce8a Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Thu, 28 Jul 2016 12:10:41 -0700 Subject: [PATCH 5/7] Addressing @antgonza's comments --- gneiss/_summary.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/gneiss/_summary.py b/gneiss/_summary.py index 0af6ad2..1203734 100644 --- a/gneiss/_summary.py +++ b/gneiss/_summary.py @@ -26,7 +26,7 @@ def __init__(self, stat_results, ---------- stat_results : list, sm.RegressionResults List of RegressionResults objects. - feature_names : array_like, str + feature_names : array_like, str, optional List of original names for features. basis : np.array, optional Orthonormal basis in the Aitchison simplex. @@ -38,15 +38,22 @@ def __init__(self, stat_results, self.results = stat_results # obtain pvalues - pvals = pd.DataFrame() + self.pvalues = pd.DataFrame() for i in range(len(self.results)): p = self.results[i].pvalues p.name = self.results[i].model.endog_names - pvals = pvals.append(p) - self.pvalues = pvals + self.pvalues = pvals.append(p) - # calculate the overall R2 value + # calculate the overall coefficient of determination (i.e. R2) + + # sum of squares error. Also referred to as sum of squares residuals sse = sum([r.ssr for r in self.results]) + # sum of squares regression. Also referred to as explained sum of squares. ssr = sum([r.ess for r in self.results]) + # See `statsmodels.regression.linear_model.RegressionResults` + # for more explanation on `ess` and `ssr`. + sst = sse + ssr self.r2 = 1 - sse / sst + + From 5ef40416859017a9da495d810ca506db44ac8459 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Thu, 28 Jul 2016 12:12:05 -0700 Subject: [PATCH 6/7] pep8 and friends --- gneiss/_summary.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/gneiss/_summary.py b/gneiss/_summary.py index 1203734..79d1d05 100644 --- a/gneiss/_summary.py +++ b/gneiss/_summary.py @@ -42,18 +42,17 @@ def __init__(self, stat_results, for i in range(len(self.results)): p = self.results[i].pvalues p.name = self.results[i].model.endog_names - self.pvalues = pvals.append(p) + self.pvalues = self.pvalues.append(p) # calculate the overall coefficient of determination (i.e. R2) # sum of squares error. Also referred to as sum of squares residuals sse = sum([r.ssr for r in self.results]) - # sum of squares regression. Also referred to as explained sum of squares. + # sum of squares regression. Also referred to as + # explained sum of squares. ssr = sum([r.ess for r in self.results]) # See `statsmodels.regression.linear_model.RegressionResults` # for more explanation on `ess` and `ssr`. sst = sse + ssr self.r2 = 1 - sse / sst - - From 491165b5e5aa24dd47e7976ce6a23bcfb11787f5 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Thu, 28 Jul 2016 12:24:38 -0700 Subject: [PATCH 7/7] Addressing @antgonza's comments --- gneiss/_summary.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/gneiss/_summary.py b/gneiss/_summary.py index 79d1d05..ed70f4d 100644 --- a/gneiss/_summary.py +++ b/gneiss/_summary.py @@ -37,22 +37,23 @@ def __init__(self, stat_results, self.basis = basis self.results = stat_results - # obtain pvalues - self.pvalues = pd.DataFrame() - for i in range(len(self.results)): - p = self.results[i].pvalues - p.name = self.results[i].model.endog_names - self.pvalues = self.pvalues.append(p) - - # calculate the overall coefficient of determination (i.e. R2) - # sum of squares error. Also referred to as sum of squares residuals - sse = sum([r.ssr for r in self.results]) + sse = 0 # sum of squares regression. Also referred to as # explained sum of squares. - ssr = sum([r.ess for r in self.results]) + ssr = 0 # See `statsmodels.regression.linear_model.RegressionResults` # for more explanation on `ess` and `ssr`. + # obtain pvalues + self.pvalues = pd.DataFrame() + for r in self.results: + p = r.pvalues + p.name = r.model.endog_names + self.pvalues = self.pvalues.append(p) + sse += r.ssr + ssr += r.ess + + # calculate the overall coefficient of determination (i.e. R2) sst = sse + ssr self.r2 = 1 - sse / sst