diff --git a/ci/pip_requirements.txt b/ci/pip_requirements.txt index 72321f8..41d47e9 100644 --- a/ci/pip_requirements.txt +++ b/ci/pip_requirements.txt @@ -1,2 +1,3 @@ coveralls ete3 +statsmodels diff --git a/gneiss/_summary.py b/gneiss/_summary.py new file mode 100644 index 0000000..ed70f4d --- /dev/null +++ b/gneiss/_summary.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python + +# ---------------------------------------------------------------------------- +# Copyright (c) 2016--, gneiss development team. +# +# Distributed under the terms of the GPLv3 License. +# +# The full license is in the file COPYING.txt, distributed with this software. +# ---------------------------------------------------------------------------- +import pandas as pd + + +class RegressionResults(): + """ + Summary object for storing regression results. + """ + def __init__(self, stat_results, + feature_names=None, + basis=None): + """ Reorganizes statsmodels regression modules. + + Accepts a list of statsmodels RegressionResults objects + and performs some addition summary statistics. + + Parameters + ---------- + stat_results : list, sm.RegressionResults + List of RegressionResults objects. + feature_names : array_like, str, optional + List of original names for features. + basis : np.array, optional + Orthonormal basis in the Aitchison simplex. + If this is not specified, then `project` cannot + be enabled in `coefficients` or `predict`. + """ + self.feature_names = feature_names + self.basis = basis + self.results = stat_results + + # sum of squares error. Also referred to as sum of squares residuals + sse = 0 + # sum of squares regression. Also referred to as + # explained sum of squares. + ssr = 0 + # See `statsmodels.regression.linear_model.RegressionResults` + # for more explanation on `ess` and `ssr`. + + # obtain pvalues + self.pvalues = pd.DataFrame() + for r in self.results: + p = r.pvalues + p.name = r.model.endog_names + self.pvalues = self.pvalues.append(p) + sse += r.ssr + ssr += r.ess + + # calculate the overall coefficient of determination (i.e. R2) + sst = sse + ssr + self.r2 = 1 - sse / sst diff --git a/gneiss/tests/test_summary.py b/gneiss/tests/test_summary.py new file mode 100644 index 0000000..dbd2cdf --- /dev/null +++ b/gneiss/tests/test_summary.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python + +# ---------------------------------------------------------------------------- +# Copyright (c) 2016--, gneiss development team. +# +# Distributed under the terms of the GPLv3 License. +# +# The full license is in the file COPYING.txt, distributed with this software. +# ---------------------------------------------------------------------------- +import pandas as pd +import pandas.util.testing as pdt +import statsmodels.formula.api as smf +import unittest +from gneiss._summary import RegressionResults + + +class TestRegressionResults(unittest.TestCase): + + def setUp(self): + self.data = pd.DataFrame([[1, 3, 4, 5, 2, 3, 4], + list(range(1, 8)), + [1, 3, 2, 4, 3, 5, 4]], + columns=['s1', 's2', 's3', 's4', + 's5', 's6', 's7'], + index=['Y1', 'Y2', 'X']).T + model1 = smf.ols(formula="Y1 ~ X", data=self.data) + model2 = smf.ols(formula="Y2 ~ X", data=self.data) + self.results = [model1.fit(), model2.fit()] + + def test_r2(self): + fittedvalues = pd.DataFrame({'s1': [1.986842, 1.236842], + 's2': [3.065789, 3.815789], + 's3': [2.526316, 2.526316], + 's4': [3.605263, 5.105263], + 's5': [3.065789, 3.815789], + 's6': [4.144737, 6.394737], + 's7': [3.605263, 5.105263]}, + index=['Y1', 'Y2']).T + m = self.data.mean(axis=0) + sse = ((fittedvalues - self.data.iloc[:, :2])**2).sum().sum() + # ssr = ((fittedvalues - m)**2).sum().sum() + sst = ((m - self.data.iloc[:, :2])**2).sum().sum() + exp_r2 = 1 - (sse / sst) + + res = RegressionResults(self.results) + self.assertAlmostEqual(exp_r2, res.r2) + + def test_regression_results_pvalues(self): + # checks to see if pvalues are calculated correctly. + res = RegressionResults(self.results) + exp = pd.DataFrame({'Intercept': [0.307081, 0.972395], + 'X': [0.211391, 0.029677]}, + index=['Y1', 'Y2']) + pdt.assert_frame_equal(res.pvalues, exp, + check_exact=False, + check_less_precise=True) + +if __name__ == "__main__": + unittest.main() diff --git a/setup.py b/setup.py index 28a305e..e905f5a 100644 --- a/setup.py +++ b/setup.py @@ -83,6 +83,7 @@ def finalize_options(self): 'scipy >= 0.15.1', 'nose >= 1.3.7', 'scikit-bio>=0.4.2', + 'statsmodels', 'ete3', ], classifiers=classifiers,