Merge pull request #21 from mortonjt/regression_results_part1

ENH: Adding in first draft of the RegressionResults object
biocore · Jul 28, 2016 · 356896d · 356896d
2 parents 22cbcc3 + 491165b
commit 356896d
Show file tree

Hide file tree

Showing 4 changed files with 120 additions and 0 deletions.
diff --git a/ci/pip_requirements.txt b/ci/pip_requirements.txt
@@ -1,2 +1,3 @@
 coveralls
 ete3
+statsmodels
diff --git a/gneiss/_summary.py b/gneiss/_summary.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+# ----------------------------------------------------------------------------
+# Copyright (c) 2016--, gneiss development team.
+#
+# Distributed under the terms of the GPLv3 License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# ----------------------------------------------------------------------------
+import pandas as pd
+
+
+class RegressionResults():
+    """
+    Summary object for storing regression results.
+    """
+    def __init__(self, stat_results,
+                 feature_names=None,
+                 basis=None):
+        """ Reorganizes statsmodels regression modules.
+
+        Accepts a list of statsmodels RegressionResults objects
+        and performs some addition summary statistics.
+
+        Parameters
+        ----------
+        stat_results : list, sm.RegressionResults
+            List of RegressionResults objects.
+        feature_names : array_like, str, optional
+            List of original names for features.
+        basis : np.array, optional
+            Orthonormal basis in the Aitchison simplex.
+            If this is not specified, then `project` cannot
+            be enabled in `coefficients` or `predict`.
+        """
+        self.feature_names = feature_names
+        self.basis = basis
+        self.results = stat_results
+
+        # sum of squares error.  Also referred to as sum of squares residuals
+        sse = 0
+        # sum of squares regression. Also referred to as
+        # explained sum of squares.
+        ssr = 0
+        # See `statsmodels.regression.linear_model.RegressionResults`
+        # for more explanation on `ess` and `ssr`.
+
+        # obtain pvalues
+        self.pvalues = pd.DataFrame()
+        for r in self.results:
+            p = r.pvalues
+            p.name = r.model.endog_names
+            self.pvalues = self.pvalues.append(p)
+            sse += r.ssr
+            ssr += r.ess
+
+        # calculate the overall coefficient of determination (i.e. R2)
+        sst = sse + ssr
+        self.r2 = 1 - sse / sst
diff --git a/gneiss/tests/test_summary.py b/gneiss/tests/test_summary.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+# ----------------------------------------------------------------------------
+# Copyright (c) 2016--, gneiss development team.
+#
+# Distributed under the terms of the GPLv3 License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# ----------------------------------------------------------------------------
+import pandas as pd
+import pandas.util.testing as pdt
+import statsmodels.formula.api as smf
+import unittest
+from gneiss._summary import RegressionResults
+
+
+class TestRegressionResults(unittest.TestCase):
+
+    def setUp(self):
+        self.data = pd.DataFrame([[1, 3, 4, 5, 2, 3, 4],
+                                  list(range(1, 8)),
+                                  [1, 3, 2, 4, 3, 5, 4]],
+                                 columns=['s1', 's2', 's3', 's4',
+                                          's5', 's6', 's7'],
+                                 index=['Y1', 'Y2', 'X']).T
+        model1 = smf.ols(formula="Y1 ~ X", data=self.data)
+        model2 = smf.ols(formula="Y2 ~ X", data=self.data)
+        self.results = [model1.fit(), model2.fit()]
+
+    def test_r2(self):
+        fittedvalues = pd.DataFrame({'s1': [1.986842, 1.236842],
+                                     's2': [3.065789, 3.815789],
+                                     's3': [2.526316, 2.526316],
+                                     's4': [3.605263, 5.105263],
+                                     's5': [3.065789, 3.815789],
+                                     's6': [4.144737, 6.394737],
+                                     's7': [3.605263, 5.105263]},
+                                    index=['Y1', 'Y2']).T
+        m = self.data.mean(axis=0)
+        sse = ((fittedvalues - self.data.iloc[:, :2])**2).sum().sum()
+        # ssr = ((fittedvalues - m)**2).sum().sum()
+        sst = ((m - self.data.iloc[:, :2])**2).sum().sum()
+        exp_r2 = 1 - (sse / sst)
+
+        res = RegressionResults(self.results)
+        self.assertAlmostEqual(exp_r2, res.r2)
+
+    def test_regression_results_pvalues(self):
+        # checks to see if pvalues are calculated correctly.
+        res = RegressionResults(self.results)
+        exp = pd.DataFrame({'Intercept': [0.307081, 0.972395],
+                            'X': [0.211391, 0.029677]},
+                           index=['Y1', 'Y2'])
+        pdt.assert_frame_equal(res.pvalues, exp,
+                               check_exact=False,
+                               check_less_precise=True)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/setup.py b/setup.py
@@ -83,6 +83,7 @@ def finalize_options(self):
           'scipy >= 0.15.1',
           'nose >= 1.3.7',
           'scikit-bio>=0.4.2',
+          'statsmodels',
           'ete3',
       ],
       classifiers=classifiers,