From 10a28aa3a88950c4ac7ba786862b91e330eb97b7 Mon Sep 17 00:00:00 2001
From: Jamie Morton <jamietmorton@gmail.com>
Date: Thu, 28 Jul 2016 09:27:25 -0700
Subject: [PATCH 1/7] ENH: Adding in first draft of the RegressionResults
 object

---
 gneiss/_summary.py           | 53 +++++++++++++++++++++++++++++++
 gneiss/tests/test_summary.py | 60 ++++++++++++++++++++++++++++++++++++
 2 files changed, 113 insertions(+)
 create mode 100644 gneiss/_summary.py
 create mode 100644 gneiss/tests/test_summary.py

diff --git a/gneiss/_summary.py b/gneiss/_summary.py
new file mode 100644
index 0000000..785ba29
--- /dev/null
+++ b/gneiss/_summary.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+
+# ----------------------------------------------------------------------------
+# Copyright (c) 2016--, gneiss development team.
+#
+# Distributed under the terms of the GPLv3 License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# ----------------------------------------------------------------------------
+import pandas as pd
+from skbio.stats.composition import ilr_inv
+
+
+class RegressionResults():
+    """
+    Summary object for storing regression results.
+    """
+    def __init__(self, stat_results,
+                 feature_names=None,
+                 basis=None):
+        """ Reorganizes statsmodels regression modules.
+
+        Accepts a list of statsmodels RegressionResults objects
+        and performs some addition summary statistics.
+
+        Parameters
+        ----------
+        stat_results : list, sm.RegressionResults
+            List of RegressionResults objects.
+        feature_names : array_list, str
+            List of original names for features.
+        basis : np.array, optional
+            Orthonormal basis in the Aitchison simplex.
+            If this is not specified, then `project` cannot
+            be enabled in `coefficients` or `predict`.
+        """
+        self.feature_names = feature_names
+        self.basis = basis
+        self.results = stat_results
+
+        # obtain pvalues
+        pvals = pd.DataFrame()
+        for i in range(len(self.results)):
+            p = self.results[i].pvalues
+            p.name = self.results[i].model.endog_names
+            pvals = pvals.append(p)
+        self.pvalues = pvals
+
+        # calculate the overall R2 value
+        sse = sum([r.ssr for r in self.results])
+        ssr = sum([r.ess for r in self.results])
+        sst = sse + ssr
+        self.r2 = 1 - sse / sst
diff --git a/gneiss/tests/test_summary.py b/gneiss/tests/test_summary.py
new file mode 100644
index 0000000..70a2931
--- /dev/null
+++ b/gneiss/tests/test_summary.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+
+# ----------------------------------------------------------------------------
+# Copyright (c) 2016--, gneiss development team.
+#
+# Distributed under the terms of the GPLv3 License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# ----------------------------------------------------------------------------
+import pandas as pd
+import pandas.util.testing as pdt
+import numpy as np
+import statsmodels.formula.api as smf
+import unittest
+from gneiss._summary import RegressionResults
+from skbio.stats.composition import (_gram_schmidt_basis, ilr_inv)
+
+
+class TestRegressionResults(unittest.TestCase):
+
+    def setUp(self):
+        self.data = pd.DataFrame([[1, 3, 4, 5, 2, 3, 4],
+                                  list(range(1, 8)),
+                                  [1, 3, 2, 4, 3, 5, 4]],
+                                 columns=['s1', 's2', 's3', 's4', 's5', 's6', 's7'],
+                                 index=['Y1', 'Y2', 'X']).T
+        model1 = smf.ols(formula="Y1 ~ X", data=self.data)
+        model2 = smf.ols(formula="Y2 ~ X", data=self.data)
+        self.results = [model1.fit(), model2.fit()]
+
+    def test_r2(self):
+        fittedvalues = pd.DataFrame({'s1': [1.986842, 1.236842],
+                                     's2': [3.065789, 3.815789],
+                                     's3': [2.526316, 2.526316],
+                                     's4': [3.605263, 5.105263],
+                                     's5': [3.065789, 3.815789],
+                                     's6': [4.144737, 6.394737],
+                                     's7': [3.605263, 5.105263]},
+                                    index=['Y1','Y2']).T
+        m = self.data.mean(axis=0)
+        sse = ((fittedvalues - self.data.iloc[:, :2])**2).sum().sum()
+        ssr = ((fittedvalues - m)**2).sum().sum()
+        sst = ((m - self.data.iloc[:, :2])**2).sum().sum()
+        exp_r2 = 1 - (sse / sst)
+
+        res = RegressionResults(self.results)
+        self.assertAlmostEqual(exp_r2, res.r2)
+
+    def test_regression_results_pvalues(self):
+        # checks to see if pvalues are calculated correctly.
+        res = RegressionResults(self.results)
+        exp = pd.DataFrame({'Intercept':[0.307081, 0.972395],
+                            'X': [0.211391, 0.029677]},
+                            index=['Y1', 'Y2'])
+        pdt.assert_frame_equal(res.pvalues, exp,
+                               check_exact=False,
+                               check_less_precise=True)
+
+if __name__ == "__main__":
+    unittest.main()

From 0043f05fcd9cbfadc92d88ad78812aa0a0cab863 Mon Sep 17 00:00:00 2001
From: Jamie Morton <jamietmorton@gmail.com>
Date: Thu, 28 Jul 2016 09:37:20 -0700
Subject: [PATCH 2/7] fixing array_like typo

---
 gneiss/_summary.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gneiss/_summary.py b/gneiss/_summary.py
index 785ba29..345e86f 100644
--- a/gneiss/_summary.py
+++ b/gneiss/_summary.py
@@ -27,7 +27,7 @@ def __init__(self, stat_results,
         ----------
         stat_results : list, sm.RegressionResults
             List of RegressionResults objects.
-        feature_names : array_list, str
+        feature_names : array_like, str
             List of original names for features.
         basis : np.array, optional
             Orthonormal basis in the Aitchison simplex.

From 1af836541f08cdc267b03b8ae7f6996d32af9f5a Mon Sep 17 00:00:00 2001
From: Jamie Morton <jamietmorton@gmail.com>
Date: Thu, 28 Jul 2016 09:39:54 -0700
Subject: [PATCH 3/7] pep8ing away.  Will add imports in next PR

---
 gneiss/_summary.py           |  1 -
 gneiss/tests/test_summary.py | 13 ++++++-------
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/gneiss/_summary.py b/gneiss/_summary.py
index 345e86f..0af6ad2 100644
--- a/gneiss/_summary.py
+++ b/gneiss/_summary.py
@@ -8,7 +8,6 @@
 # The full license is in the file COPYING.txt, distributed with this software.
 # ----------------------------------------------------------------------------
 import pandas as pd
-from skbio.stats.composition import ilr_inv
 
 
 class RegressionResults():
diff --git a/gneiss/tests/test_summary.py b/gneiss/tests/test_summary.py
index 70a2931..dbd2cdf 100644
--- a/gneiss/tests/test_summary.py
+++ b/gneiss/tests/test_summary.py
@@ -9,11 +9,9 @@
 # ----------------------------------------------------------------------------
 import pandas as pd
 import pandas.util.testing as pdt
-import numpy as np
 import statsmodels.formula.api as smf
 import unittest
 from gneiss._summary import RegressionResults
-from skbio.stats.composition import (_gram_schmidt_basis, ilr_inv)
 
 
 class TestRegressionResults(unittest.TestCase):
@@ -22,7 +20,8 @@ def setUp(self):
         self.data = pd.DataFrame([[1, 3, 4, 5, 2, 3, 4],
                                   list(range(1, 8)),
                                   [1, 3, 2, 4, 3, 5, 4]],
-                                 columns=['s1', 's2', 's3', 's4', 's5', 's6', 's7'],
+                                 columns=['s1', 's2', 's3', 's4',
+                                          's5', 's6', 's7'],
                                  index=['Y1', 'Y2', 'X']).T
         model1 = smf.ols(formula="Y1 ~ X", data=self.data)
         model2 = smf.ols(formula="Y2 ~ X", data=self.data)
@@ -36,10 +35,10 @@ def test_r2(self):
                                      's5': [3.065789, 3.815789],
                                      's6': [4.144737, 6.394737],
                                      's7': [3.605263, 5.105263]},
-                                    index=['Y1','Y2']).T
+                                    index=['Y1', 'Y2']).T
         m = self.data.mean(axis=0)
         sse = ((fittedvalues - self.data.iloc[:, :2])**2).sum().sum()
-        ssr = ((fittedvalues - m)**2).sum().sum()
+        # ssr = ((fittedvalues - m)**2).sum().sum()
         sst = ((m - self.data.iloc[:, :2])**2).sum().sum()
         exp_r2 = 1 - (sse / sst)
 
@@ -49,9 +48,9 @@ def test_r2(self):
     def test_regression_results_pvalues(self):
         # checks to see if pvalues are calculated correctly.
         res = RegressionResults(self.results)
-        exp = pd.DataFrame({'Intercept':[0.307081, 0.972395],
+        exp = pd.DataFrame({'Intercept': [0.307081, 0.972395],
                             'X': [0.211391, 0.029677]},
-                            index=['Y1', 'Y2'])
+                           index=['Y1', 'Y2'])
         pdt.assert_frame_equal(res.pvalues, exp,
                                check_exact=False,
                                check_less_precise=True)

From cacb9f394468ee0ba77ec127461bbe227f3f6a7e Mon Sep 17 00:00:00 2001
From: Jamie Morton <jamietmorton@gmail.com>
Date: Thu, 28 Jul 2016 09:56:29 -0700
Subject: [PATCH 4/7] BLD: Adding in statsmodels dependency

---
 ci/pip_requirements.txt | 1 +
 setup.py                | 1 +
 2 files changed, 2 insertions(+)

diff --git a/ci/pip_requirements.txt b/ci/pip_requirements.txt
index 72321f8..41d47e9 100644
--- a/ci/pip_requirements.txt
+++ b/ci/pip_requirements.txt
@@ -1,2 +1,3 @@
 coveralls
 ete3
+statsmodels
diff --git a/setup.py b/setup.py
index 28a305e..e905f5a 100644
--- a/setup.py
+++ b/setup.py
@@ -83,6 +83,7 @@ def finalize_options(self):
           'scipy >= 0.15.1',
           'nose >= 1.3.7',
           'scikit-bio>=0.4.2',
+          'statsmodels',
           'ete3',
       ],
       classifiers=classifiers,

From ab57286dedb4ef8996effc506d2b56b8dc9bce8a Mon Sep 17 00:00:00 2001
From: Jamie Morton <jamietmorton@gmail.com>
Date: Thu, 28 Jul 2016 12:10:41 -0700
Subject: [PATCH 5/7] Addressing @antgonza's comments

---
 gneiss/_summary.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/gneiss/_summary.py b/gneiss/_summary.py
index 0af6ad2..1203734 100644
--- a/gneiss/_summary.py
+++ b/gneiss/_summary.py
@@ -26,7 +26,7 @@ def __init__(self, stat_results,
         ----------
         stat_results : list, sm.RegressionResults
             List of RegressionResults objects.
-        feature_names : array_like, str
+        feature_names : array_like, str, optional
             List of original names for features.
         basis : np.array, optional
             Orthonormal basis in the Aitchison simplex.
@@ -38,15 +38,22 @@ def __init__(self, stat_results,
         self.results = stat_results
 
         # obtain pvalues
-        pvals = pd.DataFrame()
+        self.pvalues = pd.DataFrame()
         for i in range(len(self.results)):
             p = self.results[i].pvalues
             p.name = self.results[i].model.endog_names
-            pvals = pvals.append(p)
-        self.pvalues = pvals
+            self.pvalues = pvals.append(p)
 
-        # calculate the overall R2 value
+        # calculate the overall coefficient of determination (i.e. R2)
+
+        # sum of squares error.  Also referred to as sum of squares residuals
         sse = sum([r.ssr for r in self.results])
+        # sum of squares regression. Also referred to as explained sum of squares.
         ssr = sum([r.ess for r in self.results])
+        # See `statsmodels.regression.linear_model.RegressionResults`
+        # for more explanation on `ess` and `ssr`.
+
         sst = sse + ssr
         self.r2 = 1 - sse / sst
+
+

From 5ef40416859017a9da495d810ca506db44ac8459 Mon Sep 17 00:00:00 2001
From: Jamie Morton <jamietmorton@gmail.com>
Date: Thu, 28 Jul 2016 12:12:05 -0700
Subject: [PATCH 6/7] pep8 and friends

---
 gneiss/_summary.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/gneiss/_summary.py b/gneiss/_summary.py
index 1203734..79d1d05 100644
--- a/gneiss/_summary.py
+++ b/gneiss/_summary.py
@@ -42,18 +42,17 @@ def __init__(self, stat_results,
         for i in range(len(self.results)):
             p = self.results[i].pvalues
             p.name = self.results[i].model.endog_names
-            self.pvalues = pvals.append(p)
+            self.pvalues = self.pvalues.append(p)
 
         # calculate the overall coefficient of determination (i.e. R2)
 
         # sum of squares error.  Also referred to as sum of squares residuals
         sse = sum([r.ssr for r in self.results])
-        # sum of squares regression. Also referred to as explained sum of squares.
+        # sum of squares regression. Also referred to as
+        # explained sum of squares.
         ssr = sum([r.ess for r in self.results])
         # See `statsmodels.regression.linear_model.RegressionResults`
         # for more explanation on `ess` and `ssr`.
 
         sst = sse + ssr
         self.r2 = 1 - sse / sst
-
-

From 491165b5e5aa24dd47e7976ce6a23bcfb11787f5 Mon Sep 17 00:00:00 2001
From: Jamie Morton <jamietmorton@gmail.com>
Date: Thu, 28 Jul 2016 12:24:38 -0700
Subject: [PATCH 7/7] Addressing @antgonza's comments

---
 gneiss/_summary.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/gneiss/_summary.py b/gneiss/_summary.py
index 79d1d05..ed70f4d 100644
--- a/gneiss/_summary.py
+++ b/gneiss/_summary.py
@@ -37,22 +37,23 @@ def __init__(self, stat_results,
         self.basis = basis
         self.results = stat_results
 
-        # obtain pvalues
-        self.pvalues = pd.DataFrame()
-        for i in range(len(self.results)):
-            p = self.results[i].pvalues
-            p.name = self.results[i].model.endog_names
-            self.pvalues = self.pvalues.append(p)
-
-        # calculate the overall coefficient of determination (i.e. R2)
-
         # sum of squares error.  Also referred to as sum of squares residuals
-        sse = sum([r.ssr for r in self.results])
+        sse = 0
         # sum of squares regression. Also referred to as
         # explained sum of squares.
-        ssr = sum([r.ess for r in self.results])
+        ssr = 0
         # See `statsmodels.regression.linear_model.RegressionResults`
         # for more explanation on `ess` and `ssr`.
 
+        # obtain pvalues
+        self.pvalues = pd.DataFrame()
+        for r in self.results:
+            p = r.pvalues
+            p.name = r.model.endog_names
+            self.pvalues = self.pvalues.append(p)
+            sse += r.ssr
+            ssr += r.ess
+
+        # calculate the overall coefficient of determination (i.e. R2)
         sst = sse + ssr
         self.r2 = 1 - sse / sst