Merge pull request #32 from mortonjt/regression_results_part3

Regression results part3
biocore · Aug 18, 2016 · ddc3d40 · ddc3d40
2 parents 0f6a56c + 855078e
commit ddc3d40
Show file tree

Hide file tree

Showing 3 changed files with 216 additions and 12 deletions.
diff --git a/gneiss/_summary.py b/gneiss/_summary.py
@@ -15,10 +15,11 @@ class RegressionResults():
     """
     Summary object for storing regression results.
     """
-    def __init__(self, stat_results,
+    def __init__(self,
+                 stat_results,
                  feature_names=None,
                  basis=None):
-        """ Reorganizes statsmodels regression modules.
+        """ Reorganizes statsmodels regression results modules.
 
         Accepts a list of statsmodels RegressionResults objects
         and performs some addition summary statistics.
@@ -78,11 +79,11 @@ def _check_projection(self, project):
         """
         if self.basis is None and project:
             raise ValueError("Cannot perform projection into Aitchison simplex"
-                             "if `basis` is not specified.")
+                             " if `basis` is not specified.")
 
         if self.feature_names is None and project:
             raise ValueError("Cannot perform projection into Aitchison simplex"
-                             "if `feature_names` is not specified.")
+                             " if `feature_names` is not specified.")
 
     def coefficients(self, project=False):
         """ Returns coefficients from fit.
@@ -91,7 +92,7 @@ def coefficients(self, project=False):
         ----------
         project : bool, optional
             Specifies if coefficients should be projected back into
-            the Aitchison simplex.  If false, the coefficients will be
+            the Aitchison simplex [1]_.  If false, the coefficients will be
             represented as balances  (default: False).
 
         Returns
@@ -109,6 +110,11 @@ def coefficients(self, project=False):
         ValueError:
             Cannot perform projection into Aitchison simplex
             if `feature_names` is not specified.
+
+        References
+        ----------
+        .. [1] Aitchison, J. "A concise guide to compositional data analysis,
+           CDA work." Girona 24 (2003): 73-81.
         """
         self._check_projection(project)
         coef = pd.DataFrame()
@@ -128,3 +134,102 @@ def coefficients(self, project=False):
                                 columns=coef.columns)
         else:
             return coef
+
+    def residuals(self, project=False):
+        """ Returns calculated residuals.
+
+        Parameters
+        ----------
+        X : pd.DataFrame, optional
+            Input table of covariates.  If not specified, then the
+            fitted values calculated from training the model will be
+            returned.
+        project : bool, optional
+            Specifies if coefficients should be projected back into
+            the Aitchison simplex [1]_.  If false, the coefficients will be
+            represented as balances  (default: False).
+        Returns
+        -------
+        pd.DataFrame
+            A table of values where rows are samples, and the columns
+            are either balances or proportions, depending on the value of
+            `project`.
+
+        References
+        ----------
+        .. [1] Aitchison, J. "A concise guide to compositional data analysis,
+           CDA work." Girona 24 (2003): 73-81.
+        """
+        self._check_projection(project)
+
+        resid = pd.DataFrame()
+
+        for r in self.results:
+            err = r.resid
+            err.name = r.model.endog_names
+            resid = resid.append(err)
+
+        if project:
+            # `check=False`, due to a problem with error handling
+            # addressed here https://github.com/biocore/scikit-bio/pull/1396
+            # This will need to be fixed here:
+            # https://github.com/biocore/gneiss/issues/34
+            proj_resid = ilr_inv(resid.values.T, basis=self.basis,
+                                 check=False).T
+            return pd.DataFrame(proj_resid, index=self.feature_names,
+                                columns=resid.columns).T
+        else:
+            return resid.T
+
+    def predict(self, X=None, project=False, **kwargs):
+        """ Performs a prediction based on model.
+
+        Parameters
+        ----------
+        X : pd.DataFrame, optional
+            Input table of covariates, where columns are covariates, and
+            rows are samples.  If not specified, then the fitted values
+            calculated from training the model will be returned.
+        project : bool, optional
+            Specifies if coefficients should be projected back into
+            the Aitchison simplex [1]_.  If false, the coefficients will be
+            represented as balances  (default: False).
+        **kwargs : dict
+            Other arguments to be passed into the model prediction.
+
+        Returns
+        -------
+        pd.DataFrame
+            A table of values where rows are coefficients, and the columns
+            are either balances or proportions, depending on the value of
+            `project`.
+
+        References
+        ----------
+        .. [1] Aitchison, J. "A concise guide to compositional data analysis,
+           CDA work." Girona 24 (2003): 73-81.
+        """
+        self._check_projection(project)
+
+        prediction = pd.DataFrame()
+        for m in self.results:
+            # check if X is none.
+            p = pd.Series(m.predict(X, **kwargs))
+            p.name = m.model.endog_names
+            if X is not None:
+                p.index = X.index
+            else:
+                p.index = m.fittedvalues.index
+            prediction = prediction.append(p)
+
+        if project:
+            # `check=False`, due to a problem with error handling
+            # addressed here https://github.com/biocore/scikit-bio/pull/1396
+            # This will need to be fixed here:
+            # https://github.com/biocore/gneiss/issues/34
+            proj_prediction = ilr_inv(prediction.values.T, basis=self.basis,
+                                      check=False)
+            return pd.DataFrame(proj_prediction,
+                                columns=self.feature_names,
+                                index=prediction.columns)
+        return prediction.T
diff --git a/gneiss/tests/test_summary.py b/gneiss/tests/test_summary.py
@@ -19,12 +19,16 @@
 class TestRegressionResults(unittest.TestCase):
 
     def setUp(self):
-        self.data = pd.DataFrame([[1, 3, 4, 5, 2, 3, 4],
-                                  list(range(1, 8)),
-                                  [1, 3, 2, 4, 3, 5, 4]],
-                                 columns=['s1', 's2', 's3', 's4',
-                                          's5', 's6', 's7'],
-                                 index=['Y1', 'Y2', 'X']).T
+        self.data = pd.DataFrame([[1, 1, 1],
+                                  [3, 2, 3],
+                                  [4, 3, 2],
+                                  [5, 4, 4],
+                                  [2, 5, 3],
+                                  [3, 6, 5],
+                                  [4, 7, 4]],
+                                 index=['s1', 's2', 's3', 's4',
+                                        's5', 's6', 's7'],
+                                 columns=['Y1', 'Y2', 'X'])
         model1 = smf.ols(formula="Y1 ~ X", data=self.data)
         model2 = smf.ols(formula="Y2 ~ X", data=self.data)
         self.results = [model1.fit(), model2.fit()]
@@ -104,5 +108,100 @@ def test_regression_results_coefficient_project_error(self):
         with self.assertRaises(ValueError):
             res.coefficients(project=True)
 
+    def test_regression_results_residuals_projection(self):
+        A = np.array  # aliasing np.array for the sake of pep8
+        exp_resid = pd.DataFrame({'s1': ilr_inv(A([-0.986842, -0.236842])),
+                                  's2': ilr_inv(A([-0.065789, -1.815789])),
+                                  's3': ilr_inv(A([1.473684, 0.473684])),
+                                  's4': ilr_inv(A([1.394737, -1.105263])),
+                                  's5': ilr_inv(A([-1.065789, 1.184211])),
+                                  's6': ilr_inv(A([-1.144737, -0.394737])),
+                                  's7': ilr_inv(A([0.394737, 1.894737]))},
+                                 index=['Z1', 'Z2', 'Z3']).T
+        feature_names = ['Z1', 'Z2', 'Z3']
+        basis = _gram_schmidt_basis(3)
+        res = RegressionResults(self.results, basis=basis,
+                                feature_names=feature_names)
+        pdt.assert_frame_equal(res.residuals(project=True), exp_resid,
+                               check_exact=False,
+                               check_less_precise=True)
+
+    def test_regression_results_residuals(self):
+        exp_resid = pd.DataFrame({'s1': [-0.986842, -0.236842],
+                                  's2': [-0.065789, -1.815789],
+                                  's3': [1.473684, 0.473684],
+                                  's4': [1.394737, -1.105263],
+                                  's5': [-1.065789, 1.184211],
+                                  's6': [-1.144737, -0.394737],
+                                  's7': [0.394737, 1.894737]},
+                                 index=['Y1', 'Y2']).T
+        res = RegressionResults(self.results)
+        pdt.assert_frame_equal(res.residuals(), exp_resid,
+                               check_exact=False,
+                               check_less_precise=True)
+
+    def test_regression_results_predict(self):
+        model = RegressionResults(self.results)
+        res_predict = model.predict(self.data[['X']])
+
+        exp_predict = pd.DataFrame({'s1': [1.986842, 1.236842],
+                                    's2': [3.065789, 3.815789],
+                                    's3': [2.526316, 2.526316],
+                                    's4': [3.605263, 5.105263],
+                                    's5': [3.065789, 3.815789],
+                                    's6': [4.144737, 6.394737],
+                                    's7': [3.605263, 5.105263]},
+                                   index=['Y1', 'Y2']).T
+
+        pdt.assert_frame_equal(res_predict, exp_predict)
+
+    def test_regression_results_predict_extrapolate(self):
+        model = RegressionResults(self.results)
+        extrapolate = pd.DataFrame({'X': [8, 9, 10]},
+                                   index=['k1', 'k2', 'k3'])
+        res_predict = model.predict(extrapolate)
+
+        exp_predict = pd.DataFrame({'k1': [5.76315789, 10.26315789],
+                                    'k2': [6.30263158, 11.55263158],
+                                    'k3': [6.84210526, 12.84210526]},
+                                   index=['Y1', 'Y2']).T
+
+        pdt.assert_frame_equal(res_predict, exp_predict)
+
+    def test_regression_results_predict_projection(self):
+        feature_names = ['Z1', 'Z2', 'Z3']
+        basis = _gram_schmidt_basis(3)
+        model = RegressionResults(self.results, basis=basis,
+                                  feature_names=feature_names)
+
+        res_predict = model.predict(self.data[['X']], project=True)
+        A = np.array  # aliasing np.array for the sake of pep8
+        exp_predict = pd.DataFrame({'s1': ilr_inv(A([1.986842, 1.236842])),
+                                    's2': ilr_inv(A([3.065789, 3.815789])),
+                                    's3': ilr_inv(A([2.526316, 2.526316])),
+                                    's4': ilr_inv(A([3.605263, 5.105263])),
+                                    's5': ilr_inv(A([3.065789, 3.815789])),
+                                    's6': ilr_inv(A([4.144737, 6.394737])),
+                                    's7': ilr_inv(A([3.605263, 5.105263]))},
+                                   index=feature_names).T
+
+        pdt.assert_frame_equal(res_predict, exp_predict)
+
+    def test_regression_results_predict_none(self):
+        model = RegressionResults(self.results)
+        res_predict = model.predict()
+
+        exp_predict = pd.DataFrame({'s1': [1.986842, 1.236842],
+                                    's2': [3.065789, 3.815789],
+                                    's3': [2.526316, 2.526316],
+                                    's4': [3.605263, 5.105263],
+                                    's5': [3.065789, 3.815789],
+                                    's6': [4.144737, 6.394737],
+                                    's7': [3.605263, 5.105263]},
+                                   index=['Y1', 'Y2']).T
+
+        pdt.assert_frame_equal(res_predict, exp_predict)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/ipynb/balance_trees.ipynb b/ipynb/balance_trees.ipynb
@@ -459,7 +459,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.5.1"
+   "version": "3.5.2"
   }
  },
  "nbformat": 4,