Skip to content

Commit

Permalink
Merge pull request #32 from mortonjt/regression_results_part3
Browse files Browse the repository at this point in the history
Regression results part3
  • Loading branch information
wasade authored Aug 18, 2016
2 parents 0f6a56c + 855078e commit ddc3d40
Show file tree
Hide file tree
Showing 3 changed files with 216 additions and 12 deletions.
115 changes: 110 additions & 5 deletions gneiss/_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,11 @@ class RegressionResults():
"""
Summary object for storing regression results.
"""
def __init__(self, stat_results,
def __init__(self,
stat_results,
feature_names=None,
basis=None):
""" Reorganizes statsmodels regression modules.
""" Reorganizes statsmodels regression results modules.
Accepts a list of statsmodels RegressionResults objects
and performs some addition summary statistics.
Expand Down Expand Up @@ -78,11 +79,11 @@ def _check_projection(self, project):
"""
if self.basis is None and project:
raise ValueError("Cannot perform projection into Aitchison simplex"
"if `basis` is not specified.")
" if `basis` is not specified.")

if self.feature_names is None and project:
raise ValueError("Cannot perform projection into Aitchison simplex"
"if `feature_names` is not specified.")
" if `feature_names` is not specified.")

def coefficients(self, project=False):
""" Returns coefficients from fit.
Expand All @@ -91,7 +92,7 @@ def coefficients(self, project=False):
----------
project : bool, optional
Specifies if coefficients should be projected back into
the Aitchison simplex. If false, the coefficients will be
the Aitchison simplex [1]_. If false, the coefficients will be
represented as balances (default: False).
Returns
Expand All @@ -109,6 +110,11 @@ def coefficients(self, project=False):
ValueError:
Cannot perform projection into Aitchison simplex
if `feature_names` is not specified.
References
----------
.. [1] Aitchison, J. "A concise guide to compositional data analysis,
CDA work." Girona 24 (2003): 73-81.
"""
self._check_projection(project)
coef = pd.DataFrame()
Expand All @@ -128,3 +134,102 @@ def coefficients(self, project=False):
columns=coef.columns)
else:
return coef

def residuals(self, project=False):
""" Returns calculated residuals.
Parameters
----------
X : pd.DataFrame, optional
Input table of covariates. If not specified, then the
fitted values calculated from training the model will be
returned.
project : bool, optional
Specifies if coefficients should be projected back into
the Aitchison simplex [1]_. If false, the coefficients will be
represented as balances (default: False).
Returns
-------
pd.DataFrame
A table of values where rows are samples, and the columns
are either balances or proportions, depending on the value of
`project`.
References
----------
.. [1] Aitchison, J. "A concise guide to compositional data analysis,
CDA work." Girona 24 (2003): 73-81.
"""
self._check_projection(project)

resid = pd.DataFrame()

for r in self.results:
err = r.resid
err.name = r.model.endog_names
resid = resid.append(err)

if project:
# `check=False`, due to a problem with error handling
# addressed here https://github.com/biocore/scikit-bio/pull/1396
# This will need to be fixed here:
# https://github.com/biocore/gneiss/issues/34
proj_resid = ilr_inv(resid.values.T, basis=self.basis,
check=False).T
return pd.DataFrame(proj_resid, index=self.feature_names,
columns=resid.columns).T
else:
return resid.T

def predict(self, X=None, project=False, **kwargs):
""" Performs a prediction based on model.
Parameters
----------
X : pd.DataFrame, optional
Input table of covariates, where columns are covariates, and
rows are samples. If not specified, then the fitted values
calculated from training the model will be returned.
project : bool, optional
Specifies if coefficients should be projected back into
the Aitchison simplex [1]_. If false, the coefficients will be
represented as balances (default: False).
**kwargs : dict
Other arguments to be passed into the model prediction.
Returns
-------
pd.DataFrame
A table of values where rows are coefficients, and the columns
are either balances or proportions, depending on the value of
`project`.
References
----------
.. [1] Aitchison, J. "A concise guide to compositional data analysis,
CDA work." Girona 24 (2003): 73-81.
"""
self._check_projection(project)

prediction = pd.DataFrame()
for m in self.results:
# check if X is none.
p = pd.Series(m.predict(X, **kwargs))
p.name = m.model.endog_names
if X is not None:
p.index = X.index
else:
p.index = m.fittedvalues.index
prediction = prediction.append(p)

if project:
# `check=False`, due to a problem with error handling
# addressed here https://github.com/biocore/scikit-bio/pull/1396
# This will need to be fixed here:
# https://github.com/biocore/gneiss/issues/34
proj_prediction = ilr_inv(prediction.values.T, basis=self.basis,
check=False)
return pd.DataFrame(proj_prediction,
columns=self.feature_names,
index=prediction.columns)
return prediction.T
111 changes: 105 additions & 6 deletions gneiss/tests/test_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,16 @@
class TestRegressionResults(unittest.TestCase):

def setUp(self):
self.data = pd.DataFrame([[1, 3, 4, 5, 2, 3, 4],
list(range(1, 8)),
[1, 3, 2, 4, 3, 5, 4]],
columns=['s1', 's2', 's3', 's4',
's5', 's6', 's7'],
index=['Y1', 'Y2', 'X']).T
self.data = pd.DataFrame([[1, 1, 1],
[3, 2, 3],
[4, 3, 2],
[5, 4, 4],
[2, 5, 3],
[3, 6, 5],
[4, 7, 4]],
index=['s1', 's2', 's3', 's4',
's5', 's6', 's7'],
columns=['Y1', 'Y2', 'X'])
model1 = smf.ols(formula="Y1 ~ X", data=self.data)
model2 = smf.ols(formula="Y2 ~ X", data=self.data)
self.results = [model1.fit(), model2.fit()]
Expand Down Expand Up @@ -104,5 +108,100 @@ def test_regression_results_coefficient_project_error(self):
with self.assertRaises(ValueError):
res.coefficients(project=True)

def test_regression_results_residuals_projection(self):
A = np.array # aliasing np.array for the sake of pep8
exp_resid = pd.DataFrame({'s1': ilr_inv(A([-0.986842, -0.236842])),
's2': ilr_inv(A([-0.065789, -1.815789])),
's3': ilr_inv(A([1.473684, 0.473684])),
's4': ilr_inv(A([1.394737, -1.105263])),
's5': ilr_inv(A([-1.065789, 1.184211])),
's6': ilr_inv(A([-1.144737, -0.394737])),
's7': ilr_inv(A([0.394737, 1.894737]))},
index=['Z1', 'Z2', 'Z3']).T
feature_names = ['Z1', 'Z2', 'Z3']
basis = _gram_schmidt_basis(3)
res = RegressionResults(self.results, basis=basis,
feature_names=feature_names)
pdt.assert_frame_equal(res.residuals(project=True), exp_resid,
check_exact=False,
check_less_precise=True)

def test_regression_results_residuals(self):
exp_resid = pd.DataFrame({'s1': [-0.986842, -0.236842],
's2': [-0.065789, -1.815789],
's3': [1.473684, 0.473684],
's4': [1.394737, -1.105263],
's5': [-1.065789, 1.184211],
's6': [-1.144737, -0.394737],
's7': [0.394737, 1.894737]},
index=['Y1', 'Y2']).T
res = RegressionResults(self.results)
pdt.assert_frame_equal(res.residuals(), exp_resid,
check_exact=False,
check_less_precise=True)

def test_regression_results_predict(self):
model = RegressionResults(self.results)
res_predict = model.predict(self.data[['X']])

exp_predict = pd.DataFrame({'s1': [1.986842, 1.236842],
's2': [3.065789, 3.815789],
's3': [2.526316, 2.526316],
's4': [3.605263, 5.105263],
's5': [3.065789, 3.815789],
's6': [4.144737, 6.394737],
's7': [3.605263, 5.105263]},
index=['Y1', 'Y2']).T

pdt.assert_frame_equal(res_predict, exp_predict)

def test_regression_results_predict_extrapolate(self):
model = RegressionResults(self.results)
extrapolate = pd.DataFrame({'X': [8, 9, 10]},
index=['k1', 'k2', 'k3'])
res_predict = model.predict(extrapolate)

exp_predict = pd.DataFrame({'k1': [5.76315789, 10.26315789],
'k2': [6.30263158, 11.55263158],
'k3': [6.84210526, 12.84210526]},
index=['Y1', 'Y2']).T

pdt.assert_frame_equal(res_predict, exp_predict)

def test_regression_results_predict_projection(self):
feature_names = ['Z1', 'Z2', 'Z3']
basis = _gram_schmidt_basis(3)
model = RegressionResults(self.results, basis=basis,
feature_names=feature_names)

res_predict = model.predict(self.data[['X']], project=True)
A = np.array # aliasing np.array for the sake of pep8
exp_predict = pd.DataFrame({'s1': ilr_inv(A([1.986842, 1.236842])),
's2': ilr_inv(A([3.065789, 3.815789])),
's3': ilr_inv(A([2.526316, 2.526316])),
's4': ilr_inv(A([3.605263, 5.105263])),
's5': ilr_inv(A([3.065789, 3.815789])),
's6': ilr_inv(A([4.144737, 6.394737])),
's7': ilr_inv(A([3.605263, 5.105263]))},
index=feature_names).T

pdt.assert_frame_equal(res_predict, exp_predict)

def test_regression_results_predict_none(self):
model = RegressionResults(self.results)
res_predict = model.predict()

exp_predict = pd.DataFrame({'s1': [1.986842, 1.236842],
's2': [3.065789, 3.815789],
's3': [2.526316, 2.526316],
's4': [3.605263, 5.105263],
's5': [3.065789, 3.815789],
's6': [4.144737, 6.394737],
's7': [3.605263, 5.105263]},
index=['Y1', 'Y2']).T

pdt.assert_frame_equal(res_predict, exp_predict)


if __name__ == "__main__":
unittest.main()
2 changes: 1 addition & 1 deletion ipynb/balance_trees.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
"version": "3.5.2"
}
},
"nbformat": 4,
Expand Down

0 comments on commit ddc3d40

Please sign in to comment.