Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Getting pvalue computation ready #222

Merged
merged 8 commits into from
Feb 21, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ tree models.
PermutationForestClassifier
PermutationForestRegressor
build_coleman_forest
build_permutation_forest
build_hyppo_oob_forest
build_hyppo_cv_forest
PermutationHonestForestClassifier
Expand Down
2 changes: 2 additions & 0 deletions doc/whats_new/v0.7.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ Changelog
by `Adam Li`_ (:pr:`#211`)
- |Feature| Introduce a new set of simulations based on Marron and Wand 1992.
by `Sambit Panda`_ (:pr:`#203`)
- |Feature| :func:`sktree.stats.build_coleman_forest` and :func:`sktree.stats.build_permutation_forest`
are added to compute p-values given an estimator and permutation-estimator, `Adam Li`_ (:pr:`#222`)

Code and Documentation Contributors
-----------------------------------
Expand Down
2 changes: 2 additions & 0 deletions sktree/stats/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
build_coleman_forest,
build_hyppo_cv_forest,
build_hyppo_oob_forest,
build_permutation_forest,
)
from .monte_carlo import PermutationTest
from .permutationforest import PermutationForestClassifier, PermutationForestRegressor
Expand All @@ -18,5 +19,6 @@
"build_hyppo_cv_forest",
"build_hyppo_oob_forest",
"build_coleman_forest",
"build_permutation_forest",
"PermutationHonestForestClassifier",
]
182 changes: 169 additions & 13 deletions sktree/stats/forestht.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from ..experimental import conditional_resample
from ..tree import DecisionTreeClassifier, DecisionTreeRegressor
from ..tree._classes import DTYPE
from .permuteforest import PermutationHonestForestClassifier
from .utils import (
METRIC_FUNCTIONS,
POSITIVE_METRICS,
Expand Down Expand Up @@ -1219,13 +1220,20 @@ def build_coleman_forest(
y,
covariate_index=None,
metric="s@98",
n_repeats=1000,
n_repeats=10_000,
verbose=False,
seed=None,
return_posteriors=True,
**metric_kwargs,
):
"""Build a hypothesis testing forest using oob samples.
"""Build a hypothesis testing forest using a two-forest approach.

The two-forest approach stems from the Coleman et al. 2022 paper, where
two forests are trained: one on the original dataset, and one on the
permuted dataset. The dataset is either permuted once, or independently for
each tree in the permuted forest. The original test statistic is computed by
comparing the metric on both forests ``(metric_forest - metric_perm_forest)``.
For full details, see :footcite:`coleman2022scalable`.

Parameters
----------
Expand All @@ -1244,7 +1252,9 @@ def build_coleman_forest(
98% specificity.
n_repeats : int, optional
Number of times to bootstrap sample the two forests to construct
the null distribution, by default 1000.
the null distribution, by default 10000. The construction of the
null forests will be parallelized according to the ``n_jobs``
argument of the ``est`` forest.
verbose : bool, optional
Verbosity, by default False.
seed : int, optional
Expand All @@ -1267,38 +1277,183 @@ def build_coleman_forest(
perm_forest_proba : ArrayLike of shape (n_estimators, n_samples, n_outputs)
The predicted posterior probabilities for each of the permuted estimators
on their out of bag samples.

References
----------
.. footbibliography::
"""
rng = np.random.default_rng(seed)
metric_func: Callable[[ArrayLike, ArrayLike], float] = METRIC_FUNCTIONS[metric]

if covariate_index is None:
covariate_index = np.arange(X.shape[1], dtype=int)

if not isinstance(perm_est, PermutationHonestForestClassifier):
raise RuntimeError(
f"Permutation forest must be a PermutationHonestForestClassifier, got {type(perm_est)}"
)
# perform permutation of covariates
# TODO: refactor permutations into the HonestForest(?)
n_samples_train = X.shape[0]
index_arr = rng.choice(
np.arange(n_samples_train, dtype=int),
size=(n_samples_train, 1),
replace=False,
shuffle=True,
# n_samples_train = X.shape[0]
# index_arr = rng.choice(
# np.arange(n_samples_train, dtype=int),
# size=(n_samples_train, 1),
# replace=False,
# shuffle=True,
# )
# X_permute = X.copy()
# X_permute[:, covariate_index] = X_permute[index_arr, covariate_index]

# build two sets of forests
est, orig_forest_proba = build_hyppo_oob_forest(est, X, y, verbose=verbose)
perm_est, perm_forest_proba = build_hyppo_oob_forest(
perm_est, X, y, verbose=verbose, covariate_index=covariate_index
)
X_permute = X.copy()
X_permute[:, covariate_index] = X_permute[index_arr, covariate_index]

# get the number of jobs
n_jobs = est.n_jobs

metric_star, metric_star_pi = _compute_null_distribution_coleman(
y,
orig_forest_proba,
perm_forest_proba,
metric,
n_repeats=n_repeats,
seed=seed,
n_jobs=n_jobs,
**metric_kwargs,
)

y_pred_proba_orig = np.nanmean(orig_forest_proba, axis=0)
y_pred_proba_perm = np.nanmean(perm_forest_proba, axis=0)
observe_stat = metric_func(y, y_pred_proba_orig, **metric_kwargs)
permute_stat = metric_func(y, y_pred_proba_perm, **metric_kwargs)

# metric^\pi - metric = observed test statistic, which under the
# null is normally distributed around 0
observe_test_stat = permute_stat - observe_stat

# metric^\pi_j - metric_j, which is centered at 0
null_dist = metric_star_pi - metric_star

# compute pvalue
if metric in POSITIVE_METRICS:
pvalue = (1 + (null_dist <= observe_test_stat).sum()) / (1 + n_repeats)
else:
pvalue = (1 + (null_dist >= observe_test_stat).sum()) / (1 + n_repeats)

if return_posteriors:
return observe_test_stat, pvalue, orig_forest_proba, perm_forest_proba
else:
return observe_test_stat, pvalue


def build_permutation_forest(
est,
perm_est,
X,
y,
covariate_index=None,
metric="s@98",
n_repeats=500,
verbose=False,
seed=None,
return_posteriors=True,
**metric_kwargs,
):
"""Build a hypothesis testing forest using a permutation-forest approach.

The permutation-forest approach stems from standard permutation-testing, where
each forest is trained on a new permutation of the dataset. The original test
statistic is computed on the original data. Then the pvalue is computed
by comparing the original test statistic to the null distribution of the
test statistic computed from the permuted forests.

Parameters
----------
est : Forest
The type of forest to use. Must be enabled with ``bootstrap=True``.
perm_est : Forest
The forest to use for the permuted dataset. Should be
``PermutationHonestForestClassifier``.
X : ArrayLike of shape (n_samples, n_features)
Data.
y : ArrayLike of shape (n_samples, n_outputs)
Binary target, so ``n_outputs`` should be at most 1.
covariate_index : ArrayLike, optional of shape (n_covariates,)
The index array of covariates to shuffle, by default None.
metric : str, optional
The metric to compute, by default "s@98", for sensitivity at
98% specificity.
n_repeats : int, optional
Number of times to bootstrap sample the two forests to construct
the null distribution, by default 500. The construction of the
null forests will be parallelized according to the ``n_jobs``
argument of the ``est`` forest.
verbose : bool, optional
Verbosity, by default False.
seed : int, optional
Random seed, by default None.
return_posteriors : bool, optional
Whether or not to return the posteriors, by default True.
**metric_kwargs : dict, optional
Additional keyword arguments to pass to the metric function.

Returns
-------
observe_stat : float
The test statistic. To compute the test statistic, take
``permute_stat_`` and subtract ``observe_stat_``.
pvalue : float
The p-value of the test statistic.
orig_forest_proba : ArrayLike of shape (n_estimators, n_samples, n_outputs)
The predicted posterior probabilities for each estimator on their
out of bag samples.
perm_forest_proba : ArrayLike of shape (n_estimators, n_samples, n_outputs)
The predicted posterior probabilities for each of the permuted estimators
on their out of bag samples.

References
----------
.. footbibliography::
"""
metric_func: Callable[[ArrayLike, ArrayLike], float] = METRIC_FUNCTIONS[metric]

if covariate_index is None:
covariate_index = np.arange(X.shape[1], dtype=int)

if not isinstance(perm_est, PermutationHonestForestClassifier):
raise RuntimeError(
f"Permutation forest must be a PermutationHonestForestClassifier, got {type(perm_est)}"
)
# perform permutation of covariates
# TODO: refactor permutations into the HonestForest(?)
# n_samples_train = X.shape[0]
# index_arr = rng.choice(
# np.arange(n_samples_train, dtype=int),
# size=(n_samples_train, 1),
# replace=False,
# shuffle=True,
# )
# X_permute = X.copy()
# X_permute[:, covariate_index] = X_permute[index_arr, covariate_index]

# build two sets of forests
est, orig_forest_proba = build_hyppo_oob_forest(est, X, y, verbose=verbose)
perm_est, perm_forest_proba = build_hyppo_oob_forest(
perm_est, X_permute, y, verbose=verbose, covariate_index=covariate_index
perm_est, X, y, verbose=verbose, covariate_index=covariate_index
)

# get the number of jobs
n_jobs = est.n_jobs

metric_star, metric_star_pi = _compute_null_distribution_coleman(
y,
orig_forest_proba,
perm_forest_proba,
metric,
n_repeats=n_repeats,
seed=seed,
n_jobs=n_jobs,
**metric_kwargs,
)

Expand Down Expand Up @@ -1476,6 +1631,7 @@ def build_hyppo_cv_forest(
return est_list, all_proba_list


# XXX: can delete?
def predict_oob_proba(est, X, verbose=False):
"""Predict out of bag posterior probabilities."""
from sklearn.utils.validation import check_is_fitted
Expand Down
Loading
Loading