Skip to content

Commit

Permalink
[ENH] Getting pvalue computation ready (#222)
Browse files Browse the repository at this point in the history
* Getting pvalue computation ready and adding unit-tests for Coleman and permutation forest for MIGHT

---------

Signed-off-by: Adam Li <[email protected]>
Co-authored-by: Sambit Panda <[email protected]>
  • Loading branch information
adam2392 and sampan501 authored Feb 21, 2024
1 parent d46e40b commit 95e2597
Show file tree
Hide file tree
Showing 8 changed files with 330 additions and 92 deletions.
1 change: 1 addition & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ tree models.
PermutationForestClassifier
PermutationForestRegressor
build_coleman_forest
build_permutation_forest
build_hyppo_oob_forest
build_hyppo_cv_forest
PermutationHonestForestClassifier
Expand Down
2 changes: 2 additions & 0 deletions doc/whats_new/v0.7.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ Changelog
by `Adam Li`_ (:pr:`#211`)
- |Feature| Introduce a new set of simulations based on Marron and Wand 1992.
by `Sambit Panda`_ (:pr:`#203`)
- |Feature| :func:`sktree.stats.build_coleman_forest` and :func:`sktree.stats.build_permutation_forest`
are added to compute p-values given an estimator and permutation-estimator, by `Adam Li`_ (:pr:`#222`)

Code and Documentation Contributors
-----------------------------------
Expand Down
2 changes: 1 addition & 1 deletion sktree/_lib/sklearn_fork
Submodule sklearn_fork updated 216 files
2 changes: 2 additions & 0 deletions sktree/stats/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
build_coleman_forest,
build_hyppo_cv_forest,
build_hyppo_oob_forest,
build_permutation_forest,
)
from .monte_carlo import PermutationTest
from .permutationforest import PermutationForestClassifier, PermutationForestRegressor
Expand All @@ -18,5 +19,6 @@
"build_hyppo_cv_forest",
"build_hyppo_oob_forest",
"build_coleman_forest",
"build_permutation_forest",
"PermutationHonestForestClassifier",
]
208 changes: 157 additions & 51 deletions sktree/stats/forestht.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import threading
from collections import namedtuple
from typing import Callable, Optional, Tuple, Union

import numpy as np
Expand All @@ -24,6 +25,7 @@
from ..experimental import conditional_resample
from ..tree import DecisionTreeClassifier, DecisionTreeRegressor
from ..tree._classes import DTYPE
from .permuteforest import PermutationHonestForestClassifier
from .utils import (
METRIC_FUNCTIONS,
POSITIVE_METRICS,
Expand Down Expand Up @@ -1212,20 +1214,32 @@ def _parallel_predict_proba_oob(predict_proba, X, out, idx, test_idx, lock):
return prediction


# Lightweight container bundling the outputs of a forest hypothesis test:
# the observed test statistic, the permuted statistic, the observed metric,
# and the resulting p-value.
ForestTestResult = namedtuple(
    "ForestTestResult",
    [
        "observe_test_stat",
        "permuted_stat",
        "observe_stat",
        "pvalue",
    ],
)


def build_coleman_forest(
est,
perm_est,
X,
y,
covariate_index=None,
metric="s@98",
n_repeats=1000,
n_repeats=10_000,
verbose=False,
seed=None,
return_posteriors=True,
**metric_kwargs,
):
"""Build a hypothesis testing forest using oob samples.
"""Build a hypothesis testing forest using a two-forest approach.
The two-forest approach stems from the Coleman et al. 2022 paper, where
two forests are trained: one on the original dataset, and one on the
permuted dataset. The dataset is either permuted once, or independently for
each tree in the permuted forest. The original test statistic is computed by
comparing the metric on both forests ``(metric_forest - metric_perm_forest)``.
For full details, see :footcite:`coleman2022scalable`.
Parameters
----------
Expand All @@ -1244,7 +1258,9 @@ def build_coleman_forest(
98% specificity.
n_repeats : int, optional
Number of times to bootstrap sample the two forests to construct
the null distribution, by default 1000.
the null distribution, by default 10000. The construction of the
null forests will be parallelized according to the ``n_jobs``
argument of the ``est`` forest.
verbose : bool, optional
Verbosity, by default False.
seed : int, optional
Expand All @@ -1267,38 +1283,38 @@ def build_coleman_forest(
perm_forest_proba : ArrayLike of shape (n_estimators, n_samples, n_outputs)
The predicted posterior probabilities for each of the permuted estimators
on their out of bag samples.
References
----------
.. footbibliography::
"""
rng = np.random.default_rng(seed)
metric_func: Callable[[ArrayLike, ArrayLike], float] = METRIC_FUNCTIONS[metric]

if covariate_index is None:
covariate_index = np.arange(X.shape[1], dtype=int)

# perform permutation of covariates
# TODO: refactor permutations into the HonestForest(?)
n_samples_train = X.shape[0]
index_arr = rng.choice(
np.arange(n_samples_train, dtype=int),
size=(n_samples_train, 1),
replace=False,
shuffle=True,
)
X_permute = X.copy()
X_permute[:, covariate_index] = X_permute[index_arr, covariate_index]
if not isinstance(perm_est, PermutationHonestForestClassifier):
raise RuntimeError(
f"Permutation forest must be a PermutationHonestForestClassifier, got {type(perm_est)}"
)

# build two sets of forests
est, orig_forest_proba = build_hyppo_oob_forest(est, X, y, verbose=verbose)
perm_est, perm_forest_proba = build_hyppo_oob_forest(
perm_est, X_permute, y, verbose=verbose, covariate_index=covariate_index
perm_est, X, y, verbose=verbose, covariate_index=covariate_index
)

# get the number of jobs
n_jobs = est.n_jobs

metric_star, metric_star_pi = _compute_null_distribution_coleman(
y,
orig_forest_proba,
perm_forest_proba,
metric,
n_repeats=n_repeats,
seed=seed,
n_jobs=n_jobs,
**metric_kwargs,
)

Expand All @@ -1320,10 +1336,133 @@ def build_coleman_forest(
else:
pvalue = (1 + (null_dist >= observe_test_stat).sum()) / (1 + n_repeats)

forest_result = ForestTestResult(observe_test_stat, permute_stat, observe_stat, pvalue)
if return_posteriors:
return observe_test_stat, pvalue, orig_forest_proba, perm_forest_proba
return forest_result, orig_forest_proba, perm_forest_proba
else:
return observe_test_stat, pvalue
return forest_result


def build_permutation_forest(
    est,
    perm_est,
    X,
    y,
    covariate_index=None,
    metric="s@98",
    n_repeats=500,
    verbose=False,
    seed=None,
    return_posteriors=True,
    **metric_kwargs,
):
    """Build a hypothesis testing forest using a permutation-forest approach.

    The permutation-forest approach stems from standard permutation-testing,
    where each null forest is trained on a fresh permutation of the dataset.
    The original test statistic is computed on the original data. Then the
    pvalue is computed by comparing the original test statistic to the null
    distribution of test statistics computed from the permuted forests.

    Parameters
    ----------
    est : Forest
        The type of forest to use. Must be enabled with ``bootstrap=True``.
    perm_est : Forest
        The forest to use for the permuted dataset. Must be a
        ``PermutationHonestForestClassifier``.
    X : ArrayLike of shape (n_samples, n_features)
        Data.
    y : ArrayLike of shape (n_samples, n_outputs)
        Binary target, so ``n_outputs`` should be at most 1.
    covariate_index : ArrayLike, optional of shape (n_covariates,)
        The index array of covariates to shuffle, by default None, which
        shuffles all covariates.
    metric : str, optional
        The metric to compute, by default "s@98", for sensitivity at
        98% specificity.
    n_repeats : int, optional
        Number of times to permute the covariates and refit a null forest to
        construct the null distribution, by default 500.
    verbose : bool, optional
        Verbosity, by default False.
    seed : int, optional
        Random seed, by default None.
    return_posteriors : bool, optional
        Whether or not to return the posteriors, by default True.
    **metric_kwargs : dict, optional
        Additional keyword arguments to pass to the metric function.

    Returns
    -------
    forest_result : ForestTestResult
        The observed test statistic, the permuted statistic of the last null
        repetition, and the p-value.
    orig_forest_proba : ArrayLike of shape (n_estimators, n_samples, n_outputs)
        The predicted posterior probabilities for each estimator on their
        out of bag samples. Only returned if ``return_posteriors`` is True.
    perm_forest_proba : ArrayLike of shape (n_estimators, n_samples, n_outputs)
        The predicted posterior probabilities of the last permuted forest on
        its out of bag samples. Only returned if ``return_posteriors`` is True.

    References
    ----------
    .. footbibliography::
    """
    rng = np.random.default_rng(seed)
    metric_func: Callable[[ArrayLike, ArrayLike], float] = METRIC_FUNCTIONS[metric]

    if covariate_index is None:
        covariate_index = np.arange(X.shape[1], dtype=int)

    if not isinstance(perm_est, PermutationHonestForestClassifier):
        raise RuntimeError(
            f"Permutation forest must be a PermutationHonestForestClassifier, got {type(perm_est)}"
        )

    # train the original forest on unpermuted data
    est, orig_forest_proba = build_hyppo_oob_forest(est, X, y, verbose=verbose)
    y_pred_proba_orig = np.nanmean(orig_forest_proba, axis=0)
    observe_test_stat = metric_func(y, y_pred_proba_orig, **metric_kwargs)

    # column of row indices, re-shuffled in place for every null repetition
    index_arr = np.arange(X.shape[0], dtype=int).reshape(-1, 1)

    # train many null forests
    X_perm = X.copy()
    null_dist = []
    for _ in range(n_repeats):
        rng.shuffle(index_arr)
        perm_X_cov = X_perm[index_arr, covariate_index]
        X_perm[:, covariate_index] = perm_X_cov

        # fit a fresh clone of the permutation forest with a new random state
        # NOTE(review): ``covariate_index`` is also forwarded to the permutation
        # forest below, which may shuffle the covariates again internally on top
        # of the manual shuffle here — confirm the double permutation is intended.
        perm_est = clone(perm_est)
        perm_est.set_params(random_state=rng.integers(0, np.iinfo(np.int32).max))

        perm_est, perm_forest_proba = build_hyppo_oob_forest(
            perm_est, X_perm, y, verbose=verbose, covariate_index=covariate_index
        )

        y_pred_proba_perm = np.nanmean(perm_forest_proba, axis=0)
        permute_stat = metric_func(y, y_pred_proba_perm, **metric_kwargs)
        null_dist.append(permute_stat)

    # convert to an array so the vectorized comparisons below work; a plain
    # Python list would raise TypeError on ``null_dist >= observe_test_stat``
    null_dist = np.asarray(null_dist)

    # compute pvalue, which note is opposite that of the Coleman approach, since
    # we are testing if the null distribution results in a test statistic greater
    if metric in POSITIVE_METRICS:
        pvalue = (1 + (null_dist >= observe_test_stat).sum()) / (1 + n_repeats)
    else:
        pvalue = (1 + (null_dist <= observe_test_stat).sum()) / (1 + n_repeats)

    # ``permute_stat`` carries the statistic from the last null repetition only
    forest_result = ForestTestResult(observe_test_stat, permute_stat, None, pvalue)
    if return_posteriors:
        return forest_result, orig_forest_proba, perm_forest_proba
    else:
        return forest_result


def build_hyppo_oob_forest(est, X, y, verbose=False, **est_kwargs):
Expand Down Expand Up @@ -1474,36 +1613,3 @@ def build_hyppo_cv_forest(
est_list.append(est)

return est_list, all_proba_list


def predict_oob_proba(est, X, verbose=False):
    """Predict out-of-bag posterior probabilities for a fitted forest.

    Parameters
    ----------
    est : Forest
        A fitted forest exposing ``estimators_`` and ``oob_samples_``.
    X : ArrayLike of shape (n_samples, n_features)
        Data to predict on.
    verbose : bool, optional
        Verbosity forwarded to joblib, by default False.

    Returns
    -------
    oob_proba : ArrayLike of shape (n_estimators, n_samples, n_classes)
        Per-tree posterior probabilities; entries for samples a tree saw
        in-bag remain NaN.
    """
    from sklearn.utils.validation import check_is_fitted

    check_is_fitted(est)

    # validate the prediction input
    X = est._validate_X_predict(X)

    # if we trained a binning tree, then we should re-bin the data
    # XXX: inefficient — ideally the binning thresholds would be passed into
    # the tree itself (as the Histogram Gradient Boosting Tree does) so node
    # feature-value thresholds could be set within the tree.
    if est.max_bins is not None:
        X = est._bin_data(X, is_training_data=False).astype(DTYPE)

    # split the trees across the available jobs
    n_jobs, _, _ = _partition_estimators(est.n_estimators, est.n_jobs)

    # accumulate every tree's prediction into one pre-allocated array rather
    # than storing each estimator's output separately
    oob_proba = np.full(
        (len(est.estimators_), X.shape[0], est.n_classes_), np.nan, dtype=np.float64
    )
    # the workers write into shared memory, so guard updates with a lock
    write_lock = threading.Lock()
    Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")(
        delayed(_parallel_predict_proba_oob)(
            tree.predict_proba, X, oob_proba, tree_idx, oob_idx, write_lock
        )
        for tree_idx, (tree, oob_idx) in enumerate(zip(est.estimators_, est.oob_samples_))
    )
    return oob_proba
Loading

0 comments on commit 95e2597

Please sign in to comment.