diff --git a/doc/api.rst b/doc/api.rst index 18fdf5195..793adb7a5 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -150,6 +150,7 @@ tree models. PermutationForestClassifier PermutationForestRegressor build_coleman_forest + build_permutation_forest build_hyppo_oob_forest build_hyppo_cv_forest PermutationHonestForestClassifier diff --git a/doc/whats_new/v0.7.rst b/doc/whats_new/v0.7.rst index 717faee65..2d474beca 100644 --- a/doc/whats_new/v0.7.rst +++ b/doc/whats_new/v0.7.rst @@ -25,6 +25,8 @@ Changelog by `Adam Li`_ (:pr:`#211`) - |Feature| Introduce a new set of simulations based on Marron and Wand 1992. by `Sambit Panda`_ (:pr:`#203`) +- |Feature| :func:`sktree.stats.build_coleman_forest` and :func:`sktree.stats.build_permutation_forest` + are added to compute p-values given an estimator and permutation-estimator, `Adam Li`_ (:pr:`#222`) Code and Documentation Contributors ----------------------------------- diff --git a/sktree/_lib/sklearn_fork b/sktree/_lib/sklearn_fork index d9c9a3918..db91ddb6f 160000 --- a/sktree/_lib/sklearn_fork +++ b/sktree/_lib/sklearn_fork @@ -1 +1 @@ -Subproject commit d9c9a3918a8de5f0984a25d4a7c1968ff18c916d +Subproject commit db91ddb6f090ac091588dd844be6d9c60afc341f diff --git a/sktree/stats/__init__.py b/sktree/stats/__init__.py index ede7a5f8f..39d09234b 100644 --- a/sktree/stats/__init__.py +++ b/sktree/stats/__init__.py @@ -4,6 +4,7 @@ build_coleman_forest, build_hyppo_cv_forest, build_hyppo_oob_forest, + build_permutation_forest, ) from .monte_carlo import PermutationTest from .permutationforest import PermutationForestClassifier, PermutationForestRegressor @@ -18,5 +19,6 @@ "build_hyppo_cv_forest", "build_hyppo_oob_forest", "build_coleman_forest", + "build_permutation_forest", "PermutationHonestForestClassifier", ] diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py index 9fa3a837a..9b9aef692 100644 --- a/sktree/stats/forestht.py +++ b/sktree/stats/forestht.py @@ -1,4 +1,5 @@ import threading +from collections import namedtuple from typing import Callable, Optional, Tuple, Union import numpy as np @@ -24,6 +25,7 @@ from ..experimental import conditional_resample from ..tree import DecisionTreeClassifier, DecisionTreeRegressor from ..tree._classes import DTYPE +from .permuteforest import PermutationHonestForestClassifier from .utils import ( METRIC_FUNCTIONS, POSITIVE_METRICS, @@ -1212,6 +1214,11 @@ def _parallel_predict_proba_oob(predict_proba, X, out, idx, test_idx, lock): return prediction +ForestTestResult = namedtuple( + "ForestTestResult", ["observe_test_stat", "permuted_stat", "observe_stat", "pvalue"] +) + + def build_coleman_forest( est, perm_est, @@ -1219,13 +1226,20 @@ def build_coleman_forest( y, covariate_index=None, metric="s@98", - n_repeats=1000, + n_repeats=10_000, verbose=False, seed=None, return_posteriors=True, **metric_kwargs, ): - """Build a hypothesis testing forest using oob samples. + """Build a hypothesis testing forest using a two-forest approach. + + The two-forest approach stems from the Coleman et al. 2022 paper, where + two forests are trained: one on the original dataset, and one on the + permuted dataset. The dataset is either permuted once, or independently for + each tree in the permuted forest. The original test statistic is computed by + comparing the metric on both forests ``(metric_forest - metric_perm_forest)``. + For full details, see :footcite:`coleman2022scalable`. Parameters ---------- @@ -1244,7 +1258,9 @@ def build_coleman_forest( 98% specificity. n_repeats : int, optional Number of times to bootstrap sample the two forests to construct - the null distribution, by default 1000. + the null distribution, by default 10000. The construction of the + null forests will be parallelized according to the ``n_jobs`` + argument of the ``est`` forest. verbose : bool, optional Verbosity, by default False. seed : int, optional @@ -1267,31 +1283,30 @@ def build_coleman_forest( perm_forest_proba : ArrayLike of shape (n_estimators, n_samples, n_outputs) The predicted posterior probabilities for each of the permuted estimators on their out of bag samples. + + References + ---------- + .. footbibliography:: """ - rng = np.random.default_rng(seed) metric_func: Callable[[ArrayLike, ArrayLike], float] = METRIC_FUNCTIONS[metric] if covariate_index is None: covariate_index = np.arange(X.shape[1], dtype=int) - # perform permutation of covariates - # TODO: refactor permutations into the HonestForest(?) - n_samples_train = X.shape[0] - index_arr = rng.choice( - np.arange(n_samples_train, dtype=int), - size=(n_samples_train, 1), - replace=False, - shuffle=True, - ) - X_permute = X.copy() - X_permute[:, covariate_index] = X_permute[index_arr, covariate_index] + if not isinstance(perm_est, PermutationHonestForestClassifier): + raise RuntimeError( + f"Permutation forest must be a PermutationHonestForestClassifier, got {type(perm_est)}" + ) # build two sets of forests est, orig_forest_proba = build_hyppo_oob_forest(est, X, y, verbose=verbose) perm_est, perm_forest_proba = build_hyppo_oob_forest( - perm_est, X_permute, y, verbose=verbose, covariate_index=covariate_index + perm_est, X, y, verbose=verbose, covariate_index=covariate_index ) + # get the number of jobs + n_jobs = est.n_jobs + metric_star, metric_star_pi = _compute_null_distribution_coleman( y, orig_forest_proba, @@ -1299,6 +1314,7 @@ def build_coleman_forest( metric, n_repeats=n_repeats, seed=seed, + n_jobs=n_jobs, **metric_kwargs, ) @@ -1320,10 +1336,133 @@ def build_coleman_forest( else: pvalue = (1 + (null_dist >= observe_test_stat).sum()) / (1 + n_repeats) + forest_result = ForestTestResult(observe_test_stat, permute_stat, observe_stat, pvalue) if return_posteriors: - return observe_test_stat, pvalue, orig_forest_proba, perm_forest_proba + return forest_result, orig_forest_proba, perm_forest_proba else: - return observe_test_stat, pvalue + return forest_result + + +def build_permutation_forest( + est, + perm_est, + X, + y, + covariate_index=None, + metric="s@98", + n_repeats=500, + verbose=False, + seed=None, + return_posteriors=True, + **metric_kwargs, +): + """Build a hypothesis testing forest using a permutation-forest approach. + + The permutation-forest approach stems from standard permutaiton-testing, where + each forest is trained on a new permutation of the dataset. The original test + statistic is computed on the original data. Then the pvalue is computed + by comparing the original test statistic to the null distribution of the + test statistic computed from the permuted forests. + + Parameters + ---------- + est : Forest + The type of forest to use. Must be enabled with ``bootstrap=True``. + perm_est : Forest + The forest to use for the permuted dataset. Should be + ``PermutationHonestForestClassifier``. + X : ArrayLike of shape (n_samples, n_features) + Data. + y : ArrayLike of shape (n_samples, n_outputs) + Binary target, so ``n_outputs`` should be at most 1. + covariate_index : ArrayLike, optional of shape (n_covariates,) + The index array of covariates to shuffle, by default None. + metric : str, optional + The metric to compute, by default "s@98", for sensitivity at + 98% specificity. + n_repeats : int, optional + Number of times to bootstrap sample the two forests to construct + the null distribution, by default 10000. The construction of the + null forests will be parallelized according to the ``n_jobs`` + argument of the ``est`` forest. + verbose : bool, optional + Verbosity, by default False. + seed : int, optional + Random seed, by default None. + return_posteriors : bool, optional + Whether or not to return the posteriors, by default False. + **metric_kwargs : dict, optional + Additional keyword arguments to pass to the metric function. + + Returns + ------- + observe_stat : float + The test statistic. To compute the test statistic, take + ``permute_stat_`` and subtract ``observe_stat_``. + pvalue : float + The p-value of the test statistic. + orig_forest_proba : ArrayLike of shape (n_estimators, n_samples, n_outputs) + The predicted posterior probabilities for each estimator on their + out of bag samples. + perm_forest_proba : ArrayLike of shape (n_estimators, n_samples, n_outputs) + The predicted posterior probabilities for each of the permuted estimators + on their out of bag samples. + + References + ---------- + .. footbibliography:: + """ + rng = np.random.default_rng(seed) + metric_func: Callable[[ArrayLike, ArrayLike], float] = METRIC_FUNCTIONS[metric] + + if covariate_index is None: + covariate_index = np.arange(X.shape[1], dtype=int) + + if not isinstance(perm_est, PermutationHonestForestClassifier): + raise RuntimeError( + f"Permutation forest must be a PermutationHonestForestClassifier, got {type(perm_est)}" + ) + + # train the original forest on unpermuted data + est, orig_forest_proba = build_hyppo_oob_forest(est, X, y, verbose=verbose) + y_pred_proba_orig = np.nanmean(orig_forest_proba, axis=0) + observe_test_stat = metric_func(y, y_pred_proba_orig, **metric_kwargs) + + # get the number of jobs + index_arr = np.arange(X.shape[0], dtype=int).reshape(-1, 1) + + # train many null forests + X_perm = X.copy() + null_dist = [] + for _ in range(n_repeats): + rng.shuffle(index_arr) + perm_X_cov = X_perm[index_arr, covariate_index] + X_perm[:, covariate_index] = perm_X_cov + + # + perm_est = clone(perm_est) + perm_est.set_params(random_state=rng.integers(0, np.iinfo(np.int32).max)) + + perm_est, perm_forest_proba = build_hyppo_oob_forest( + perm_est, X_perm, y, verbose=verbose, covariate_index=covariate_index + ) + + y_pred_proba_perm = np.nanmean(perm_forest_proba, axis=0) + permute_stat = metric_func(y, y_pred_proba_perm, **metric_kwargs) + null_dist.append(permute_stat) + + # compute pvalue, which note is opposite that of the Coleman approach, since + # we are testing if the null distribution results in a test statistic greater + if metric in POSITIVE_METRICS: + pvalue = (1 + (null_dist >= observe_test_stat).sum()) / (1 + n_repeats) + else: + pvalue = (1 + (null_dist <= observe_test_stat).sum()) / (1 + n_repeats) + + forest_result = ForestTestResult(observe_test_stat, permute_stat, None, pvalue) + if return_posteriors: + return forest_result, orig_forest_proba, perm_forest_proba + else: + return forest_result def build_hyppo_oob_forest(est, X, y, verbose=False, **est_kwargs): @@ -1474,36 +1613,3 @@ def build_hyppo_cv_forest( est_list.append(est) return est_list, all_proba_list - - -def predict_oob_proba(est, X, verbose=False): - """Predict out of bag posterior probabilities.""" - from sklearn.utils.validation import check_is_fitted - - check_is_fitted(est) - - # now evaluate - X = est._validate_X_predict(X) - - # if we trained a binning tree, then we should re-bin the data - # XXX: this is inefficient and should be improved to be in line with what - # the Histogram Gradient Boosting Tree does, where the binning thresholds - # are passed into the tree itself, thus allowing us to set the node feature - # value thresholds within the tree itself. - if est.max_bins is not None: - X = est._bin_data(X, is_training_data=False).astype(DTYPE) - - # Assign chunk of trees to jobs - n_jobs, _, _ = _partition_estimators(est.n_estimators, est.n_jobs) - - # avoid storing the output of every estimator by summing them here - lock = threading.Lock() - # accumulate the predictions across all trees - all_proba = np.full( - (len(est.estimators_), X.shape[0], est.n_classes_), np.nan, dtype=np.float64 - ) - Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")( - delayed(_parallel_predict_proba_oob)(e.predict_proba, X, all_proba, idx, test_idx, lock) - for idx, (e, test_idx) in enumerate(zip(est.estimators_, est.oob_samples_)) - ) - return all_proba diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index 001ca9669..ca9d7a1de 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -13,7 +13,13 @@ from sktree import HonestForestClassifier, RandomForestClassifier, RandomForestRegressor from sktree._lib.sklearn.tree import DecisionTreeClassifier -from sktree.stats import FeatureImportanceForestClassifier, FeatureImportanceForestRegressor +from sktree.stats import ( + FeatureImportanceForestClassifier, + FeatureImportanceForestRegressor, + PermutationHonestForestClassifier, + build_coleman_forest, + build_permutation_forest, +) from sktree.stats.utils import _non_nan_samples from sktree.tree import MultiViewDecisionTreeClassifier, ObliqueDecisionTreeClassifier @@ -772,3 +778,86 @@ def test_null_with_partial_auc(): null_dist_pvalue = wilcoxon(first_null_dist, second_null_dist).pvalue assert null_dist_pvalue < 0.05, null_dist_pvalue assert pvalue > 0.05, f"pvalue: {pvalue}" + + +def test_build_coleman_forest(): + """Simple test for building a Coleman forest. + + Test the function under alternative and null hypothesis for a very simple dataset. + """ + n_estimators = 10 + n_samples = 20 + n_features = 3 + rng = np.random.default_rng(seed) + + _X = rng.uniform(size=(n_samples, n_features)) + _X = rng.uniform(size=(n_samples // 2, n_features)) + X2 = _X + 3 + X = np.vstack([_X, X2]) + y = np.vstack( + [np.zeros((n_samples // 2, 1)), np.ones((n_samples // 2, 1))] + ) # Binary classification + + clf = HonestForestClassifier( + n_estimators=n_estimators, random_state=seed, n_jobs=-1, honest_fraction=0.5, bootstrap=True + ) + perm_clf = PermutationHonestForestClassifier( + n_estimators=n_estimators, random_state=seed, n_jobs=-1, honest_fraction=0.5, bootstrap=True + ) + with pytest.raises( + RuntimeError, match="Permutation forest must be a PermutationHonestForestClassifier" + ): + build_coleman_forest(clf, clf, X, y) + + forest_result, orig_forest_proba, perm_forest_proba = build_coleman_forest( + clf, perm_clf, X, y, metric="s@98" + ) + assert forest_result.pvalue <= 0.05, f"{forest_result.pvalue}" + assert forest_result.observe_stat > 0.1, f"{forest_result.observe_stat}" + assert_array_equal(orig_forest_proba.shape, perm_forest_proba.shape) + + X = np.vstack([_X, _X]) + forest_result, _, _ = build_coleman_forest(clf, perm_clf, X, y, metric="s@98") + assert forest_result.pvalue > 0.05, f"{forest_result.pvalue}" + assert forest_result.observe_stat < 0.05, f"{forest_result.observe_stat}" + + +def test_build_permutation_forest(): + """Simple test for building a permutation forest.""" + n_estimators = 30 + n_samples = 100 + n_features = 3 + rng = np.random.default_rng(seed) + + _X = rng.uniform(size=(n_samples, n_features)) + _X = rng.uniform(size=(n_samples // 2, n_features)) + X2 = _X + 10 + X = np.vstack([_X, X2]) + y = np.vstack( + [np.zeros((n_samples // 2, 1)), np.ones((n_samples // 2, 1))] + ) # Binary classification + + clf = HonestForestClassifier( + n_estimators=n_estimators, random_state=seed, n_jobs=-1, honest_fraction=0.5, bootstrap=True + ) + perm_clf = PermutationHonestForestClassifier( + n_estimators=n_estimators, random_state=seed, n_jobs=-1, honest_fraction=0.5, bootstrap=True + ) + with pytest.raises( + RuntimeError, match="Permutation forest must be a PermutationHonestForestClassifier" + ): + build_permutation_forest(clf, clf, X, y, seed=seed) + + forest_result, orig_forest_proba, perm_forest_proba = build_permutation_forest( + clf, perm_clf, X, y, metric="s@98", n_repeats=20, seed=seed + ) + assert forest_result.observe_test_stat > 0.1, f"{forest_result.observe_stat}" + assert forest_result.pvalue <= 0.05, f"{forest_result.pvalue}" + assert_array_equal(orig_forest_proba.shape, perm_forest_proba.shape) + + X = np.vstack([_X, _X]) + forest_result, _, _ = build_permutation_forest( + clf, perm_clf, X, y, metric="s@98", n_repeats=10, seed=seed + ) + assert forest_result.pvalue > 0.05, f"{forest_result.pvalue}" + assert forest_result.observe_test_stat < 0.05, f"{forest_result.observe_test_stat}" diff --git a/sktree/stats/utils.py b/sktree/stats/utils.py index ffda338a2..3dd4d9da8 100644 --- a/sktree/stats/utils.py +++ b/sktree/stats/utils.py @@ -1,6 +1,7 @@ from typing import Optional, Tuple import numpy as np +from joblib import Parallel, delayed from numpy.typing import ArrayLike from scipy.stats import entropy from sklearn.ensemble._forest import _generate_unsampled_indices, _get_n_samples_bootstrap @@ -204,6 +205,7 @@ def _compute_null_distribution_coleman( metric: str = "mse", n_repeats: int = 1000, seed: Optional[int] = None, + n_jobs: Optional[int] = None, **metric_kwargs, ) -> Tuple[ArrayLike, ArrayLike]: """Compute null distribution using Coleman method. @@ -238,9 +240,6 @@ def _compute_null_distribution_coleman( metric_star_pi : ArrayLike of shape (n_samples,) An array of the metrics computed on the other half of the trees. """ - rng = np.random.default_rng(seed) - metric_func = METRIC_FUNCTIONS[metric] - # sample two sets of equal number of trees from the combined forest these are the posteriors # (n_estimators * 2, n_samples, n_outputs) all_y_pred = np.concatenate((y_pred_proba_normal, y_pred_proba_perm), axis=0) @@ -258,51 +257,82 @@ def _compute_null_distribution_coleman( metric_star = np.zeros((n_repeats,)) metric_star_pi = np.zeros((n_repeats,)) - for idx in range(n_repeats): - # two sets of random indices from 1 : 2N are sampled using Fisher-Yates - rng.shuffle(y_pred_ind_arr) - - # get random half of the posteriors from two sets of trees - first_forest_inds = y_pred_ind_arr[: n_estimators // 2] - second_forest_inds = y_pred_ind_arr[n_estimators // 2 :] - - # get random half of the posteriors as one forest - first_forest_pred = all_y_pred[first_forest_inds, ...] - second_forest_pred = all_y_pred[second_forest_inds, ...] - - # determine if there are any nans in the final posterior array, when - # averaged over the trees - first_forest_samples = _non_nan_samples(first_forest_pred) - second_forest_samples = _non_nan_samples(second_forest_pred) - - # todo: is this step necessary? - # non_nan_samples = np.intersect1d( - # first_forest_samples, second_forest_samples, assume_unique=True - # ) - # now average the posteriors over the trees for the non-nan samples - # y_pred_first_half = np.nanmean(first_forest_pred[:, non_nan_samples, :], axis=0) - # y_pred_second_half = np.nanmean(second_forest_pred[:, non_nan_samples, :], axis=0) - # # compute two instances of the metric from the sampled trees - # first_half_metric = metric_func(y_test[non_nan_samples, :], y_pred_first_half) - # second_half_metric = metric_func(y_test[non_nan_samples, :], y_pred_second_half) - - y_pred_first_half = np.nanmean(first_forest_pred[:, first_forest_samples, :], axis=0) - y_pred_second_half = np.nanmean(second_forest_pred[:, second_forest_samples, :], axis=0) - # compute two instances of the metric from the sampled trees - first_half_metric = metric_func( - y_test[first_forest_samples, :], y_pred_first_half, **metric_kwargs - ) - second_half_metric = metric_func( - y_test[second_forest_samples, :], y_pred_second_half, **metric_kwargs + # generate the random seeds for the parallel jobs + ss = np.random.SeedSequence(seed) + out = Parallel(n_jobs=n_jobs)( + delayed(_parallel_build_null_forests)( + y_pred_ind_arr, + n_estimators, + all_y_pred, + y_test, + seed, + metric, + **metric_kwargs, ) + for i, seed in zip(range(n_repeats), ss.spawn(n_repeats)) + ) + for idx, (first_half_metric, second_half_metric) in enumerate(out): metric_star[idx] = first_half_metric metric_star_pi[idx] = second_half_metric return metric_star, metric_star_pi +def _parallel_build_null_forests( + index_arr: ArrayLike, + n_estimators: int, + all_y_pred: ArrayLike, + y_test: ArrayLike, + seed: int, + metric: str, + **metric_kwargs: dict, +): + """Randomly sample two sets of forests and compute the metric on each.""" + rng = np.random.default_rng(seed) + metric_func = METRIC_FUNCTIONS[metric] + + # two sets of random indices from 1 : 2N are sampled using Fisher-Yates + rng.shuffle(index_arr) + + # get random half of the posteriors from two sets of trees + first_forest_inds = index_arr[: n_estimators // 2] + second_forest_inds = index_arr[n_estimators // 2 :] + + # get random half of the posteriors as one forest + first_forest_pred = all_y_pred[first_forest_inds, ...] + second_forest_pred = all_y_pred[second_forest_inds, ...] + + # determine if there are any nans in the final posterior array, when + # averaged over the trees + first_forest_samples = _non_nan_samples(first_forest_pred) + second_forest_samples = _non_nan_samples(second_forest_pred) + + # todo: is this step necessary? + # non_nan_samples = np.intersect1d( + # first_forest_samples, second_forest_samples, assume_unique=True + # ) + # now average the posteriors over the trees for the non-nan samples + # y_pred_first_half = np.nanmean(first_forest_pred[:, non_nan_samples, :], axis=0) + # y_pred_second_half = np.nanmean(second_forest_pred[:, non_nan_samples, :], axis=0) + # # compute two instances of the metric from the sampled trees + # first_half_metric = metric_func(y_test[non_nan_samples, :], y_pred_first_half) + # second_half_metric = metric_func(y_test[non_nan_samples, :], y_pred_second_half) + + y_pred_first_half = np.nanmean(first_forest_pred[:, first_forest_samples, :], axis=0) + y_pred_second_half = np.nanmean(second_forest_pred[:, second_forest_samples, :], axis=0) + + # compute two instances of the metric from the sampled trees + first_half_metric = metric_func( + y_test[first_forest_samples, :], y_pred_first_half, **metric_kwargs + ) + second_half_metric = metric_func( + y_test[second_forest_samples, :], y_pred_second_half, **metric_kwargs + ) + return first_half_metric, second_half_metric + + def get_per_tree_oob_samples(est: BaseForest): """The sample indices that are out-of-bag. diff --git a/sktree/tree/_honest_tree.py b/sktree/tree/_honest_tree.py index 6e9260e57..2a434daea 100644 --- a/sktree/tree/_honest_tree.py +++ b/sktree/tree/_honest_tree.py @@ -4,6 +4,7 @@ import numpy as np from sklearn.base import ClassifierMixin, MetaEstimatorMixin, _fit_context, clone from sklearn.model_selection import StratifiedShuffleSplit +from sklearn.utils._param_validation import Interval, RealNotInt, StrOptions from sklearn.utils.multiclass import _check_partial_fit_first_call, check_classification_targets from sklearn.utils.validation import check_is_fitted, check_X_y @@ -269,6 +270,13 @@ class frequency in the voting subsample. 0.8 , 0.8 , 0.93333333, 1. , 1. ]) """ + _parameter_constraints: dict = { + **BaseDecisionTree._parameter_constraints, + "honest_fraction": [Interval(RealNotInt, 0.0, 1.0, closed="neither")], + "honest_prior": [StrOptions({"empirical", "uniform", "ignore"})], + "stratify": ["boolean"], + } + def __init__( self, criterion="gini",