Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Getting pvalue computation ready #222

Merged
merged 8 commits into from
Feb 21, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ tree models.
PermutationForestClassifier
PermutationForestRegressor
build_coleman_forest
build_permutation_forest
build_hyppo_oob_forest
build_hyppo_cv_forest
PermutationHonestForestClassifier
Expand Down
2 changes: 2 additions & 0 deletions doc/whats_new/v0.7.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ Changelog
by `Adam Li`_ (:pr:`#211`)
- |Feature| Introduce a new set of simulations based on Marron and Wand 1992.
by `Sambit Panda`_ (:pr:`#203`)
- |Feature| :func:`sktree.stats.build_coleman_forest` and :func:`sktree.stats.build_permutation_forest`
are added to compute p-values given an estimator and permutation-estimator, `Adam Li`_ (:pr:`#222`)

Code and Documentation Contributors
-----------------------------------
Expand Down
2 changes: 2 additions & 0 deletions sktree/stats/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
build_coleman_forest,
build_hyppo_cv_forest,
build_hyppo_oob_forest,
build_permutation_forest,
)
from .monte_carlo import PermutationTest
from .permutationforest import PermutationForestClassifier, PermutationForestRegressor
Expand All @@ -18,5 +19,6 @@
"build_hyppo_cv_forest",
"build_hyppo_oob_forest",
"build_coleman_forest",
"build_permutation_forest",
"PermutationHonestForestClassifier",
]
182 changes: 169 additions & 13 deletions sktree/stats/forestht.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from ..experimental import conditional_resample
from ..tree import DecisionTreeClassifier, DecisionTreeRegressor
from ..tree._classes import DTYPE
from .permuteforest import PermutationHonestForestClassifier
from .utils import (
METRIC_FUNCTIONS,
POSITIVE_METRICS,
Expand Down Expand Up @@ -1219,13 +1220,20 @@ def build_coleman_forest(
y,
covariate_index=None,
metric="s@98",
n_repeats=1000,
n_repeats=10_000,
verbose=False,
seed=None,
return_posteriors=True,
**metric_kwargs,
):
"""Build a hypothesis testing forest using oob samples.
"""Build a hypothesis testing forest using a two-forest approach.

The two-forest approach stems from the Coleman et al. 2022 paper, where
two forests are trained: one on the original dataset, and one on the
permuted dataset. The dataset is either permuted once, or independently for
each tree in the permuted forest. The original test statistic is computed by
comparing the metric on both forests ``(metric_forest - metric_perm_forest)``.
For full details, see :footcite:`coleman2022scalable`.

Parameters
----------
Expand All @@ -1244,7 +1252,9 @@ def build_coleman_forest(
98% specificity.
n_repeats : int, optional
Number of times to bootstrap sample the two forests to construct
the null distribution, by default 1000.
the null distribution, by default 10000. The construction of the
null forests will be parallelized according to the ``n_jobs``
argument of the ``est`` forest.
verbose : bool, optional
Verbosity, by default False.
seed : int, optional
Expand All @@ -1267,38 +1277,183 @@ def build_coleman_forest(
perm_forest_proba : ArrayLike of shape (n_estimators, n_samples, n_outputs)
The predicted posterior probabilities for each of the permuted estimators
on their out of bag samples.

References
----------
.. footbibliography::
"""
rng = np.random.default_rng(seed)
metric_func: Callable[[ArrayLike, ArrayLike], float] = METRIC_FUNCTIONS[metric]

if covariate_index is None:
covariate_index = np.arange(X.shape[1], dtype=int)

if not isinstance(perm_est, PermutationHonestForestClassifier):
raise RuntimeError(
f"Permutation forest must be a PermutationHonestForestClassifier, got {type(perm_est)}"
)
# perform permutation of covariates
# TODO: refactor permutations into the HonestForest(?)
n_samples_train = X.shape[0]
index_arr = rng.choice(
np.arange(n_samples_train, dtype=int),
size=(n_samples_train, 1),
replace=False,
shuffle=True,
# n_samples_train = X.shape[0]
# index_arr = rng.choice(
# np.arange(n_samples_train, dtype=int),
# size=(n_samples_train, 1),
# replace=False,
# shuffle=True,
# )
# X_permute = X.copy()
# X_permute[:, covariate_index] = X_permute[index_arr, covariate_index]

# build two sets of forests
est, orig_forest_proba = build_hyppo_oob_forest(est, X, y, verbose=verbose)
perm_est, perm_forest_proba = build_hyppo_oob_forest(
perm_est, X, y, verbose=verbose, covariate_index=covariate_index
)
X_permute = X.copy()
X_permute[:, covariate_index] = X_permute[index_arr, covariate_index]

# get the number of jobs
n_jobs = est.n_jobs

metric_star, metric_star_pi = _compute_null_distribution_coleman(
y,
orig_forest_proba,
perm_forest_proba,
metric,
n_repeats=n_repeats,
seed=seed,
n_jobs=n_jobs,
**metric_kwargs,
)

y_pred_proba_orig = np.nanmean(orig_forest_proba, axis=0)
y_pred_proba_perm = np.nanmean(perm_forest_proba, axis=0)
observe_stat = metric_func(y, y_pred_proba_orig, **metric_kwargs)
permute_stat = metric_func(y, y_pred_proba_perm, **metric_kwargs)

# metric^\pi - metric = observed test statistic, which under the
# null is normally distributed around 0
observe_test_stat = permute_stat - observe_stat

# metric^\pi_j - metric_j, which is centered at 0
null_dist = metric_star_pi - metric_star

# compute pvalue
if metric in POSITIVE_METRICS:
pvalue = (1 + (null_dist <= observe_test_stat).sum()) / (1 + n_repeats)
else:
pvalue = (1 + (null_dist >= observe_test_stat).sum()) / (1 + n_repeats)

if return_posteriors:
return observe_test_stat, pvalue, orig_forest_proba, perm_forest_proba
else:
return observe_test_stat, pvalue


def build_permutation_forest(
est,
perm_est,
X,
y,
covariate_index=None,
metric="s@98",
n_repeats=500,
verbose=False,
seed=None,
return_posteriors=True,
**metric_kwargs,
):
"""Build a hypothesis testing forest using a permutation-forest approach.

The permutation-forest approach stems from standard permutation-testing, where
each forest is trained on a new permutation of the dataset. The original test
statistic is computed on the original data. Then the pvalue is computed
by comparing the original test statistic to the null distribution of the
test statistic computed from the permuted forests.

Parameters
----------
est : Forest
The type of forest to use. Must be enabled with ``bootstrap=True``.
perm_est : Forest
The forest to use for the permuted dataset. Should be
``PermutationHonestForestClassifier``.
X : ArrayLike of shape (n_samples, n_features)
Data.
y : ArrayLike of shape (n_samples, n_outputs)
Binary target, so ``n_outputs`` should be at most 1.
covariate_index : ArrayLike, optional of shape (n_covariates,)
The index array of covariates to shuffle, by default None.
metric : str, optional
The metric to compute, by default "s@98", for sensitivity at
98% specificity.
n_repeats : int, optional
Number of times to bootstrap sample the two forests to construct
the null distribution, by default 500. The construction of the
null forests will be parallelized according to the ``n_jobs``
argument of the ``est`` forest.
verbose : bool, optional
Verbosity, by default False.
seed : int, optional
Random seed, by default None.
return_posteriors : bool, optional
Whether or not to return the posteriors, by default True.
**metric_kwargs : dict, optional
Additional keyword arguments to pass to the metric function.

Returns
-------
observe_stat : float
The test statistic. To compute the test statistic, take
``permute_stat_`` and subtract ``observe_stat_``.
pvalue : float
The p-value of the test statistic.
orig_forest_proba : ArrayLike of shape (n_estimators, n_samples, n_outputs)
The predicted posterior probabilities for each estimator on their
out of bag samples.
perm_forest_proba : ArrayLike of shape (n_estimators, n_samples, n_outputs)
The predicted posterior probabilities for each of the permuted estimators
on their out of bag samples.

References
----------
.. footbibliography::
"""
metric_func: Callable[[ArrayLike, ArrayLike], float] = METRIC_FUNCTIONS[metric]

if covariate_index is None:
covariate_index = np.arange(X.shape[1], dtype=int)

if not isinstance(perm_est, PermutationHonestForestClassifier):
raise RuntimeError(
f"Permutation forest must be a PermutationHonestForestClassifier, got {type(perm_est)}"
)
# perform permutation of covariates
# TODO: refactor permutations into the HonestForest(?)
# n_samples_train = X.shape[0]
# index_arr = rng.choice(
# np.arange(n_samples_train, dtype=int),
# size=(n_samples_train, 1),
# replace=False,
# shuffle=True,
# )
# X_permute = X.copy()
# X_permute[:, covariate_index] = X_permute[index_arr, covariate_index]

# build two sets of forests
est, orig_forest_proba = build_hyppo_oob_forest(est, X, y, verbose=verbose)
perm_est, perm_forest_proba = build_hyppo_oob_forest(
perm_est, X_permute, y, verbose=verbose, covariate_index=covariate_index
perm_est, X, y, verbose=verbose, covariate_index=covariate_index
)

# get the number of jobs
n_jobs = est.n_jobs

metric_star, metric_star_pi = _compute_null_distribution_coleman(
y,
orig_forest_proba,
perm_forest_proba,
metric,
n_repeats=n_repeats,
seed=seed,
n_jobs=n_jobs,
**metric_kwargs,
)

Expand Down Expand Up @@ -1476,6 +1631,7 @@ def build_hyppo_cv_forest(
return est_list, all_proba_list


# XXX: can delete?
def predict_oob_proba(est, X, verbose=False):
"""Predict out of bag posterior probabilities."""
from sklearn.utils.validation import check_is_fitted
Expand Down
Loading
Loading