neurodata · adam2392 · Jan 26, 2024 · Jan 16, 2024 · Jan 17, 2024 · Jan 17, 2024
diff --git a/.github/workflows/pr_checks.yml b/.github/workflows/pr_checks.yml
@@ -38,14 +38,14 @@ jobs:
             exit 0
           fi
           all_changelogs=$(cat ./doc/whats_new/v*.rst)
-          if [[ "$all_changelogs" =~ :pr:\`$PR_NUMBER\` ]]
+          if [[ "$all_changelogs" =~ :pr:\`#$PR_NUMBER\` ]]
           then
             echo "Changelog has been updated."
             # If the pull request is milestoned check the correspondent changelog
             if exist -f ./doc/whats_new/v${TAGGED_MILESTONE:0:4}.rst
             then
               expected_changelog=$(cat ./doc/whats_new/v${TAGGED_MILESTONE:0:4}.rst)
-              if [[ "$expected_changelog" =~ :pr:\`$PR_NUMBER\` ]]
+              if [[ "$expected_changelog" =~ :pr:\`#$PR_NUMBER\` ]]
               then
                 echo "Changelog and milestone correspond."
               else
@@ -56,7 +56,7 @@ jobs:
               fi
             fi
           else
-            echo "A Changelog entry is missing."
+            echo "A Changelog entry is missing for :pr:\`#$PR_NUMBER\`"
             echo ""
             echo "Please add an entry to the changelog at 'doc/whats_new/v*.rst'"
             echo "to document your change assuming that the PR will be merged"

diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml
@@ -29,10 +29,10 @@ jobs:
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
-      - name: Setup Python 3.10
+      - name: Setup Python 3.11
         uses: actions/setup-python@v5
         with:
-          python-version: "3.10"
+          python-version: "3.11"
           architecture: "x64"
 
       - name: Install packages for Ubuntu

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -5,6 +5,12 @@ repos:
       - id: black
         args: [--quiet]
 
+  - repo: https://github.com/MarcoGorelli/cython-lint
+    rev: v0.16.0
+    hooks:
+      - id: cython-lint
+      - id: double-quote-cython-strings
+
   # Ruff sktree
   - repo: https://github.com/astral-sh/ruff-pre-commit
     rev: v0.1.6
@@ -65,7 +71,7 @@ repos:
       - id: rstcheck
         additional_dependencies:
           - tomli
-        files: ^doc/.*\.(rst|inc)$
+        files: ^(?!doc/use\.rst$).*\.(rst|inc)$
 
 ci:
   autofix_prs: false
diff --git a/doc/conf.py b/doc/conf.py
@@ -246,6 +246,7 @@
     "TreeBuilder",
     "joint_rank",
     "n_dim",
+    "n_samples_bootstrap",
 }
 
 # validation

diff --git a/doc/use.rst b/doc/use.rst
@@ -8,4 +8,4 @@ to learn everything you need!
 
 .. rstcheck: ignore-next-code-block
 .. include:: auto_examples/index.rst
-   :start-after: :orphan:
+   :start-after: :orphan:
diff --git a/doc/whats_new/v0.6.rst b/doc/whats_new/v0.6.rst
@@ -29,6 +29,9 @@ Changelog
   has a generative model based on Trunk and banded covariance, :func:`sktree.datasets.approximate_clf_mutual_information` and
   :func:`sktree.datasets.approximate_clf_mutual_information_with_monte_carlo` to
   approximate mutual information either numerically or via Monte-Carlo, by `Adam Li`_ and `Haoyin Xu`_ (:pr:`#199`).
+- |Enhancement| :class:`sktree.HonestForestClassifier` now has a fitted
+  property ``oob_samples_``, which reproduces the out-of-bag sample indices
+  for each tree, by `Adam Li`_ (:pr:`#200`).
 
 
 Code and Documentation Contributors

diff --git a/examples/calibration/plot_overlapping_gaussians.py b/examples/calibration/plot_overlapping_gaussians.py
@@ -65,7 +65,7 @@
     (
         "IRF",
         CalibratedClassifierCV(
-            base_estimator=RandomForestClassifier(
+            estimator=RandomForestClassifier(
                 n_estimators=n_estimators // clf_cv,
                 max_features=max_features,
                 n_jobs=n_jobs,
@@ -77,7 +77,7 @@
     (
         "SigRF",
         CalibratedClassifierCV(
-            base_estimator=RandomForestClassifier(
+            estimator=RandomForestClassifier(
                 n_estimators=n_estimators // clf_cv,
                 max_features=max_features,
                 n_jobs=n_jobs,

diff --git a/pyproject.toml b/pyproject.toml
@@ -293,7 +293,6 @@ ignore_roles = [
 ]
 report_level = "WARNING"
 ignore = ["SEVERE/4"]
-paths = ["doc/use.rst"]
 
 [tool.ruff]
 extend-exclude = [

diff --git a/sktree/_lib/sklearn_fork b/sktree/_lib/sklearn_fork
diff --git a/sktree/datasets/hyppo.py b/sktree/datasets/hyppo.py
@@ -50,6 +50,7 @@ def make_quadratic_classification(n_samples: int, n_features: int, noise=False,
 def make_trunk_classification(
     n_samples,
     n_dim=10,
+    n_informative=10,
     m_factor: int = -1,
     rho: int = 0,
     band_type: str = "ma",
@@ -76,6 +77,9 @@ def make_trunk_classification(
     n_dim : int, optional
         The dimensionality of the dataset and the number of
         unique labels, by default 10.
+    n_informative : int, optional
+        The number of informative dimensions. The remaining
+        ``n_dim - n_informative`` dimensions are uniform noise, by default 10.
     m_factor : int, optional
         The multiplicative factor to apply to the mean-vector of the first
         distribution to obtain the mean-vector of the second distribution.
@@ -108,25 +112,30 @@ def make_trunk_classification(
     ----------
     .. footbibliography::
     """
+    if n_dim < n_informative:
+        raise ValueError(
+            f"Number of informative dimensions {n_informative} must be less than number "
+            f"of dimensions, {n_dim}"
+        )
     rng = np.random.default_rng(seed=seed)
 
-    mu_1 = np.array([1 / np.sqrt(i) for i in range(1, n_dim + 1)])
+    mu_1 = np.array([1 / np.sqrt(i) for i in range(1, n_informative + 1)])
     mu_0 = m_factor * mu_1
 
     if rho != 0:
         if band_type == "ma":
-            cov = _moving_avg_cov(n_dim, rho)
+            cov = _moving_avg_cov(n_informative, rho)
         elif band_type == "ar":
-            cov = _autoregressive_cov(n_dim, rho)
+            cov = _autoregressive_cov(n_informative, rho)
         else:
             raise ValueError(f'Band type {band_type} must be one of "ma", or "ar".')
     else:
-        cov = np.identity(n_dim)
+        cov = np.identity(n_informative)
 
     if mix < 0 or mix > 1:
         raise ValueError("Mix must be between 0 and 1.")
 
-    if n_dim > 1000:
+    if n_informative > 1000:
         method = "cholesky"
     else:
         method = "svd"
@@ -139,13 +148,29 @@ def make_trunk_classification(
             )
         )
     else:
+        mixture_idx = rng.choice(
+            [0, 1], n_samples // 2, replace=True, shuffle=True, p=[mix, 1 - mix]
+        )
+        X_mixture = np.zeros((n_samples // 2, len(mu_1)))
+        for idx in range(n_samples // 2):
+            if mixture_idx[idx] == 1:
+                X_sample = rng.multivariate_normal(mu_1, cov, 1, method=method)
+            else:
+                X_sample = rng.multivariate_normal(mu_0, cov, 1, method=method)
+            X_mixture[idx, :] = X_sample
+
         X = np.vstack(
             (
-                rng.multivariate_normal(np.zeros(n_dim), cov, n_samples // 2, method=method),
-                (1 - mix) * rng.multivariate_normal(mu_1, cov, n_samples // 2, method=method)
-                + mix * rng.multivariate_normal(mu_0, cov, n_samples // 2, method=method),
+                rng.multivariate_normal(
+                    np.zeros(n_informative), cov, n_samples // 2, method=method
+                ),
+                X_mixture,
             )
         )
+
+    if n_dim > n_informative:
+        X = np.hstack((X, rng.uniform(low=0, high=1, size=(n_samples, n_dim - n_informative))))
+
     y = np.concatenate((np.zeros(n_samples // 2), np.ones(n_samples // 2)))
 
     if return_params:
@@ -208,19 +233,19 @@ def approximate_clf_mutual_information(
     # this implicitly assumes that the signal of interest is between -10 and 10
     scale = 10
     n_dims = [cov.shape[1] for cov in covs]
-    lims = [[-scale, scale]] * n_dims
+    lims = [[-scale, scale]] * max(n_dims)
 
     # Compute entropy and X and Y.
     def func(*args):
         x = np.array(args)
         p = 0
         for k in range(len(means)):
-            p += class_probs[k] * multivariate_normal(seed=seed).pdf(x, means[k], covs[k])
+            p += class_probs[k] * multivariate_normal.pdf(x, means[k], covs[k])
         return -p * np.log(p) / np.log(base)
 
     # numerically integrate H(X)
-    opts = dict(limit=1000)
-    H_X, int_err = nquad(func, lims, opts=opts)
+    # opts = dict(limit=1000)
+    H_X, int_err = nquad(func, lims)
 
     # Compute MI.
     H_XY = 0

diff --git a/sktree/datasets/tests/test_hyppo.py b/sktree/datasets/tests/test_hyppo.py
@@ -1,5 +1,12 @@
 import pytest
-from sktree.datasets import make_quadratic_classification, make_trunk_classification
+import numpy as np
+from numpy.testing import assert_array_equal
+from sktree.datasets import (
+    make_quadratic_classification,
+    make_trunk_classification,
+    approximate_clf_mutual_information,
+    approximate_clf_mutual_information_with_monte_carlo,
+)
 
 
 def test_make_quadratic_classification_v():
@@ -21,12 +28,37 @@ def test_make_trunk_classification_default():
 def test_make_trunk_classification_custom_parameters():
     # Test with custom parameters
     X, y = make_trunk_classification(
-        n_samples=50, n_dim=5, m_factor=2, rho=0.5, band_type="ma", return_params=False
+        n_samples=50,
+        n_dim=5,
+        n_informative=2,
+        m_factor=2,
+        rho=0.5,
+        band_type="ma",
+        return_params=False,
     )
     assert X.shape == (50, 5)
     assert y.shape == (50,)
 
 
+def test_make_trunk_classification_autoregressive_cov():
+    # Test that the autoregressive ("ar") covariance has the expected structure
+    n_dim = 10
+    rho = 0.5
+    _, _, _, cov_list = make_trunk_classification(
+        n_samples=100, n_dim=n_dim, rho=rho, band_type="ar", return_params=True
+    )
+    assert_array_equal(cov_list[0], cov_list[1])
+    assert cov_list[0].shape == (n_dim, n_dim)
+    assert_array_equal(cov_list[0][0, :], [rho**idx for idx in range(n_dim)])
+
+
+def test_make_trunk_classification_mixture():
+    # Test with mix=0.5 and otherwise default parameters
+    X, y, _, _ = make_trunk_classification(n_samples=100, mix=0.5, return_params=True)
+    assert X.shape == (100, 10)
+    assert y.shape == (100,)
+
+
 def test_make_trunk_classification_return_params():
     # Test with return_params=True and uneven number of samples
     X, y, means, covs = make_trunk_classification(n_samples=75, n_dim=10, return_params=True)
@@ -46,3 +78,27 @@ def test_make_trunk_classification_invalid_mix():
     # Test with an invalid band type
     with pytest.raises(ValueError, match="Mix must be between 0 and 1."):
         make_trunk_classification(n_samples=50, rho=0.5, mix=2)
+
+
+def test_make_trunk_classification_invalid_n_informative():
+    # Test with n_informative larger than n_dim
+    with pytest.raises(ValueError, match="Number of informative dimensions"):
+        make_trunk_classification(n_samples=50, n_dim=10, n_informative=11, rho=0.5, mix=2)
+
+
+def test_approximate_clf_mutual_information_numerically_close():
+    mean1 = np.array([1, 1])
+    cov1 = np.array([[1, 0.5], [0.5, 1]])
+
+    mean2 = np.array([-1, -1])
+    cov2 = np.array([[1, -0.5], [-0.5, 1]])
+
+    means = [mean1, mean2]
+    covs = [cov1, cov2]
+
+    result_approximate = approximate_clf_mutual_information(means, covs)
+    result_monte_carlo = approximate_clf_mutual_information_with_monte_carlo(means, covs)
+
+    assert np.isclose(
+        result_approximate[0], result_monte_carlo[0], atol=5e-2
+    ), f"{result_approximate[0]}, {result_monte_carlo[0]}"
diff --git a/sktree/ensemble/_eiforest.py b/sktree/ensemble/_eiforest.py
@@ -88,15 +88,6 @@ class ExtendedIsolationForest(IsolationForest):
         fitted sub-estimators.
 
         .. versionadded:: 1.2
-           `base_estimator_` was renamed to `estimator_`.
-
-    base_estimator_ : ExtraTreeRegressor instance
-        The child estimator template used to create the collection of
-        fitted sub-estimators.
-
-        .. deprecated:: 1.2
-            `base_estimator_` is deprecated and will be removed in 1.4.
-            Use `estimator_` instead.
 
     estimators_ : list of ExtraTreeRegressor instances
         The collection of fitted sub-estimators.
+4 −4		.pre-commit-config.yaml
+8 −0		Makefile
+2 −2		SECURITY.md
+1 −0		azure-pipelines.yml
+16 −6		build_tools/azure/install.sh
+8 −2		build_tools/azure/pylatest_conda_forge_mkl_linux-64_conda.lock
+2 −0		build_tools/azure/pylatest_conda_forge_mkl_linux-64_environment.yml
+45 −0		build_tools/build-meson-editable-install.py
+4 −4		build_tools/cirrus/pymin_conda_forge_linux-aarch64_conda.lock
+2 −0		build_tools/update_environments_and_lock_files.py
+12 −0		build_tools/wheels/build_wheels.sh
+82 −0		doc/developers/advanced_installation.rst
+14 −7		doc/modules/cross_decomposition.rst
+6 −0		doc/whats_new/v1.4.rst
+33 −9		doc/whats_new/v1.5.rst
+53 −0		meson.build
+7 −0		sklearn/__check_build/meson.build
+8 −0		sklearn/__init__.py
+57 −0		sklearn/_build_utils/tempita.py
+14 −0		sklearn/_build_utils/version.py
+7 −0		sklearn/_config.py
+19 −0		sklearn/_loss/meson.build
+16 −0		sklearn/cluster/_hdbscan/meson.build
+16 −0		sklearn/cluster/_k_means_common.pyx
+28 −0		sklearn/cluster/meson.build
+18 −0		sklearn/cluster/tests/test_k_means.py
+37 −0		sklearn/covariance/_shrunk_covariance.py
+9 −0		sklearn/datasets/_california_housing.py
+18 −0		sklearn/datasets/_samples_generator.py
+7 −0		sklearn/datasets/_svmlight_format_io.py
+8 −0		sklearn/datasets/meson.build
+17 −0		sklearn/decomposition/meson.build
+3 −2		sklearn/ensemble/_forest.py
+4 −2		sklearn/ensemble/_gb.py
+2 −2		sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx
+16 −5		sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
+21 −0		sklearn/ensemble/_hist_gradient_boosting/meson.build
+11 −0		sklearn/ensemble/meson.build
+12 −5		sklearn/ensemble/tests/test_voting.py
+9 −0		sklearn/feature_extraction/meson.build
+1 −1		sklearn/inspection/_partial_dependence.py
+2 −2		sklearn/inspection/tests/test_partial_dependence.py
+6 −5		sklearn/linear_model/_ridge.py
+33 −0		sklearn/linear_model/meson.build
+13 −3		sklearn/linear_model/tests/test_sgd.py
+11 −0		sklearn/manifold/_locally_linear.py
+10 −0		sklearn/manifold/_t_sne.py
+17 −0		sklearn/manifold/meson.build
+166 −0		sklearn/meson.build
+172 −0		sklearn/metrics/_pairwise_distances_reduction/meson.build
+21 −0		sklearn/metrics/_ranking.py
+8 −0		sklearn/metrics/cluster/_bicluster.py
+8 −0		sklearn/metrics/cluster/_unsupervised.py
+8 −0		sklearn/metrics/cluster/meson.build
+44 −0		sklearn/metrics/meson.build
+11 −4		sklearn/model_selection/tests/test_search.py
+53 −0		sklearn/neighbors/meson.build
+13 −4		sklearn/neighbors/tests/test_lof.py
+17 −0		sklearn/preprocessing/meson.build
+1 −1		sklearn/semi_supervised/tests/test_self_training.py
+57 −0		sklearn/svm/meson.build
+7 −0		sklearn/tests/test_common.py
+21 −5		sklearn/tests/test_multioutput.py
+17 −8		sklearn/tests/test_pipeline.py
+4 −3		sklearn/tree/_classes.py
+3 −3		sklearn/tree/_splitter.pxd
+19 −15		sklearn/tree/_splitter.pyx
+6 −6		sklearn/tree/_tree.pxd
+11 −11		sklearn/tree/_tree.pyx
+26 −0		sklearn/tree/meson.build
+16 −0		sklearn/utils/__init__.py
+13 −6		sklearn/utils/_available_if.py
+7 −0		sklearn/utils/_estimator_html_repr.py
+22 −18		sklearn/utils/_testing.py
+19 −0		sklearn/utils/arrayfuncs.pyx
+7 −0		sklearn/utils/estimator_checks.py
+31 −2		sklearn/utils/extmath.py
+73 −0		sklearn/utils/meson.build
+3 −3		sklearn/utils/src/MurmurHash3.cpp
+29 −0		sklearn/utils/tests/test_testing.py
+70 −16		sklearn/utils/validation.py