Merge pull request #28 from jinlow/feature/pdplots-fix
Making partial dependence plot creation faster, and easier to use from the Python API.
jinlow authored May 9, 2023
2 parents 9fc288f + dda3410 commit cc33c5f
Showing 7 changed files with 93 additions and 24 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "forust-ml"
-version = "0.2.5"
+version = "0.2.6"
edition = "2021"
authors = ["James Inlow <[email protected]>"]
homepage = "https://github.com/jinlow/forust"
8 changes: 5 additions & 3 deletions README.md
@@ -26,7 +26,7 @@ pip install forust

To use in a rust project add the following to your Cargo.toml file.
```toml
-forust-ml = "0.2.5"
+forust-ml = "0.2.6"
```

## Usage
@@ -136,7 +136,9 @@ The `partial_dependence` method takes the following parameters...

- `X` ***(FrameLike)***: Either a pandas DataFrame, or a 2 dimensional numpy array. This should be the same data passed into the model's fit or predict methods, with the columns in the same order.
- `feature` ***(Union[str, int])***: The feature for which to calculate the partial dependence values. This can be the name of a column, if the provided X is a pandas DataFrame, or the index of the feature.

+- `samples` ***(int | None, optional)***: Number of evenly spaced samples to select. If `None` is passed, all unique values will be used. Defaults to 100.
+- `exclude_missing` ***(bool, optional)***: Should missing values be excluded from the feature? Defaults to `True`.
+- `percentile_bounds` ***(tuple[float, float], optional)***: Lower and upper percentiles to use when calculating the sample points. Defaults to (0.2, 0.98), which caps the samples at the 20th and 98th percentiles respectively.
This method returns a 2 dimensional numpy array, where the first column contains the feature values at which the partial dependence was calculated, and the second column contains the corresponding partial dependence values.
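
For instance, the following sketch (assuming a fitted `GradientBooster` named `model` and its training DataFrame `X` with an `"age"` column, as in the example below) shows how the new parameters combine:

```python
# Default: 100 evenly spaced quantile samples between the 20th and
# 98th percentiles of the feature, with missing values excluded.
pd_default = model.partial_dependence(X=X, feature="age")

# A coarser grid over a wider percentile range, keeping the missing value.
pd_coarse = model.partial_dependence(
    X=X,
    feature="age",
    samples=25,
    percentile_bounds=(0.05, 0.95),
    exclude_missing=False,
)

# Every unique non-missing value of the feature, matching the old behavior.
pd_exact = model.partial_dependence(X=X, feature="age", samples=None)
```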

This information can be plotted to visualize how a feature is used in the model, like so.
@@ -145,7 +147,7 @@
from seaborn import lineplot
import matplotlib.pyplot as plt

-pd_values = model.partial_dependence(X=X, feature="age")
+pd_values = model.partial_dependence(X=X, feature="age", samples=None)

fig = lineplot(x=pd_values[:,0], y=pd_values[:,1],)
plt.title("Partial Dependence Plot")
4 changes: 2 additions & 2 deletions py-forust/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "py-forust"
-version = "0.2.5"
+version = "0.2.6"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -10,6 +10,6 @@ crate-type = ["cdylib"]

[dependencies]
pyo3 = { version = "0.17", features = ["extension-module"] }
-forust-ml = { version="0.2.5", path="../" }
+forust-ml = { version="0.2.6", path="../" }
numpy = "0.17.2"
ndarray = "0.15.1"
81 changes: 69 additions & 12 deletions py-forust/forust/__init__.py
@@ -1,6 +1,8 @@
from __future__ import annotations

import sys
+import warnings
+from ast import literal_eval
from typing import Any, Union, cast

import numpy as np
@@ -80,6 +82,13 @@ def get_metadata(self, key: str) -> str:


class GradientBooster:
+# Metadata attributes that are present on all instances of this class.
+# When a saved booster is loaded, an attempt is made to read each of these
+# keys from the stored metadata and set them as attributes on the booster.
+meta_data_attributes = ["feature_names_in_"]

def __init__(
self,
objective_type: str = "LogLoss",
@@ -244,7 +253,7 @@ def fit(
features_, rows, cols, flat_data = self._convert_input_frame(X)
if len(features_) > 0:
self.feature_names_in_ = features_
-self.insert_metadata("feature_names_in_", str(self.feature_names_in_))
+self.insert_metadata("feature_names_in_", self.feature_names_in_)

y_ = y.to_numpy() if isinstance(y, pd.Series) else y

@@ -330,7 +339,14 @@ def predict_contributions(
)
return np.reshape(contributions, (rows, cols + 1))

-def partial_dependence(self, X: FrameLike, feature: Union[str, int]) -> np.ndarray:
+def partial_dependence(
+    self,
+    X: FrameLike,
+    feature: Union[str, int],
+    samples: int | None = 100,
+    exclude_missing: bool = True,
+    percentile_bounds: tuple[float, float] = (0.2, 0.98),
+) -> np.ndarray:
"""Calculate the partial dependence values of a feature. For each unique
value of the feature, this gives the estimate of the predicted value for that
feature, with the effects of all features averaged out. This information gives
@@ -343,6 +359,12 @@ def partial_dependence(self, X: FrameLike, feature: Union[str, int]) -> np.ndarray:
feature (Union[str, int]): The feature for which to calculate the partial
dependence values. This can be the name of a column, if the provided
X is a pandas DataFrame, or the index of the feature.
+    samples (int | None, optional): Number of evenly spaced samples to select. If None
+        is passed, all unique values will be used. Defaults to 100.
+    exclude_missing (bool, optional): Should missing values be excluded from the feature? Defaults to True.
+    percentile_bounds (tuple[float, float], optional): Lower and upper percentiles to use
+        when calculating the sample points. Defaults to (0.2, 0.98), which caps the samples
+        at the 20th and 98th percentiles respectively.
Raises:
ValueError: An error will be raised if the provided X parameter is not a
@@ -359,20 +381,43 @@ def partial_dependence(self, X: FrameLike, feature: Union[str, int]) -> np.ndarray:
raise ValueError(
"If `feature` is a string, then the object passed as `X` must be a pandas DataFrame."
)
-values = np.sort(X.loc[:, feature].unique())
-feature_idx = next(i for i, v in enumerate(X.columns) if v == feature)
+values = X.loc[:, feature].to_numpy()
+if hasattr(self, "feature_names_in_"):
+    [feature_idx] = [
+        i for i, v in enumerate(self.feature_names_in_) if v == feature
+    ]
+else:
+    w_msg = (
+        "No feature names were provided at fit, but `feature` was a string; attempting to "
+        + "determine the feature index from the DataFrame columns. "
+        + "Ensure the columns are in the same order as the data passed to fit."
+    )
+    warnings.warn(w_msg)
+    [feature_idx] = [i for i, v in enumerate(X.columns) if v == feature]
elif isinstance(feature, int):
-feature_idx = feature
if is_dataframe:
-values = np.sort(X.iloc[:, feature].unique())
+values = X.iloc[:, feature].unique()
else:
-values = np.sort(np.unique(X[:, feature]))
+feature_idx = feature
+values = X[:, feature]
else:
raise ValueError(
f"The parameter `feature` must be a string, or an int, however an object of type {type(feature)} was passed."
)
+min_p, max_p = percentile_bounds
+# Exclude missing values from the sample calculation.
+values = values[~(np.isnan(values) | (values == self.missing))]
+if samples is None:
+    search_values = np.sort(np.unique(values))
+else:
+    search_values = np.quantile(values, np.linspace(min_p, max_p, num=samples))

+# Add the missing value back, if requested.
+if not exclude_missing:
+    search_values = np.append([self.missing], search_values)

res = []
-for v in values:
+for v in search_values:
res.append(
(v, self.booster.value_partial_dependence(feature=feature_idx, value=v))
)
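
The heart of the speedup is visible in the hunk above: rather than evaluating the partial dependence at every unique value, the method now builds a small grid of quantile samples. A standalone sketch of just that sampling step, using plain numpy and toy stand-in values (no forust required; `values`, `samples`, `min_p`, and `max_p` mirror the names used above):

```python
import numpy as np

# Toy stand-ins: a feature column containing one missing value.
values = np.array([3.0, 1.0, np.nan, 7.0, 5.0, 1.0, 9.0])
missing = np.nan
samples = 3
min_p, max_p = (0.2, 0.98)

# Drop missing values before computing quantiles, as the method does.
values = values[~(np.isnan(values) | (values == missing))]

# Evenly spaced quantiles between the percentile bounds.
search_values = np.quantile(values, np.linspace(min_p, max_p, num=samples))
print(search_values)  # three grid points between the 20th and 98th percentiles
```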
@@ -410,6 +455,12 @@ def load_booster(cls, path: str) -> GradientBooster:
params = booster.get_params()
c = cls(**params)
c.booster = booster
+for m in c.meta_data_attributes:
+    try:
+        m_ = c.get_metadata(m)
+        setattr(c, m, m_)
+    except KeyError:
+        pass
return c

def save_booster(self, path: str):
@@ -430,16 +481,20 @@ def _standardize_monotonicity_map(
feature_map = {f: i for i, f in enumerate(X.columns)}
return {feature_map[f]: v for f, v in self.monotone_constraints.items()}

-def insert_metadata(self, key: str, value: str):
+def insert_metadata(self, key: str, value: Any):
"""Insert data into the model's metadata; this will be saved on the booster object.
Args:
key (str): Key to give the inserted value in the metadata.
value (Any): Value to assign to the key.
"""
-self.booster.insert_metadata(key=key, value=value)
+if isinstance(value, str):
+    value_ = f"'{value}'"
+else:
+    value_ = str(value)
+self.booster.insert_metadata(key=key, value=value_)

-def get_metadata(self, key: str) -> str:
+def get_metadata(self, key: Any) -> Any:
"""Get the value associated with a given key on the booster's metadata.
Args:
Expand All @@ -448,4 +503,6 @@ def get_metadata(self, key: str) -> str:
Returns:
Any: Value associated with the provided key in the booster's metadata.
"""
-return self.booster.get_metadata(key=key)
+# Values are stored as string representations (see insert_metadata above),
+# so use literal_eval to turn them back into Python objects.
+v = self.booster.get_metadata(key=key)
+return literal_eval(v)
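
Together, `insert_metadata` and `get_metadata` round-trip simple Python values through the booster's string-to-string metadata store: strings are quoted and everything else is stored via `str()`, then `ast.literal_eval` rebuilds the object on the way out. A minimal sketch of the same scheme, with a plain dict standing in for the booster's store (note the simple quoting assumes the string itself contains no quote characters):

```python
from ast import literal_eval

store: dict[str, str] = {}  # stand-in for the booster's metadata store

def insert_metadata(key: str, value) -> None:
    # Quote strings so literal_eval recovers them as str, not as a name.
    store[key] = f"'{value}'" if isinstance(value, str) else str(value)

def get_metadata(key: str):
    return literal_eval(store[key])

insert_metadata("feature_names_in_", ["age", "fare"])
assert get_metadata("feature_names_in_") == ["age", "fare"]
```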
4 changes: 4 additions & 0 deletions py-forust/tests/test_booster.py
@@ -293,6 +293,8 @@ def test_booster_saving(X_y, tmp_path):
fmod_preds = fmod.predict(X)
fmod.save_booster(f64_model_path)
fmod_loaded = GradientBooster.load_booster(f64_model_path)
+assert fmod_loaded.feature_names_in_ == fmod.feature_names_in_
+assert fmod_loaded.feature_names_in_ == X.columns.to_list()
assert all(fmod_preds == fmod_loaded.predict(X))


@@ -318,6 +320,8 @@ def test_booster_saving_with_montone_constraints(X_y, tmp_path):
fmod_preds = fmod.predict(X)
fmod.save_booster(f64_model_path)
fmod_loaded = GradientBooster.load_booster(f64_model_path)
+assert fmod_loaded.feature_names_in_ == fmod.feature_names_in_
+assert fmod_loaded.feature_names_in_ == X.columns.to_list()
assert all(fmod_preds == fmod_loaded.predict(X))

# LogLoss
2 changes: 1 addition & 1 deletion rs-example.md
@@ -3,7 +3,7 @@
To run this example, add the following code to your `Cargo.toml` file.
```toml
[dependencies]
-forust-ml = "0.2.5"
+forust-ml = "0.2.6"
polars = "0.24"
reqwest = { version = "0.11", features = ["blocking"] }
```
16 changes: 11 additions & 5 deletions src/gradientbooster.rs
@@ -425,11 +425,17 @@ impl GradientBooster {
/// * `feature` - The index of the feature.
/// * `value` - The value for which to calculate the partial dependence.
pub fn value_partial_dependence(&self, feature: usize, value: f64) -> f64 {
-let pd: f64 = self
-    .trees
-    .iter()
-    .map(|t| t.value_partial_dependence(feature, value, &self.missing))
-    .sum();
+let pd: f64 = if self.parallel {
+    self.trees
+        .par_iter()
+        .map(|t| t.value_partial_dependence(feature, value, &self.missing))
+        .sum()
+} else {
+    self.trees
+        .iter()
+        .map(|t| t.value_partial_dependence(feature, value, &self.missing))
+        .sum()
+};
pd + self.base_score
}

