Merge pull request #28 from jinlow/feature/pdplots-fix
Making partial dependence plot creation faster, and easier to use from the Python API.
jinlow authored May 9, 2023
2 parents 9fc288f + dda3410 commit cc33c5f
Showing 7 changed files with 93 additions and 24 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "forust-ml"
-version = "0.2.5"
+version = "0.2.6"
edition = "2021"
authors = ["James Inlow <[email protected]>"]
homepage = "https://github.com/jinlow/forust"
8 changes: 5 additions & 3 deletions README.md
@@ -26,7 +26,7 @@ pip install forust

To use in a rust project add the following to your Cargo.toml file.
```toml
-forust-ml = "0.2.5"
+forust-ml = "0.2.6"
```

## Usage
@@ -136,7 +136,9 @@ The `partial_dependence` method takes the following parameters...

- `X` ***(FrameLike)***: Either a pandas DataFrame, or a 2 dimensional numpy array. This should be the same data passed into the model's fit or predict methods, with the columns in the same order.
- `feature` ***(Union[str, int])***: The feature for which to calculate the partial dependence values. This can be the name of a column, if the provided X is a pandas DataFrame, or the index of the feature.

+- `samples` ***(int | None, optional)***: Number of evenly spaced samples to select. If `None` is passed, all unique values will be used. Defaults to 100.
+- `exclude_missing` ***(bool, optional)***: Should missing values be excluded from the feature? Defaults to `True`.
+- `percentile_bounds` ***(tuple[float, float], optional)***: Lower and upper percentiles to use when calculating the sample points. Defaults to (0.2, 0.98), which caps the samples at the 20th and 98th percentiles respectively.
This method returns a 2 dimensional numpy array, where the first column contains the feature values at which the partial dependence was calculated, and the second column contains the corresponding partial dependence values.
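
For instance, the following sketch (assuming a fitted `GradientBooster` named `model` and its training DataFrame `X` with an `"age"` column, as in the example below) shows how the new parameters combine:

```python
# Default: 100 evenly spaced quantile samples between the 20th and
# 98th percentiles of the feature, with missing values excluded.
pd_default = model.partial_dependence(X=X, feature="age")

# A coarser grid over a wider percentile range, keeping the missing value.
pd_coarse = model.partial_dependence(
    X=X,
    feature="age",
    samples=25,
    percentile_bounds=(0.05, 0.95),
    exclude_missing=False,
)

# Every unique non-missing value of the feature, matching the old behavior.
pd_exact = model.partial_dependence(X=X, feature="age", samples=None)
```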

This information can be plotted to visualize how a feature is used in the model, like so.
@@ -145,7 +147,7 @@
from seaborn import lineplot
import matplotlib.pyplot as plt

-pd_values = model.partial_dependence(X=X, feature="age")
+pd_values = model.partial_dependence(X=X, feature="age", samples=None)

fig = lineplot(x=pd_values[:,0], y=pd_values[:,1],)
plt.title("Partial Dependence Plot")
4 changes: 2 additions & 2 deletions py-forust/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "py-forust"
-version = "0.2.5"
+version = "0.2.6"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -10,6 +10,6 @@ crate-type = ["cdylib"]

[dependencies]
pyo3 = { version = "0.17", features = ["extension-module"] }
-forust-ml = { version="0.2.5", path="../" }
+forust-ml = { version="0.2.6", path="../" }
numpy = "0.17.2"
ndarray = "0.15.1"
81 changes: 69 additions & 12 deletions py-forust/forust/__init__.py
@@ -1,6 +1,8 @@
from __future__ import annotations

import sys
+import warnings
+from ast import literal_eval
from typing import Any, Union, cast

import numpy as np
@@ -80,6 +82,13 @@ def get_metadata(self, key: str) -> str:


class GradientBooster:
+# Metadata attributes that are present on all instances of this class.
+# When a saved booster is loaded, an attempt is made to read each of these
+# keys from the stored metadata and set them as attributes on the booster.
+meta_data_attributes = ["feature_names_in_"]

def __init__(
self,
objective_type: str = "LogLoss",
@@ -244,7 +253,7 @@ def fit(
features_, rows, cols, flat_data = self._convert_input_frame(X)
if len(features_) > 0:
self.feature_names_in_ = features_
-self.insert_metadata("feature_names_in_", str(self.feature_names_in_))
+self.insert_metadata("feature_names_in_", self.feature_names_in_)

y_ = y.to_numpy() if isinstance(y, pd.Series) else y

@@ -330,7 +339,14 @@ def predict_contributions(
)
return np.reshape(contributions, (rows, cols + 1))

-def partial_dependence(self, X: FrameLike, feature: Union[str, int]) -> np.ndarray:
+def partial_dependence(
+    self,
+    X: FrameLike,
+    feature: Union[str, int],
+    samples: int | None = 100,
+    exclude_missing: bool = True,
+    percentile_bounds: tuple[float, float] = (0.2, 0.98),
+) -> np.ndarray:
"""Calculate the partial dependence values of a feature. For each unique
value of the feature, this gives the estimate of the predicted value for that
feature, with the effects of all features averaged out. This information gives
@@ -343,6 +359,12 @@ def partial_dependence(self, X: FrameLike, feature: Union[str, int]) -> np.ndarray:
feature (Union[str, int]): The feature for which to calculate the partial
dependence values. This can be the name of a column, if the provided
X is a pandas DataFrame, or the index of the feature.
+    samples (int | None, optional): Number of evenly spaced samples to select. If None
+        is passed, all unique values will be used. Defaults to 100.
+    exclude_missing (bool, optional): Should missing values be excluded from the feature? Defaults to True.
+    percentile_bounds (tuple[float, float], optional): Lower and upper percentiles to use
+        when calculating the sample points. Defaults to (0.2, 0.98), which caps the samples
+        at the 20th and 98th percentiles respectively.
Raises:
ValueError: An error will be raised if the provided X parameter is not a
@@ -359,20 +381,43 @@ def partial_dependence(self, X: FrameLike, feature: Union[str, int]) -> np.ndarray:
raise ValueError(
"If `feature` is a string, then the object passed as `X` must be a pandas DataFrame."
)
-values = np.sort(X.loc[:, feature].unique())
-feature_idx = next(i for i, v in enumerate(X.columns) if v == feature)
+values = X.loc[:, feature].to_numpy()
+if hasattr(self, "feature_names_in_"):
+    [feature_idx] = [
+        i for i, v in enumerate(self.feature_names_in_) if v == feature
+    ]
+else:
+    w_msg = (
+        "No feature names were provided at fit, but `feature` was a string; attempting to "
+        + "determine the feature index from the DataFrame columns. "
+        + "Ensure the columns are in the same order as the data passed to fit."
+    )
+    warnings.warn(w_msg)
+    [feature_idx] = [i for i, v in enumerate(X.columns) if v == feature]
elif isinstance(feature, int):
-feature_idx = feature
if is_dataframe:
-values = np.sort(X.iloc[:, feature].unique())
+values = X.iloc[:, feature].unique()
else:
-values = np.sort(np.unique(X[:, feature]))
+feature_idx = feature
+values = X[:, feature]
else:
raise ValueError(
f"The parameter `feature` must be a string, or an int, however an object of type {type(feature)} was passed."
)
+min_p, max_p = percentile_bounds
+# Exclude missing values from the sample calculation.
+values = values[~(np.isnan(values) | (values == self.missing))]
+if samples is None:
+    search_values = np.sort(np.unique(values))
+else:
+    search_values = np.quantile(values, np.linspace(min_p, max_p, num=samples))

+# Add the missing value back, if requested.
+if not exclude_missing:
+    search_values = np.append([self.missing], search_values)

res = []
-for v in values:
+for v in search_values:
res.append(
(v, self.booster.value_partial_dependence(feature=feature_idx, value=v))
)
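
The heart of the speedup is visible in the hunk above: rather than evaluating the partial dependence at every unique value, the method now builds a small grid of quantile samples. A standalone sketch of just that sampling step, using plain numpy and toy stand-in values (no forust required; `values`, `samples`, `min_p`, and `max_p` mirror the names used above):

```python
import numpy as np

# Toy stand-ins: a feature column containing one missing value.
values = np.array([3.0, 1.0, np.nan, 7.0, 5.0, 1.0, 9.0])
missing = np.nan
samples = 3
min_p, max_p = (0.2, 0.98)

# Drop missing values before computing quantiles, as the method does.
values = values[~(np.isnan(values) | (values == missing))]

# Evenly spaced quantiles between the percentile bounds.
search_values = np.quantile(values, np.linspace(min_p, max_p, num=samples))
print(search_values)  # three grid points between the 20th and 98th percentiles
```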
@@ -410,6 +455,12 @@ def load_booster(cls, path: str) -> GradientBooster:
params = booster.get_params()
c = cls(**params)
c.booster = booster
+for m in c.meta_data_attributes:
+    try:
+        m_ = c.get_metadata(m)
+        setattr(c, m, m_)
+    except KeyError:
+        pass
return c

def save_booster(self, path: str):
@@ -430,16 +481,20 @@ def _standardize_monotonicity_map(
feature_map = {f: i for i, f in enumerate(X.columns)}
return {feature_map[f]: v for f, v in self.monotone_constraints.items()}

-def insert_metadata(self, key: str, value: str):
+def insert_metadata(self, key: str, value: Any):
"""Insert data into the model's metadata; this will be saved on the booster object.
Args:
key (str): Key to give the inserted value in the metadata.
value (Any): Value to assign to the key.
"""
-self.booster.insert_metadata(key=key, value=value)
+if isinstance(value, str):
+    value_ = f"'{value}'"
+else:
+    value_ = str(value)
+self.booster.insert_metadata(key=key, value=value_)

-def get_metadata(self, key: str) -> str:
+def get_metadata(self, key: Any) -> Any:
"""Get the value associated with a given key on the booster's metadata.
Args:
Expand All @@ -448,4 +503,6 @@ def get_metadata(self, key: str) -> str:
Returns:
Any: Value associated with the provided key in the booster's metadata.
"""
-return self.booster.get_metadata(key=key)
+# Values are stored as string representations (see insert_metadata above),
+# so use literal_eval to turn them back into Python objects.
+v = self.booster.get_metadata(key=key)
+return literal_eval(v)
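
Together, `insert_metadata` and `get_metadata` round-trip simple Python values through the booster's string-to-string metadata store: strings are quoted and everything else is stored via `str()`, then `ast.literal_eval` rebuilds the object on the way out. A minimal sketch of the same scheme, with a plain dict standing in for the booster's store (note the simple quoting assumes the string itself contains no quote characters):

```python
from ast import literal_eval

store: dict[str, str] = {}  # stand-in for the booster's metadata store

def insert_metadata(key: str, value) -> None:
    # Quote strings so literal_eval recovers them as str, not as a name.
    store[key] = f"'{value}'" if isinstance(value, str) else str(value)

def get_metadata(key: str):
    return literal_eval(store[key])

insert_metadata("feature_names_in_", ["age", "fare"])
assert get_metadata("feature_names_in_") == ["age", "fare"]
```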
4 changes: 4 additions & 0 deletions py-forust/tests/test_booster.py
@@ -293,6 +293,8 @@ def test_booster_saving(X_y, tmp_path):
fmod_preds = fmod.predict(X)
fmod.save_booster(f64_model_path)
fmod_loaded = GradientBooster.load_booster(f64_model_path)
+assert fmod_loaded.feature_names_in_ == fmod.feature_names_in_
+assert fmod_loaded.feature_names_in_ == X.columns.to_list()
assert all(fmod_preds == fmod_loaded.predict(X))


@@ -318,6 +320,8 @@ def test_booster_saving_with_montone_constraints(X_y, tmp_path):
fmod_preds = fmod.predict(X)
fmod.save_booster(f64_model_path)
fmod_loaded = GradientBooster.load_booster(f64_model_path)
+assert fmod_loaded.feature_names_in_ == fmod.feature_names_in_
+assert fmod_loaded.feature_names_in_ == X.columns.to_list()
assert all(fmod_preds == fmod_loaded.predict(X))

# LogLoss
2 changes: 1 addition & 1 deletion rs-example.md
@@ -3,7 +3,7 @@
To run this example, add the following code to your `Cargo.toml` file.
```toml
[dependencies]
-forust-ml = "0.2.5"
+forust-ml = "0.2.6"
polars = "0.24"
reqwest = { version = "0.11", features = ["blocking"] }
```
16 changes: 11 additions & 5 deletions src/gradientbooster.rs
@@ -425,11 +425,17 @@ impl GradientBooster {
/// * `feature` - The index of the feature.
/// * `value` - The value for which to calculate the partial dependence.
pub fn value_partial_dependence(&self, feature: usize, value: f64) -> f64 {
-let pd: f64 = self
-    .trees
-    .iter()
-    .map(|t| t.value_partial_dependence(feature, value, &self.missing))
-    .sum();
+let pd: f64 = if self.parallel {
+    self.trees
+        .par_iter()
+        .map(|t| t.value_partial_dependence(feature, value, &self.missing))
+        .sum()
+} else {
+    self.trees
+        .iter()
+        .map(|t| t.value_partial_dependence(feature, value, &self.missing))
+        .sum()
+};
pd + self.base_score
}

