Skip to content

Commit

Permalink
Merge pull request #89 from jinlow/feature/shapley-values
Browse files Browse the repository at this point in the history
Finished initial shapley support
  • Loading branch information
jinlow authored Dec 5, 2023
2 parents bdc68b0 + 1de6e61 commit c33138c
Show file tree
Hide file tree
Showing 9 changed files with 411 additions and 5 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "forust-ml"
version = "0.4.2"
version = "0.4.3"
edition = "2021"
authors = ["James Inlow <[email protected]>"]
homepage = "https://github.com/jinlow/forust"
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ pip install forust

To use in a rust project add the following to your Cargo.toml file.
```toml
forust-ml = "0.4.2"
forust-ml = "0.4.3"
```

## Usage
Expand Down
4 changes: 2 additions & 2 deletions py-forust/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "py-forust"
version = "0.4.2"
version = "0.4.3"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
Expand All @@ -10,7 +10,7 @@ crate-type = ["cdylib"]

[dependencies]
pyo3 = { version = "0.20.0", features = ["extension-module"] }
forust-ml = { version = "0.4.2", path = "../" }
forust-ml = { version = "0.4.3", path = "../" }
numpy = "0.20.0"
ndarray = "0.15.1"
serde_plain = { version = "1.0" }
Expand Down
87 changes: 87 additions & 0 deletions py-forust/forust/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import dataclasses
import inspect
import json
import sys
Expand Down Expand Up @@ -61,6 +62,91 @@ class Node:
right_child: int
is_leaf: bool

@classmethod
def _from_xgboost_node(
    cls, xgb_node: dict[str, Any], feature_map: dict[Any, int]
) -> Node:
    """Build a node from a single entry of an XGBoost JSON tree dump.

    Args:
        xgb_node: One node dictionary from the dump. The "nodeid" and
            "cover" keys are required; every other key is optional and
            falls back to a zero value when absent.
        feature_map: Mapping from the feature name stored under "split"
            to the positional index of that feature. A name that is not
            in the map resolves to index 0.

    Returns:
        An instance of ``cls`` populated from ``xgb_node``. ``is_leaf`` is
        true exactly when the "leaf" key is present.

    Raises:
        KeyError: If "nodeid" or "cover" is missing from ``xgb_node``.
    """
    # Construct through ``cls`` (not a hard-coded class name) so the
    # alternate constructor honors the class it is invoked on.
    return cls(
        num=xgb_node["nodeid"],
        weight_value=xgb_node.get("leaf", 0.0),
        hessian_sum=xgb_node["cover"],
        depth=xgb_node.get("depth", 0),
        # Round the threshold to float32 precision before widening it back
        # to a Python float (presumably to mirror XGBoost's single
        # precision split thresholds -- TODO confirm).
        split_value=float(np.float32(xgb_node.get("split_condition", 0.0))),
        split_feature=feature_map.get(xgb_node.get("split", 0), 0),
        split_gain=xgb_node.get("gain", 0.0),
        missing_node=xgb_node.get("missing", 0),
        left_child=xgb_node.get("yes", 0),
        right_child=xgb_node.get("no", 0),
        is_leaf="leaf" in xgb_node,
    )


def _xgboost_tree_to_nodes(
    tree: dict[str, Any], feature_map: dict[Any, int]
) -> list[dict[str, Any]]:
    """Flatten a nested XGBoost JSON tree into a list of node dictionaries.

    The tree is walked breadth first starting at ``tree``; every visited
    node is converted with ``Node._from_xgboost_node`` and stored as a plain
    dictionary via ``dataclasses.asdict``.

    Args:
        tree: Root node of one tree from the XGBoost JSON dump. Non-leaf
            nodes must carry their sub-nodes under the "children" key.
        feature_map: Mapping from feature name to positional index, passed
            through to ``Node._from_xgboost_node``.

    Returns:
        The node dictionaries in breadth-first order, where the position of
        each node in the list equals its "num" value.

    Raises:
        ValueError: If any node's "num" differs from its list position.
    """
    # Function-scope import: a deque gives O(1) pops from the left, whereas
    # ``list.pop(0)`` shifts every remaining element on each pop.
    from collections import deque

    queue = deque([tree])
    node_list = []
    while queue:
        xgb_node = queue.popleft()
        node_list.append(
            dataclasses.asdict(
                Node._from_xgboost_node(xgb_node, feature_map=feature_map)
            )
        )
        if "leaf" not in xgb_node:
            queue.extend(xgb_node["children"])
    # Ensure the nodeids all align with the nodes index
    for idx, node in enumerate(node_list):
        if idx != node["num"]:
            raise ValueError(
                f"Nodes are unaligned for node {node['num']} at index {idx}"
            )
    return node_list


def _from_xgboost_model(model: Any) -> GradientBooster:
    """Convert a fitted XGBoost model into a forust ``GradientBooster``.

    Args:
        model: Either an ``xgboost.XGBModel`` (its underlying booster is
            extracted with ``get_booster``) or an object that is treated as
            an ``xgboost.Booster`` directly.

    Returns:
        A ``GradientBooster`` whose internal booster is rebuilt from the
        XGBoost trees and base score. ``feature_names_in_`` and
        ``n_features_`` are only set when the XGBoost booster has feature
        names.

    Raises:
        ValueError: If the node ids of any dumped tree do not line up with
            their breadth-first position (raised by
            ``_xgboost_tree_to_nodes``).
    """
    import xgboost

    if isinstance(model, xgboost.XGBModel):
        booster = model.get_booster()
    else:
        booster = cast(xgboost.Booster, model)
    # Get the model dump...
    model_dump = booster.get_dump(dump_format="json", with_stats=True)
    features = booster.feature_names
    if features is None:
        # Without explicit names XGBoost labels the features "f0", "f1", ...
        # in the dump, so map those default labels to their positions.
        # An empty map would silently resolve every split to feature 0.
        feature_map = {f"f{i}": i for i in range(booster.num_features())}
    else:
        feature_map = {v: i for i, v in enumerate(features)}

    # Get the nodes
    trees = [
        {
            "nodes": _xgboost_tree_to_nodes(
                tree=json.loads(tree), feature_map=feature_map
            )
        }
        for tree in model_dump
    ]

    # Read the config from the resolved ``booster`` (not ``model``), so that
    # a raw ``xgboost.Booster`` input works as well as an ``XGBModel``.
    learner_config = json.loads(booster.save_config())["learner"]
    base_score = float(learner_config["learner_model_param"]["base_score"])
    # For "binary:logistic" the stored base score is modified by XGBoost
    # prior to predictions, so it is converted to log-odds here before
    # handing it to the forust model.
    if learner_config["objective"]["name"] == "binary:logistic":
        base_score = float(np.log(base_score / (1 - base_score)))

    # Get initial dump
    model_json = json.loads(GradientBooster().json_dump())
    model_json["base_score"] = base_score
    model_json["trees"] = trees

    # Populate booster from json
    final_model = GradientBooster()
    final_model.booster = CrateGradientBooster.from_json(json.dumps(model_json))
    if features is not None:
        final_model.feature_names_in_ = features
        final_model.n_features_ = len(features)
    return final_model


class BoosterType(Protocol):
monotone_constraints: dict[int, int]
Expand Down Expand Up @@ -584,6 +670,7 @@ def predict_contributions(
method (str, optional): Method to calculate the contributions, available options are:
- "Average": If this option is specified, the average internal node values are calculated; this is equivalent to the `approx_contribs` parameter in XGBoost.
- "Shapley": Using this option will calculate contributions using the tree shap algorithm.
- "Weight": This method uses the internal leaf weights to calculate the contributions. This is the same approach as the one described by Saabas [here](https://blog.datadive.net/interpreting-random-forests/).
- "BranchDifference": This method will calculate contributions by subtracting the weight of the other non-missing branch from the weight of the node the record will travel down. With this method, the summed contributions are not equal to the final prediction of the model.
- "MidpointDifference": This method will calculate contributions by subtracting the mid-point between the right and left node, weighted by the cover of each node, from the weight of the node the record will travel down. With this method, the summed contributions are not equal to the final prediction of the model.
Expand Down
82 changes: 82 additions & 0 deletions py-forust/tests/test_booster.py
Original file line number Diff line number Diff line change
Expand Up @@ -838,6 +838,88 @@ def test_booster_to_xgboosts_with_contributions(X_y):
assert np.allclose(fmod_preds, xmod.predict(X, output_margin=True), atol=0.00001)


def test_booster_to_xgboosts_with_contributions_shapley(X_y):
    """Compare forust "Shapley" contributions with XGBoost's exact SHAP values.

    A forust model and an XGBoost model are trained with matching
    hyperparameters on rounded features. The test checks that:

    * the default contributions have one column per feature plus one extra
      column and sum to the forust predictions,
    * the "Shapley" contributions also sum to the predictions but differ
      from the default contributions,
    * the "Shapley" contributions and the predictions match XGBoost's
      ``pred_contribs`` output and raw margin within ``atol=0.00001``.
    """
    X, y = X_y
    X = X.round(0)
    fmod = GradientBooster(
        iterations=2,
        learning_rate=0.3,
        max_depth=5,
        l2=1,
        min_leaf_weight=1,
        gamma=1,
        objective_type="LogLoss",
        nbins=1_000,
        parallel=True,
        base_score=0.5,
        initialize_base_score=False,
    )
    fmod.fit(X, y=y)
    fmod_preds = fmod.predict(X)
    contribs_average = fmod.predict_contributions(X)
    # One contribution column per feature, plus one extra column.
    assert contribs_average.shape[1] == X.shape[1] + 1
    assert np.allclose(contribs_average.sum(1), fmod_preds)

    contribs_shapley = fmod.predict_contributions(X, method="Shapley")
    assert np.allclose(contribs_shapley.sum(1), fmod_preds)
    # The two methods must agree on the row sums but not element-wise.
    assert not np.allclose(contribs_shapley, contribs_average)

    xmod = XGBClassifier(
        n_estimators=2,
        learning_rate=0.3,
        max_depth=5,
        reg_lambda=1,
        min_child_weight=1,
        gamma=1,
        objective="binary:logitraw",
        eval_metric="auc",
        tree_method="hist",
        max_bin=20000,
        base_score=0.5,
    )
    xmod.fit(X, y)
    import xgboost as xgb

    # approx_contribs=False requests the exact (non-approximated) values.
    xmod_contribs_shapley = xmod.get_booster().predict(
        xgb.DMatrix(X), approx_contribs=False, pred_contribs=True
    )
    assert np.allclose(contribs_shapley, xmod_contribs_shapley, atol=0.00001)
    assert np.allclose(fmod_preds, xmod.predict(X, output_margin=True), atol=0.00001)


def test_booster_to_xgboosts_with_contributions_shapley_from_xgboost(X_y):
    """Check a model converted from XGBoost reproduces XGBoost's outputs.

    An XGBoost classifier is fitted on float32 features and converted with
    ``forust._from_xgboost_model``. The converted model's "Shapley"
    contributions and predictions must match XGBoost's ``pred_contribs``
    output and raw margin within ``atol=0.00001``.
    """
    import xgboost as xgb

    features, target = X_y
    features = features.astype(np.float32)

    xgb_params = {
        "n_estimators": 100,
        "learning_rate": 0.3,
        "max_depth": 10,
        "reg_lambda": 1,
        "min_child_weight": 1,
        "gamma": 1,
        "objective": "binary:logitraw",
        "eval_metric": "auc",
        "tree_method": "hist",
        "base_score": 0.5,
    }
    xgb_model = XGBClassifier(**xgb_params)
    xgb_model.fit(features, target)

    # Outputs of the forust model rebuilt from the fitted XGBoost model.
    converted = forust._from_xgboost_model(xgb_model)
    forust_shapley = converted.predict_contributions(features, method="Shapley")
    forust_margin = converted.predict(features)

    # Reference outputs taken straight from XGBoost.
    xgb_shapley = xgb_model.get_booster().predict(
        xgb.DMatrix(features), approx_contribs=False, pred_contribs=True
    )
    assert np.allclose(forust_shapley, xgb_shapley, atol=0.00001)

    xgb_margin = xgb_model.predict(features, output_margin=True)
    assert np.allclose(forust_margin, xgb_margin, atol=0.00001)


def test_missing_branch_with_contributions(X_y):
X, y = X_y
X = X
Expand Down
2 changes: 1 addition & 1 deletion rs-example.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
To run this example, add the following code to your `Cargo.toml` file.
```toml
[dependencies]
forust-ml = "0.4.2"
forust-ml = "0.4.3"
polars = "0.28"
reqwest = { version = "0.11", features = ["blocking"] }
```
Expand Down
4 changes: 4 additions & 0 deletions src/gradientbooster.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ use crate::objective::{
SquaredLoss,
};
use crate::sampler::{GossSampler, RandomSampler, SampleMethod, Sampler};
use crate::shapley::predict_contributions_row_shapley;
use crate::splitter::{MissingBranchSplitter, MissingImputerSplitter, Splitter};
use crate::tree::Tree;
use crate::utils::{fmt_vec_output, odds, validate_positive_float_field};
Expand Down Expand Up @@ -43,6 +44,8 @@ pub enum ContributionsMethod {
ModeDifference,
/// This method is only valid when the objective type is set to "LogLoss". This method will calculate contributions as the change in a record's probability of being 1 moving from a parent node to a child node. The sum of the returned contributions matrix will be equal to the probability a record will be 1. For example, given a model, `model.predict_contributions(X, method="ProbabilityChange") == 1 / (1 + np.exp(-model.predict(X)))`
ProbabilityChange,
/// This method computes the Shapley values for each record and feature.
Shapley,
}

/// Method to calculate variable importance.
Expand Down Expand Up @@ -713,6 +716,7 @@ impl GradientBooster {
Tree::predict_contributions_row_midpoint_difference
}
ContributionsMethod::ModeDifference => Tree::predict_contributions_row_mode_difference,
ContributionsMethod::Shapley => predict_contributions_row_shapley,
ContributionsMethod::Average | ContributionsMethod::ProbabilityChange => unreachable!(),
};
// Clean this up..
Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
mod histogram;
mod node;
mod partial_dependence;
mod shapley;

// Modules
pub mod binning;
Expand Down
Loading

0 comments on commit c33138c

Please sign in to comment.