From a4a595f3399fd99d71f4d1e4bfc7c785ed71f74f Mon Sep 17 00:00:00 2001 From: jinlow Date: Thu, 2 Nov 2023 16:31:39 -0500 Subject: [PATCH 1/6] Finished initial shapley support --- py-forust/tests/test_booster.py | 51 ++++++++ src/gradientbooster.rs | 4 + src/lib.rs | 1 + src/shaply.rs | 222 ++++++++++++++++++++++++++++++++ 4 files changed, 278 insertions(+) create mode 100644 src/shaply.rs diff --git a/py-forust/tests/test_booster.py b/py-forust/tests/test_booster.py index 48cf03b..2f506b8 100644 --- a/py-forust/tests/test_booster.py +++ b/py-forust/tests/test_booster.py @@ -838,6 +838,57 @@ def test_booster_to_xgboosts_with_contributions(X_y): assert np.allclose(fmod_preds, xmod.predict(X, output_margin=True), atol=0.00001) +def test_booster_to_xgboosts_with_contributions_shapley(X_y): + X, y = X_y + X = X.round(0) + fmod = GradientBooster( + iterations=2, + learning_rate=0.3, + max_depth=5, + l2=1, + min_leaf_weight=1, + gamma=1, + objective_type="LogLoss", + nbins=1_000, + parallel=True, + base_score=0.5, + initialize_base_score=False, + ) + fmod.fit(X, y=y) + fmod_preds = fmod.predict(X) + contribs_average = fmod.predict_contributions(X) + fmod_preds[~np.isclose(contribs_average.sum(1), fmod_preds, rtol=5)] + contribs_average.sum(1)[~np.isclose(contribs_average.sum(1), fmod_preds, rtol=5)] + assert contribs_average.shape[1] == X.shape[1] + 1 + assert np.allclose(contribs_average.sum(1), fmod_preds) + + contribs_shapley = fmod.predict_contributions(X, method="Shapley") + assert np.allclose(contribs_shapley.sum(1), fmod_preds) + assert not np.allclose(contribs_shapley, contribs_average) + + xmod = XGBClassifier( + n_estimators=2, + learning_rate=0.3, + max_depth=5, + reg_lambda=1, + min_child_weight=1, + gamma=1, + objective="binary:logitraw", + eval_metric="auc", + tree_method="hist", + max_bin=20000, + base_score=0.5, + ) + xmod.fit(X, y) + import xgboost as xgb + + xmod_contribs_shapley = xmod.get_booster().predict( + xgb.DMatrix(X), approx_contribs=False, pred_contribs=True + ) + assert np.allclose(contribs_shapley, xmod_contribs_shapley, atol=0.00001) + assert np.allclose(fmod_preds, xmod.predict(X, output_margin=True), atol=0.00001) + + def test_missing_branch_with_contributions(X_y): X, y = X_y X = X diff --git a/src/gradientbooster.rs b/src/gradientbooster.rs index 77031f7..4c239f2 100644 --- a/src/gradientbooster.rs +++ b/src/gradientbooster.rs @@ -8,6 +8,7 @@ use crate::objective::{ SquaredLoss, }; use crate::sampler::{GossSampler, RandomSampler, SampleMethod, Sampler}; +use crate::shaply::predict_contributions_row_shapley; use crate::splitter::{MissingBranchSplitter, MissingImputerSplitter, Splitter}; use crate::tree::Tree; use crate::utils::{fmt_vec_output, odds, validate_positive_float_field}; @@ -43,6 +44,8 @@ pub enum ContributionsMethod { ModeDifference, /// This method is only valid when the objective type is set to "LogLoss". This method will calculate contributions as the change in a records probability of being 1 moving from a parent node to a child node. The sum of the returned contributions matrix, will be equal to the probability a record will be 1. For example, given a model, `model.predict_contributions(X, method="ProbabilityChange") == 1 / (1 + np.exp(-model.predict(X)))` ProbabilityChange, + /// This method computes the Shapley values for each record, and feature. + Shapley, } /// Method to calculate variable importance. @@ -713,6 +716,7 @@ impl GradientBooster { Tree::predict_contributions_row_midpoint_difference } ContributionsMethod::ModeDifference => Tree::predict_contributions_row_mode_difference, + ContributionsMethod::Shapley => predict_contributions_row_shapley, ContributionsMethod::Average | ContributionsMethod::ProbabilityChange => unreachable!(), }; // Clean this up.. diff --git a/src/lib.rs b/src/lib.rs index d019690..fa9f23a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,7 @@ mod histogram; mod node; mod partial_dependence; +mod shaply; // Modules pub mod binning; diff --git a/src/shaply.rs b/src/shaply.rs new file mode 100644 index 0000000..f168344 --- /dev/null +++ b/src/shaply.rs @@ -0,0 +1,222 @@ +use crate::node::Node; +use crate::tree::Tree; + +#[derive(Debug, Clone, Copy)] +struct PathElement { + feature_index: usize, + zero_fraction: f32, + one_fraction: f32, + pweight: f32, +} + +impl Default for PathElement { + fn default() -> Self { + Self { + feature_index: 0, + zero_fraction: 0., + one_fraction: 0., + pweight: 0., + } + } +} + +#[derive(Debug, Clone, Default)] +struct PathList { + paths: Vec, +} + +impl PathList { + fn get_element(&mut self, i: usize) -> &PathElement { + if i == self.paths.len() { + self.paths.push(PathElement::default()); + &self.paths[i] + } else { + // This will panic for us, if we are out of bounds. + &self.paths[i] + } + } + fn get_element_mut(&mut self, i: usize) -> &mut PathElement { + if i == self.paths.len() { + self.paths.push(PathElement::default()); + &mut self.paths[i] + } else { + // This will panic for us, if we are out of bounds. + &mut self.paths[i] + } + } +} + +fn extend_path( + unique_path: &mut PathList, + unique_depth: usize, + zero_fraction: f32, + one_fraction: f32, + feature_index: usize, +) { + unique_path.get_element_mut(unique_depth).feature_index = feature_index; + unique_path.get_element_mut(unique_depth).zero_fraction = zero_fraction; + unique_path.get_element_mut(unique_depth).one_fraction = one_fraction; + unique_path.get_element_mut(unique_depth).pweight = if unique_depth == 0 { 1.0 } else { 0.0 }; + for i in (0..unique_depth).rev() { + unique_path.get_element_mut(i + 1).pweight += + (one_fraction * unique_path.get_element(i).pweight * (i + 1) as f32) + / (unique_depth + 1) as f32; + unique_path.get_element_mut(i).pweight = + (zero_fraction * unique_path.get_element(i).pweight * (unique_depth - i) as f32) + / (unique_depth + 1) as f32; + } +} + +fn unwind_path(unique_path: &mut PathList, unique_depth: usize, path_index: usize) { + let one_fraction = unique_path.get_element(path_index).one_fraction; + let zero_fraction = unique_path.get_element(path_index).zero_fraction; + let mut next_one_portion = unique_path.get_element(unique_depth).pweight; + for i in (0..unique_depth).rev() { + if one_fraction != 0. { + let tmp = unique_path.get_element(i).pweight; + unique_path.get_element_mut(i).pweight = + (next_one_portion * (unique_depth + 1) as f32) / ((i + 1) as f32 * one_fraction); + next_one_portion = tmp + - (unique_path.get_element(i).pweight * zero_fraction * (unique_depth - i) as f32) + / (unique_depth + 1) as f32; + } else { + unique_path.get_element_mut(i).pweight = (unique_path.get_element(i).pweight + * (unique_depth + 1) as f32) + / (zero_fraction * (unique_depth - i) as f32); + } + } + for i in path_index..unique_depth { + unique_path.get_element_mut(i).feature_index = unique_path.get_element(i + 1).feature_index; + unique_path.get_element_mut(i).zero_fraction = unique_path.get_element(i + 1).zero_fraction; + unique_path.get_element_mut(i).one_fraction = unique_path.get_element(i + 1).one_fraction; + } +} + +fn unwound_path_sum(unique_path: &mut PathList, unique_depth: usize, path_index: usize) -> f32 { + let one_fraction = unique_path.get_element(path_index).one_fraction; + let zero_fraction = unique_path.get_element(path_index).zero_fraction; + let mut next_one_portion = unique_path.get_element(unique_depth).pweight; + let mut total = 0.0; + for i in (0..unique_depth).rev() { + if one_fraction != 0.0 { + let tmp = + (next_one_portion * (unique_depth + 1) as f32) / ((i + 1) as f32 * one_fraction); + total += tmp; + next_one_portion = unique_path.get_element(i).pweight + - tmp * zero_fraction * ((unique_depth - i) as f32 / (unique_depth + 1) as f32); + } else if zero_fraction != 0.0 { + total += (unique_path.get_element(i).pweight / zero_fraction) + / ((unique_depth - i) as f32 / (unique_depth + 1) as f32); + } else if unique_path.get_element(i).pweight != 0.0 { + panic!("Unique path {} must have zero weight", i); + } + } + total +} + +fn get_hot_cold_children(next_node_idx: usize, node: &Node) -> Vec { + if node.has_missing_branch() { + // we know there will be 3 children if there is a missing branch. + if next_node_idx == node.right_child { + vec![node.right_child, node.left_child, node.missing_node] + } else if next_node_idx == node.left_child { + vec![node.left_child, node.right_child, node.missing_node] + } else { + vec![node.missing_node, node.left_child, node.right_child] + } + } else if next_node_idx == node.right_child { + vec![node.right_child, node.left_child] + } else { + vec![node.left_child, node.right_child] + } +} + +#[allow(clippy::too_many_arguments)] +fn tree_shap( + tree: &Tree, + row: &[f64], + contribs: &mut [f64], + node_index: usize, + mut unique_depth: usize, + mut unique_path: PathList, + parent_zero_fraction: f32, + parent_one_fraction: f32, + parent_feature_index: usize, + missing: &f64, +) { + let node = &tree.nodes[node_index]; + extend_path( + &mut unique_path, + unique_depth, + parent_zero_fraction, + parent_one_fraction, + parent_feature_index, + ); + if node.is_leaf { + for i in 1..(unique_depth + 1) { + let w = unwound_path_sum(&mut unique_path, unique_depth, i); + let el = unique_path.get_element(i); + contribs[el.feature_index] += + f64::from(w * (el.one_fraction - el.zero_fraction) * node.weight_value); + } + } else { + let next_node_idx = node.get_child_idx(&row[node.split_feature], missing); + let hot_cold_children = get_hot_cold_children(next_node_idx, node); + let mut incoming_zero_fraction = 1.0; + let mut incoming_one_fraction = 1.0; + + let mut path_index = 0; + while path_index <= unique_depth { + if unique_path.get_element(path_index).feature_index == node.split_feature { + break; + } + path_index += 1; + } + + if path_index != (unique_depth + 1) { + incoming_zero_fraction = unique_path.get_element(path_index).zero_fraction; + incoming_one_fraction = unique_path.get_element(path_index).one_fraction; + unwind_path(&mut unique_path, unique_depth, path_index); + unique_depth -= 1; + } + + for (i, n_idx) in hot_cold_children.into_iter().enumerate() { + let zero_fraction = + (tree.nodes[n_idx].hessian_sum / node.hessian_sum) * incoming_zero_fraction; + let onf = if i == 0 { incoming_one_fraction } else { 0. }; + tree_shap( + tree, + row, + contribs, + n_idx, + unique_depth + 1, + unique_path.clone(), + zero_fraction, + onf, + node.split_feature, + missing, + ) + } + } +} + +pub fn predict_contributions_row_shapley( + tree: &Tree, + row: &[f64], + contribs: &mut [f64], + missing: &f64, +) { + contribs[contribs.len() - 1] += tree.get_average_leaf_weights(0); + tree_shap( + tree, + row, + contribs, + 0, + 0, + PathList::default(), + 1., + 1., + row.len() + 100, + missing, + ) +} From 29b050ce6ef669d9c0e749f74e4b500310e0f057 Mon Sep 17 00:00:00 2001 From: jinlow Date: Mon, 20 Nov 2023 15:15:21 -0600 Subject: [PATCH 2/6] Added xgboost to forust conversion, also more tests to validate shapley --- py-forust/forust/__init__.py | 86 +++++++++++++++++++++++++++++++++ py-forust/tests/test_booster.py | 31 ++++++++++++ 2 files changed, 117 insertions(+) diff --git a/py-forust/forust/__init__.py b/py-forust/forust/__init__.py index 7d49124..0af27d0 100644 --- a/py-forust/forust/__init__.py +++ b/py-forust/forust/__init__.py @@ -1,5 +1,6 @@ from __future__ import annotations +import dataclasses import inspect import json import sys @@ -61,6 +62,91 @@ class Node: right_child: int is_leaf: bool + @classmethod + def _from_xgboost_node( + cls, xgb_node: dict[str, Any], feature_map: dict[Any, int] + ) -> Node: + return Node( + num=xgb_node["nodeid"], + weight_value=xgb_node.get("leaf", 0.0), + hessian_sum=xgb_node["cover"], + depth=xgb_node.get("depth", 0), + split_value=float(np.float32(xgb_node.get("split_condition", 0.0))), + split_feature=feature_map.get(xgb_node.get("split", 0), 0), + split_gain=xgb_node.get("gain", 0.0), + missing_node=xgb_node.get("missing", 0), + left_child=xgb_node.get("yes", 0), + right_child=xgb_node.get("no", 0), + is_leaf="leaf" in xgb_node, + ) + + +def _xgboost_tree_to_nodes( + tree: dict[str, Any], feature_map: dict[Any, int] +) -> list[dict[str, Any]]: + buffer = [tree] + node_list = [] + while len(buffer) > 0: + xgb_node = buffer.pop(0) + node_list.append( + dataclasses.asdict( + Node._from_xgboost_node(xgb_node, feature_map=feature_map) + ) + ) + if "leaf" not in xgb_node: + buffer.extend(xgb_node["children"]) + # Ensure the nodeids all align with the nodes index + for idx, node in enumerate(node_list): + if idx != node["num"]: + raise ValueError( + f"Nodes are unaligned for node {node['num']} at index {idx}" + ) + return node_list + + +def _from_xgboost_model(model: Any) -> GradientBooster: + import xgboost + + if isinstance(model, xgboost.XGBModel): + booster = model.get_booster() + else: + booster = cast(xgboost.Booster, model) + # Get the model dump... + model_dump = booster.get_dump(dump_format="json", with_stats=True) + features = booster.feature_names + if features is None: + feature_map = {} + else: + feature_map = {v: i for i, v in enumerate(features)} + + # Get the nodes + trees = [] + for tree in model_dump: + nodes = _xgboost_tree_to_nodes(tree=json.loads(tree), feature_map=feature_map) + trees.append({"nodes": nodes}) + + # This is would be wrong, for models trained with "binary:logistic" + # because the base score is modified prior to predictions. + # We would need to modify prior to handing it to the forust + # model. + learner_config = json.loads(model.get_booster().save_config())["learner"] + base_score = float(learner_config["learner_model_param"]["base_score"]) + if learner_config["objective"]["name"] == "binary:logistic": + base_score = np.log(base_score / (1 - base_score)) + + # Get initial dump + model_json = json.loads(GradientBooster().json_dump()) + model_json["base_score"] = base_score + model_json["trees"] = trees + + # Populate booster from json + final_model = GradientBooster() + final_model.booster = CrateGradientBooster.from_json(json.dumps(model_json)) + if features is not None: + final_model.feature_names_in_ = features + final_model.n_features_ = len(features) + return final_model + class BoosterType(Protocol): monotone_constraints: dict[int, int] diff --git a/py-forust/tests/test_booster.py b/py-forust/tests/test_booster.py index 2f506b8..7e3679c 100644 --- a/py-forust/tests/test_booster.py +++ b/py-forust/tests/test_booster.py @@ -889,6 +889,37 @@ def test_booster_to_xgboosts_with_contributions_shapley(X_y): assert np.allclose(fmod_preds, xmod.predict(X, output_margin=True), atol=0.00001) +def test_booster_to_xgboosts_with_contributions_shapley_from_xgboost(X_y): + X, y = X_y + X = X.astype(np.float32) + xmod = XGBClassifier( + n_estimators=100, + learning_rate=0.3, + max_depth=10, + reg_lambda=1, + min_child_weight=1, + gamma=1, + objective="binary:logitraw", + eval_metric="auc", + tree_method="hist", + base_score=0.5, + ) + xmod.fit(X, y) + + fmod = forust._from_xgboost_model(xmod) + + contribs_shapley = fmod.predict_contributions(X, method="Shapley") + fmod_preds = fmod.predict(X) + + import xgboost as xgb + + xmod_contribs_shapley = xmod.get_booster().predict( + xgb.DMatrix(X), approx_contribs=False, pred_contribs=True + ) + assert np.allclose(contribs_shapley, xmod_contribs_shapley, atol=0.00001) + assert np.allclose(fmod_preds, xmod.predict(X, output_margin=True), atol=0.00001) + + def test_missing_branch_with_contributions(X_y): X, y = X_y X = X From 0d8e54cbebbbfbd970f0686a5762ebd9ce16b57f Mon Sep 17 00:00:00 2001 From: jinlow Date: Tue, 21 Nov 2023 11:05:26 -0600 Subject: [PATCH 3/6] Adding docs and update version --- Cargo.toml | 2 +- README.md | 2 +- py-forust/Cargo.toml | 4 ++-- py-forust/forust/__init__.py | 1 + rs-example.md | 2 +- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 5e91d5c..94874ad 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "forust-ml" -version = "0.4.2" +version = "0.4.3" edition = "2021" authors = ["James Inlow "] homepage = "https://github.com/jinlow/forust" diff --git a/README.md b/README.md index 200e889..0fd7129 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ pip install forust To use in a rust project add the following to your Cargo.toml file. ```toml -forust-ml = "0.4.2" +forust-ml = "0.4.3" ``` ## Usage diff --git a/py-forust/Cargo.toml b/py-forust/Cargo.toml index 5fc6d9a..111db83 100644 --- a/py-forust/Cargo.toml +++ b/py-forust/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "py-forust" -version = "0.4.2" +version = "0.4.3" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -10,7 +10,7 @@ crate-type = ["cdylib"] [dependencies] pyo3 = { version = "0.20.0", features = ["extension-module"] } -forust-ml = { version = "0.4.2", path = "../" } +forust-ml = { version = "0.4.3", path = "../" } numpy = "0.20.0" ndarray = "0.15.1" serde_plain = { version = "1.0" } diff --git a/py-forust/forust/__init__.py b/py-forust/forust/__init__.py index 0af27d0..b23a599 100644 --- a/py-forust/forust/__init__.py +++ b/py-forust/forust/__init__.py @@ -670,6 +670,7 @@ def predict_contributions( method (str, optional): Method to calculate the contributions, available options are: - "Average": If this option is specified, the average internal node values are calculated, this is equivalent to the `approx_contribs` parameter in XGBoost. + - "Shapley": Using this option will calculate contributions using the tree shap algorithm. - "Weight": This method will use the internal leaf weights, to calculate the contributions. This is the same as what is described by Saabas [here](https://blog.datadive.net/interpreting-random-forests/). - "BranchDifference": This method will calculate contributions by subtracting the weight of the node the record will travel down by the weight of the other non-missing branch. This method does not have the property where the contributions summed is equal to the final prediction of the model. - "MidpointDifference": This method will calculate contributions by subtracting the weight of the node the record will travel down by the mid-point between the right and left node weighted by the cover of each node. This method does not have the property where the contributions summed is equal to the final prediction of the model. diff --git a/rs-example.md b/rs-example.md index 5378237..88a5cc7 100644 --- a/rs-example.md +++ b/rs-example.md @@ -3,7 +3,7 @@ To run this example, add the following code to your `Cargo.toml` file. ```toml [dependencies] -forust-ml = "0.4.2" +forust-ml = "0.4.3" polars = "0.28" reqwest = { version = "0.11", features = ["blocking"] } ``` From 716769af9af62966b96b91c3ae949e67b165fbe2 Mon Sep 17 00:00:00 2001 From: jinlow Date: Tue, 21 Nov 2023 14:20:58 -0600 Subject: [PATCH 4/6] Trying to set the capacity --- src/shaply.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/shaply.rs b/src/shaply.rs index f168344..60ae240 100644 --- a/src/shaply.rs +++ b/src/shaply.rs @@ -44,6 +44,11 @@ impl PathList { &mut self.paths[i] } } + fn with_capacity(capacity: usize) -> PathList { + PathList { + paths: Vec::with_capacity(capacity), + } + } } fn extend_path( @@ -213,7 +218,7 @@ pub fn predict_contributions_row_shapley( contribs, 0, 0, - PathList::default(), + PathList::with_capacity(tree.nodes.len()), 1., 1., row.len() + 100, From 0ef4ce7fa2b168d5bef29a26eb162aa53a72b17a Mon Sep 17 00:00:00 2001 From: jinlow Date: Tue, 21 Nov 2023 14:31:08 -0600 Subject: [PATCH 5/6] Switch back to default pathlist, still slow --- src/shaply.rs | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/shaply.rs b/src/shaply.rs index 60ae240..6fa6a68 100644 --- a/src/shaply.rs +++ b/src/shaply.rs @@ -44,11 +44,16 @@ impl PathList { &mut self.paths[i] } } - fn with_capacity(capacity: usize) -> PathList { - PathList { - paths: Vec::with_capacity(capacity), - } - } + // fn with_capacity(capacity: usize) -> PathList { + // PathList { + // paths: Vec::with_capacity(capacity), + // } + // } + // fn with_empty(l: usize) -> PathList { + // PathList { + // paths: vec![PathElement::default(); l], + // } + // } } fn extend_path( @@ -218,7 +223,7 @@ pub fn predict_contributions_row_shapley( contribs, 0, 0, - PathList::with_capacity(tree.nodes.len()), + PathList::default(), 1., 1., row.len() + 100, From 1de6e61750fb9a76d81f6d871608f531df33b8b9 Mon Sep 17 00:00:00 2001 From: jinlow Date: Mon, 4 Dec 2023 20:50:05 -0600 Subject: [PATCH 6/6] fix name --- src/gradientbooster.rs | 2 +- src/lib.rs | 2 +- src/{shaply.rs => shapley.rs} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename src/{shaply.rs => shapley.rs} (100%) diff --git a/src/gradientbooster.rs b/src/gradientbooster.rs index 4c239f2..1d794e1 100644 --- a/src/gradientbooster.rs +++ b/src/gradientbooster.rs @@ -8,7 +8,7 @@ use crate::objective::{ SquaredLoss, }; use crate::sampler::{GossSampler, RandomSampler, SampleMethod, Sampler}; -use crate::shaply::predict_contributions_row_shapley; +use crate::shapley::predict_contributions_row_shapley; use crate::splitter::{MissingBranchSplitter, MissingImputerSplitter, Splitter}; use crate::tree::Tree; use crate::utils::{fmt_vec_output, odds, validate_positive_float_field}; diff --git a/src/lib.rs b/src/lib.rs index fa9f23a..99d93bf 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,7 +1,7 @@ mod histogram; mod node; mod partial_dependence; -mod shaply; +mod shapley; // Modules pub mod binning; diff --git a/src/shaply.rs b/src/shapley.rs similarity index 100% rename from src/shaply.rs rename to src/shapley.rs