
Merge pull request #21 from jinlow/adding-feature-contributions
Sped up contribution calculation
jinlow authored Apr 23, 2023
2 parents 76481b0 + 38268bb commit 5b53399
Showing 14 changed files with 442 additions and 215 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "forust-ml"
-version = "0.2.0"
+version = "0.2.1"
edition = "2021"
authors = ["James Inlow <[email protected]>"]
homepage = "https://github.com/jinlow/forust"
16 changes: 15 additions & 1 deletion README.md
@@ -26,7 +26,7 @@ pip install forust

To use in a rust project add the following to your Cargo.toml file.
```toml
-forust-ml = "0.2.0"
+forust-ml = "0.2.1"
```

## Usage
@@ -84,6 +84,13 @@ model.fit(X, y)
# Predict on data
model.predict(X.head())
# array([-1.94919663, 2.25863229, 0.32963671, 2.48732194, -3.00371813])

# Predict contributions
model.predict_contributions(X.head())
# array([[-0.63014213, 0.33880048, -0.16520798, -0.07798772, -0.85083578,
# -1.07720813],
# [ 1.05406709, 0.08825999, 0.21662544, -0.12083538, 0.35209258,
# -1.07720813],
```

The `fit` method accepts the following arguments.
@@ -102,6 +109,13 @@ The predict method accepts the following arguments.
passed, the `parallel` attribute of the booster will be used.
Defaults to `None`.

The `predict_contributions` method will predict with the fitted booster on new data, returning the feature contribution matrix. The last column is the bias term.
- `X` ***(FrameLike)***: Either a pandas DataFrame, or a 2 dimensional numpy array, with numeric data.
- `parallel` ***(Optional[bool], optional)***: Optionally specify if the predict
function should run in parallel on multiple threads. If `None` is
passed, the `parallel` attribute of the booster will be used.
Defaults to `None`.
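
Since each row of the contribution matrix, bias column included, sums to the model's raw prediction, the output can be sanity-checked like so (a minimal sketch, using the `model` and `X` from the usage example above):

```python
import numpy as np

contribs = model.predict_contributions(X.head())
preds = model.predict(X.head())
# Each row of contributions, including the bias term, sums to the raw prediction.
assert np.allclose(contribs.sum(axis=1), preds)
```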

### Inspecting the Model

Once the booster has been fit, each individual tree structure can be retrieved in text form, using the `text_dump` method. This method returns a list, the same length as the number of trees in the model.
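
For example (a brief sketch, assuming `model` is a fitted `GradientBooster`):

```python
trees = model.text_dump()
# One entry per tree in the model.
print(len(trees))
# Text representation of the first tree's structure.
print(trees[0])
```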
4 changes: 2 additions & 2 deletions py-forust/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "py-forust"
-version = "0.2.0"
+version = "0.2.1"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -10,6 +10,6 @@ crate-type = ["cdylib"]

[dependencies]
pyo3 = { version = "0.17", features = ["extension-module"] }
-forust-ml = { version="0.2.0", path="../" }
+forust-ml = { version="0.2.1", path="../" }
numpy = "0.17.2"
ndarray = "0.15.1"
40 changes: 40 additions & 0 deletions py-forust/forust/__init__.py
@@ -35,6 +35,15 @@ def predict(
) -> np.ndarray:
raise NotImplementedError()

def predict_contributions(
self,
flat_data: np.ndarray,
rows: int,
cols: int,
parallel: bool = True,
) -> np.ndarray:
        raise NotImplementedError()

def value_partial_dependence(
self,
feature: int,
@@ -238,6 +247,37 @@ def predict(self, X: FrameLike, parallel: Union[bool, None] = None) -> np.ndarray:
parallel=parallel_,
)

def predict_contributions(
self, X: FrameLike, parallel: Union[bool, None] = None
) -> np.ndarray:
"""Predict with the fitted booster on new data, returning the feature
contribution matrix. The last column is the bias term.
Args:
X (FrameLike): Either a pandas DataFrame, or a 2 dimensional numpy array.
parallel (Union[bool, None], optional): Optionally specify if the predict
function should run in parallel on multiple threads. If `None` is
passed, the `parallel` attribute of the booster will be used.
Defaults to `None`.
Returns:
np.ndarray: Returns a numpy array of the predictions.
"""
X_ = X.to_numpy() if isinstance(X, pd.DataFrame) else X
if not np.issubdtype(X_.dtype, "float64"):
X_ = X_.astype(dtype="float64", copy=False)

parallel_ = self.parallel if parallel is None else parallel
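        # Flatten in column-major (Fortran) order, matching the layout the Rust Matrix expects.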
flat_data = X_.ravel(order="F")
rows, cols = X_.shape
contributions = self.booster.predict_contributions(
flat_data=flat_data,
rows=rows,
cols=cols,
parallel=parallel_,
)
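        # The booster returns a flat vector; reshape to (rows, cols + 1),
        # where the final column is the bias term.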
return np.reshape(contributions, (X_.shape[0], X_.shape[1] + 1))

def partial_dependence(self, X: FrameLike, feature: Union[str, int]) -> np.ndarray:
"""Calculate the partial dependence values of a feature. For each unique
value of the feature, this gives the estimate of the predicted value for that
Expand Down
16 changes: 16 additions & 0 deletions py-forust/src/lib.rs
@@ -112,6 +112,22 @@ impl GradientBooster {
let parallel = parallel.unwrap_or(true);
Ok(self.booster.predict(&data, parallel).into_pyarray(py))
}
pub fn predict_contributions<'py>(
&self,
py: Python<'py>,
flat_data: PyReadonlyArray1<f64>,
rows: usize,
cols: usize,
parallel: Option<bool>,
) -> PyResult<&'py PyArray1<f64>> {
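        // Rebuild the column-major Matrix from the flattened numpy data before predicting.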
let flat_data = flat_data.as_slice()?;
let data = Matrix::new(flat_data, rows, cols);
let parallel = parallel.unwrap_or(true);
Ok(self
.booster
.predict_contributions(&data, parallel)
.into_pyarray(py))
}

pub fn value_partial_dependence(&self, feature: usize, value: f64) -> PyResult<f64> {
Ok(self.booster.value_partial_dependence(feature, value))
46 changes: 46 additions & 0 deletions py-forust/tests/test_booster.py
@@ -263,3 +263,49 @@ def test_monotone_constraints(X_y):
assert np.all(p_d[0:-1, 1] >= p_d[1:, 1])
else:
assert np.all(p_d[0:-1, 1] <= p_d[1:, 1])


def test_booster_to_xgboosts_with_contributions(X_y):
X, y = X_y
fmod = GradientBooster(
iterations=100,
learning_rate=0.3,
max_depth=5,
l2=1,
min_leaf_weight=1,
gamma=1,
objective_type="LogLoss",
nbins=500,
parallel=False,
base_score=0.0,
)
fmod.fit(X, y=y)
fmod_preds = fmod.predict(X)
fmod_contribs = fmod.predict_contributions(X)
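    # Each row of contributions, bias column included, should sum to the raw prediction.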
assert fmod_contribs.shape[1] == X.shape[1] + 1
assert np.allclose(fmod_contribs.sum(1), fmod_preds)

xmod = XGBClassifier(
n_estimators=100,
learning_rate=0.3,
max_depth=5,
reg_lambda=1,
min_child_weight=1,
gamma=1,
objective="binary:logitraw",
eval_metric="auc",
tree_method="hist",
max_bin=10000,
base_score=0.0,
)
xmod.fit(X, y)
xmod_preds = xmod.predict(X, output_margin=True)
import xgboost as xgb

xmod_contribs = xmod.get_booster().predict(
xgb.DMatrix(X), approx_contribs=True, pred_contribs=True
)
assert np.allclose(fmod_contribs, xmod_contribs, atol=0.000001)
2 changes: 1 addition & 1 deletion rs-example.md
@@ -3,7 +3,7 @@
To run this example, add the following code to your `Cargo.toml` file.
```toml
[dependencies]
-forust-ml = "0.2.0"
+forust-ml = "0.2.1"
polars = "0.24"
reqwest = { version = "0.11", features = ["blocking"] }
```
15 changes: 15 additions & 0 deletions src/data.rs
@@ -93,6 +93,7 @@ pub struct Matrix<'a, T> {
}

impl<'a, T> Matrix<'a, T> {
// Defaults to column major
pub fn new(data: &'a [T], rows: usize, cols: usize) -> Self {
Matrix {
data,
@@ -137,6 +138,20 @@ impl<'a, T> Matrix<'a, T> {
}
}

/// A lightweight row-major matrix. This is primarily for returning
/// data to the user, and is especially suited to appending rows,
/// such as when building up a matrix of contributions to return.
/// An added benefit is that it will be even faster to return to numpy.
// pub struct RowMajorMatrix<T> {
// pub data: Vec<T>,
// pub rows: usize,
// pub cols: usize,
// stride1: usize,
// stride2: usize,
// }

impl<'a, T> fmt::Display for Matrix<'a, T>
where
T: FromStr + std::fmt::Display,
25 changes: 25 additions & 0 deletions src/gradientbooster.rs
@@ -5,6 +5,7 @@ use crate::errors::ForustError;
use crate::objective::{gradient_hessian_callables, ObjectiveType};
use crate::splitter::MissingImputerSplitter;
use crate::tree::Tree;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::fs;

@@ -215,6 +216,28 @@ impl GradientBooster {
init_preds
}

    /// Generate the feature contributions for predictions on data using
    /// the gradient booster. The result is a flat vector of
    /// `rows * (cols + 1)` values, where the final value of each row is
    /// the bias term.
    ///
    /// * `data` - The Matrix of data to calculate the contributions for.
    /// * `parallel` - Whether to run the calculation over multiple threads.
pub fn predict_contributions(&self, data: &Matrix<f64>, parallel: bool) -> Vec<f64> {
let weights: Vec<Vec<f64>> = if parallel {
self.trees
.par_iter()
.map(|t| t.distribute_leaf_weights())
.collect()
} else {
self.trees
.iter()
.map(|t| t.distribute_leaf_weights())
.collect()
};
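        // Flat buffer of rows * (cols + 1) values; the extra slot per row holds the bias term.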
let mut contribs = vec![0.; (data.cols + 1) * data.rows];
self.trees.iter().zip(weights.iter()).for_each(|(t, w)| {
t.predict_contributions(data, &mut contribs, w, parallel);
});
contribs
}

/// Given a value, return the partial dependence value of that value for that
/// feature in the model.
///
@@ -390,6 +413,8 @@ mod tests {
let sample_weight = vec![1.; y.len()];
booster.fit(&data, &y, &sample_weight).unwrap();
let preds = booster.predict(&data, false);
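        // The contribution matrix is returned flat, with one extra bias column per row.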
let contribs = booster.predict_contributions(&data, false);
assert_eq!(contribs.len(), (data.cols + 1) * data.rows);
println!("{}", booster.trees[0]);
println!("{}", booster.trees[0].nodes.len());
println!("{}", booster.trees.last().unwrap().nodes.len());