
Merge pull request #21 from jinlow/adding-feature-contributions
Sped up contribution calculation
jinlow authored Apr 23, 2023
2 parents 76481b0 + 38268bb commit 5b53399
Showing 14 changed files with 442 additions and 215 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "forust-ml"
-version = "0.2.0"
+version = "0.2.1"
edition = "2021"
authors = ["James Inlow <[email protected]>"]
homepage = "https://github.com/jinlow/forust"
16 changes: 15 additions & 1 deletion README.md
@@ -26,7 +26,7 @@ pip install forust

To use in a rust project add the following to your Cargo.toml file.
```toml
-forust-ml = "0.2.0"
+forust-ml = "0.2.1"
```

## Usage
@@ -84,6 +84,13 @@ model.fit(X, y)
# Predict on data
model.predict(X.head())
# array([-1.94919663, 2.25863229, 0.32963671, 2.48732194, -3.00371813])

# Predict contributions
model.predict_contributions(X.head())
# array([[-0.63014213, 0.33880048, -0.16520798, -0.07798772, -0.85083578,
# -1.07720813],
# [ 1.05406709, 0.08825999, 0.21662544, -0.12083538, 0.35209258,
# -1.07720813],
```

The `fit` method accepts the following arguments.
@@ -102,6 +109,13 @@ The predict method accepts the following arguments.
passed, the `parallel` attribute of the booster will be used.
Defaults to `None`.

The `predict_contributions` method will predict with the fitted booster on new data, returning the feature contribution matrix. The last column is the bias term.
- `X` ***(FrameLike)***: Either a pandas DataFrame, or a 2 dimensional numpy array, with numeric data.
- `parallel` ***(Optional[bool], optional)***: Optionally specify if the predict
function should run in parallel on multiple threads. If `None` is
passed, the `parallel` attribute of the booster will be used.
Defaults to `None`.
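
Since each row of the contribution matrix, bias column included, sums to the model's raw prediction, the output can be sanity-checked like so (a minimal sketch, using the `model` and `X` from the usage example above):

```python
import numpy as np

contribs = model.predict_contributions(X.head())
preds = model.predict(X.head())
# Each row of contributions, including the bias term, sums to the raw prediction.
assert np.allclose(contribs.sum(axis=1), preds)
```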

### Inspecting the Model

Once the booster has been fit, each individual tree structure can be retrieved in text form, using the `text_dump` method. This method returns a list, the same length as the number of trees in the model.
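
For example (a brief sketch, assuming `model` is a fitted `GradientBooster`):

```python
trees = model.text_dump()
# One entry per tree in the model.
print(len(trees))
# Text representation of the first tree's structure.
print(trees[0])
```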
4 changes: 2 additions & 2 deletions py-forust/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "py-forust"
-version = "0.2.0"
+version = "0.2.1"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -10,6 +10,6 @@ crate-type = ["cdylib"]

[dependencies]
pyo3 = { version = "0.17", features = ["extension-module"] }
-forust-ml = { version="0.2.0", path="../" }
+forust-ml = { version="0.2.1", path="../" }
numpy = "0.17.2"
ndarray = "0.15.1"
40 changes: 40 additions & 0 deletions py-forust/forust/__init__.py
@@ -35,6 +35,15 @@ def predict(
) -> np.ndarray:
raise NotImplementedError()

def predict_contributions(
self,
flat_data: np.ndarray,
rows: int,
cols: int,
parallel: bool = True,
) -> np.ndarray:
        raise NotImplementedError()

def value_partial_dependence(
self,
feature: int,
@@ -238,6 +247,37 @@ def predict(self, X: FrameLike, parallel: Union[bool, None] = None) -> np.ndarray:
parallel=parallel_,
)

def predict_contributions(
self, X: FrameLike, parallel: Union[bool, None] = None
) -> np.ndarray:
"""Predict with the fitted booster on new data, returning the feature
contribution matrix. The last column is the bias term.
Args:
X (FrameLike): Either a pandas DataFrame, or a 2 dimensional numpy array.
parallel (Union[bool, None], optional): Optionally specify if the predict
function should run in parallel on multiple threads. If `None` is
passed, the `parallel` attribute of the booster will be used.
Defaults to `None`.
Returns:
np.ndarray: Returns a numpy array of the predictions.
"""
X_ = X.to_numpy() if isinstance(X, pd.DataFrame) else X
if not np.issubdtype(X_.dtype, "float64"):
X_ = X_.astype(dtype="float64", copy=False)

parallel_ = self.parallel if parallel is None else parallel
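        # Flatten in column-major (Fortran) order, matching the layout the Rust Matrix expects.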
flat_data = X_.ravel(order="F")
rows, cols = X_.shape
contributions = self.booster.predict_contributions(
flat_data=flat_data,
rows=rows,
cols=cols,
parallel=parallel_,
)
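        # The booster returns a flat vector; reshape to (rows, cols + 1),
        # where the final column is the bias term.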
return np.reshape(contributions, (X_.shape[0], X_.shape[1] + 1))

def partial_dependence(self, X: FrameLike, feature: Union[str, int]) -> np.ndarray:
"""Calculate the partial dependence values of a feature. For each unique
value of the feature, this gives the estimate of the predicted value for that
Expand Down
16 changes: 16 additions & 0 deletions py-forust/src/lib.rs
@@ -112,6 +112,22 @@ impl GradientBooster {
let parallel = parallel.unwrap_or(true);
Ok(self.booster.predict(&data, parallel).into_pyarray(py))
}
pub fn predict_contributions<'py>(
&self,
py: Python<'py>,
flat_data: PyReadonlyArray1<f64>,
rows: usize,
cols: usize,
parallel: Option<bool>,
) -> PyResult<&'py PyArray1<f64>> {
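        // Rebuild the column-major Matrix from the flattened numpy data before predicting.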
let flat_data = flat_data.as_slice()?;
let data = Matrix::new(flat_data, rows, cols);
let parallel = parallel.unwrap_or(true);
Ok(self
.booster
.predict_contributions(&data, parallel)
.into_pyarray(py))
}

pub fn value_partial_dependence(&self, feature: usize, value: f64) -> PyResult<f64> {
Ok(self.booster.value_partial_dependence(feature, value))
46 changes: 46 additions & 0 deletions py-forust/tests/test_booster.py
@@ -263,3 +263,49 @@ def test_monotone_constraints(X_y):
assert np.all(p_d[0:-1, 1] >= p_d[1:, 1])
else:
assert np.all(p_d[0:-1, 1] <= p_d[1:, 1])


def test_booster_to_xgboosts_with_contributions(X_y):
X, y = X_y
fmod = GradientBooster(
iterations=100,
learning_rate=0.3,
max_depth=5,
l2=1,
min_leaf_weight=1,
gamma=1,
objective_type="LogLoss",
nbins=500,
parallel=False,
base_score=0.0,
)
fmod.fit(X, y=y)
fmod_preds = fmod.predict(X)
fmod_contribs = fmod.predict_contributions(X)
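    # Each row of contributions, bias column included, should sum to the raw prediction.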
assert fmod_contribs.shape[1] == X.shape[1] + 1
assert np.allclose(fmod_contribs.sum(1), fmod_preds)

xmod = XGBClassifier(
n_estimators=100,
learning_rate=0.3,
max_depth=5,
reg_lambda=1,
min_child_weight=1,
gamma=1,
objective="binary:logitraw",
eval_metric="auc",
tree_method="hist",
max_bin=10000,
base_score=0.0,
)
xmod.fit(X, y)
xmod_preds = xmod.predict(X, output_margin=True)
import xgboost as xgb

xmod_contribs = xmod.get_booster().predict(
xgb.DMatrix(X), approx_contribs=True, pred_contribs=True
)
assert np.allclose(fmod_contribs, xmod_contribs, atol=0.000001)
2 changes: 1 addition & 1 deletion rs-example.md
@@ -3,7 +3,7 @@
To run this example, add the following code to your `Cargo.toml` file.
```toml
[dependencies]
-forust-ml = "0.2.0"
+forust-ml = "0.2.1"
polars = "0.24"
reqwest = { version = "0.11", features = ["blocking"] }
```
15 changes: 15 additions & 0 deletions src/data.rs
@@ -93,6 +93,7 @@ pub struct Matrix<'a, T> {
}

impl<'a, T> Matrix<'a, T> {
// Defaults to column major
pub fn new(data: &'a [T], rows: usize, cols: usize) -> Self {
Matrix {
data,
@@ -137,6 +138,20 @@ impl<'a, T> Matrix<'a, T> {
}
}

/// A lightweight row-major matrix. This is primarily for returning
/// data to the user, and is especially suited to appending rows,
/// such as when building up a matrix of contributions to return.
/// An added benefit is that it will be even faster to return to numpy.
// pub struct RowMajorMatrix<T> {
// pub data: Vec<T>,
// pub rows: usize,
// pub cols: usize,
// stride1: usize,
// stride2: usize,
// }

impl<'a, T> fmt::Display for Matrix<'a, T>
where
T: FromStr + std::fmt::Display,
25 changes: 25 additions & 0 deletions src/gradientbooster.rs
@@ -5,6 +5,7 @@ use crate::errors::ForustError;
use crate::objective::{gradient_hessian_callables, ObjectiveType};
use crate::splitter::MissingImputerSplitter;
use crate::tree::Tree;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::fs;

@@ -215,6 +216,28 @@ impl GradientBooster {
init_preds
}

    /// Generate the feature contributions for predictions on data using
    /// the gradient booster. The result is a flat vector of
    /// `rows * (cols + 1)` values, where the final value of each row is
    /// the bias term.
    ///
    /// * `data` - The Matrix of data to calculate the contributions for.
    /// * `parallel` - Whether to run the calculation over multiple threads.
pub fn predict_contributions(&self, data: &Matrix<f64>, parallel: bool) -> Vec<f64> {
let weights: Vec<Vec<f64>> = if parallel {
self.trees
.par_iter()
.map(|t| t.distribute_leaf_weights())
.collect()
} else {
self.trees
.iter()
.map(|t| t.distribute_leaf_weights())
.collect()
};
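        // Flat buffer of rows * (cols + 1) values; the extra slot per row holds the bias term.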
let mut contribs = vec![0.; (data.cols + 1) * data.rows];
self.trees.iter().zip(weights.iter()).for_each(|(t, w)| {
t.predict_contributions(data, &mut contribs, w, parallel);
});
contribs
}

/// Given a value, return the partial dependence value of that value for that
/// feature in the model.
///
@@ -390,6 +413,8 @@ mod tests {
let sample_weight = vec![1.; y.len()];
booster.fit(&data, &y, &sample_weight).unwrap();
let preds = booster.predict(&data, false);
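        // The contribution matrix is returned flat, with one extra bias column per row.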
let contribs = booster.predict_contributions(&data, false);
assert_eq!(contribs.len(), (data.cols + 1) * data.rows);
println!("{}", booster.trees[0]);
println!("{}", booster.trees[0].nodes.len());
println!("{}", booster.trees.last().unwrap().nodes.len());