Merge pull request #22 from jinlow/conributions-bug-fix
Contributions bug fix
jinlow authored Apr 23, 2023
2 parents 5b53399 + 58eec18 commit 9155f60
Showing 8 changed files with 111 additions and 22 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "forust-ml"
-version = "0.2.1"
+version = "0.2.2"
 edition = "2021"
 authors = ["James Inlow <[email protected]>"]
 homepage = "https://github.com/jinlow/forust"
2 changes: 1 addition & 1 deletion README.md
@@ -26,7 +26,7 @@ pip install forust
 
 To use in a rust project add the following to your Cargo.toml file.
 ```toml
-forust-ml = "0.2.1"
+forust-ml = "0.2.2"
 ```
 
 ## Usage
4 changes: 2 additions & 2 deletions py-forust/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "py-forust"
-version = "0.2.1"
+version = "0.2.2"
 edition = "2021"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -10,6 +10,6 @@ crate-type = ["cdylib"]
 
 [dependencies]
 pyo3 = { version = "0.17", features = ["extension-module"] }
-forust-ml = { version="0.2.1", path="../" }
+forust-ml = { version="0.2.2", path="../" }
 numpy = "0.17.2"
 ndarray = "0.15.1"
4 changes: 2 additions & 2 deletions py-forust/tests/test_booster.py
@@ -278,7 +278,7 @@ def test_booster_to_xgboosts_with_contributions(X_y):
         objective_type="LogLoss",
         nbins=500,
         parallel=False,
-        base_score=0.0,
+        base_score=0.5,
     )
     fmod.fit(X, y=y)
     fmod_preds = fmod.predict(X)
@@ -299,7 +299,7 @@ def test_booster_to_xgboosts_with_contributions(X_y):
         eval_metric="auc",
         tree_method="hist",
         max_bin=10000,
-        base_score=0.0,
+        base_score=0.5,
     )
     xmod.fit(X, y)
     xmod_preds = xmod.predict(X, output_margin=True)
2 changes: 1 addition & 1 deletion rs-example.md
@@ -3,7 +3,7 @@
 To run this example, add the following code to your `Cargo.toml` file.
 ```toml
 [dependencies]
-forust-ml = "0.2.1"
+forust-ml = "0.2.2"
 polars = "0.24"
 reqwest = { version = "0.11", features = ["blocking"] }
 ```
28 changes: 27 additions & 1 deletion src/data.rs
@@ -119,6 +119,14 @@ impl<'a, T> Matrix<'a, T> {
         idx
     }
 
+    /// Get access to a row of the data, as an iterator.
+    pub fn get_row_iter(
+        &self,
+        row: usize,
+    ) -> std::iter::StepBy<std::iter::Skip<std::slice::Iter<T>>> {
+        self.data.iter().skip(row).step_by(self.rows)
+    }
+
     /// Get a slice of a column in the matrix.
     ///
     /// * `col` - The index of the column to select.
@@ -138,6 +146,16 @@ impl<'a, T> Matrix<'a, T> {
     }
 }
 
+impl<'a, T> Matrix<'a, T>
+where
+    T: Copy,
+{
+    /// Get a row of the data as a vector.
+    pub fn get_row(&self, row: usize) -> Vec<T> {
+        self.get_row_iter(row).copied().collect()
+    }
+}
+
 /// A lightweight row major matrix, this is primarily
 /// for returning data to the user, it is especially
 /// suited for appending rows to, such as when building
@@ -227,7 +245,6 @@ impl<T> JaggedMatrix<T> {
             n_records: 0,
         }
     }
-
     /// Get the column of a jagged array.
     pub fn get_col(&self, col: usize) -> &[T] {
         assert!(col < self.ends.len());
@@ -285,6 +302,15 @@ mod tests {
         assert_eq!(m.get_col(1), &vec![5, 6, 7]);
    }
 
+    #[test]
+    fn test_matrix_row() {
+        let v = vec![1, 2, 3, 5, 6, 7];
+        let m = Matrix::new(&v, 3, 2);
+        assert_eq!(m.get_row(2), vec![3, 7]);
+        assert_eq!(m.get_row(0), vec![1, 5]);
+        assert_eq!(m.get_row(1), vec![2, 6]);
+    }
+
     #[test]
     fn test_jaggedmatrix_get_col() {
         let vecs = vec![vec![0], vec![5, 4, 3, 2], vec![4, 5]];
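Note on the new row accessors: `Matrix` stores its data column-major, so element `(row, col)` lives at index `col * rows + row`, and `skip(row).step_by(rows)` visits exactly the elements of one row. Below is a minimal standalone sketch of that indexing; `ColMatrix` is a hypothetical stand-in, not the crate's `Matrix` type.

```rust
// Sketch of the column-major row walk behind `get_row_iter`/`get_row`.
struct ColMatrix<'a> {
    data: &'a [i32], // columns stored back to back: col0, col1, ...
    rows: usize,
}

impl<'a> ColMatrix<'a> {
    // Element (row, col) sits at col * rows + row, so one row is
    // every `rows`-th element starting at offset `row`.
    fn get_row(&self, row: usize) -> Vec<i32> {
        self.data.iter().skip(row).step_by(self.rows).copied().collect()
    }
}

fn main() {
    // 3 rows x 2 cols: col0 = [1, 2, 3], col1 = [5, 6, 7].
    let v = vec![1, 2, 3, 5, 6, 7];
    let m = ColMatrix { data: &v, rows: 3 };
    assert_eq!(m.get_row(0), vec![1, 5]);
    assert_eq!(m.get_row(1), vec![2, 6]);
    assert_eq!(m.get_row(2), vec![3, 7]);
}
```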
62 changes: 59 additions & 3 deletions src/gradientbooster.rs
@@ -216,6 +216,30 @@ impl GradientBooster {
         init_preds
     }
 
+    // pub fn predict(&self, data: &Matrix<f64>, parallel: bool) -> Vec<f64> {
+    //     // After we discovered it's faster materializing the row once, and then
+    //     // passing that to each tree, let's see if we can do the same with the booster
+    //     // prediction...
+    //     // Clean this up..
+    //     let mut init_preds = vec![self.base_score; data.rows];
+    //     if parallel {
+    //         init_preds.par_iter_mut().enumerate().for_each(|(i, p)| {
+    //             let pred_row = data.get_row(i);
+    //             for t in &self.trees {
+    //                 *p += t.predict_row_from_row_slice(&pred_row);
+    //             }
+    //         });
+    //     } else {
+    //         init_preds.iter_mut().enumerate().for_each(|(i, p)| {
+    //             let pred_row = data.get_row(i);
+    //             for t in &self.trees {
+    //                 *p += t.predict_row_from_row_slice(&pred_row);
+    //             }
+    //         });
+    //     }
+    //     init_preds
+    // }
+
     /// Generate predictions on data using the gradient booster.
     ///
     /// * `data` - Either a pandas DataFrame, or a 2 dimensional numpy array.
@@ -232,9 +256,41 @@ impl GradientBooster {
                 .collect()
         };
         let mut contribs = vec![0.; (data.cols + 1) * data.rows];
-        self.trees.iter().zip(weights.iter()).for_each(|(t, w)| {
-            t.predict_contributions(data, &mut contribs, w, parallel);
-        });
+
+        // Add the bias term to every bias value...
+        let bias_idx = data.cols + 1;
+        contribs
+            .iter_mut()
+            .skip(bias_idx - 1)
+            .step_by(bias_idx)
+            .for_each(|v| *v += self.base_score);
+
+        // Clean this up..
+        // Materializing a row, and then passing that to all of the
+        // trees, seems to be the fastest approach (5X faster); we should test
+        // something like this for normal predictions.
+        if parallel {
+            data.index
+                .par_iter()
+                .zip(contribs.par_chunks_mut(data.cols + 1))
+                .for_each(|(row, c)| {
+                    let r_ = data.get_row(*row);
+                    self.trees.iter().zip(weights.iter()).for_each(|(t, w)| {
+                        t.predict_contributions_row(&r_, c, w);
+                    });
+                });
+        } else {
+            data.index
+                .iter()
+                .zip(contribs.chunks_mut(data.cols + 1))
+                .for_each(|(row, c)| {
+                    let r_ = data.get_row(*row);
+                    self.trees.iter().zip(weights.iter()).for_each(|(t, w)| {
+                        t.predict_contributions_row(&r_, c, w);
+                    });
+                });
+        }
+
         contribs
     }
 
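For context on the bias indexing above: `contribs` is laid out row-major with `data.cols + 1` slots per row, and the final slot of each row holds the bias, so skipping `bias_idx - 1` elements and then stepping by `bias_idx` lands on exactly those slots. A self-contained sketch of that pattern under assumed sizes (the names here are illustrative, not the booster's API):

```rust
fn main() {
    // 3 rows, 4 features; each row owns `cols + 1` slots, bias last.
    let (rows, cols) = (3, 4);
    let stride = cols + 1;
    let mut contribs = vec![0.0_f64; stride * rows];
    let base_score = 0.5;

    // Same pattern as the booster: skip to the first bias slot,
    // then jump one full row-stride at a time.
    contribs
        .iter_mut()
        .skip(stride - 1)
        .step_by(stride)
        .for_each(|v| *v += base_score);

    // Only the final slot of each row was touched.
    for row in contribs.chunks(stride) {
        assert_eq!(row[stride - 1], base_score);
        assert!(row[..stride - 1].iter().all(|v| *v == 0.0));
    }
}
```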
29 changes: 18 additions & 11 deletions src/tree.rs
@@ -118,23 +118,17 @@ impl Tree {
             }
         }
     }
-    pub fn predict_contributions_row(
-        &self,
-        data: &Matrix<f64>,
-        row: usize,
-        contribs: &mut [f64],
-        weights: &[f64],
-    ) {
+    pub fn predict_contributions_row(&self, row: &[f64], contribs: &mut [f64], weights: &[f64]) {
         // Add the bias term first...
-        contribs[data.cols] += weights[0];
+        contribs[contribs.len() - 1] += weights[0];
         let mut node_idx = 0;
         loop {
             let node = &self.nodes[node_idx];
             if node.is_leaf {
                 break;
             }
             // Get change of weight given child's weight.
-            let child_idx = node.get_child_idx(data.get(row, node.split_feature));
+            let child_idx = node.get_child_idx(&row[node.split_feature]);
             let node_weight = weights[node_idx];
             let child_weight = weights[child_idx];
             let delta = child_weight - node_weight;
@@ -154,9 +148,10 @@ impl Tree {
             .iter()
             .zip(contribs.chunks_mut(data.cols + 1))
             .for_each(|(row, contribs)| {
-                self.predict_contributions_row(data, *row, contribs, weights)
+                self.predict_contributions_row(&data.get_row(*row), contribs, weights)
             })
     }
+
     fn predict_contributions_parallel(
         &self,
         data: &Matrix<f64>,
@@ -168,7 +163,7 @@ impl Tree {
             .par_iter()
             .zip(contribs.par_chunks_mut(data.cols + 1))
             .for_each(|(row, contribs)| {
-                self.predict_contributions_row(data, *row, contribs, weights)
+                self.predict_contributions_row(&data.get_row(*row), contribs, weights)
             })
     }
 
@@ -198,6 +193,18 @@ impl Tree {
         }
     }
 
+    pub fn predict_row_from_row_slice(&self, row: &[f64]) -> f64 {
+        let mut node_idx = 0;
+        loop {
+            let node = &self.nodes[node_idx];
+            if node.is_leaf {
+                return node.weight_value as f64;
+            } else {
+                node_idx = node.get_child_idx(&row[node.split_feature]);
+            }
+        }
+    }
+
     fn predict_single_threaded(&self, data: &Matrix<f64>) -> Vec<f64> {
         data.index
             .iter()
