Merge pull request #9 from jinlow/binning-sample-weights
Formatting and fixing sample weight in binning
jinlow authored Jun 18, 2022
2 parents c5ed633 + 80029db commit d288c31
Showing 7 changed files with 68 additions and 68 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "forust-ml"
version = "0.1.3"
version = "0.1.4"
edition = "2021"
authors = ["James Inlow <[email protected]>"]
homepage = "https://github.com/jinlow/forust"
14 changes: 8 additions & 6 deletions README.md
@@ -1,12 +1,12 @@
<p align="center">
<img height="340" src="resources/tree-image-crop.png">
<img height="340" src="https://github.com/jinlow/forust/raw/main/resources/tree-image-crop.png">
</p>


<div align="center">

<a href="">![PyPI](https://img.shields.io/pypi/v/forust?color=gr&style=for-the-badge)</a>
<a href="">![Crates.io](https://img.shields.io/crates/v/forust-ml?color=gr&style=for-the-badge)</a>
<a href="https://pypi.org/project/forust/">![PyPI](https://img.shields.io/pypi/v/forust?color=gr&style=for-the-badge)</a>
<a href="https://crates.io/crates/forust-ml">![Crates.io](https://img.shields.io/crates/v/forust-ml?color=gr&style=for-the-badge)</a>

</div>

@@ -30,8 +30,8 @@ The `GradientBooster` class is currently the only public facing class in the pac
It can be initialized with the following arguments.

- `objective_type` ***(str, optional)***: The name of objective function used to optimize.
Valid options include "LogLoss" to use logistic loss as the objective function,
or "SquaredLoss" to use Squared Error as the objective function.
Valid options include "LogLoss" to use logistic loss as the objective function (binary classification),
or "SquaredLoss" to use Squared Error as the objective function (continuous regression).
Defaults to "LogLoss".
- `iterations` ***(int, optional)***: Total number of trees to train in the ensemble.
Defaults to 100.
@@ -81,7 +81,9 @@ model.predict(X.head())

The `fit` method accepts the following arguments.
- `X` ***(FrameLike)***: Either a pandas DataFrame, or a 2 dimensional numpy array, with numeric data.
- `y` ***(ArrayLike)***: Either a pandas Series, or a 1 dimensional numpy array.
- `y` ***(ArrayLike)***: Either a pandas Series, or a 1 dimensional numpy array. If "LogLoss" was
the objective type specified, then this should only contain 1 or 0 values, where 1 is the positive class being predicted. If "SquaredLoss" is the objective type, then any continuous variable can be
provided.
- `sample_weight` ***(Optional[ArrayLike], optional)***: Instance weights to use when
training the model. If None is passed, a weight of 1 will be used for every record.
Defaults to None.
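
Putting those `fit` arguments together, here is a minimal Python sketch of training with instance weights; the toy frame, labels, and weights below are invented for illustration and are not part of this commit:

```python
import numpy as np
import pandas as pd
from forust import GradientBooster

# Invented toy data: two numeric features and 0/1 labels for the "LogLoss" objective.
rng = np.random.default_rng(0)
X = pd.DataFrame({"a": rng.normal(size=100), "b": rng.normal(size=100)})
y = (X["a"] + rng.normal(size=100) > 0).astype(int)
w = np.where(y == 1, 2.0, 1.0)  # up-weight the positive class

model = GradientBooster(objective_type="LogLoss", iterations=100)
model.fit(X, y=y, sample_weight=w)  # omit sample_weight to give every record a weight of 1
preds = model.predict(X)
```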
2 changes: 1 addition & 1 deletion py-forust/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "py-forust"
version = "0.1.3"
version = "0.1.4"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
13 changes: 8 additions & 5 deletions py-forust/forust/__init__.py
@@ -72,10 +72,10 @@ def __init__(
"""Gradient Booster Class, used to generate gradient boosted decision tree ensembles.
Args:
objective_type (str, optional): The name of objective function used to optimize.
Valid options include "LogLoss" to use logistic loss as the objective function,
or "SquaredLoss" to use Squared Error as the objective function.
Defaults to "LogLoss".
objective_type (str, optional): The name of objective function used to optimize.
Valid options include "LogLoss" to use logistic loss as the objective function
(binary classification), or "SquaredLoss" to use Squared Error as the objective
function (continuous regression). Defaults to "LogLoss".
iterations (int, optional): Total number of trees to train in the ensemble.
Defaults to 100.
learning_rate (float, optional): Step size to use at each iteration. Each
@@ -138,7 +138,10 @@ def fit(
Args:
X (FrameLike): Either a pandas DataFrame, or a 2 dimensional numpy array.
y (ArrayLike): Either a pandas Series, or a 1 dimensional numpy array.
y (ArrayLike): Either a pandas Series, or a 1 dimensional numpy array. If "LogLoss"
was the objective type specified, then this should only contain 1 or 0 values,
where 1 is the positive class being predicted. If "SquaredLoss" is the
objective type, then any continuous variable can be provided.
sample_weight (Optional[ArrayLike], optional): Instance weights to use when
training the model. If None is passed, a weight of 1 will be used for every record.
Defaults to None.
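
The "SquaredLoss" path described in this docstring looks the same from the caller's side, just with a continuous target. A rough sketch, again with made-up data:

```python
import numpy as np
import pandas as pd
from forust import GradientBooster

# Made-up regression data: a noisy linear signal as the continuous target.
rng = np.random.default_rng(1)
X = pd.DataFrame({"x1": rng.normal(size=200), "x2": rng.normal(size=200)})
y = 3.0 * X["x1"] - 2.0 * X["x2"] + rng.normal(scale=0.1, size=200)

reg = GradientBooster(objective_type="SquaredLoss", iterations=50)
reg.fit(X, y=y)          # sample_weight defaults to None, so every record is weighted 1
fitted = reg.predict(X)
```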
14 changes: 5 additions & 9 deletions py-forust/src/lib.rs
@@ -19,6 +19,7 @@ struct GradientBooster {
#[pymethods]
impl GradientBooster {
#[new]
#[allow(clippy::too_many_arguments)]
pub fn new(
objective_type: &str,
iterations: usize,
@@ -65,9 +66,7 @@ impl GradientBooster {
let data = Matrix::new(flat_data, rows, cols);
let y = y.as_slice()?;
let sample_weight = sample_weight.as_slice()?;
self.booster
.fit(&data, &y, &sample_weight)
.unwrap();
self.booster.fit(&data, y, sample_weight).unwrap();
Ok(())
}
pub fn predict<'py>(
@@ -80,10 +79,7 @@ impl GradientBooster {
) -> PyResult<&'py PyArray1<f64>> {
let flat_data = flat_data.as_slice()?;
let data = Matrix::new(flat_data, rows, cols);
let parallel = match parallel {
None => true,
Some(v) => v,
};
let parallel = parallel.unwrap_or(true);
Ok(self.booster.predict(&data, parallel).into_pyarray(py))
}

@@ -92,7 +88,7 @@ impl GradientBooster {
for t in &self.booster.trees {
trees.push(format!("{}", t));
}
return Ok(trees);
Ok(trees)
}

pub fn save_booster(&self, path: &str) -> PyResult<()> {
@@ -127,7 +123,7 @@ impl GradientBooster {
Ok(GradientBooster { booster })
}

pub fn get_params<'py>(&self, py: Python<'py>) -> PyResult<PyObject> {
pub fn get_params(&self, py: Python) -> PyResult<PyObject> {
let objective_ = match self.booster.objective_type {
ObjectiveType::LogLoss => "LogLoss",
ObjectiveType::SquaredLoss => "SquaredLoss",
80 changes: 39 additions & 41 deletions py-forust/tests/test_booster.py
@@ -2,7 +2,7 @@
import pandas as pd
import numpy as np
from forust import GradientBooster
from xgboost import XGBClassifier
from xgboost import XGBClassifier, XGBRegressor
import pytest


@@ -81,42 +81,26 @@ def test_booster_to_xgboosts_with_missing(X_y):
assert np.allclose(fmod_preds, xmod_preds, atol=0.00001)


def test_booster_to_xgboosts_weighted(X_y):
def test_booster_to_xgboosts_with_missing_sl(X_y):
X, y = X_y
X = X.fillna(0)
w = X["fare"].to_numpy() + 1
xmod = XGBClassifier(
X = X
X["survived"] = y
y = X["fare"]
X = X.drop(columns=["fare"])
xmod = XGBRegressor(
n_estimators=100,
learning_rate=0.3,
max_depth=5,
reg_lambda=1,
min_child_weight=1,
gamma=0,
objective="binary:logitraw",
gamma=1,
eval_metric="auc",
tree_method="hist",
max_bins=1000,
max_bin=10000,
)
xmod.fit(X, y, sample_weight=w)
xmod.fit(X, y)
xmod_preds = xmod.predict(X, output_margin=True)

fmod = GradientBooster(
iterations=100,
learning_rate=0.3,
max_depth=5,
l2=1,
min_leaf_weight=1,
gamma=0,
objective_type="LogLoss",
)
fmod.fit(X, y=y, sample_weight=w)
fmod_preds = fmod.predict(X)
assert np.allclose(fmod_preds, xmod_preds, atol=0.0001)


def test_booster_saving(X_y, tmp_path):
f32_model_path = tmp_path / "modelf32.json"
X, y = X_y
X = X
fmod = GradientBooster(
iterations=100,
learning_rate=0.3,
@@ -130,31 +114,44 @@ def test_booster_saving(X_y, tmp_path):
)
fmod.fit(X, y=y)
fmod_preds = fmod.predict(X)
fmod.save_booster(f32_model_path)
fmod_loaded = GradientBooster.load_booster(f32_model_path)
assert all(fmod_preds == fmod_loaded.predict(X))
assert np.allclose(fmod_preds, xmod_preds, atol=0.00001)


f32_model_path = tmp_path / "modelf32.json"
def test_booster_to_xgboosts_weighted(X_y):
X, y = X_y
X = X
X = X.fillna(0)
w = X["fare"].to_numpy() + 1
xmod = XGBClassifier(
n_estimators=100,
learning_rate=0.3,
max_depth=5,
reg_lambda=1,
min_child_weight=1,
gamma=0,
objective="binary:logitraw",
tree_method="hist",
max_bins=1000,
)
xmod.fit(X, y, sample_weight=w)
xmod_preds = xmod.predict(X, output_margin=True)

fmod = GradientBooster(
iterations=100,
learning_rate=0.3,
max_depth=5,
l2=1,
min_leaf_weight=1,
gamma=1,
gamma=0,
objective_type="LogLoss",
nbins=500,
parallel=False,
)
fmod.fit(X, y=y)
fmod.fit(X, y=y, sample_weight=w)
fmod_preds = fmod.predict(X)
fmod.save_booster(f32_model_path)
fmod_loaded = GradientBooster.load_booster(f32_model_path)
assert all(fmod_preds == fmod_loaded.predict(X))
assert np.allclose(fmod_preds, xmod_preds, atol=0.0001)


f64_model_path = tmp_path / "modelf64.json"
def test_booster_saving(X_y, tmp_path):
# squared loss
f64_model_path = tmp_path / "modelf64_sl.json"
X, y = X_y
X = X
fmod = GradientBooster(
@@ -174,7 +171,8 @@ def test_booster_saving(X_y, tmp_path):
fmod_loaded = GradientBooster.load_booster(f64_model_path)
assert all(fmod_preds == fmod_loaded.predict(X))

f64_model_path = tmp_path / "modelf64.json"
# LogLoss
f64_model_path = tmp_path / "modelf64_ll.json"
X, y = X_y
X = X
fmod = GradientBooster(
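
For readers skimming the test above, the save/load round-trip it exercises condenses to roughly the following; the frame and file name here are placeholders rather than the fixture used in the suite:

```python
import numpy as np
import pandas as pd
from forust import GradientBooster

# Placeholder data, only to show the shape of the round-trip.
rng = np.random.default_rng(2)
X = pd.DataFrame({"f0": rng.normal(size=50), "f1": rng.normal(size=50)})
y = (rng.random(50) > 0.5).astype(int)

model = GradientBooster(objective_type="LogLoss", iterations=10)
model.fit(X, y=y)

model.save_booster("model.json")                     # serialize the booster to disk
loaded = GradientBooster.load_booster("model.json")  # restore it
assert np.allclose(model.predict(X), loaded.predict(X))
```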
11 changes: 6 additions & 5 deletions src/binning.rs
@@ -86,13 +86,14 @@ pub fn bin_matrix<T: FloatData<T>>(
let mut cuts = JaggedMatrix::new();
let mut nunique = Vec::new();
for i in 0..data.cols {
let no_miss: Vec<T> = data
let (no_miss, w): (Vec<T>, Vec<T>) = data
.get_col(i)
.iter()
.filter(|v| !v.is_nan())
.copied()
.collect();
let mut col_cuts = percentiles_or_value(&no_miss, sample_weight, &pcts);
.zip(sample_weight.iter())
.filter(|(v, _)| !v.is_nan())
.unzip();
assert_eq!(no_miss.len(), w.len());
let mut col_cuts = percentiles_or_value(&no_miss, &w, &pcts);
col_cuts.push(T::MAX);
col_cuts.dedup();
if col_cuts.len() < 3 {
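
The `bin_matrix` change above keeps each column's sample weights aligned with its non-missing values before the weighted percentiles are computed. The following NumPy snippet illustrates the same idea in Python; it is an illustration of the fix, not the crate's implementation:

```python
import numpy as np

def filter_missing(col: np.ndarray, sample_weight: np.ndarray):
    """Drop NaN entries from a column and the matching weights together,
    mirroring the zip/filter/unzip now used in bin_matrix."""
    mask = ~np.isnan(col)
    no_miss, w = col[mask], sample_weight[mask]
    assert len(no_miss) == len(w)  # same invariant the Rust code now asserts
    return no_miss, w

col = np.array([1.0, np.nan, 3.0, 4.0, np.nan])
weight = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
values, weights = filter_missing(col, weight)
# values  -> [1., 3., 4.]
# weights -> [1., 3., 4.]
# Before this fix, the full unfiltered weight vector was passed alongside the
# filtered values, so values and weights could fall out of alignment.
```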
