diff --git a/Cargo.toml b/Cargo.toml
index 5b7760e..5ac3b3f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "forust-ml"
-version = "0.1.3"
+version = "0.1.4"
 edition = "2021"
 authors = ["James Inlow "]
 homepage = "https://github.com/jinlow/forust"
diff --git a/README.md b/README.md
index eea5f48..bd81e09 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,12 @@

- ![PyPI](https://img.shields.io/pypi/v/forust?color=gr&style=for-the-badge)
- ![Crates.io](https://img.shields.io/crates/v/forust-ml?color=gr&style=for-the-badge)
+ ![PyPI](https://img.shields.io/pypi/v/forust?color=gr&style=for-the-badge)
+ ![Crates.io](https://img.shields.io/crates/v/forust-ml?color=gr&style=for-the-badge)
@@ -30,8 +30,8 @@ The `GradientBooster` class is currently the only public facing class in the pac
 It can be initialized with the following arguments.
 - `objective_type` ***(str, optional)***: The name of objective function used to optimize.
-    Valid options include "LogLoss" to use logistic loss as the objective function,
-    or "SquaredLoss" to use Squared Error as the objective function.
+    Valid options include "LogLoss" to use logistic loss as the objective function (binary classification),
+    or "SquaredLoss" to use Squared Error as the objective function (continuous regression).
     Defaults to "LogLoss".
 - `iterations` ***(int, optional)***: Total number of trees to train in the ensemble.
     Defaults to 100.
@@ -81,7 +81,9 @@ model.predict(X.head())
 The `fit` method accepts the following arguments.
 - `X` ***(FrameLike)***: Either a pandas DataFrame, or a 2 dimensional numpy array, with numeric data.
-- `y` ***(ArrayLike)***: Either a pandas Series, or a 1 dimensional numpy array.
+- `y` ***(ArrayLike)***: Either a pandas Series, or a 1 dimensional numpy array. If "LogLoss" was
+  the objective type specified, then this should only contain 1 or 0 values, where 1 is the positive class being predicted. If "SquaredLoss" is the objective type, then any continuous variable can be
+  provided.
 - `sample_weight` ***(Optional[ArrayLike], optional)***: Instance weights to use when training
   the model. If None is passed, a weight of 1 will be used for every record. Defaults to None.
diff --git a/py-forust/Cargo.toml b/py-forust/Cargo.toml
index f89eb54..280c2f4 100644
--- a/py-forust/Cargo.toml
+++ b/py-forust/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "py-forust"
-version = "0.1.3"
+version = "0.1.4"
 edition = "2021"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
diff --git a/py-forust/forust/__init__.py b/py-forust/forust/__init__.py
index 6b6ef2f..c526851 100644
--- a/py-forust/forust/__init__.py
+++ b/py-forust/forust/__init__.py
@@ -72,10 +72,10 @@ def __init__(
         """Gradient Booster Class, used to generate gradient boosted decision tree ensembles.
 
         Args:
-            objective_type (str, optional): The name of objective function used to optimize.
-                Valid options include "LogLoss" to use logistic loss as the objective function,
-                or "SquaredLoss" to use Squared Error as the objective function.
-                Defaults to "LogLoss".
+            objective_type (str, optional): The name of objective function used to optimize.
+                Valid options include "LogLoss" to use logistic loss as the objective function
+                (binary classification), or "SquaredLoss" to use Squared Error as the objective
+                function (continuous regression). Defaults to "LogLoss".
             iterations (int, optional): Total number of trees to train in the ensemble.
                 Defaults to 100.
             learning_rate (float, optional): Step size to use at each iteration. Each
@@ -138,7 +138,10 @@ def fit(
 
         Args:
             X (FrameLike): Either a pandas DataFrame, or a 2 dimensional numpy array.
-            y (ArrayLike): Either a pandas Series, or a 1 dimensional numpy array.
+            y (ArrayLike): Either a pandas Series, or a 1 dimensional numpy array. If "LogLoss"
+                was the objective type specified, then this should only contain 1 or 0 values,
+                where 1 is the positive class being predicted. If "SquaredLoss" is the
+                objective type, then any continuous variable can be provided.
             sample_weight (Optional[ArrayLike], optional): Instance weights to use when training
                 the model. If None is passed, a weight of 1 will be used for every record.
                 Defaults to None.
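For context, the documented behavior above corresponds to a usage pattern like the one below. This is a minimal sketch, not code from this diff: the toy DataFrame and the "age" column are made up (only "survived" and "fare" mirror the test fixtures), and it only uses the constructor and `fit`/`predict` arguments described in the README and docstrings. As in the tests, `predict` returns margin-scale values (it is compared against xgboost's `output_margin=True`).

```python
import numpy as np
import pandas as pd
from forust import GradientBooster

# Hypothetical frame in the spirit of the test fixtures: numeric features,
# a 0/1 "survived" label, and a continuous "fare" column.
df = pd.DataFrame(
    {
        "age": [22.0, 38.0, np.nan, 35.0],
        "fare": [7.25, 71.28, 8.05, 53.10],
        "survived": [0, 1, 1, 1],
    }
)

# "LogLoss": y must contain only 0/1 values, where 1 is the positive class.
clf = GradientBooster(objective_type="LogLoss", iterations=100, learning_rate=0.3)
clf.fit(df[["age", "fare"]], y=df["survived"])
log_odds = clf.predict(df[["age", "fare"]])  # margin (log-odds) scale

# "SquaredLoss": any continuous target can be provided.
reg = GradientBooster(objective_type="SquaredLoss", iterations=100)
reg.fit(df[["age", "survived"]], y=df["fare"])
fare_pred = reg.predict(df[["age", "survived"]])
```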
diff --git a/py-forust/src/lib.rs b/py-forust/src/lib.rs
index 3721745..fc54995 100644
--- a/py-forust/src/lib.rs
+++ b/py-forust/src/lib.rs
@@ -19,6 +19,7 @@ struct GradientBooster {
 #[pymethods]
 impl GradientBooster {
     #[new]
+    #[allow(clippy::too_many_arguments)]
     pub fn new(
         objective_type: &str,
         iterations: usize,
@@ -65,9 +66,7 @@
         let data = Matrix::new(flat_data, rows, cols);
         let y = y.as_slice()?;
         let sample_weight = sample_weight.as_slice()?;
-        self.booster
-            .fit(&data, &y, &sample_weight)
-            .unwrap();
+        self.booster.fit(&data, y, sample_weight).unwrap();
         Ok(())
     }
     pub fn predict<'py>(
@@ -80,10 +79,7 @@
     ) -> PyResult<&'py PyArray1> {
         let flat_data = flat_data.as_slice()?;
         let data = Matrix::new(flat_data, rows, cols);
-        let parallel = match parallel {
-            None => true,
-            Some(v) => v,
-        };
+        let parallel = parallel.unwrap_or(true);
         Ok(self.booster.predict(&data, parallel).into_pyarray(py))
     }
 
@@ -92,7 +88,7 @@
         for t in &self.booster.trees {
             trees.push(format!("{}", t));
         }
-        return Ok(trees);
+        Ok(trees)
     }
 
     pub fn save_booster(&self, path: &str) -> PyResult<()> {
@@ -127,7 +123,7 @@
         Ok(GradientBooster { booster })
     }
 
-    pub fn get_params<'py>(&self, py: Python<'py>) -> PyResult {
+    pub fn get_params(&self, py: Python) -> PyResult {
         let objective_ = match self.booster.objective_type {
             ObjectiveType::LogLoss => "LogLoss",
             ObjectiveType::SquaredLoss => "SquaredLoss",
diff --git a/py-forust/tests/test_booster.py b/py-forust/tests/test_booster.py
index 55d9676..11101f4 100644
--- a/py-forust/tests/test_booster.py
+++ b/py-forust/tests/test_booster.py
@@ -2,7 +2,7 @@
 import pandas as pd
 import numpy as np
 from forust import GradientBooster
-from xgboost import XGBClassifier
+from xgboost import XGBClassifier, XGBRegressor
 import pytest
 
 
@@ -81,42 +81,26 @@ def test_booster_to_xgboosts_with_missing(X_y):
     assert np.allclose(fmod_preds, xmod_preds, atol=0.00001)
 
 
-def test_booster_to_xgboosts_weighted(X_y):
+def test_booster_to_xgboosts_with_missing_sl(X_y):
     X, y = X_y
-    X = X.fillna(0)
-    w = X["fare"].to_numpy() + 1
-    xmod = XGBClassifier(
+    X = X
+    X["survived"] = y
+    y = X["fare"]
+    X = X.drop(columns=["fare"])
+    xmod = XGBRegressor(
         n_estimators=100,
         learning_rate=0.3,
         max_depth=5,
         reg_lambda=1,
         min_child_weight=1,
-        gamma=0,
-        objective="binary:logitraw",
+        gamma=1,
+        eval_metric="auc",
         tree_method="hist",
-        max_bins=1000,
+        max_bin=10000,
     )
-    xmod.fit(X, y, sample_weight=w)
+    xmod.fit(X, y)
     xmod_preds = xmod.predict(X, output_margin=True)
 
-    fmod = GradientBooster(
-        iterations=100,
-        learning_rate=0.3,
-        max_depth=5,
-        l2=1,
-        min_leaf_weight=1,
-        gamma=0,
-        objective_type="LogLoss",
-    )
-    fmod.fit(X, y=y, sample_weight=w)
-    fmod_preds = fmod.predict(X)
-    assert np.allclose(fmod_preds, xmod_preds, atol=0.0001)
-
-
-def test_booster_saving(X_y, tmp_path):
-    f32_model_path = tmp_path / "modelf32.json"
-    X, y = X_y
-    X = X
     fmod = GradientBooster(
         iterations=100,
         learning_rate=0.3,
@@ -130,31 +114,44 @@
     )
     fmod.fit(X, y=y)
     fmod_preds = fmod.predict(X)
-    fmod.save_booster(f32_model_path)
-    fmod_loaded = GradientBooster.load_booster(f32_model_path)
-    assert all(fmod_preds == fmod_loaded.predict(X))
+    assert np.allclose(fmod_preds, xmod_preds, atol=0.00001)
+
 
-    f32_model_path = tmp_path / "modelf32.json"
+def test_booster_to_xgboosts_weighted(X_y):
     X, y = X_y
-    X = X
+    X = X.fillna(0)
+    w = X["fare"].to_numpy() + 1
+    xmod = XGBClassifier(
+        n_estimators=100,
+        learning_rate=0.3,
+        max_depth=5,
+        reg_lambda=1,
+        min_child_weight=1,
+        gamma=0,
+        objective="binary:logitraw",
+        tree_method="hist",
+        max_bins=1000,
+    )
+    xmod.fit(X, y, sample_weight=w)
+    xmod_preds = xmod.predict(X, output_margin=True)
+
     fmod = GradientBooster(
         iterations=100,
         learning_rate=0.3,
         max_depth=5,
         l2=1,
         min_leaf_weight=1,
-        gamma=1,
+        gamma=0,
         objective_type="LogLoss",
-        nbins=500,
-        parallel=False,
     )
-    fmod.fit(X, y=y)
+    fmod.fit(X, y=y, sample_weight=w)
     fmod_preds = fmod.predict(X)
-    fmod.save_booster(f32_model_path)
-    fmod_loaded = GradientBooster.load_booster(f32_model_path)
-    assert all(fmod_preds == fmod_loaded.predict(X))
+    assert np.allclose(fmod_preds, xmod_preds, atol=0.0001)
+
 
-    f64_model_path = tmp_path / "modelf64.json"
+def test_booster_saving(X_y, tmp_path):
+    # squared loss
+    f64_model_path = tmp_path / "modelf64_sl.json"
     X, y = X_y
     X = X
     fmod = GradientBooster(
@@ -174,7 +171,8 @@
     fmod.fit(X, y=y)
     fmod_preds = fmod.predict(X)
     fmod.save_booster(f64_model_path)
     fmod_loaded = GradientBooster.load_booster(f64_model_path)
     assert all(fmod_preds == fmod_loaded.predict(X))
 
-    f64_model_path = tmp_path / "modelf64.json"
+    # LogLoss
+    f64_model_path = tmp_path / "modelf64_ll.json"
     X, y = X_y
     X = X
     fmod = GradientBooster(
diff --git a/src/binning.rs b/src/binning.rs
index 140643b..0f1f218 100644
--- a/src/binning.rs
+++ b/src/binning.rs
@@ -86,13 +86,14 @@ pub fn bin_matrix>(
     let mut cuts = JaggedMatrix::new();
     let mut nunique = Vec::new();
     for i in 0..data.cols {
-        let no_miss: Vec = data
+        let (no_miss, w): (Vec, Vec) = data
             .get_col(i)
             .iter()
-            .filter(|v| !v.is_nan())
-            .copied()
-            .collect();
-        let mut col_cuts = percentiles_or_value(&no_miss, sample_weight, &pcts);
+            .zip(sample_weight.iter())
+            .filter(|(v, _)| !v.is_nan())
+            .unzip();
+        assert_eq!(no_miss.len(), w.len());
+        let mut col_cuts = percentiles_or_value(&no_miss, &w, &pcts);
         col_cuts.push(T::MAX);
         col_cuts.dedup();
         if col_cuts.len() < 3 {
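The binning.rs change above keeps each sample weight paired with its feature value while missing values are filtered out, so the percentile cut points are computed only over the weights of non-missing rows. Below is a rough numpy illustration of that idea; it is not the crate's actual code, and `weighted_percentiles` is a hypothetical stand-in for `percentiles_or_value`.

```python
import numpy as np


def weighted_percentiles(values: np.ndarray, weights: np.ndarray, pcts: np.ndarray) -> np.ndarray:
    """Stand-in for percentiles_or_value: weighted percentile cut points."""
    order = np.argsort(values)
    v, w = values[order], weights[order]
    cum = np.cumsum(w) - 0.5 * w  # mid-point cumulative weights
    cum = cum / np.sum(w)
    return np.interp(pcts, cum, v)


def bin_cuts(col: np.ndarray, sample_weight: np.ndarray, pcts: np.ndarray) -> np.ndarray:
    # Equivalent of the zip/filter/unzip in the diff: drop NaN values and
    # their weights together, so the two stay aligned.
    keep = ~np.isnan(col)
    no_miss, w = col[keep], sample_weight[keep]
    assert len(no_miss) == len(w)
    cuts = weighted_percentiles(no_miss, w, pcts)
    cuts = np.append(cuts, np.finfo(col.dtype).max)  # like col_cuts.push(T::MAX)
    return np.unique(cuts)  # dedup


# Example: the (large) weight attached to the NaN row is ignored entirely.
col = np.array([1.0, np.nan, 3.0, 4.0, 10.0])
wts = np.array([1.0, 100.0, 1.0, 1.0, 1.0])
print(bin_cuts(col, wts, np.array([0.25, 0.5, 0.75])))
```

Before this change, the full `sample_weight` slice was passed alongside a NaN-filtered value vector, so the two could disagree in length; pairing them before filtering is what the added `assert_eq!` guards.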