Merge pull request #9 from jinlow/binning-sample-weights
Formatting and fixing sample weight in binning
jinlow authored Jun 18, 2022
2 parents c5ed633 + 80029db commit d288c31
Showing 7 changed files with 68 additions and 68 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "forust-ml"
version = "0.1.3"
version = "0.1.4"
edition = "2021"
authors = ["James Inlow <[email protected]>"]
homepage = "https://github.com/jinlow/forust"
14 changes: 8 additions & 6 deletions README.md
@@ -1,12 +1,12 @@
<p align="center">
<img height="340" src="resources/tree-image-crop.png">
<img height="340" src="https://github.com/jinlow/forust/raw/main/resources/tree-image-crop.png">
</p>


<div align="center">

<a href="">![PyPI](https://img.shields.io/pypi/v/forust?color=gr&style=for-the-badge)</a>
<a href="">![Crates.io](https://img.shields.io/crates/v/forust-ml?color=gr&style=for-the-badge)</a>
<a href="https://pypi.org/project/forust/">![PyPI](https://img.shields.io/pypi/v/forust?color=gr&style=for-the-badge)</a>
<a href="https://crates.io/crates/forust-ml">![Crates.io](https://img.shields.io/crates/v/forust-ml?color=gr&style=for-the-badge)</a>

</div>

@@ -30,8 +30,8 @@ The `GradientBooster` class is currently the only public facing class in the pac
It can be initialized with the following arguments.

- `objective_type` ***(str, optional)***: The name of objective function used to optimize.
Valid options include "LogLoss" to use logistic loss as the objective function,
or "SquaredLoss" to use Squared Error as the objective function.
Valid options include "LogLoss" to use logistic loss as the objective function (binary classification),
or "SquaredLoss" to use Squared Error as the objective function (continuous regression).
Defaults to "LogLoss".
- `iterations` ***(int, optional)***: Total number of trees to train in the ensemble.
Defaults to 100.
@@ -81,7 +81,9 @@ model.predict(X.head())

The `fit` method accepts the following arguments.
- `X` ***(FrameLike)***: Either a pandas DataFrame, or a 2 dimensional numpy array, with numeric data.
- `y` ***(ArrayLike)***: Either a pandas Series, or a 1 dimensional numpy array.
- `y` ***(ArrayLike)***: Either a pandas Series, or a 1 dimensional numpy array. If "LogLoss" was
the objective type specified, then this should only contain 1 or 0 values, where 1 is the positive class being predicted. If "SquaredLoss" is the objective type, then any continuous variable can be
provided.
- `sample_weight` ***(Optional[ArrayLike], optional)***: Instance weights to use when
training the model. If None is passed, a weight of 1 will be used for every record.
Defaults to None.
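
Putting those `fit` arguments together, here is a minimal Python sketch of training with instance weights; the toy frame, labels, and weights below are invented for illustration and are not part of this commit:

```python
import numpy as np
import pandas as pd
from forust import GradientBooster

# Invented toy data: two numeric features and 0/1 labels for the "LogLoss" objective.
rng = np.random.default_rng(0)
X = pd.DataFrame({"a": rng.normal(size=100), "b": rng.normal(size=100)})
y = (X["a"] + rng.normal(size=100) > 0).astype(int)
w = np.where(y == 1, 2.0, 1.0)  # up-weight the positive class

model = GradientBooster(objective_type="LogLoss", iterations=100)
model.fit(X, y=y, sample_weight=w)  # omit sample_weight to give every record a weight of 1
preds = model.predict(X)
```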
2 changes: 1 addition & 1 deletion py-forust/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "py-forust"
version = "0.1.3"
version = "0.1.4"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
13 changes: 8 additions & 5 deletions py-forust/forust/__init__.py
@@ -72,10 +72,10 @@ def __init__(
"""Gradient Booster Class, used to generate gradient boosted decision tree ensembles.
Args:
objective_type (str, optional): The name of objective function used to optimize.
Valid options include "LogLoss" to use logistic loss as the objective function,
or "SquaredLoss" to use Squared Error as the objective function.
Defaults to "LogLoss".
objective_type (str, optional): The name of objective function used to optimize.
Valid options include "LogLoss" to use logistic loss as the objective function
(binary classification), or "SquaredLoss" to use Squared Error as the objective
function (continuous regression). Defaults to "LogLoss".
iterations (int, optional): Total number of trees to train in the ensemble.
Defaults to 100.
learning_rate (float, optional): Step size to use at each iteration. Each
@@ -138,7 +138,10 @@ def fit(
Args:
X (FrameLike): Either a pandas DataFrame, or a 2 dimensional numpy array.
y (ArrayLike): Either a pandas Series, or a 1 dimensional numpy array.
y (ArrayLike): Either a pandas Series, or a 1 dimensional numpy array. If "LogLoss"
was the objective type specified, then this should only contain 1 or 0 values,
where 1 is the positive class being predicted. If "SquaredLoss" is the
objective type, then any continuous variable can be provided.
sample_weight (Optional[ArrayLike], optional): Instance weights to use when
training the model. If None is passed, a weight of 1 will be used for every record.
Defaults to None.
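
The "SquaredLoss" path described in this docstring looks the same from the caller's side, just with a continuous target. A rough sketch, again with made-up data:

```python
import numpy as np
import pandas as pd
from forust import GradientBooster

# Made-up regression data: a noisy linear signal as the continuous target.
rng = np.random.default_rng(1)
X = pd.DataFrame({"x1": rng.normal(size=200), "x2": rng.normal(size=200)})
y = 3.0 * X["x1"] - 2.0 * X["x2"] + rng.normal(scale=0.1, size=200)

reg = GradientBooster(objective_type="SquaredLoss", iterations=50)
reg.fit(X, y=y)          # sample_weight defaults to None, so every record is weighted 1
fitted = reg.predict(X)
```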
14 changes: 5 additions & 9 deletions py-forust/src/lib.rs
@@ -19,6 +19,7 @@ struct GradientBooster {
#[pymethods]
impl GradientBooster {
#[new]
#[allow(clippy::too_many_arguments)]
pub fn new(
objective_type: &str,
iterations: usize,
@@ -65,9 +66,7 @@ impl GradientBooster {
let data = Matrix::new(flat_data, rows, cols);
let y = y.as_slice()?;
let sample_weight = sample_weight.as_slice()?;
self.booster
.fit(&data, &y, &sample_weight)
.unwrap();
self.booster.fit(&data, y, sample_weight).unwrap();
Ok(())
}
pub fn predict<'py>(
@@ -80,10 +79,7 @@ impl GradientBooster {
) -> PyResult<&'py PyArray1<f64>> {
let flat_data = flat_data.as_slice()?;
let data = Matrix::new(flat_data, rows, cols);
let parallel = match parallel {
None => true,
Some(v) => v,
};
let parallel = parallel.unwrap_or(true);
Ok(self.booster.predict(&data, parallel).into_pyarray(py))
}

@@ -92,7 +88,7 @@ impl GradientBooster {
for t in &self.booster.trees {
trees.push(format!("{}", t));
}
return Ok(trees);
Ok(trees)
}

pub fn save_booster(&self, path: &str) -> PyResult<()> {
@@ -127,7 +123,7 @@ impl GradientBooster {
Ok(GradientBooster { booster })
}

pub fn get_params<'py>(&self, py: Python<'py>) -> PyResult<PyObject> {
pub fn get_params(&self, py: Python) -> PyResult<PyObject> {
let objective_ = match self.booster.objective_type {
ObjectiveType::LogLoss => "LogLoss",
ObjectiveType::SquaredLoss => "SquaredLoss",
80 changes: 39 additions & 41 deletions py-forust/tests/test_booster.py
@@ -2,7 +2,7 @@
import pandas as pd
import numpy as np
from forust import GradientBooster
from xgboost import XGBClassifier
from xgboost import XGBClassifier, XGBRegressor
import pytest


@@ -81,42 +81,26 @@ def test_booster_to_xgboosts_with_missing(X_y):
assert np.allclose(fmod_preds, xmod_preds, atol=0.00001)


def test_booster_to_xgboosts_weighted(X_y):
def test_booster_to_xgboosts_with_missing_sl(X_y):
X, y = X_y
X = X.fillna(0)
w = X["fare"].to_numpy() + 1
xmod = XGBClassifier(
X = X
X["survived"] = y
y = X["fare"]
X = X.drop(columns=["fare"])
xmod = XGBRegressor(
n_estimators=100,
learning_rate=0.3,
max_depth=5,
reg_lambda=1,
min_child_weight=1,
gamma=0,
objective="binary:logitraw",
gamma=1,
eval_metric="auc",
tree_method="hist",
max_bins=1000,
max_bin=10000,
)
xmod.fit(X, y, sample_weight=w)
xmod.fit(X, y)
xmod_preds = xmod.predict(X, output_margin=True)

fmod = GradientBooster(
iterations=100,
learning_rate=0.3,
max_depth=5,
l2=1,
min_leaf_weight=1,
gamma=0,
objective_type="LogLoss",
)
fmod.fit(X, y=y, sample_weight=w)
fmod_preds = fmod.predict(X)
assert np.allclose(fmod_preds, xmod_preds, atol=0.0001)


def test_booster_saving(X_y, tmp_path):
f32_model_path = tmp_path / "modelf32.json"
X, y = X_y
X = X
fmod = GradientBooster(
iterations=100,
learning_rate=0.3,
@@ -130,31 +114,44 @@ def test_booster_saving(X_y, tmp_path):
)
fmod.fit(X, y=y)
fmod_preds = fmod.predict(X)
fmod.save_booster(f32_model_path)
fmod_loaded = GradientBooster.load_booster(f32_model_path)
assert all(fmod_preds == fmod_loaded.predict(X))
assert np.allclose(fmod_preds, xmod_preds, atol=0.00001)


f32_model_path = tmp_path / "modelf32.json"
def test_booster_to_xgboosts_weighted(X_y):
X, y = X_y
X = X
X = X.fillna(0)
w = X["fare"].to_numpy() + 1
xmod = XGBClassifier(
n_estimators=100,
learning_rate=0.3,
max_depth=5,
reg_lambda=1,
min_child_weight=1,
gamma=0,
objective="binary:logitraw",
tree_method="hist",
max_bins=1000,
)
xmod.fit(X, y, sample_weight=w)
xmod_preds = xmod.predict(X, output_margin=True)

fmod = GradientBooster(
iterations=100,
learning_rate=0.3,
max_depth=5,
l2=1,
min_leaf_weight=1,
gamma=1,
gamma=0,
objective_type="LogLoss",
nbins=500,
parallel=False,
)
fmod.fit(X, y=y)
fmod.fit(X, y=y, sample_weight=w)
fmod_preds = fmod.predict(X)
fmod.save_booster(f32_model_path)
fmod_loaded = GradientBooster.load_booster(f32_model_path)
assert all(fmod_preds == fmod_loaded.predict(X))
assert np.allclose(fmod_preds, xmod_preds, atol=0.0001)


f64_model_path = tmp_path / "modelf64.json"
def test_booster_saving(X_y, tmp_path):
# squared loss
f64_model_path = tmp_path / "modelf64_sl.json"
X, y = X_y
X = X
fmod = GradientBooster(
@@ -174,7 +171,8 @@ def test_booster_saving(X_y, tmp_path):
fmod_loaded = GradientBooster.load_booster(f64_model_path)
assert all(fmod_preds == fmod_loaded.predict(X))

f64_model_path = tmp_path / "modelf64.json"
# LogLoss
f64_model_path = tmp_path / "modelf64_ll.json"
X, y = X_y
X = X
fmod = GradientBooster(
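
For readers skimming the test above, the save/load round-trip it exercises condenses to roughly the following; the frame and file name here are placeholders rather than the fixture used in the suite:

```python
import numpy as np
import pandas as pd
from forust import GradientBooster

# Placeholder data, only to show the shape of the round-trip.
rng = np.random.default_rng(2)
X = pd.DataFrame({"f0": rng.normal(size=50), "f1": rng.normal(size=50)})
y = (rng.random(50) > 0.5).astype(int)

model = GradientBooster(objective_type="LogLoss", iterations=10)
model.fit(X, y=y)

model.save_booster("model.json")                     # serialize the booster to disk
loaded = GradientBooster.load_booster("model.json")  # restore it
assert np.allclose(model.predict(X), loaded.predict(X))
```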
11 changes: 6 additions & 5 deletions src/binning.rs
@@ -86,13 +86,14 @@ pub fn bin_matrix<T: FloatData<T>>(
let mut cuts = JaggedMatrix::new();
let mut nunique = Vec::new();
for i in 0..data.cols {
let no_miss: Vec<T> = data
let (no_miss, w): (Vec<T>, Vec<T>) = data
.get_col(i)
.iter()
.filter(|v| !v.is_nan())
.copied()
.collect();
let mut col_cuts = percentiles_or_value(&no_miss, sample_weight, &pcts);
.zip(sample_weight.iter())
.filter(|(v, _)| !v.is_nan())
.unzip();
assert_eq!(no_miss.len(), w.len());
let mut col_cuts = percentiles_or_value(&no_miss, &w, &pcts);
col_cuts.push(T::MAX);
col_cuts.dedup();
if col_cuts.len() < 3 {
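
The `bin_matrix` change above keeps each column's sample weights aligned with its non-missing values before the weighted percentiles are computed. The following NumPy snippet illustrates the same idea in Python; it is an illustration of the fix, not the crate's implementation:

```python
import numpy as np

def filter_missing(col: np.ndarray, sample_weight: np.ndarray):
    """Drop NaN entries from a column and the matching weights together,
    mirroring the zip/filter/unzip now used in bin_matrix."""
    mask = ~np.isnan(col)
    no_miss, w = col[mask], sample_weight[mask]
    assert len(no_miss) == len(w)  # same invariant the Rust code now asserts
    return no_miss, w

col = np.array([1.0, np.nan, 3.0, 4.0, np.nan])
weight = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
values, weights = filter_missing(col, weight)
# values  -> [1., 3., 4.]
# weights -> [1., 3., 4.]
# Before this fix, the full unfiltered weight vector was passed alongside the
# filtered values, so values and weights could fall out of alignment.
```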
