From 09fde34d93e0e43aef3f2074d45651a55e13bc2e Mon Sep 17 00:00:00 2001
From: jinlow
Date: Tue, 17 Oct 2023 08:18:11 -0500
Subject: [PATCH 1/4] small refactor

---
 Cargo.toml           |  2 +-
 README.md            |  2 +-
 py-forust/Cargo.toml |  4 ++--
 py-forust/src/lib.rs |  5 +++++
 rs-example.md        |  2 +-
 src/splitter.rs      | 23 +++++++++++++----------
 6 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 457ef79..770d30b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "forust-ml"
-version = "0.4.0"
+version = "0.4.1"
 edition = "2021"
 authors = ["James Inlow"]
 homepage = "https://github.com/jinlow/forust"
diff --git a/README.md b/README.md
index 58886a5..1d33e7b 100644
--- a/README.md
+++ b/README.md
@@ -29,7 +29,7 @@ pip install forust
 To use in a rust project add the following to your Cargo.toml file.

 ```toml
-forust-ml = "0.4.0"
+forust-ml = "0.4.1"
 ```

 ## Usage
diff --git a/py-forust/Cargo.toml b/py-forust/Cargo.toml
index b21463f..2357b64 100644
--- a/py-forust/Cargo.toml
+++ b/py-forust/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "py-forust"
-version = "0.4.0"
+version = "0.4.1"
 edition = "2021"

 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -10,7 +10,7 @@ crate-type = ["cdylib"]

 [dependencies]
 pyo3 = { version = "0.19.0", features = ["extension-module"] }
-forust-ml = { version = "0.4.0", path = "../" }
+forust-ml = { version = "0.4.1", path = "../" }
 numpy = "0.19.0"
 ndarray = "0.15.1"
 serde_plain = { version = "1.0" }
diff --git a/py-forust/src/lib.rs b/py-forust/src/lib.rs
index 9ddfd32..22b91b1 100644
--- a/py-forust/src/lib.rs
+++ b/py-forust/src/lib.rs
@@ -193,6 +193,11 @@ impl GradientBooster {
         Ok(self.booster.base_score)
     }

+    // #[getter]
+    // fn number_of_trees(&self) -> PyResult<usize> {
+    //     Ok(self.booster.trees.len())
+    // }
+
     pub fn fit(
         &mut self,
         flat_data: PyReadonlyArray1<f64>,
diff --git a/rs-example.md b/rs-example.md
index 0794899..4b73001 100644
--- a/rs-example.md
+++ b/rs-example.md
@@ -3,7 +3,7 @@ To run this example, add the following code to your `Cargo.toml` file.

 ```toml
 [dependencies]
-forust-ml = "0.4.0"
+forust-ml = "0.4.1"
 polars = "0.28"
 reqwest = { version = "0.11", features = ["blocking"] }
 ```
diff --git a/src/splitter.rs b/src/splitter.rs
index 815e23e..cd5a2e7 100644
--- a/src/splitter.rs
+++ b/src/splitter.rs
@@ -114,11 +114,22 @@ pub trait Splitter {
         let elements = histogram.len();
         assert!(elements == histogram.len());

-        for (i, bin) in histogram[1..].iter().enumerate() {
+        for (i, bin) in histogram.iter().enumerate() {
             let left_gradient = cuml_grad;
             let left_hessian = cuml_hess;
             let right_gradient = node.gradient_sum - cuml_grad - missing.gradient_sum;
             let right_hessian = node.hessian_sum - cuml_hess - missing.hessian_sum;
+            // cuml_grad += bin.gradient_sum;
+            // cuml_hess += bin.hessian_sum;
+            if i > 0 {
+                // If i is zero, we are evaluating the missing bin...
+                cuml_grad += bin.gradient_sum;
+                cuml_hess += bin.hessian_sum;
+                // If this is the first bin, this is the missing bin.
+                // Is there even any missing data?
+            } else if missing.hessian_sum == 0. || missing.gradient_sum == 0. {
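
To see what the splitter change in [PATCH 1/4] is doing, here is a compilable, self-contained sketch of the same scan: bin 0 of each feature histogram holds the missing-value statistics, so the cumulative gradient/hessian sums only advance for i > 0, and the missing bin is skipped outright when it holds no data. The `Bin` struct, the plain l2-regularized gain, and `scan_splits` are illustrative stand-ins, not forust's real types; the crate's `evaluate_split` also handles weights, bounds, and monotone constraints.

#[derive(Clone, Copy)]
struct Bin {
    gradient_sum: f32,
    hessian_sum: f32,
}

/// Returns (best_bin, best_gain) over all candidate splits, treating
/// histogram[0] as the missing-value bin, mirroring the patched loop.
fn scan_splits(histogram: &[Bin], l2: f32) -> Option<(usize, f32)> {
    if histogram.is_empty() {
        return None;
    }
    let total_grad: f32 = histogram.iter().map(|b| b.gradient_sum).sum();
    let total_hess: f32 = histogram.iter().map(|b| b.hessian_sum).sum();
    let missing = histogram[0];

    // Plain l2-regularized gain; the crate's version is richer.
    let gain = |g: f32, h: f32| (g * g) / (h + l2);
    let parent_gain = gain(total_grad, total_hess);

    let (mut cuml_grad, mut cuml_hess) = (0.0_f32, 0.0_f32);
    let mut best: Option<(usize, f32)> = None;

    for (i, bin) in histogram.iter().enumerate() {
        let left_gradient = cuml_grad;
        let left_hessian = cuml_hess;
        let right_gradient = total_grad - cuml_grad - missing.gradient_sum;
        let right_hessian = total_hess - cuml_hess - missing.hessian_sum;
        if i > 0 {
            // Only real bins advance the cumulative sums; the missing bin's
            // statistics are already subtracted from the right side above.
            cuml_grad += bin.gradient_sum;
            cuml_hess += bin.hessian_sum;
        } else if missing.hessian_sum == 0.0 || missing.gradient_sum == 0.0 {
            // i == 0 is the missing bin; nothing to evaluate if it is empty.
            continue;
        }
        let split_gain =
            gain(left_gradient, left_hessian) + gain(right_gradient, right_hessian) - parent_gain;
        if split_gain > 0.0 && best.map_or(true, |(_, g)| split_gain > g) {
            best = Some((i, split_gain));
        }
    }
    best
}

fn main() {
    let histogram = vec![
        Bin { gradient_sum: 0.0, hessian_sum: 0.0 }, // empty missing bin
        Bin { gradient_sum: -2.0, hessian_sum: 1.0 },
        Bin { gradient_sum: 2.0, hessian_sum: 1.0 },
    ];
    println!("{:?}", scan_splits(&histogram, 1.0)); // Some((2, 4.0))
}
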
From 5183e0e027b6d070eafd11315996204bf01c1a70 Mon Sep 17 00:00:00 2001
From: jinlow
Date: Tue, 17 Oct 2023 09:18:14 -0500
Subject: [PATCH 2/4] fix gain cull call

---
 src/splitter.rs | 27 +++++++++------------------
 1 file changed, 9 insertions(+), 18 deletions(-)

diff --git a/src/splitter.rs b/src/splitter.rs
index cd5a2e7..303b48a 100644
--- a/src/splitter.rs
+++ b/src/splitter.rs
@@ -114,22 +114,13 @@ pub trait Splitter {
         let elements = histogram.len();
         assert!(elements == histogram.len());

-        for (i, bin) in histogram.iter().enumerate() {
+        for (i, bin) in histogram[1..].iter().enumerate() {
             let left_gradient = cuml_grad;
             let left_hessian = cuml_hess;
             let right_gradient = node.gradient_sum - cuml_grad - missing.gradient_sum;
             let right_hessian = node.hessian_sum - cuml_hess - missing.hessian_sum;
-            // cuml_grad += bin.gradient_sum;
-            // cuml_hess += bin.hessian_sum;
-            if i > 0 {
-                // If i is zero, we are evaluating the missing bin...
-                cuml_grad += bin.gradient_sum;
-                cuml_hess += bin.hessian_sum;
-                // If this is the first bin, this is the missing bin.
-                // Is there even any missing data?
-            } else if missing.hessian_sum == 0. || missing.gradient_sum == 0. {
-                continue;
-            }
+            cuml_grad += bin.gradient_sum;
+            cuml_hess += bin.hessian_sum;

             let (mut left_node_info, mut right_node_info, missing_info) = match self.evaluate_split(
                 left_gradient,
@@ -189,7 +180,7 @@
                     split_gain,
                     split_feature: feature,
                     split_value: bin.cut_value,
-                    split_bin: i as u16,
+                    split_bin: (i + 1) as u16,
                     left_node: left_node_info,
                     right_node: right_node_info,
                     missing_node: missing_info,
                 });
@@ -802,9 +793,9 @@ impl Splitter for MissingImputerSplitter {
         // Don't even worry about it, if there are no missing values
         // in this bin.
         if (missing_gradient != 0.0) || (missing_hessian != 0.0) {
+            // If
             // TODO: Consider making this safer, by casting to f64, summing, and then
             // back to f32...
-
             // The weight if missing went left
             let missing_left_weight = constrained_weight(
                 &self.l2,
@@ -846,10 +837,10 @@ impl Splitter for MissingImputerSplitter {
                 missing_right_weight,
             );
             // Confirm this wouldn't break monotonicity.
-            let missing_left_gain = cull_gain(
-                missing_left_gain,
-                missing_left_weight,
-                right_weight,
+            let missing_right_gain = cull_gain(
+                missing_right_gain,
+                left_weight,
+                missing_right_weight,
                 constraint,
             );
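
The "fix gain cull call" patch above corrects a copy/paste slip: the missing-goes-right branch was being validated with the left branch's gain and weights. A minimal sketch of the shape of that check, with a stand-in `cull_gain` that rejects a gain whenever the ordering of the child weights would violate a monotone constraint; the real helper and its exact semantics live in src/splitter.rs, so treat everything below as illustrative.

#[derive(Clone, Copy)]
enum Constraint {
    Increasing,
    Decreasing,
}

// Stand-in with the same argument shape as the crate's helper:
// (gain, left_weight, right_weight, constraint).
fn cull_gain(gain: f32, left_weight: f32, right_weight: f32, constraint: Option<Constraint>) -> f32 {
    match constraint {
        Some(Constraint::Increasing) if left_weight > right_weight => f32::NEG_INFINITY,
        Some(Constraint::Decreasing) if left_weight < right_weight => f32::NEG_INFINITY,
        _ => gain,
    }
}

fn main() {
    let (left_weight, missing_right_weight) = (-0.5_f32, 0.25_f32);
    let missing_right_gain = 1.0_f32;
    // Before the fix, the left gain and weights were passed here, so an
    // invalid missing-right branch could slip past the constraint check.
    let culled = cull_gain(
        missing_right_gain,
        left_weight,
        missing_right_weight,
        Some(Constraint::Increasing),
    );
    assert_eq!(culled, missing_right_gain); // -0.5 <= 0.25, so increasing holds
    println!("culled gain: {culled}");
}
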
From 3f0912cf9b39bf6d8a07663bef925aa07a7c1116 Mon Sep 17 00:00:00 2001
From: jinlow
Date: Tue, 17 Oct 2023 10:58:08 -0500
Subject: [PATCH 3/4] Add the number of trees

---
 py-forust/forust/__init__.py    | 10 ++++++++++
 py-forust/src/lib.rs            |  8 ++++----
 py-forust/tests/test_booster.py |  1 +
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/py-forust/forust/__init__.py b/py-forust/forust/__init__.py
index 5c1594c..7d49124 100644
--- a/py-forust/forust/__init__.py
+++ b/py-forust/forust/__init__.py
@@ -68,6 +68,7 @@ class BoosterType(Protocol):
     best_iteration: None | int
     base_score: float
     terminate_missing_features: set[int]
+    number_of_trees: int

     def fit(
         self,
@@ -944,6 +945,15 @@ def prediction_iteration(self) -> int | None:
         """
         return self.booster.prediction_iteration

+    @property
+    def number_of_trees(self) -> int:
+        """The number of trees in the model.
+
+        Returns:
+            int: The total number of trees in the model.
+        """
+        return self.booster.number_of_trees
+
     def get_best_iteration(self) -> int | None:
         """Get the best iteration if `early_stopping_rounds` was used when fitting.

diff --git a/py-forust/src/lib.rs b/py-forust/src/lib.rs
index 22b91b1..1ee6eda 100644
--- a/py-forust/src/lib.rs
+++ b/py-forust/src/lib.rs
@@ -193,10 +193,10 @@ impl GradientBooster {
         Ok(self.booster.base_score)
     }

-    // #[getter]
-    // fn number_of_trees(&self) -> PyResult<usize> {
-    //     Ok(self.booster.trees.len())
-    // }
+    #[getter]
+    fn number_of_trees(&self) -> PyResult<usize> {
+        Ok(self.booster.trees.len())
+    }

     pub fn fit(
         &mut self,
diff --git a/py-forust/tests/test_booster.py b/py-forust/tests/test_booster.py
index 7e4ca2b..88274d4 100644
--- a/py-forust/tests/test_booster.py
+++ b/py-forust/tests/test_booster.py
@@ -950,6 +950,7 @@ def test_early_stopping_with_dev_val(X_y):
     assert len(n_trees) == model.get_best_iteration() + 5
     assert len(n_trees) == model.get_evaluation_history().shape[0]
     assert model.get_best_iteration() < 99
+    assert model.number_of_trees == model.get_best_iteration() + 5


 def test_goss_sampling_method(X_y):
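
The getter enabled above follows the standard pyo3 pattern: a function inside a #[pymethods] block tagged with #[getter] is exposed to Python as a read-only property. A stripped-down analogue, where `Booster` and its `trees` field are stand-ins for the real py-forust types:

use pyo3::prelude::*;

#[pyclass]
struct Booster {
    trees: Vec<u32>, // stand-in for the real tree objects
}

#[pymethods]
impl Booster {
    /// Surfaces in Python as the read-only property `booster.number_of_trees`.
    #[getter]
    fn number_of_trees(&self) -> PyResult<usize> {
        Ok(self.trees.len())
    }
}

On the Python side this is what the new `number_of_trees` property on the `GradientBooster` wrapper reads, and the added test assertion checks it agrees with the early-stopping tree count.
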
From 520ab79bd684ca2864dfbcdf6b322e30372501fb Mon Sep 17 00:00:00 2001
From: jinlow
Date: Tue, 17 Oct 2023 12:13:33 -0500
Subject: [PATCH 4/4] Clean up objectives

---
 benches/forust_benchmarks.rs | 11 ++----
 src/histogram.rs             |  3 +-
 src/objective.rs             | 76 ++++++++++++++++++------------------
 src/partial_dependence.rs    |  3 +-
 src/splitter.rs              |  9 ++---
 src/tree.rs                  | 13 +++---
 6 files changed, 50 insertions(+), 65 deletions(-)

diff --git a/benches/forust_benchmarks.rs b/benches/forust_benchmarks.rs
index 8fe1332..de6b637 100644
--- a/benches/forust_benchmarks.rs
+++ b/benches/forust_benchmarks.rs
@@ -20,20 +20,15 @@ pub fn tree_benchmarks(c: &mut Criterion) {
     let y: Vec<f64> = file.lines().map(|x| x.parse::<f64>().unwrap()).collect();
     let yhat = vec![0.5; y.len()];
     let w = vec![1.; y.len()];
-    let g = LogLoss::calc_grad(&y, &yhat, &w);
-    let h = LogLoss::calc_hess(&y, &yhat, &w);
+    let (g, h) = LogLoss::calc_grad_hess(&y, &yhat, &w);

     let v: Vec<f32> = vec![10.; 300000];
     c.bench_function("Niave Sum", |b| b.iter(|| naive_sum(black_box(&v))));
    c.bench_function("fast sum", |b| b.iter(|| fast_sum(black_box(&v))));
     c.bench_function("fast f64 sum", |b| b.iter(|| fast_f64_sum(black_box(&v))));
-    c.bench_function("calc_grad", |b| {
-        b.iter(|| LogLoss::calc_grad(black_box(&y), black_box(&yhat), black_box(&w)))
-    });
-
-    c.bench_function("calc_hess", |b| {
-        b.iter(|| LogLoss::calc_hess(black_box(&y), black_box(&yhat), black_box(&w)))
+    c.bench_function("calc_grad_hess", |b| {
+        b.iter(|| LogLoss::calc_grad_hess(black_box(&y), black_box(&yhat), black_box(&w)))
     });

     let data = Matrix::new(&data_vec, y.len(), 5);
diff --git a/src/histogram.rs b/src/histogram.rs
index 7d84b95..14e820a 100644
--- a/src/histogram.rs
+++ b/src/histogram.rs
@@ -250,8 +250,7 @@ mod tests {
         let y: Vec<f64> = file.lines().map(|x| x.parse::<f64>().unwrap()).collect();
         let yhat = vec![0.5; y.len()];
         let w = vec![1.; y.len()];
-        let g = LogLoss::calc_grad(&y, &yhat, &w);
-        let h = LogLoss::calc_hess(&y, &yhat, &w);
+        let (g, h) = LogLoss::calc_grad_hess(&y, &yhat, &w);
         let hist =
             create_feature_histogram(&bdata.get_col(1), &b.cuts.get_col(1), &g, &h, &bdata.index);
         // println!("{:?}", hist);
diff --git a/src/objective.rs b/src/objective.rs
index 16e96e1..43711e4 100644
--- a/src/objective.rs
+++ b/src/objective.rs
@@ -26,8 +26,6 @@ pub fn calc_init_callables(objective_type: &ObjectiveType) -> fn(&[f64], &[f64])
 pub trait ObjectiveFunction {
     fn calc_loss(y: &[f64], yhat: &[f64], sample_weight: &[f64]) -> Vec<f32>;
     fn calc_grad_hess(y: &[f64], yhat: &[f64], sample_weight: &[f64]) -> (Vec<f32>, Vec<f32>);
-    fn calc_grad(y: &[f64], yhat: &[f64], sample_weight: &[f64]) -> Vec<f32>;
-    fn calc_hess(y: &[f64], yhat: &[f64], sample_weight: &[f64]) -> Vec<f32>;
     fn calc_init(y: &[f64], sample_weight: &[f64]) -> f64;
     fn default_metric() -> Metric;
 }
@@ -73,27 +71,27 @@ impl ObjectiveFunction for LogLoss {
             .unzip()
     }

-    #[inline]
-    fn calc_grad(y: &[f64], yhat: &[f64], sample_weight: &[f64]) -> Vec<f32> {
-        y.iter()
-            .zip(yhat)
-            .zip(sample_weight)
-            .map(|((y_, yhat_), w_)| {
-                let yhat_ = f64::ONE / (f64::ONE + (-*yhat_).exp());
-                ((yhat_ - *y_) * *w_) as f32
-            })
-            .collect()
-    }
-    #[inline]
-    fn calc_hess(_: &[f64], yhat: &[f64], sample_weight: &[f64]) -> Vec<f32> {
-        yhat.iter()
-            .zip(sample_weight)
-            .map(|(yhat_, w_)| {
-                let yhat_ = f64::ONE / (f64::ONE + (-*yhat_).exp());
-                (yhat_ * (f64::ONE - yhat_) * *w_) as f32
-            })
-            .collect()
-    }
+    // #[inline]
+    // fn calc_grad(y: &[f64], yhat: &[f64], sample_weight: &[f64]) -> Vec<f32> {
+    //     y.iter()
+    //         .zip(yhat)
+    //         .zip(sample_weight)
+    //         .map(|((y_, yhat_), w_)| {
+    //             let yhat_ = f64::ONE / (f64::ONE + (-*yhat_).exp());
+    //             ((yhat_ - *y_) * *w_) as f32
+    //         })
+    //         .collect()
+    // }
+    // #[inline]
+    // fn calc_hess(_: &[f64], yhat: &[f64], sample_weight: &[f64]) -> Vec<f32> {
+    //     yhat.iter()
+    //         .zip(sample_weight)
+    //         .map(|(yhat_, w_)| {
+    //             let yhat_ = f64::ONE / (f64::ONE + (-*yhat_).exp());
+    //             (yhat_ * (f64::ONE - yhat_) * *w_) as f32
+    //         })
+    //         .collect()
+    // }
     fn default_metric() -> Metric {
         Metric::LogLoss
     }
@@ -126,19 +124,19 @@ impl ObjectiveFunction for SquaredLoss {
         ytot / ntot
     }

-    #[inline]
-    fn calc_grad(y: &[f64], yhat: &[f64], sample_weight: &[f64]) -> Vec<f32> {
-        y.iter()
-            .zip(yhat)
-            .zip(sample_weight)
-            .map(|((y_, yhat_), w_)| ((*yhat_ - *y_) * *w_) as f32)
-            .collect()
-    }
+    // #[inline]
+    // fn calc_grad(y: &[f64], yhat: &[f64], sample_weight: &[f64]) -> Vec<f32> {
+    //     y.iter()
+    //         .zip(yhat)
+    //         .zip(sample_weight)
+    //         .map(|((y_, yhat_), w_)| ((*yhat_ - *y_) * *w_) as f32)
+    //         .collect()
+    // }

-    #[inline]
-    fn calc_hess(_: &[f64], _: &[f64], sample_weight: &[f64]) -> Vec<f32> {
-        sample_weight.iter().map(|v| *v as f32).collect()
-    }
+    // #[inline]
+    // fn calc_hess(_: &[f64], _: &[f64], sample_weight: &[f64]) -> Vec<f32> {
+    //     sample_weight.iter().map(|v| *v as f32).collect()
+    // }

     #[inline]
     fn calc_grad_hess(y: &[f64], yhat: &[f64], sample_weight: &[f64]) -> (Vec<f32>, Vec<f32>) {
         y.iter()
@@ -171,9 +169,9 @@ mod tests {
         let y = vec![0.0, 0.0, 0.0, 1.0, 1.0, 1.0];
         let yhat1 = vec![-1.0, -1.0, -1.0, 1.0, 1.0, 1.0];
         let w = vec![1.; y.len()];
-        let g1 = LogLoss::calc_grad(&y, &yhat1, &w);
+        let (g1, _) = LogLoss::calc_grad_hess(&y, &yhat1, &w);
         let yhat2 = vec![0.0, 0.0, -1.0, 1.0, 0.0, 1.0];
-        let g2 = LogLoss::calc_grad(&y, &yhat2, &w);
+        let (g2, _) = LogLoss::calc_grad_hess(&y, &yhat2, &w);
         assert!(g1.iter().sum::<f32>() < g2.iter().sum::<f32>());
     }

@@ -182,9 +180,9 @@
         let y = vec![0.0, 0.0, 0.0, 1.0, 1.0, 1.0];
         let yhat1 = vec![-1.0, -1.0, -1.0, 1.0, 1.0, 1.0];
         let w = vec![1.; y.len()];
-        let h1 = LogLoss::calc_hess(&y, &yhat1, &w);
+        let (_, h1) = LogLoss::calc_grad_hess(&y, &yhat1, &w);
         let yhat2 = vec![0.0, 0.0, -1.0, 1.0, 0.0, 1.0];
-        let h2 = LogLoss::calc_hess(&y, &yhat2, &w);
+        let (_, h2) = LogLoss::calc_grad_hess(&y, &yhat2, &w);
         assert!(h1.iter().sum::<f32>() < h2.iter().sum::<f32>());
     }

diff --git a/src/partial_dependence.rs b/src/partial_dependence.rs
index 3297b25..e765791 100644
--- a/src/partial_dependence.rs
+++ b/src/partial_dependence.rs
@@ -93,8 +93,7 @@ mod tests {
         let y: Vec<f64> = file.lines().map(|x| x.parse::<f64>().unwrap()).collect();
         let yhat = vec![0.5; y.len()];
         let w = vec![1.; y.len()];
-        let g = LogLoss::calc_grad(&y, &yhat, &w);
-        let h = LogLoss::calc_hess(&y, &yhat, &w);
+        let (g, h) = LogLoss::calc_grad_hess(&y, &yhat, &w);

         let data = Matrix::new(&data_vec, 891, 5);
         let splitter = MissingImputerSplitter {
diff --git a/src/splitter.rs b/src/splitter.rs
index 303b48a..c5a51ca 100644
--- a/src/splitter.rs
+++ b/src/splitter.rs
@@ -999,8 +999,7 @@ mod tests {
         let y = vec![0., 0., 0., 1., 1., 0., 1.];
         let yhat = vec![0.; 7];
         let w = vec![1.; y.len()];
-        let grad = LogLoss::calc_grad(&y, &yhat, &w);
-        let hess = LogLoss::calc_hess(&y, &yhat, &w);
+        let (grad, hess) = LogLoss::calc_grad_hess(&y, &yhat, &w);
         let b = bin_matrix(&data, &w, 10, f64::NAN).unwrap();
         let bdata = Matrix::new(&b.binned_data, data.rows, data.cols);
         let index = data.index.to_owned();
@@ -1043,8 +1042,7 @@ mod tests {
         let y = vec![0., 0., 0., 1., 1., 0., 1.];
         let yhat = vec![0.; 7];
         let w = vec![1.; y.len()];
-        let grad = LogLoss::calc_grad(&y, &yhat, &w);
-        let hess = LogLoss::calc_hess(&y, &yhat, &w);
+        let (grad, hess) = LogLoss::calc_grad_hess(&y, &yhat, &w);

         let b = bin_matrix(&data, &w, 10, f64::NAN).unwrap();
         let bdata = Matrix::new(&b.binned_data, data.rows, data.cols);
@@ -1094,8 +1092,7 @@ mod tests {
         let y: Vec<f64> = file.lines().map(|x| x.parse::<f64>().unwrap()).collect();
         let yhat = vec![0.5; y.len()];
         let w = vec![1.; y.len()];
-        let grad = LogLoss::calc_grad(&y, &yhat, &w);
-        let hess = LogLoss::calc_hess(&y, &yhat, &w);
+        let (grad, hess) = LogLoss::calc_grad_hess(&y, &yhat, &w);

         let splitter = MissingImputerSplitter {
             l2: 1.0,
diff --git a/src/tree.rs b/src/tree.rs
index 1b7cb90..44caf43 100644
--- a/src/tree.rs
+++ b/src/tree.rs
@@ -571,8 +571,8 @@ mod tests {
         let y: Vec<f64> = file.lines().map(|x| x.parse::<f64>().unwrap()).collect();
         let yhat = vec![0.5; y.len()];
         let w = vec![1.; y.len()];
-        let mut g = LogLoss::calc_grad(&y, &yhat, &w);
-        let mut h = LogLoss::calc_hess(&y, &yhat, &w);
+        let (mut g, mut h) = LogLoss::calc_grad_hess(&y, &yhat, &w);
+        // let mut h = LogLoss::calc_hess(&y, &yhat, &w);

         let data = Matrix::new(&data_vec, 891, 5);
         let splitter = MissingImputerSplitter {
@@ -616,8 +616,7 @@ mod tests {
         let y: Vec<f64> = file.lines().map(|x| x.parse::<f64>().unwrap()).collect();
         let yhat = vec![0.5; y.len()];
         let w = vec![1.; y.len()];
-        let g = LogLoss::calc_grad(&y, &yhat, &w);
-        let h = LogLoss::calc_hess(&y, &yhat, &w);
+        let (g, h) = LogLoss::calc_grad_hess(&y, &yhat, &w);

         let data = Matrix::new(&data_vec, 891, 5);
         let splitter = MissingImputerSplitter {
@@ -698,8 +697,7 @@ mod tests {
         let y: Vec<f64> = file.lines().map(|x| x.parse::<f64>().unwrap()).collect();
         let yhat = vec![0.5; y.len()];
         let w = vec![1.; y.len()];
-        let g = LogLoss::calc_grad(&y, &yhat, &w);
-        let h = LogLoss::calc_hess(&y, &yhat, &w);
+        let (g, h) = LogLoss::calc_grad_hess(&y, &yhat, &w);
         println!("GRADIENT -- {:?}", h);

         let data_ = Matrix::new(&data_vec, 891, 5);
@@ -783,8 +781,7 @@ mod tests {
         let y: Vec<f64> = file.lines().map(|x| x.parse::<f64>().unwrap()).collect();
         let yhat = vec![0.5; y.len()];
         let w = vec![1.; y.len()];
-        let g = LogLoss::calc_grad(&y, &yhat, &w);
-        let h = LogLoss::calc_hess(&y, &yhat, &w);
+        let (g, h) = LogLoss::calc_grad_hess(&y, &yhat, &w);

         let data = Matrix::new(&data_vec, 891, 5);
         let splitter = MissingImputerSplitter {
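
The thread running through [PATCH 4/4] is fusing the two objective passes into one: `calc_grad_hess` computes the sigmoid once per row and emits the (gradient, hessian) pair in a single traversal, where `calc_grad` plus `calc_hess` walked the data twice and evaluated the sigmoid twice. A free-standing sketch of the log-loss version, mirroring the formulas visible in the diff; the trait plumbing and the crate's `f64::ONE` constant are omitted, and plain `1.0` literals are used instead.

fn log_loss_grad_hess(y: &[f64], yhat: &[f64], sample_weight: &[f64]) -> (Vec<f32>, Vec<f32>) {
    y.iter()
        .zip(yhat)
        .zip(sample_weight)
        .map(|((y_, yhat_), w_)| {
            let p = 1.0 / (1.0 + (-*yhat_).exp()); // sigmoid, computed once per row
            (
                ((p - *y_) * *w_) as f32,     // gradient of log loss
                (p * (1.0 - p) * *w_) as f32, // hessian of log loss
            )
        })
        .unzip()
}

fn main() {
    let y = vec![0.0, 1.0];
    let yhat = vec![0.0, 0.0];
    let w = vec![1.0, 1.0];
    let (g, h) = log_loss_grad_hess(&y, &yhat, &w);
    assert_eq!(g, vec![0.5, -0.5]);
    assert_eq!(h, vec![0.25, 0.25]);
    println!("g = {g:?}, h = {h:?}");
}

This is also why the benchmark diff above collapses the separate "calc_grad" and "calc_hess" benches into a single "calc_grad_hess" bench: there is only one pass left to measure.
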