From 1c47fac48fd041c77101124fc9eebf770557fca7 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Tue, 25 Jun 2019 14:48:24 +0100 Subject: [PATCH 01/45] Added 2 parameters to enable penalization of monotone splits and enable a slower but better constraining method. --- docs/Parameters.rst | 12 ++++++++++++ include/LightGBM/config.h | 12 ++++++++++++ src/io/config_auto.cpp | 24 ++++++++++++++++++++++++ 3 files changed, 48 insertions(+) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index a996b0132852..19e646126f7c 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -312,6 +312,18 @@ Learning Control Parameters - dropout rate: a fraction of previous trees to drop during the dropout +- ``monotone_penalty`` :raw-html:`🔗︎`, default = ``0.``, type = double, aliases: ``monotone_splits_penalty``, constraints: ``0.0 <= monotone_penalty (< max_depth, if max_depth > 0)`` + + - used only if ``monotone_constraints`` is set + + - monotone penalty: a penalization of 0 equals to no penalization. A penalization parameter X forbids any monotone splits on the first X (rounded down) level(s) of the tree. The penalty applied to monotone splits on a given depth is a continuous, increasing function the penalization parameter + +- ``monotone_precise_method`` :raw-html:`🔗︎`, default = ``false``, type = bool, aliases: ``monotone_constraints_precise_mode`` + + - used only if ``monotone_constraints`` is set + + - monotone precise method`: if set to false then the program will run as fast as without constraints, but the results may be over-constrained. If set to true, then the program will be slower, but results will be better. Note that if there are categorical features, in the dataset, they will be splitted using the fast method regardless of this parameter. 
Also, the parameter can only be set to true if the missing handle is disabled + - ``max_drop`` :raw-html:`🔗︎`, default = ``50``, type = int - used only in ``dart`` diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 974735532c29..0230ae245d0f 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -325,6 +325,18 @@ struct Config { // desc = dropout rate: a fraction of previous trees to drop during the dropout double drop_rate = 0.1; + // alias = monotone_splits_penalty + // check = >=0.0 + // check = 0 + // desc = used only if ``monotone_constraints`` is set + // desc = monotone penalty: a penalization of 0 equals to no penalization. A penalization parameter X forbids any monotone splits on the first X (rounded down) level(s) of the tree. The penalty applied to monotone splits on a given depth is a continuous, increasing function the penalization parameter + double monotone_penalty = 0.; + + // alias = monotone_constraints_precise_mode + // desc = used only if ``monotone_constraints`` is set + // desc = monotone precise mode: if set to false then the program will run as fast as without constraints, but the results may be over-constrained. If set to true, then the program will be slower, but results will be better. Note that if there are categorical features, in the dataset, they will be splitted using the fast method regardless of this parameter. Also, the parameter can only be set to true if the missing handle is disabled + bool monotone_precise_mode = false; + // desc = used only in ``dart`` // desc = max number of dropped trees during one boosting iteration // desc = ``<=0`` means no limit diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 809b8a7843aa..0534c8f6dfa0 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -6,6 +6,7 @@ * This file is auto generated by LightGBM\helpers\parameter_generator.py from LightGBM\include\LightGBM\config.h file. 
*/ #include +#include namespace LightGBM { std::unordered_map Config::alias_table({ {"config_file", "config"}, @@ -80,6 +81,8 @@ std::unordered_map Config::alias_table({ {"lambda", "lambda_l2"}, {"min_split_gain", "min_gain_to_split"}, {"rate_drop", "drop_rate"}, + {"monotone_splits_penalty", "monotone_penalty"}, + {"monotone_constraints_precise_mode", "monotone_precise_mode"}, {"topk", "top_k"}, {"mc", "monotone_constraints"}, {"monotone_constraint", "monotone_constraints"}, @@ -199,6 +202,8 @@ std::unordered_set Config::parameter_set({ "lambda_l2", "min_gain_to_split", "drop_rate", + "monotone_penalty", + "monotone_precise_mode", "max_drop", "skip_drop", "xgboost_dart_mode", @@ -399,8 +404,21 @@ void Config::GetMembersFromString(const std::unordered_map(tmp_str, ','); + Log::Warning("The constraining method was just changed, which could significantly affect results of the algorithm"); } + GetDouble(params, "monotone_penalty", &monotone_penalty); + bool constraints_exist = false; + for (auto it = monotone_constraints.begin(); it != monotone_constraints.end(); + it++) { + if (*it != 0) { + constraints_exist = true; + } + } + CHECK(monotone_penalty == 0 || constraints_exist); + CHECK(max_depth <= 0 || monotone_penalty < max_depth); + CHECK(monotone_penalty >= 0.0); + if (GetString(params, "feature_contri", &tmp_str)) { feature_contri = Common::StringToArray(tmp_str, ','); } @@ -476,6 +494,10 @@ void Config::GetMembersFromString(const std::unordered_map Date: Tue, 25 Jun 2019 15:02:02 +0100 Subject: [PATCH 02/45] Added getters and new parameters like parents of nodes in the trees. 
--- include/LightGBM/tree.h | 58 +++++++++++++++++++++++++++++++++++++++++ src/io/tree.cpp | 3 +++ 2 files changed, 61 insertions(+) diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h index f672f62b347d..f99f0fd73616 100644 --- a/include/LightGBM/tree.h +++ b/include/LightGBM/tree.h @@ -124,6 +124,24 @@ class Tree { inline int PredictLeafIndex(const double* feature_values) const; inline int PredictLeafIndexByMap(const std::unordered_map& feature_values) const; + // Get node parent + inline int node_parent(int node_idx) const; + // Get leaf parent + inline int leaf_parent(int node_idx) const; + + // Get children + inline int left_child(int node_idx) const; + inline int right_child(int node_idx) const; + + // Get if the feature is in a monotone subtree + inline bool leaf_is_in_monotone_subtree(int leaf_idx) const; + + inline double internal_value(int node_idx) const; + + inline uint32_t threshold_in_bin(int node_idx) const; + + // Get the feature corresponding to the split + inline int split_feature_inner(int node_idx) const; inline void PredictContrib(const double* feature_values, int num_features, double* output); @@ -402,6 +420,10 @@ class Tree { std::vector leaf_depth_; double shrinkage_; int max_depth_; + // add parent node information + std::vector node_parent_; + // Keeps track of the monotone splits above the leaf + std::vector leaf_is_in_monotone_subtree_; }; inline void Tree::Split(int leaf, int feature, int real_feature, @@ -421,6 +443,7 @@ inline void Tree::Split(int leaf, int feature, int real_feature, // add new node split_feature_inner_[new_node_idx] = feature; split_feature_[new_node_idx] = real_feature; + node_parent_[new_node_idx] = parent; split_gain_[new_node_idx] = gain; // add two new leaves @@ -529,6 +552,41 @@ inline int Tree::GetLeafByMap(const std::unordered_map& feature_val return ~node; } +inline int Tree::node_parent(int node_idx) const{ + return node_parent_[node_idx]; +} + +inline int Tree::left_child(int node_idx) const{ + 
return left_child_[node_idx]; +} + +inline int Tree::right_child(int node_idx) const{ + return right_child_[node_idx]; +} + +inline int Tree::split_feature_inner(int node_idx) const{ + return split_feature_inner_[node_idx]; +} + +inline int Tree::leaf_parent(int node_idx) const{ + return leaf_parent_[node_idx]; +} + +inline uint32_t Tree::threshold_in_bin(int node_idx) const{ + #ifdef DEBUG + CHECK(node_idx >= 0); + #endif + return threshold_in_bin_[node_idx]; +} + +inline bool Tree::leaf_is_in_monotone_subtree(int leaf_idx) const { + return leaf_is_in_monotone_subtree_[leaf_idx]; +} + +inline double Tree::internal_value(int node_idx) const { + return internal_value_[node_idx]; +} + } // namespace LightGBM diff --git a/src/io/tree.cpp b/src/io/tree.cpp index db0803bd3108..a108fb671a0d 100644 --- a/src/io/tree.cpp +++ b/src/io/tree.cpp @@ -24,7 +24,9 @@ Tree::Tree(int max_leaves) threshold_.resize(max_leaves_ - 1); decision_type_.resize(max_leaves_ - 1, 0); split_gain_.resize(max_leaves_ - 1); + node_parent_.resize(max_leaves_ - 1); leaf_parent_.resize(max_leaves_); + leaf_is_in_monotone_subtree_.resize(max_leaves_); leaf_value_.resize(max_leaves_); leaf_weight_.resize(max_leaves_); leaf_count_.resize(max_leaves_); @@ -38,6 +40,7 @@ Tree::Tree(int max_leaves) leaf_value_[0] = 0.0f; leaf_weight_[0] = 0.0f; leaf_parent_[0] = -1; + node_parent_[0] = -1; shrinkage_ = 1.0f; num_cat_ = 0; cat_boundaries_.push_back(0); From b65ace1df59fe764a7b9199ee9a6a63c98c663b6 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Wed, 26 Jun 2019 10:25:35 +0100 Subject: [PATCH 03/45] Fill the feature_is_monotone variable in the trees. 
--- include/LightGBM/tree.h | 27 ++++++++++++++++++------- src/io/tree.cpp | 20 ++++++++++++------ src/treelearner/serial_tree_learner.cpp | 17 ++++++++++++---- 3 files changed, 47 insertions(+), 17 deletions(-) diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h index f99f0fd73616..6bc7130fcdb3 100644 --- a/include/LightGBM/tree.h +++ b/include/LightGBM/tree.h @@ -60,7 +60,7 @@ class Tree { int Split(int leaf, int feature, int real_feature, uint32_t threshold_bin, double threshold_double, double left_value, double right_value, int left_cnt, int right_cnt, double left_weight, double right_weight, - float gain, MissingType missing_type, bool default_left); + float gain, MissingType missing_type, bool default_left, bool feature_is_monotone); /*! * \brief Performing a split on tree leaves, with categorical feature @@ -80,9 +80,14 @@ class Tree { * \param gain Split gain * \return The index of new leaf. */ - int SplitCategorical(int leaf, int feature, int real_feature, const uint32_t* threshold_bin, int num_threshold_bin, - const uint32_t* threshold, int num_threshold, double left_value, double right_value, - int left_cnt, int right_cnt, double left_weight, double right_weight, float gain, MissingType missing_type); + + int SplitCategorical(int leaf, int feature, int real_feature, + const uint32_t *threshold_bin, int num_threshold_bin, + const uint32_t *threshold, int num_threshold, + double left_value, double right_value, int left_cnt, + int right_cnt, double left_weight, double right_weight, + float gain, MissingType missing_type, + bool feature_is_monotone); /*! 
\brief Get the output of one leaf */ inline double LeafOutput(int leaf) const { return leaf_value_[leaf]; } @@ -320,8 +325,10 @@ class Tree { } } - inline void Split(int leaf, int feature, int real_feature, double left_value, double right_value, int left_cnt, int right_cnt, - double left_weight, double right_weight, float gain); + inline void Split(int leaf, int feature, int real_feature, double left_value, + double right_value, int left_cnt, int right_cnt, double left_weight, + double right_weight,float gain, bool feature_is_monotone); + /*! * \brief Find leaf index of which record belongs by features * \param feature_values Feature value of this record @@ -428,8 +435,14 @@ class Tree { inline void Tree::Split(int leaf, int feature, int real_feature, double left_value, double right_value, int left_cnt, int right_cnt, - double left_weight, double right_weight, float gain) { + double left_weight, double right_weight, float gain, bool feature_is_monotone) { int new_node_idx = num_leaves_ - 1; + + // Update if there is a monotone split above the leaf + if (feature_is_monotone || leaf_is_in_monotone_subtree_[leaf]) { + leaf_is_in_monotone_subtree_[leaf] = true; + leaf_is_in_monotone_subtree_[num_leaves_] = true; + } // update parent info int parent = leaf_parent_[leaf]; if (parent >= 0) { diff --git a/src/io/tree.cpp b/src/io/tree.cpp index a108fb671a0d..6fcbfbe321d6 100644 --- a/src/io/tree.cpp +++ b/src/io/tree.cpp @@ -53,8 +53,11 @@ Tree::~Tree() { int Tree::Split(int leaf, int feature, int real_feature, uint32_t threshold_bin, double threshold_double, double left_value, double right_value, - int left_cnt, int right_cnt, double left_weight, double right_weight, float gain, MissingType missing_type, bool default_left) { - Split(leaf, feature, real_feature, left_value, right_value, left_cnt, right_cnt, left_weight, right_weight, gain); + int left_cnt, int right_cnt, double left_weight, + double right_weight, float gain, + MissingType missing_type, bool default_left, + 
bool feature_was_monotone) { + Split(leaf, feature, real_feature, left_value, right_value, left_cnt, right_cnt, left_weight, right_weight, gain, feature_was_monotone); int new_node_idx = num_leaves_ - 1; decision_type_[new_node_idx] = 0; SetDecisionType(&decision_type_[new_node_idx], false, kCategoricalMask); @@ -72,10 +75,15 @@ int Tree::Split(int leaf, int feature, int real_feature, uint32_t threshold_bin, return num_leaves_ - 1; } -int Tree::SplitCategorical(int leaf, int feature, int real_feature, const uint32_t* threshold_bin, int num_threshold_bin, - const uint32_t* threshold, int num_threshold, double left_value, double right_value, - data_size_t left_cnt, data_size_t right_cnt, double left_weight, double right_weight, float gain, MissingType missing_type) { - Split(leaf, feature, real_feature, left_value, right_value, left_cnt, right_cnt, left_weight, right_weight, gain); +int Tree::SplitCategorical(int leaf, int feature, int real_feature, + const uint32_t *threshold_bin, int num_threshold_bin, + const uint32_t *threshold, int num_threshold, + double left_value, double right_value, + data_size_t left_cnt, data_size_t right_cnt, + double left_weight, double right_weight, + float gain, MissingType missing_type, + bool feature_was_monotone) { + Split(leaf, feature, real_feature, left_value, right_value, left_cnt, right_cnt, left_weight, right_weight, gain, feature_was_monotone); int new_node_idx = num_leaves_ - 1; decision_type_[new_node_idx] = 0; SetDecisionType(&decision_type_[new_node_idx], true, kCategoricalMask); diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index e3531039ec9d..8a7ae3f57cfb 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -679,6 +679,11 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json SplitInfo current_split_info = forceSplitMap[current_leaf]; const int inner_feature_index = 
train_data_->InnerFeatureIndex( current_split_info.feature); + // we want to know if the feature has to be monotone + bool feature_is_monotone = false; + if (!config_->monotone_constraints.empty()) { + feature_is_monotone = config_->monotone_constraints[inner_feature_index] != 0; + } auto threshold_double = train_data_->RealThreshold( inner_feature_index, current_split_info.threshold); @@ -698,7 +703,8 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json static_cast(current_split_info.right_sum_hessian), static_cast(current_split_info.gain), train_data_->FeatureBinMapper(inner_feature_index)->missing_type(), - current_split_info.default_left); + current_split_info.default_left, + feature_is_monotone); data_partition_->Split(current_leaf, train_data_, inner_feature_index, ¤t_split_info.threshold, 1, current_split_info.default_left, *right_leaf); @@ -726,7 +732,8 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json static_cast(current_split_info.left_sum_hessian), static_cast(current_split_info.right_sum_hessian), static_cast(current_split_info.gain), - train_data_->FeatureBinMapper(inner_feature_index)->missing_type()); + train_data_->FeatureBinMapper(inner_feature_index)->missing_type(), + feature_is_monotone); data_partition_->Split(current_leaf, train_data_, inner_feature_index, cat_bitset_inner.data(), static_cast(cat_bitset_inner.size()), current_split_info.default_left, *right_leaf); @@ -793,7 +800,8 @@ void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* ri static_cast(best_split_info.right_sum_hessian), static_cast(best_split_info.gain), train_data_->FeatureBinMapper(inner_feature_index)->missing_type(), - best_split_info.default_left); + best_split_info.default_left, + best_split_info.monotone_type != 0); data_partition_->Split(best_leaf, train_data_, inner_feature_index, &best_split_info.threshold, 1, best_split_info.default_left, *right_leaf); } else { @@ -817,7 +825,8 
@@ void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* ri static_cast(best_split_info.left_sum_hessian), static_cast(best_split_info.right_sum_hessian), static_cast(best_split_info.gain), - train_data_->FeatureBinMapper(inner_feature_index)->missing_type()); + train_data_->FeatureBinMapper(inner_feature_index)->missing_type(), + best_split_info.monotone_type != 0); data_partition_->Split(best_leaf, train_data_, inner_feature_index, cat_bitset_inner.data(), static_cast(cat_bitset_inner.size()), best_split_info.default_left, *right_leaf); } From cb9917f4b5b9749135ed1267c77a24c785efcfcf Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Wed, 26 Jun 2019 10:27:53 +0100 Subject: [PATCH 04/45] Added utility function to know the number of bins of a feature. --- include/LightGBM/dataset.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 18417adb4dc6..3408cd6b70ef 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -586,6 +586,15 @@ class Dataset { return bufs; } + // This function retrieves the number of bins for a specific feature + int NumBin(int feature_idx) const { + const int group = feature2group_[feature_idx]; + const int sub_feature = feature2subfeature_[feature_idx]; + const BinMapper *bin_mapper = + feature_groups_[group]->bin_mappers_[sub_feature].get(); + return bin_mapper->num_bin(); + } + void ResetConfig(const char* parameters); /*! \brief Get Number of data */ From 8fea7622a264f397946eed7ffd9130a8fd4d91c8 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Wed, 26 Jun 2019 10:29:25 +0100 Subject: [PATCH 05/45] Added some debugging checks. 
--- src/boosting/gbdt_model_text.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/boosting/gbdt_model_text.cpp b/src/boosting/gbdt_model_text.cpp index 4e44b2a2527a..e080bcb1d214 100644 --- a/src/boosting/gbdt_model_text.cpp +++ b/src/boosting/gbdt_model_text.cpp @@ -540,6 +540,9 @@ std::vector GBDT::FeatureImportance(int num_iteration, int importance_ty for (int iter = 0; iter < num_used_model; ++iter) { for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) { if (models_[iter]->split_gain(split_idx) > 0) { + #ifdef DEBUG + CHECK(models_[iter]->split_feature(split_idx) >= 0); + #endif feature_importances[models_[iter]->split_feature(split_idx)] += 1.0; } } @@ -548,6 +551,9 @@ std::vector GBDT::FeatureImportance(int num_iteration, int importance_ty for (int iter = 0; iter < num_used_model; ++iter) { for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) { if (models_[iter]->split_gain(split_idx) > 0) { + #ifdef DEBUG + CHECK(models_[iter]->split_feature(split_idx) >= 0); + #endif feature_importances[models_[iter]->split_feature(split_idx)] += models_[iter]->split_gain(split_idx); } } From d412ca6dd0ec24c8c5d6557f26bd256c625f6437 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Wed, 26 Jun 2019 11:36:51 +0100 Subject: [PATCH 06/45] Leaf splits now keep track of the depth. 
--- src/treelearner/leaf_splits.hpp | 13 ++++++- src/treelearner/serial_tree_learner.cpp | 38 +++++++++++++------ .../voting_parallel_tree_learner.cpp | 38 +++++++++++++------ 3 files changed, 64 insertions(+), 25 deletions(-) diff --git a/src/treelearner/leaf_splits.hpp b/src/treelearner/leaf_splits.hpp index e46d0b846fcb..c22872ec8ce1 100644 --- a/src/treelearner/leaf_splits.hpp +++ b/src/treelearner/leaf_splits.hpp @@ -38,7 +38,9 @@ class LeafSplits { * \param sum_gradients * \param sum_hessians */ - void Init(int leaf, const DataPartition* data_partition, double sum_gradients, double sum_hessians) { + void Init(int leaf, const DataPartition *data_partition, double sum_gradients, + double sum_hessians, int depth) { + depth_ = depth; leaf_index_ = leaf; data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_); sum_gradients_ = sum_gradients; @@ -59,6 +61,7 @@ class LeafSplits { * \param hessians */ void Init(const score_t* gradients, const score_t* hessians) { + depth_ = 0; num_data_in_leaf_ = num_data_; leaf_index_ = 0; data_indices_ = nullptr; @@ -82,7 +85,9 @@ class LeafSplits { * \param gradients * \param hessians */ - void Init(int leaf, const DataPartition* data_partition, const score_t* gradients, const score_t* hessians) { + void Init(int leaf, const DataPartition *data_partition, + const score_t *gradients, const score_t *hessians, int depth) { + depth_ = depth; leaf_index_ = leaf; data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_); double tmp_sum_gradients = 0.0f; @@ -106,6 +111,7 @@ class LeafSplits { * \param sum_hessians */ void Init(double sum_gradients, double sum_hessians) { + depth_ = 0; leaf_index_ = 0; sum_gradients_ = sum_gradients; sum_hessians_ = sum_hessians; @@ -117,6 +123,7 @@ class LeafSplits { * \brief Init splits on current leaf */ void Init() { + depth_ = 0; leaf_index_ = -1; data_indices_ = nullptr; num_data_in_leaf_ = 0; @@ -140,6 +147,7 @@ class LeafSplits { double max_constraint() const { 
return max_val_; } double min_constraint() const { return min_val_; } + int depth() const { return depth_; } /*! \brief Get indices of data of current leaf */ const data_size_t* data_indices() const { return data_indices_; } @@ -160,6 +168,7 @@ class LeafSplits { const data_size_t* data_indices_; double min_val_; double max_val_; + int depth_; }; } // namespace LightGBM diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 8a7ae3f57cfb..de9d4f5d3a9e 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -346,7 +346,7 @@ void SerialTreeLearner::BeforeTrain() { } else { // use bagging, only use part of data - smaller_leaf_splits_->Init(0, data_partition_.get(), gradients_, hessians_); + smaller_leaf_splits_->Init(0, data_partition_.get(), gradients_, hessians_, 0.); } larger_leaf_splits_->Init(); @@ -739,20 +739,26 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json current_split_info.default_left, *right_leaf); } + int depth = tree->leaf_depth(*left_leaf); + #ifdef DEBUG + CHECK(depth == tree->leaf_depth(*right_leaf)); + #endif if (current_split_info.left_count < current_split_info.right_count) { left_smaller = true; smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(), current_split_info.left_sum_gradient, - current_split_info.left_sum_hessian); + current_split_info.left_sum_hessian, depth); larger_leaf_splits_->Init(*right_leaf, data_partition_.get(), current_split_info.right_sum_gradient, - current_split_info.right_sum_hessian); + current_split_info.right_sum_hessian, depth); } else { left_smaller = false; smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), - current_split_info.right_sum_gradient, current_split_info.right_sum_hessian); + current_split_info.right_sum_gradient, + current_split_info.right_sum_hessian, depth); larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), - 
current_split_info.left_sum_gradient, current_split_info.left_sum_hessian); + current_split_info.left_sum_gradient, + current_split_info.left_sum_hessian, depth); } left = Json(); @@ -837,14 +843,24 @@ void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* ri auto p_left = smaller_leaf_splits_.get(); auto p_right = larger_leaf_splits_.get(); // init the leaves that used on next iteration + int depth = tree->leaf_depth(*left_leaf); + #ifdef DEBUG + CHECK(depth == tree->leaf_depth(*right_leaf)); + #endif if (best_split_info.left_count < best_split_info.right_count) { - smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian); - larger_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian); + smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(), + best_split_info.left_sum_gradient, + best_split_info.left_sum_hessian, depth); + larger_leaf_splits_->Init(*right_leaf, data_partition_.get(), + best_split_info.right_sum_gradient, + best_split_info.right_sum_hessian, depth); } else { - smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian); - larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian); - p_right = smaller_leaf_splits_.get(); - p_left = larger_leaf_splits_.get(); + smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), + best_split_info.right_sum_gradient, + best_split_info.right_sum_hessian, depth); + larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), + best_split_info.left_sum_gradient, + best_split_info.left_sum_hessian, depth); } p_left->SetValueConstraint(best_split_info.min_constraint, best_split_info.max_constraint); p_right->SetValueConstraint(best_split_info.min_constraint, best_split_info.max_constraint); diff 
--git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp index cb18e3779ba6..0cb49edf1f7a 100644 --- a/src/treelearner/voting_parallel_tree_learner.cpp +++ b/src/treelearner/voting_parallel_tree_learner.cpp @@ -149,16 +149,26 @@ bool VotingParallelTreeLearner::BeforeFindBestSplit(const Tree* t if (TREELEARNER_T::BeforeFindBestSplit(tree, left_leaf, right_leaf)) { data_size_t num_data_in_left_child = GetGlobalDataCountInLeaf(left_leaf); data_size_t num_data_in_right_child = GetGlobalDataCountInLeaf(right_leaf); + int depth = tree->leaf_depth(left_leaf); + #ifdef DEBUG + CHECK(depth == tree->leaf_depth(right_leaf)); + #endif if (right_leaf < 0) { return true; } else if (num_data_in_left_child < num_data_in_right_child) { // get local sumup - this->smaller_leaf_splits_->Init(left_leaf, this->data_partition_.get(), this->gradients_, this->hessians_); - this->larger_leaf_splits_->Init(right_leaf, this->data_partition_.get(), this->gradients_, this->hessians_); + this->smaller_leaf_splits_->Init(left_leaf, this->data_partition_.get(), + this->gradients_, this->hessians_, + depth); + this->larger_leaf_splits_->Init(right_leaf, this->data_partition_.get(), + this->gradients_, this->hessians_, depth); } else { // get local sumup - this->smaller_leaf_splits_->Init(right_leaf, this->data_partition_.get(), this->gradients_, this->hessians_); - this->larger_leaf_splits_->Init(left_leaf, this->data_partition_.get(), this->gradients_, this->hessians_); + this->smaller_leaf_splits_->Init(right_leaf, this->data_partition_.get(), + this->gradients_, this->hessians_, + depth); + this->larger_leaf_splits_->Init(left_leaf, this->data_partition_.get(), + this->gradients_, this->hessians_, depth); } return true; } else { @@ -480,20 +490,24 @@ void VotingParallelTreeLearner::Split(Tree* tree, int best_Leaf, auto p_left = smaller_leaf_splits_global_.get(); auto p_right = larger_leaf_splits_global_.get(); // init the global sumup info + int 
depth = tree->leaf_depth(*left_leaf); + #ifdef DEBUG + CHECK(depth == tree->leaf_depth(*right_leaf)); + #endif if (best_split_info.left_count < best_split_info.right_count) { smaller_leaf_splits_global_->Init(*left_leaf, this->data_partition_.get(), - best_split_info.left_sum_gradient, - best_split_info.left_sum_hessian); + best_split_info.left_sum_gradient, + best_split_info.left_sum_hessian, depth); larger_leaf_splits_global_->Init(*right_leaf, this->data_partition_.get(), - best_split_info.right_sum_gradient, - best_split_info.right_sum_hessian); + best_split_info.right_sum_gradient, + best_split_info.right_sum_hessian, depth); } else { smaller_leaf_splits_global_->Init(*right_leaf, this->data_partition_.get(), - best_split_info.right_sum_gradient, - best_split_info.right_sum_hessian); + best_split_info.right_sum_gradient, + best_split_info.right_sum_hessian, depth); larger_leaf_splits_global_->Init(*left_leaf, this->data_partition_.get(), - best_split_info.left_sum_gradient, - best_split_info.left_sum_hessian); + best_split_info.left_sum_gradient, + best_split_info.left_sum_hessian, depth); p_left = larger_leaf_splits_global_.get(); p_right = smaller_leaf_splits_global_.get(); } From 37010ce58b0e589e806137280e55140e39414ae4 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Wed, 26 Jun 2019 11:45:36 +0100 Subject: [PATCH 07/45] Added the tree as a parameter in the function UpdateBestSplitsFromHistograms. 
--- src/treelearner/data_parallel_tree_learner.cpp | 7 ++++--- .../feature_parallel_tree_learner.cpp | 7 +++++-- src/treelearner/parallel_tree_learner.h | 16 +++++++++++----- src/treelearner/serial_tree_learner.cpp | 12 +++++++----- src/treelearner/serial_tree_learner.h | 6 ++++-- src/treelearner/voting_parallel_tree_learner.cpp | 7 ++++--- 6 files changed, 35 insertions(+), 20 deletions(-) diff --git a/src/treelearner/data_parallel_tree_learner.cpp b/src/treelearner/data_parallel_tree_learner.cpp index ed677ecf88d5..1ece1572eaea 100644 --- a/src/treelearner/data_parallel_tree_learner.cpp +++ b/src/treelearner/data_parallel_tree_learner.cpp @@ -146,7 +146,7 @@ void DataParallelTreeLearner::BeforeTrain() { } template -void DataParallelTreeLearner::FindBestSplits() { +void DataParallelTreeLearner::FindBestSplits(const Tree* tree) { TREELEARNER_T::ConstructHistograms(this->is_feature_used_, true); // construct local histograms #pragma omp parallel for schedule(static) @@ -160,11 +160,12 @@ void DataParallelTreeLearner::FindBestSplits() { // Reduce scatter for histogram Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(HistogramBinEntry), block_start_.data(), block_len_.data(), output_buffer_.data(), static_cast(output_buffer_.size()), &HistogramBinEntry::SumReducer); - this->FindBestSplitsFromHistograms(this->is_feature_used_, true); + this->FindBestSplitsFromHistograms(this->is_feature_used_, true, tree); } template -void DataParallelTreeLearner::FindBestSplitsFromHistograms(const std::vector&, bool) { +void DataParallelTreeLearner::FindBestSplitsFromHistograms( + const std::vector &, bool, const Tree *tree) { std::vector smaller_bests_per_thread(this->num_threads_, SplitInfo()); std::vector larger_bests_per_thread(this->num_threads_, SplitInfo()); std::vector smaller_node_used_features(this->num_features_, 1); diff --git a/src/treelearner/feature_parallel_tree_learner.cpp b/src/treelearner/feature_parallel_tree_learner.cpp index 
745ca44be68b..d5e6c013b73d 100644 --- a/src/treelearner/feature_parallel_tree_learner.cpp +++ b/src/treelearner/feature_parallel_tree_learner.cpp @@ -52,8 +52,11 @@ void FeatureParallelTreeLearner::BeforeTrain() { } template -void FeatureParallelTreeLearner::FindBestSplitsFromHistograms(const std::vector& is_feature_used, bool use_subtract) { - TREELEARNER_T::FindBestSplitsFromHistograms(is_feature_used, use_subtract); +void FeatureParallelTreeLearner::FindBestSplitsFromHistograms( + const std::vector &is_feature_used, bool use_subtract, + const Tree *tree) { + TREELEARNER_T::FindBestSplitsFromHistograms(is_feature_used, use_subtract, + tree); SplitInfo smaller_best_split, larger_best_split; // get best split at smaller leaf smaller_best_split = this->best_split_per_leaf_[this->smaller_leaf_splits_->LeafIndex()]; diff --git a/src/treelearner/parallel_tree_learner.h b/src/treelearner/parallel_tree_learner.h index c6754b517397..92a67cdbe5ea 100644 --- a/src/treelearner/parallel_tree_learner.h +++ b/src/treelearner/parallel_tree_learner.h @@ -31,7 +31,9 @@ class FeatureParallelTreeLearner: public TREELEARNER_T { protected: void BeforeTrain() override; - void FindBestSplitsFromHistograms(const std::vector& is_feature_used, bool use_subtract) override; + void FindBestSplitsFromHistograms(const std::vector &is_feature_used, + bool use_subtract, + const Tree *tree) override; private: /*! 
\brief rank of local machine */ @@ -59,8 +61,10 @@ class DataParallelTreeLearner: public TREELEARNER_T { protected: void BeforeTrain() override; - void FindBestSplits() override; - void FindBestSplitsFromHistograms(const std::vector& is_feature_used, bool use_subtract) override; + void FindBestSplits(const Tree *tree) override; + void FindBestSplitsFromHistograms(const std::vector &is_feature_used, + bool use_subtract, + const Tree *tree) override; void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override; inline data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const override { @@ -114,8 +118,10 @@ class VotingParallelTreeLearner: public TREELEARNER_T { protected: void BeforeTrain() override; bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) override; - void FindBestSplits() override; - void FindBestSplitsFromHistograms(const std::vector& is_feature_used, bool use_subtract) override; + void FindBestSplits(const Tree *tree) override; + void FindBestSplitsFromHistograms(const std::vector &is_feature_used, + bool use_subtract, + const Tree *tree) override; void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override; inline data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const override { diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index de9d4f5d3a9e..83e44c25c4c2 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -208,7 +208,7 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians init_split_time += std::chrono::steady_clock::now() - start_time; #endif // find best threshold for every feature - FindBestSplits(); + FindBestSplits(tree.get()); } else if (aborted_last_force_split) { aborted_last_force_split = false; } @@ -476,7 +476,7 @@ bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int return true; } -void SerialTreeLearner::FindBestSplits() 
{ +void SerialTreeLearner::FindBestSplits(const Tree* tree) { std::vector is_feature_used(num_features_, 0); #pragma omp parallel for schedule(static, 1024) if (num_features_ >= 2048) for (int feature_index = 0; feature_index < num_features_; ++feature_index) { @@ -490,7 +490,7 @@ void SerialTreeLearner::FindBestSplits() { } bool use_subtract = parent_leaf_histogram_array_ != nullptr; ConstructHistograms(is_feature_used, use_subtract); - FindBestSplitsFromHistograms(is_feature_used, use_subtract); + FindBestSplitsFromHistograms(is_feature_used, use_subtract, tree); } void SerialTreeLearner::ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) { @@ -521,7 +521,9 @@ void SerialTreeLearner::ConstructHistograms(const std::vector& is_featur #endif } -void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector& is_feature_used, bool use_subtract) { +void SerialTreeLearner::FindBestSplitsFromHistograms( + const std::vector &is_feature_used, bool use_subtract, + const Tree *tree) { #ifdef TIMETAG auto start_time = std::chrono::steady_clock::now(); #endif @@ -620,7 +622,7 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json // before processing next node from queue, store info for current left/right leaf // store "best split" for left and right, even if they might be overwritten by forced split if (BeforeFindBestSplit(tree, *left_leaf, *right_leaf)) { - FindBestSplits(); + FindBestSplits(tree); } // then, compute own splits SplitInfo left_split; diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 31743933a780..8ee21b4ccc6a 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -91,11 +91,13 @@ class SerialTreeLearner: public TreeLearner { */ virtual bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf); - virtual void FindBestSplits(); + virtual void FindBestSplits(const Tree* tree); virtual void 
ConstructHistograms(const std::vector& is_feature_used, bool use_subtract); - virtual void FindBestSplitsFromHistograms(const std::vector& is_feature_used, bool use_subtract); + virtual void + FindBestSplitsFromHistograms(const std::vector &is_feature_used, + bool use_subtract, const Tree *tree); /*! * \brief Partition tree and data according best split. diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp index 0cb49edf1f7a..bd6ecf8e4451 100644 --- a/src/treelearner/voting_parallel_tree_learner.cpp +++ b/src/treelearner/voting_parallel_tree_learner.cpp @@ -269,7 +269,7 @@ void VotingParallelTreeLearner::CopyLocalHistogram(const std::vec } template -void VotingParallelTreeLearner::FindBestSplits() { +void VotingParallelTreeLearner::FindBestSplits(const Tree* tree) { // use local data to find local best splits std::vector is_feature_used(this->num_features_, 0); #pragma omp parallel for schedule(static) @@ -380,11 +380,12 @@ void VotingParallelTreeLearner::FindBestSplits() { Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(HistogramBinEntry), block_start_.data(), block_len_.data(), output_buffer_.data(), static_cast(output_buffer_.size()), &HistogramBinEntry::SumReducer); - this->FindBestSplitsFromHistograms(is_feature_used, false); + this->FindBestSplitsFromHistograms(is_feature_used, false, tree); } template -void VotingParallelTreeLearner::FindBestSplitsFromHistograms(const std::vector&, bool) { +void VotingParallelTreeLearner::FindBestSplitsFromHistograms( + const std::vector &, bool, const Tree *tree) { std::vector smaller_bests_per_thread(this->num_threads_); std::vector larger_best_per_thread(this->num_threads_); std::vector smaller_node_used_features(this->num_features_, 1); From 3ff354d4dc6005556f49dbbf8b4c038b8c4d5da8 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Wed, 26 Jun 2019 11:48:43 +0100 Subject: [PATCH 08/45] Added a new struct to keep track of the 
constraints, in a more precise way when needed. --- src/treelearner/monotone_constraints.hpp | 224 +++++++++++++++++++++++ src/treelearner/serial_tree_learner.cpp | 13 ++ src/treelearner/serial_tree_learner.h | 3 + 3 files changed, 240 insertions(+) create mode 100644 src/treelearner/monotone_constraints.hpp diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp new file mode 100644 index 000000000000..dd76e3de64b4 --- /dev/null +++ b/src/treelearner/monotone_constraints.hpp @@ -0,0 +1,224 @@ +#ifndef LIGHTGBM_TREELEARNER_MONOTONE_CONSTRAINTS_H_ +#define LIGHTGBM_TREELEARNER_MONOTONE_CONSTRAINTS_H_ + +#include + +namespace LightGBM { + +// the purpose of this structure is to store the constraints for one leaf +// when the monotone precise mode is disabled, then it will just store +// one min and one max constraint +// but if the monotone precise mode is enabled, then it may store a +// large number of constraints for different thresholds and features +struct Constraints { + std::vector > min_constraints; + std::vector > max_constraints; + // the constraint number i is valid on the slice [thresholds[i]:threshold[i+1]) + // if threshold[i+1] does not exist, then it is valid for thresholds following threshold[i] + std::vector > min_thresholds; + std::vector > max_thresholds; + // These 2 vectors keep track of which constraints over which features + // have to be upated + std::vector min_to_be_updated; + std::vector max_to_be_updated; + // This vector keeps track of the constraints that we didn't update for some + // features, because they could only be worse, and another better split was + // available, so we didn't need to compute them yet, but we may need to in the future + std::vector are_actual_constraints_worse; + + bool IsInConstraints(double element, + const std::vector > &constraints, + std::vector &to_be_updated) { + bool ret = false; + for (unsigned int i = 0; i < constraints.size(); i++) { + for (unsigned int j = 0; 
j < constraints[i].size(); j++) { + if (element == constraints[i][j]) { + ret = true; + to_be_updated[i] = true; + are_actual_constraints_worse[i] = false; + } + } + } + return ret; + } + + bool IsInMinConstraints(double min) { + return IsInConstraints(min, min_constraints, min_to_be_updated); + } + + bool IsInMaxConstraints(double max) { + return IsInConstraints(max, max_constraints, max_to_be_updated); + } + + void SetConstraint(double element, + std::vector > &constraints, + bool is_operator_greater) const { + for (unsigned int i = 0; i < constraints.size(); i++) { + for (unsigned int j = 0; j < constraints[i].size(); j++) { + if ((is_operator_greater && element > constraints[i][j]) || + (!is_operator_greater && element < constraints[i][j])) { + constraints[i][j] = element; + } + } + } + } + + // this function is the same as the previous one, but it also returns + // if it actually modified something or not + bool + SetConstraintAndReturnChange(double element, + std::vector > &constraints, + bool is_operator_greater) const { + bool something_changed = false; + for (unsigned int i = 0; i < constraints.size(); i++) { + for (unsigned int j = 0; j < constraints[i].size(); j++) { + if ((is_operator_greater && element > constraints[i][j]) || + (!is_operator_greater && element < constraints[i][j])) { + constraints[i][j] = element; + something_changed = true; + } + } + } + return something_changed; + } + + // this function checks if the element passed as a parameter would actually update + // the constraints if they were to be set it as an additional constraint + bool CrossesConstraint(double element, + std::vector > &constraints, + bool is_operator_greater, + std::vector &to_be_updated) { + bool ret = false; + for (unsigned int i = 0; i < constraints.size(); i++) { + for (unsigned int j = 0; j < constraints[i].size(); j++) { + if ((is_operator_greater && element > constraints[i][j]) || + (!is_operator_greater && element < constraints[i][j])) { + ret = true; + 
to_be_updated[i] = true; + are_actual_constraints_worse[i] = true; + } + } + } + return ret; + } + + bool SetMinConstraintAndReturnChange(double min) { + return SetConstraintAndReturnChange(min, min_constraints, true); + } + + bool SetMaxConstraintAndReturnChange(double max) { + return SetConstraintAndReturnChange(max, max_constraints, false); + } + + void SetMinConstraint(double min) { + SetConstraint(min, min_constraints, true); + } + + void SetMaxConstraint(double max) { + SetConstraint(max, max_constraints, false); + } + + bool CrossesMinConstraint(double min) { + return CrossesConstraint(min, min_constraints, true, min_to_be_updated); + } + + bool CrossesMaxConstraint(double max) { + return CrossesConstraint(max, max_constraints, false, max_to_be_updated); + } + + void ResetUpdates(unsigned int i) { +#ifdef DEBUG + CHECK(i < are_actual_constraints_worse.size()); +#endif + are_actual_constraints_worse[i] = false; + min_to_be_updated[i] = false; + max_to_be_updated[i] = false; + } + + // when the monotone precise mode is disabled, then we can just store + // 1 min and 1 max constraints per leaf, so we call this constructor + Constraints() { + min_constraints.push_back( + std::vector(1, -std::numeric_limits::max())); + max_constraints.push_back( + std::vector(1, std::numeric_limits::max())); + min_thresholds.push_back(std::vector(1, 0)); + max_thresholds.push_back(std::vector(1, 0)); + } + + // when the monotone precise mode is enabled, then for each feature, + // we need to sort an array of constraints + Constraints(unsigned int num_features) { + min_constraints.resize(num_features); + max_constraints.resize(num_features); + + min_thresholds.resize(num_features); + max_thresholds.resize(num_features); + + min_to_be_updated.resize(num_features, false); + max_to_be_updated.resize(num_features, false); + are_actual_constraints_worse.resize(num_features, false); + + for (unsigned int i = 0; i < num_features; i++) { + // The number 32 has no real meaning here, but 
during our experiments, + // we found that the number of constraints per feature was well below 32, so by + // allocating this space, we may save some time because we won't have to allocate it later + min_constraints[i].reserve(32); + max_constraints[i].reserve(32); + + min_thresholds[i].reserve(32); + max_thresholds[i].reserve(32); + + min_constraints[i].push_back(-std::numeric_limits::max()); + max_constraints[i].push_back(std::numeric_limits::max()); + + min_thresholds[i].push_back(0); + max_thresholds[i].push_back(0); + } + } + + bool AreActualConstraintsWorse(unsigned int feature_idx) const { + return are_actual_constraints_worse[feature_idx]; + } + + bool ToBeUpdated(unsigned int feature_idx) const { + return min_to_be_updated[feature_idx] || max_to_be_updated[feature_idx]; + } + + bool MinToBeUpdated(unsigned int feature_idx) const { + return min_to_be_updated[feature_idx]; + } + + bool MaxToBeUpdated(unsigned int feature_idx) const { + return max_to_be_updated[feature_idx]; + } + + Constraints(const Constraints &constraints) + : min_constraints(constraints.min_constraints), + max_constraints(constraints.max_constraints), + min_thresholds(constraints.min_thresholds), + max_thresholds(constraints.max_thresholds), + min_to_be_updated(constraints.min_to_be_updated), + max_to_be_updated(constraints.max_to_be_updated), + are_actual_constraints_worse(constraints.are_actual_constraints_worse) { + } + + // When we reset the constraints, then we just need to write that the constraints + // are +/- inf, starting from the threshold 0 + void Reset() { + for (unsigned int i = 0; i < min_constraints.size(); i++) { + min_constraints[i].resize(1); + max_constraints[i].resize(1); + min_thresholds[i].resize(1); + max_thresholds[i].resize(1); + + min_constraints[i][0] = -std::numeric_limits::max(); + max_constraints[i][0] = std::numeric_limits::max(); + min_thresholds[i][0] = 0; + max_thresholds[i][0] = 0; + } + } +}; + +} // namespace LightGBM +#endif // 
LightGBM_TREELEARNER_MONOTONE_CONSTRAINTS_H_ diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 83e44c25c4c2..c0163a37496d 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -71,6 +71,18 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian histogram_pool_.DynamicChangeSize(train_data_, config_, max_cache_size, config_->num_leaves); // push split information for all leaves best_split_per_leaf_.resize(config_->num_leaves); + + // when the monotone precise mode is enabled, we need to store + // more constraints; hence the constructors are different + if (config_->monotone_precise_mode) { + constraints_per_leaf_.resize(config_->num_leaves, + Constraints(num_features_)); + } else { + constraints_per_leaf_.resize(config_->num_leaves, + Constraints()); + } + splits_per_leaf_.resize(config_->num_leaves*train_data_->num_features()); + // get ordered bin train_data_->CreateOrderedBins(&ordered_bins_); @@ -337,6 +349,7 @@ void SerialTreeLearner::BeforeTrain() { // reset the splits for leaves for (int i = 0; i < config_->num_leaves; ++i) { best_split_per_leaf_[i].Reset(); + constraints_per_leaf_[i].Reset(); } // Sumup for root diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 8ee21b4ccc6a..3b286a9cfc4d 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -22,6 +22,7 @@ #include "feature_histogram.hpp" #include "leaf_splits.hpp" #include "split_info.hpp" +#include "monotone_constraints.hpp" #ifdef USE_GPU // Use 4KBytes aligned allocator for ordered gradients and ordered hessians when GPU is enabled. @@ -147,6 +148,8 @@ class SerialTreeLearner: public TreeLearner { /*! \brief store best split points for all leaves */ std::vector best_split_per_leaf_; + + std::vector constraints_per_leaf_; /*! 
\brief store best split per feature for all leaves */ std::vector splits_per_leaf_; From d8563da6aef3bcad81ed8b93e831c1b4ec379198 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Wed, 26 Jun 2019 12:02:51 +0100 Subject: [PATCH 09/45] Added variables to keep track of the constraints efficiently. --- src/treelearner/serial_tree_learner.cpp | 61 +++++++++++++++++++++++++ src/treelearner/serial_tree_learner.h | 14 ++++++ 2 files changed, 75 insertions(+) diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index c0163a37496d..2ccf59b40ef0 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -120,6 +120,45 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian cegb_.reset(new CostEfficientGradientBoosting(this)); cegb_->Init(); } + + dummy_min_constraints.resize(num_threads_); + min_constraints.resize(num_threads_); + dummy_max_constraints.resize(num_threads_); + max_constraints.resize(num_threads_); + + thresholds_min_constraints.resize(num_threads_); + thresholds_max_constraints.resize(num_threads_); + + features.resize(num_threads_); + is_in_right_split.resize(num_threads_); + thresholds.resize(num_threads_); + + // the number 32 has no real meaning here, but during our experiments, + // we found that the number of constraints per feature was well below 32, so by + // allocating this space, we may save some time because we won't have to allocate it later + int space_to_reserve = 32; + if (!config_->monotone_precise_mode) { + space_to_reserve = 1; + } + + for (int i = 0; i < num_threads_; ++i) { + dummy_min_constraints[i].reserve(space_to_reserve); + min_constraints[i].reserve(space_to_reserve); + dummy_max_constraints[i].reserve(space_to_reserve); + max_constraints[i].reserve(space_to_reserve); + + thresholds_min_constraints[i].reserve(space_to_reserve); + thresholds_max_constraints[i].reserve(space_to_reserve); + + if 
(!config_->monotone_constraints.empty()) { + // the number 100 has no real meaning here, same as before + features[i].reserve(std::max(100, config_->max_depth)); + is_in_right_split[i].reserve(std::max(100, config_->max_depth)); + thresholds[i].reserve(std::max(100, config_->max_depth)); + } + + InitializeConstraints(i); + } } void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) { @@ -932,4 +971,26 @@ void SerialTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj } } +// initializing constraints is just writing that the constraints should +/- inf from threshold 0 +void SerialTreeLearner::InitializeConstraints(unsigned int tid) { + thresholds[tid].clear(); + is_in_right_split[tid].clear(); + features[tid].clear(); + + thresholds_min_constraints[tid].resize(1); + thresholds_max_constraints[tid].resize(1); + + dummy_min_constraints[tid].resize(1); + min_constraints[tid].resize(1); + dummy_max_constraints[tid].resize(1); + max_constraints[tid].resize(1); + + dummy_min_constraints[tid][0] = -std::numeric_limits::max(); + min_constraints[tid][0] = -std::numeric_limits::max(); + dummy_max_constraints[tid][0] = std::numeric_limits::max(); + max_constraints[tid][0] = std::numeric_limits::max(); + + thresholds_min_constraints[tid][0] = 0; + thresholds_max_constraints[tid][0] = 0; +} } // namespace LightGBM diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 3b286a9cfc4d..4c532aae57f4 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -121,6 +121,8 @@ class SerialTreeLearner: public TreeLearner { */ inline virtual data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const; + void InitializeConstraints(unsigned int tid); + /*! \brief number of data */ data_size_t num_data_; /*! 
\brief number of features */ @@ -185,6 +187,18 @@ class SerialTreeLearner: public TreeLearner { std::vector ordered_bin_indices_; bool is_constant_hessian_; std::unique_ptr cegb_; + + std::vector > dummy_min_constraints; + std::vector > min_constraints; + std::vector > dummy_max_constraints; + std::vector > max_constraints; + + std::vector > thresholds_min_constraints; + std::vector > thresholds_max_constraints; + + std::vector > features; + std::vector > thresholds; + std::vector > is_in_right_split; }; inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leaf_idx) const { From 10c5cbbb60940cceaf72fb73d63e72f1abd5ab14 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Wed, 26 Jun 2019 12:06:18 +0100 Subject: [PATCH 10/45] Added a penalty function for monotone splits. --- src/treelearner/serial_tree_learner.cpp | 13 +++++++++++++ src/treelearner/serial_tree_learner.h | 3 +++ 2 files changed, 16 insertions(+) diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 2ccf59b40ef0..ac409533f681 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -993,4 +993,17 @@ void SerialTreeLearner::InitializeConstraints(unsigned int tid) { thresholds_min_constraints[tid][0] = 0; thresholds_max_constraints[tid][0] = 0; } + +double SerialTreeLearner::ComputeMonotoneSplitGainPenalty(int depth, + double penalization, + double epsilon) { + if (penalization >= depth + 1.) { + return epsilon; + } + if (penalization <= 1.) { + return 1. - penalization / pow(2., depth) + epsilon; + } + return 1. - pow(2, penalization - 1. 
- depth) + epsilon; +} + } // namespace LightGBM diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 4c532aae57f4..092dcad5fa04 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -121,6 +121,9 @@ class SerialTreeLearner: public TreeLearner { */ inline virtual data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const; + static double ComputeMonotoneSplitGainPenalty(int depth, double penalization, + double epsilon = 1e-10); + void InitializeConstraints(unsigned int tid); /*! \brief number of data */ From d69e4ffee557a0ac4204946c3ca5fac3a1eae77c Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Wed, 26 Jun 2019 12:36:53 +0100 Subject: [PATCH 11/45] Added core functions to go through the tree, and update split when constraints change. --- .../cost_effective_gradient_boosting.hpp | 3 +- src/treelearner/serial_tree_learner.cpp | 709 ++++++++++++++++++ src/treelearner/serial_tree_learner.h | 87 +++ 3 files changed, 798 insertions(+), 1 deletion(-) diff --git a/src/treelearner/cost_effective_gradient_boosting.hpp b/src/treelearner/cost_effective_gradient_boosting.hpp index 82c6b9abc120..2f71aa49fcac 100644 --- a/src/treelearner/cost_effective_gradient_boosting.hpp +++ b/src/treelearner/cost_effective_gradient_boosting.hpp @@ -20,6 +20,8 @@ namespace LightGBM { class CostEfficientGradientBoosting { public: + std::vector splits_per_leaf_; + explicit CostEfficientGradientBoosting(const SerialTreeLearner* tree_learner):tree_learner_(tree_learner) { } static bool IsEnable(const Config* config) { @@ -106,7 +108,6 @@ class CostEfficientGradientBoosting { } const SerialTreeLearner* tree_learner_; - std::vector splits_per_leaf_; std::vector is_feature_used_in_split_; std::vector feature_used_in_data_; }; diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index ac409533f681..d832fe84f040 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ 
b/src/treelearner/serial_tree_learner.cpp @@ -27,6 +27,8 @@ std::chrono::duration split_time; std::chrono::duration ordered_bin_time; #endif // TIMETAG +double EPS = 1e-12; + SerialTreeLearner::SerialTreeLearner(const Config* config) :config_(config) { random_ = Random(config_->feature_fraction_seed); @@ -930,6 +932,308 @@ void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* ri } } +// this function is only used if the monotone precise mode is enabled +// it computes the constraints for a given leaf and a given feature +// (there can be many constraints because the constraints can depend on thresholds) +void SerialTreeLearner::ComputeConstraintsPerThreshold( + int feature, const Tree *tree, int node_idx, unsigned int tid, + bool per_threshold, bool compute_min, bool compute_max, uint32_t it_start, + uint32_t it_end) { + int parent_idx = (node_idx < 0) ? tree->leaf_parent(~node_idx) + : tree->node_parent(node_idx); + + if (parent_idx != -1) { + int inner_feature = tree->split_feature_inner(parent_idx); + int8_t monotone_type = train_data_->FeatureMonotone(inner_feature); + bool is_right_split = tree->right_child(parent_idx) == node_idx; + bool split_contains_new_information = true; + bool is_split_numerical = (train_data_->FeatureBinMapper(inner_feature) + ->bin_type()) == BinType::NumericalBin; + uint32_t threshold = tree->threshold_in_bin(parent_idx); + + // when we go up, we can get more information about the position of the original leaf + // so the starting and ending thresholds can be updated, which will save some time later + if ((feature == inner_feature) && is_split_numerical) { + if (is_right_split) { + it_start = std::max(threshold, it_start); + } else { + it_end = std::min(threshold + 1, it_end); + } +#ifdef DEBUG + CHECK(it_start < it_end); +#endif + } + + // only branches that contain leaves that are contiguous to the original leaf need to be visited + for (unsigned int i = 0; i < features[tid].size(); ++i) { + if 
(features[tid][i] == inner_feature && is_split_numerical && + is_in_right_split[tid][i] == is_right_split) { + split_contains_new_information = false; + break; + } + } + + if (split_contains_new_information) { + if (monotone_type != 0) { + int left_child_idx = tree->left_child(parent_idx); + int right_child_idx = tree->right_child(parent_idx); + bool left_child_is_curr_idx = (left_child_idx == node_idx); + + bool take_min = (monotone_type < 0) ? left_child_is_curr_idx + : !left_child_is_curr_idx; + if ((take_min && compute_min) || (!take_min && compute_max)) { + int node_idx_to_pass = + (left_child_is_curr_idx) ? right_child_idx : left_child_idx; + + // we go down in the opposite branch to see if some + // constraints that would apply to the original leaf can be found + ComputeConstraintsPerThresholdInSubtree( + feature, inner_feature, tree, node_idx_to_pass, take_min, + it_start, it_end, features[tid], thresholds[tid], + is_in_right_split[tid], tid, per_threshold); + } + } + + is_in_right_split[tid].push_back(is_right_split); + thresholds[tid].push_back(threshold); + features[tid].push_back(inner_feature); + } + + // we keep going up the tree to find constraints that could come from somewhere else + if (parent_idx != 0) { + ComputeConstraintsPerThreshold(feature, tree, parent_idx, tid, + per_threshold, compute_min, compute_max, + it_start, it_end); + } + } +} + +// this function checks if the original leaf and the children of the node that is +// currently being visited are contiguous, and if so, the children should be visited too +std::pair SerialTreeLearner::ShouldKeepGoingLeftRight( + const Tree *tree, int node_idx, const std::vector &features, + const std::vector &thresholds, + const std::vector &is_in_right_split) { + int inner_feature = tree->split_feature_inner(node_idx); + uint32_t threshold = tree->threshold_in_bin(node_idx); + bool is_split_numerical = train_data_->FeatureBinMapper(inner_feature) + ->bin_type() == BinType::NumericalBin; + + bool 
keep_going_right = true; + bool keep_going_left = true; + // we check if the left and right node are contiguous with the original leaf + // if so we should keep going down these nodes to update constraints + for (unsigned int i = 0; i < features.size(); ++i) { + if (features[i] == inner_feature) { + if (is_split_numerical) { + if (threshold >= thresholds[i] && !is_in_right_split[i]) { + keep_going_right = false; + } + if (threshold <= thresholds[i] && is_in_right_split[i]) { + keep_going_left = false; + } + } + } + } + return std::pair(keep_going_left, keep_going_right); +} + +// this function is called only when computing constraints when the monotone +// precise mode is set to true +// it makes sure that it is worth it to visit a branch, as it could +// not contain any relevant constraint (for example if the a branch +// with bigger values is also constraining the original leaf, then +// it is useless to visit the branch with smaller values) +std::pair SerialTreeLearner::LeftRightContainsRelevantInformation( + bool maximum, int inner_feature, bool split_feature_is_inner_feature) { + if (split_feature_is_inner_feature) { + return std::pair(true, true); + } + int8_t monotone_type = train_data_->FeatureMonotone(inner_feature); + if (monotone_type == 0) { + return std::pair(true, true); + } + if ((monotone_type == -1 && maximum) || (monotone_type == 1 && !maximum)) { + return std::pair(true, false); + } + if ((monotone_type == 1 && maximum) || (monotone_type == -1 && !maximum)) { + return std::pair(false, true); + } +} + +// at any point in time, for an index i, the constraint constraint[i] has to be valid on +// [threshold[i]: threshold[i + 1]) (or [threshold[i]: +inf) if i is the last index of the array) +// therefore, when a constraint is added on a leaf, it must be done very carefully +void SerialTreeLearner::UpdateConstraints( + std::vector > &constraints, + std::vector > &thresholds, double extremum, + uint32_t it_start, uint32_t it_end, int split_feature, int 
tid, + bool maximum) { + bool start_done = false; + bool end_done = false; + // one must always keep track of the previous constraint + // for example when adding a constraints cstr2 on thresholds [1:2), + // on an existing constraints cstr1 on thresholds [0, +inf), + // the thresholds and constraints must become + // [0, 1, 2] and [cstr1, cstr2, cstr1] + // so since we loop through thresholds only once, + // the previous constraint that still applies needs to be recorded + double previous_constraint; + double current_constraint; + for (unsigned int i = 0; i < thresholds[tid].size();) { + current_constraint = constraints[tid][i]; + // this is the easy case when the thresholds match + if (thresholds[tid][i] == it_start) { + constraints[tid][i] = (maximum) ? std::max(extremum, constraints[tid][i]) + : std::min(extremum, constraints[tid][i]); + start_done = true; + } + if (thresholds[tid][i] > it_start) { + // existing constraint is updated if there is a need for it + if (thresholds[tid][i] < it_end) { + constraints[tid][i] = (maximum) + ? 
std::max(extremum, constraints[tid][i]) + : std::min(extremum, constraints[tid][i]); + } + // when thresholds don't match, a new threshold + // and a new constraint may need to be inserted + if (!start_done) { + start_done = true; + if ((maximum && extremum > previous_constraint) || + (!maximum && extremum < previous_constraint)) { + constraints[tid].insert(constraints[tid].begin() + i, extremum); + thresholds[tid].insert(thresholds[tid].begin() + i, it_start); + i += 1; + } + } + } + // easy case when the thresholds match again + if (thresholds[tid][i] == it_end) { + end_done = true; + i += 1; + break; + } + // if they don't then, the previous constraint needs to be added back where the current one ends + if (thresholds[tid][i] > it_end) { + if (i != 0 && previous_constraint != constraints[tid][i - 1]) { + constraints[tid] + .insert(constraints[tid].begin() + i, previous_constraint); + thresholds[tid].insert(thresholds[tid].begin() + i, it_end); + } + end_done = true; + i += 1; + break; + } + // If 2 successive constraints are the same then the second one may as well be deleted + if (i != 0 && constraints[tid][i] == constraints[tid][i - 1]) { + constraints[tid].erase(constraints[tid].begin() + i); + thresholds[tid].erase(thresholds[tid].begin() + i); + previous_constraint = current_constraint; + i -= 1; + } + previous_constraint = current_constraint; + i += 1; + } + // if the loop didn't get to an index greater than it_start, it needs to be added at the end + if (!start_done) { + if ((maximum && extremum > constraints[tid].back()) || + (!maximum && extremum < constraints[tid].back())) { + constraints[tid].push_back(extremum); + thresholds[tid].push_back(it_start); + } else { + end_done = true; + } + } + // if we didn't get to an index after it_end, then the previous constraint needs to be set back + // unless it_end goes up to the last bin of the feature + if (!end_done && + static_cast(it_end) != train_data_->NumBin(split_feature) && + previous_constraint != 
constraints[tid].back()) { + constraints[tid].push_back(previous_constraint); + thresholds[tid].push_back(it_end); + } +} + +// this function goes down in a subtree to find the constraints that would apply +void SerialTreeLearner::ComputeConstraintsPerThresholdInSubtree( + int split_feature, int monotone_feature, const Tree *tree, int node_idx, + bool maximum, uint32_t it_start, uint32_t it_end, + const std::vector &features, const std::vector &thresholds, + const std::vector &is_in_right_split, unsigned int tid, + bool per_threshold) { + bool is_original_split_numerical = + train_data_->FeatureBinMapper(split_feature)->bin_type() == + BinType::NumericalBin; + double extremum; + // if we just got to a leaf, then we update + // the constraints using the leaf value + if (node_idx < 0) { + extremum = tree->LeafOutput(~node_idx); +#ifdef DEBUG + CHECK(it_start < it_end); +#endif + // if the constraints per threshold are needed then monotone + // precise mode is enabled and we are not refitting leaves + if (per_threshold && is_original_split_numerical) { + std::vector > &constraints = + (maximum) ? min_constraints : max_constraints; + std::vector > &thresholds = + (maximum) ? 
thresholds_min_constraints : thresholds_max_constraints; + UpdateConstraints(constraints, thresholds, extremum, it_start, it_end, + split_feature, tid, maximum); + } else { // otherwise the constraints can be updated just by performing a min / max + if (maximum) { + min_constraints[tid][0] = std::max(min_constraints[tid][0], extremum); + } else { + max_constraints[tid][0] = std::min(max_constraints[tid][0], extremum); + } + } + } + // if the function got to a node, it keeps going down the tree + else { + // check if the children are contiguous to the original leaf + std::pair keep_going_left_right = ShouldKeepGoingLeftRight( + tree, node_idx, features, thresholds, is_in_right_split); + int inner_feature = tree->split_feature_inner(node_idx); + uint32_t threshold = tree->threshold_in_bin(node_idx); + + bool split_feature_is_inner_feature = (inner_feature == split_feature); + bool split_feature_is_monotone_feature = + (monotone_feature == split_feature); + // it is made sure that both children contain values that could potentially + // help determine the true constraints for the original leaf + std::pair left_right_contain_relevant_information = + LeftRightContainsRelevantInformation( + maximum, inner_feature, split_feature_is_inner_feature && + !split_feature_is_monotone_feature); + // if a child does not contain relevant information compared to the other child, + // and if the other child is not contiguous, then we still need to go down the first child + if (keep_going_left_right.first && + (left_right_contain_relevant_information.first || + !keep_going_left_right.second)) { + uint32_t new_it_end = + (split_feature_is_inner_feature && is_original_split_numerical) + ? 
std::min(threshold + 1, it_end) + : it_end; + ComputeConstraintsPerThresholdInSubtree( + split_feature, monotone_feature, tree, tree->left_child(node_idx), + maximum, it_start, new_it_end, features, thresholds, + is_in_right_split, tid, per_threshold); + } + if (keep_going_left_right.second && + (left_right_contain_relevant_information.second || + !keep_going_left_right.first)) { + uint32_t new_it_start = + (split_feature_is_inner_feature && is_original_split_numerical) + ? std::max(threshold + 1, it_start) + : it_start; + ComputeConstraintsPerThresholdInSubtree( + split_feature, monotone_feature, tree, tree->right_child(node_idx), + maximum, new_it_start, it_end, features, thresholds, + is_in_right_split, tid, per_threshold); + } + } +} void SerialTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function residual_getter, data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const { @@ -971,6 +1275,411 @@ void SerialTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj } } +// this function goes through the tree to find how the split that was just made is +// going to affect other leaves +void SerialTreeLearner::GoDownToFindLeavesToUpdate( + const Tree *tree, int node_idx, const std::vector &features, + const std::vector &thresholds, + const std::vector &is_in_right_split, int maximum, int split_feature, + const SplitInfo &split_info, double previous_leaf_output, + bool use_left_leaf, bool use_right_leaf, uint32_t split_threshold) { + if (node_idx < 0) { + int leaf_idx = ~node_idx; + + // if leaf is at max depth then there is no need to update it + int max_depth = config_->max_depth; + if (tree->leaf_depth(leaf_idx) >= max_depth && max_depth > 0) { + return; + } + + // splits that are not to be used shall not be updated + if (best_split_per_leaf_[leaf_idx].gain == kMinScore) { + return; + } + + std::pair min_max_constraints; + bool something_changed; + if (use_right_leaf && use_left_leaf) { + 
min_max_constraints = + std::minmax(split_info.right_output, split_info.left_output); + } else if (use_right_leaf && !use_left_leaf) { + min_max_constraints = std::pair(split_info.right_output, + split_info.right_output); + } else { + min_max_constraints = std::pair(split_info.left_output, + split_info.left_output); + } + +#ifdef DEBUG + if (maximum) { + CHECK(min_max_constraints.first >= tree->LeafOutput(leaf_idx)); + } else { + CHECK(min_max_constraints.second <= tree->LeafOutput(leaf_idx)); + } +#endif + + if (!config_->monotone_precise_mode) { + if (!maximum) { + something_changed = + constraints_per_leaf_[leaf_idx] + .SetMinConstraintAndReturnChange(min_max_constraints.second); + } else { + something_changed = + constraints_per_leaf_[leaf_idx] + .SetMaxConstraintAndReturnChange(min_max_constraints.first); + } + if (!something_changed) { + return; + } + } else { + if (!maximum) { + // both functions need to be called in this order + // because they modify the struct + something_changed = + constraints_per_leaf_[leaf_idx] + .CrossesMinConstraint(min_max_constraints.second); + something_changed = constraints_per_leaf_[leaf_idx] + .IsInMinConstraints(previous_leaf_output) || + something_changed; + } else { + // both functions need to be called in this order + // because they modify the struct + something_changed = + constraints_per_leaf_[leaf_idx] + .CrossesMaxConstraint(min_max_constraints.first); + something_changed = constraints_per_leaf_[leaf_idx] + .IsInMaxConstraints(previous_leaf_output) || + something_changed; + } + // if constraints have changed, then best splits need to be updated + // otherwise, we can just continue and go to the next split + if (!something_changed) { + return; + } + } + UpdateBestSplitsFromHistograms(best_split_per_leaf_[leaf_idx], leaf_idx, + tree->leaf_depth(leaf_idx), tree); + } else { + // check if the children are contiguous with the original leaf + std::pair keep_going_left_right = ShouldKeepGoingLeftRight( + tree, node_idx, 
features, thresholds, is_in_right_split); + int inner_feature = tree->split_feature_inner(node_idx); + uint32_t threshold = tree->threshold_in_bin(node_idx); + bool is_split_numerical = train_data_->FeatureBinMapper(inner_feature) + ->bin_type() == BinType::NumericalBin; + bool use_left_leaf_for_update = true; + bool use_right_leaf_for_update = true; + if (is_split_numerical && inner_feature == split_feature) { + if (threshold >= split_threshold) { + use_left_leaf_for_update = false; + } + if (threshold <= split_threshold) { + use_right_leaf_for_update = false; + } + } + + if (keep_going_left_right.first) { + GoDownToFindLeavesToUpdate( + tree, tree->left_child(node_idx), features, thresholds, + is_in_right_split, maximum, split_feature, split_info, + previous_leaf_output, use_left_leaf, + use_right_leaf_for_update && use_right_leaf, split_threshold); + } + if (keep_going_left_right.second) { + GoDownToFindLeavesToUpdate( + tree, tree->right_child(node_idx), features, thresholds, + is_in_right_split, maximum, split_feature, split_info, + previous_leaf_output, use_left_leaf_for_update && use_left_leaf, + use_right_leaf, split_threshold); + } + } +} + +// this function goes through the tree to find how the split that +// has just been performed is going to affect the constraints of other leaves +void SerialTreeLearner::GoUpToFindLeavesToUpdate( + const Tree *tree, int node_idx, std::vector &features, + std::vector &thresholds, std::vector &is_in_right_split, + int split_feature, const SplitInfo &split_info, double previous_leaf_output, + uint32_t split_threshold) { + int parent_idx = tree->node_parent(node_idx); + if (parent_idx != -1) { + int inner_feature = tree->split_feature_inner(parent_idx); + int8_t monotone_type = train_data_->FeatureMonotone(inner_feature); + bool is_right_split = tree->right_child(parent_idx) == node_idx; + bool split_contains_new_information = true; + bool is_split_numerical = train_data_->FeatureBinMapper(inner_feature) + ->bin_type() == 
BinType::NumericalBin; + + // only branches containing leaves that are contiguous to the original leaf need to be updated + for (unsigned int i = 0; i < features.size(); ++i) { + if ((features[i] == inner_feature && is_split_numerical) && + (is_in_right_split[i] == is_right_split)) { + split_contains_new_information = false; + break; + } + } + + if (split_contains_new_information) { + if (monotone_type != 0) { + int left_child_idx = tree->left_child(parent_idx); + int right_child_idx = tree->right_child(parent_idx); + bool left_child_is_curr_idx = (left_child_idx == node_idx); + int node_idx_to_pass = + (left_child_is_curr_idx) ? right_child_idx : left_child_idx; + bool take_min = (monotone_type < 0) ? left_child_is_curr_idx + : !left_child_is_curr_idx; + + GoDownToFindLeavesToUpdate(tree, node_idx_to_pass, features, thresholds, + is_in_right_split, take_min, split_feature, + split_info, previous_leaf_output, true, true, + split_threshold); + } + + is_in_right_split.push_back(tree->right_child(parent_idx) == node_idx); + thresholds.push_back(tree->threshold_in_bin(parent_idx)); + features.push_back(tree->split_feature_inner(parent_idx)); + } + + if (parent_idx != 0) { + GoUpToFindLeavesToUpdate(tree, parent_idx, features, thresholds, + is_in_right_split, split_feature, split_info, + previous_leaf_output, split_threshold); + } + } +} + +// this function updates the best split for each leaf +// it is called only when monotone constraints exist +void SerialTreeLearner::UpdateBestSplitsFromHistograms(SplitInfo &split, + int leaf, int depth, + const Tree *tree) { + std::vector bests(num_threads_); + std::vector should_split_be_worse(num_threads_, false); + + // the feature histogram is retrieved + FeatureHistogram *histogram_array_; + histogram_pool_.Get(leaf, &histogram_array_); + + OMP_INIT_EX(); +#pragma omp parallel for schedule(static, 1024) if (num_features_ >= 2048) + for (int feature_index = 0; feature_index < num_features_; ++feature_index) { + 
OMP_LOOP_EX_BEGIN(); + // the feature that are supposed to be used are computed + if (!is_feature_used_[feature_index]) + continue; + if (!histogram_array_[feature_index].is_splittable()) { + constraints_per_leaf_[leaf].are_actual_constraints_worse[feature_index] = + false; + continue; + } + + // loop through the features to find the best one just like in the + // FindBestSplitsFromHistograms function + const int tid = omp_get_thread_num(); + int real_fidx = train_data_->RealFeatureIndex(feature_index); + + // if the monotone precise mode is disabled or if the constraints have to be updated, + // but are not exclusively worse, then we update the constraints and the best split + if (!config_->monotone_precise_mode || + (constraints_per_leaf_[leaf].ToBeUpdated(feature_index) && + !constraints_per_leaf_[leaf] + .AreActualConstraintsWorse(feature_index))) { + + ComputeBestSplitForFeature( + split.left_sum_gradient + split.right_sum_gradient, + split.left_sum_hessian + split.right_sum_hessian, + split.left_count + split.right_count, feature_index, histogram_array_, + bests, leaf, depth, tid, real_fidx, tree, true); + } else { + if (cegb_->splits_per_leaf_[leaf * train_data_->num_features() + feature_index] > + bests[tid]) { + bests[tid] = cegb_->splits_per_leaf_ + [leaf * train_data_->num_features() + feature_index]; + should_split_be_worse[tid] = + constraints_per_leaf_[leaf] + .AreActualConstraintsWorse(feature_index); + } + } + + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); + + auto best_idx = ArrayArgs::ArgMax(bests); + // if the best split that has been found previously actually doesn't have the true constraints + // but worse ones that were not computed before to optimize the computation time, + // then we update every split and every constraints that should be updated + if (should_split_be_worse[best_idx]) { + std::fill(bests.begin(), bests.end(), SplitInfo()); +#pragma omp parallel for schedule(static, 1024) if (num_features_ >= 2048) + for (int feature_index = 0; 
feature_index < num_features_; + ++feature_index) { + OMP_LOOP_EX_BEGIN(); + if (!is_feature_used_[feature_index]) + continue; + if (!histogram_array_[feature_index].is_splittable()) { + continue; + } + + const int tid = omp_get_thread_num(); + int real_fidx = train_data_->RealFeatureIndex(feature_index); + + if (constraints_per_leaf_[leaf] + .AreActualConstraintsWorse(feature_index)) { +#ifdef DEBUG + CHECK(config_->monotone_precise_mode); + CHECK((constraints_per_leaf_[leaf].ToBeUpdated(feature_index))); +#endif + + ComputeBestSplitForFeature( + split.left_sum_gradient + split.right_sum_gradient, + split.left_sum_hessian + split.right_sum_hessian, + split.left_count + split.right_count, feature_index, + histogram_array_, bests, leaf, depth, tid, real_fidx, tree, true); + } else { +#ifdef DEBUG + CHECK(!constraints_per_leaf_[leaf].ToBeUpdated(feature_index)); +#endif + if (cegb_->splits_per_leaf_ + [leaf * train_data_->num_features() + feature_index] > + bests[tid]) { + bests[tid] = cegb_->splits_per_leaf_ + [leaf * train_data_->num_features() + feature_index]; + } + } + + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); + best_idx = ArrayArgs::ArgMax(bests); + } + + // note: the gains may differ for the same set of constraints due to the non-deterministic OMP reduction. 
+ split = bests[best_idx]; +} + +// this function computes the best split for a given leaf and a given feature +void SerialTreeLearner::ComputeBestSplitForFeature( + double sum_gradient, double sum_hessian, data_size_t num_data, + int feature_index, FeatureHistogram *histogram_array_, + std::vector &bests, int leaf_index, int depth, const int tid, + int real_fidx, const Tree *tree, bool update) { + + // if this is not a subtree stemming from a monotone split, then no constraint apply + if (tree->leaf_is_in_monotone_subtree(leaf_index)) { + if (config_->monotone_precise_mode) { + + ComputeConstraintsPerThreshold( + feature_index, tree, ~leaf_index, tid, config_->monotone_precise_mode, + constraints_per_leaf_[leaf_index].MinToBeUpdated(feature_index) || + !update, + constraints_per_leaf_[leaf_index].MaxToBeUpdated(feature_index) || + !update); + + if (!constraints_per_leaf_[leaf_index].MinToBeUpdated(feature_index) && + update) { + min_constraints[tid] = + constraints_per_leaf_[leaf_index].min_constraints[feature_index]; + thresholds_min_constraints[tid] = + constraints_per_leaf_[leaf_index].min_thresholds[feature_index]; + } else { + constraints_per_leaf_[leaf_index].min_constraints[feature_index] = + min_constraints[tid]; + constraints_per_leaf_[leaf_index].min_thresholds[feature_index] = + thresholds_min_constraints[tid]; + } + + if (!constraints_per_leaf_[leaf_index].MaxToBeUpdated(feature_index) && + update) { + max_constraints[tid] = + constraints_per_leaf_[leaf_index].max_constraints[feature_index]; + thresholds_max_constraints[tid] = + constraints_per_leaf_[leaf_index].max_thresholds[feature_index]; + } else { + constraints_per_leaf_[leaf_index].max_constraints[feature_index] = + max_constraints[tid]; + constraints_per_leaf_[leaf_index].max_thresholds[feature_index] = + thresholds_max_constraints[tid]; + } + + dummy_min_constraints[tid] = min_constraints[tid]; + dummy_max_constraints[tid] = max_constraints[tid]; + } + if (!config_->monotone_precise_mode) { + 
dummy_min_constraints[tid][0] = + constraints_per_leaf_[leaf_index].min_constraints[0][0]; + dummy_max_constraints[tid][0] = + constraints_per_leaf_[leaf_index].max_constraints[0][0]; + + min_constraints[tid][0] = + constraints_per_leaf_[leaf_index].min_constraints[0][0]; + max_constraints[tid][0] = + constraints_per_leaf_[leaf_index].max_constraints[0][0]; + + thresholds_min_constraints[tid][0] = + constraints_per_leaf_[leaf_index].min_thresholds[0][0]; + thresholds_max_constraints[tid][0] = + constraints_per_leaf_[leaf_index].max_thresholds[0][0]; + } + } + +#ifdef DEBUG + CHECK(dummy_min_constraints[tid] == min_constraints[tid]); + CHECK(dummy_max_constraints[tid] == max_constraints[tid]); + for (const auto &x : max_constraints[tid]) { + CHECK(tree->LeafOutput(leaf_index) <= EPS + x); + CHECK(x > -std::numeric_limits::max()); + } + for (const auto &x : dummy_min_constraints[tid]) { + CHECK(tree->LeafOutput(leaf_index) + EPS >= x); + CHECK(x < std::numeric_limits::max()); + } +#endif + + SplitInfo new_split; + // FIXME Need to call histogram_array_[feature_index].FindBestThreshold + + if (tree->leaf_is_in_monotone_subtree(leaf_index)) { + InitializeConstraints(tid); + } + + new_split.feature = real_fidx; + if (cegb_ != nullptr) { + new_split.gain -= cegb_->DetlaGain(feature_index, real_fidx, leaf_index, num_data, new_split); + } + + + if (new_split.monotone_type != 0) { + double penalty = + ComputeMonotoneSplitGainPenalty(depth, config_->monotone_penalty); + new_split.gain *= penalty; + } + + if (new_split > bests[tid]) { + bests[tid] = new_split; + } + + if (config_->monotone_precise_mode && + tree->leaf_is_in_monotone_subtree(leaf_index)) { + constraints_per_leaf_[leaf_index].ResetUpdates(feature_index); + } + +#ifdef DEBUG + ComputeConstraintsPerThreshold(-1, tree, ~leaf_index, tid, false); + double min_constraint = min_constraints[tid][0]; + double max_constraint = max_constraints[tid][0]; + CHECK(tree->LeafOutput(leaf_index) >= min_constraint); + 
CHECK(tree->LeafOutput(leaf_index) <= max_constraint); + + min_constraints[tid][0] = -std::numeric_limits::max(); + max_constraints[tid][0] = std::numeric_limits::max(); + thresholds[tid].clear(); + is_in_right_split[tid].clear(); + features[tid].clear(); +#endif +} + // initializing constraints is just writing that the constraints should +/- inf from threshold 0 void SerialTreeLearner::InitializeConstraints(unsigned int tid) { thresholds[tid].clear(); diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 092dcad5fa04..e709295a867d 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -100,6 +100,9 @@ class SerialTreeLearner: public TreeLearner { FindBestSplitsFromHistograms(const std::vector &is_feature_used, bool use_subtract, const Tree *tree); + virtual void UpdateBestSplitsFromHistograms(SplitInfo &split, int leaf, + int depth, const Tree *tree); + /*! * \brief Partition tree and data according best split. * \param tree Current tree, will be splitted on this function. 
@@ -121,11 +124,95 @@ class SerialTreeLearner: public TreeLearner { */ inline virtual data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const; + void ComputeBestSplitForFeature(double sum_gradient, double sum_hessian, + data_size_t num_data, int feature_index, + FeatureHistogram *histogram_array_, + std::vector &bests, int leaf_index, + int depth, const int tid, int real_fidx, + const Tree *tree, bool update = false); + + void ComputeConstraintsPerThreshold(int feature, const Tree *tree, + int node_idx, unsigned int tid, + bool per_threshold, bool compute_min, + bool compute_max, uint32_t it_start, + uint32_t it_end); + + void ComputeConstraintsPerThreshold(int feature, const Tree *tree, + int node_idx, unsigned int tid, + bool per_threshold = true, + bool compute_min = true, + bool compute_max = true) { + ComputeConstraintsPerThreshold(feature, tree, node_idx, tid, per_threshold, + compute_min, compute_max, 0, + train_data_->NumBin(feature)); + } + + void ComputeConstraintsPerThresholdInSubtree( + int split_feature, int monotone_feature, const Tree *tree, int node_idx, + bool maximum, uint32_t it_start, uint32_t it_end, + const std::vector &features, const std::vector &thresholds, + const std::vector &is_in_right_split, unsigned int tid, + bool per_threshold); + static double ComputeMonotoneSplitGainPenalty(int depth, double penalization, double epsilon = 1e-10); + void GoDownToFindLeavesToUpdate(const Tree *tree, int node_idx, + const std::vector &features, + const std::vector &thresholds, + const std::vector &is_in_right_split, + int maximum, int split_feature, + const SplitInfo &split_info, + double previous_leaf_output, + bool use_left_leaf, bool use_right_leaf, + uint32_t split_threshold); + + /* Once we made a split, the constraints on other leaves may change. + We need to update them to remain coherent. 
*/ + void GoUpToFindLeavesToUpdate(const Tree *tree, int node_idx, + std::vector &features, + std::vector &thresholds, + std::vector &is_in_right_split, + int split_feature, const SplitInfo &split_info, + double previous_leaf_output, + uint32_t split_threshold); + + void GoUpToFindLeavesToUpdate(const Tree *tree, int node_idx, + int split_feature, const SplitInfo &split_info, + double previous_leaf_output, + uint32_t split_threshold) { + int depth = tree->leaf_depth(~tree->left_child(node_idx)) - 1; + + std::vector features; + std::vector thresholds; + std::vector is_in_right_split; + + features.reserve(depth); + thresholds.reserve(depth); + is_in_right_split.reserve(depth); + + GoUpToFindLeavesToUpdate(tree, node_idx, features, thresholds, + is_in_right_split, split_feature, split_info, + previous_leaf_output, split_threshold); + } + + std::pair + ShouldKeepGoingLeftRight(const Tree *tree, int node_idx, + const std::vector &features, + const std::vector &thresholds, + const std::vector &is_in_right_split); + + std::pair + LeftRightContainsRelevantInformation(bool maximum, int inner_feature, + bool split_feature_is_inner_feature); + void InitializeConstraints(unsigned int tid); + void UpdateConstraints(std::vector > &constraints, + std::vector > &thresholds, + double extremum, uint32_t it_start, uint32_t it_end, + int split_feature, int tid, bool maximum); + /*! \brief number of data */ data_size_t num_data_; /*! \brief number of features */ From e4ec6a09fbbbe197232b77f4bb42642d8d127ddb Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Wed, 26 Jun 2019 12:41:54 +0100 Subject: [PATCH 12/45] Modified the way constraints are handled in feature histograms. 
--- .../data_parallel_tree_learner.cpp | 27 +- src/treelearner/feature_histogram.hpp | 390 +++++++++++++++--- src/treelearner/gpu_tree_learner.cpp | 1 + src/treelearner/serial_tree_learner.cpp | 96 +++-- .../voting_parallel_tree_learner.cpp | 82 ++-- 5 files changed, 432 insertions(+), 164 deletions(-) diff --git a/src/treelearner/data_parallel_tree_learner.cpp b/src/treelearner/data_parallel_tree_learner.cpp index 1ece1572eaea..cc8bf7d4199c 100644 --- a/src/treelearner/data_parallel_tree_learner.cpp +++ b/src/treelearner/data_parallel_tree_learner.cpp @@ -191,13 +191,16 @@ void DataParallelTreeLearner::FindBestSplitsFromHistograms( this->smaller_leaf_histogram_array_[feature_index].RawData()); SplitInfo smaller_split; // find best threshold for smaller child + // FIXME Fill the vectors with the actual constraints and thresholds + std::vector max_constraints; + std::vector min_constraints; + std::vector thresholds; this->smaller_leaf_histogram_array_[feature_index].FindBestThreshold( - this->smaller_leaf_splits_->sum_gradients(), - this->smaller_leaf_splits_->sum_hessians(), - GetGlobalDataCountInLeaf(this->smaller_leaf_splits_->LeafIndex()), - this->smaller_leaf_splits_->min_constraint(), - this->smaller_leaf_splits_->max_constraint(), - &smaller_split); + this->smaller_leaf_splits_->sum_gradients(), + this->smaller_leaf_splits_->sum_hessians(), + GetGlobalDataCountInLeaf(this->smaller_leaf_splits_->LeafIndex()), + &smaller_split, max_constraints, min_constraints, max_constraints, + min_constraints, thresholds, thresholds); smaller_split.feature = real_feature_index; if (smaller_split > smaller_bests_per_thread[tid] && smaller_node_used_features[feature_index]) { smaller_bests_per_thread[tid] = smaller_split; @@ -211,13 +214,13 @@ void DataParallelTreeLearner::FindBestSplitsFromHistograms( this->smaller_leaf_histogram_array_[feature_index]); SplitInfo larger_split; // find best threshold for larger child + // FIXME Fill the vectors with the actual constraints and 
thresholds this->larger_leaf_histogram_array_[feature_index].FindBestThreshold( - this->larger_leaf_splits_->sum_gradients(), - this->larger_leaf_splits_->sum_hessians(), - GetGlobalDataCountInLeaf(this->larger_leaf_splits_->LeafIndex()), - this->larger_leaf_splits_->min_constraint(), - this->larger_leaf_splits_->max_constraint(), - &larger_split); + this->larger_leaf_splits_->sum_gradients(), + this->larger_leaf_splits_->sum_hessians(), + GetGlobalDataCountInLeaf(this->larger_leaf_splits_->LeafIndex()), + &larger_split, max_constraints, min_constraints, max_constraints, + min_constraints, thresholds, thresholds); larger_split.feature = real_feature_index; if (larger_split > larger_bests_per_thread[tid] && larger_node_used_features[feature_index]) { larger_bests_per_thread[tid] = larger_split; diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 20437c0be6b0..7d3363a0de96 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -57,11 +57,19 @@ class FeatureHistogram { meta_ = meta; data_ = data; if (meta_->bin_type == BinType::NumericalBin) { - find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdNumerical, this, std::placeholders::_1 - , std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, std::placeholders::_6); + find_best_threshold_fun_ = std::bind( + &FeatureHistogram::FindBestThresholdNumerical, this, + std::placeholders::_1, std::placeholders::_2, std::placeholders::_3, + std::placeholders::_4, std::placeholders::_5, std::placeholders::_6, + std::placeholders::_7, std::placeholders::_8, std::placeholders::_9, + std::placeholders::_10); } else { - find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdCategorical, this, std::placeholders::_1 - , std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, std::placeholders::_6); + find_best_threshold_fun_ = std::bind( 
+ &FeatureHistogram::FindBestThresholdCategorical, this, + std::placeholders::_1, std::placeholders::_2, std::placeholders::_3, + std::placeholders::_4, std::placeholders::_5, std::placeholders::_6, + std::placeholders::_7, std::placeholders::_8, std::placeholders::_9, + std::placeholders::_10); } } @@ -80,30 +88,91 @@ class FeatureHistogram { } } - void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint, - SplitInfo* output) { + void FindBestThreshold( + double sum_gradient, double sum_hessian, data_size_t num_data, + SplitInfo *output, + std::vector &cumulative_min_constraint_left_to_right, + std::vector &cumulative_min_constraint_right_to_left, + std::vector &cumulative_max_constraint_left_to_right, + std::vector &cumulative_max_constraint_right_to_left, + const std::vector &thresholds_min_constraint, + const std::vector &thresholds_max_constraint) { output->default_left = true; output->gain = kMinScore; - find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, min_constraint, max_constraint, output); + find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, + output, cumulative_min_constraint_left_to_right, + cumulative_min_constraint_right_to_left, + cumulative_max_constraint_left_to_right, + cumulative_max_constraint_right_to_left, + thresholds_min_constraint, + thresholds_max_constraint); output->gain *= meta_->penalty; } - void FindBestThresholdNumerical(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint, - SplitInfo* output) { + void FindBestThresholdNumerical( + double sum_gradient, double sum_hessian, data_size_t num_data, + SplitInfo *output, + std::vector &cumulative_min_constraint_left_to_right, + std::vector &cumulative_min_constraint_right_to_left, + std::vector &cumulative_max_constraint_left_to_right, + std::vector &cumulative_max_constraint_right_to_left, + const std::vector 
&thresholds_min_constraint, + const std::vector &thresholds_max_constraint) { is_splittable_ = false; + could_be_splittable_ = false; double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step); double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; + + const double &(*min)(const double &, const double &) = std::min; + const double &(*max)(const double &, const double &) = std::max; + // at this point, the following arrays contain the constraints applied on every part of the leaf + // since we are splitting the leaf in 2, we can compute the cumulative / minimum maximum in both directions + CumulativeExtremum(max, true, cumulative_min_constraint_left_to_right); + CumulativeExtremum(max, false, cumulative_min_constraint_right_to_left); + CumulativeExtremum(min, true, cumulative_max_constraint_left_to_right); + CumulativeExtremum(min, false, cumulative_max_constraint_right_to_left); + if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) { if (meta_->missing_type == MissingType::Zero) { - FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_constraint, max_constraint, min_gain_shift, output, -1, true, false); - FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_constraint, max_constraint, min_gain_shift, output, 1, true, false); + FindBestThresholdSequence( + sum_gradient, sum_hessian, num_data, min_gain_shift, output, -1, + true, false, cumulative_min_constraint_left_to_right, + cumulative_min_constraint_right_to_left, + cumulative_max_constraint_left_to_right, + cumulative_max_constraint_right_to_left, thresholds_min_constraint, + thresholds_max_constraint); + FindBestThresholdSequence( + sum_gradient, sum_hessian, num_data, min_gain_shift, output, 1, + true, false, cumulative_min_constraint_left_to_right, + cumulative_min_constraint_right_to_left, + cumulative_max_constraint_left_to_right, + 
cumulative_max_constraint_right_to_left, thresholds_min_constraint, + thresholds_max_constraint); } else { - FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_constraint, max_constraint, min_gain_shift, output, -1, false, true); - FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_constraint, max_constraint, min_gain_shift, output, 1, false, true); + FindBestThresholdSequence( + sum_gradient, sum_hessian, num_data, min_gain_shift, output, -1, + false, true, cumulative_min_constraint_left_to_right, + cumulative_min_constraint_right_to_left, + cumulative_max_constraint_left_to_right, + cumulative_max_constraint_right_to_left, thresholds_min_constraint, + thresholds_max_constraint); + FindBestThresholdSequence( + sum_gradient, sum_hessian, num_data, min_gain_shift, output, 1, + false, true, cumulative_min_constraint_left_to_right, + cumulative_min_constraint_right_to_left, + cumulative_max_constraint_left_to_right, + cumulative_max_constraint_right_to_left, thresholds_min_constraint, + thresholds_max_constraint); } } else { - FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_constraint, max_constraint, min_gain_shift, output, -1, false, false); + FindBestThresholdSequence( + sum_gradient, sum_hessian, num_data, min_gain_shift, output, -1, + false, false, cumulative_min_constraint_left_to_right, + cumulative_min_constraint_right_to_left, + cumulative_max_constraint_left_to_right, + cumulative_max_constraint_right_to_left, thresholds_min_constraint, + thresholds_max_constraint); // fix the direction error when only have 2 bins if (meta_->missing_type == MissingType::NaN) { output->default_left = false; @@ -111,13 +180,17 @@ class FeatureHistogram { } output->gain -= min_gain_shift; output->monotone_type = meta_->monotone_type; - output->min_constraint = min_constraint; - output->max_constraint = max_constraint; } - void FindBestThresholdCategorical(double sum_gradient, double sum_hessian, data_size_t num_data, - double 
min_constraint, double max_constraint, - SplitInfo* output) { + void FindBestThresholdCategorical( + double sum_gradient, double sum_hessian, data_size_t num_data, + SplitInfo *output, + std::vector &cumulative_min_constraint_left_to_right, + std::vector &cumulative_min_constraint_right_to_left, + std::vector &cumulative_max_constraint_left_to_right, + std::vector &cumulative_max_constraint_right_to_left, + const std::vector &thresholds_min_constraint, + const std::vector &thresholds_max_constraint) { output->default_left = false; double best_gain = kMinScore; data_size_t best_left_count = 0; @@ -149,10 +222,20 @@ class FeatureHistogram { if (sum_other_hessian < meta_->config->min_sum_hessian_in_leaf) continue; double sum_other_gradient = sum_gradient - data_[t].sum_gradients; + +#ifdef DEBUG + CHECK(t >= 0); +#endif // current split gain - double current_gain = GetSplitGains(sum_other_gradient, sum_other_hessian, data_[t].sum_gradients, data_[t].sum_hessians + kEpsilon, - meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - min_constraint, max_constraint, 0); + // the threshold is included in the left leaf + double current_gain = GetSplitGains( + sum_other_gradient, sum_other_hessian, data_[t].sum_gradients, + data_[t].sum_hessians + kEpsilon, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, + cumulative_min_constraint_right_to_left[0], + cumulative_max_constraint_right_to_left[0], + cumulative_min_constraint_left_to_right[0], + cumulative_max_constraint_left_to_right[0], 0); // gain with split is worse than without split if (current_gain <= min_gain_shift) continue; @@ -222,9 +305,16 @@ class FeatureHistogram { cnt_cur_group = 0; double sum_right_gradient = sum_gradient - sum_left_gradient; - double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, - meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - min_constraint, max_constraint, 0); + // the threshold is included in the 
left leaf + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, + cumulative_min_constraint_right_to_left[0], + cumulative_max_constraint_right_to_left[0], + cumulative_min_constraint_left_to_right[0], + cumulative_max_constraint_left_to_right[0], 0); + if (current_gain <= min_gain_shift) continue; is_splittable_ = true; if (current_gain > best_gain) { @@ -240,16 +330,20 @@ class FeatureHistogram { } if (is_splittable_) { - output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian, - meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - min_constraint, max_constraint); + output->left_output = CalculateSplittedLeafOutput( + best_sum_left_gradient, best_sum_left_hessian, + meta_->config->lambda_l1, l2, meta_->config->max_delta_step, + cumulative_min_constraint_left_to_right[0], + cumulative_max_constraint_left_to_right[0]); output->left_count = best_left_count; output->left_sum_gradient = best_sum_left_gradient; output->left_sum_hessian = best_sum_left_hessian - kEpsilon; - output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient, - sum_hessian - best_sum_left_hessian, - meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - min_constraint, max_constraint); + output->right_output = CalculateSplittedLeafOutput( + sum_gradient - best_sum_left_gradient, + sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, + cumulative_min_constraint_right_to_left[0], + cumulative_max_constraint_right_to_left[0]); output->right_count = num_data - best_left_count; output->right_sum_gradient = sum_gradient - best_sum_left_gradient; output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon; @@ -273,8 +367,6 @@ class FeatureHistogram { } } output->monotone_type = 0; - output->min_constraint = min_constraint; - 
output->max_constraint = max_constraint; } } @@ -436,12 +528,24 @@ class FeatureHistogram { /*! * \brief True if this histogram can be splitted */ - bool is_splittable() { return is_splittable_; } + bool is_splittable() { + // if the monotone precise mode is enabled, then, even if a leaf is not splittable right now, + // it may become splittable later, because it can be unconstrained by splits happening somewhere else in the tree + if (meta_->config->monotone_precise_mode && + meta_->bin_type == BinType::NumericalBin) { + return could_be_splittable_; + } else { + return is_splittable_; + } + } /*! * \brief Set splittable to this histogram */ - void set_is_splittable(bool val) { is_splittable_ = val; } + void set_is_splittable(bool val) { + is_splittable_ = val; + could_be_splittable_ = val; + } static double ThresholdL1(double s, double l1) { const double reg_s = std::max(0.0, std::fabs(s) - l1); @@ -457,19 +561,27 @@ class FeatureHistogram { } } - private: - static double GetSplitGains(double sum_left_gradients, double sum_left_hessians, - double sum_right_gradients, double sum_right_hessians, - double l1, double l2, double max_delta_step, - double min_constraint, double max_constraint, int8_t monotone_constraint) { - double left_output = CalculateSplittedLeafOutput(sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step, min_constraint, max_constraint); - double right_output = CalculateSplittedLeafOutput(sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step, min_constraint, max_constraint); - if (((monotone_constraint > 0) && (left_output > right_output)) || - ((monotone_constraint < 0) && (left_output < right_output))) { - return 0; + static void CumulativeExtremum( + const double &(*extremum_function)(const double &, const double &), + bool is_direction_from_left_to_right, + std::vector &cumulative_extremum) { + if (cumulative_extremum.size() == 1) { + return; + } + +#ifdef DEBUG + CHECK(cumulative_extremum.size() != 0); +#endif + + std::size_t 
n_exts = cumulative_extremum.size(); + int step = is_direction_from_left_to_right ? 1 : -1; + std::size_t start = is_direction_from_left_to_right ? 0 : n_exts - 1; + std::size_t end = is_direction_from_left_to_right ? n_exts - 1 : 0; + + for (auto i = start; i != end; i = i + step) { + cumulative_extremum[i + step] = extremum_function( + cumulative_extremum[i + step], cumulative_extremum[i]); } - return GetLeafSplitGainGivenOutput(sum_left_gradients, sum_left_hessians, l1, l2, left_output) - + GetLeafSplitGainGivenOutput(sum_right_gradients, sum_right_hessians, l1, l2, right_output); } /*! @@ -489,6 +601,22 @@ class FeatureHistogram { return ret; } + private: + static double GetSplitGains(double sum_left_gradients, double sum_left_hessians, + double sum_right_gradients, double sum_right_hessians, + double l1, double l2, double max_delta_step, + double min_constraint_right, double max_constraint_right, + double min_constraint_left, double max_constraint_left, int8_t monotone_constraint) { + double left_output = CalculateSplittedLeafOutput(sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step, min_constraint_left, max_constraint_left); + double right_output = CalculateSplittedLeafOutput(sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step, min_constraint_right, max_constraint_right); + if (((monotone_constraint > 0) && (left_output > right_output)) || + ((monotone_constraint < 0) && (left_output < right_output))) { + return 0; + } + return GetLeafSplitGainGivenOutput(sum_left_gradients, sum_left_hessians, l1, l2, left_output) + + GetLeafSplitGainGivenOutput(sum_right_gradients, sum_right_hessians, l1, l2, right_output); + } + /*! 
* \brief Calculate the split gain based on regularized sum_gradients and sum_hessians * \param sum_gradients @@ -505,12 +633,28 @@ class FeatureHistogram { return -(2.0 * sg_l1 * output + (sum_hessians + l2) * output * output); } - void FindBestThresholdSequence(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint, - double min_gain_shift, SplitInfo* output, int dir, bool skip_default_bin, bool use_na_as_missing) { + void FindBestThresholdSequence( + double sum_gradient, double sum_hessian, data_size_t num_data, + double min_gain_shift, SplitInfo *output, int dir, bool skip_default_bin, + bool use_na_as_missing, + const std::vector &cumulative_min_constraint_left_to_right, + const std::vector &cumulative_min_constraint_right_to_left, + const std::vector &cumulative_max_constraint_left_to_right, + const std::vector &cumulative_max_constraint_right_to_left, + const std::vector &thresholds_min_constraint, + const std::vector &thresholds_max_constraint) { const int8_t bias = meta_->bias; double best_sum_left_gradient = NAN; double best_sum_left_hessian = NAN; + + // when the monotone precise mode is enabled, then the left and the right children may not + // have the same min and max constraints because constraints can depend on the thresholds + double best_min_constraint_left = NAN; + double best_max_constraint_left = NAN; + double best_min_constraint_right = NAN; + double best_max_constraint_right = NAN; + double best_gain = kMinScore; data_size_t best_left_count = 0; uint32_t best_threshold = static_cast(meta_->num_bin); @@ -522,6 +666,16 @@ class FeatureHistogram { int t = meta_->num_bin - 1 - bias - use_na_as_missing; const int t_end = 1 - bias; + unsigned int index_min_constraint_left_to_right = + thresholds_min_constraint.size() - 1; + unsigned int index_min_constraint_right_to_left = + thresholds_min_constraint.size() - 1; + unsigned int index_max_constraint_left_to_right = + thresholds_max_constraint.size() 
- 1; + unsigned int index_max_constraint_right_to_left = + thresholds_max_constraint.size() - 1; + bool update_is_necessary = !(thresholds_max_constraint.size() == 1 && + thresholds_min_constraint.size() == 1); // from right to left, and we don't need data in bin0 for (; t >= t_end; --t) { @@ -544,9 +698,61 @@ class FeatureHistogram { double sum_left_gradient = sum_gradient - sum_right_gradient; // current split gain - double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - min_constraint, max_constraint, meta_->monotone_type); + + could_be_splittable_ = true; + + // when the monotone precise mode in enabled, as t changes, the constraints applied on + // each child may change, because the constraints may depend on thresholds + if (update_is_necessary) { + while (static_cast(thresholds_min_constraint + [index_min_constraint_left_to_right]) > + t + bias - 1) { + index_min_constraint_left_to_right -= 1; + } + while (static_cast(thresholds_min_constraint + [index_min_constraint_right_to_left]) > + t + bias) { + index_min_constraint_right_to_left -= 1; + } + while (static_cast(thresholds_max_constraint + [index_max_constraint_left_to_right]) > + t + bias - 1) { + index_max_constraint_left_to_right -= 1; + } + while (static_cast(thresholds_max_constraint + [index_max_constraint_right_to_left]) > + t + bias) { + index_max_constraint_right_to_left -= 1; + } + } + +#ifdef DEBUG + CHECK(index_min_constraint_left_to_right < + thresholds_min_constraint.size()); + CHECK(index_min_constraint_right_to_left < + thresholds_min_constraint.size()); + CHECK(index_max_constraint_left_to_right < + thresholds_max_constraint.size()); + CHECK(index_max_constraint_right_to_left < + thresholds_max_constraint.size()); +#endif + + // when the algorithm goes through the thresholds we use the same index for cumulative arrays + // in both directions but 
each leaf is constrained according to the corresponding array + // the threshold is included in the left leaf + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + cumulative_min_constraint_right_to_left + [index_min_constraint_right_to_left], + cumulative_max_constraint_right_to_left + [index_max_constraint_right_to_left], + cumulative_min_constraint_left_to_right + [index_min_constraint_left_to_right], + cumulative_max_constraint_left_to_right + [index_max_constraint_left_to_right], + meta_->monotone_type); // gain with split is worse than without split if (current_gain <= min_gain_shift) continue; @@ -560,6 +766,15 @@ class FeatureHistogram { // left is <= threshold, right is > threshold. so this is t-1 best_threshold = static_cast(t - 1 + bias); best_gain = current_gain; + + best_min_constraint_right = cumulative_min_constraint_right_to_left + [index_min_constraint_right_to_left]; + best_max_constraint_right = cumulative_max_constraint_right_to_left + [index_max_constraint_right_to_left]; + best_min_constraint_left = cumulative_min_constraint_left_to_right + [index_min_constraint_left_to_right]; + best_max_constraint_left = cumulative_max_constraint_left_to_right + [index_max_constraint_left_to_right]; } } } else { @@ -582,6 +797,11 @@ class FeatureHistogram { t = -1; } + unsigned int index_min_constraint_left_to_right = 0; + unsigned int index_min_constraint_right_to_left = 0; + unsigned int index_max_constraint_left_to_right = 0; + unsigned int index_max_constraint_right_to_left = 0; + for (; t <= t_end; ++t) { // need to skip default bin if (skip_default_bin && (t + bias) == static_cast(meta_->default_bin)) { continue; } @@ -601,11 +821,35 @@ class FeatureHistogram { // if sum hessian too small if (sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) break; + could_be_splittable_ = true; + double 
sum_right_gradient = sum_gradient - sum_left_gradient; + // current split gain - double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - min_constraint, max_constraint, meta_->monotone_type); +#ifdef DEBUG + CHECK(index_min_constraint_left_to_right < + thresholds_min_constraint.size()); + CHECK(index_min_constraint_right_to_left < + thresholds_min_constraint.size()); + CHECK(index_max_constraint_left_to_right < + thresholds_max_constraint.size()); + CHECK(index_max_constraint_right_to_left < + thresholds_max_constraint.size()); +#endif + + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + cumulative_min_constraint_right_to_left + [index_min_constraint_right_to_left], + cumulative_max_constraint_right_to_left + [index_max_constraint_right_to_left], + cumulative_min_constraint_left_to_right + [index_min_constraint_left_to_right], + cumulative_max_constraint_left_to_right + [index_max_constraint_left_to_right], + meta_->monotone_type); // gain with split is worse than without split if (current_gain <= min_gain_shift) continue; @@ -618,6 +862,15 @@ class FeatureHistogram { best_sum_left_hessian = sum_left_hessian; best_threshold = static_cast(t + bias); best_gain = current_gain; + + best_max_constraint_left = cumulative_max_constraint_left_to_right + [index_max_constraint_left_to_right]; + best_min_constraint_left = cumulative_min_constraint_left_to_right + [index_min_constraint_left_to_right]; + best_max_constraint_right = cumulative_max_constraint_right_to_left + [index_max_constraint_right_to_left]; + best_min_constraint_right = cumulative_min_constraint_right_to_left + [index_min_constraint_right_to_left]; } } } @@ -625,21 +878,25 @@ class FeatureHistogram { if 
(is_splittable_ && best_gain > output->gain) { // update split information output->threshold = best_threshold; - output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - min_constraint, max_constraint); + output->left_output = CalculateSplittedLeafOutput( + best_sum_left_gradient, best_sum_left_hessian, + meta_->config->lambda_l1, meta_->config->lambda_l2, + meta_->config->max_delta_step, best_min_constraint_left, + best_max_constraint_left); output->left_count = best_left_count; output->left_sum_gradient = best_sum_left_gradient; output->left_sum_hessian = best_sum_left_hessian - kEpsilon; - output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient, - sum_hessian - best_sum_left_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - min_constraint, max_constraint); + output->right_output = CalculateSplittedLeafOutput( + sum_gradient - best_sum_left_gradient, + sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + best_min_constraint_right, best_max_constraint_right); output->right_count = num_data - best_left_count; output->right_sum_gradient = sum_gradient - best_sum_left_gradient; output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon; output->gain = best_gain; output->default_left = dir == -1; + } } @@ -648,8 +905,13 @@ class FeatureHistogram { HistogramBinEntry* data_; // std::vector data_; bool is_splittable_ = true; + bool could_be_splittable_ = true; - std::function find_best_threshold_fun_; + std::function &, std::vector &, + std::vector &, std::vector &, + const std::vector &, + const std::vector &)> find_best_threshold_fun_; }; class HistogramPool { public: diff --git a/src/treelearner/gpu_tree_learner.cpp b/src/treelearner/gpu_tree_learner.cpp index 
f279fdc7331e..12fbf746ec87 100644 --- a/src/treelearner/gpu_tree_learner.cpp +++ b/src/treelearner/gpu_tree_learner.cpp @@ -1085,6 +1085,7 @@ void GPUTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right double smaller_max = smaller_leaf_splits_->max_constraint(); double larger_min = larger_leaf_splits_->min_constraint(); double larger_max = larger_leaf_splits_->max_constraint(); + // FIXME This part of the code has not been updated smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian); larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian); smaller_leaf_splits_->SetValueConstraint(smaller_min, smaller_max); diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index d832fe84f040..7db662f437c7 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -596,26 +596,20 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( OMP_LOOP_EX_BEGIN(); if (!is_feature_used[feature_index]) { continue; } const int tid = omp_get_thread_num(); - SplitInfo smaller_split; + train_data_->FixHistogram(feature_index, smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(), smaller_leaf_splits_->num_data_in_leaf(), smaller_leaf_histogram_array_[feature_index].RawData()); int real_fidx = train_data_->RealFeatureIndex(feature_index); - smaller_leaf_histogram_array_[feature_index].FindBestThreshold( - smaller_leaf_splits_->sum_gradients(), - smaller_leaf_splits_->sum_hessians(), - smaller_leaf_splits_->num_data_in_leaf(), - smaller_leaf_splits_->min_constraint(), - smaller_leaf_splits_->max_constraint(), - &smaller_split); - smaller_split.feature = real_fidx; - if (cegb_ != nullptr) { - smaller_split.gain -= cegb_->DetlaGain(feature_index, real_fidx, smaller_leaf_splits_->LeafIndex(), 
smaller_leaf_splits_->num_data_in_leaf(), smaller_split); - } - if (smaller_split > smaller_best[tid] && smaller_node_used_features[feature_index]) { - smaller_best[tid] = smaller_split; - } + + ComputeBestSplitForFeature(smaller_leaf_splits_->sum_gradients(), + smaller_leaf_splits_->sum_hessians(), + smaller_leaf_splits_->num_data_in_leaf(), + feature_index, smaller_leaf_histogram_array_, + smaller_best, smaller_leaf_splits_->LeafIndex(), + smaller_leaf_splits_->depth(), tid, real_fidx, + tree); // only has root leaf if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) { continue; } @@ -626,22 +620,14 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( larger_leaf_splits_->num_data_in_leaf(), larger_leaf_histogram_array_[feature_index].RawData()); } - SplitInfo larger_split; - // find best threshold for larger child - larger_leaf_histogram_array_[feature_index].FindBestThreshold( - larger_leaf_splits_->sum_gradients(), - larger_leaf_splits_->sum_hessians(), - larger_leaf_splits_->num_data_in_leaf(), - larger_leaf_splits_->min_constraint(), - larger_leaf_splits_->max_constraint(), - &larger_split); - larger_split.feature = real_fidx; - if (cegb_ != nullptr) { - larger_split.gain -= cegb_->DetlaGain(feature_index, real_fidx, larger_leaf_splits_->LeafIndex(), larger_leaf_splits_->num_data_in_leaf(), larger_split); - } - if (larger_split > larger_best[tid] && larger_node_used_features[feature_index]) { - larger_best[tid] = larger_split; - } + + ComputeBestSplitForFeature(larger_leaf_splits_->sum_gradients(), + larger_leaf_splits_->sum_hessians(), + larger_leaf_splits_->num_data_in_leaf(), + feature_index, larger_leaf_histogram_array_, + larger_best, larger_leaf_splits_->LeafIndex(), + larger_leaf_splits_->depth(), tid, real_fidx, + tree); OMP_LOOP_EX_END(); } OMP_THROW_EX(); @@ -839,6 +825,7 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* 
left_leaf, int* right_leaf) { const SplitInfo& best_split_info = best_split_per_leaf_[best_leaf]; + double previous_leaf_output = tree->LeafOutput(best_leaf); const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature); if (cegb_ != nullptr) { cegb_->UpdateLeafBestSplits(tree, best_leaf, &best_split_info, &best_split_per_leaf_); @@ -896,8 +883,6 @@ void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* ri #ifdef DEBUG CHECK(best_split_info.left_count == data_partition_->leaf_count(best_leaf)); #endif - auto p_left = smaller_leaf_splits_.get(); - auto p_right = larger_leaf_splits_.get(); // init the leaves that used on next iteration int depth = tree->leaf_depth(*left_leaf); #ifdef DEBUG @@ -918,18 +903,37 @@ void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* ri best_split_info.left_sum_gradient, best_split_info.left_sum_hessian, depth); } - p_left->SetValueConstraint(best_split_info.min_constraint, best_split_info.max_constraint); - p_right->SetValueConstraint(best_split_info.min_constraint, best_split_info.max_constraint); - if (is_numerical_split) { - double mid = (best_split_info.left_output + best_split_info.right_output) / 2.0f; - if (best_split_info.monotone_type < 0) { - p_left->SetValueConstraint(mid, best_split_info.max_constraint); - p_right->SetValueConstraint(best_split_info.min_constraint, mid); - } else if (best_split_info.monotone_type > 0) { - p_left->SetValueConstraint(best_split_info.min_constraint, mid); - p_right->SetValueConstraint(mid, best_split_info.max_constraint); + + // when the monotone precise mode is disabled it is very easy to compute the constraints of + // the children of a leaf, but when it is enabled, one needs to go through the tree to do so, + // and it is done directly before computing best splits + if (!config_->monotone_precise_mode) { + constraints_per_leaf_[*right_leaf] = constraints_per_leaf_[*left_leaf]; + if (is_numerical_split) { + // 
depending on the monotone type we set constraints on the future splits + // these constraints may be updated later in the algorithm + if (best_split_info.monotone_type < 0) { + constraints_per_leaf_[*left_leaf] + .SetMinConstraint(best_split_info.right_output); + constraints_per_leaf_[*right_leaf] + .SetMaxConstraint(best_split_info.left_output); + } else if (best_split_info.monotone_type > 0) { + constraints_per_leaf_[*left_leaf] + .SetMaxConstraint(best_split_info.right_output); + constraints_per_leaf_[*right_leaf] + .SetMinConstraint(best_split_info.left_output); + } } } + + // if there is a monotone split above, we need to make sure the new + // values don't clash with existing constraints in the subtree, + // and if they do, the existing splits need to be updated + if (tree->leaf_is_in_monotone_subtree(*right_leaf)) { + GoUpToFindLeavesToUpdate(tree, tree->leaf_parent(*right_leaf), + inner_feature_index, best_split_info, + previous_leaf_output, best_split_info.threshold); + } } // this function is only used if the monotone precise mode is enabled @@ -1638,7 +1642,11 @@ void SerialTreeLearner::ComputeBestSplitForFeature( #endif SplitInfo new_split; - // FIXME Need to call histogram_array_[feature_index].FindBestThreshold + histogram_array_[feature_index].FindBestThreshold( + sum_gradient, sum_hessian, num_data, &new_split, min_constraints[tid], + dummy_min_constraints[tid], max_constraints[tid], + dummy_max_constraints[tid], thresholds_min_constraints[tid], + thresholds_max_constraints[tid]); if (tree->leaf_is_in_monotone_subtree(leaf_index)) { InitializeConstraints(tid); diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp index bd6ecf8e4451..d295e14e96be 100644 --- a/src/treelearner/voting_parallel_tree_learner.cpp +++ b/src/treelearner/voting_parallel_tree_learner.cpp @@ -303,13 +303,17 @@ void VotingParallelTreeLearner::FindBestSplits(const Tree* tree) 
this->smaller_leaf_splits_->num_data_in_leaf(), this->smaller_leaf_histogram_array_[feature_index].RawData()); - this->smaller_leaf_histogram_array_[feature_index].FindBestThreshold( - this->smaller_leaf_splits_->sum_gradients(), - this->smaller_leaf_splits_->sum_hessians(), - this->smaller_leaf_splits_->num_data_in_leaf(), - this->smaller_leaf_splits_->min_constraint(), - this->smaller_leaf_splits_->max_constraint(), - &smaller_bestsplit_per_features[feature_index]); + // FIXME Fill the vectors with the actual constraints and thresholds + std::vector max_constraints; + std::vector min_constraints; + std::vector thresholds; + this->smaller_leaf_histogram_array_[feature_index] + .FindBestThreshold(this->smaller_leaf_splits_->sum_gradients(), + this->smaller_leaf_splits_->sum_hessians(), + this->smaller_leaf_splits_->num_data_in_leaf(), + &smaller_bestsplit_per_features[feature_index], + max_constraints, min_constraints, max_constraints, + min_constraints, thresholds, thresholds); smaller_bestsplit_per_features[feature_index].feature = real_feature_index; // only has root leaf if (this->larger_leaf_splits_ == nullptr || this->larger_leaf_splits_->LeafIndex() < 0) { continue; } @@ -322,13 +326,14 @@ void VotingParallelTreeLearner::FindBestSplits(const Tree* tree) this->larger_leaf_histogram_array_[feature_index].RawData()); } // find best threshold for larger child - this->larger_leaf_histogram_array_[feature_index].FindBestThreshold( - this->larger_leaf_splits_->sum_gradients(), - this->larger_leaf_splits_->sum_hessians(), - this->larger_leaf_splits_->num_data_in_leaf(), - this->larger_leaf_splits_->min_constraint(), - this->larger_leaf_splits_->max_constraint(), - &larger_bestsplit_per_features[feature_index]); + // FIXME Fill the vectors with the actual constraints and thresholds + this->larger_leaf_histogram_array_[feature_index] + .FindBestThreshold(this->larger_leaf_splits_->sum_gradients(), + this->larger_leaf_splits_->sum_hessians(), + 
this->larger_leaf_splits_->num_data_in_leaf(), + &larger_bestsplit_per_features[feature_index], + max_constraints, min_constraints, max_constraints, + min_constraints, thresholds, thresholds); larger_bestsplit_per_features[feature_index].feature = real_feature_index; OMP_LOOP_EX_END(); } @@ -414,13 +419,16 @@ void VotingParallelTreeLearner::FindBestSplitsFromHistograms( smaller_leaf_histogram_array_global_[feature_index].RawData()); // find best threshold + // FIXME Fill the vectors with the actual constraints and thresholds + std::vector max_constraints; + std::vector min_constraints; + std::vector thresholds; smaller_leaf_histogram_array_global_[feature_index].FindBestThreshold( - smaller_leaf_splits_global_->sum_gradients(), - smaller_leaf_splits_global_->sum_hessians(), - GetGlobalDataCountInLeaf(smaller_leaf_splits_global_->LeafIndex()), - smaller_leaf_splits_global_->min_constraint(), - smaller_leaf_splits_global_->max_constraint(), - &smaller_split); + smaller_leaf_splits_global_->sum_gradients(), + smaller_leaf_splits_global_->sum_hessians(), + GetGlobalDataCountInLeaf(smaller_leaf_splits_global_->LeafIndex()), + &smaller_split, max_constraints, min_constraints, max_constraints, + min_constraints, thresholds, thresholds); smaller_split.feature = real_feature_index; if (smaller_split > smaller_bests_per_thread[tid] && smaller_node_used_features[feature_index]) { smaller_bests_per_thread[tid] = smaller_split; @@ -438,13 +446,17 @@ void VotingParallelTreeLearner::FindBestSplitsFromHistograms( larger_leaf_histogram_array_global_[feature_index].RawData()); // find best threshold + // FIXME Fill the vectors with the actual constraints and thresholds + std::vector max_constraints; + std::vector min_constraints; + std::vector thresholds; + larger_leaf_histogram_array_global_[feature_index].FindBestThreshold( - larger_leaf_splits_global_->sum_gradients(), - larger_leaf_splits_global_->sum_hessians(), - 
GetGlobalDataCountInLeaf(larger_leaf_splits_global_->LeafIndex()), - larger_leaf_splits_global_->min_constraint(), - larger_leaf_splits_global_->max_constraint(), - &larger_split); + larger_leaf_splits_global_->sum_gradients(), + larger_leaf_splits_global_->sum_hessians(), + GetGlobalDataCountInLeaf(larger_leaf_splits_global_->LeafIndex()), + &larger_split, max_constraints, min_constraints, max_constraints, + min_constraints, thresholds, thresholds); larger_split.feature = real_feature_index; if (larger_split > larger_best_per_thread[tid] && larger_node_used_features[feature_index]) { larger_best_per_thread[tid] = larger_split; @@ -488,8 +500,6 @@ void VotingParallelTreeLearner::Split(Tree* tree, int best_Leaf, // set the global number of data for leaves global_data_count_in_leaf_[*left_leaf] = best_split_info.left_count; global_data_count_in_leaf_[*right_leaf] = best_split_info.right_count; - auto p_left = smaller_leaf_splits_global_.get(); - auto p_right = larger_leaf_splits_global_.get(); // init the global sumup info int depth = tree->leaf_depth(*left_leaf); #ifdef DEBUG @@ -509,22 +519,6 @@ void VotingParallelTreeLearner::Split(Tree* tree, int best_Leaf, larger_leaf_splits_global_->Init(*left_leaf, this->data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian, depth); - p_left = larger_leaf_splits_global_.get(); - p_right = smaller_leaf_splits_global_.get(); - } - const int inner_feature_index = this->train_data_->InnerFeatureIndex(best_split_info.feature); - bool is_numerical_split = this->train_data_->FeatureBinMapper(inner_feature_index)->bin_type() == BinType::NumericalBin; - p_left->SetValueConstraint(best_split_info.min_constraint, best_split_info.max_constraint); - p_right->SetValueConstraint(best_split_info.min_constraint, best_split_info.max_constraint); - if (is_numerical_split) { - double mid = (best_split_info.left_output + best_split_info.right_output) / 2.0f; - if (best_split_info.monotone_type < 0) { - 
p_left->SetValueConstraint(mid, best_split_info.max_constraint); - p_right->SetValueConstraint(best_split_info.min_constraint, mid); - } else if (best_split_info.monotone_type > 0) { - p_left->SetValueConstraint(best_split_info.min_constraint, mid); - p_right->SetValueConstraint(mid, best_split_info.max_constraint); - } } } From 3ea72afee3f7d46390d0185a6c4d67b4d7195bb8 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Wed, 26 Jun 2019 12:44:03 +0100 Subject: [PATCH 13/45] Removed old constraints that are not used anymore. --- src/treelearner/leaf_splits.hpp | 21 --------------------- src/treelearner/split_info.hpp | 10 ---------- 2 files changed, 31 deletions(-) diff --git a/src/treelearner/leaf_splits.hpp b/src/treelearner/leaf_splits.hpp index c22872ec8ce1..b63560340c53 100644 --- a/src/treelearner/leaf_splits.hpp +++ b/src/treelearner/leaf_splits.hpp @@ -45,16 +45,8 @@ class LeafSplits { data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_); sum_gradients_ = sum_gradients; sum_hessians_ = sum_hessians; - min_val_ = -std::numeric_limits::max(); - max_val_ = std::numeric_limits::max(); } - void SetValueConstraint(double min, double max) { - min_val_ = min; - max_val_ = max; - } - - /*! * \brief Init splits on current leaf, it will traverse all data to sum up the results * \param gradients @@ -74,8 +66,6 @@ class LeafSplits { } sum_gradients_ = tmp_sum_gradients; sum_hessians_ = tmp_sum_hessians; - min_val_ = -std::numeric_limits::max(); - max_val_ = std::numeric_limits::max(); } /*! @@ -100,8 +90,6 @@ class LeafSplits { } sum_gradients_ = tmp_sum_gradients; sum_hessians_ = tmp_sum_hessians; - min_val_ = -std::numeric_limits::max(); - max_val_ = std::numeric_limits::max(); } @@ -115,8 +103,6 @@ class LeafSplits { leaf_index_ = 0; sum_gradients_ = sum_gradients; sum_hessians_ = sum_hessians; - min_val_ = -std::numeric_limits::max(); - max_val_ = std::numeric_limits::max(); } /*! 
@@ -127,8 +113,6 @@ class LeafSplits { leaf_index_ = -1; data_indices_ = nullptr; num_data_in_leaf_ = 0; - min_val_ = -std::numeric_limits::max(); - max_val_ = std::numeric_limits::max(); } @@ -144,9 +128,6 @@ class LeafSplits { /*! \brief Get sum of hessians of current leaf */ double sum_hessians() const { return sum_hessians_; } - double max_constraint() const { return max_val_; } - - double min_constraint() const { return min_val_; } int depth() const { return depth_; } /*! \brief Get indices of data of current leaf */ @@ -166,8 +147,6 @@ class LeafSplits { double sum_hessians_; /*! \brief indices of data of current leaf */ const data_size_t* data_indices_; - double min_val_; - double max_val_; int depth_; }; diff --git a/src/treelearner/split_info.hpp b/src/treelearner/split_info.hpp index 3afa72a0f4a3..86653522dd04 100644 --- a/src/treelearner/split_info.hpp +++ b/src/treelearner/split_info.hpp @@ -48,8 +48,6 @@ struct SplitInfo { /*! \brief True if default split is left */ bool default_left = true; int8_t monotone_type = 0; - double min_constraint = -std::numeric_limits::max(); - double max_constraint = std::numeric_limits::max(); inline static int Size(int max_cat_threshold) { return 2 * sizeof(int) + sizeof(uint32_t) + sizeof(bool) + sizeof(double) * 9 + sizeof(data_size_t) * 2 + max_cat_threshold * sizeof(uint32_t) + sizeof(int8_t); } @@ -81,10 +79,6 @@ struct SplitInfo { buffer += sizeof(default_left); std::memcpy(buffer, &monotone_type, sizeof(monotone_type)); buffer += sizeof(monotone_type); - std::memcpy(buffer, &min_constraint, sizeof(min_constraint)); - buffer += sizeof(min_constraint); - std::memcpy(buffer, &max_constraint, sizeof(max_constraint)); - buffer += sizeof(max_constraint); std::memcpy(buffer, &num_cat_threshold, sizeof(num_cat_threshold)); buffer += sizeof(num_cat_threshold); std::memcpy(buffer, cat_threshold.data(), sizeof(uint32_t) * num_cat_threshold); @@ -117,10 +111,6 @@ struct SplitInfo { buffer += sizeof(default_left); 
std::memcpy(&monotone_type, buffer, sizeof(monotone_type)); buffer += sizeof(monotone_type); - std::memcpy(&min_constraint, buffer, sizeof(min_constraint)); - buffer += sizeof(min_constraint); - std::memcpy(&max_constraint, buffer, sizeof(max_constraint)); - buffer += sizeof(max_constraint); std::memcpy(&num_cat_threshold, buffer, sizeof(num_cat_threshold)); buffer += sizeof(num_cat_threshold); cat_threshold.resize(num_cat_threshold); From 692af0d331ab329f222a8fa845f59cdc215a0330 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Wed, 26 Jun 2019 12:47:12 +0100 Subject: [PATCH 14/45] Added a function to refit leaves at the end of the training. --- src/treelearner/serial_tree_learner.cpp | 76 +++++++++++++++++++++++++ src/treelearner/serial_tree_learner.h | 1 + 2 files changed, 77 insertions(+) diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 7db662f437c7..736af729d617 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -25,6 +25,7 @@ std::chrono::duration hist_time; std::chrono::duration find_split_time; std::chrono::duration split_time; std::chrono::duration ordered_bin_time; +std::chrono::duration refit_leaves_time; #endif // TIMETAG double EPS = 1e-12; @@ -47,6 +48,7 @@ SerialTreeLearner::~SerialTreeLearner() { Log::Info("SerialTreeLearner::find_split costs %f", find_split_time * 1e-3); Log::Info("SerialTreeLearner::split costs %f", split_time * 1e-3); Log::Info("SerialTreeLearner::ordered_bin costs %f", ordered_bin_time * 1e-3); + Log::Info("SerialTreeLearner::refit_leaves costs %f", refit_leaves_time * 1e-3); #endif } @@ -285,10 +287,84 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians #endif cur_depth = std::max(cur_depth, tree->leaf_depth(left_leaf)); } + #ifdef TIMETAG + start_time = std::chrono::steady_clock::now(); + #endif + // when the monotone precise mode is enabled, some splits might unconstrain leaves in other 
branches + // if these leaves are not split before the tree is being fully built, then it might be possible to + // move their internal value (because they have been unconstrained) to achieve a better gain + if (config_->monotone_precise_mode) { + ReFitLeaves(tree.get()); + } + #ifdef TIMETAG + refit_leaves_time += std::chrono::steady_clock::now() - start_time; + #endif Log::Debug("Trained a tree with leaves = %d and max_depth = %d", tree->num_leaves(), cur_depth); return tree.release(); } +void SerialTreeLearner::ReFitLeaves(Tree *tree) { + CHECK(data_partition_->num_leaves() >= tree->num_leaves()); + bool might_be_something_to_update = true; + std::vector sum_grad(tree->num_leaves(), 0.0f); + std::vector sum_hess(tree->num_leaves(), kEpsilon); + OMP_INIT_EX(); + // first we need to compute gradients and hessians for each leaf +#pragma omp parallel for schedule(static) + for (int i = 0; i < tree->num_leaves(); ++i) { + OMP_LOOP_EX_BEGIN(); + if (!tree->leaf_is_in_monotone_subtree(i)) { + continue; + } + data_size_t cnt_leaf_data = 0; + auto tmp_idx = data_partition_->GetIndexOnLeaf(i, &cnt_leaf_data); + for (data_size_t j = 0; j < cnt_leaf_data; ++j) { + auto idx = tmp_idx[j]; + sum_grad[i] += gradients_[idx]; + sum_hess[i] += hessians_[idx]; + } + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); + + while (might_be_something_to_update) { + might_be_something_to_update = false; + // this loop can't be multi-threaded easily because we could break + // monotonicity in the tree + for (int i = 0; i < tree->num_leaves(); ++i) { + if (!tree->leaf_is_in_monotone_subtree(i)) { + continue; + } + // we compute the constraints, and we only need one min and one max constraint this time + // because we are not going to split the leaf, we may just change its value + ComputeConstraintsPerThreshold(-1, tree, ~i, 0, false); + double min_constraint = min_constraints[0][0]; + double max_constraint = max_constraints[0][0]; +#ifdef DEBUG + CHECK(tree->LeafOutput(i) >= min_constraint); + 
CHECK(tree->LeafOutput(i) <= max_constraint); +#endif + double new_constrained_output = + FeatureHistogram::CalculateSplittedLeafOutput( + sum_grad[i], sum_hess[i], config_->lambda_l1, config_->lambda_l2, + config_->max_delta_step, min_constraint, max_constraint); + double old_output = tree->LeafOutput(i); + // a more accurate value may not immediately result in a loss reduction because of the shrinkage rate + if (fabs(old_output - new_constrained_output) > EPS) { + might_be_something_to_update = true; + tree->SetLeafOutput(i, new_constrained_output); + } + + // we reset the constraints + min_constraints[0][0] = -std::numeric_limits::max(); + max_constraints[0][0] = std::numeric_limits::max(); + thresholds[0].clear(); + is_in_right_split[0].clear(); + features[0].clear(); + } + } +} + Tree* SerialTreeLearner::FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t *hessians) const { auto tree = std::unique_ptr(new Tree(*old_tree)); CHECK(data_partition_->num_leaves() >= tree->num_leaves()); diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index e709295a867d..7208fa7a1b8d 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -205,6 +205,7 @@ class SerialTreeLearner: public TreeLearner { std::pair LeftRightContainsRelevantInformation(bool maximum, int inner_feature, bool split_feature_is_inner_feature); + void ReFitLeaves(Tree* tree); void InitializeConstraints(unsigned int tid); From 7aaec4a944f93624bcdb7e1e832e4892f9c87ffb Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Wed, 26 Jun 2019 12:48:45 +0100 Subject: [PATCH 15/45] Updated tests. 
--- tests/python_package_test/test_engine.py | 189 ++++++++++++++++++++--- 1 file changed, 171 insertions(+), 18 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 51c99494e68e..5f58e8846433 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -856,45 +856,198 @@ def test_init_with_subset(self): self.assertEqual(subset_data_3.get_data(), "lgb_train_data.bin") self.assertEqual(subset_data_4.get_data(), "lgb_train_data.bin") - def test_monotone_constraint(self): + def test_monotone_constraints_categorical_feature(self): def is_increasing(y): return (np.diff(y) >= 0.0).all() def is_decreasing(y): return (np.diff(y) <= 0.0).all() - def is_correctly_constrained(learner): - n = 200 + def is_correctly_constrained(learner, number_categories): + n = 1000 variable_x = np.linspace(0, 1, n).reshape((n, 1)) fixed_xs_values = np.linspace(0, 1, n) for i in range(n): fixed_x = fixed_xs_values[i] * np.ones((n, 1)) - monotonically_increasing_x = np.column_stack((variable_x, fixed_x)) + monotonically_increasing_x = np.column_stack((variable_x, fixed_x, + (fixed_x * number_categories).astype(int))) monotonically_increasing_y = learner.predict(monotonically_increasing_x) - monotonically_decreasing_x = np.column_stack((fixed_x, variable_x)) + monotonically_decreasing_x = np.column_stack((fixed_x, variable_x, + (fixed_x * number_categories).astype(int))) monotonically_decreasing_y = learner.predict(monotonically_decreasing_x) - if not (is_increasing(monotonically_increasing_y) and is_decreasing(monotonically_decreasing_y)): + if not (is_increasing(monotonically_increasing_y) and + is_decreasing(monotonically_decreasing_y)): return False return True + number_of_trials = 10 + for _ in range(number_of_trials): + for monotone_precise_mode in [False, True]: + number_categories = 2 ** (np.random.randint(1, 12)) + number_of_dpoints = 3000 + x1_positively_correlated_with_y = 
np.random.random(size=number_of_dpoints) + x2_negatively_correlated_with_y = np.random.random(size=number_of_dpoints) + x3_categorical = (np.random.random(size=number_of_dpoints) * number_categories).astype(int) + x = np.column_stack( + (x1_positively_correlated_with_y, x2_negatively_correlated_with_y, x3_categorical)) + zs = np.random.normal(loc=0.0, scale=0.01, size=number_of_dpoints) + scales = 10. * (np.random.random(6) + 0.5) + y = (scales[0] * x1_positively_correlated_with_y + + np.sin(scales[1] * np.pi * x1_positively_correlated_with_y) + - scales[2] * x2_negatively_correlated_with_y + - np.cos(scales[3] * np.pi * x2_negatively_correlated_with_y) + - scales[4] * x3_categorical + - np.cos(scales[5] * np.pi * x3_categorical) + + zs) + trainset = lgb.Dataset(x, label=y) + params = { + 'min_data': 20, + 'num_leaves': 10, + "num_threads": 1, + 'monotone_constraints': '1,-1,0', + "categorical_feature": [2], + "monotone_precise_mode": monotone_precise_mode, + "use_missing": False + } + constrained_model = lgb.train(params, trainset) + self.assertTrue(is_correctly_constrained(constrained_model, number_categories)) + + # test if categorical features and monotone features can both be in a dataset without causing issues + def generate_trainset_for_monotone_constraints_tests(self): number_of_dpoints = 3000 x1_positively_correlated_with_y = np.random.random(size=number_of_dpoints) x2_negatively_correlated_with_y = np.random.random(size=number_of_dpoints) - x = np.column_stack((x1_positively_correlated_with_y, x2_negatively_correlated_with_y)) + x3_negatively_correlated_with_y = np.random.random(size=number_of_dpoints) + x = np.column_stack( + (x1_positively_correlated_with_y, x2_negatively_correlated_with_y, x3_negatively_correlated_with_y)) zs = np.random.normal(loc=0.0, scale=0.01, size=number_of_dpoints) - y = (5 * x1_positively_correlated_with_y - + np.sin(10 * np.pi * x1_positively_correlated_with_y) - - 5 * x2_negatively_correlated_with_y - - np.cos(10 * np.pi * 
x2_negatively_correlated_with_y) + scales = 10. * (np.random.random(6) + 0.5) + y = (scales[0] * x1_positively_correlated_with_y + + np.sin(scales[1] * np.pi * x1_positively_correlated_with_y) + - scales[2] * x2_negatively_correlated_with_y + - np.cos(scales[3] * np.pi * x2_negatively_correlated_with_y) + - scales[4] * x3_negatively_correlated_with_y + - np.cos(scales[5] * np.pi * x3_negatively_correlated_with_y) + zs) trainset = lgb.Dataset(x, label=y) - params = { - 'min_data': 20, - 'num_leaves': 20, - 'monotone_constraints': '1,-1' - } - constrained_model = lgb.train(params, trainset) - self.assertTrue(is_correctly_constrained(constrained_model)) + return trainset + + def test_monotone_constraints(self): + def is_increasing(y): + return (np.diff(y) >= 0.0).all() + + def is_decreasing(y): + return (np.diff(y) <= 0.0).all() + + def is_non_monotone(y): + return (np.diff(y) < 0.0).any() and (np.diff(y) > 0.0).any() + + def is_correctly_constrained(learner): + n = 1000 + variable_x = np.linspace(0, 1, n).reshape((n, 1)) + fixed_xs_values = np.linspace(0, 1, n) + for i in range(n): + fixed_x = fixed_xs_values[i] * np.ones((n, 1)) + monotonically_increasing_x = np.column_stack((variable_x, fixed_x, fixed_x)) + monotonically_increasing_y = learner.predict(monotonically_increasing_x) + monotonically_decreasing_x = np.column_stack((fixed_x, variable_x, fixed_x)) + monotonically_decreasing_y = learner.predict(monotonically_decreasing_x) + non_monotone_x = np.column_stack((fixed_x, fixed_x, variable_x)) + non_monotone_y = learner.predict(non_monotone_x) + if not (is_increasing(monotonically_increasing_y) and + is_decreasing(monotonically_decreasing_y) and + is_non_monotone(non_monotone_y)): + return False + return True + + number_of_trials = 10 + for _ in range(number_of_trials): + for monotone_precise_mode in [False, True]: + trainset = self.generate_trainset_for_monotone_constraints_tests() + params = { + 'min_data': 20, + 'num_leaves': 20, + 'monotone_constraints': 
'1,-1,0', + "monotone_precise_mode": monotone_precise_mode, + "use_missing": False + } + constrained_model = lgb.train(params, trainset) + self.assertTrue(is_correctly_constrained(constrained_model)) + + # test if the monotone penalty is working + def test_monotone_penalty(self): + def are_first_splits_non_monotone(tree, n, monotone_constraints): + if n <= 0: + return True + if "leaf_value" in tree: + return True + if monotone_constraints[tree["split_feature"]] != 0: + return False + return (are_first_splits_non_monotone(tree["left_child"], n - 1, monotone_constraints) and + are_first_splits_non_monotone(tree["right_child"], n - 1, monotone_constraints)) + + def are_there_monotone_splits(tree, monotone_constraints): + if "leaf_value" in tree: + return False + if monotone_constraints[tree["split_feature"]] != 0: + return True + return (are_there_monotone_splits(tree["left_child"], monotone_constraints) or + are_there_monotone_splits(tree["right_child"], monotone_constraints)) + + number_of_trials = 10 + for _ in range(number_of_trials): + for monotone_precise_mode in [False, True]: + penalization_parameter = np.random.random() * 3 + trainset = self.generate_trainset_for_monotone_constraints_tests() + monotone_constraints = [1, -1, 0] + params = { + 'min_data': 20, + 'num_leaves': 100, + 'monotone_constraints': monotone_constraints, + 'monotone_penalty': penalization_parameter, + "monotone_precise_mode": monotone_precise_mode, + "use_missing": False + } + constrained_model = lgb.train(params, trainset, 10) + dumped_model = constrained_model.dump_model()["tree_info"] + for tree in dumped_model: + self.assert_(are_first_splits_non_monotone(tree["tree_structure"], int(penalization_parameter), + monotone_constraints)) + self.assert_(are_there_monotone_splits(tree["tree_structure"], monotone_constraints)) + + # test if a penalty as high as the depth indeed prohibits all monotone splits + def test_monotone_penalty_max(self): + number_of_trials = 10 + for _ in 
range(number_of_trials): + for monotone_precise_mode in [False, True]: + max_depth = 5 + penalization_parameter = max_depth - 1e-10 + trainset_constrained_model = self.generate_trainset_for_monotone_constraints_tests() + x = trainset_constrained_model.data + y = trainset_constrained_model.label + x3_negatively_correlated_with_y = x[:, 2] + monotone_constraints = [1, -1, 0] + params_constrained_model = { + 'min_data': 20, + 'num_leaves': 20, + 'monotone_constraints': monotone_constraints, + 'monotone_penalty': penalization_parameter, + "max_depth": max_depth, + "monotone_precise_mode": monotone_precise_mode, + "use_missing": False + } + constrained_model = lgb.train(params_constrained_model, trainset_constrained_model, 10) + + trainset_unconstrained_model = lgb.Dataset(x3_negatively_correlated_with_y.reshape(-1, 1), label=y) + params_unconstrained_model = { + 'min_data': 20, + 'num_leaves': 20, + "max_depth": max_depth + } + unconstrained_model = lgb.train(params_unconstrained_model, trainset_unconstrained_model, 10) + + self.assert_((constrained_model.predict(x) == + unconstrained_model.predict(x3_negatively_correlated_with_y.reshape(-1, 1))).all()) def test_max_bin_by_feature(self): col1 = np.arange(0, 100)[:, np.newaxis] From 21b8e320430d0d9b33b094ff42c189379a323ee4 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Tue, 6 Aug 2019 10:48:56 +0100 Subject: [PATCH 16/45] Small bug fix. 
--- src/treelearner/serial_tree_learner.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 736af729d617..8dc5497fe641 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -1549,8 +1549,10 @@ void SerialTreeLearner::UpdateBestSplitsFromHistograms(SplitInfo &split, if (!is_feature_used_[feature_index]) continue; if (!histogram_array_[feature_index].is_splittable()) { - constraints_per_leaf_[leaf].are_actual_constraints_worse[feature_index] = - false; + if (config_->monotone_precise_mode) { + constraints_per_leaf_[leaf] + .are_actual_constraints_worse[feature_index] = false; + } continue; } From c8cea6f5df9803a0a74241c4411fa9d891a6fa0d Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Wed, 7 Aug 2019 15:55:24 +0100 Subject: [PATCH 17/45] Made testing time for new tests more reasonable. --- tests/python_package_test/test_engine.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 5f58e8846433..e9e47145bc6a 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -865,9 +865,10 @@ def is_decreasing(y): def is_correctly_constrained(learner, number_categories): n = 1000 + iterations = 10 variable_x = np.linspace(0, 1, n).reshape((n, 1)) fixed_xs_values = np.linspace(0, 1, n) - for i in range(n): + for i in range(iterations): fixed_x = fixed_xs_values[i] * np.ones((n, 1)) monotonically_increasing_x = np.column_stack((variable_x, fixed_x, (fixed_x * number_categories).astype(int))) @@ -943,10 +944,11 @@ def is_non_monotone(y): return (np.diff(y) < 0.0).any() and (np.diff(y) > 0.0).any() def is_correctly_constrained(learner): + iterations = 10 n = 1000 variable_x = np.linspace(0, 1, n).reshape((n, 1)) fixed_xs_values = np.linspace(0, 1, n) - for i in 
range(n): + for i in range(iterations): fixed_x = fixed_xs_values[i] * np.ones((n, 1)) monotonically_increasing_x = np.column_stack((variable_x, fixed_x, fixed_x)) monotonically_increasing_y = learner.predict(monotonically_increasing_x) From 1b0713dbbde99d75b3609f68a894064fdb514385 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Mon, 9 Sep 2019 16:59:43 +0100 Subject: [PATCH 18/45] Removed code specific to the slow method. --- src/treelearner/serial_tree_learner.cpp | 325 +----------------------- src/treelearner/serial_tree_learner.h | 20 -- 2 files changed, 2 insertions(+), 343 deletions(-) diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 8dc5497fe641..55b72a5bd1ad 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -79,8 +79,7 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian // when the monotone precise mode is enabled, we need to store // more constraints; hence the constructors are different if (config_->monotone_precise_mode) { - constraints_per_leaf_.resize(config_->num_leaves, - Constraints(num_features_)); + ; } else { constraints_per_leaf_.resize(config_->num_leaves, Constraints()); @@ -287,83 +286,10 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians #endif cur_depth = std::max(cur_depth, tree->leaf_depth(left_leaf)); } - #ifdef TIMETAG - start_time = std::chrono::steady_clock::now(); - #endif - // when the monotone precise mode is enabled, some splits might unconstrain leaves in other branches - // if these leaves are not split before the tree is being fully built, then it might be possible to - // move their internal value (because they have been unconstrained) to achieve a better gain - if (config_->monotone_precise_mode) { - ReFitLeaves(tree.get()); - } - #ifdef TIMETAG - refit_leaves_time += std::chrono::steady_clock::now() - start_time; - #endif Log::Debug("Trained a tree with 
leaves = %d and max_depth = %d", tree->num_leaves(), cur_depth); return tree.release(); } -void SerialTreeLearner::ReFitLeaves(Tree *tree) { - CHECK(data_partition_->num_leaves() >= tree->num_leaves()); - bool might_be_something_to_update = true; - std::vector sum_grad(tree->num_leaves(), 0.0f); - std::vector sum_hess(tree->num_leaves(), kEpsilon); - OMP_INIT_EX(); - // first we need to compute gradients and hessians for each leaf -#pragma omp parallel for schedule(static) - for (int i = 0; i < tree->num_leaves(); ++i) { - OMP_LOOP_EX_BEGIN(); - if (!tree->leaf_is_in_monotone_subtree(i)) { - continue; - } - data_size_t cnt_leaf_data = 0; - auto tmp_idx = data_partition_->GetIndexOnLeaf(i, &cnt_leaf_data); - for (data_size_t j = 0; j < cnt_leaf_data; ++j) { - auto idx = tmp_idx[j]; - sum_grad[i] += gradients_[idx]; - sum_hess[i] += hessians_[idx]; - } - OMP_LOOP_EX_END(); - } - OMP_THROW_EX(); - - while (might_be_something_to_update) { - might_be_something_to_update = false; - // this loop can't be multi-threaded easily because we could break - // monotonicity in the tree - for (int i = 0; i < tree->num_leaves(); ++i) { - if (!tree->leaf_is_in_monotone_subtree(i)) { - continue; - } - // we compute the constraints, and we only need one min and one max constraint this time - // because we are not going to split the leaf, we may just change its value - ComputeConstraintsPerThreshold(-1, tree, ~i, 0, false); - double min_constraint = min_constraints[0][0]; - double max_constraint = max_constraints[0][0]; -#ifdef DEBUG - CHECK(tree->LeafOutput(i) >= min_constraint); - CHECK(tree->LeafOutput(i) <= max_constraint); -#endif - double new_constrained_output = - FeatureHistogram::CalculateSplittedLeafOutput( - sum_grad[i], sum_hess[i], config_->lambda_l1, config_->lambda_l2, - config_->max_delta_step, min_constraint, max_constraint); - double old_output = tree->LeafOutput(i); - // a more accurate value may not immediately result in a loss reduction because of the shrinkage 
rate - if (fabs(old_output - new_constrained_output) > EPS) { - might_be_something_to_update = true; - tree->SetLeafOutput(i, new_constrained_output); - } - - // we reset the constraints - min_constraints[0][0] = -std::numeric_limits::max(); - max_constraints[0][0] = std::numeric_limits::max(); - thresholds[0].clear(); - is_in_right_split[0].clear(); - features[0].clear(); - } - } -} Tree* SerialTreeLearner::FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t *hessians) const { auto tree = std::unique_ptr(new Tree(*old_tree)); @@ -1012,81 +938,6 @@ void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* ri } } -// this function is only used if the monotone precise mode is enabled -// it computes the constraints for a given leaf and a given feature -// (there can be many constraints because the constraints can depend on thresholds) -void SerialTreeLearner::ComputeConstraintsPerThreshold( - int feature, const Tree *tree, int node_idx, unsigned int tid, - bool per_threshold, bool compute_min, bool compute_max, uint32_t it_start, - uint32_t it_end) { - int parent_idx = (node_idx < 0) ? 
tree->leaf_parent(~node_idx) - : tree->node_parent(node_idx); - - if (parent_idx != -1) { - int inner_feature = tree->split_feature_inner(parent_idx); - int8_t monotone_type = train_data_->FeatureMonotone(inner_feature); - bool is_right_split = tree->right_child(parent_idx) == node_idx; - bool split_contains_new_information = true; - bool is_split_numerical = (train_data_->FeatureBinMapper(inner_feature) - ->bin_type()) == BinType::NumericalBin; - uint32_t threshold = tree->threshold_in_bin(parent_idx); - - // when we go up, we can get more information about the position of the original leaf - // so the starting and ending thresholds can be updated, which will save some time later - if ((feature == inner_feature) && is_split_numerical) { - if (is_right_split) { - it_start = std::max(threshold, it_start); - } else { - it_end = std::min(threshold + 1, it_end); - } -#ifdef DEBUG - CHECK(it_start < it_end); -#endif - } - - // only branches that contain leaves that are contiguous to the original leaf need to be visited - for (unsigned int i = 0; i < features[tid].size(); ++i) { - if (features[tid][i] == inner_feature && is_split_numerical && - is_in_right_split[tid][i] == is_right_split) { - split_contains_new_information = false; - break; - } - } - - if (split_contains_new_information) { - if (monotone_type != 0) { - int left_child_idx = tree->left_child(parent_idx); - int right_child_idx = tree->right_child(parent_idx); - bool left_child_is_curr_idx = (left_child_idx == node_idx); - - bool take_min = (monotone_type < 0) ? left_child_is_curr_idx - : !left_child_is_curr_idx; - if ((take_min && compute_min) || (!take_min && compute_max)) { - int node_idx_to_pass = - (left_child_is_curr_idx) ? 
right_child_idx : left_child_idx; - - // we go down in the opposite branch to see if some - // constraints that would apply to the original leaf can be found - ComputeConstraintsPerThresholdInSubtree( - feature, inner_feature, tree, node_idx_to_pass, take_min, - it_start, it_end, features[tid], thresholds[tid], - is_in_right_split[tid], tid, per_threshold); - } - } - - is_in_right_split[tid].push_back(is_right_split); - thresholds[tid].push_back(threshold); - features[tid].push_back(inner_feature); - } - - // we keep going up the tree to find constraints that could come from somewhere else - if (parent_idx != 0) { - ComputeConstraintsPerThreshold(feature, tree, parent_idx, tid, - per_threshold, compute_min, compute_max, - it_start, it_end); - } - } -} // this function checks if the original leaf and the children of the node that is // currently being visited are contiguous, and if so, the children should be visited too @@ -1118,28 +969,6 @@ std::pair SerialTreeLearner::ShouldKeepGoingLeftRight( return std::pair(keep_going_left, keep_going_right); } -// this function is called only when computing constraints when the monotone -// precise mode is set to true -// it makes sure that it is worth it to visit a branch, as it could -// not contain any relevant constraint (for example if the a branch -// with bigger values is also constraining the original leaf, then -// it is useless to visit the branch with smaller values) -std::pair SerialTreeLearner::LeftRightContainsRelevantInformation( - bool maximum, int inner_feature, bool split_feature_is_inner_feature) { - if (split_feature_is_inner_feature) { - return std::pair(true, true); - } - int8_t monotone_type = train_data_->FeatureMonotone(inner_feature); - if (monotone_type == 0) { - return std::pair(true, true); - } - if ((monotone_type == -1 && maximum) || (monotone_type == 1 && !maximum)) { - return std::pair(true, false); - } - if ((monotone_type == 1 && maximum) || (monotone_type == -1 && !maximum)) { - return 
std::pair(false, true); - } -} // at any point in time, for an index i, the constraint constraint[i] has to be valid on // [threshold[i]: threshold[i + 1]) (or [threshold[i]: +inf) if i is the last index of the array) @@ -1234,87 +1063,6 @@ void SerialTreeLearner::UpdateConstraints( } } -// this function goes down in a subtree to find the constraints that would apply -void SerialTreeLearner::ComputeConstraintsPerThresholdInSubtree( - int split_feature, int monotone_feature, const Tree *tree, int node_idx, - bool maximum, uint32_t it_start, uint32_t it_end, - const std::vector &features, const std::vector &thresholds, - const std::vector &is_in_right_split, unsigned int tid, - bool per_threshold) { - bool is_original_split_numerical = - train_data_->FeatureBinMapper(split_feature)->bin_type() == - BinType::NumericalBin; - double extremum; - // if we just got to a leaf, then we update - // the constraints using the leaf value - if (node_idx < 0) { - extremum = tree->LeafOutput(~node_idx); -#ifdef DEBUG - CHECK(it_start < it_end); -#endif - // if the constraints per threshold are needed then monotone - // precise mode is enabled and we are not refitting leaves - if (per_threshold && is_original_split_numerical) { - std::vector > &constraints = - (maximum) ? min_constraints : max_constraints; - std::vector > &thresholds = - (maximum) ? 
thresholds_min_constraints : thresholds_max_constraints; - UpdateConstraints(constraints, thresholds, extremum, it_start, it_end, - split_feature, tid, maximum); - } else { // otherwise the constraints can be updated just by performing a min / max - if (maximum) { - min_constraints[tid][0] = std::max(min_constraints[tid][0], extremum); - } else { - max_constraints[tid][0] = std::min(max_constraints[tid][0], extremum); - } - } - } - // if the function got to a node, it keeps going down the tree - else { - // check if the children are contiguous to the original leaf - std::pair keep_going_left_right = ShouldKeepGoingLeftRight( - tree, node_idx, features, thresholds, is_in_right_split); - int inner_feature = tree->split_feature_inner(node_idx); - uint32_t threshold = tree->threshold_in_bin(node_idx); - - bool split_feature_is_inner_feature = (inner_feature == split_feature); - bool split_feature_is_monotone_feature = - (monotone_feature == split_feature); - // it is made sure that both children contain values that could potentially - // help determine the true constraints for the original leaf - std::pair left_right_contain_relevant_information = - LeftRightContainsRelevantInformation( - maximum, inner_feature, split_feature_is_inner_feature && - !split_feature_is_monotone_feature); - // if a child does not contain relevant information compared to the other child, - // and if the other child is not contiguous, then we still need to go down the first child - if (keep_going_left_right.first && - (left_right_contain_relevant_information.first || - !keep_going_left_right.second)) { - uint32_t new_it_end = - (split_feature_is_inner_feature && is_original_split_numerical) - ? 
std::min(threshold + 1, it_end) - : it_end; - ComputeConstraintsPerThresholdInSubtree( - split_feature, monotone_feature, tree, tree->left_child(node_idx), - maximum, it_start, new_it_end, features, thresholds, - is_in_right_split, tid, per_threshold); - } - if (keep_going_left_right.second && - (left_right_contain_relevant_information.second || - !keep_going_left_right.first)) { - uint32_t new_it_start = - (split_feature_is_inner_feature && is_original_split_numerical) - ? std::max(threshold + 1, it_start) - : it_start; - ComputeConstraintsPerThresholdInSubtree( - split_feature, monotone_feature, tree, tree->right_child(node_idx), - maximum, new_it_start, it_end, features, thresholds, - is_in_right_split, tid, per_threshold); - } - } -} - void SerialTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function residual_getter, data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const { if (obj != nullptr && obj->IsRenewTreeOutput()) { @@ -1549,10 +1297,6 @@ void SerialTreeLearner::UpdateBestSplitsFromHistograms(SplitInfo &split, if (!is_feature_used_[feature_index]) continue; if (!histogram_array_[feature_index].is_splittable()) { - if (config_->monotone_precise_mode) { - constraints_per_leaf_[leaf] - .are_actual_constraints_worse[feature_index] = false; - } continue; } @@ -1609,16 +1353,7 @@ void SerialTreeLearner::UpdateBestSplitsFromHistograms(SplitInfo &split, if (constraints_per_leaf_[leaf] .AreActualConstraintsWorse(feature_index)) { -#ifdef DEBUG - CHECK(config_->monotone_precise_mode); - CHECK((constraints_per_leaf_[leaf].ToBeUpdated(feature_index))); -#endif - - ComputeBestSplitForFeature( - split.left_sum_gradient + split.right_sum_gradient, - split.left_sum_hessian + split.right_sum_hessian, - split.left_count + split.right_count, feature_index, - histogram_array_, bests, leaf, depth, tid, real_fidx, tree, true); + ; } else { #ifdef DEBUG CHECK(!constraints_per_leaf_[leaf].ToBeUpdated(feature_index)); 
@@ -1650,44 +1385,6 @@ void SerialTreeLearner::ComputeBestSplitForFeature( // if this is not a subtree stemming from a monotone split, then no constraint apply if (tree->leaf_is_in_monotone_subtree(leaf_index)) { - if (config_->monotone_precise_mode) { - - ComputeConstraintsPerThreshold( - feature_index, tree, ~leaf_index, tid, config_->monotone_precise_mode, - constraints_per_leaf_[leaf_index].MinToBeUpdated(feature_index) || - !update, - constraints_per_leaf_[leaf_index].MaxToBeUpdated(feature_index) || - !update); - - if (!constraints_per_leaf_[leaf_index].MinToBeUpdated(feature_index) && - update) { - min_constraints[tid] = - constraints_per_leaf_[leaf_index].min_constraints[feature_index]; - thresholds_min_constraints[tid] = - constraints_per_leaf_[leaf_index].min_thresholds[feature_index]; - } else { - constraints_per_leaf_[leaf_index].min_constraints[feature_index] = - min_constraints[tid]; - constraints_per_leaf_[leaf_index].min_thresholds[feature_index] = - thresholds_min_constraints[tid]; - } - - if (!constraints_per_leaf_[leaf_index].MaxToBeUpdated(feature_index) && - update) { - max_constraints[tid] = - constraints_per_leaf_[leaf_index].max_constraints[feature_index]; - thresholds_max_constraints[tid] = - constraints_per_leaf_[leaf_index].max_thresholds[feature_index]; - } else { - constraints_per_leaf_[leaf_index].max_constraints[feature_index] = - max_constraints[tid]; - constraints_per_leaf_[leaf_index].max_thresholds[feature_index] = - thresholds_max_constraints[tid]; - } - - dummy_min_constraints[tid] = min_constraints[tid]; - dummy_max_constraints[tid] = max_constraints[tid]; - } if (!config_->monotone_precise_mode) { dummy_min_constraints[tid][0] = constraints_per_leaf_[leaf_index].min_constraints[0][0]; @@ -1746,24 +1443,6 @@ void SerialTreeLearner::ComputeBestSplitForFeature( bests[tid] = new_split; } - if (config_->monotone_precise_mode && - tree->leaf_is_in_monotone_subtree(leaf_index)) { - 
constraints_per_leaf_[leaf_index].ResetUpdates(feature_index); - } - -#ifdef DEBUG - ComputeConstraintsPerThreshold(-1, tree, ~leaf_index, tid, false); - double min_constraint = min_constraints[tid][0]; - double max_constraint = max_constraints[tid][0]; - CHECK(tree->LeafOutput(leaf_index) >= min_constraint); - CHECK(tree->LeafOutput(leaf_index) <= max_constraint); - - min_constraints[tid][0] = -std::numeric_limits::max(); - max_constraints[tid][0] = std::numeric_limits::max(); - thresholds[tid].clear(); - is_in_right_split[tid].clear(); - features[tid].clear(); -#endif } // initializing constraints is just writing that the constraints should +/- inf from threshold 0 diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 7208fa7a1b8d..aedaa5b9892d 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -137,22 +137,6 @@ class SerialTreeLearner: public TreeLearner { bool compute_max, uint32_t it_start, uint32_t it_end); - void ComputeConstraintsPerThreshold(int feature, const Tree *tree, - int node_idx, unsigned int tid, - bool per_threshold = true, - bool compute_min = true, - bool compute_max = true) { - ComputeConstraintsPerThreshold(feature, tree, node_idx, tid, per_threshold, - compute_min, compute_max, 0, - train_data_->NumBin(feature)); - } - - void ComputeConstraintsPerThresholdInSubtree( - int split_feature, int monotone_feature, const Tree *tree, int node_idx, - bool maximum, uint32_t it_start, uint32_t it_end, - const std::vector &features, const std::vector &thresholds, - const std::vector &is_in_right_split, unsigned int tid, - bool per_threshold); static double ComputeMonotoneSplitGainPenalty(int depth, double penalization, double epsilon = 1e-10); @@ -202,10 +186,6 @@ class SerialTreeLearner: public TreeLearner { const std::vector &thresholds, const std::vector &is_in_right_split); - std::pair - LeftRightContainsRelevantInformation(bool maximum, int inner_feature, - 
bool split_feature_is_inner_feature); - void ReFitLeaves(Tree* tree); void InitializeConstraints(unsigned int tid); From 742c8e08198f1256019cf46db1e9446668ed30d8 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Mon, 9 Sep 2019 18:47:09 +0100 Subject: [PATCH 19/45] Move the monotone penalty in the monotone_cosntraints file. --- src/treelearner/monotone_constraints.hpp | 11 +++++++++++ src/treelearner/serial_tree_learner.cpp | 15 +-------------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index dd76e3de64b4..30e791fd6e6a 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -218,6 +218,17 @@ struct Constraints { max_thresholds[i][0] = 0; } } + + static double ComputeMonotoneSplitGainPenalty(int depth, double penalization, + double epsilon = 1e-10) { + if (penalization >= depth + 1.) { + return epsilon; + } + if (penalization <= 1.) { + return 1. - penalization / pow(2., depth) + epsilon; + } + return 1. - pow(2, penalization - 1. - depth) + epsilon; + } }; } // namespace LightGBM diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 55b72a5bd1ad..cbed3d459e37 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -1434,8 +1434,7 @@ void SerialTreeLearner::ComputeBestSplitForFeature( if (new_split.monotone_type != 0) { - double penalty = - ComputeMonotoneSplitGainPenalty(depth, config_->monotone_penalty); + double penalty = Constraints::ComputeMonotoneSplitGainPenalty(depth, config_->monotone_penalty); new_split.gain *= penalty; } @@ -1468,16 +1467,4 @@ void SerialTreeLearner::InitializeConstraints(unsigned int tid) { thresholds_max_constraints[tid][0] = 0; } -double SerialTreeLearner::ComputeMonotoneSplitGainPenalty(int depth, - double penalization, - double epsilon) { - if (penalization >= depth + 1.) 
{ - return epsilon; - } - if (penalization <= 1.) { - return 1. - penalization / pow(2., depth) + epsilon; - } - return 1. - pow(2, penalization - 1. - depth) + epsilon; -} - } // namespace LightGBM From e50b1413144e052a46ef24c711d072df736a7bc8 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Wed, 11 Sep 2019 16:38:57 +0100 Subject: [PATCH 20/45] Change name of current Constraints to LeafConstraints. --- src/treelearner/monotone_constraints.hpp | 8 ++++---- src/treelearner/serial_tree_learner.cpp | 4 ++-- src/treelearner/serial_tree_learner.h | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index 30e791fd6e6a..ef19aba39192 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -10,7 +10,7 @@ namespace LightGBM { // one min and one max constraint // but if the monotone precise mode is enabled, then it may store a // large number of constraints for different thresholds and features -struct Constraints { +struct LeafConstraints { std::vector > min_constraints; std::vector > max_constraints; // the constraint number i is valid on the slice [thresholds[i]:threshold[i+1]) @@ -137,7 +137,7 @@ struct Constraints { // when the monotone precise mode is disabled, then we can just store // 1 min and 1 max constraints per leaf, so we call this constructor - Constraints() { + LeafConstraints() { min_constraints.push_back( std::vector(1, -std::numeric_limits::max())); max_constraints.push_back( @@ -148,7 +148,7 @@ struct Constraints { // when the monotone precise mode is enabled, then for each feature, // we need to sort an array of constraints - Constraints(unsigned int num_features) { + LeafConstraints(unsigned int num_features) { min_constraints.resize(num_features); max_constraints.resize(num_features); @@ -193,7 +193,7 @@ struct Constraints { return max_to_be_updated[feature_idx]; } - Constraints(const Constraints 
&constraints) + LeafConstraints(const LeafConstraints &constraints) : min_constraints(constraints.min_constraints), max_constraints(constraints.max_constraints), min_thresholds(constraints.min_thresholds), diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index cbed3d459e37..ebf098693ff9 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -82,7 +82,7 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian ; } else { constraints_per_leaf_.resize(config_->num_leaves, - Constraints()); + LeafConstraints()); } splits_per_leaf_.resize(config_->num_leaves*train_data_->num_features()); @@ -1434,7 +1434,7 @@ void SerialTreeLearner::ComputeBestSplitForFeature( if (new_split.monotone_type != 0) { - double penalty = Constraints::ComputeMonotoneSplitGainPenalty(depth, config_->monotone_penalty); + double penalty = LeafConstraints::ComputeMonotoneSplitGainPenalty(depth, config_->monotone_penalty); new_split.gain *= penalty; } diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index aedaa5b9892d..218b33014726 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -222,7 +222,7 @@ class SerialTreeLearner: public TreeLearner { /*! \brief store best split points for all leaves */ std::vector best_split_per_leaf_; - std::vector constraints_per_leaf_; + std::vector constraints_per_leaf_; /*! \brief store best split per feature for all leaves */ std::vector splits_per_leaf_; From c49640ed1c348e6226dbdc2278d6c4d7e3d35062 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Thu, 12 Sep 2019 09:13:47 +0100 Subject: [PATCH 21/45] Created a class for current constraints. 
--- src/treelearner/monotone_constraints.hpp | 93 ++++++++++++++++++++++++ src/treelearner/serial_tree_learner.cpp | 88 +++------------------- src/treelearner/serial_tree_learner.h | 3 +- 3 files changed, 105 insertions(+), 79 deletions(-) diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index ef19aba39192..9196ab74f5e2 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -231,5 +231,98 @@ struct LeafConstraints { } }; +struct CurrentConstraints { + std::vector > dummy_min_constraints; + std::vector > min_constraints; + std::vector > dummy_max_constraints; + std::vector > max_constraints; + + std::vector > thresholds_min_constraints; + std::vector > thresholds_max_constraints; + + const int space_to_reserve_non_monotone_precise_mode; + const int space_to_reserve_monotone_precise_mode; + + // the number 32 has no real meaning here, but during our experiments, + // we found that the number of constraints per feature was well below 32, so + // by allocating this space, we may save some time because we won't have to + // allocate it later + CurrentConstraints() + : space_to_reserve_non_monotone_precise_mode(1), + space_to_reserve_monotone_precise_mode(32) {}; + + void Init(int num_threads_, const Config *config_) { + dummy_min_constraints.resize(num_threads_); + min_constraints.resize(num_threads_); + dummy_max_constraints.resize(num_threads_); + max_constraints.resize(num_threads_); + + thresholds_min_constraints.resize(num_threads_); + thresholds_max_constraints.resize(num_threads_); + + int space_to_reserve = space_to_reserve_monotone_precise_mode; + if (!config_->monotone_precise_mode) { + space_to_reserve = space_to_reserve_non_monotone_precise_mode; + } + + for (int i = 0; i < num_threads_; ++i) { + dummy_min_constraints[i].reserve(space_to_reserve); + min_constraints[i].reserve(space_to_reserve); + dummy_max_constraints[i].reserve(space_to_reserve); + 
max_constraints[i].reserve(space_to_reserve); + + thresholds_min_constraints[i].reserve(space_to_reserve); + thresholds_max_constraints[i].reserve(space_to_reserve); + + InitializeConstraints(i); + } + } + + // initializing constraints is just writing that the constraints should +/- + // inf from threshold 0 + void InitializeConstraints(unsigned int tid) { + thresholds_min_constraints[tid].resize(1); + thresholds_max_constraints[tid].resize(1); + + dummy_min_constraints[tid].resize(1); + min_constraints[tid].resize(1); + dummy_max_constraints[tid].resize(1); + max_constraints[tid].resize(1); + + dummy_min_constraints[tid][0] = -std::numeric_limits::max(); + min_constraints[tid][0] = -std::numeric_limits::max(); + dummy_max_constraints[tid][0] = std::numeric_limits::max(); + max_constraints[tid][0] = std::numeric_limits::max(); + + thresholds_min_constraints[tid][0] = 0; + thresholds_max_constraints[tid][0] = 0; + } + + void Set(const LeafConstraints &leaf_constraints, unsigned int tid) { + dummy_min_constraints[tid][0] = leaf_constraints.min_constraints[0][0]; + dummy_max_constraints[tid][0] = leaf_constraints.max_constraints[0][0]; + + min_constraints[tid][0] = leaf_constraints.min_constraints[0][0]; + max_constraints[tid][0] = leaf_constraints.max_constraints[0][0]; + + thresholds_min_constraints[tid][0] = leaf_constraints.min_thresholds[0][0]; + thresholds_max_constraints[tid][0] = leaf_constraints.max_thresholds[0][0]; + } + + void CheckCoherenceWithLeafOutput(double leaf_output, unsigned int tid, + double EPS) { + CHECK(dummy_min_constraints[tid] == min_constraints[tid]); + CHECK(dummy_max_constraints[tid] == max_constraints[tid]); + for (const auto &x : max_constraints[tid]) { + CHECK(leaf_output <= EPS + x); + CHECK(x > -std::numeric_limits::max()); + } + for (const auto &x : dummy_min_constraints[tid]) { + CHECK(leaf_output + EPS >= x); + CHECK(x < std::numeric_limits::max()); + } + } +}; + } // namespace LightGBM #endif // 
LightGBM_TREELEARNER_MONOTONE_CONSTRAINTS_H_ diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index ebf098693ff9..c1dda6405a12 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -124,43 +124,22 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian cegb_->Init(); } - dummy_min_constraints.resize(num_threads_); - min_constraints.resize(num_threads_); - dummy_max_constraints.resize(num_threads_); - max_constraints.resize(num_threads_); - - thresholds_min_constraints.resize(num_threads_); - thresholds_max_constraints.resize(num_threads_); + current_constraints.Init(num_threads_, config_); features.resize(num_threads_); is_in_right_split.resize(num_threads_); thresholds.resize(num_threads_); - // the number 32 has no real meaning here, but during our experiments, - // we found that the number of constraints per feature was well below 32, so by - // allocating this space, we may save some time because we won't have to allocate it later - int space_to_reserve = 32; - if (!config_->monotone_precise_mode) { - space_to_reserve = 1; - } - for (int i = 0; i < num_threads_; ++i) { - dummy_min_constraints[i].reserve(space_to_reserve); - min_constraints[i].reserve(space_to_reserve); - dummy_max_constraints[i].reserve(space_to_reserve); - max_constraints[i].reserve(space_to_reserve); - - thresholds_min_constraints[i].reserve(space_to_reserve); - thresholds_max_constraints[i].reserve(space_to_reserve); - if (!config_->monotone_constraints.empty()) { // the number 100 has no real meaning here, same as before features[i].reserve(std::max(100, config_->max_depth)); is_in_right_split[i].reserve(std::max(100, config_->max_depth)); thresholds[i].reserve(std::max(100, config_->max_depth)); } - - InitializeConstraints(i); + thresholds[i].clear(); + is_in_right_split[i].clear(); + features[i].clear(); } } @@ -1386,45 +1365,23 @@ void 
SerialTreeLearner::ComputeBestSplitForFeature( // if this is not a subtree stemming from a monotone split, then no constraint apply if (tree->leaf_is_in_monotone_subtree(leaf_index)) { if (!config_->monotone_precise_mode) { - dummy_min_constraints[tid][0] = - constraints_per_leaf_[leaf_index].min_constraints[0][0]; - dummy_max_constraints[tid][0] = - constraints_per_leaf_[leaf_index].max_constraints[0][0]; - - min_constraints[tid][0] = - constraints_per_leaf_[leaf_index].min_constraints[0][0]; - max_constraints[tid][0] = - constraints_per_leaf_[leaf_index].max_constraints[0][0]; - - thresholds_min_constraints[tid][0] = - constraints_per_leaf_[leaf_index].min_thresholds[0][0]; - thresholds_max_constraints[tid][0] = - constraints_per_leaf_[leaf_index].max_thresholds[0][0]; + current_constraints.Set(constraints_per_leaf_[leaf_index], tid); } } #ifdef DEBUG - CHECK(dummy_min_constraints[tid] == min_constraints[tid]); - CHECK(dummy_max_constraints[tid] == max_constraints[tid]); - for (const auto &x : max_constraints[tid]) { - CHECK(tree->LeafOutput(leaf_index) <= EPS + x); - CHECK(x > -std::numeric_limits::max()); - } - for (const auto &x : dummy_min_constraints[tid]) { - CHECK(tree->LeafOutput(leaf_index) + EPS >= x); - CHECK(x < std::numeric_limits::max()); - } + current_constraints.CheckCoherenceWithLeafOutput(tree->LeafOutput(leaf_index), tid, EPS) #endif SplitInfo new_split; histogram_array_[feature_index].FindBestThreshold( - sum_gradient, sum_hessian, num_data, &new_split, min_constraints[tid], - dummy_min_constraints[tid], max_constraints[tid], - dummy_max_constraints[tid], thresholds_min_constraints[tid], - thresholds_max_constraints[tid]); + sum_gradient, sum_hessian, num_data, &new_split, current_constraints.min_constraints[tid], + current_constraints.dummy_min_constraints[tid], current_constraints.max_constraints[tid], + current_constraints.dummy_max_constraints[tid], current_constraints.thresholds_min_constraints[tid], + 
current_constraints.thresholds_max_constraints[tid]); if (tree->leaf_is_in_monotone_subtree(leaf_index)) { - InitializeConstraints(tid); + current_constraints.InitializeConstraints(tid); } new_split.feature = real_fidx; @@ -1444,27 +1401,4 @@ void SerialTreeLearner::ComputeBestSplitForFeature( } -// initializing constraints is just writing that the constraints should +/- inf from threshold 0 -void SerialTreeLearner::InitializeConstraints(unsigned int tid) { - thresholds[tid].clear(); - is_in_right_split[tid].clear(); - features[tid].clear(); - - thresholds_min_constraints[tid].resize(1); - thresholds_max_constraints[tid].resize(1); - - dummy_min_constraints[tid].resize(1); - min_constraints[tid].resize(1); - dummy_max_constraints[tid].resize(1); - max_constraints[tid].resize(1); - - dummy_min_constraints[tid][0] = -std::numeric_limits::max(); - min_constraints[tid][0] = -std::numeric_limits::max(); - dummy_max_constraints[tid][0] = std::numeric_limits::max(); - max_constraints[tid][0] = std::numeric_limits::max(); - - thresholds_min_constraints[tid][0] = 0; - thresholds_max_constraints[tid][0] = 0; -} - } // namespace LightGBM diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 218b33014726..981ad7966ffd 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -264,8 +264,7 @@ class SerialTreeLearner: public TreeLearner { std::vector > dummy_max_constraints; std::vector > max_constraints; - std::vector > thresholds_min_constraints; - std::vector > thresholds_max_constraints; + CurrentConstraints current_constraints; std::vector > features; std::vector > thresholds; From 5e20b4b6ca0c520ee6c49ca9a01027983dc306e1 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Tue, 17 Sep 2019 08:59:20 +0100 Subject: [PATCH 22/45] Created the structure SplittingConstraints to encapsulate monotone constraints appearing in feature_histograme.hpp. 
--- .../data_parallel_tree_learner.cpp | 9 +- src/treelearner/feature_histogram.hpp | 267 ++++-------------- src/treelearner/monotone_constraints.hpp | 166 +++++++++++ src/treelearner/serial_tree_learner.cpp | 6 +- .../voting_parallel_tree_learner.cpp | 24 +- 5 files changed, 235 insertions(+), 237 deletions(-) diff --git a/src/treelearner/data_parallel_tree_learner.cpp b/src/treelearner/data_parallel_tree_learner.cpp index cc8bf7d4199c..28778f9426cd 100644 --- a/src/treelearner/data_parallel_tree_learner.cpp +++ b/src/treelearner/data_parallel_tree_learner.cpp @@ -192,15 +192,13 @@ void DataParallelTreeLearner::FindBestSplitsFromHistograms( SplitInfo smaller_split; // find best threshold for smaller child // FIXME Fill the vectors with the actual constraints and thresholds - std::vector max_constraints; - std::vector min_constraints; + SplittingConstraints constraints; std::vector thresholds; this->smaller_leaf_histogram_array_[feature_index].FindBestThreshold( this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians(), GetGlobalDataCountInLeaf(this->smaller_leaf_splits_->LeafIndex()), - &smaller_split, max_constraints, min_constraints, max_constraints, - min_constraints, thresholds, thresholds); + &smaller_split, constraints); smaller_split.feature = real_feature_index; if (smaller_split > smaller_bests_per_thread[tid] && smaller_node_used_features[feature_index]) { smaller_bests_per_thread[tid] = smaller_split; @@ -219,8 +217,7 @@ void DataParallelTreeLearner::FindBestSplitsFromHistograms( this->larger_leaf_splits_->sum_gradients(), this->larger_leaf_splits_->sum_hessians(), GetGlobalDataCountInLeaf(this->larger_leaf_splits_->LeafIndex()), - &larger_split, max_constraints, min_constraints, max_constraints, - min_constraints, thresholds, thresholds); + &larger_split, constraints); larger_split.feature = real_feature_index; if (larger_split > larger_bests_per_thread[tid] && larger_node_used_features[feature_index]) { 
larger_bests_per_thread[tid] = larger_split; diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 7d3363a0de96..a4610c3d0a08 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -16,6 +16,7 @@ #include #include "split_info.hpp" +#include "monotone_constraints.hpp" namespace LightGBM { @@ -60,16 +61,12 @@ class FeatureHistogram { find_best_threshold_fun_ = std::bind( &FeatureHistogram::FindBestThresholdNumerical, this, std::placeholders::_1, std::placeholders::_2, std::placeholders::_3, - std::placeholders::_4, std::placeholders::_5, std::placeholders::_6, - std::placeholders::_7, std::placeholders::_8, std::placeholders::_9, - std::placeholders::_10); + std::placeholders::_4, std::placeholders::_5); } else { find_best_threshold_fun_ = std::bind( &FeatureHistogram::FindBestThresholdCategorical, this, std::placeholders::_1, std::placeholders::_2, std::placeholders::_3, - std::placeholders::_4, std::placeholders::_5, std::placeholders::_6, - std::placeholders::_7, std::placeholders::_8, std::placeholders::_9, - std::placeholders::_10); + std::placeholders::_4, std::placeholders::_5); } } @@ -88,91 +85,49 @@ class FeatureHistogram { } } - void FindBestThreshold( - double sum_gradient, double sum_hessian, data_size_t num_data, - SplitInfo *output, - std::vector &cumulative_min_constraint_left_to_right, - std::vector &cumulative_min_constraint_right_to_left, - std::vector &cumulative_max_constraint_left_to_right, - std::vector &cumulative_max_constraint_right_to_left, - const std::vector &thresholds_min_constraint, - const std::vector &thresholds_max_constraint) { + void FindBestThreshold(double sum_gradient, double sum_hessian, + data_size_t num_data, SplitInfo *output, + SplittingConstraints &constraints) { output->default_left = true; output->gain = kMinScore; find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, - output, 
cumulative_min_constraint_left_to_right, - cumulative_min_constraint_right_to_left, - cumulative_max_constraint_left_to_right, - cumulative_max_constraint_right_to_left, - thresholds_min_constraint, - thresholds_max_constraint); + output, constraints); output->gain *= meta_->penalty; } - void FindBestThresholdNumerical( - double sum_gradient, double sum_hessian, data_size_t num_data, - SplitInfo *output, - std::vector &cumulative_min_constraint_left_to_right, - std::vector &cumulative_min_constraint_right_to_left, - std::vector &cumulative_max_constraint_left_to_right, - std::vector &cumulative_max_constraint_right_to_left, - const std::vector &thresholds_min_constraint, - const std::vector &thresholds_max_constraint) { + void FindBestThresholdNumerical(double sum_gradient, double sum_hessian, + data_size_t num_data, SplitInfo *output, + SplittingConstraints &constraints) { is_splittable_ = false; could_be_splittable_ = false; double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step); double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; - const double &(*min)(const double &, const double &) = std::min; - const double &(*max)(const double &, const double &) = std::max; // at this point, the following arrays contain the constraints applied on every part of the leaf // since we are splitting the leaf in 2, we can compute the cumulative / minimum maximum in both directions - CumulativeExtremum(max, true, cumulative_min_constraint_left_to_right); - CumulativeExtremum(max, false, cumulative_min_constraint_right_to_left); - CumulativeExtremum(min, true, cumulative_max_constraint_left_to_right); - CumulativeExtremum(min, false, cumulative_max_constraint_right_to_left); + constraints.ComputeCumulativeExtremums(); if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) { if (meta_->missing_type == MissingType::Zero) { FindBestThresholdSequence( sum_gradient, 
sum_hessian, num_data, min_gain_shift, output, -1, - true, false, cumulative_min_constraint_left_to_right, - cumulative_min_constraint_right_to_left, - cumulative_max_constraint_left_to_right, - cumulative_max_constraint_right_to_left, thresholds_min_constraint, - thresholds_max_constraint); + true, false, constraints); FindBestThresholdSequence( sum_gradient, sum_hessian, num_data, min_gain_shift, output, 1, - true, false, cumulative_min_constraint_left_to_right, - cumulative_min_constraint_right_to_left, - cumulative_max_constraint_left_to_right, - cumulative_max_constraint_right_to_left, thresholds_min_constraint, - thresholds_max_constraint); + true, false, constraints); } else { FindBestThresholdSequence( sum_gradient, sum_hessian, num_data, min_gain_shift, output, -1, - false, true, cumulative_min_constraint_left_to_right, - cumulative_min_constraint_right_to_left, - cumulative_max_constraint_left_to_right, - cumulative_max_constraint_right_to_left, thresholds_min_constraint, - thresholds_max_constraint); + false, true, constraints); FindBestThresholdSequence( sum_gradient, sum_hessian, num_data, min_gain_shift, output, 1, - false, true, cumulative_min_constraint_left_to_right, - cumulative_min_constraint_right_to_left, - cumulative_max_constraint_left_to_right, - cumulative_max_constraint_right_to_left, thresholds_min_constraint, - thresholds_max_constraint); + false, true, constraints); } } else { FindBestThresholdSequence( sum_gradient, sum_hessian, num_data, min_gain_shift, output, -1, - false, false, cumulative_min_constraint_left_to_right, - cumulative_min_constraint_right_to_left, - cumulative_max_constraint_left_to_right, - cumulative_max_constraint_right_to_left, thresholds_min_constraint, - thresholds_max_constraint); + false, false, constraints); // fix the direction error when only have 2 bins if (meta_->missing_type == MissingType::NaN) { output->default_left = false; @@ -184,13 +139,7 @@ class FeatureHistogram { void 
FindBestThresholdCategorical( double sum_gradient, double sum_hessian, data_size_t num_data, - SplitInfo *output, - std::vector &cumulative_min_constraint_left_to_right, - std::vector &cumulative_min_constraint_right_to_left, - std::vector &cumulative_max_constraint_left_to_right, - std::vector &cumulative_max_constraint_right_to_left, - const std::vector &thresholds_min_constraint, - const std::vector &thresholds_max_constraint) { + SplitInfo *output, SplittingConstraints& constraints) { output->default_left = false; double best_gain = kMinScore; data_size_t best_left_count = 0; @@ -208,6 +157,8 @@ class FeatureHistogram { int best_threshold = -1; int best_dir = 1; + constraints.InitializeIndices(1); + if (use_onehot) { for (int t = 0; t < used_bin; ++t) { // if data not enough, or sum hessian too small @@ -232,10 +183,10 @@ class FeatureHistogram { sum_other_gradient, sum_other_hessian, data_[t].sum_gradients, data_[t].sum_hessians + kEpsilon, meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - cumulative_min_constraint_right_to_left[0], - cumulative_max_constraint_right_to_left[0], - cumulative_min_constraint_left_to_right[0], - cumulative_max_constraint_left_to_right[0], 0); + constraints.CurrentMinConstraintRight(), + constraints.CurrentMaxConstraintRight(), + constraints.CurrentMinConstraintLeft(), + constraints.CurrentMaxConstraintLeft(), 0); // gain with split is worse than without split if (current_gain <= min_gain_shift) continue; @@ -310,10 +261,10 @@ class FeatureHistogram { sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - cumulative_min_constraint_right_to_left[0], - cumulative_max_constraint_right_to_left[0], - cumulative_min_constraint_left_to_right[0], - cumulative_max_constraint_left_to_right[0], 0); + constraints.CurrentMinConstraintRight(), + constraints.CurrentMaxConstraintRight(), + constraints.CurrentMinConstraintLeft(), + 
constraints.CurrentMaxConstraintLeft(), 0); if (current_gain <= min_gain_shift) continue; is_splittable_ = true; @@ -333,8 +284,8 @@ class FeatureHistogram { output->left_output = CalculateSplittedLeafOutput( best_sum_left_gradient, best_sum_left_hessian, meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - cumulative_min_constraint_left_to_right[0], - cumulative_max_constraint_left_to_right[0]); + constraints.CurrentMinConstraintLeft(), + constraints.CurrentMaxConstraintLeft()); output->left_count = best_left_count; output->left_sum_gradient = best_sum_left_gradient; output->left_sum_hessian = best_sum_left_hessian - kEpsilon; @@ -342,8 +293,8 @@ class FeatureHistogram { sum_gradient - best_sum_left_gradient, sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - cumulative_min_constraint_right_to_left[0], - cumulative_max_constraint_right_to_left[0]); + constraints.CurrentMinConstraintRight(), + constraints.CurrentMaxConstraintRight()); output->right_count = num_data - best_left_count; output->right_sum_gradient = sum_gradient - best_sum_left_gradient; output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon; @@ -561,29 +512,6 @@ class FeatureHistogram { } } - static void CumulativeExtremum( - const double &(*extremum_function)(const double &, const double &), - bool is_direction_from_left_to_right, - std::vector &cumulative_extremum) { - if (cumulative_extremum.size() == 1) { - return; - } - -#ifdef DEBUG - CHECK(cumulative_extremum.size() != 0); -#endif - - std::size_t n_exts = cumulative_extremum.size(); - int step = is_direction_from_left_to_right ? 1 : -1; - std::size_t start = is_direction_from_left_to_right ? 0 : n_exts - 1; - std::size_t end = is_direction_from_left_to_right ? n_exts - 1 : 0; - - for (auto i = start; i != end; i = i + step) { - cumulative_extremum[i + step] = extremum_function( - cumulative_extremum[i + step], cumulative_extremum[i]); - } - } - /*! 
* \brief Calculate the output of a leaf based on regularized sum_gradients and sum_hessians * \param sum_gradients @@ -633,16 +561,11 @@ class FeatureHistogram { return -(2.0 * sg_l1 * output + (sum_hessians + l2) * output * output); } - void FindBestThresholdSequence( - double sum_gradient, double sum_hessian, data_size_t num_data, - double min_gain_shift, SplitInfo *output, int dir, bool skip_default_bin, - bool use_na_as_missing, - const std::vector &cumulative_min_constraint_left_to_right, - const std::vector &cumulative_min_constraint_right_to_left, - const std::vector &cumulative_max_constraint_left_to_right, - const std::vector &cumulative_max_constraint_right_to_left, - const std::vector &thresholds_min_constraint, - const std::vector &thresholds_max_constraint) { + void FindBestThresholdSequence(double sum_gradient, double sum_hessian, + data_size_t num_data, double min_gain_shift, + SplitInfo *output, int dir, + bool skip_default_bin, bool use_na_as_missing, + SplittingConstraints &constraints) { const int8_t bias = meta_->bias; double best_sum_left_gradient = NAN; @@ -666,16 +589,7 @@ class FeatureHistogram { int t = meta_->num_bin - 1 - bias - use_na_as_missing; const int t_end = 1 - bias; - unsigned int index_min_constraint_left_to_right = - thresholds_min_constraint.size() - 1; - unsigned int index_min_constraint_right_to_left = - thresholds_min_constraint.size() - 1; - unsigned int index_max_constraint_left_to_right = - thresholds_max_constraint.size() - 1; - unsigned int index_max_constraint_right_to_left = - thresholds_max_constraint.size() - 1; - bool update_is_necessary = !(thresholds_max_constraint.size() == 1 && - thresholds_min_constraint.size() == 1); + constraints.InitializeIndices(dir); // from right to left, and we don't need data in bin0 for (; t >= t_end; --t) { @@ -703,39 +617,7 @@ class FeatureHistogram { // when the monotone precise mode in enabled, as t changes, the constraints applied on // each child may change, because the 
constraints may depend on thresholds - if (update_is_necessary) { - while (static_cast(thresholds_min_constraint - [index_min_constraint_left_to_right]) > - t + bias - 1) { - index_min_constraint_left_to_right -= 1; - } - while (static_cast(thresholds_min_constraint - [index_min_constraint_right_to_left]) > - t + bias) { - index_min_constraint_right_to_left -= 1; - } - while (static_cast(thresholds_max_constraint - [index_max_constraint_left_to_right]) > - t + bias - 1) { - index_max_constraint_left_to_right -= 1; - } - while (static_cast(thresholds_max_constraint - [index_max_constraint_right_to_left]) > - t + bias) { - index_max_constraint_right_to_left -= 1; - } - } - -#ifdef DEBUG - CHECK(index_min_constraint_left_to_right < - thresholds_min_constraint.size()); - CHECK(index_min_constraint_right_to_left < - thresholds_min_constraint.size()); - CHECK(index_max_constraint_left_to_right < - thresholds_max_constraint.size()); - CHECK(index_max_constraint_right_to_left < - thresholds_max_constraint.size()); -#endif + constraints.UpdateIndices(dir, bias, t); // when the algorithm goes through the thresholds we use the same index for cumulative arrays // in both directions but each leaf is constrained according to the corresponding array @@ -744,15 +626,10 @@ class FeatureHistogram { sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - cumulative_min_constraint_right_to_left - [index_min_constraint_right_to_left], - cumulative_max_constraint_right_to_left - [index_max_constraint_right_to_left], - cumulative_min_constraint_left_to_right - [index_min_constraint_left_to_right], - cumulative_max_constraint_left_to_right - [index_max_constraint_left_to_right], - meta_->monotone_type); + constraints.CurrentMinConstraintRight(), + constraints.CurrentMaxConstraintRight(), + constraints.CurrentMinConstraintLeft(), + constraints.CurrentMaxConstraintLeft(), 
meta_->monotone_type); // gain with split is worse than without split if (current_gain <= min_gain_shift) continue; @@ -767,14 +644,10 @@ class FeatureHistogram { best_threshold = static_cast(t - 1 + bias); best_gain = current_gain; - best_min_constraint_right = cumulative_min_constraint_right_to_left - [index_min_constraint_right_to_left]; - best_max_constraint_right = cumulative_max_constraint_right_to_left - [index_max_constraint_right_to_left]; - best_min_constraint_left = cumulative_min_constraint_left_to_right - [index_min_constraint_left_to_right]; - best_max_constraint_left = cumulative_max_constraint_left_to_right - [index_max_constraint_left_to_right]; + best_min_constraint_right = constraints.CurrentMinConstraintRight(); + best_max_constraint_right = constraints.CurrentMaxConstraintRight(); + best_min_constraint_left = constraints.CurrentMinConstraintLeft(); + best_max_constraint_left = constraints.CurrentMaxConstraintLeft(); } } } else { @@ -797,10 +670,7 @@ class FeatureHistogram { t = -1; } - unsigned int index_min_constraint_left_to_right = 0; - unsigned int index_min_constraint_right_to_left = 0; - unsigned int index_max_constraint_left_to_right = 0; - unsigned int index_max_constraint_right_to_left = 0; + constraints.InitializeIndices(dir); for (; t <= t_end; ++t) { // need to skip default bin @@ -825,31 +695,15 @@ class FeatureHistogram { double sum_right_gradient = sum_gradient - sum_left_gradient; - // current split gain -#ifdef DEBUG - CHECK(index_min_constraint_left_to_right < - thresholds_min_constraint.size()); - CHECK(index_min_constraint_right_to_left < - thresholds_min_constraint.size()); - CHECK(index_max_constraint_left_to_right < - thresholds_max_constraint.size()); - CHECK(index_max_constraint_right_to_left < - thresholds_max_constraint.size()); -#endif - + constraints.UpdateIndices(1, bias, t); double current_gain = GetSplitGains( sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, 
meta_->config->lambda_l2, meta_->config->max_delta_step, - cumulative_min_constraint_right_to_left - [index_min_constraint_right_to_left], - cumulative_max_constraint_right_to_left - [index_max_constraint_right_to_left], - cumulative_min_constraint_left_to_right - [index_min_constraint_left_to_right], - cumulative_max_constraint_left_to_right - [index_max_constraint_left_to_right], - meta_->monotone_type); + constraints.CurrentMinConstraintRight(), + constraints.CurrentMaxConstraintRight(), + constraints.CurrentMinConstraintLeft(), + constraints.CurrentMaxConstraintLeft(), meta_->monotone_type); // gain with split is worse than without split if (current_gain <= min_gain_shift) continue; @@ -863,14 +717,10 @@ class FeatureHistogram { best_threshold = static_cast(t + bias); best_gain = current_gain; - best_max_constraint_left = cumulative_max_constraint_left_to_right - [index_max_constraint_left_to_right]; - best_min_constraint_left = cumulative_min_constraint_left_to_right - [index_min_constraint_left_to_right]; - best_max_constraint_right = cumulative_max_constraint_right_to_left - [index_max_constraint_right_to_left]; - best_min_constraint_right = cumulative_min_constraint_right_to_left - [index_min_constraint_right_to_left]; + best_min_constraint_right = constraints.CurrentMinConstraintRight(); + best_max_constraint_right = constraints.CurrentMaxConstraintRight(); + best_min_constraint_left = constraints.CurrentMinConstraintLeft(); + best_max_constraint_left = constraints.CurrentMaxConstraintLeft(); } } } @@ -908,10 +758,7 @@ class FeatureHistogram { bool could_be_splittable_ = true; std::function &, std::vector &, - std::vector &, std::vector &, - const std::vector &, - const std::vector &)> find_best_threshold_fun_; + SplittingConstraints &)> find_best_threshold_fun_; }; class HistogramPool { public: diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index 9196ab74f5e2..6c2b8f24c14b 100644 --- 
a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -231,6 +231,165 @@ struct LeafConstraints { } }; +struct SplittingConstraints { + std::vector cumulative_min_constraint_right_to_left; + std::vector cumulative_max_constraint_right_to_left; + std::vector cumulative_min_constraint_left_to_right; + std::vector cumulative_max_constraint_left_to_right; + + std::vector thresholds_min_constraints; + std::vector thresholds_max_constraints; + + unsigned int index_min_constraint_left_to_right; + unsigned int index_min_constraint_right_to_left; + unsigned int index_max_constraint_left_to_right; + unsigned int index_max_constraint_right_to_left; + bool update_is_necessary; + + SplittingConstraints() {}; + + SplittingConstraints( + std::vector &cumulative_min_constraint_right_to_left, + std::vector &cumulative_min_constraint_left_to_right, + std::vector &cumulative_max_constraint_right_to_left, + std::vector &cumulative_max_constraint_left_to_right, + std::vector &thresholds_min_constraints, + std::vector &thresholds_max_constraints) { + this->cumulative_min_constraint_right_to_left = + cumulative_min_constraint_right_to_left; + this->cumulative_min_constraint_left_to_right = + cumulative_min_constraint_left_to_right; + this->cumulative_max_constraint_right_to_left = + cumulative_max_constraint_right_to_left; + this->cumulative_max_constraint_left_to_right = + cumulative_max_constraint_left_to_right; + + this->thresholds_min_constraints = thresholds_min_constraints; + this->thresholds_max_constraints = thresholds_max_constraints; + } + + static void CumulativeExtremum( + const double &(*extremum_function)(const double &, const double &), + bool is_direction_from_left_to_right, + std::vector &cumulative_extremum) { + if (cumulative_extremum.size() == 1) { + return; + } +#ifdef DEBUG + CHECK(cumulative_extremum.size() != 0); +#endif + + std::size_t n_exts = cumulative_extremum.size(); + int step = is_direction_from_left_to_right ? 
1 : -1; + std::size_t start = is_direction_from_left_to_right ? 0 : n_exts - 1; + std::size_t end = is_direction_from_left_to_right ? n_exts - 1 : 0; + + for (auto i = start; i != end; i = i + step) { + cumulative_extremum[i + step] = extremum_function( + cumulative_extremum[i + step], cumulative_extremum[i]); + } + } + + void ComputeCumulativeExtremums() { + const double &(*min)(const double &, const double &) = std::min; + const double &(*max)(const double &, const double &) = std::max; + + CumulativeExtremum(max, true, cumulative_min_constraint_left_to_right); + CumulativeExtremum(max, false, cumulative_min_constraint_right_to_left); + CumulativeExtremum(min, true, cumulative_max_constraint_left_to_right); + CumulativeExtremum(min, false, cumulative_max_constraint_right_to_left); + } + + void InitializeIndices(int dir) { + if (dir == -1) { + index_min_constraint_left_to_right = thresholds_min_constraints.size() - 1; + index_min_constraint_right_to_left = thresholds_min_constraints.size() - 1; + index_max_constraint_left_to_right = thresholds_max_constraints.size() - 1; + index_max_constraint_right_to_left = thresholds_max_constraints.size() - 1; + update_is_necessary = !(thresholds_max_constraints.size() == 1 && + thresholds_min_constraints.size() == 1); + } else { + index_min_constraint_left_to_right = 0; + index_min_constraint_right_to_left = 0; + index_max_constraint_left_to_right = 0; + index_max_constraint_right_to_left = 0; + } + } + + void UpdateIndices(int dir, const int8_t bias, int t) { + if (dir == -1) { + if (update_is_necessary) { + while ( + static_cast( + thresholds_min_constraints[index_min_constraint_left_to_right]) > + t + bias - 1) { + index_min_constraint_left_to_right -= 1; + } + while ( + static_cast( + thresholds_min_constraints[index_min_constraint_right_to_left]) > + t + bias) { + index_min_constraint_right_to_left -= 1; + } + while ( + static_cast( + thresholds_max_constraints[index_max_constraint_left_to_right]) > + t + bias - 1) { + 
index_max_constraint_left_to_right -= 1; + } + while ( + static_cast( + thresholds_max_constraints[index_max_constraint_right_to_left]) > + t + bias) { + index_max_constraint_right_to_left -= 1; + } + } +#ifdef DEBUG + CHECK(index_min_constraint_left_to_right < + thresholds_min_constraint.size()); + CHECK(index_min_constraint_right_to_left < + thresholds_min_constraint.size()); + CHECK(index_max_constraint_left_to_right < + thresholds_max_constraint.size()); + CHECK(index_max_constraint_right_to_left < + thresholds_max_constraint.size()); +#endif + } else { +// current split gain +#ifdef DEBUG + CHECK(index_min_constraint_left_to_right < + thresholds_min_constraint.size()); + CHECK(index_min_constraint_right_to_left < + thresholds_min_constraint.size()); + CHECK(index_max_constraint_left_to_right < + thresholds_max_constraint.size()); + CHECK(index_max_constraint_right_to_left < + thresholds_max_constraint.size()); +#endif + } + } + + double CurrentMinConstraintRight() const { + return cumulative_min_constraint_right_to_left + [index_min_constraint_right_to_left]; + } + + double CurrentMaxConstraintRight() const { + return cumulative_max_constraint_right_to_left + [index_max_constraint_right_to_left]; + } + + double CurrentMinConstraintLeft() const { + return cumulative_min_constraint_left_to_right + [index_min_constraint_left_to_right]; + } + + double CurrentMaxConstraintLeft() const { + return cumulative_max_constraint_left_to_right + [index_max_constraint_left_to_right]; + } +}; + struct CurrentConstraints { std::vector > dummy_min_constraints; std::vector > min_constraints; @@ -322,6 +481,13 @@ struct CurrentConstraints { CHECK(x < std::numeric_limits::max()); } } + + SplittingConstraints GetSplittingConstraints(int tid) { + return SplittingConstraints( + dummy_min_constraints[tid], min_constraints[tid], + dummy_max_constraints[tid], max_constraints[tid], + thresholds_min_constraints[tid], thresholds_max_constraints[tid]); + } }; } // namespace LightGBM diff 
--git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index c1dda6405a12..97d5cea5c691 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -1374,11 +1374,9 @@ void SerialTreeLearner::ComputeBestSplitForFeature( #endif SplitInfo new_split; + SplittingConstraints splitting_constraints = current_constraints.GetSplittingConstraints(tid); histogram_array_[feature_index].FindBestThreshold( - sum_gradient, sum_hessian, num_data, &new_split, current_constraints.min_constraints[tid], - current_constraints.dummy_min_constraints[tid], current_constraints.max_constraints[tid], - current_constraints.dummy_max_constraints[tid], current_constraints.thresholds_min_constraints[tid], - current_constraints.thresholds_max_constraints[tid]); + sum_gradient, sum_hessian, num_data, &new_split, splitting_constraints); if (tree->leaf_is_in_monotone_subtree(leaf_index)) { current_constraints.InitializeConstraints(tid); diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp index d295e14e96be..8b99e63272b0 100644 --- a/src/treelearner/voting_parallel_tree_learner.cpp +++ b/src/treelearner/voting_parallel_tree_learner.cpp @@ -304,16 +304,13 @@ void VotingParallelTreeLearner::FindBestSplits(const Tree* tree) this->smaller_leaf_histogram_array_[feature_index].RawData()); // FIXME Fill the vectors with the actual constraints and thresholds - std::vector max_constraints; - std::vector min_constraints; - std::vector thresholds; + SplittingConstraints constraints; this->smaller_leaf_histogram_array_[feature_index] .FindBestThreshold(this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians(), this->smaller_leaf_splits_->num_data_in_leaf(), &smaller_bestsplit_per_features[feature_index], - max_constraints, min_constraints, max_constraints, - min_constraints, thresholds, thresholds); + constraints); 
smaller_bestsplit_per_features[feature_index].feature = real_feature_index; // only has root leaf if (this->larger_leaf_splits_ == nullptr || this->larger_leaf_splits_->LeafIndex() < 0) { continue; } @@ -332,8 +329,7 @@ void VotingParallelTreeLearner::FindBestSplits(const Tree* tree) this->larger_leaf_splits_->sum_hessians(), this->larger_leaf_splits_->num_data_in_leaf(), &larger_bestsplit_per_features[feature_index], - max_constraints, min_constraints, max_constraints, - min_constraints, thresholds, thresholds); + constraints); larger_bestsplit_per_features[feature_index].feature = real_feature_index; OMP_LOOP_EX_END(); } @@ -420,15 +416,12 @@ void VotingParallelTreeLearner::FindBestSplitsFromHistograms( // find best threshold // FIXME Fill the vectors with the actual constraints and thresholds - std::vector max_constraints; - std::vector min_constraints; - std::vector thresholds; + SplittingConstraints constraints; smaller_leaf_histogram_array_global_[feature_index].FindBestThreshold( smaller_leaf_splits_global_->sum_gradients(), smaller_leaf_splits_global_->sum_hessians(), GetGlobalDataCountInLeaf(smaller_leaf_splits_global_->LeafIndex()), - &smaller_split, max_constraints, min_constraints, max_constraints, - min_constraints, thresholds, thresholds); + &smaller_split, constraints); smaller_split.feature = real_feature_index; if (smaller_split > smaller_bests_per_thread[tid] && smaller_node_used_features[feature_index]) { smaller_bests_per_thread[tid] = smaller_split; @@ -447,16 +440,13 @@ void VotingParallelTreeLearner::FindBestSplitsFromHistograms( // find best threshold // FIXME Fill the vectors with the actual constraints and thresholds - std::vector max_constraints; - std::vector min_constraints; - std::vector thresholds; + SplittingConstraints constraints; larger_leaf_histogram_array_global_[feature_index].FindBestThreshold( larger_leaf_splits_global_->sum_gradients(), larger_leaf_splits_global_->sum_hessians(), 
GetGlobalDataCountInLeaf(larger_leaf_splits_global_->LeafIndex()), - &larger_split, max_constraints, min_constraints, max_constraints, - min_constraints, thresholds, thresholds); + &larger_split, constraints); larger_split.feature = real_feature_index; if (larger_split > larger_best_per_thread[tid] && larger_node_used_features[feature_index]) { larger_best_per_thread[tid] = larger_split; From 61ad8d65f5159b982ee7c58fd70f11bfb6ba6217 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Tue, 17 Sep 2019 09:48:05 +0100 Subject: [PATCH 23/45] Refactoring to make CurrentConstraints an array of SplittingConstraints. --- src/treelearner/monotone_constraints.hpp | 125 ++++++++++++----------- src/treelearner/serial_tree_learner.cpp | 3 +- 2 files changed, 64 insertions(+), 64 deletions(-) diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index 6c2b8f24c14b..3a1a1af5387d 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -388,16 +388,62 @@ struct SplittingConstraints { return cumulative_max_constraint_left_to_right [index_max_constraint_left_to_right]; } + + void Reserve(int space_to_reserve) { + cumulative_max_constraint_right_to_left.reserve(space_to_reserve); + cumulative_max_constraint_left_to_right.reserve(space_to_reserve); + cumulative_min_constraint_right_to_left.reserve(space_to_reserve); + cumulative_min_constraint_left_to_right.reserve(space_to_reserve); + thresholds_max_constraints.reserve(space_to_reserve); + thresholds_min_constraints.reserve(space_to_reserve); + } + + void InitializeConstraints() { + thresholds_min_constraints.resize(1); + thresholds_max_constraints.resize(1); + + cumulative_min_constraint_right_to_left.resize(1); + cumulative_min_constraint_left_to_right.resize(1); + cumulative_max_constraint_right_to_left.resize(1); + cumulative_max_constraint_left_to_right.resize(1); + + cumulative_min_constraint_right_to_left[0] = 
-std::numeric_limits::max(); + cumulative_min_constraint_left_to_right[0] = -std::numeric_limits::max(); + cumulative_max_constraint_right_to_left[0] = std::numeric_limits::max(); + cumulative_max_constraint_left_to_right[0] = std::numeric_limits::max(); + + thresholds_min_constraints[0] = 0; + thresholds_max_constraints[0] = 0; + } + + void Set(const LeafConstraints &leaf_constraints) { + cumulative_min_constraint_right_to_left[0] = leaf_constraints.min_constraints[0][0]; + cumulative_max_constraint_right_to_left[0] = leaf_constraints.max_constraints[0][0]; + + cumulative_min_constraint_left_to_right[0] = leaf_constraints.min_constraints[0][0]; + cumulative_max_constraint_left_to_right[0] = leaf_constraints.max_constraints[0][0]; + + thresholds_min_constraints[0] = leaf_constraints.min_thresholds[0][0]; + thresholds_max_constraints[0] = leaf_constraints.max_thresholds[0][0]; + } + + void CheckCoherenceWithLeafOutput(double leaf_output, + double EPS) { + CHECK(cumulative_min_constraint_left_to_right == cumulative_min_constraint_right_to_left); + CHECK(cumulative_max_constraint_left_to_right == cumulative_max_constraint_right_to_left); + for (const auto &x : cumulative_max_constraint_left_to_right) { + CHECK(leaf_output <= EPS + x); + CHECK(x > -std::numeric_limits::max()); + } + for (const auto &x : cumulative_min_constraint_right_to_left) { + CHECK(leaf_output + EPS >= x); + CHECK(x < std::numeric_limits::max()); + } + } }; struct CurrentConstraints { - std::vector > dummy_min_constraints; - std::vector > min_constraints; - std::vector > dummy_max_constraints; - std::vector > max_constraints; - - std::vector > thresholds_min_constraints; - std::vector > thresholds_max_constraints; + std::vector splitting_constraints_vector; const int space_to_reserve_non_monotone_precise_mode; const int space_to_reserve_monotone_precise_mode; @@ -411,13 +457,7 @@ struct CurrentConstraints { space_to_reserve_monotone_precise_mode(32) {}; void Init(int num_threads_, const Config 
*config_) { - dummy_min_constraints.resize(num_threads_); - min_constraints.resize(num_threads_); - dummy_max_constraints.resize(num_threads_); - max_constraints.resize(num_threads_); - - thresholds_min_constraints.resize(num_threads_); - thresholds_max_constraints.resize(num_threads_); + splitting_constraints_vector.resize(num_threads_); int space_to_reserve = space_to_reserve_monotone_precise_mode; if (!config_->monotone_precise_mode) { @@ -425,68 +465,29 @@ struct CurrentConstraints { } for (int i = 0; i < num_threads_; ++i) { - dummy_min_constraints[i].reserve(space_to_reserve); - min_constraints[i].reserve(space_to_reserve); - dummy_max_constraints[i].reserve(space_to_reserve); - max_constraints[i].reserve(space_to_reserve); - - thresholds_min_constraints[i].reserve(space_to_reserve); - thresholds_max_constraints[i].reserve(space_to_reserve); - + splitting_constraints_vector[i].Reserve(space_to_reserve); InitializeConstraints(i); } } + SplittingConstraints& operator[](unsigned int i) { + return splitting_constraints_vector[i]; + } + // initializing constraints is just writing that the constraints should +/- // inf from threshold 0 void InitializeConstraints(unsigned int tid) { - thresholds_min_constraints[tid].resize(1); - thresholds_max_constraints[tid].resize(1); - - dummy_min_constraints[tid].resize(1); - min_constraints[tid].resize(1); - dummy_max_constraints[tid].resize(1); - max_constraints[tid].resize(1); - - dummy_min_constraints[tid][0] = -std::numeric_limits::max(); - min_constraints[tid][0] = -std::numeric_limits::max(); - dummy_max_constraints[tid][0] = std::numeric_limits::max(); - max_constraints[tid][0] = std::numeric_limits::max(); - - thresholds_min_constraints[tid][0] = 0; - thresholds_max_constraints[tid][0] = 0; + splitting_constraints_vector[tid].InitializeConstraints(); } void Set(const LeafConstraints &leaf_constraints, unsigned int tid) { - dummy_min_constraints[tid][0] = leaf_constraints.min_constraints[0][0]; - 
dummy_max_constraints[tid][0] = leaf_constraints.max_constraints[0][0]; - - min_constraints[tid][0] = leaf_constraints.min_constraints[0][0]; - max_constraints[tid][0] = leaf_constraints.max_constraints[0][0]; - - thresholds_min_constraints[tid][0] = leaf_constraints.min_thresholds[0][0]; - thresholds_max_constraints[tid][0] = leaf_constraints.max_thresholds[0][0]; + splitting_constraints_vector[tid].Set(leaf_constraints); } void CheckCoherenceWithLeafOutput(double leaf_output, unsigned int tid, double EPS) { - CHECK(dummy_min_constraints[tid] == min_constraints[tid]); - CHECK(dummy_max_constraints[tid] == max_constraints[tid]); - for (const auto &x : max_constraints[tid]) { - CHECK(leaf_output <= EPS + x); - CHECK(x > -std::numeric_limits::max()); - } - for (const auto &x : dummy_min_constraints[tid]) { - CHECK(leaf_output + EPS >= x); - CHECK(x < std::numeric_limits::max()); - } - } - - SplittingConstraints GetSplittingConstraints(int tid) { - return SplittingConstraints( - dummy_min_constraints[tid], min_constraints[tid], - dummy_max_constraints[tid], max_constraints[tid], - thresholds_min_constraints[tid], thresholds_max_constraints[tid]); + splitting_constraints_vector[tid] + .CheckCoherenceWithLeafOutput(leaf_output, EPS); } }; diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 97d5cea5c691..62cfda3662fd 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -1374,9 +1374,8 @@ void SerialTreeLearner::ComputeBestSplitForFeature( #endif SplitInfo new_split; - SplittingConstraints splitting_constraints = current_constraints.GetSplittingConstraints(tid); histogram_array_[feature_index].FindBestThreshold( - sum_gradient, sum_hessian, num_data, &new_split, splitting_constraints); + sum_gradient, sum_hessian, num_data, &new_split, current_constraints[tid]); if (tree->leaf_is_in_monotone_subtree(leaf_index)) { current_constraints.InitializeConstraints(tid); From 
f020bfb52f55053baaf63d0d43be191fbefa2113 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Tue, 17 Sep 2019 13:10:54 +0100 Subject: [PATCH 24/45] Using the standard kEpsilon. --- src/treelearner/serial_tree_learner.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 62cfda3662fd..c154054ed076 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -28,8 +28,6 @@ std::chrono::duration ordered_bin_time; std::chrono::duration refit_leaves_time; #endif // TIMETAG -double EPS = 1e-12; - SerialTreeLearner::SerialTreeLearner(const Config* config) :config_(config) { random_ = Random(config_->feature_fraction_seed); @@ -1370,7 +1368,7 @@ void SerialTreeLearner::ComputeBestSplitForFeature( } #ifdef DEBUG - current_constraints.CheckCoherenceWithLeafOutput(tree->LeafOutput(leaf_index), tid, EPS) + current_constraints.CheckCoherenceWithLeafOutput(tree->LeafOutput(leaf_index), tid, kEpsilon) #endif SplitInfo new_split; From 1e30c19a0bd9f2ae7e6e1d6c08b2a9a4efc4078d Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Thu, 26 Sep 2019 14:32:04 +0100 Subject: [PATCH 25/45] Making ComputeBestSplitForFeature static. 
--- src/treelearner/serial_tree_learner.cpp | 45 ++++++++++++++++--------- src/treelearner/serial_tree_learner.h | 21 ++++++++---- 2 files changed, 44 insertions(+), 22 deletions(-) diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index c154054ed076..4c1faf504c00 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -582,13 +582,16 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( smaller_leaf_histogram_array_[feature_index].RawData()); int real_fidx = train_data_->RealFeatureIndex(feature_index); - ComputeBestSplitForFeature(smaller_leaf_splits_->sum_gradients(), - smaller_leaf_splits_->sum_hessians(), - smaller_leaf_splits_->num_data_in_leaf(), - feature_index, smaller_leaf_histogram_array_, - smaller_best, smaller_leaf_splits_->LeafIndex(), - smaller_leaf_splits_->depth(), tid, real_fidx, - tree); + ComputeBestSplitForFeature( + smaller_leaf_splits_->sum_gradients(), + smaller_leaf_splits_->sum_hessians(), + smaller_leaf_splits_->num_data_in_leaf(), feature_index, + smaller_leaf_histogram_array_, smaller_best, + smaller_leaf_splits_->LeafIndex(), smaller_leaf_splits_->depth(), tid, + real_fidx, tree, train_data_, splits_per_leaf_, config_, + current_constraints, data_partition_, constraints_per_leaf_, + cegb_); + // only has root leaf if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) { continue; } @@ -600,13 +603,16 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( larger_leaf_histogram_array_[feature_index].RawData()); } - ComputeBestSplitForFeature(larger_leaf_splits_->sum_gradients(), - larger_leaf_splits_->sum_hessians(), - larger_leaf_splits_->num_data_in_leaf(), - feature_index, larger_leaf_histogram_array_, - larger_best, larger_leaf_splits_->LeafIndex(), - larger_leaf_splits_->depth(), tid, real_fidx, - tree); + ComputeBestSplitForFeature( + larger_leaf_splits_->sum_gradients(), + larger_leaf_splits_->sum_hessians(), + 
larger_leaf_splits_->num_data_in_leaf(), feature_index, + larger_leaf_histogram_array_, larger_best, + larger_leaf_splits_->LeafIndex(), larger_leaf_splits_->depth(), tid, + real_fidx, tree, train_data_, splits_per_leaf_, config_, + current_constraints, data_partition_, constraints_per_leaf_, + cegb_); + OMP_LOOP_EX_END(); } OMP_THROW_EX(); @@ -1293,7 +1299,9 @@ void SerialTreeLearner::UpdateBestSplitsFromHistograms(SplitInfo &split, split.left_sum_gradient + split.right_sum_gradient, split.left_sum_hessian + split.right_sum_hessian, split.left_count + split.right_count, feature_index, histogram_array_, - bests, leaf, depth, tid, real_fidx, tree, true); + bests, leaf, depth, tid, real_fidx, tree, train_data_, + splits_per_leaf_, config_, current_constraints, + data_partition_, constraints_per_leaf_, cegb_, true); } else { if (cegb_->splits_per_leaf_[leaf * train_data_->num_features() + feature_index] > bests[tid]) { @@ -1358,7 +1366,12 @@ void SerialTreeLearner::ComputeBestSplitForFeature( double sum_gradient, double sum_hessian, data_size_t num_data, int feature_index, FeatureHistogram *histogram_array_, std::vector &bests, int leaf_index, int depth, const int tid, - int real_fidx, const Tree *tree, bool update) { + int real_fidx, const Tree *tree, const Dataset *train_data_, + std::vector &splits_per_leaf_, const Config *config_, + CurrentConstraints ¤t_constraints, + std::unique_ptr &data_partition_, + const std::vector &constraints_per_leaf_, + std::unique_ptr& cegb_, bool update) { // if this is not a subtree stemming from a monotone split, then no constraint apply if (tree->leaf_is_in_monotone_subtree(leaf_index)) { diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 981ad7966ffd..75aa8521f7c9 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -124,12 +124,21 @@ class SerialTreeLearner: public TreeLearner { */ inline virtual data_size_t GetGlobalDataCountInLeaf(int 
leaf_idx) const; - void ComputeBestSplitForFeature(double sum_gradient, double sum_hessian, - data_size_t num_data, int feature_index, - FeatureHistogram *histogram_array_, - std::vector &bests, int leaf_index, - int depth, const int tid, int real_fidx, - const Tree *tree, bool update = false); +// static double CalculateOndemandCosts( +// int feature_index, int leaf_index, const Dataset *train_data_, +// std::unique_ptr &data_partition_, +// const std::vector &feature_used_in_data, const Config *config_); + + static void ComputeBestSplitForFeature( + double sum_gradient, double sum_hessian, data_size_t num_data, + int feature_index, FeatureHistogram *histogram_array_, + std::vector &bests, int leaf_index, int depth, const int tid, + int real_fidx, const Tree *tree, const Dataset *train_data_, + std::vector &splits_per_leaf_, const Config *config_, + CurrentConstraints ¤t_constraints, + std::unique_ptr &data_partition_, + const std::vector &constraints_per_leaf_, + std::unique_ptr& cegb_, bool update = false); void ComputeConstraintsPerThreshold(int feature, const Tree *tree, int node_idx, unsigned int tid, From d2de571d3e7915b583732649b78902e59e22a1b5 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Fri, 27 Sep 2019 10:57:30 +0100 Subject: [PATCH 26/45] Move the functions used for the constraints in the constraints files. Removed more of the code for the Slow constraining method. 
--- src/treelearner/monotone_constraints.cpp | 351 ++++++++++++++++++ src/treelearner/monotone_constraints.hpp | 75 ++++ src/treelearner/serial_tree_learner.cpp | 438 +---------------------- src/treelearner/serial_tree_learner.h | 91 +---- 4 files changed, 448 insertions(+), 507 deletions(-) create mode 100644 src/treelearner/monotone_constraints.cpp diff --git a/src/treelearner/monotone_constraints.cpp b/src/treelearner/monotone_constraints.cpp new file mode 100644 index 000000000000..2252f1f63d8f --- /dev/null +++ b/src/treelearner/monotone_constraints.cpp @@ -0,0 +1,351 @@ +#include "monotone_constraints.hpp" +#include "serial_tree_learner.h" +#include "feature_histogram.hpp" +#include "cost_effective_gradient_boosting.hpp" + +namespace LightGBM { + +// this function goes through the tree to find how the split that +// has just been performed is going to affect the constraints of other leaves +void LeafConstraints::GoUpToFindLeavesToUpdate( + const Tree *tree, int node_idx, std::vector &features, + std::vector &thresholds, std::vector &is_in_right_split, + int split_feature, const SplitInfo &split_info, double previous_leaf_output, + uint32_t split_threshold, const Dataset *train_data_, const Config *config_, + CurrentConstraints ¤t_constraints, + std::vector &constraints_per_leaf_, + std::vector &best_split_per_leaf_, + const std::vector &is_feature_used_, int num_threads_, + int num_features_, HistogramPool &histogram_pool_, + std::unique_ptr &cegb_) { + int parent_idx = tree->node_parent(node_idx); + if (parent_idx != -1) { + int inner_feature = tree->split_feature_inner(parent_idx); + int8_t monotone_type = train_data_->FeatureMonotone(inner_feature); + bool is_right_split = tree->right_child(parent_idx) == node_idx; + bool split_contains_new_information = true; + bool is_split_numerical = train_data_->FeatureBinMapper(inner_feature) + ->bin_type() == BinType::NumericalBin; + + // only branches containing leaves that are contiguous to the original leaf + // 
need to be updated + for (unsigned int i = 0; i < features.size(); ++i) { + if ((features[i] == inner_feature && is_split_numerical) && + (is_in_right_split[i] == is_right_split)) { + split_contains_new_information = false; + break; + } + } + + if (split_contains_new_information) { + if (monotone_type != 0) { + int left_child_idx = tree->left_child(parent_idx); + int right_child_idx = tree->right_child(parent_idx); + bool left_child_is_curr_idx = (left_child_idx == node_idx); + int node_idx_to_pass = + (left_child_is_curr_idx) ? right_child_idx : left_child_idx; + bool take_min = (monotone_type < 0) ? left_child_is_curr_idx + : !left_child_is_curr_idx; + + GoDownToFindLeavesToUpdate( + tree, node_idx_to_pass, features, thresholds, is_in_right_split, + take_min, split_feature, split_info, previous_leaf_output, true, + true, split_threshold, train_data_, config_, current_constraints, + constraints_per_leaf_, best_split_per_leaf_, is_feature_used_, + num_threads_, num_features_, histogram_pool_, cegb_); + } + + is_in_right_split.push_back(tree->right_child(parent_idx) == node_idx); + thresholds.push_back(tree->threshold_in_bin(parent_idx)); + features.push_back(tree->split_feature_inner(parent_idx)); + } + + if (parent_idx != 0) { + LeafConstraints::GoUpToFindLeavesToUpdate( + tree, parent_idx, features, thresholds, is_in_right_split, + split_feature, split_info, previous_leaf_output, split_threshold, + train_data_, config_, current_constraints, constraints_per_leaf_, + best_split_per_leaf_, is_feature_used_, num_threads_, num_features_, + histogram_pool_, cegb_); + } + } +} + +// this function goes through the tree to find how the split that was just made +// is +// going to affect other leaves +void LeafConstraints::GoDownToFindLeavesToUpdate( + const Tree *tree, int node_idx, const std::vector &features, + const std::vector &thresholds, + const std::vector &is_in_right_split, int maximum, int split_feature, + const SplitInfo &split_info, double 
previous_leaf_output, + bool use_left_leaf, bool use_right_leaf, uint32_t split_threshold, + const Dataset *train_data_, const Config *config_, + CurrentConstraints ¤t_constraints, + std::vector &constraints_per_leaf_, + std::vector &best_split_per_leaf_, + const std::vector &is_feature_used_, int num_threads_, + int num_features_, HistogramPool &histogram_pool_, + std::unique_ptr &cegb_) { + if (node_idx < 0) { + int leaf_idx = ~node_idx; + + // if leaf is at max depth then there is no need to update it + int max_depth = config_->max_depth; + if (tree->leaf_depth(leaf_idx) >= max_depth && max_depth > 0) { + return; + } + + // splits that are not to be used shall not be updated + if (best_split_per_leaf_[leaf_idx].gain == kMinScore) { + return; + } + + std::pair min_max_constraints; + bool something_changed; + if (use_right_leaf && use_left_leaf) { + min_max_constraints = + std::minmax(split_info.right_output, split_info.left_output); + } else if (use_right_leaf && !use_left_leaf) { + min_max_constraints = std::pair(split_info.right_output, + split_info.right_output); + } else { + min_max_constraints = std::pair(split_info.left_output, + split_info.left_output); + } + +#ifdef DEBUG + if (maximum) { + CHECK(min_max_constraints.first >= tree->LeafOutput(leaf_idx)); + } else { + CHECK(min_max_constraints.second <= tree->LeafOutput(leaf_idx)); + } +#endif + + if (!config_->monotone_precise_mode) { + if (!maximum) { + something_changed = + constraints_per_leaf_[leaf_idx] + .SetMinConstraintAndReturnChange(min_max_constraints.second); + } else { + something_changed = + constraints_per_leaf_[leaf_idx] + .SetMaxConstraintAndReturnChange(min_max_constraints.first); + } + if (!something_changed) { + return; + } + } else { + if (!maximum) { + // both functions need to be called in this order + // because they modify the struct + something_changed = + constraints_per_leaf_[leaf_idx] + .CrossesMinConstraint(min_max_constraints.second); + something_changed = 
constraints_per_leaf_[leaf_idx] + .IsInMinConstraints(previous_leaf_output) || + something_changed; + } else { + // both functions need to be called in this order + // because they modify the struct + something_changed = + constraints_per_leaf_[leaf_idx] + .CrossesMaxConstraint(min_max_constraints.first); + something_changed = constraints_per_leaf_[leaf_idx] + .IsInMaxConstraints(previous_leaf_output) || + something_changed; + } + // if constraints have changed, then best splits need to be updated + // otherwise, we can just continue and go to the next split + if (!something_changed) { + return; + } + } + UpdateBestSplitsFromHistograms( + best_split_per_leaf_[leaf_idx], leaf_idx, tree->leaf_depth(leaf_idx), + tree, train_data_, config_, current_constraints, constraints_per_leaf_, + is_feature_used_, num_threads_, num_features_, histogram_pool_, cegb_); + } else { + // check if the children are contiguous with the original leaf + std::pair keep_going_left_right = ShouldKeepGoingLeftRight( + tree, node_idx, features, thresholds, is_in_right_split, train_data_); + int inner_feature = tree->split_feature_inner(node_idx); + uint32_t threshold = tree->threshold_in_bin(node_idx); + bool is_split_numerical = train_data_->FeatureBinMapper(inner_feature) + ->bin_type() == BinType::NumericalBin; + bool use_left_leaf_for_update = true; + bool use_right_leaf_for_update = true; + if (is_split_numerical && inner_feature == split_feature) { + if (threshold >= split_threshold) { + use_left_leaf_for_update = false; + } + if (threshold <= split_threshold) { + use_right_leaf_for_update = false; + } + } + + if (keep_going_left_right.first) { + GoDownToFindLeavesToUpdate( + tree, tree->left_child(node_idx), features, thresholds, + is_in_right_split, maximum, split_feature, split_info, + previous_leaf_output, use_left_leaf, + use_right_leaf_for_update && use_right_leaf, split_threshold, + train_data_, config_, current_constraints, constraints_per_leaf_, + best_split_per_leaf_, 
is_feature_used_, num_threads_, num_features_, + histogram_pool_, cegb_); + } + if (keep_going_left_right.second) { + GoDownToFindLeavesToUpdate( + tree, tree->right_child(node_idx), features, thresholds, + is_in_right_split, maximum, split_feature, split_info, + previous_leaf_output, use_left_leaf_for_update && use_left_leaf, + use_right_leaf, split_threshold, train_data_, config_, + current_constraints, constraints_per_leaf_, best_split_per_leaf_, + is_feature_used_, num_threads_, num_features_, histogram_pool_, + cegb_); + } + } +} + +// this function checks if the original leaf and the children of the node that +// is +// currently being visited are contiguous, and if so, the children should be +// visited too +std::pair LeafConstraints::ShouldKeepGoingLeftRight( + const Tree *tree, int node_idx, const std::vector &features, + const std::vector &thresholds, + const std::vector &is_in_right_split, const Dataset *train_data_) { + int inner_feature = tree->split_feature_inner(node_idx); + uint32_t threshold = tree->threshold_in_bin(node_idx); + bool is_split_numerical = train_data_->FeatureBinMapper(inner_feature) + ->bin_type() == BinType::NumericalBin; + + bool keep_going_right = true; + bool keep_going_left = true; + // we check if the left and right node are contiguous with the original leaf + // if so we should keep going down these nodes to update constraints + for (unsigned int i = 0; i < features.size(); ++i) { + if (features[i] == inner_feature) { + if (is_split_numerical) { + if (threshold >= thresholds[i] && !is_in_right_split[i]) { + keep_going_right = false; + } + if (threshold <= thresholds[i] && is_in_right_split[i]) { + keep_going_left = false; + } + } + } + } + return std::pair(keep_going_left, keep_going_right); +} + +// this function updates the best split for each leaf +// it is called only when monotone constraints exist +void LeafConstraints::UpdateBestSplitsFromHistograms( + SplitInfo &split, int leaf, int depth, const Tree *tree, + const 
Dataset *train_data_, const Config *config_, + CurrentConstraints ¤t_constraints, + std::vector &constraints_per_leaf_, + const std::vector &is_feature_used_, int num_threads_, + int num_features_, HistogramPool &histogram_pool_, + std::unique_ptr &cegb_) { + std::vector bests(num_threads_); + std::vector should_split_be_worse(num_threads_, false); + + // the feature histogram is retrieved + FeatureHistogram *histogram_array_; + histogram_pool_.Get(leaf, &histogram_array_); + + OMP_INIT_EX(); +#pragma omp parallel for schedule(static, 1024) if (num_features_ >= 2048) + for (int feature_index = 0; feature_index < num_features_; ++feature_index) { + OMP_LOOP_EX_BEGIN(); + // the feature that are supposed to be used are computed + if (!is_feature_used_[feature_index]) + continue; + if (!histogram_array_[feature_index].is_splittable()) { + continue; + } + + // loop through the features to find the best one just like in the + // FindBestSplitsFromHistograms function + const int tid = omp_get_thread_num(); + int real_fidx = train_data_->RealFeatureIndex(feature_index); + + // if the monotone precise mode is disabled or if the constraints have to be + // updated, + // but are not exclusively worse, then we update the constraints and the + // best split + if (!config_->monotone_precise_mode || + (constraints_per_leaf_[leaf].ToBeUpdated(feature_index) && + !constraints_per_leaf_[leaf] + .AreActualConstraintsWorse(feature_index))) { + + SerialTreeLearner::ComputeBestSplitForFeature( + split.left_sum_gradient + split.right_sum_gradient, + split.left_sum_hessian + split.right_sum_hessian, + split.left_count + split.right_count, feature_index, histogram_array_, + bests, leaf, depth, tid, real_fidx, tree, config_, + current_constraints, constraints_per_leaf_, cegb_, true); + } else { + if (cegb_->splits_per_leaf_[leaf * train_data_->num_features() + + feature_index] > bests[tid]) { + bests[tid] = cegb_->splits_per_leaf_ + [leaf * train_data_->num_features() + feature_index]; + 
should_split_be_worse[tid] = + constraints_per_leaf_[leaf] + .AreActualConstraintsWorse(feature_index); + } + } + + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); + + auto best_idx = ArrayArgs::ArgMax(bests); + // if the best split that has been found previously actually doesn't have the + // true constraints + // but worse ones that were not computed before to optimize the computation + // time, + // then we update every split and every constraints that should be updated + if (should_split_be_worse[best_idx]) { + std::fill(bests.begin(), bests.end(), SplitInfo()); +#pragma omp parallel for schedule(static, 1024) if (num_features_ >= 2048) + for (int feature_index = 0; feature_index < num_features_; + ++feature_index) { + OMP_LOOP_EX_BEGIN(); + if (!is_feature_used_[feature_index]) + continue; + if (!histogram_array_[feature_index].is_splittable()) { + continue; + } + + const int tid = omp_get_thread_num(); + int real_fidx = train_data_->RealFeatureIndex(feature_index); + + if (constraints_per_leaf_[leaf] + .AreActualConstraintsWorse(feature_index)) { + ; + } else { +#ifdef DEBUG + CHECK(!constraints_per_leaf_[leaf].ToBeUpdated(feature_index)); +#endif + if (cegb_->splits_per_leaf_[leaf * train_data_->num_features() + + feature_index] > bests[tid]) { + bests[tid] = cegb_->splits_per_leaf_ + [leaf * train_data_->num_features() + feature_index]; + } + } + + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); + best_idx = ArrayArgs::ArgMax(bests); + } + + // note: the gains may differ for the same set of constraints due to the + // non-deterministic OMP reduction. 
+ split = bests[best_idx]; +} + +} // namespace LightGBM diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index 3a1a1af5387d..9794cea32bb0 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -2,9 +2,16 @@ #define LIGHTGBM_TREELEARNER_MONOTONE_CONSTRAINTS_H_ #include +#include +#include "split_info.hpp" +#include namespace LightGBM { +struct CostEfficientGradientBoosting; +struct CurrentConstraints; +class HistogramPool; + // the purpose of this structure is to store the constraints for one leaf // when the monotone precise mode is disabled, then it will just store // one min and one max constraint @@ -26,6 +33,74 @@ struct LeafConstraints { // available, so we didn't need to compute them yet, but we may need to in the future std::vector are_actual_constraints_worse; + static void GoUpToFindLeavesToUpdate( + const Tree *tree, int node_idx, std::vector &features, + std::vector &thresholds, std::vector &is_in_right_split, + int split_feature, const SplitInfo &split_info, + double previous_leaf_output, uint32_t split_threshold, + const Dataset *train_data_, const Config *config_, + CurrentConstraints ¤t_constraints, + std::vector &constraints_per_leaf_, + std::vector &best_split_per_leaf_, + const std::vector &is_feature_used_, int num_threads_, + int num_features_, HistogramPool &histogram_pool_, + std::unique_ptr &cegb_); + + static void GoUpToFindLeavesToUpdate( + const Tree *tree, int node_idx, int split_feature, + const SplitInfo &split_info, double previous_leaf_output, + uint32_t split_threshold, const Dataset *train_data_, + const Config *config_, CurrentConstraints ¤t_constraints, + std::vector &constraints_per_leaf_, + std::vector &best_split_per_leaf_, + const std::vector &is_feature_used_, int num_threads_, + int num_features_, HistogramPool &histogram_pool_, + std::unique_ptr &cegb_) { + int depth = tree->leaf_depth(~tree->left_child(node_idx)) - 1; + + 
std::vector features; + std::vector thresholds; + std::vector is_in_right_split; + + features.reserve(depth); + thresholds.reserve(depth); + is_in_right_split.reserve(depth); + + GoUpToFindLeavesToUpdate( + tree, node_idx, features, thresholds, is_in_right_split, split_feature, + split_info, previous_leaf_output, split_threshold, train_data_, config_, + current_constraints, constraints_per_leaf_, best_split_per_leaf_, + is_feature_used_, num_threads_, num_features_, histogram_pool_, cegb_); + } + + static void GoDownToFindLeavesToUpdate( + const Tree *tree, int node_idx, const std::vector &features, + const std::vector &thresholds, + const std::vector &is_in_right_split, int maximum, + int split_feature, const SplitInfo &split_info, + double previous_leaf_output, bool use_left_leaf, bool use_right_leaf, + uint32_t split_threshold, const Dataset *train_data_, + const Config *config_, CurrentConstraints ¤t_constraints, + std::vector &constraints_per_leaf_, + std::vector &best_split_per_leaf_, + const std::vector &is_feature_used_, int num_threads_, + int num_features_, HistogramPool &histogram_pool_, + std::unique_ptr &cegb_); + + static std::pair ShouldKeepGoingLeftRight( + const Tree *tree, int node_idx, const std::vector &features, + const std::vector &thresholds, + const std::vector &is_in_right_split, const Dataset *train_data_); + + static void UpdateBestSplitsFromHistograms( + SplitInfo &split, int leaf, int depth, const Tree *tree, + const Dataset *train_data_, const Config *config_, + CurrentConstraints ¤t_constraints, + std::vector &constraints_per_leaf_, + const std::vector &is_feature_used_, int num_threads_, + int num_features_, HistogramPool &histogram_pool_, + std::unique_ptr &cegb_); + bool IsInConstraints(double element, const std::vector > &constraints, std::vector &to_be_updated) { diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 4c1faf504c00..e0756483d152 100644 --- 
a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -123,22 +123,6 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian } current_constraints.Init(num_threads_, config_); - - features.resize(num_threads_); - is_in_right_split.resize(num_threads_); - thresholds.resize(num_threads_); - - for (int i = 0; i < num_threads_; ++i) { - if (!config_->monotone_constraints.empty()) { - // the number 100 has no real meaning here, same as before - features[i].reserve(std::max(100, config_->max_depth)); - is_in_right_split[i].reserve(std::max(100, config_->max_depth)); - thresholds[i].reserve(std::max(100, config_->max_depth)); - } - thresholds[i].clear(); - is_in_right_split[i].clear(); - features[i].clear(); - } } void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) { @@ -588,8 +572,8 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( smaller_leaf_splits_->num_data_in_leaf(), feature_index, smaller_leaf_histogram_array_, smaller_best, smaller_leaf_splits_->LeafIndex(), smaller_leaf_splits_->depth(), tid, - real_fidx, tree, train_data_, splits_per_leaf_, config_, - current_constraints, data_partition_, constraints_per_leaf_, + real_fidx, tree, config_, + current_constraints, constraints_per_leaf_, cegb_); // only has root leaf @@ -609,8 +593,8 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( larger_leaf_splits_->num_data_in_leaf(), feature_index, larger_leaf_histogram_array_, larger_best, larger_leaf_splits_->LeafIndex(), larger_leaf_splits_->depth(), tid, - real_fidx, tree, train_data_, splits_per_leaf_, config_, - current_constraints, data_partition_, constraints_per_leaf_, + real_fidx, tree, config_, + current_constraints, constraints_per_leaf_, cegb_); OMP_LOOP_EX_END(); @@ -915,134 +899,12 @@ void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* ri // values don't clash with existing constraints in the subtree, // and if they do, the existing splits 
need to be updated if (tree->leaf_is_in_monotone_subtree(*right_leaf)) { - GoUpToFindLeavesToUpdate(tree, tree->leaf_parent(*right_leaf), - inner_feature_index, best_split_info, - previous_leaf_output, best_split_info.threshold); - } -} - - -// this function checks if the original leaf and the children of the node that is -// currently being visited are contiguous, and if so, the children should be visited too -std::pair SerialTreeLearner::ShouldKeepGoingLeftRight( - const Tree *tree, int node_idx, const std::vector &features, - const std::vector &thresholds, - const std::vector &is_in_right_split) { - int inner_feature = tree->split_feature_inner(node_idx); - uint32_t threshold = tree->threshold_in_bin(node_idx); - bool is_split_numerical = train_data_->FeatureBinMapper(inner_feature) - ->bin_type() == BinType::NumericalBin; - - bool keep_going_right = true; - bool keep_going_left = true; - // we check if the left and right node are contiguous with the original leaf - // if so we should keep going down these nodes to update constraints - for (unsigned int i = 0; i < features.size(); ++i) { - if (features[i] == inner_feature) { - if (is_split_numerical) { - if (threshold >= thresholds[i] && !is_in_right_split[i]) { - keep_going_right = false; - } - if (threshold <= thresholds[i] && is_in_right_split[i]) { - keep_going_left = false; - } - } - } - } - return std::pair(keep_going_left, keep_going_right); -} - - -// at any point in time, for an index i, the constraint constraint[i] has to be valid on -// [threshold[i]: threshold[i + 1]) (or [threshold[i]: +inf) if i is the last index of the array) -// therefore, when a constraint is added on a leaf, it must be done very carefully -void SerialTreeLearner::UpdateConstraints( - std::vector > &constraints, - std::vector > &thresholds, double extremum, - uint32_t it_start, uint32_t it_end, int split_feature, int tid, - bool maximum) { - bool start_done = false; - bool end_done = false; - // one must always keep track of the 
previous constraint - // for example when adding a constraints cstr2 on thresholds [1:2), - // on an existing constraints cstr1 on thresholds [0, +inf), - // the thresholds and constraints must become - // [0, 1, 2] and [cstr1, cstr2, cstr1] - // so since we loop through thresholds only once, - // the previous constraint that still applies needs to be recorded - double previous_constraint; - double current_constraint; - for (unsigned int i = 0; i < thresholds[tid].size();) { - current_constraint = constraints[tid][i]; - // this is the easy case when the thresholds match - if (thresholds[tid][i] == it_start) { - constraints[tid][i] = (maximum) ? std::max(extremum, constraints[tid][i]) - : std::min(extremum, constraints[tid][i]); - start_done = true; - } - if (thresholds[tid][i] > it_start) { - // existing constraint is updated if there is a need for it - if (thresholds[tid][i] < it_end) { - constraints[tid][i] = (maximum) - ? std::max(extremum, constraints[tid][i]) - : std::min(extremum, constraints[tid][i]); - } - // when thresholds don't match, a new threshold - // and a new constraint may need to be inserted - if (!start_done) { - start_done = true; - if ((maximum && extremum > previous_constraint) || - (!maximum && extremum < previous_constraint)) { - constraints[tid].insert(constraints[tid].begin() + i, extremum); - thresholds[tid].insert(thresholds[tid].begin() + i, it_start); - i += 1; - } - } - } - // easy case when the thresholds match again - if (thresholds[tid][i] == it_end) { - end_done = true; - i += 1; - break; - } - // if they don't then, the previous constraint needs to be added back where the current one ends - if (thresholds[tid][i] > it_end) { - if (i != 0 && previous_constraint != constraints[tid][i - 1]) { - constraints[tid] - .insert(constraints[tid].begin() + i, previous_constraint); - thresholds[tid].insert(thresholds[tid].begin() + i, it_end); - } - end_done = true; - i += 1; - break; - } - // If 2 successive constraints are the same then 
the second one may as well be deleted - if (i != 0 && constraints[tid][i] == constraints[tid][i - 1]) { - constraints[tid].erase(constraints[tid].begin() + i); - thresholds[tid].erase(thresholds[tid].begin() + i); - previous_constraint = current_constraint; - i -= 1; - } - previous_constraint = current_constraint; - i += 1; - } - // if the loop didn't get to an index greater than it_start, it needs to be added at the end - if (!start_done) { - if ((maximum && extremum > constraints[tid].back()) || - (!maximum && extremum < constraints[tid].back())) { - constraints[tid].push_back(extremum); - thresholds[tid].push_back(it_start); - } else { - end_done = true; - } - } - // if we didn't get to an index after it_end, then the previous constraint needs to be set back - // unless it_end goes up to the last bin of the feature - if (!end_done && - static_cast(it_end) != train_data_->NumBin(split_feature) && - previous_constraint != constraints[tid].back()) { - constraints[tid].push_back(previous_constraint); - thresholds[tid].push_back(it_end); + LeafConstraints::GoUpToFindLeavesToUpdate( + tree, tree->leaf_parent(*right_leaf), inner_feature_index, + best_split_info, previous_leaf_output, best_split_info.threshold, + train_data_, config_, current_constraints, constraints_per_leaf_, + best_split_per_leaf_, is_feature_used_, num_threads_, num_features_, + histogram_pool_, cegb_); } } @@ -1086,290 +948,14 @@ void SerialTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj } } -// this function goes through the tree to find how the split that was just made is -// going to affect other leaves -void SerialTreeLearner::GoDownToFindLeavesToUpdate( - const Tree *tree, int node_idx, const std::vector &features, - const std::vector &thresholds, - const std::vector &is_in_right_split, int maximum, int split_feature, - const SplitInfo &split_info, double previous_leaf_output, - bool use_left_leaf, bool use_right_leaf, uint32_t split_threshold) { - if (node_idx < 0) { - 
int leaf_idx = ~node_idx; - - // if leaf is at max depth then there is no need to update it - int max_depth = config_->max_depth; - if (tree->leaf_depth(leaf_idx) >= max_depth && max_depth > 0) { - return; - } - - // splits that are not to be used shall not be updated - if (best_split_per_leaf_[leaf_idx].gain == kMinScore) { - return; - } - - std::pair min_max_constraints; - bool something_changed; - if (use_right_leaf && use_left_leaf) { - min_max_constraints = - std::minmax(split_info.right_output, split_info.left_output); - } else if (use_right_leaf && !use_left_leaf) { - min_max_constraints = std::pair(split_info.right_output, - split_info.right_output); - } else { - min_max_constraints = std::pair(split_info.left_output, - split_info.left_output); - } - -#ifdef DEBUG - if (maximum) { - CHECK(min_max_constraints.first >= tree->LeafOutput(leaf_idx)); - } else { - CHECK(min_max_constraints.second <= tree->LeafOutput(leaf_idx)); - } -#endif - - if (!config_->monotone_precise_mode) { - if (!maximum) { - something_changed = - constraints_per_leaf_[leaf_idx] - .SetMinConstraintAndReturnChange(min_max_constraints.second); - } else { - something_changed = - constraints_per_leaf_[leaf_idx] - .SetMaxConstraintAndReturnChange(min_max_constraints.first); - } - if (!something_changed) { - return; - } - } else { - if (!maximum) { - // both functions need to be called in this order - // because they modify the struct - something_changed = - constraints_per_leaf_[leaf_idx] - .CrossesMinConstraint(min_max_constraints.second); - something_changed = constraints_per_leaf_[leaf_idx] - .IsInMinConstraints(previous_leaf_output) || - something_changed; - } else { - // both functions need to be called in this order - // because they modify the struct - something_changed = - constraints_per_leaf_[leaf_idx] - .CrossesMaxConstraint(min_max_constraints.first); - something_changed = constraints_per_leaf_[leaf_idx] - .IsInMaxConstraints(previous_leaf_output) || - something_changed; - } - // 
if constraints have changed, then best splits need to be updated - // otherwise, we can just continue and go to the next split - if (!something_changed) { - return; - } - } - UpdateBestSplitsFromHistograms(best_split_per_leaf_[leaf_idx], leaf_idx, - tree->leaf_depth(leaf_idx), tree); - } else { - // check if the children are contiguous with the original leaf - std::pair keep_going_left_right = ShouldKeepGoingLeftRight( - tree, node_idx, features, thresholds, is_in_right_split); - int inner_feature = tree->split_feature_inner(node_idx); - uint32_t threshold = tree->threshold_in_bin(node_idx); - bool is_split_numerical = train_data_->FeatureBinMapper(inner_feature) - ->bin_type() == BinType::NumericalBin; - bool use_left_leaf_for_update = true; - bool use_right_leaf_for_update = true; - if (is_split_numerical && inner_feature == split_feature) { - if (threshold >= split_threshold) { - use_left_leaf_for_update = false; - } - if (threshold <= split_threshold) { - use_right_leaf_for_update = false; - } - } - - if (keep_going_left_right.first) { - GoDownToFindLeavesToUpdate( - tree, tree->left_child(node_idx), features, thresholds, - is_in_right_split, maximum, split_feature, split_info, - previous_leaf_output, use_left_leaf, - use_right_leaf_for_update && use_right_leaf, split_threshold); - } - if (keep_going_left_right.second) { - GoDownToFindLeavesToUpdate( - tree, tree->right_child(node_idx), features, thresholds, - is_in_right_split, maximum, split_feature, split_info, - previous_leaf_output, use_left_leaf_for_update && use_left_leaf, - use_right_leaf, split_threshold); - } - } -} - -// this function goes through the tree to find how the split that -// has just been performed is going to affect the constraints of other leaves -void SerialTreeLearner::GoUpToFindLeavesToUpdate( - const Tree *tree, int node_idx, std::vector &features, - std::vector &thresholds, std::vector &is_in_right_split, - int split_feature, const SplitInfo &split_info, double 
previous_leaf_output, - uint32_t split_threshold) { - int parent_idx = tree->node_parent(node_idx); - if (parent_idx != -1) { - int inner_feature = tree->split_feature_inner(parent_idx); - int8_t monotone_type = train_data_->FeatureMonotone(inner_feature); - bool is_right_split = tree->right_child(parent_idx) == node_idx; - bool split_contains_new_information = true; - bool is_split_numerical = train_data_->FeatureBinMapper(inner_feature) - ->bin_type() == BinType::NumericalBin; - - // only branches containing leaves that are contiguous to the original leaf need to be updated - for (unsigned int i = 0; i < features.size(); ++i) { - if ((features[i] == inner_feature && is_split_numerical) && - (is_in_right_split[i] == is_right_split)) { - split_contains_new_information = false; - break; - } - } - - if (split_contains_new_information) { - if (monotone_type != 0) { - int left_child_idx = tree->left_child(parent_idx); - int right_child_idx = tree->right_child(parent_idx); - bool left_child_is_curr_idx = (left_child_idx == node_idx); - int node_idx_to_pass = - (left_child_is_curr_idx) ? right_child_idx : left_child_idx; - bool take_min = (monotone_type < 0) ? 
left_child_is_curr_idx - : !left_child_is_curr_idx; - - GoDownToFindLeavesToUpdate(tree, node_idx_to_pass, features, thresholds, - is_in_right_split, take_min, split_feature, - split_info, previous_leaf_output, true, true, - split_threshold); - } - - is_in_right_split.push_back(tree->right_child(parent_idx) == node_idx); - thresholds.push_back(tree->threshold_in_bin(parent_idx)); - features.push_back(tree->split_feature_inner(parent_idx)); - } - - if (parent_idx != 0) { - GoUpToFindLeavesToUpdate(tree, parent_idx, features, thresholds, - is_in_right_split, split_feature, split_info, - previous_leaf_output, split_threshold); - } - } -} - -// this function updates the best split for each leaf -// it is called only when monotone constraints exist -void SerialTreeLearner::UpdateBestSplitsFromHistograms(SplitInfo &split, - int leaf, int depth, - const Tree *tree) { - std::vector bests(num_threads_); - std::vector should_split_be_worse(num_threads_, false); - - // the feature histogram is retrieved - FeatureHistogram *histogram_array_; - histogram_pool_.Get(leaf, &histogram_array_); - - OMP_INIT_EX(); -#pragma omp parallel for schedule(static, 1024) if (num_features_ >= 2048) - for (int feature_index = 0; feature_index < num_features_; ++feature_index) { - OMP_LOOP_EX_BEGIN(); - // the feature that are supposed to be used are computed - if (!is_feature_used_[feature_index]) - continue; - if (!histogram_array_[feature_index].is_splittable()) { - continue; - } - - // loop through the features to find the best one just like in the - // FindBestSplitsFromHistograms function - const int tid = omp_get_thread_num(); - int real_fidx = train_data_->RealFeatureIndex(feature_index); - - // if the monotone precise mode is disabled or if the constraints have to be updated, - // but are not exclusively worse, then we update the constraints and the best split - if (!config_->monotone_precise_mode || - (constraints_per_leaf_[leaf].ToBeUpdated(feature_index) && - 
!constraints_per_leaf_[leaf] - .AreActualConstraintsWorse(feature_index))) { - - ComputeBestSplitForFeature( - split.left_sum_gradient + split.right_sum_gradient, - split.left_sum_hessian + split.right_sum_hessian, - split.left_count + split.right_count, feature_index, histogram_array_, - bests, leaf, depth, tid, real_fidx, tree, train_data_, - splits_per_leaf_, config_, current_constraints, - data_partition_, constraints_per_leaf_, cegb_, true); - } else { - if (cegb_->splits_per_leaf_[leaf * train_data_->num_features() + feature_index] > - bests[tid]) { - bests[tid] = cegb_->splits_per_leaf_ - [leaf * train_data_->num_features() + feature_index]; - should_split_be_worse[tid] = - constraints_per_leaf_[leaf] - .AreActualConstraintsWorse(feature_index); - } - } - - OMP_LOOP_EX_END(); - } - OMP_THROW_EX(); - - auto best_idx = ArrayArgs::ArgMax(bests); - // if the best split that has been found previously actually doesn't have the true constraints - // but worse ones that were not computed before to optimize the computation time, - // then we update every split and every constraints that should be updated - if (should_split_be_worse[best_idx]) { - std::fill(bests.begin(), bests.end(), SplitInfo()); -#pragma omp parallel for schedule(static, 1024) if (num_features_ >= 2048) - for (int feature_index = 0; feature_index < num_features_; - ++feature_index) { - OMP_LOOP_EX_BEGIN(); - if (!is_feature_used_[feature_index]) - continue; - if (!histogram_array_[feature_index].is_splittable()) { - continue; - } - - const int tid = omp_get_thread_num(); - int real_fidx = train_data_->RealFeatureIndex(feature_index); - - if (constraints_per_leaf_[leaf] - .AreActualConstraintsWorse(feature_index)) { - ; - } else { -#ifdef DEBUG - CHECK(!constraints_per_leaf_[leaf].ToBeUpdated(feature_index)); -#endif - if (cegb_->splits_per_leaf_ - [leaf * train_data_->num_features() + feature_index] > - bests[tid]) { - bests[tid] = cegb_->splits_per_leaf_ - [leaf * train_data_->num_features() + 
feature_index]; - } - } - - OMP_LOOP_EX_END(); - } - OMP_THROW_EX(); - best_idx = ArrayArgs::ArgMax(bests); - } - - // note: the gains may differ for the same set of constraints due to the non-deterministic OMP reduction. - split = bests[best_idx]; -} - // this function computes the best split for a given leaf and a given feature void SerialTreeLearner::ComputeBestSplitForFeature( double sum_gradient, double sum_hessian, data_size_t num_data, int feature_index, FeatureHistogram *histogram_array_, std::vector &bests, int leaf_index, int depth, const int tid, - int real_fidx, const Tree *tree, const Dataset *train_data_, - std::vector &splits_per_leaf_, const Config *config_, + int real_fidx, const Tree *tree, + const Config *config_, CurrentConstraints ¤t_constraints, - std::unique_ptr &data_partition_, const std::vector &constraints_per_leaf_, std::unique_ptr& cegb_, bool update) { diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 75aa8521f7c9..bb7a0fdf0968 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -80,6 +80,16 @@ class SerialTreeLearner: public TreeLearner { void RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function residual_getter, data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const override; + static void ComputeBestSplitForFeature( + double sum_gradient, double sum_hessian, data_size_t num_data, + int feature_index, FeatureHistogram *histogram_array_, + std::vector &bests, int leaf_index, int depth, const int tid, + int real_fidx, const Tree *tree, + const Config *config_, + CurrentConstraints ¤t_constraints, + const std::vector &constraints_per_leaf_, + std::unique_ptr& cegb_, bool update = false); + protected: virtual std::vector GetUsedFeatures(bool is_tree_level); /*! 
@@ -100,9 +110,6 @@ class SerialTreeLearner: public TreeLearner { FindBestSplitsFromHistograms(const std::vector &is_feature_used, bool use_subtract, const Tree *tree); - virtual void UpdateBestSplitsFromHistograms(SplitInfo &split, int leaf, - int depth, const Tree *tree); - /*! * \brief Partition tree and data according best split. * \param tree Current tree, will be splitted on this function. @@ -129,80 +136,6 @@ class SerialTreeLearner: public TreeLearner { // std::unique_ptr &data_partition_, // const std::vector &feature_used_in_data, const Config *config_); - static void ComputeBestSplitForFeature( - double sum_gradient, double sum_hessian, data_size_t num_data, - int feature_index, FeatureHistogram *histogram_array_, - std::vector &bests, int leaf_index, int depth, const int tid, - int real_fidx, const Tree *tree, const Dataset *train_data_, - std::vector &splits_per_leaf_, const Config *config_, - CurrentConstraints ¤t_constraints, - std::unique_ptr &data_partition_, - const std::vector &constraints_per_leaf_, - std::unique_ptr& cegb_, bool update = false); - - void ComputeConstraintsPerThreshold(int feature, const Tree *tree, - int node_idx, unsigned int tid, - bool per_threshold, bool compute_min, - bool compute_max, uint32_t it_start, - uint32_t it_end); - - - static double ComputeMonotoneSplitGainPenalty(int depth, double penalization, - double epsilon = 1e-10); - - void GoDownToFindLeavesToUpdate(const Tree *tree, int node_idx, - const std::vector &features, - const std::vector &thresholds, - const std::vector &is_in_right_split, - int maximum, int split_feature, - const SplitInfo &split_info, - double previous_leaf_output, - bool use_left_leaf, bool use_right_leaf, - uint32_t split_threshold); - - /* Once we made a split, the constraints on other leaves may change. - We need to update them to remain coherent. 
*/ - void GoUpToFindLeavesToUpdate(const Tree *tree, int node_idx, - std::vector &features, - std::vector &thresholds, - std::vector &is_in_right_split, - int split_feature, const SplitInfo &split_info, - double previous_leaf_output, - uint32_t split_threshold); - - void GoUpToFindLeavesToUpdate(const Tree *tree, int node_idx, - int split_feature, const SplitInfo &split_info, - double previous_leaf_output, - uint32_t split_threshold) { - int depth = tree->leaf_depth(~tree->left_child(node_idx)) - 1; - - std::vector features; - std::vector thresholds; - std::vector is_in_right_split; - - features.reserve(depth); - thresholds.reserve(depth); - is_in_right_split.reserve(depth); - - GoUpToFindLeavesToUpdate(tree, node_idx, features, thresholds, - is_in_right_split, split_feature, split_info, - previous_leaf_output, split_threshold); - } - - std::pair - ShouldKeepGoingLeftRight(const Tree *tree, int node_idx, - const std::vector &features, - const std::vector &thresholds, - const std::vector &is_in_right_split); - - - void InitializeConstraints(unsigned int tid); - - void UpdateConstraints(std::vector > &constraints, - std::vector > &thresholds, - double extremum, uint32_t it_start, uint32_t it_end, - int split_feature, int tid, bool maximum); - /*! \brief number of data */ data_size_t num_data_; /*! \brief number of features */ @@ -274,10 +207,6 @@ class SerialTreeLearner: public TreeLearner { std::vector > max_constraints; CurrentConstraints current_constraints; - - std::vector > features; - std::vector > thresholds; - std::vector > is_in_right_split; }; inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leaf_idx) const { From e1125a9247c93f7343fd20f1111e8240243c28d0 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Fri, 27 Sep 2019 11:19:11 +0100 Subject: [PATCH 27/45] Moved another monotone-constraints-related function in the monotone constraints files. 
--- src/treelearner/monotone_constraints.cpp | 18 ++++++++++++++++++ src/treelearner/monotone_constraints.hpp | 5 +++++ src/treelearner/serial_tree_learner.cpp | 22 +++++----------------- 3 files changed, 28 insertions(+), 17 deletions(-) diff --git a/src/treelearner/monotone_constraints.cpp b/src/treelearner/monotone_constraints.cpp index 2252f1f63d8f..086887f13106 100644 --- a/src/treelearner/monotone_constraints.cpp +++ b/src/treelearner/monotone_constraints.cpp @@ -5,6 +5,24 @@ namespace LightGBM { +void LeafConstraints::SetChildrenConstraintsFastMethod( + std::vector &constraints_per_leaf, int *right_leaf, + int *left_leaf, int8_t monotone_type, double right_output, + double left_output, bool is_numerical_split) { + constraints_per_leaf[*right_leaf] = constraints_per_leaf[*left_leaf]; + if (is_numerical_split) { + // depending on the monotone type we set constraints on the future splits + // these constraints may be updated later in the algorithm + if (monotone_type < 0) { + constraints_per_leaf[*left_leaf].SetMinConstraint(right_output); + constraints_per_leaf[*right_leaf].SetMaxConstraint(left_output); + } else if (monotone_type > 0) { + constraints_per_leaf[*left_leaf].SetMaxConstraint(right_output); + constraints_per_leaf[*right_leaf].SetMinConstraint(left_output); + } + } +} + // this function goes through the tree to find how the split that // has just been performed is going to affect the constraints of other leaves void LeafConstraints::GoUpToFindLeavesToUpdate( diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index 9794cea32bb0..1e8c4b622af2 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -33,6 +33,11 @@ struct LeafConstraints { // available, so we didn't need to compute them yet, but we may need to in the future std::vector are_actual_constraints_worse; + static void SetChildrenConstraintsFastMethod( + std::vector &constraints_per_leaf, int 
*right_leaf, + int *left_leaf, int8_t monotone_type, double right_output, + double left_output, bool is_numerical_split); + static void GoUpToFindLeavesToUpdate( const Tree *tree, int node_idx, std::vector &features, std::vector &thresholds, std::vector &is_in_right_split, diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index e0756483d152..0e1be2526a50 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -877,28 +877,16 @@ void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* ri // the children of a leaf, but when it is enabled, one needs to go through the tree to do so, // and it is done directly before computing best splits if (!config_->monotone_precise_mode) { - constraints_per_leaf_[*right_leaf] = constraints_per_leaf_[*left_leaf]; - if (is_numerical_split) { - // depending on the monotone type we set constraints on the future splits - // these constraints may be updated later in the algorithm - if (best_split_info.monotone_type < 0) { - constraints_per_leaf_[*left_leaf] - .SetMinConstraint(best_split_info.right_output); - constraints_per_leaf_[*right_leaf] - .SetMaxConstraint(best_split_info.left_output); - } else if (best_split_info.monotone_type > 0) { - constraints_per_leaf_[*left_leaf] - .SetMaxConstraint(best_split_info.right_output); - constraints_per_leaf_[*right_leaf] - .SetMinConstraint(best_split_info.left_output); - } - } + LeafConstraints::SetChildrenConstraintsFastMethod( + constraints_per_leaf_, right_leaf, left_leaf, + best_split_info.monotone_type, best_split_info.right_output, + best_split_info.left_output, is_numerical_split); } // if there is a monotone split above, we need to make sure the new // values don't clash with existing constraints in the subtree, // and if they do, the existing splits need to be updated - if (tree->leaf_is_in_monotone_subtree(*right_leaf)) { + if (tree->leaf_is_in_monotone_subtree(*right_leaf) 
&& !config_->monotone_constraints.empty()) { LeafConstraints::GoUpToFindLeavesToUpdate( tree, tree->leaf_parent(*right_leaf), inner_feature_index, best_split_info, previous_leaf_output, best_split_info.threshold, From dbe74f2ddd016fd33d485da21b490b9db3bd9e50 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Fri, 27 Sep 2019 12:56:28 +0100 Subject: [PATCH 28/45] Added LearnerState structure to have functions with less arguments. --- src/treelearner/monotone_constraints.cpp | 157 +++++++++++------------ src/treelearner/monotone_constraints.hpp | 84 +++++++----- src/treelearner/serial_tree_learner.cpp | 84 ++++++------ src/treelearner/serial_tree_learner.h | 6 +- 4 files changed, 170 insertions(+), 161 deletions(-) diff --git a/src/treelearner/monotone_constraints.cpp b/src/treelearner/monotone_constraints.cpp index 086887f13106..b950de4d6eb5 100644 --- a/src/treelearner/monotone_constraints.cpp +++ b/src/treelearner/monotone_constraints.cpp @@ -26,24 +26,24 @@ void LeafConstraints::SetChildrenConstraintsFastMethod( // this function goes through the tree to find how the split that // has just been performed is going to affect the constraints of other leaves void LeafConstraints::GoUpToFindLeavesToUpdate( - const Tree *tree, int node_idx, std::vector &features, - std::vector &thresholds, std::vector &is_in_right_split, - int split_feature, const SplitInfo &split_info, double previous_leaf_output, - uint32_t split_threshold, const Dataset *train_data_, const Config *config_, - CurrentConstraints ¤t_constraints, - std::vector &constraints_per_leaf_, - std::vector &best_split_per_leaf_, + int node_idx, std::vector &features, std::vector &thresholds, + std::vector &is_in_right_split, int split_feature, + const SplitInfo &split_info, double previous_leaf_output, + uint32_t split_threshold, std::vector &best_split_per_leaf_, const std::vector &is_feature_used_, int num_threads_, int num_features_, HistogramPool &histogram_pool_, - std::unique_ptr &cegb_) { - int 
parent_idx = tree->node_parent(node_idx); + LearnerState &learner_state) { + int parent_idx = learner_state.tree->node_parent(node_idx); if (parent_idx != -1) { - int inner_feature = tree->split_feature_inner(parent_idx); - int8_t monotone_type = train_data_->FeatureMonotone(inner_feature); - bool is_right_split = tree->right_child(parent_idx) == node_idx; + int inner_feature = learner_state.tree->split_feature_inner(parent_idx); + int8_t monotone_type = + learner_state.train_data_->FeatureMonotone(inner_feature); + bool is_right_split = + learner_state.tree->right_child(parent_idx) == node_idx; bool split_contains_new_information = true; - bool is_split_numerical = train_data_->FeatureBinMapper(inner_feature) - ->bin_type() == BinType::NumericalBin; + bool is_split_numerical = + learner_state.train_data_->FeatureBinMapper(inner_feature) + ->bin_type() == BinType::NumericalBin; // only branches containing leaves that are contiguous to the original leaf // need to be updated @@ -57,8 +57,8 @@ void LeafConstraints::GoUpToFindLeavesToUpdate( if (split_contains_new_information) { if (monotone_type != 0) { - int left_child_idx = tree->left_child(parent_idx); - int right_child_idx = tree->right_child(parent_idx); + int left_child_idx = learner_state.tree->left_child(parent_idx); + int right_child_idx = learner_state.tree->right_child(parent_idx); bool left_child_is_curr_idx = (left_child_idx == node_idx); int node_idx_to_pass = (left_child_is_curr_idx) ? 
right_child_idx : left_child_idx; @@ -66,25 +66,24 @@ void LeafConstraints::GoUpToFindLeavesToUpdate( : !left_child_is_curr_idx; GoDownToFindLeavesToUpdate( - tree, node_idx_to_pass, features, thresholds, is_in_right_split, - take_min, split_feature, split_info, previous_leaf_output, true, - true, split_threshold, train_data_, config_, current_constraints, - constraints_per_leaf_, best_split_per_leaf_, is_feature_used_, - num_threads_, num_features_, histogram_pool_, cegb_); + node_idx_to_pass, features, thresholds, is_in_right_split, take_min, + split_feature, split_info, previous_leaf_output, true, true, + split_threshold, best_split_per_leaf_, is_feature_used_, + num_threads_, num_features_, histogram_pool_, learner_state); } - is_in_right_split.push_back(tree->right_child(parent_idx) == node_idx); - thresholds.push_back(tree->threshold_in_bin(parent_idx)); - features.push_back(tree->split_feature_inner(parent_idx)); + is_in_right_split.push_back(learner_state.tree->right_child(parent_idx) == + node_idx); + thresholds.push_back(learner_state.tree->threshold_in_bin(parent_idx)); + features.push_back(learner_state.tree->split_feature_inner(parent_idx)); } if (parent_idx != 0) { LeafConstraints::GoUpToFindLeavesToUpdate( - tree, parent_idx, features, thresholds, is_in_right_split, - split_feature, split_info, previous_leaf_output, split_threshold, - train_data_, config_, current_constraints, constraints_per_leaf_, + parent_idx, features, thresholds, is_in_right_split, split_feature, + split_info, previous_leaf_output, split_threshold, best_split_per_leaf_, is_feature_used_, num_threads_, num_features_, - histogram_pool_, cegb_); + histogram_pool_, learner_state); } } } @@ -93,24 +92,22 @@ void LeafConstraints::GoUpToFindLeavesToUpdate( // is // going to affect other leaves void LeafConstraints::GoDownToFindLeavesToUpdate( - const Tree *tree, int node_idx, const std::vector &features, + int node_idx, const std::vector &features, const std::vector &thresholds, const 
std::vector &is_in_right_split, int maximum, int split_feature, const SplitInfo &split_info, double previous_leaf_output, bool use_left_leaf, bool use_right_leaf, uint32_t split_threshold, - const Dataset *train_data_, const Config *config_, - CurrentConstraints ¤t_constraints, - std::vector &constraints_per_leaf_, std::vector &best_split_per_leaf_, const std::vector &is_feature_used_, int num_threads_, int num_features_, HistogramPool &histogram_pool_, - std::unique_ptr &cegb_) { + LearnerState &learner_state) { if (node_idx < 0) { int leaf_idx = ~node_idx; // if leaf is at max depth then there is no need to update it - int max_depth = config_->max_depth; - if (tree->leaf_depth(leaf_idx) >= max_depth && max_depth > 0) { + int max_depth = learner_state.config_->max_depth; + if (learner_state.tree->leaf_depth(leaf_idx) >= max_depth && + max_depth > 0) { return; } @@ -134,20 +131,22 @@ void LeafConstraints::GoDownToFindLeavesToUpdate( #ifdef DEBUG if (maximum) { - CHECK(min_max_constraints.first >= tree->LeafOutput(leaf_idx)); + CHECK(min_max_constraints.first >= + learner_state.tree->LeafOutput(leaf_idx)); } else { - CHECK(min_max_constraints.second <= tree->LeafOutput(leaf_idx)); + CHECK(min_max_constraints.second <= + learner_state.tree->LeafOutput(leaf_idx)); } #endif - if (!config_->monotone_precise_mode) { + if (!learner_state.config_->monotone_precise_mode) { if (!maximum) { something_changed = - constraints_per_leaf_[leaf_idx] + learner_state.constraints_per_leaf_[leaf_idx] .SetMinConstraintAndReturnChange(min_max_constraints.second); } else { something_changed = - constraints_per_leaf_[leaf_idx] + learner_state.constraints_per_leaf_[leaf_idx] .SetMaxConstraintAndReturnChange(min_max_constraints.first); } if (!something_changed) { @@ -158,18 +157,18 @@ void LeafConstraints::GoDownToFindLeavesToUpdate( // both functions need to be called in this order // because they modify the struct something_changed = - constraints_per_leaf_[leaf_idx] + 
learner_state.constraints_per_leaf_[leaf_idx] .CrossesMinConstraint(min_max_constraints.second); - something_changed = constraints_per_leaf_[leaf_idx] + something_changed = learner_state.constraints_per_leaf_[leaf_idx] .IsInMinConstraints(previous_leaf_output) || something_changed; } else { // both functions need to be called in this order // because they modify the struct something_changed = - constraints_per_leaf_[leaf_idx] + learner_state.constraints_per_leaf_[leaf_idx] .CrossesMaxConstraint(min_max_constraints.first); - something_changed = constraints_per_leaf_[leaf_idx] + something_changed = learner_state.constraints_per_leaf_[leaf_idx] .IsInMaxConstraints(previous_leaf_output) || something_changed; } @@ -180,17 +179,19 @@ void LeafConstraints::GoDownToFindLeavesToUpdate( } } UpdateBestSplitsFromHistograms( - best_split_per_leaf_[leaf_idx], leaf_idx, tree->leaf_depth(leaf_idx), - tree, train_data_, config_, current_constraints, constraints_per_leaf_, - is_feature_used_, num_threads_, num_features_, histogram_pool_, cegb_); + best_split_per_leaf_[leaf_idx], leaf_idx, + learner_state.tree->leaf_depth(leaf_idx), is_feature_used_, + num_threads_, num_features_, histogram_pool_, learner_state); } else { // check if the children are contiguous with the original leaf std::pair keep_going_left_right = ShouldKeepGoingLeftRight( - tree, node_idx, features, thresholds, is_in_right_split, train_data_); - int inner_feature = tree->split_feature_inner(node_idx); - uint32_t threshold = tree->threshold_in_bin(node_idx); - bool is_split_numerical = train_data_->FeatureBinMapper(inner_feature) - ->bin_type() == BinType::NumericalBin; + learner_state.tree, node_idx, features, thresholds, is_in_right_split, + learner_state.train_data_); + int inner_feature = learner_state.tree->split_feature_inner(node_idx); + uint32_t threshold = learner_state.tree->threshold_in_bin(node_idx); + bool is_split_numerical = + learner_state.train_data_->FeatureBinMapper(inner_feature) + ->bin_type() 
== BinType::NumericalBin; bool use_left_leaf_for_update = true; bool use_right_leaf_for_update = true; if (is_split_numerical && inner_feature == split_feature) { @@ -204,23 +205,21 @@ void LeafConstraints::GoDownToFindLeavesToUpdate( if (keep_going_left_right.first) { GoDownToFindLeavesToUpdate( - tree, tree->left_child(node_idx), features, thresholds, + learner_state.tree->left_child(node_idx), features, thresholds, is_in_right_split, maximum, split_feature, split_info, previous_leaf_output, use_left_leaf, use_right_leaf_for_update && use_right_leaf, split_threshold, - train_data_, config_, current_constraints, constraints_per_leaf_, best_split_per_leaf_, is_feature_used_, num_threads_, num_features_, - histogram_pool_, cegb_); + histogram_pool_, learner_state); } if (keep_going_left_right.second) { GoDownToFindLeavesToUpdate( - tree, tree->right_child(node_idx), features, thresholds, + learner_state.tree->right_child(node_idx), features, thresholds, is_in_right_split, maximum, split_feature, split_info, previous_leaf_output, use_left_leaf_for_update && use_left_leaf, - use_right_leaf, split_threshold, train_data_, config_, - current_constraints, constraints_per_leaf_, best_split_per_leaf_, + use_right_leaf, split_threshold, best_split_per_leaf_, is_feature_used_, num_threads_, num_features_, histogram_pool_, - cegb_); + learner_state); } } } @@ -260,13 +259,10 @@ std::pair LeafConstraints::ShouldKeepGoingLeftRight( // this function updates the best split for each leaf // it is called only when monotone constraints exist void LeafConstraints::UpdateBestSplitsFromHistograms( - SplitInfo &split, int leaf, int depth, const Tree *tree, - const Dataset *train_data_, const Config *config_, - CurrentConstraints ¤t_constraints, - std::vector &constraints_per_leaf_, + SplitInfo &split, int leaf, int depth, const std::vector &is_feature_used_, int num_threads_, int num_features_, HistogramPool &histogram_pool_, - std::unique_ptr &cegb_) { + LearnerState &learner_state) { 
std::vector bests(num_threads_); std::vector should_split_be_worse(num_threads_, false); @@ -288,30 +284,29 @@ void LeafConstraints::UpdateBestSplitsFromHistograms( // loop through the features to find the best one just like in the // FindBestSplitsFromHistograms function const int tid = omp_get_thread_num(); - int real_fidx = train_data_->RealFeatureIndex(feature_index); + int real_fidx = learner_state.train_data_->RealFeatureIndex(feature_index); // if the monotone precise mode is disabled or if the constraints have to be // updated, // but are not exclusively worse, then we update the constraints and the // best split - if (!config_->monotone_precise_mode || - (constraints_per_leaf_[leaf].ToBeUpdated(feature_index) && - !constraints_per_leaf_[leaf] + if (!learner_state.config_->monotone_precise_mode || + (learner_state.constraints_per_leaf_[leaf].ToBeUpdated(feature_index) && + !learner_state.constraints_per_leaf_[leaf] .AreActualConstraintsWorse(feature_index))) { SerialTreeLearner::ComputeBestSplitForFeature( split.left_sum_gradient + split.right_sum_gradient, split.left_sum_hessian + split.right_sum_hessian, split.left_count + split.right_count, feature_index, histogram_array_, - bests, leaf, depth, tid, real_fidx, tree, config_, - current_constraints, constraints_per_leaf_, cegb_, true); + bests, leaf, depth, tid, real_fidx, learner_state, true); } else { - if (cegb_->splits_per_leaf_[leaf * train_data_->num_features() + + if (learner_state.cegb_->splits_per_leaf_[leaf * learner_state.train_data_->num_features() + feature_index] > bests[tid]) { - bests[tid] = cegb_->splits_per_leaf_ - [leaf * train_data_->num_features() + feature_index]; + bests[tid] = learner_state.cegb_->splits_per_leaf_ + [leaf * learner_state.train_data_->num_features() + feature_index]; should_split_be_worse[tid] = - constraints_per_leaf_[leaf] + learner_state.constraints_per_leaf_[leaf] .AreActualConstraintsWorse(feature_index); } } @@ -339,19 +334,21 @@ void 
LeafConstraints::UpdateBestSplitsFromHistograms( } const int tid = omp_get_thread_num(); - int real_fidx = train_data_->RealFeatureIndex(feature_index); + int real_fidx = + learner_state.train_data_->RealFeatureIndex(feature_index); - if (constraints_per_leaf_[leaf] + if (learner_state.constraints_per_leaf_[leaf] .AreActualConstraintsWorse(feature_index)) { ; } else { #ifdef DEBUG - CHECK(!constraints_per_leaf_[leaf].ToBeUpdated(feature_index)); + CHECK(!learner_state.constraints_per_leaf_[leaf] + .ToBeUpdated(feature_index)); #endif - if (cegb_->splits_per_leaf_[leaf * train_data_->num_features() + + if (learner_state.cegb_->splits_per_leaf_[leaf * learner_state.train_data_->num_features() + feature_index] > bests[tid]) { - bests[tid] = cegb_->splits_per_leaf_ - [leaf * train_data_->num_features() + feature_index]; + bests[tid] = learner_state.cegb_->splits_per_leaf_ + [leaf * learner_state.train_data_->num_features() + feature_index]; } } diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index 1e8c4b622af2..12e87124b736 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -5,12 +5,34 @@ #include #include "split_info.hpp" #include +#include "data_partition.hpp" namespace LightGBM { struct CostEfficientGradientBoosting; struct CurrentConstraints; class HistogramPool; +struct LeafConstraints; + +struct LearnerState { + const Config *config_; + std::unique_ptr &data_partition_; + const Dataset *train_data_; + std::vector &constraints_per_leaf_; + const Tree *tree; + CurrentConstraints &current_constraints; + std::unique_ptr &cegb_; + + LearnerState(const Config *config_, + std::unique_ptr &data_partition_, + const Dataset *train_data_, + std::vector &constraints_per_leaf_, + const Tree *tree, CurrentConstraints &current_constraints, + std::unique_ptr &cegb_) + : config_(config_), data_partition_(data_partition_), + train_data_(train_data_),
constraints_per_leaf_(constraints_per_leaf_), + tree(tree), current_constraints(current_constraints), cegb_(cegb_) {}; +}; // the purpose of this structure is to store the constraints for one leaf // when the monotone precise mode is disabled, then it will just store @@ -20,8 +42,10 @@ class HistogramPool; struct LeafConstraints { std::vector > min_constraints; std::vector > max_constraints; - // the constraint number i is valid on the slice [thresholds[i]:threshold[i+1]) - // if threshold[i+1] does not exist, then it is valid for thresholds following threshold[i] + // the constraint number i is valid on the slice + // [thresholds[i]:threshold[i+1]) + // if threshold[i+1] does not exist, then it is valid for thresholds following + // threshold[i] std::vector > min_thresholds; std::vector > max_thresholds; // These 2 vectors keep track of which constraints over which features @@ -30,7 +54,8 @@ struct LeafConstraints { std::vector max_to_be_updated; // This vector keeps track of the constraints that we didn't update for some // features, because they could only be worse, and another better split was - // available, so we didn't need to compute them yet, but we may need to in the future + // available, so we didn't need to compute them yet, but we may need to in the + // future std::vector are_actual_constraints_worse; static void SetChildrenConstraintsFastMethod( @@ -39,29 +64,25 @@ struct LeafConstraints { double left_output, bool is_numerical_split); static void GoUpToFindLeavesToUpdate( - const Tree *tree, int node_idx, std::vector &features, + int node_idx, std::vector &features, std::vector &thresholds, std::vector &is_in_right_split, int split_feature, const SplitInfo &split_info, double previous_leaf_output, uint32_t split_threshold, - const Dataset *train_data_, const Config *config_, - CurrentConstraints ¤t_constraints, - std::vector &constraints_per_leaf_, std::vector &best_split_per_leaf_, const std::vector &is_feature_used_, int num_threads_, int 
num_features_, HistogramPool &histogram_pool_, - std::unique_ptr &cegb_); + LearnerState &learner_state); static void GoUpToFindLeavesToUpdate( - const Tree *tree, int node_idx, int split_feature, - const SplitInfo &split_info, double previous_leaf_output, - uint32_t split_threshold, const Dataset *train_data_, - const Config *config_, CurrentConstraints ¤t_constraints, - std::vector &constraints_per_leaf_, + int node_idx, int split_feature, const SplitInfo &split_info, + double previous_leaf_output, uint32_t split_threshold, std::vector &best_split_per_leaf_, const std::vector &is_feature_used_, int num_threads_, int num_features_, HistogramPool &histogram_pool_, - std::unique_ptr &cegb_) { - int depth = tree->leaf_depth(~tree->left_child(node_idx)) - 1; + LearnerState &learner_state) { + int depth = learner_state.tree->leaf_depth( + ~learner_state.tree->left_child(node_idx)) - + 1; std::vector features; std::vector thresholds; @@ -71,40 +92,35 @@ struct LeafConstraints { thresholds.reserve(depth); is_in_right_split.reserve(depth); - GoUpToFindLeavesToUpdate( - tree, node_idx, features, thresholds, is_in_right_split, split_feature, - split_info, previous_leaf_output, split_threshold, train_data_, config_, - current_constraints, constraints_per_leaf_, best_split_per_leaf_, - is_feature_used_, num_threads_, num_features_, histogram_pool_, cegb_); + GoUpToFindLeavesToUpdate(node_idx, features, thresholds, is_in_right_split, + split_feature, split_info, previous_leaf_output, + split_threshold, best_split_per_leaf_, + is_feature_used_, num_threads_, num_features_, + histogram_pool_, learner_state); } static void GoDownToFindLeavesToUpdate( - const Tree *tree, int node_idx, const std::vector &features, + int node_idx, const std::vector &features, const std::vector &thresholds, const std::vector &is_in_right_split, int maximum, int split_feature, const SplitInfo &split_info, double previous_leaf_output, bool use_left_leaf, bool use_right_leaf, - uint32_t split_threshold, 
const Dataset *train_data_, - const Config *config_, CurrentConstraints ¤t_constraints, - std::vector &constraints_per_leaf_, - std::vector &best_split_per_leaf_, + uint32_t split_threshold, std::vector &best_split_per_leaf_, const std::vector &is_feature_used_, int num_threads_, int num_features_, HistogramPool &histogram_pool_, - std::unique_ptr &cegb_); + LearnerState &learner_state); static std::pair ShouldKeepGoingLeftRight( const Tree *tree, int node_idx, const std::vector &features, const std::vector &thresholds, const std::vector &is_in_right_split, const Dataset *train_data_); - static void UpdateBestSplitsFromHistograms( - SplitInfo &split, int leaf, int depth, const Tree *tree, - const Dataset *train_data_, const Config *config_, - CurrentConstraints ¤t_constraints, - std::vector &constraints_per_leaf_, - const std::vector &is_feature_used_, int num_threads_, - int num_features_, HistogramPool &histogram_pool_, - std::unique_ptr &cegb_); + static void + UpdateBestSplitsFromHistograms(SplitInfo &split, int leaf, int depth, + const std::vector &is_feature_used_, + int num_threads_, int num_features_, + HistogramPool &histogram_pool_, + LearnerState &learner_state); bool IsInConstraints(double element, const std::vector > &constraints, diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 0e1be2526a50..b3387b048dd8 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -544,6 +544,9 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( #ifdef TIMETAG auto start_time = std::chrono::steady_clock::now(); #endif + LearnerState learner_state(config_, data_partition_, train_data_, + constraints_per_leaf_, tree, current_constraints, + cegb_); std::vector smaller_best(num_threads_); std::vector larger_best(num_threads_); std::vector smaller_node_used_features(num_features_, 1); @@ -566,15 +569,13 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( 
smaller_leaf_histogram_array_[feature_index].RawData()); int real_fidx = train_data_->RealFeatureIndex(feature_index); - ComputeBestSplitForFeature( - smaller_leaf_splits_->sum_gradients(), - smaller_leaf_splits_->sum_hessians(), - smaller_leaf_splits_->num_data_in_leaf(), feature_index, - smaller_leaf_histogram_array_, smaller_best, - smaller_leaf_splits_->LeafIndex(), smaller_leaf_splits_->depth(), tid, - real_fidx, tree, config_, - current_constraints, constraints_per_leaf_, - cegb_); + ComputeBestSplitForFeature(smaller_leaf_splits_->sum_gradients(), + smaller_leaf_splits_->sum_hessians(), + smaller_leaf_splits_->num_data_in_leaf(), + feature_index, smaller_leaf_histogram_array_, + smaller_best, smaller_leaf_splits_->LeafIndex(), + smaller_leaf_splits_->depth(), tid, real_fidx, + learner_state); // only has root leaf if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) { continue; } @@ -587,15 +588,13 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( larger_leaf_histogram_array_[feature_index].RawData()); } - ComputeBestSplitForFeature( - larger_leaf_splits_->sum_gradients(), - larger_leaf_splits_->sum_hessians(), - larger_leaf_splits_->num_data_in_leaf(), feature_index, - larger_leaf_histogram_array_, larger_best, - larger_leaf_splits_->LeafIndex(), larger_leaf_splits_->depth(), tid, - real_fidx, tree, config_, - current_constraints, constraints_per_leaf_, - cegb_); + ComputeBestSplitForFeature(larger_leaf_splits_->sum_gradients(), + larger_leaf_splits_->sum_hessians(), + larger_leaf_splits_->num_data_in_leaf(), + feature_index, larger_leaf_histogram_array_, + larger_best, larger_leaf_splits_->LeafIndex(), + larger_leaf_splits_->depth(), tid, real_fidx, + learner_state); OMP_LOOP_EX_END(); } @@ -887,12 +886,14 @@ void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* ri // values don't clash with existing constraints in the subtree, // and if they do, the existing splits need to be updated if 
(tree->leaf_is_in_monotone_subtree(*right_leaf) && !config_->monotone_constraints.empty()) { + LearnerState learner_state(config_, data_partition_, train_data_, + constraints_per_leaf_, tree, current_constraints, + cegb_); LeafConstraints::GoUpToFindLeavesToUpdate( - tree, tree->leaf_parent(*right_leaf), inner_feature_index, + tree->leaf_parent(*right_leaf), inner_feature_index, best_split_info, previous_leaf_output, best_split_info.threshold, - train_data_, config_, current_constraints, constraints_per_leaf_, - best_split_per_leaf_, is_feature_used_, num_threads_, num_features_, - histogram_pool_, cegb_); + best_split_per_leaf_, is_feature_used_, + num_threads_, num_features_, histogram_pool_, learner_state); } } @@ -941,46 +942,45 @@ void SerialTreeLearner::ComputeBestSplitForFeature( double sum_gradient, double sum_hessian, data_size_t num_data, int feature_index, FeatureHistogram *histogram_array_, std::vector &bests, int leaf_index, int depth, const int tid, - int real_fidx, const Tree *tree, - const Config *config_, - CurrentConstraints ¤t_constraints, - const std::vector &constraints_per_leaf_, - std::unique_ptr& cegb_, bool update) { + int real_fidx, LearnerState &learner_state, bool update) { - // if this is not a subtree stemming from a monotone split, then no constraint apply - if (tree->leaf_is_in_monotone_subtree(leaf_index)) { - if (!config_->monotone_precise_mode) { - current_constraints.Set(constraints_per_leaf_[leaf_index], tid); + // if this is not a subtree stemming from a monotone split, then no constraint + // apply + if (learner_state.tree->leaf_is_in_monotone_subtree(leaf_index)) { + if (!learner_state.config_->monotone_precise_mode) { + learner_state.current_constraints.Set(learner_state.constraints_per_leaf_[leaf_index], + tid); } } #ifdef DEBUG - current_constraints.CheckCoherenceWithLeafOutput(tree->LeafOutput(leaf_index), tid, kEpsilon) + learner_state.current_constraints.CheckCoherenceWithLeafOutput( + 
learner_state.tree->LeafOutput(leaf_index), tid, kEpsilon) #endif + SplitInfo new_split; + histogram_array_[feature_index] + .FindBestThreshold(sum_gradient, sum_hessian, num_data, &new_split, + learner_state.current_constraints[tid]); - SplitInfo new_split; - histogram_array_[feature_index].FindBestThreshold( - sum_gradient, sum_hessian, num_data, &new_split, current_constraints[tid]); - - if (tree->leaf_is_in_monotone_subtree(leaf_index)) { - current_constraints.InitializeConstraints(tid); + if (learner_state.tree->leaf_is_in_monotone_subtree(leaf_index)) { + learner_state.current_constraints.InitializeConstraints(tid); } new_split.feature = real_fidx; - if (cegb_ != nullptr) { - new_split.gain -= cegb_->DetlaGain(feature_index, real_fidx, leaf_index, num_data, new_split); + if (learner_state.cegb_ != nullptr) { + new_split.gain -= learner_state.cegb_->DetlaGain(feature_index, real_fidx, leaf_index, num_data, new_split); } if (new_split.monotone_type != 0) { - double penalty = LeafConstraints::ComputeMonotoneSplitGainPenalty(depth, config_->monotone_penalty); + double penalty = LeafConstraints::ComputeMonotoneSplitGainPenalty( + depth, learner_state.config_->monotone_penalty); new_split.gain *= penalty; } if (new_split > bests[tid]) { bests[tid] = new_split; } - } } // namespace LightGBM diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index bb7a0fdf0968..1c9dffccfaa7 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -84,11 +84,7 @@ class SerialTreeLearner: public TreeLearner { double sum_gradient, double sum_hessian, data_size_t num_data, int feature_index, FeatureHistogram *histogram_array_, std::vector &bests, int leaf_index, int depth, const int tid, - int real_fidx, const Tree *tree, - const Config *config_, - CurrentConstraints ¤t_constraints, - const std::vector &constraints_per_leaf_, - std::unique_ptr& cegb_, bool update = false); + int real_fidx, LearnerState 
&learner_state, bool update = false); protected: virtual std::vector GetUsedFeatures(bool is_tree_level); From f1717b6a2b57c245e3eb7719f9cd1f47c4dc6714 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Mon, 7 Oct 2019 07:49:52 +0100 Subject: [PATCH 29/45] Remove commented unused code. --- src/treelearner/serial_tree_learner.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 1c9dffccfaa7..3c80e31a9f22 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -127,11 +127,6 @@ class SerialTreeLearner: public TreeLearner { */ inline virtual data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const; -// static double CalculateOndemandCosts( -// int feature_index, int leaf_index, const Dataset *train_data_, -// std::unique_ptr &data_partition_, -// const std::vector &feature_used_in_data, const Config *config_); - /*! \brief number of data */ data_size_t num_data_; /*! \brief number of features */ From b0202e1bf2c45354c3ef1d2b1beeb05a95b3842a Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Mon, 7 Oct 2019 07:51:15 +0100 Subject: [PATCH 30/45] Removed unused variable splits_per_leaf_. 
--- src/treelearner/serial_tree_learner.cpp | 1 - src/treelearner/serial_tree_learner.h | 2 -- 2 files changed, 3 deletions(-) diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index b3387b048dd8..7a3d07bd89c0 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -82,7 +82,6 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian constraints_per_leaf_.resize(config_->num_leaves, LeafConstraints()); } - splits_per_leaf_.resize(config_->num_leaves*train_data_->num_features()); // get ordered bin train_data_->CreateOrderedBins(&ordered_bins_); diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 3c80e31a9f22..8093d0d2fbb1 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -156,8 +156,6 @@ class SerialTreeLearner: public TreeLearner { std::vector best_split_per_leaf_; std::vector constraints_per_leaf_; - /*! \brief store best split per feature for all leaves */ - std::vector splits_per_leaf_; /*! \brief stores best thresholds for all feature for smaller leaf */ std::unique_ptr smaller_leaf_splits_; From 54b5d479ddeba4605f9e6194e0176299098af294 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Tue, 22 Oct 2019 16:04:46 +0100 Subject: [PATCH 31/45] Remove duplicated function. 
--- include/LightGBM/dataset.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 3408cd6b70ef..18417adb4dc6 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -586,15 +586,6 @@ class Dataset { return bufs; } - // This function retrieves the number of bins for a specific feature - int NumBin(int feature_idx) const { - const int group = feature2group_[feature_idx]; - const int sub_feature = feature2subfeature_[feature_idx]; - const BinMapper *bin_mapper = - feature_groups_[group]->bin_mappers_[sub_feature].get(); - return bin_mapper->num_bin(); - } - void ResetConfig(const char* parameters); /*! \brief Get Number of data */ From 033de6f73f4ebcfe1a9c39063a552b216be1c62b Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Tue, 22 Oct 2019 16:49:22 +0100 Subject: [PATCH 32/45] Remove useless class members. --- src/treelearner/serial_tree_learner.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 8093d0d2fbb1..1026f0172060 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -190,11 +190,6 @@ class SerialTreeLearner: public TreeLearner { bool is_constant_hessian_; std::unique_ptr cegb_; - std::vector > dummy_min_constraints; - std::vector > min_constraints; - std::vector > dummy_max_constraints; - std::vector > max_constraints; - CurrentConstraints current_constraints; }; From 7272a352560653d15feb54ea8dee5fe2c35df90d Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Wed, 23 Oct 2019 16:48:26 +0100 Subject: [PATCH 33/45] Added a getter for splits_per_leaf_. 
--- .../cost_effective_gradient_boosting.hpp | 7 +++++-- src/treelearner/monotone_constraints.cpp | 18 ++++++++++-------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/treelearner/cost_effective_gradient_boosting.hpp b/src/treelearner/cost_effective_gradient_boosting.hpp index 2f71aa49fcac..dfca4da46477 100644 --- a/src/treelearner/cost_effective_gradient_boosting.hpp +++ b/src/treelearner/cost_effective_gradient_boosting.hpp @@ -20,8 +20,6 @@ namespace LightGBM { class CostEfficientGradientBoosting { public: - std::vector splits_per_leaf_; - explicit CostEfficientGradientBoosting(const SerialTreeLearner* tree_learner):tree_learner_(tree_learner) { } static bool IsEnable(const Config* config) { @@ -85,6 +83,10 @@ class CostEfficientGradientBoosting { } } + SplitInfo const & GetSplitInfo(int i) const { + return splits_per_leaf_[i]; + } + private: double CalculateOndemandCosts(int feature_index, int real_fidx, int leaf_index) const { if (tree_learner_->config_->cegb_penalty_feature_lazy.empty()) { @@ -108,6 +110,7 @@ class CostEfficientGradientBoosting { } const SerialTreeLearner* tree_learner_; + std::vector splits_per_leaf_; std::vector is_feature_used_in_split_; std::vector feature_used_in_data_; }; diff --git a/src/treelearner/monotone_constraints.cpp b/src/treelearner/monotone_constraints.cpp index b950de4d6eb5..7442063747c2 100644 --- a/src/treelearner/monotone_constraints.cpp +++ b/src/treelearner/monotone_constraints.cpp @@ -301,10 +301,11 @@ void LeafConstraints::UpdateBestSplitsFromHistograms( split.left_count + split.right_count, feature_index, histogram_array_, bests, leaf, depth, tid, real_fidx, learner_state, true); } else { - if (learner_state.cegb_->splits_per_leaf_[leaf * learner_state.train_data_->num_features() + - feature_index] > bests[tid]) { - bests[tid] = learner_state.cegb_->splits_per_leaf_ - [leaf * learner_state.train_data_->num_features() + feature_index]; + if (learner_state.cegb_->GetSplitInfo( + leaf * 
learner_state.train_data_->num_features() + + feature_index) > bests[tid]) { + bests[tid] = learner_state.cegb_->GetSplitInfo( + leaf * learner_state.train_data_->num_features() + feature_index); should_split_be_worse[tid] = learner_state.constraints_per_leaf_[leaf] .AreActualConstraintsWorse(feature_index); @@ -345,10 +346,11 @@ void LeafConstraints::UpdateBestSplitsFromHistograms( CHECK(!learner_state.constraints_per_leaf_[leaf] .ToBeUpdated(feature_index)); #endif - if (learner_state.cegb_->splits_per_leaf_[leaf * learner_state.train_data_->num_features() + - feature_index] > bests[tid]) { - bests[tid] = learner_state.cegb_->splits_per_leaf_ - [leaf * learner_state.train_data_->num_features() + feature_index]; + if (learner_state.cegb_->GetSplitInfo( + leaf * learner_state.train_data_->num_features() + + feature_index) > bests[tid]) { + bests[tid] = learner_state.cegb_->GetSplitInfo( + leaf * learner_state.train_data_->num_features() + feature_index); } } From dbc3f072bfe8c577b7107b592dc480f8f2e0c0c4 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Wed, 23 Oct 2019 16:51:35 +0100 Subject: [PATCH 34/45] Removed could_be_splittable_ useful only in the Slow method. --- src/treelearner/feature_histogram.hpp | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index a4610c3d0a08..2f3b12761e44 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -99,7 +99,6 @@ class FeatureHistogram { data_size_t num_data, SplitInfo *output, SplittingConstraints &constraints) { is_splittable_ = false; - could_be_splittable_ = false; double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step); double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; @@ -479,23 +478,13 @@ class FeatureHistogram { /*! 
* \brief True if this histogram can be splitted */ - bool is_splittable() { - // if the monotone precise mode is enabled, then, even if a leaf is not splittable right now, - // it may become splittable later, because it can be unconstrained by splits happening somewhere else in the tree - if (meta_->config->monotone_precise_mode && - meta_->bin_type == BinType::NumericalBin) { - return could_be_splittable_; - } else { - return is_splittable_; - } - } + bool is_splittable() { return is_splittable_; } /*! * \brief Set splittable to this histogram */ void set_is_splittable(bool val) { is_splittable_ = val; - could_be_splittable_ = val; } static double ThresholdL1(double s, double l1) { @@ -613,8 +602,6 @@ class FeatureHistogram { double sum_left_gradient = sum_gradient - sum_right_gradient; // current split gain - could_be_splittable_ = true; - // when the monotone precise mode in enabled, as t changes, the constraints applied on // each child may change, because the constraints may depend on thresholds constraints.UpdateIndices(dir, bias, t); @@ -691,8 +678,6 @@ class FeatureHistogram { // if sum hessian too small if (sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) break; - could_be_splittable_ = true; - double sum_right_gradient = sum_gradient - sum_left_gradient; constraints.UpdateIndices(1, bias, t); @@ -755,7 +740,6 @@ class FeatureHistogram { HistogramBinEntry* data_; // std::vector data_; bool is_splittable_ = true; - bool could_be_splittable_ = true; std::function find_best_threshold_fun_; From 9593c3c6586eed84060183d7169341704ff5c781 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Wed, 23 Oct 2019 16:52:29 +0100 Subject: [PATCH 35/45] Remove old comment. 
--- src/treelearner/feature_histogram.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 2f3b12761e44..0c326246e0fc 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -600,7 +600,6 @@ class FeatureHistogram { if (sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) break; double sum_left_gradient = sum_gradient - sum_right_gradient; - // current split gain // when the monotone precise mode in enabled, as t changes, the constraints applied on // each child may change, because the constraints may depend on thresholds From 6040cbd7234f3c86f813b1cbb7ee9ae89fd265b6 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Thu, 24 Oct 2019 07:38:29 +0100 Subject: [PATCH 36/45] Pass constraint class directly to GetSplitGains. --- src/treelearner/feature_histogram.hpp | 29 +++++++++-------------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 0c326246e0fc..521e9ddd4f52 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -181,11 +181,7 @@ class FeatureHistogram { double current_gain = GetSplitGains( sum_other_gradient, sum_other_hessian, data_[t].sum_gradients, data_[t].sum_hessians + kEpsilon, meta_->config->lambda_l1, l2, - meta_->config->max_delta_step, - constraints.CurrentMinConstraintRight(), - constraints.CurrentMaxConstraintRight(), - constraints.CurrentMinConstraintLeft(), - constraints.CurrentMaxConstraintLeft(), 0); + meta_->config->max_delta_step, constraints, 0); // gain with split is worse than without split if (current_gain <= min_gain_shift) continue; @@ -259,11 +255,7 @@ class FeatureHistogram { double current_gain = GetSplitGains( sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, l2, - meta_->config->max_delta_step, - 
constraints.CurrentMinConstraintRight(), - constraints.CurrentMaxConstraintRight(), - constraints.CurrentMinConstraintLeft(), - constraints.CurrentMaxConstraintLeft(), 0); + meta_->config->max_delta_step, constraints, 0); if (current_gain <= min_gain_shift) continue; is_splittable_ = true; @@ -522,10 +514,9 @@ class FeatureHistogram { static double GetSplitGains(double sum_left_gradients, double sum_left_hessians, double sum_right_gradients, double sum_right_hessians, double l1, double l2, double max_delta_step, - double min_constraint_right, double max_constraint_right, - double min_constraint_left, double max_constraint_left, int8_t monotone_constraint) { - double left_output = CalculateSplittedLeafOutput(sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step, min_constraint_left, max_constraint_left); - double right_output = CalculateSplittedLeafOutput(sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step, min_constraint_right, max_constraint_right); + const SplittingConstraints& constraints, int8_t monotone_constraint) { + double left_output = CalculateSplittedLeafOutput(sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step, constraints.CurrentMinConstraintLeft(), constraints.CurrentMaxConstraintLeft()); + double right_output = CalculateSplittedLeafOutput(sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step, constraints.CurrentMinConstraintRight(), constraints.CurrentMaxConstraintRight()); if (((monotone_constraint > 0) && (left_output > right_output)) || ((monotone_constraint < 0) && (left_output < right_output))) { return 0; @@ -612,10 +603,7 @@ class FeatureHistogram { sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - constraints.CurrentMinConstraintRight(), - constraints.CurrentMaxConstraintRight(), - constraints.CurrentMinConstraintLeft(), - constraints.CurrentMaxConstraintLeft(), meta_->monotone_type); + 
constraints, meta_->monotone_type); // gain with split is worse than without split if (current_gain <= min_gain_shift) continue; @@ -684,10 +672,7 @@ class FeatureHistogram { sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - constraints.CurrentMinConstraintRight(), - constraints.CurrentMaxConstraintRight(), - constraints.CurrentMinConstraintLeft(), - constraints.CurrentMaxConstraintLeft(), meta_->monotone_type); + constraints, meta_->monotone_type); // gain with split is worse than without split if (current_gain <= min_gain_shift) continue; From 3f10afe51413ac536d033c7c342d2ba5775f4e8d Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Thu, 24 Oct 2019 09:15:18 +0100 Subject: [PATCH 37/45] Switched SplittingConstraints from a reference to a pointer. --- .../data_parallel_tree_learner.cpp | 2 +- src/treelearner/feature_histogram.hpp | 52 +++++++++---------- src/treelearner/serial_tree_learner.cpp | 2 +- .../voting_parallel_tree_learner.cpp | 6 +-- 4 files changed, 31 insertions(+), 31 deletions(-) diff --git a/src/treelearner/data_parallel_tree_learner.cpp b/src/treelearner/data_parallel_tree_learner.cpp index 28778f9426cd..87073099e9ce 100644 --- a/src/treelearner/data_parallel_tree_learner.cpp +++ b/src/treelearner/data_parallel_tree_learner.cpp @@ -192,7 +192,7 @@ void DataParallelTreeLearner::FindBestSplitsFromHistograms( SplitInfo smaller_split; // find best threshold for smaller child // FIXME Fill the vectors with the actual constraints and thresholds - SplittingConstraints constraints; + SplittingConstraints *constraints; std::vector thresholds; this->smaller_leaf_histogram_array_[feature_index].FindBestThreshold( this->smaller_leaf_splits_->sum_gradients(), diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 521e9ddd4f52..0c13b2e407be 100644 --- a/src/treelearner/feature_histogram.hpp +++ 
b/src/treelearner/feature_histogram.hpp @@ -87,7 +87,7 @@ class FeatureHistogram { void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data, SplitInfo *output, - SplittingConstraints &constraints) { + SplittingConstraints *constraints) { output->default_left = true; output->gain = kMinScore; find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, @@ -97,7 +97,7 @@ class FeatureHistogram { void FindBestThresholdNumerical(double sum_gradient, double sum_hessian, data_size_t num_data, SplitInfo *output, - SplittingConstraints &constraints) { + SplittingConstraints *constraints) { is_splittable_ = false; double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step); @@ -105,7 +105,7 @@ class FeatureHistogram { // at this point, the following arrays contain the constraints applied on every part of the leaf // since we are splitting the leaf in 2, we can compute the cumulative / minimum maximum in both directions - constraints.ComputeCumulativeExtremums(); + constraints->ComputeCumulativeExtremums(); if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) { if (meta_->missing_type == MissingType::Zero) { @@ -138,7 +138,7 @@ class FeatureHistogram { void FindBestThresholdCategorical( double sum_gradient, double sum_hessian, data_size_t num_data, - SplitInfo *output, SplittingConstraints& constraints) { + SplitInfo *output, SplittingConstraints *constraints) { output->default_left = false; double best_gain = kMinScore; data_size_t best_left_count = 0; @@ -156,7 +156,7 @@ class FeatureHistogram { int best_threshold = -1; int best_dir = 1; - constraints.InitializeIndices(1); + constraints->InitializeIndices(1); if (use_onehot) { for (int t = 0; t < used_bin; ++t) { @@ -275,8 +275,8 @@ class FeatureHistogram { output->left_output = CalculateSplittedLeafOutput( best_sum_left_gradient, best_sum_left_hessian, meta_->config->lambda_l1, 
l2, meta_->config->max_delta_step, - constraints.CurrentMinConstraintLeft(), - constraints.CurrentMaxConstraintLeft()); + constraints->CurrentMinConstraintLeft(), + constraints->CurrentMaxConstraintLeft()); output->left_count = best_left_count; output->left_sum_gradient = best_sum_left_gradient; output->left_sum_hessian = best_sum_left_hessian - kEpsilon; @@ -284,8 +284,8 @@ class FeatureHistogram { sum_gradient - best_sum_left_gradient, sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - constraints.CurrentMinConstraintRight(), - constraints.CurrentMaxConstraintRight()); + constraints->CurrentMinConstraintRight(), + constraints->CurrentMaxConstraintRight()); output->right_count = num_data - best_left_count; output->right_sum_gradient = sum_gradient - best_sum_left_gradient; output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon; @@ -514,9 +514,9 @@ class FeatureHistogram { static double GetSplitGains(double sum_left_gradients, double sum_left_hessians, double sum_right_gradients, double sum_right_hessians, double l1, double l2, double max_delta_step, - const SplittingConstraints& constraints, int8_t monotone_constraint) { - double left_output = CalculateSplittedLeafOutput(sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step, constraints.CurrentMinConstraintLeft(), constraints.CurrentMaxConstraintLeft()); - double right_output = CalculateSplittedLeafOutput(sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step, constraints.CurrentMinConstraintRight(), constraints.CurrentMaxConstraintRight()); + const SplittingConstraints *constraints, int8_t monotone_constraint) { + double left_output = CalculateSplittedLeafOutput(sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step, constraints->CurrentMinConstraintLeft(), constraints->CurrentMaxConstraintLeft()); + double right_output = CalculateSplittedLeafOutput(sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step, 
constraints->CurrentMinConstraintRight(), constraints->CurrentMaxConstraintRight()); if (((monotone_constraint > 0) && (left_output > right_output)) || ((monotone_constraint < 0) && (left_output < right_output))) { return 0; @@ -545,7 +545,7 @@ class FeatureHistogram { data_size_t num_data, double min_gain_shift, SplitInfo *output, int dir, bool skip_default_bin, bool use_na_as_missing, - SplittingConstraints &constraints) { + SplittingConstraints *constraints) { const int8_t bias = meta_->bias; double best_sum_left_gradient = NAN; @@ -569,7 +569,7 @@ class FeatureHistogram { int t = meta_->num_bin - 1 - bias - use_na_as_missing; const int t_end = 1 - bias; - constraints.InitializeIndices(dir); + constraints->InitializeIndices(dir); // from right to left, and we don't need data in bin0 for (; t >= t_end; --t) { @@ -594,7 +594,7 @@ class FeatureHistogram { // when the monotone precise mode in enabled, as t changes, the constraints applied on // each child may change, because the constraints may depend on thresholds - constraints.UpdateIndices(dir, bias, t); + constraints->UpdateIndices(dir, bias, t); // when the algorithm goes through the thresholds we use the same index for cumulative arrays // in both directions but each leaf is constrained according to the corresponding array @@ -618,10 +618,10 @@ class FeatureHistogram { best_threshold = static_cast(t - 1 + bias); best_gain = current_gain; - best_min_constraint_right = constraints.CurrentMinConstraintRight(); - best_max_constraint_right = constraints.CurrentMaxConstraintRight(); - best_min_constraint_left = constraints.CurrentMinConstraintLeft(); - best_max_constraint_left = constraints.CurrentMaxConstraintLeft(); + best_min_constraint_right = constraints->CurrentMinConstraintRight(); + best_max_constraint_right = constraints->CurrentMaxConstraintRight(); + best_min_constraint_left = constraints->CurrentMinConstraintLeft(); + best_max_constraint_left = constraints->CurrentMaxConstraintLeft(); } } } else { @@ 
-644,7 +644,7 @@ class FeatureHistogram { t = -1; } - constraints.InitializeIndices(dir); + constraints->InitializeIndices(dir); for (; t <= t_end; ++t) { // need to skip default bin @@ -667,7 +667,7 @@ class FeatureHistogram { double sum_right_gradient = sum_gradient - sum_left_gradient; - constraints.UpdateIndices(1, bias, t); + constraints->UpdateIndices(1, bias, t); double current_gain = GetSplitGains( sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, @@ -686,10 +686,10 @@ class FeatureHistogram { best_threshold = static_cast(t + bias); best_gain = current_gain; - best_min_constraint_right = constraints.CurrentMinConstraintRight(); - best_max_constraint_right = constraints.CurrentMaxConstraintRight(); - best_min_constraint_left = constraints.CurrentMinConstraintLeft(); - best_max_constraint_left = constraints.CurrentMaxConstraintLeft(); + best_min_constraint_right = constraints->CurrentMinConstraintRight(); + best_max_constraint_right = constraints->CurrentMaxConstraintRight(); + best_min_constraint_left = constraints->CurrentMinConstraintLeft(); + best_max_constraint_left = constraints->CurrentMaxConstraintLeft(); } } } @@ -726,7 +726,7 @@ class FeatureHistogram { bool is_splittable_ = true; std::function find_best_threshold_fun_; + SplittingConstraints *)> find_best_threshold_fun_; }; class HistogramPool { public: diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 7a3d07bd89c0..91e47c09dd08 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -959,7 +959,7 @@ void SerialTreeLearner::ComputeBestSplitForFeature( SplitInfo new_split; histogram_array_[feature_index] .FindBestThreshold(sum_gradient, sum_hessian, num_data, &new_split, - learner_state.current_constraints[tid]); + &learner_state.current_constraints[tid]); if (learner_state.tree->leaf_is_in_monotone_subtree(leaf_index)) { 
learner_state.current_constraints.InitializeConstraints(tid); diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp index 8b99e63272b0..265f2546e726 100644 --- a/src/treelearner/voting_parallel_tree_learner.cpp +++ b/src/treelearner/voting_parallel_tree_learner.cpp @@ -304,7 +304,7 @@ void VotingParallelTreeLearner::FindBestSplits(const Tree* tree) this->smaller_leaf_histogram_array_[feature_index].RawData()); // FIXME Fill the vectors with the actual constraints and thresholds - SplittingConstraints constraints; + SplittingConstraints *constraints; this->smaller_leaf_histogram_array_[feature_index] .FindBestThreshold(this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians(), @@ -416,7 +416,7 @@ void VotingParallelTreeLearner::FindBestSplitsFromHistograms( // find best threshold // FIXME Fill the vectors with the actual constraints and thresholds - SplittingConstraints constraints; + SplittingConstraints *constraints; smaller_leaf_histogram_array_global_[feature_index].FindBestThreshold( smaller_leaf_splits_global_->sum_gradients(), smaller_leaf_splits_global_->sum_hessians(), @@ -440,7 +440,7 @@ void VotingParallelTreeLearner::FindBestSplitsFromHistograms( // find best threshold // FIXME Fill the vectors with the actual constraints and thresholds - SplittingConstraints constraints; + SplittingConstraints *constraints; larger_leaf_histogram_array_global_[feature_index].FindBestThreshold( larger_leaf_splits_global_->sum_gradients(), From ed61bab7fc9971599efa126c0d10085fb618101e Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Thu, 24 Oct 2019 09:44:37 +0100 Subject: [PATCH 38/45] Grouped the best_constraints from feature_histogram in a class. 
--- src/treelearner/feature_histogram.hpp | 21 ++++++--------------- src/treelearner/monotone_constraints.hpp | 21 +++++++++++++++++++++ 2 files changed, 27 insertions(+), 15 deletions(-) diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 0c13b2e407be..6adf68fe0904 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -553,10 +553,7 @@ class FeatureHistogram { // when the monotone precise mode is enabled, then the left and the right children may not // have the same min and max constraints because constraints can depend on the thresholds - double best_min_constraint_left = NAN; - double best_max_constraint_left = NAN; - double best_min_constraint_right = NAN; - double best_max_constraint_right = NAN; + BestConstraints best_constraints = BestConstraints(); double best_gain = kMinScore; data_size_t best_left_count = 0; @@ -618,10 +615,7 @@ class FeatureHistogram { best_threshold = static_cast(t - 1 + bias); best_gain = current_gain; - best_min_constraint_right = constraints->CurrentMinConstraintRight(); - best_max_constraint_right = constraints->CurrentMaxConstraintRight(); - best_min_constraint_left = constraints->CurrentMinConstraintLeft(); - best_max_constraint_left = constraints->CurrentMaxConstraintLeft(); + best_constraints.Update(constraints); } } } else { @@ -686,10 +680,7 @@ class FeatureHistogram { best_threshold = static_cast(t + bias); best_gain = current_gain; - best_min_constraint_right = constraints->CurrentMinConstraintRight(); - best_max_constraint_right = constraints->CurrentMaxConstraintRight(); - best_min_constraint_left = constraints->CurrentMinConstraintLeft(); - best_max_constraint_left = constraints->CurrentMaxConstraintLeft(); + best_constraints.Update(constraints); } } } @@ -700,8 +691,8 @@ class FeatureHistogram { output->left_output = CalculateSplittedLeafOutput( best_sum_left_gradient, best_sum_left_hessian, meta_->config->lambda_l1, 
meta_->config->lambda_l2, - meta_->config->max_delta_step, best_min_constraint_left, - best_max_constraint_left); + meta_->config->max_delta_step, best_constraints.best_min_constraint_left, + best_constraints.best_max_constraint_left); output->left_count = best_left_count; output->left_sum_gradient = best_sum_left_gradient; output->left_sum_hessian = best_sum_left_hessian - kEpsilon; @@ -709,7 +700,7 @@ class FeatureHistogram { sum_gradient - best_sum_left_gradient, sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - best_min_constraint_right, best_max_constraint_right); + best_constraints.best_min_constraint_right, best_constraints.best_max_constraint_right); output->right_count = num_data - best_left_count; output->right_sum_gradient = sum_gradient - best_sum_left_gradient; output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon; diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index 12e87124b736..8a169a7af7dc 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -587,5 +587,26 @@ struct CurrentConstraints { } }; +struct BestConstraints { + double best_min_constraint_right; + double best_max_constraint_right; + double best_min_constraint_left; + double best_max_constraint_left; + + void Init() { + best_min_constraint_left = NAN; + best_max_constraint_left = NAN; + best_min_constraint_right = NAN; + best_max_constraint_right = NAN; + } + + void Update(SplittingConstraints *constraints) { + best_min_constraint_right = constraints->CurrentMinConstraintRight(); + best_max_constraint_right = constraints->CurrentMaxConstraintRight(); + best_min_constraint_left = constraints->CurrentMinConstraintLeft(); + best_max_constraint_left = constraints->CurrentMaxConstraintLeft(); + } +}; + } // namespace LightGBM #endif // LightGBM_TREELEARNER_MONOTONE_CONSTRAINTS_H_ From 
f6eb56ae82e7f3a4b9ca64e74cb6e0be9b3b5d45 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Thu, 24 Oct 2019 16:55:02 +0100 Subject: [PATCH 39/45] Changed constraints to a nullptr when there are no monotone constraints. --- src/treelearner/feature_histogram.hpp | 58 ++++++++++++++++++++----- src/treelearner/serial_tree_learner.cpp | 12 +++-- 2 files changed, 55 insertions(+), 15 deletions(-) diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 6adf68fe0904..c529b499a8eb 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -105,7 +105,9 @@ class FeatureHistogram { // at this point, the following arrays contain the constraints applied on every part of the leaf // since we are splitting the leaf in 2, we can compute the cumulative / minimum maximum in both directions - constraints->ComputeCumulativeExtremums(); + if (constraints != nullptr) { + constraints->ComputeCumulativeExtremums(); + } if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) { if (meta_->missing_type == MissingType::Zero) { @@ -156,7 +158,9 @@ class FeatureHistogram { int best_threshold = -1; int best_dir = 1; - constraints->InitializeIndices(1); + if (constraints != nullptr) { + constraints->InitializeIndices(1); + } if (use_onehot) { for (int t = 0; t < used_bin; ++t) { @@ -272,11 +276,22 @@ class FeatureHistogram { } if (is_splittable_) { + double current_min_constraint_left = -std::numeric_limits::max(); + double current_max_constraint_left = std::numeric_limits::max(); + double current_min_constraint_right = -std::numeric_limits::max(); + double current_max_constraint_right = std::numeric_limits::max(); + if (constraints != nullptr) { + current_min_constraint_left = constraints->CurrentMinConstraintLeft(); + current_max_constraint_left = constraints->CurrentMaxConstraintLeft(); + current_min_constraint_right = constraints->CurrentMinConstraintRight(); + current_max_constraint_right = 
constraints->CurrentMaxConstraintRight(); + } + output->left_output = CalculateSplittedLeafOutput( best_sum_left_gradient, best_sum_left_hessian, meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - constraints->CurrentMinConstraintLeft(), - constraints->CurrentMaxConstraintLeft()); + current_min_constraint_left, + current_max_constraint_left); output->left_count = best_left_count; output->left_sum_gradient = best_sum_left_gradient; output->left_sum_hessian = best_sum_left_hessian - kEpsilon; @@ -284,8 +299,8 @@ class FeatureHistogram { sum_gradient - best_sum_left_gradient, sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - constraints->CurrentMinConstraintRight(), - constraints->CurrentMaxConstraintRight()); + current_min_constraint_right, + current_max_constraint_right); output->right_count = num_data - best_left_count; output->right_sum_gradient = sum_gradient - best_sum_left_gradient; output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon; @@ -515,8 +530,18 @@ class FeatureHistogram { double sum_right_gradients, double sum_right_hessians, double l1, double l2, double max_delta_step, const SplittingConstraints *constraints, int8_t monotone_constraint) { - double left_output = CalculateSplittedLeafOutput(sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step, constraints->CurrentMinConstraintLeft(), constraints->CurrentMaxConstraintLeft()); - double right_output = CalculateSplittedLeafOutput(sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step, constraints->CurrentMinConstraintRight(), constraints->CurrentMaxConstraintRight()); + double current_min_constraint_left = -std::numeric_limits::max(); + double current_max_constraint_left = std::numeric_limits::max(); + double current_min_constraint_right = -std::numeric_limits::max(); + double current_max_constraint_right = std::numeric_limits::max(); + if (constraints != nullptr) { + current_min_constraint_left = 
constraints->CurrentMinConstraintLeft(); + current_max_constraint_left = constraints->CurrentMaxConstraintLeft(); + current_min_constraint_right = constraints->CurrentMinConstraintRight(); + current_max_constraint_right = constraints->CurrentMaxConstraintRight(); + } + double left_output = CalculateSplittedLeafOutput(sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step, current_min_constraint_left, current_max_constraint_left); + double right_output = CalculateSplittedLeafOutput(sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step, current_min_constraint_right, current_max_constraint_right); if (((monotone_constraint > 0) && (left_output > right_output)) || ((monotone_constraint < 0) && (left_output < right_output))) { return 0; @@ -566,7 +591,10 @@ class FeatureHistogram { int t = meta_->num_bin - 1 - bias - use_na_as_missing; const int t_end = 1 - bias; - constraints->InitializeIndices(dir); + + if (constraints != nullptr) { + constraints->InitializeIndices(dir); + } // from right to left, and we don't need data in bin0 for (; t >= t_end; --t) { @@ -591,7 +619,9 @@ class FeatureHistogram { // when the monotone precise mode in enabled, as t changes, the constraints applied on // each child may change, because the constraints may depend on thresholds - constraints->UpdateIndices(dir, bias, t); + if (constraints != nullptr) { + constraints->UpdateIndices(dir, bias, t); + } // when the algorithm goes through the thresholds we use the same index for cumulative arrays // in both directions but each leaf is constrained according to the corresponding array @@ -638,7 +668,9 @@ class FeatureHistogram { t = -1; } - constraints->InitializeIndices(dir); + if (constraints != nullptr) { + constraints->InitializeIndices(dir); + } for (; t <= t_end; ++t) { // need to skip default bin @@ -661,7 +693,9 @@ class FeatureHistogram { double sum_right_gradient = sum_gradient - sum_left_gradient; - constraints->UpdateIndices(1, bias, t); + if (constraints != nullptr) 
{ + constraints->UpdateIndices(1, bias, t); + } double current_gain = GetSplitGains( sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 91e47c09dd08..f9bd2087d3d1 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -957,9 +957,15 @@ void SerialTreeLearner::ComputeBestSplitForFeature( learner_state.tree->LeafOutput(leaf_index), tid, kEpsilon) #endif SplitInfo new_split; - histogram_array_[feature_index] - .FindBestThreshold(sum_gradient, sum_hessian, num_data, &new_split, - &learner_state.current_constraints[tid]); + + SplittingConstraints *constraints; + if (learner_state.config_->monotone_constraints.empty()) { + constraints = nullptr; + } else { + constraints = &learner_state.current_constraints[tid]; + } + histogram_array_[feature_index].FindBestThreshold( + sum_gradient, sum_hessian, num_data, &new_split, constraints); if (learner_state.tree->leaf_is_in_monotone_subtree(leaf_index)) { learner_state.current_constraints.InitializeConstraints(tid); From eb2c41268037274347219d54c0ae3d1285ea7242 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Thu, 31 Oct 2019 15:00:38 +0000 Subject: [PATCH 40/45] Changed data_partition from unique_ptr to regular pointer. 
--- src/treelearner/monotone_constraints.hpp | 4 ++-- src/treelearner/serial_tree_learner.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index 8a169a7af7dc..70c217c8db3d 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -16,7 +16,7 @@ struct LeafConstraints; struct LearnerState { const Config *config_; - std::unique_ptr &data_partition_; + const DataPartition *data_partition_; const Dataset *train_data_; std::vector &constraints_per_leaf_; const Tree *tree; @@ -24,7 +24,7 @@ struct LearnerState { std::unique_ptr &cegb_; LearnerState(const Config *config_, - std::unique_ptr &data_partition_, + const DataPartition *data_partition_, const Dataset *train_data_, std::vector &constraints_per_leaf_, const Tree *tree, CurrentConstraints ¤t_constraints, diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index f9bd2087d3d1..563ec04ecc1e 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -543,7 +543,7 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( #ifdef TIMETAG auto start_time = std::chrono::steady_clock::now(); #endif - LearnerState learner_state(config_, data_partition_, train_data_, + LearnerState learner_state(config_, data_partition_.get(), train_data_, constraints_per_leaf_, tree, current_constraints, cegb_); std::vector smaller_best(num_threads_); @@ -885,7 +885,7 @@ void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* ri // values don't clash with existing constraints in the subtree, // and if they do, the existing splits need to be updated if (tree->leaf_is_in_monotone_subtree(*right_leaf) && !config_->monotone_constraints.empty()) { - LearnerState learner_state(config_, data_partition_, train_data_, + LearnerState learner_state(config_, data_partition_.get(), train_data_, 
constraints_per_leaf_, tree, current_constraints, cegb_); LeafConstraints::GoUpToFindLeavesToUpdate( From 8d6ef502cc8f5ecb348d3eab85844e9ae793fe57 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Fri, 1 Nov 2019 08:19:48 +0000 Subject: [PATCH 41/45] Splitted Splitting constraints in left SplittingConstraint and RightSplittingConstraint. --- src/treelearner/feature_histogram.hpp | 16 +-- src/treelearner/monotone_constraints.hpp | 170 ++++++++++++----------- 2 files changed, 97 insertions(+), 89 deletions(-) diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index c529b499a8eb..cc7055e29d53 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -281,10 +281,10 @@ class FeatureHistogram { double current_min_constraint_right = -std::numeric_limits::max(); double current_max_constraint_right = std::numeric_limits::max(); if (constraints != nullptr) { - current_min_constraint_left = constraints->CurrentMinConstraintLeft(); - current_max_constraint_left = constraints->CurrentMaxConstraintLeft(); - current_min_constraint_right = constraints->CurrentMinConstraintRight(); - current_max_constraint_right = constraints->CurrentMaxConstraintRight(); + current_min_constraint_left = constraints->left.GetCurrentMinConstraint(); + current_max_constraint_left = constraints->left.GetCurrentMaxConstraint(); + current_min_constraint_right = constraints->right.GetCurrentMinConstraint(); + current_max_constraint_right = constraints->right.GetCurrentMaxConstraint(); } output->left_output = CalculateSplittedLeafOutput( @@ -535,10 +535,10 @@ class FeatureHistogram { double current_min_constraint_right = -std::numeric_limits::max(); double current_max_constraint_right = std::numeric_limits::max(); if (constraints != nullptr) { - current_min_constraint_left = constraints->CurrentMinConstraintLeft(); - current_max_constraint_left = constraints->CurrentMaxConstraintLeft(); - current_min_constraint_right = 
constraints->CurrentMinConstraintRight(); - current_max_constraint_right = constraints->CurrentMaxConstraintRight(); + current_min_constraint_left = constraints->left.GetCurrentMinConstraint(); + current_max_constraint_left = constraints->left.GetCurrentMaxConstraint(); + current_min_constraint_right = constraints->right.GetCurrentMinConstraint(); + current_max_constraint_right = constraints->right.GetCurrentMaxConstraint(); } double left_output = CalculateSplittedLeafOutput(sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step, current_min_constraint_left, current_max_constraint_left); double right_output = CalculateSplittedLeafOutput(sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step, current_min_constraint_right, current_max_constraint_right); diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index 70c217c8db3d..aad6c8fc295b 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -327,7 +327,61 @@ struct LeafConstraints { } }; +struct SplittingConstraint { + std::vector cumulative_min_constraint; + std::vector cumulative_max_constraint; + + unsigned int index_min_constraint; + unsigned int index_max_constraint; + + SplittingConstraint() {} + + SplittingConstraint(std::vector cumulative_min_constraint, + std::vector cumulative_max_constraint) { + this->cumulative_min_constraint = cumulative_min_constraint; + this->cumulative_max_constraint = cumulative_max_constraint; + } + + double GetCurrentMinConstraint() const { + return cumulative_min_constraint[index_min_constraint]; + } + + double GetCurrentMaxConstraint() const { + return cumulative_max_constraint[index_max_constraint]; + } + + void Reserve(int space_to_reserve) { + cumulative_max_constraint.reserve(space_to_reserve); + cumulative_min_constraint.reserve(space_to_reserve); + } + + void InitializeConstraints() { + cumulative_max_constraint.resize(1); + cumulative_min_constraint.resize(1); + 
cumulative_min_constraint[0] = -std::numeric_limits::max(); + cumulative_max_constraint[0] = std::numeric_limits::max(); + } + + void InitializeIndices(int dir, int min_size, int max_size) { + if (dir == -1) { + index_min_constraint = min_size; + index_max_constraint = max_size; + } else { + index_min_constraint = 0; + index_max_constraint = 0; + } + } + + void Set(const LeafConstraints &leaf_constraints) { + cumulative_min_constraint[0] = leaf_constraints.min_constraints[0][0]; + cumulative_max_constraint[0] = leaf_constraints.max_constraints[0][0]; + } +}; + struct SplittingConstraints { + SplittingConstraint right; + SplittingConstraint left; + std::vector cumulative_min_constraint_right_to_left; std::vector cumulative_max_constraint_right_to_left; std::vector cumulative_min_constraint_left_to_right; @@ -351,14 +405,8 @@ struct SplittingConstraints { std::vector &cumulative_max_constraint_left_to_right, std::vector &thresholds_min_constraints, std::vector &thresholds_max_constraints) { - this->cumulative_min_constraint_right_to_left = - cumulative_min_constraint_right_to_left; - this->cumulative_min_constraint_left_to_right = - cumulative_min_constraint_left_to_right; - this->cumulative_max_constraint_right_to_left = - cumulative_max_constraint_right_to_left; - this->cumulative_max_constraint_left_to_right = - cumulative_max_constraint_left_to_right; + right = SplittingConstraint(cumulative_min_constraint_right_to_left, cumulative_max_constraint_right_to_left); + left = SplittingConstraint(cumulative_min_constraint_left_to_right, cumulative_max_constraint_left_to_right); this->thresholds_min_constraints = thresholds_min_constraints; this->thresholds_max_constraints = thresholds_max_constraints; @@ -390,25 +438,18 @@ struct SplittingConstraints { const double &(*min)(const double &, const double &) = std::min; const double &(*max)(const double &, const double &) = std::max; - CumulativeExtremum(max, true, cumulative_min_constraint_left_to_right); - 
CumulativeExtremum(max, false, cumulative_min_constraint_right_to_left); - CumulativeExtremum(min, true, cumulative_max_constraint_left_to_right); - CumulativeExtremum(min, false, cumulative_max_constraint_right_to_left); + CumulativeExtremum(max, true, left.cumulative_min_constraint); + CumulativeExtremum(max, false, right.cumulative_min_constraint); + CumulativeExtremum(min, true, left.cumulative_max_constraint); + CumulativeExtremum(min, false, right.cumulative_max_constraint); } void InitializeIndices(int dir) { + right.InitializeIndices(dir, thresholds_min_constraints.size() - 1, thresholds_max_constraints.size() - 1); + left.InitializeIndices(dir, thresholds_min_constraints.size() - 1, thresholds_max_constraints.size() - 1); if (dir == -1) { - index_min_constraint_left_to_right = thresholds_min_constraints.size() - 1; - index_min_constraint_right_to_left = thresholds_min_constraints.size() - 1; - index_max_constraint_left_to_right = thresholds_max_constraints.size() - 1; - index_max_constraint_right_to_left = thresholds_max_constraints.size() - 1; update_is_necessary = !(thresholds_max_constraints.size() == 1 && thresholds_min_constraints.size() == 1); - } else { - index_min_constraint_left_to_right = 0; - index_min_constraint_right_to_left = 0; - index_max_constraint_left_to_right = 0; - index_max_constraint_right_to_left = 0; } } @@ -419,77 +460,55 @@ struct SplittingConstraints { static_cast( thresholds_min_constraints[index_min_constraint_left_to_right]) > t + bias - 1) { - index_min_constraint_left_to_right -= 1; + left.index_min_constraint -= 1; } while ( static_cast( thresholds_min_constraints[index_min_constraint_right_to_left]) > t + bias) { - index_min_constraint_right_to_left -= 1; + right.index_min_constraint -= 1; } while ( static_cast( thresholds_max_constraints[index_max_constraint_left_to_right]) > t + bias - 1) { - index_max_constraint_left_to_right -= 1; + left.index_max_constraint -= 1; } while ( static_cast( 
thresholds_max_constraints[index_max_constraint_right_to_left]) > t + bias) { - index_max_constraint_right_to_left -= 1; + right.index_max_constraint -= 1; } } #ifdef DEBUG - CHECK(index_min_constraint_left_to_right < + CHECK(left.index_min_constraint < thresholds_min_constraint.size()); - CHECK(index_min_constraint_right_to_left < + CHECK(right.index_min_constraint < thresholds_min_constraint.size()); - CHECK(index_max_constraint_left_to_right < + CHECK(left.index_max_constraint < thresholds_max_constraint.size()); - CHECK(index_max_constraint_right_to_left < + CHECK(right.index_max_constraint < thresholds_max_constraint.size()); #endif } else { // current split gain #ifdef DEBUG - CHECK(index_min_constraint_left_to_right < + CHECK(left.index_min_constraint < thresholds_min_constraint.size()); - CHECK(index_min_constraint_right_to_left < + CHECK(right.index_min_constraint < thresholds_min_constraint.size()); - CHECK(index_max_constraint_left_to_right < + CHECK(left.index_max_constraint < thresholds_max_constraint.size()); - CHECK(index_max_constraint_right_to_left < + CHECK(right.index_max_constraint < thresholds_max_constraint.size()); #endif } } - double CurrentMinConstraintRight() const { - return cumulative_min_constraint_right_to_left - [index_min_constraint_right_to_left]; - } - - double CurrentMaxConstraintRight() const { - return cumulative_max_constraint_right_to_left - [index_max_constraint_right_to_left]; - } - - double CurrentMinConstraintLeft() const { - return cumulative_min_constraint_left_to_right - [index_min_constraint_left_to_right]; - } - - double CurrentMaxConstraintLeft() const { - return cumulative_max_constraint_left_to_right - [index_max_constraint_left_to_right]; - } - void Reserve(int space_to_reserve) { - cumulative_max_constraint_right_to_left.reserve(space_to_reserve); - cumulative_max_constraint_left_to_right.reserve(space_to_reserve); - cumulative_min_constraint_right_to_left.reserve(space_to_reserve); - 
cumulative_min_constraint_left_to_right.reserve(space_to_reserve); + right.Reserve(space_to_reserve); + left.Reserve(space_to_reserve); thresholds_max_constraints.reserve(space_to_reserve); thresholds_min_constraints.reserve(space_to_reserve); } @@ -498,40 +517,29 @@ struct SplittingConstraints { thresholds_min_constraints.resize(1); thresholds_max_constraints.resize(1); - cumulative_min_constraint_right_to_left.resize(1); - cumulative_min_constraint_left_to_right.resize(1); - cumulative_max_constraint_right_to_left.resize(1); - cumulative_max_constraint_left_to_right.resize(1); - - cumulative_min_constraint_right_to_left[0] = -std::numeric_limits::max(); - cumulative_min_constraint_left_to_right[0] = -std::numeric_limits::max(); - cumulative_max_constraint_right_to_left[0] = std::numeric_limits::max(); - cumulative_max_constraint_left_to_right[0] = std::numeric_limits::max(); - thresholds_min_constraints[0] = 0; thresholds_max_constraints[0] = 0; + + right.InitializeConstraints(); + left.InitializeConstraints(); } void Set(const LeafConstraints &leaf_constraints) { - cumulative_min_constraint_right_to_left[0] = leaf_constraints.min_constraints[0][0]; - cumulative_max_constraint_right_to_left[0] = leaf_constraints.max_constraints[0][0]; - - cumulative_min_constraint_left_to_right[0] = leaf_constraints.min_constraints[0][0]; - cumulative_max_constraint_left_to_right[0] = leaf_constraints.max_constraints[0][0]; - + right.Set(leaf_constraints); + left.Set(leaf_constraints); thresholds_min_constraints[0] = leaf_constraints.min_thresholds[0][0]; thresholds_max_constraints[0] = leaf_constraints.max_thresholds[0][0]; } void CheckCoherenceWithLeafOutput(double leaf_output, double EPS) { - CHECK(cumulative_min_constraint_left_to_right == cumulative_min_constraint_right_to_left); - CHECK(cumulative_max_constraint_left_to_right == cumulative_max_constraint_right_to_left); - for (const auto &x : cumulative_max_constraint_left_to_right) { + CHECK(left.cumulative_min_constraint 
== right.cumulative_min_constraint); + CHECK(left.cumulative_max_constraint == right.cumulative_max_constraint); + for (const auto &x : left.cumulative_max_constraint) { CHECK(leaf_output <= EPS + x); CHECK(x > -std::numeric_limits::max()); } - for (const auto &x : cumulative_min_constraint_right_to_left) { + for (const auto &x : right.cumulative_min_constraint) { CHECK(leaf_output + EPS >= x); CHECK(x < std::numeric_limits::max()); } @@ -601,10 +609,10 @@ struct BestConstraints { } void Update(SplittingConstraints *constraints) { - best_min_constraint_right = constraints->CurrentMinConstraintRight(); - best_max_constraint_right = constraints->CurrentMaxConstraintRight(); - best_min_constraint_left = constraints->CurrentMinConstraintLeft(); - best_max_constraint_left = constraints->CurrentMaxConstraintLeft(); + best_min_constraint_right = constraints->right.GetCurrentMinConstraint(); + best_max_constraint_right = constraints->right.GetCurrentMaxConstraint(); + best_min_constraint_left = constraints->left.GetCurrentMinConstraint(); + best_max_constraint_left = constraints->left.GetCurrentMaxConstraint(); } }; From ceae2bf4b9b83d5dc1ea90c06ab5e0670a696243 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Fri, 1 Nov 2019 10:20:47 +0000 Subject: [PATCH 42/45] Constraints classes are now passed to CalculateSplittedLeafOutput instead of doubles. 
--- src/treelearner/feature_histogram.hpp | 58 ++++++++++++------------ src/treelearner/monotone_constraints.hpp | 49 ++++++++++++++------ 2 files changed, 63 insertions(+), 44 deletions(-) diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index cc7055e29d53..9c3361a1a46d 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -276,22 +276,21 @@ class FeatureHistogram { } if (is_splittable_) { - double current_min_constraint_left = -std::numeric_limits::max(); - double current_max_constraint_left = std::numeric_limits::max(); - double current_min_constraint_right = -std::numeric_limits::max(); - double current_max_constraint_right = std::numeric_limits::max(); + SplittingConstraint *left; + SplittingConstraint *right; if (constraints != nullptr) { - current_min_constraint_left = constraints->left.GetCurrentMinConstraint(); - current_max_constraint_left = constraints->left.GetCurrentMaxConstraint(); - current_min_constraint_right = constraints->right.GetCurrentMinConstraint(); - current_max_constraint_right = constraints->right.GetCurrentMaxConstraint(); + left = &(constraints->left); + right = &(constraints-> right); + } + else { + left = new SplittingConstraint(); + right = new SplittingConstraint(); } output->left_output = CalculateSplittedLeafOutput( best_sum_left_gradient, best_sum_left_hessian, meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - current_min_constraint_left, - current_max_constraint_left); + left); output->left_count = best_left_count; output->left_sum_gradient = best_sum_left_gradient; output->left_sum_hessian = best_sum_left_hessian - kEpsilon; @@ -299,8 +298,7 @@ class FeatureHistogram { sum_gradient - best_sum_left_gradient, sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - current_min_constraint_right, - current_max_constraint_right); + right); output->right_count = num_data - best_left_count; 
output->right_sum_gradient = sum_gradient - best_sum_left_gradient; output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon; @@ -514,13 +512,14 @@ class FeatureHistogram { * \param sum_hessians * \return leaf output */ + template static double CalculateSplittedLeafOutput(double sum_gradients, double sum_hessians, double l1, double l2, double max_delta_step, - double min_constraint, double max_constraint) { + const T *constraint) { double ret = CalculateSplittedLeafOutput(sum_gradients, sum_hessians, l1, l2, max_delta_step); - if (ret < min_constraint) { - ret = min_constraint; - } else if (ret > max_constraint) { - ret = max_constraint; + if (ret < constraint->GetCurrentMinConstraint()) { + ret = constraint->GetCurrentMinConstraint(); + } else if (ret > constraint->GetCurrentMaxConstraint()) { + ret = constraint->GetCurrentMaxConstraint(); } return ret; } @@ -530,18 +529,18 @@ class FeatureHistogram { double sum_right_gradients, double sum_right_hessians, double l1, double l2, double max_delta_step, const SplittingConstraints *constraints, int8_t monotone_constraint) { - double current_min_constraint_left = -std::numeric_limits::max(); - double current_max_constraint_left = std::numeric_limits::max(); - double current_min_constraint_right = -std::numeric_limits::max(); - double current_max_constraint_right = std::numeric_limits::max(); + const SplittingConstraint *left; + const SplittingConstraint *right; if (constraints != nullptr) { - current_min_constraint_left = constraints->left.GetCurrentMinConstraint(); - current_max_constraint_left = constraints->left.GetCurrentMaxConstraint(); - current_min_constraint_right = constraints->right.GetCurrentMinConstraint(); - current_max_constraint_right = constraints->right.GetCurrentMaxConstraint(); + left = &(constraints->left); + right = &(constraints->right); + } + else { + left = new SplittingConstraint(); + right = new SplittingConstraint(); } - double left_output = 
CalculateSplittedLeafOutput(sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step, current_min_constraint_left, current_max_constraint_left); - double right_output = CalculateSplittedLeafOutput(sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step, current_min_constraint_right, current_max_constraint_right); + double left_output = CalculateSplittedLeafOutput(sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step, left); + double right_output = CalculateSplittedLeafOutput(sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step, right); if (((monotone_constraint > 0) && (left_output > right_output)) || ((monotone_constraint < 0) && (left_output < right_output))) { return 0; @@ -725,8 +724,7 @@ class FeatureHistogram { output->left_output = CalculateSplittedLeafOutput( best_sum_left_gradient, best_sum_left_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, - meta_->config->max_delta_step, best_constraints.best_min_constraint_left, - best_constraints.best_max_constraint_left); + meta_->config->max_delta_step, &best_constraints.left); output->left_count = best_left_count; output->left_sum_gradient = best_sum_left_gradient; output->left_sum_hessian = best_sum_left_hessian - kEpsilon; @@ -734,7 +732,7 @@ class FeatureHistogram { sum_gradient - best_sum_left_gradient, sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - best_constraints.best_min_constraint_right, best_constraints.best_max_constraint_right); + &best_constraints.right); output->right_count = num_data - best_left_count; output->right_sum_gradient = sum_gradient - best_sum_left_gradient; output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon; diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index aad6c8fc295b..9f69a0aef4d1 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -334,7 
+334,13 @@ struct SplittingConstraint { unsigned int index_min_constraint; unsigned int index_max_constraint; - SplittingConstraint() {} + SplittingConstraint() { + index_min_constraint = 0; + index_max_constraint = 0; + + cumulative_min_constraint = std::vector<double>(1, -std::numeric_limits<double>::max()); + cumulative_max_constraint = std::vector<double>(1, std::numeric_limits<double>::max()); + } SplittingConstraint(std::vector<double> cumulative_min_constraint, std::vector<double> cumulative_max_constraint) { @@ -595,24 +601,39 @@ struct CurrentConstraints { } }; -struct BestConstraints { - double best_min_constraint_right; - double best_max_constraint_right; - double best_min_constraint_left; - double best_max_constraint_left; +struct BestConstraint { + double best_min_constraint; + double best_max_constraint; void Init() { - best_min_constraint_left = NAN; - best_max_constraint_left = NAN; - best_min_constraint_right = NAN; - best_max_constraint_right = NAN; + best_min_constraint = NAN; + best_max_constraint = NAN; + } + + void Update(const SplittingConstraint &constraint) { + best_min_constraint = constraint.GetCurrentMinConstraint(); + best_max_constraint = constraint.GetCurrentMaxConstraint(); + } + + // Named after SplittingConstraint to make a template to group the 2 + double GetCurrentMinConstraint() const { + return best_min_constraint; } + double GetCurrentMaxConstraint() const { + return best_max_constraint; + } +}; + +struct BestConstraints { + BestConstraint right; + BestConstraint left; + + void Init() {} + + void Update(SplittingConstraints *constraints) { - best_min_constraint_right = constraints->right.GetCurrentMinConstraint(); - best_max_constraint_right = constraints->right.GetCurrentMaxConstraint(); - best_min_constraint_left = constraints->left.GetCurrentMinConstraint(); - best_max_constraint_left = constraints->left.GetCurrentMaxConstraint(); + right.Update(constraints->right); + left.Update(constraints->left); } }; From b63279142446eec11f55ee24d7d2ec8a079f7f8a Mon Sep 17 00:00:00 
2001 From: Charles Auguste Date: Fri, 1 Nov 2019 10:57:20 +0000 Subject: [PATCH 43/45] Fix bug. --- src/treelearner/feature_histogram.hpp | 8 ++++++-- src/treelearner/monotone_constraints.hpp | 8 +++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 9c3361a1a46d..ea67e169e597 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -644,7 +644,9 @@ class FeatureHistogram { best_threshold = static_cast<uint32_t>(t - 1 + bias); best_gain = current_gain; - best_constraints.Update(constraints); + if (constraints != nullptr) { + best_constraints.Update(constraints); + } } } } else { @@ -713,7 +715,9 @@ class FeatureHistogram { best_threshold = static_cast<uint32_t>(t + bias); best_gain = current_gain; - best_constraints.Update(constraints); + if (constraints != nullptr) { + best_constraints.Update(constraints); + } } } } diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index 9f69a0aef4d1..a5fa8f2a4cc3 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -605,9 +605,9 @@ struct BestConstraint { double best_min_constraint; double best_max_constraint; - void Init() { - best_min_constraint = NAN; - best_max_constraint = NAN; + BestConstraint() { + best_min_constraint = -std::numeric_limits<double>::max(); + best_max_constraint = std::numeric_limits<double>::max(); } void Update(const SplittingConstraint &constraint) { @@ -629,8 +629,6 @@ struct BestConstraints { BestConstraint right; BestConstraint left; - void Init() {} - void Update(SplittingConstraints *constraints) { right.Update(constraints->right); left.Update(constraints->left); From d291b99634c2547ea28204c463a2a8b92e985a39 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Wed, 18 Dec 2019 13:26:01 +0000 Subject: [PATCH 44/45] Use nullptr's when there are no constraints. 
--- src/treelearner/feature_histogram.hpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index ea67e169e597..35cc51fc16a7 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -283,8 +283,8 @@ class FeatureHistogram { right = &(constraints-> right); } else { - left = new SplittingConstraint(); - right = new SplittingConstraint(); + left = nullptr; + right = nullptr; } output->left_output = CalculateSplittedLeafOutput( @@ -516,10 +516,12 @@ class FeatureHistogram { static double CalculateSplittedLeafOutput(double sum_gradients, double sum_hessians, double l1, double l2, double max_delta_step, const T *constraint) { double ret = CalculateSplittedLeafOutput(sum_gradients, sum_hessians, l1, l2, max_delta_step); - if (ret < constraint->GetCurrentMinConstraint()) { - ret = constraint->GetCurrentMinConstraint(); - } else if (ret > constraint->GetCurrentMaxConstraint()) { - ret = constraint->GetCurrentMaxConstraint(); + if (constraint != nullptr) { + if (ret < constraint->GetCurrentMinConstraint()) { + ret = constraint->GetCurrentMinConstraint(); + } else if (ret > constraint->GetCurrentMaxConstraint()) { + ret = constraint->GetCurrentMaxConstraint(); + } } return ret; } @@ -536,8 +538,8 @@ class FeatureHistogram { right = &(constraints->right); } else { - left = new SplittingConstraint(); - right = new SplittingConstraint(); + left = nullptr; + right = nullptr; } double left_output = CalculateSplittedLeafOutput(sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step, left); double right_output = CalculateSplittedLeafOutput(sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step, right); From c0f1cc0517fc146cbdff72a586504bb211593109 Mon Sep 17 00:00:00 2001 From: Charles Auguste Date: Thu, 19 Dec 2019 14:15:38 +0000 Subject: [PATCH 45/45] Clarified an if statement. 
--- src/treelearner/serial_tree_learner.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 563ec04ecc1e..1f358ac8aa3e 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -884,7 +884,7 @@ void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* ri // if there is a monotone split above, we need to make sure the new // values don't clash with existing constraints in the subtree, // and if they do, the existing splits need to be updated - if (tree->leaf_is_in_monotone_subtree(*right_leaf) && !config_->monotone_constraints.empty()) { + if (!config_->monotone_constraints.empty() && tree->leaf_is_in_monotone_subtree(*right_leaf)) { LearnerState learner_state(config_, data_partition_.get(), train_data_, constraints_per_leaf_, tree, current_constraints, cegb_);