diff --git a/docs/Parameters.rst b/docs/Parameters.rst index a996b0132852..19e646126f7c 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -312,6 +312,18 @@ Learning Control Parameters - dropout rate: a fraction of previous trees to drop during the dropout +- ``monotone_penalty`` :raw-html:`🔗︎`, default = ``0.``, type = double, aliases: ``monotone_splits_penalty``, constraints: ``0.0 <= monotone_penalty (< max_depth, if max_depth > 0)`` + + - used only if ``monotone_constraints`` is set + + - monotone penalty: a penalization of 0 is equivalent to no penalization. A penalization parameter X forbids any monotone splits on the first X (rounded down) level(s) of the tree. The penalty applied to monotone splits at a given depth is a continuous, increasing function of the penalization parameter +- ``monotone_precise_mode`` :raw-html:`🔗︎`, default = ``false``, type = bool, aliases: ``monotone_constraints_precise_mode`` + + - used only if ``monotone_constraints`` is set + + - monotone precise mode: if set to false, the program will run as fast as without constraints, but the results may be over-constrained. If set to true, the program will be slower, but the results will be better. Note that if there are categorical features in the dataset, they will be split using the fast method regardless of this parameter. Also, the parameter can only be set to true if missing value handling is disabled + - ``max_drop`` :raw-html:`🔗︎`, default = ``50``, type = int - used only in ``dart`` diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 974735532c29..0230ae245d0f 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -325,6 +325,18 @@ struct Config { // desc = dropout rate: a fraction of previous trees to drop during the dropout double drop_rate = 0.1; + // alias = monotone_splits_penalty + // check = >=0.0 + // check = 0 + // desc = used only if ``monotone_constraints`` is set + // desc = monotone penalty: a penalization of 0 is equivalent to no penalization. A penalization parameter X forbids any monotone splits on the first X (rounded down) level(s) of the tree. The penalty applied to monotone splits at a given depth is a continuous, increasing function of the penalization parameter + double monotone_penalty = 0.; + + // alias = monotone_constraints_precise_mode + // desc = used only if ``monotone_constraints`` is set + // desc = monotone precise mode: if set to false, the program will run as fast as without constraints, but the results may be over-constrained. If set to true, the program will be slower, but the results will be better. Note that if there are categorical features in the dataset, they will be split using the fast method regardless of this parameter. Also, the parameter can only be set to true if missing value handling is disabled + bool monotone_precise_mode = false; + // desc = used only in ``dart`` // desc = max number of dropped trees during one boosting iteration // desc = ``<=0`` means no limit diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h index f672f62b347d..6bc7130fcdb3 100644 --- a/include/LightGBM/tree.h +++ b/include/LightGBM/tree.h @@ -60,7 +60,7 @@ class Tree { int Split(int leaf, int feature, int real_feature, uint32_t threshold_bin, double threshold_double, double left_value, double right_value, int left_cnt, int right_cnt, double left_weight, double right_weight, - float gain, MissingType missing_type, bool default_left); + float gain, MissingType missing_type, bool default_left, bool feature_is_monotone);
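For intuition about the new flag: the ``feature_is_monotone`` argument lets ``Tree::Split`` mark both children whenever a monotone split occurs, and the mark is then inherited by every later descendant (see the inline ``Tree::Split`` and ``leaf_is_in_monotone_subtree_`` further in this diff). A minimal self-contained sketch of that bookkeeping (toy code, not the LightGBM API; as in the real ``Split``, the left child reuses the parent's leaf index and the right child gets a fresh one):

#include <cstdio>
#include <vector>

int main() {
  std::vector<bool> in_monotone_subtree(1, false);  // start with a single root leaf
  auto split = [&](int leaf, bool feature_is_monotone) {
    bool flag = feature_is_monotone || in_monotone_subtree[leaf];
    in_monotone_subtree[leaf] = flag;     // left child keeps the parent's leaf index
    in_monotone_subtree.push_back(flag);  // right child is appended as a new leaf
  };
  split(0, false);  // ordinary split: nothing is marked
  split(1, true);   // monotone split: leaves 1 and 2 are marked
  split(2, false);  // split below a monotone split: the mark is inherited
  for (size_t i = 0; i < in_monotone_subtree.size(); ++i) {
    std::printf("leaf %zu in monotone subtree: %d\n", i, static_cast<int>(in_monotone_subtree[i]));
  }
  return 0;
}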
/*! * \brief Performing a split on tree leaves, with categorical feature @@ -80,9 +80,14 @@ class Tree { * \param gain Split gain * \return The index of new leaf. */ - int SplitCategorical(int leaf, int feature, int real_feature, const uint32_t* threshold_bin, int num_threshold_bin, - const uint32_t* threshold, int num_threshold, double left_value, double right_value, - int left_cnt, int right_cnt, double left_weight, double right_weight, float gain, MissingType missing_type); + + int SplitCategorical(int leaf, int feature, int real_feature, + const uint32_t *threshold_bin, int num_threshold_bin, + const uint32_t *threshold, int num_threshold, + double left_value, double right_value, int left_cnt, + int right_cnt, double left_weight, double right_weight, + float gain, MissingType missing_type, + bool feature_is_monotone); /*! \brief Get the output of one leaf */ inline double LeafOutput(int leaf) const { return leaf_value_[leaf]; } @@ -124,6 +129,24 @@ class Tree { inline int PredictLeafIndex(const double* feature_values) const; inline int PredictLeafIndexByMap(const std::unordered_map<int, double>& feature_values) const; + // Get node parent + inline int node_parent(int node_idx) const; + // Get leaf parent + inline int leaf_parent(int node_idx) const; + + // Get children + inline int left_child(int node_idx) const; + inline int right_child(int node_idx) const; + + // Get whether the leaf is in a monotone subtree + inline bool leaf_is_in_monotone_subtree(int leaf_idx) const; + + inline double internal_value(int node_idx) const; + + inline uint32_t threshold_in_bin(int node_idx) const; + + // Get the feature corresponding to the split + inline int split_feature_inner(int node_idx) const; inline void PredictContrib(const double* feature_values, int num_features, double* output); @@ -302,8 +325,10 @@ class Tree { } } - inline void Split(int leaf, int feature, int real_feature, double left_value, double right_value, int left_cnt, int right_cnt, - double left_weight, double right_weight, float gain); + inline void Split(int leaf, int feature, int real_feature, double left_value, + double right_value, int left_cnt, int right_cnt, double left_weight, + double right_weight, float gain, bool feature_is_monotone);
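The parent accessors declared above (``node_parent``, ``leaf_parent``, ``left_child``, ``right_child``) enable the bottom-up traversals used by the constraint-update code in src/treelearner/monotone_constraints.cpp. A hedged sketch of the access pattern (hypothetical helper, assuming tree.h as modified by this diff, where ``node_parent`` returns -1 for the root):

#include <LightGBM/tree.h>

// Count the splits strictly above an internal node, walking up to the root.
int CountSplitsAbove(const LightGBM::Tree* tree, int node_idx) {
  int count = 0;
  for (int node = tree->node_parent(node_idx); node != -1;
       node = tree->node_parent(node)) {
    ++count;  // tree->split_feature_inner(node) would identify each split
  }
  return count;
}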
+ /*! * \brief Find leaf index of which record belongs by features * \param feature_values Feature value of this record @@ -402,12 +427,22 @@ class Tree { std::vector<int> leaf_depth_; double shrinkage_; int max_depth_; + // add parent node information + std::vector<int> node_parent_; + // Keeps track of the monotone splits above the leaf + std::vector<bool> leaf_is_in_monotone_subtree_; }; inline void Tree::Split(int leaf, int feature, int real_feature, double left_value, double right_value, int left_cnt, int right_cnt, - double left_weight, double right_weight, float gain) { + double left_weight, double right_weight, float gain, bool feature_is_monotone) { int new_node_idx = num_leaves_ - 1; + + // Update if there is a monotone split above the leaf + if (feature_is_monotone || leaf_is_in_monotone_subtree_[leaf]) { + leaf_is_in_monotone_subtree_[leaf] = true; + leaf_is_in_monotone_subtree_[num_leaves_] = true; + } // update parent info int parent = leaf_parent_[leaf]; if (parent >= 0) { @@ -421,6 +456,7 @@ inline void Tree::Split(int leaf, int feature, int real_feature, // add new node split_feature_inner_[new_node_idx] = feature; split_feature_[new_node_idx] = real_feature; + node_parent_[new_node_idx] = parent; split_gain_[new_node_idx] = gain; // add two new leaves @@ -529,6 +565,41 @@ inline int Tree::GetLeafByMap(const std::unordered_map<int, double>& feature_values) const { return ~node; } +inline int Tree::node_parent(int node_idx) const { + return node_parent_[node_idx]; +} + +inline int Tree::left_child(int node_idx) const { + return left_child_[node_idx]; +} + +inline int Tree::right_child(int node_idx) const { + return right_child_[node_idx]; +} + +inline int Tree::split_feature_inner(int node_idx) const { + return split_feature_inner_[node_idx]; +} + +inline int Tree::leaf_parent(int node_idx) const { + return leaf_parent_[node_idx]; +} + +inline uint32_t Tree::threshold_in_bin(int node_idx) const { + #ifdef DEBUG + CHECK(node_idx >= 0); + #endif + return threshold_in_bin_[node_idx]; +} + +inline bool Tree::leaf_is_in_monotone_subtree(int leaf_idx) const { + return leaf_is_in_monotone_subtree_[leaf_idx]; +} + +inline double Tree::internal_value(int node_idx) const { + return internal_value_[node_idx]; +} + } // namespace LightGBM diff --git a/src/boosting/gbdt_model_text.cpp b/src/boosting/gbdt_model_text.cpp index 4e44b2a2527a..e080bcb1d214 100644 --- a/src/boosting/gbdt_model_text.cpp +++ b/src/boosting/gbdt_model_text.cpp @@ -540,6 +540,9 @@ std::vector<double> GBDT::FeatureImportance(int num_iteration, int importance_type) { for (int iter = 0; iter < num_used_model; ++iter) { for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) { if (models_[iter]->split_gain(split_idx) > 0) { + #ifdef DEBUG + CHECK(models_[iter]->split_feature(split_idx) >= 0); + #endif feature_importances[models_[iter]->split_feature(split_idx)] += 1.0; } } @@ -548,6 +551,9 @@ for (int iter = 0; iter < num_used_model; ++iter) { for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) { if (models_[iter]->split_gain(split_idx) > 0) { + #ifdef DEBUG + CHECK(models_[iter]->split_feature(split_idx) >= 0); + #endif feature_importances[models_[iter]->split_feature(split_idx)] += models_[iter]->split_gain(split_idx); } }
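The two hunks above cover the two importance types, now with a debug-only check that the stored split feature index is valid: "split" importance counts uses of a feature, while "gain" importance sums the split gains. A compact standalone equivalent of the accumulation (toy code; the real entry point is GBDT::FeatureImportance):

#include <cstdio>
#include <map>
#include <utility>
#include <vector>

int main() {
  // (feature index, split gain) pairs collected across all trees
  std::vector<std::pair<int, double>> splits = {{0, 2.5}, {2, 1.0}, {0, 0.5}};
  std::map<int, double> by_split, by_gain;
  for (const auto& s : splits) {
    if (s.second > 0) {  // only splits with positive gain contribute
      by_split[s.first] += 1.0;
      by_gain[s.first] += s.second;
    }
  }
  for (const auto& kv : by_split) {
    std::printf("feature %d: split importance %g, gain importance %g\n",
                kv.first, kv.second, by_gain[kv.first]);
  }
  return 0;
}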
diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 809b8a7843aa..0534c8f6dfa0 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -6,6 +6,7 @@ * This file is auto generated by LightGBM\helpers\parameter_generator.py from LightGBM\include\LightGBM\config.h file. */ #include +#include namespace LightGBM { std::unordered_map<std::string, std::string> Config::alias_table({ {"config_file", "config"}, @@ -80,6 +81,8 @@ std::unordered_map<std::string, std::string> Config::alias_table({ {"lambda", "lambda_l2"}, {"min_split_gain", "min_gain_to_split"}, {"rate_drop", "drop_rate"}, + {"monotone_splits_penalty", "monotone_penalty"}, + {"monotone_constraints_precise_mode", "monotone_precise_mode"}, {"topk", "top_k"}, {"mc", "monotone_constraints"}, {"monotone_constraint", "monotone_constraints"}, @@ -199,6 +202,8 @@ std::unordered_set<std::string> Config::parameter_set({ "lambda_l2", "min_gain_to_split", "drop_rate", + "monotone_penalty", + "monotone_precise_mode", "max_drop", "skip_drop", "xgboost_dart_mode", @@ -399,8 +404,21 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::string>& params) { if (GetString(params, "monotone_constraints", &tmp_str)) { monotone_constraints = Common::StringToArray<int8_t>(tmp_str, ','); + Log::Warning("The constraining method was just changed, which could significantly affect results of the algorithm"); } + GetDouble(params, "monotone_penalty", &monotone_penalty); + bool constraints_exist = false; + for (auto it = monotone_constraints.begin(); it != monotone_constraints.end(); + it++) { + if (*it != 0) { + constraints_exist = true; + } + } + CHECK(monotone_penalty == 0 || constraints_exist); + CHECK(max_depth <= 0 || monotone_penalty < max_depth); + CHECK(monotone_penalty >= 0.0); + if (GetString(params, "feature_contri", &tmp_str)) { feature_contri = Common::StringToArray<double>(tmp_str, ','); } @@ -476,6 +494,10 @@ void Config::GetMembersFromString(const std::unordered_mapconfig_->cegb_penalty_feature_lazy.empty()) { diff --git a/src/treelearner/data_parallel_tree_learner.cpp index ed677ecf88d5..87073099e9ce 100644 --- a/src/treelearner/data_parallel_tree_learner.cpp +++ b/src/treelearner/data_parallel_tree_learner.cpp @@ -146,7 +146,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::BeforeTrain() { } template <typename TREELEARNER_T> -void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplits() { +void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplits(const Tree* tree) { TREELEARNER_T::ConstructHistograms(this->is_feature_used_, true); // construct local histograms #pragma omp parallel for schedule(static) @@ -160,11 +160,12 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplits() { // Reduce scatter for histogram Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(HistogramBinEntry), block_start_.data(), block_len_.data(), output_buffer_.data(), static_cast<comm_size_t>(output_buffer_.size()), &HistogramBinEntry::SumReducer); - this->FindBestSplitsFromHistograms(this->is_feature_used_, true); + this->FindBestSplitsFromHistograms(this->is_feature_used_, true, tree); } template <typename TREELEARNER_T> -void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const std::vector<int8_t>&, bool) { +void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms( + const std::vector<int8_t> &, bool, const Tree *tree) { std::vector<SplitInfo> smaller_bests_per_thread(this->num_threads_, SplitInfo()); std::vector<SplitInfo> larger_bests_per_thread(this->num_threads_, SplitInfo()); std::vector<int8_t> smaller_node_used_features(this->num_features_, 1); @@ -190,13 +191,14 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const this->smaller_leaf_histogram_array_[feature_index].RawData()); SplitInfo smaller_split; // find best threshold for smaller child + // FIXME Fill the vectors with the actual constraints and thresholds + SplittingConstraints *constraints = nullptr; + std::vector<uint32_t> thresholds; this->smaller_leaf_histogram_array_[feature_index].FindBestThreshold( - this->smaller_leaf_splits_->sum_gradients(), -
this->smaller_leaf_splits_->sum_hessians(), - GetGlobalDataCountInLeaf(this->smaller_leaf_splits_->LeafIndex()), - this->smaller_leaf_splits_->min_constraint(), - this->smaller_leaf_splits_->max_constraint(), - &smaller_split); + this->smaller_leaf_splits_->sum_gradients(), + this->smaller_leaf_splits_->sum_hessians(), + GetGlobalDataCountInLeaf(this->smaller_leaf_splits_->LeafIndex()), + &smaller_split, constraints); smaller_split.feature = real_feature_index; if (smaller_split > smaller_bests_per_thread[tid] && smaller_node_used_features[feature_index]) { smaller_bests_per_thread[tid] = smaller_split; @@ -210,13 +212,12 @@ void DataParallelTreeLearner::FindBestSplitsFromHistograms(const this->smaller_leaf_histogram_array_[feature_index]); SplitInfo larger_split; // find best threshold for larger child + // FIXME Fill the vectors with the actual constraints and thresholds this->larger_leaf_histogram_array_[feature_index].FindBestThreshold( - this->larger_leaf_splits_->sum_gradients(), - this->larger_leaf_splits_->sum_hessians(), - GetGlobalDataCountInLeaf(this->larger_leaf_splits_->LeafIndex()), - this->larger_leaf_splits_->min_constraint(), - this->larger_leaf_splits_->max_constraint(), - &larger_split); + this->larger_leaf_splits_->sum_gradients(), + this->larger_leaf_splits_->sum_hessians(), + GetGlobalDataCountInLeaf(this->larger_leaf_splits_->LeafIndex()), + &larger_split, constraints); larger_split.feature = real_feature_index; if (larger_split > larger_bests_per_thread[tid] && larger_node_used_features[feature_index]) { larger_bests_per_thread[tid] = larger_split; diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 20437c0be6b0..35cc51fc16a7 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -16,6 +16,7 @@ #include #include "split_info.hpp" +#include "monotone_constraints.hpp" namespace LightGBM { @@ -57,11 +58,15 @@ class FeatureHistogram { meta_ = meta; data_ = data; if (meta_->bin_type == BinType::NumericalBin) { - find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdNumerical, this, std::placeholders::_1 - , std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, std::placeholders::_6); + find_best_threshold_fun_ = std::bind( + &FeatureHistogram::FindBestThresholdNumerical, this, + std::placeholders::_1, std::placeholders::_2, std::placeholders::_3, + std::placeholders::_4, std::placeholders::_5); } else { - find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdCategorical, this, std::placeholders::_1 - , std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, std::placeholders::_6); + find_best_threshold_fun_ = std::bind( + &FeatureHistogram::FindBestThresholdCategorical, this, + std::placeholders::_1, std::placeholders::_2, std::placeholders::_3, + std::placeholders::_4, std::placeholders::_5); } } @@ -80,30 +85,50 @@ class FeatureHistogram { } } - void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint, - SplitInfo* output) { + void FindBestThreshold(double sum_gradient, double sum_hessian, + data_size_t num_data, SplitInfo *output, + SplittingConstraints *constraints) { output->default_left = true; output->gain = kMinScore; - find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, min_constraint, max_constraint, output); + find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * 
kEpsilon, num_data, + output, constraints); output->gain *= meta_->penalty; } - void FindBestThresholdNumerical(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint, - SplitInfo* output) { + void FindBestThresholdNumerical(double sum_gradient, double sum_hessian, + data_size_t num_data, SplitInfo *output, + SplittingConstraints *constraints) { is_splittable_ = false; double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step); double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; + + // at this point, the following arrays contain the constraints applied on every part of the leaf + // since we are splitting the leaf in 2, we can compute the cumulative minimum / maximum in both directions + if (constraints != nullptr) { + constraints->ComputeCumulativeExtremums(); + } + if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) { if (meta_->missing_type == MissingType::Zero) { - FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_constraint, max_constraint, min_gain_shift, output, -1, true, false); - FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_constraint, max_constraint, min_gain_shift, output, 1, true, false); + FindBestThresholdSequence( + sum_gradient, sum_hessian, num_data, min_gain_shift, output, -1, + true, false, constraints); + FindBestThresholdSequence( + sum_gradient, sum_hessian, num_data, min_gain_shift, output, 1, + true, false, constraints); } else { - FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_constraint, max_constraint, min_gain_shift, output, -1, false, true); - FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_constraint, max_constraint, min_gain_shift, output, 1, false, true); + FindBestThresholdSequence( + sum_gradient, sum_hessian, num_data, min_gain_shift, output, -1, + false, true, constraints); + FindBestThresholdSequence( + sum_gradient, sum_hessian, num_data, min_gain_shift, output, 1, + false, true, constraints); } } else { - FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_constraint, max_constraint, min_gain_shift, output, -1, false, false); + FindBestThresholdSequence( + sum_gradient, sum_hessian, num_data, min_gain_shift, output, -1, + false, false, constraints); // fix the direction error when only have 2 bins if (meta_->missing_type == MissingType::NaN) { output->default_left = false; } output->gain -= min_gain_shift; output->monotone_type = meta_->monotone_type; - output->min_constraint = min_constraint; - output->max_constraint = max_constraint; } - void FindBestThresholdCategorical(double sum_gradient, double sum_hessian, data_size_t num_data, - double min_constraint, double max_constraint, - SplitInfo* output) { + void FindBestThresholdCategorical( + double sum_gradient, double sum_hessian, data_size_t num_data, + SplitInfo *output, SplittingConstraints *constraints) { output->default_left = false; double best_gain = kMinScore; data_size_t best_left_count = 0; @@ -135,6 +158,10 @@ int best_threshold = -1; int best_dir = 1; + if (constraints != nullptr) { + constraints->InitializeIndices(1); + } + if (use_onehot) { for (int t = 0; t < used_bin; ++t) { // if data not enough, or sum hessian too small @@ -149,10 +176,16 @@ if (sum_other_hessian < meta_->config->min_sum_hessian_in_leaf) continue; double
sum_other_gradient = sum_gradient - data_[t].sum_gradients; + +#ifdef DEBUG + CHECK(t >= 0); +#endif // current split gain - double current_gain = GetSplitGains(sum_other_gradient, sum_other_hessian, data_[t].sum_gradients, data_[t].sum_hessians + kEpsilon, - meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - min_constraint, max_constraint, 0); + // the threshold is included in the left leaf + double current_gain = GetSplitGains( + sum_other_gradient, sum_other_hessian, data_[t].sum_gradients, + data_[t].sum_hessians + kEpsilon, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, constraints, 0); // gain with split is worse than without split if (current_gain <= min_gain_shift) continue; @@ -222,9 +255,12 @@ cnt_cur_group = 0; double sum_right_gradient = sum_gradient - sum_left_gradient; - double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, - meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - min_constraint, max_constraint, 0); + // the threshold is included in the left leaf + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, constraints, 0); + if (current_gain <= min_gain_shift) continue; is_splittable_ = true; if (current_gain > best_gain) { @@ -240,16 +276,29 @@ } if (is_splittable_) { - output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian, - meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - min_constraint, max_constraint); + SplittingConstraint *left; + SplittingConstraint *right; + if (constraints != nullptr) { + left = &(constraints->left); + right = &(constraints->right); + } + else { + left = nullptr; + right = nullptr; + } + + output->left_output = CalculateSplittedLeafOutput( + best_sum_left_gradient, best_sum_left_hessian, + meta_->config->lambda_l1, l2, meta_->config->max_delta_step, + left); output->left_count = best_left_count; output->left_sum_gradient = best_sum_left_gradient; output->left_sum_hessian = best_sum_left_hessian - kEpsilon; - output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient, - sum_hessian - best_sum_left_hessian, - meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - min_constraint, max_constraint); + output->right_output = CalculateSplittedLeafOutput( + sum_gradient - best_sum_left_gradient, + sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, + right); output->right_count = num_data - best_left_count; output->right_sum_gradient = sum_gradient - best_sum_left_gradient; output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon; @@ -273,8 +322,6 @@ } } output->monotone_type = 0; - output->min_constraint = min_constraint; - output->max_constraint = max_constraint; } } @@ -441,7 +488,9 @@ /*!
* \brief Set splittable to this histogram */ - void set_is_splittable(bool val) { is_splittable_ = val; } + void set_is_splittable(bool val) { + is_splittable_ = val; + } static double ThresholdL1(double s, double l1) { const double reg_s = std::max(0.0, std::fabs(s) - l1); @@ -457,38 +506,51 @@ } } - private: - static double GetSplitGains(double sum_left_gradients, double sum_left_hessians, - double sum_right_gradients, double sum_right_hessians, - double l1, double l2, double max_delta_step, - double min_constraint, double max_constraint, int8_t monotone_constraint) { - double left_output = CalculateSplittedLeafOutput(sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step, min_constraint, max_constraint); - double right_output = CalculateSplittedLeafOutput(sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step, min_constraint, max_constraint); - if (((monotone_constraint > 0) && (left_output > right_output)) || - ((monotone_constraint < 0) && (left_output < right_output))) { - return 0; - } - return GetLeafSplitGainGivenOutput(sum_left_gradients, sum_left_hessians, l1, l2, left_output) - + GetLeafSplitGainGivenOutput(sum_right_gradients, sum_right_hessians, l1, l2, right_output); - } - /*! * \brief Calculate the output of a leaf based on regularized sum_gradients and sum_hessians * \param sum_gradients * \param sum_hessians * \return leaf output */ + template <typename T> static double CalculateSplittedLeafOutput(double sum_gradients, double sum_hessians, double l1, double l2, double max_delta_step, - double min_constraint, double max_constraint) { + const T *constraint) { double ret = CalculateSplittedLeafOutput(sum_gradients, sum_hessians, l1, l2, max_delta_step); - if (ret < min_constraint) { - ret = min_constraint; - } else if (ret > max_constraint) { - ret = max_constraint; + if (constraint != nullptr) { + if (ret < constraint->GetCurrentMinConstraint()) { + ret = constraint->GetCurrentMinConstraint(); + } else if (ret > constraint->GetCurrentMaxConstraint()) { + ret = constraint->GetCurrentMaxConstraint(); + } } return ret; } + private: + static double GetSplitGains(double sum_left_gradients, double sum_left_hessians, + double sum_right_gradients, double sum_right_hessians, + double l1, double l2, double max_delta_step, + const SplittingConstraints *constraints, int8_t monotone_constraint) { + const SplittingConstraint *left; + const SplittingConstraint *right; + if (constraints != nullptr) { + left = &(constraints->left); + right = &(constraints->right); + } + else { + left = nullptr; + right = nullptr; + } + double left_output = CalculateSplittedLeafOutput(sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step, left); + double right_output = CalculateSplittedLeafOutput(sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step, right); + if (((monotone_constraint > 0) && (left_output > right_output)) || + ((monotone_constraint < 0) && (left_output < right_output))) { + return 0; + } + return GetLeafSplitGainGivenOutput(sum_left_gradients, sum_left_hessians, l1, l2, left_output) + + GetLeafSplitGainGivenOutput(sum_right_gradients, sum_right_hessians, l1, l2, right_output); + }
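GetSplitGains above carries the core of the fast constraining method: a numerical split whose raw leaf outputs violate the requested monotone direction is given a gain of 0 and therefore never wins. A standalone toy of that guard (illustrative only; real callers pass the regularized outputs computed by CalculateSplittedLeafOutput):

#include <cstdio>

double GuardedGain(double left_output, double right_output,
                   int monotone_constraint, double unconstrained_gain) {
  if ((monotone_constraint > 0 && left_output > right_output) ||
      (monotone_constraint < 0 && left_output < right_output)) {
    return 0.0;  // the split violates the monotone direction
  }
  return unconstrained_gain;
}

int main() {
  std::printf("%g\n", GuardedGain(0.5, 0.2, +1, 3.7));  // 0: violates increasing
  std::printf("%g\n", GuardedGain(0.1, 0.2, +1, 3.7));  // 3.7: allowed
  return 0;
}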
+ /*! * \brief Calculate the split gain based on regularized sum_gradients and sum_hessians * \param sum_gradients @@ -505,12 +567,20 @@ class FeatureHistogram { return -(2.0 * sg_l1 * output + (sum_hessians + l2) * output * output); } - void FindBestThresholdSequence(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint, - double min_gain_shift, SplitInfo* output, int dir, bool skip_default_bin, bool use_na_as_missing) { + void FindBestThresholdSequence(double sum_gradient, double sum_hessian, + data_size_t num_data, double min_gain_shift, + SplitInfo *output, int dir, + bool skip_default_bin, bool use_na_as_missing, + SplittingConstraints *constraints) { const int8_t bias = meta_->bias; double best_sum_left_gradient = NAN; double best_sum_left_hessian = NAN; + + // when the monotone precise mode is enabled, then the left and the right children may not + // have the same min and max constraints because constraints can depend on the thresholds + BestConstraints best_constraints = BestConstraints(); + double best_gain = kMinScore; data_size_t best_left_count = 0; uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin); @@ -523,6 +593,10 @@ int t = meta_->num_bin - 1 - bias - use_na_as_missing; const int t_end = 1 - bias; + if (constraints != nullptr) { + constraints->InitializeIndices(dir); + } + // from right to left, and we don't need data in bin0 for (; t >= t_end; --t) { // need to skip default bin @@ -543,10 +617,21 @@ if (sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) break; double sum_left_gradient = sum_gradient - sum_right_gradient; - // current split gain - double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - min_constraint, max_constraint, meta_->monotone_type); + + // when the monotone precise mode is enabled, as t changes, the constraints applied on + // each child may change, because the constraints may depend on thresholds + if (constraints != nullptr) { + constraints->UpdateIndices(dir, bias, t); + } + + // when the algorithm goes through the thresholds we use the same index for cumulative arrays + // in both directions but each leaf is constrained according to the corresponding array + // the threshold is included in the left leaf + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + constraints, meta_->monotone_type); // gain with split is worse than without split if (current_gain <= min_gain_shift) continue; @@ -560,6 +645,10 @@ // left is <= threshold, right is > threshold.
so this is t-1 best_threshold = static_cast<uint32_t>(t - 1 + bias); best_gain = current_gain; + + if (constraints != nullptr) { + best_constraints.Update(constraints); + } } } } else { @@ -582,6 +671,10 @@ t = -1; } + if (constraints != nullptr) { + constraints->InitializeIndices(dir); + } + for (; t <= t_end; ++t) { // need to skip default bin if (skip_default_bin && (t + bias) == static_cast<int>(meta_->default_bin)) { continue; } @@ -602,10 +695,15 @@ if (sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) break; double sum_right_gradient = sum_gradient - sum_left_gradient; - // current split gain - double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - min_constraint, max_constraint, meta_->monotone_type); + + if (constraints != nullptr) { + constraints->UpdateIndices(1, bias, t); + } + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + constraints, meta_->monotone_type); // gain with split is worse than without split if (current_gain <= min_gain_shift) continue; @@ -618,6 +716,10 @@ best_sum_left_hessian = sum_left_hessian; best_threshold = static_cast<uint32_t>(t + bias); best_gain = current_gain; + + if (constraints != nullptr) { + best_constraints.Update(constraints); + } } } } @@ -625,21 +727,24 @@ if (is_splittable_ && best_gain > output->gain) { // update split information output->threshold = best_threshold; - output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - min_constraint, max_constraint); + output->left_output = CalculateSplittedLeafOutput( + best_sum_left_gradient, best_sum_left_hessian, + meta_->config->lambda_l1, meta_->config->lambda_l2, + meta_->config->max_delta_step, &best_constraints.left); output->left_count = best_left_count; output->left_sum_gradient = best_sum_left_gradient; output->left_sum_hessian = best_sum_left_hessian - kEpsilon; - output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient, - sum_hessian - best_sum_left_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - min_constraint, max_constraint); + output->right_output = CalculateSplittedLeafOutput( + sum_gradient - best_sum_left_gradient, + sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + &best_constraints.right); output->right_count = num_data - best_left_count; output->right_sum_gradient = sum_gradient - best_sum_left_gradient; output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon; output->gain = best_gain; output->default_left = dir == -1; + + } } @@ -649,7 +754,8 @@ // std::vector data_; bool is_splittable_ = true; - std::function<void(double, double, data_size_t, double, double, SplitInfo*)> find_best_threshold_fun_; + std::function<void(double, double, data_size_t, SplitInfo*, SplittingConstraints*)> find_best_threshold_fun_; };
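Whenever a better gain is found in either scan above, best_constraints.Update(constraints) snapshots the constraint state active at that threshold, so the winning split's outputs can later be clipped with exactly the constraints that produced its gain. A toy of that bookkeeping (illustrative only; BestConstraints itself is part of this patch):

#include <cstdio>

int main() {
  double gains[] = {1.0, 3.0, 2.0};
  double min_constraint_at[] = {-0.5, -0.2, -0.1};  // constraint active per threshold
  double best_gain = -1e300, best_min_constraint = 0.0;
  for (int t = 0; t < 3; ++t) {
    if (gains[t] > best_gain) {
      best_gain = gains[t];
      best_min_constraint = min_constraint_at[t];  // the snapshot step
    }
  }
  std::printf("best gain %g under min constraint %g\n", best_gain, best_min_constraint);
  return 0;
}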
class HistogramPool { public: diff --git a/src/treelearner/feature_parallel_tree_learner.cpp b/src/treelearner/feature_parallel_tree_learner.cpp index 745ca44be68b..d5e6c013b73d 100644 --- a/src/treelearner/feature_parallel_tree_learner.cpp +++ b/src/treelearner/feature_parallel_tree_learner.cpp @@ -52,8 +52,11 @@ void FeatureParallelTreeLearner<TREELEARNER_T>::BeforeTrain() { } template <typename TREELEARNER_T> -void FeatureParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) { - TREELEARNER_T::FindBestSplitsFromHistograms(is_feature_used, use_subtract); +void FeatureParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms( + const std::vector<int8_t> &is_feature_used, bool use_subtract, + const Tree *tree) { + TREELEARNER_T::FindBestSplitsFromHistograms(is_feature_used, use_subtract, + tree); SplitInfo smaller_best_split, larger_best_split; // get best split at smaller leaf smaller_best_split = this->best_split_per_leaf_[this->smaller_leaf_splits_->LeafIndex()]; diff --git a/src/treelearner/gpu_tree_learner.cpp b/src/treelearner/gpu_tree_learner.cpp index f279fdc7331e..12fbf746ec87 100644 --- a/src/treelearner/gpu_tree_learner.cpp +++ b/src/treelearner/gpu_tree_learner.cpp @@ -1085,6 +1085,7 @@ void GPUTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) { double smaller_max = smaller_leaf_splits_->max_constraint(); double larger_min = larger_leaf_splits_->min_constraint(); double larger_max = larger_leaf_splits_->max_constraint(); + // FIXME This part of the code has not been updated smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian); larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian); smaller_leaf_splits_->SetValueConstraint(smaller_min, smaller_max); diff --git a/src/treelearner/leaf_splits.hpp b/src/treelearner/leaf_splits.hpp index e46d0b846fcb..b63560340c53 100644 --- a/src/treelearner/leaf_splits.hpp +++ b/src/treelearner/leaf_splits.hpp @@ -38,27 +38,22 @@ class LeafSplits { * \param sum_gradients * \param sum_hessians */ - void Init(int leaf, const DataPartition* data_partition, double sum_gradients, double sum_hessians) { + void Init(int leaf, const DataPartition *data_partition, double sum_gradients, + double sum_hessians, int depth) { + depth_ = depth; leaf_index_ = leaf; data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_); sum_gradients_ = sum_gradients; sum_hessians_ = sum_hessians; - min_val_ = -std::numeric_limits<double>::max(); - max_val_ = std::numeric_limits<double>::max(); } - void SetValueConstraint(double min, double max) { - min_val_ = min; - max_val_ = max; - } - - /*! * \brief Init splits on current leaf, it will traverse all data to sum up the results * \param gradients * \param hessians */ void Init(const score_t* gradients, const score_t* hessians) { + depth_ = 0; num_data_in_leaf_ = num_data_; leaf_index_ = 0; data_indices_ = nullptr; @@ -71,8 +66,6 @@ class LeafSplits { } sum_gradients_ = tmp_sum_gradients; sum_hessians_ = tmp_sum_hessians; - min_val_ = -std::numeric_limits<double>::max(); - max_val_ = std::numeric_limits<double>::max(); }
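With the min/max constraint storage removed, LeafSplits now only tracks the depth of the leaf it describes, which is what the depth-based gain penalty needs. A hedged sketch of how a tree learner would initialize it (assumed call sites, consistent with the signatures in this diff):

#include "leaf_splits.hpp"

void SketchInitLeafSplits(LightGBM::LeafSplits* splits, double sum_gradients,
                          double sum_hessians) {
  // root-level overloads set depth_ = 0 internally:
  splits->Init(sum_gradients, sum_hessians);  // splits->depth() == 0
  // after splitting a leaf at depth d, its children would be re-initialized
  // with depth d + 1 through the DataPartition-based overloads
}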
/*! @@ -82,7 +75,9 @@ class LeafSplits { * \param gradients * \param hessians */ - void Init(int leaf, const DataPartition* data_partition, const score_t* gradients, const score_t* hessians) { + void Init(int leaf, const DataPartition *data_partition, + const score_t *gradients, const score_t *hessians, int depth) { + depth_ = depth; leaf_index_ = leaf; data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_); double tmp_sum_gradients = 0.0f; @@ -95,8 +90,6 @@ } sum_gradients_ = tmp_sum_gradients; sum_hessians_ = tmp_sum_hessians; - min_val_ = -std::numeric_limits<double>::max(); - max_val_ = std::numeric_limits<double>::max(); } @@ -106,22 +99,20 @@ * \param sum_hessians */ void Init(double sum_gradients, double sum_hessians) { + depth_ = 0; leaf_index_ = 0; sum_gradients_ = sum_gradients; sum_hessians_ = sum_hessians; - min_val_ = -std::numeric_limits<double>::max(); - max_val_ = std::numeric_limits<double>::max(); } /*! * \brief Init splits on current leaf */ void Init() { + depth_ = 0; leaf_index_ = -1; data_indices_ = nullptr; num_data_in_leaf_ = 0; - min_val_ = -std::numeric_limits<double>::max(); - max_val_ = std::numeric_limits<double>::max(); } @@ -137,9 +128,7 @@ /*! \brief Get sum of hessians of current leaf */ double sum_hessians() const { return sum_hessians_; } - double max_constraint() const { return max_val_; } - - double min_constraint() const { return min_val_; } + int depth() const { return depth_; } /*! \brief Get indices of data of current leaf */ const data_size_t* data_indices() const { return data_indices_; } @@ -158,8 +147,7 @@ double sum_hessians_; /*! \brief indices of data of current leaf */ const data_size_t* data_indices_; - double min_val_; - double max_val_; + int depth_; }; } // namespace LightGBM diff --git a/src/treelearner/monotone_constraints.cpp b/src/treelearner/monotone_constraints.cpp new file mode 100644 index 000000000000..7442063747c2 --- /dev/null +++ b/src/treelearner/monotone_constraints.cpp @@ -0,0 +1,368 @@ +#include "monotone_constraints.hpp" +#include "serial_tree_learner.h" +#include "feature_histogram.hpp" +#include "cost_effective_gradient_boosting.hpp" + +namespace LightGBM { + +void LeafConstraints::SetChildrenConstraintsFastMethod( + std::vector<LeafConstraints> &constraints_per_leaf, int *right_leaf, + int *left_leaf, int8_t monotone_type, double right_output, + double left_output, bool is_numerical_split) { + constraints_per_leaf[*right_leaf] = constraints_per_leaf[*left_leaf]; + if (is_numerical_split) { + // depending on the monotone type we set constraints on the future splits + // these constraints may be updated later in the algorithm + if (monotone_type < 0) { + constraints_per_leaf[*left_leaf].SetMinConstraint(right_output); + constraints_per_leaf[*right_leaf].SetMaxConstraint(left_output); + } else if (monotone_type > 0) { + constraints_per_leaf[*left_leaf].SetMaxConstraint(right_output); + constraints_per_leaf[*right_leaf].SetMinConstraint(left_output); + } + } +}
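A worked instance of the fast-method propagation implemented just above: for an increasing constraint (monotone_type > 0) with left_output = 0.2 and right_output = 0.5, future splits under the left child may not produce values above 0.5, and under the right child not below 0.2. Toy code (illustrative only; LeafConstraints itself stores these bounds per feature and threshold):

#include <cstdint>
#include <cstdio>

struct MinMax { double min, max; };

int main() {
  double left_output = 0.2, right_output = 0.5;
  int8_t monotone_type = +1;  // increasing constraint on the split feature
  MinMax left{-1e308, 1e308}, right{-1e308, 1e308};
  if (monotone_type < 0) {
    left.min = right_output;
    right.max = left_output;
  } else if (monotone_type > 0) {
    left.max = right_output;   // left leaf values must stay <= 0.5
    right.min = left_output;   // right leaf values must stay >= 0.2
  }
  std::printf("left: [%g, %g]  right: [%g, %g]\n", left.min, left.max, right.min, right.max);
  return 0;
}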
+ +// this function goes through the tree to find how the split that +// has just been performed is going to affect the constraints of other leaves +void LeafConstraints::GoUpToFindLeavesToUpdate( + int node_idx, std::vector<int> &features, std::vector<uint32_t> &thresholds, + std::vector<bool> &is_in_right_split, int split_feature, + const SplitInfo &split_info, double previous_leaf_output, + uint32_t split_threshold, std::vector<SplitInfo> &best_split_per_leaf_, + const std::vector<int8_t> &is_feature_used_, int num_threads_, + int num_features_, HistogramPool &histogram_pool_, + LearnerState &learner_state) { + int parent_idx = learner_state.tree->node_parent(node_idx); + if (parent_idx != -1) { + int inner_feature = learner_state.tree->split_feature_inner(parent_idx); + int8_t monotone_type = + learner_state.train_data_->FeatureMonotone(inner_feature); + bool is_right_split = + learner_state.tree->right_child(parent_idx) == node_idx; + bool split_contains_new_information = true; + bool is_split_numerical = + learner_state.train_data_->FeatureBinMapper(inner_feature) + ->bin_type() == BinType::NumericalBin; + + // only branches containing leaves that are contiguous to the original leaf + // need to be updated + for (unsigned int i = 0; i < features.size(); ++i) { + if ((features[i] == inner_feature && is_split_numerical) && + (is_in_right_split[i] == is_right_split)) { + split_contains_new_information = false; + break; + } + } + + if (split_contains_new_information) { + if (monotone_type != 0) { + int left_child_idx = learner_state.tree->left_child(parent_idx); + int right_child_idx = learner_state.tree->right_child(parent_idx); + bool left_child_is_curr_idx = (left_child_idx == node_idx); + int node_idx_to_pass = + (left_child_is_curr_idx) ? right_child_idx : left_child_idx; + bool take_min = (monotone_type < 0) ? left_child_is_curr_idx + : !left_child_is_curr_idx; + + GoDownToFindLeavesToUpdate( + node_idx_to_pass, features, thresholds, is_in_right_split, take_min, + split_feature, split_info, previous_leaf_output, true, true, + split_threshold, best_split_per_leaf_, is_feature_used_, + num_threads_, num_features_, histogram_pool_, learner_state); + } + + is_in_right_split.push_back(learner_state.tree->right_child(parent_idx) == + node_idx); + thresholds.push_back(learner_state.tree->threshold_in_bin(parent_idx)); + features.push_back(learner_state.tree->split_feature_inner(parent_idx)); + } + + if (parent_idx != 0) { + LeafConstraints::GoUpToFindLeavesToUpdate( + parent_idx, features, thresholds, is_in_right_split, split_feature, + split_info, previous_leaf_output, split_threshold, + best_split_per_leaf_, is_feature_used_, num_threads_, num_features_, + histogram_pool_, learner_state); + } + } +} + +// this function goes through the tree to find how the split that was just +// made is going to affect other leaves +void LeafConstraints::GoDownToFindLeavesToUpdate( + int node_idx, const std::vector<int> &features, + const std::vector<uint32_t> &thresholds, + const std::vector<bool> &is_in_right_split, int maximum, int split_feature, + const SplitInfo &split_info, double previous_leaf_output, + bool use_left_leaf, bool use_right_leaf, uint32_t split_threshold, + std::vector<SplitInfo> &best_split_per_leaf_, + const std::vector<int8_t> &is_feature_used_, int num_threads_, + int num_features_, HistogramPool &histogram_pool_, + LearnerState &learner_state) { + if (node_idx < 0) { + int leaf_idx = ~node_idx; + + // if leaf is at max depth then there is no need to update it + int max_depth = learner_state.config_->max_depth; + if (learner_state.tree->leaf_depth(leaf_idx) >= max_depth && + max_depth > 0) { + return; + } + + // splits that are not to be used shall not be updated + if (best_split_per_leaf_[leaf_idx].gain == kMinScore) { + return; + } + + std::pair<double, double> min_max_constraints; + bool something_changed; + if (use_right_leaf && use_left_leaf) { + min_max_constraints = + std::minmax(split_info.right_output, split_info.left_output); + } else if (use_right_leaf && !use_left_leaf) { + min_max_constraints = std::pair<double, double>(split_info.right_output, + split_info.right_output); + } else { + min_max_constraints =
std::pair<double, double>(split_info.left_output, + split_info.left_output); + } + +#ifdef DEBUG + if (maximum) { + CHECK(min_max_constraints.first >= + learner_state.tree->LeafOutput(leaf_idx)); + } else { + CHECK(min_max_constraints.second <= + learner_state.tree->LeafOutput(leaf_idx)); + } +#endif + + if (!learner_state.config_->monotone_precise_mode) { + if (!maximum) { + something_changed = + learner_state.constraints_per_leaf_[leaf_idx] + .SetMinConstraintAndReturnChange(min_max_constraints.second); + } else { + something_changed = + learner_state.constraints_per_leaf_[leaf_idx] + .SetMaxConstraintAndReturnChange(min_max_constraints.first); + } + if (!something_changed) { + return; + } + } else { + if (!maximum) { + // both functions need to be called in this order + // because they modify the struct + something_changed = + learner_state.constraints_per_leaf_[leaf_idx] + .CrossesMinConstraint(min_max_constraints.second); + something_changed = learner_state.constraints_per_leaf_[leaf_idx] + .IsInMinConstraints(previous_leaf_output) || + something_changed; + } else { + // both functions need to be called in this order + // because they modify the struct + something_changed = + learner_state.constraints_per_leaf_[leaf_idx] + .CrossesMaxConstraint(min_max_constraints.first); + something_changed = learner_state.constraints_per_leaf_[leaf_idx] + .IsInMaxConstraints(previous_leaf_output) || + something_changed; + } + // if constraints have changed, then best splits need to be updated + // otherwise, we can just continue and go to the next split + if (!something_changed) { + return; + } + } + UpdateBestSplitsFromHistograms( + best_split_per_leaf_[leaf_idx], leaf_idx, + learner_state.tree->leaf_depth(leaf_idx), is_feature_used_, + num_threads_, num_features_, histogram_pool_, learner_state); + } else { + // check if the children are contiguous with the original leaf + std::pair<bool, bool> keep_going_left_right = ShouldKeepGoingLeftRight( + learner_state.tree, node_idx, features, thresholds, is_in_right_split, + learner_state.train_data_); + int inner_feature = learner_state.tree->split_feature_inner(node_idx); + uint32_t threshold = learner_state.tree->threshold_in_bin(node_idx); + bool is_split_numerical = + learner_state.train_data_->FeatureBinMapper(inner_feature) + ->bin_type() == BinType::NumericalBin; + bool use_left_leaf_for_update = true; + bool use_right_leaf_for_update = true; + if (is_split_numerical && inner_feature == split_feature) { + if (threshold >= split_threshold) { + use_left_leaf_for_update = false; + } + if (threshold <= split_threshold) { + use_right_leaf_for_update = false; + } + } + + if (keep_going_left_right.first) { + GoDownToFindLeavesToUpdate( + learner_state.tree->left_child(node_idx), features, thresholds, + is_in_right_split, maximum, split_feature, split_info, + previous_leaf_output, use_left_leaf, + use_right_leaf_for_update && use_right_leaf, split_threshold, + best_split_per_leaf_, is_feature_used_, num_threads_, num_features_, + histogram_pool_, learner_state); + } + if (keep_going_left_right.second) { + GoDownToFindLeavesToUpdate( + learner_state.tree->right_child(node_idx), features, thresholds, + is_in_right_split, maximum, split_feature, split_info, + previous_leaf_output, use_left_leaf_for_update && use_left_leaf, + use_right_leaf, split_threshold, best_split_per_leaf_, + is_feature_used_, num_threads_, num_features_, histogram_pool_, + learner_state); + } + } +}
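Before the next helper, a worked instance of the contiguity rule it implements: if on the way up we crossed (feature 3, threshold 10) going left, then on the way down a node splitting feature 3 at threshold 12 cannot have leaves adjacent to the original leaf in its right subtree. Toy code (illustrative only):

#include <cstdint>
#include <cstdio>

int main() {
  // recorded on the way up: split on feature 3 at threshold 10, original leaf on the left
  int path_feature = 3; uint32_t path_threshold = 10; bool went_right = false;
  // node visited on the way down: same feature, threshold 12
  int node_feature = 3; uint32_t node_threshold = 12;
  bool keep_going_left = true, keep_going_right = true;
  if (node_feature == path_feature) {
    if (node_threshold >= path_threshold && !went_right) keep_going_right = false;
    if (node_threshold <= path_threshold && went_right) keep_going_left = false;
  }
  std::printf("keep left: %d, keep right: %d\n", keep_going_left, keep_going_right);
  return 0;
}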
+ +// this function checks if the original leaf and the children of the node that +// is currently being visited are contiguous, and if so, the children should +// be visited too +std::pair<bool, bool> LeafConstraints::ShouldKeepGoingLeftRight( + const Tree *tree, int node_idx, const std::vector<int> &features, + const std::vector<uint32_t> &thresholds, + const std::vector<bool> &is_in_right_split, const Dataset *train_data_) { + int inner_feature = tree->split_feature_inner(node_idx); + uint32_t threshold = tree->threshold_in_bin(node_idx); + bool is_split_numerical = train_data_->FeatureBinMapper(inner_feature) + ->bin_type() == BinType::NumericalBin; + + bool keep_going_right = true; + bool keep_going_left = true; + // we check if the left and right node are contiguous with the original leaf + // if so we should keep going down these nodes to update constraints + for (unsigned int i = 0; i < features.size(); ++i) { + if (features[i] == inner_feature) { + if (is_split_numerical) { + if (threshold >= thresholds[i] && !is_in_right_split[i]) { + keep_going_right = false; + } + if (threshold <= thresholds[i] && is_in_right_split[i]) { + keep_going_left = false; + } + } + } + } + return std::pair<bool, bool>(keep_going_left, keep_going_right); +} + +// this function updates the best split for each leaf +// it is called only when monotone constraints exist +void LeafConstraints::UpdateBestSplitsFromHistograms( + SplitInfo &split, int leaf, int depth, + const std::vector<int8_t> &is_feature_used_, int num_threads_, + int num_features_, HistogramPool &histogram_pool_, + LearnerState &learner_state) { + std::vector<SplitInfo> bests(num_threads_); + std::vector<bool> should_split_be_worse(num_threads_, false); + + // the feature histogram is retrieved + FeatureHistogram *histogram_array_; + histogram_pool_.Get(leaf, &histogram_array_); + + OMP_INIT_EX(); +#pragma omp parallel for schedule(static, 1024) if (num_features_ >= 2048) + for (int feature_index = 0; feature_index < num_features_; ++feature_index) { + OMP_LOOP_EX_BEGIN(); + // only features that are supposed to be used are considered + if (!is_feature_used_[feature_index]) + continue; + if (!histogram_array_[feature_index].is_splittable()) { + continue; + } + + // loop through the features to find the best one just like in the + // FindBestSplitsFromHistograms function + const int tid = omp_get_thread_num(); + int real_fidx = learner_state.train_data_->RealFeatureIndex(feature_index); + + // if the monotone precise mode is disabled or if the constraints have to be + // updated, but are not exclusively worse, then we update the constraints + // and the best split + if (!learner_state.config_->monotone_precise_mode || + (learner_state.constraints_per_leaf_[leaf].ToBeUpdated(feature_index) && + !learner_state.constraints_per_leaf_[leaf] + .AreActualConstraintsWorse(feature_index))) { + + SerialTreeLearner::ComputeBestSplitForFeature( + split.left_sum_gradient + split.right_sum_gradient, + split.left_sum_hessian + split.right_sum_hessian, + split.left_count + split.right_count, feature_index, histogram_array_, + bests, leaf, depth, tid, real_fidx, learner_state, true); + } else { + if (learner_state.cegb_->GetSplitInfo( + leaf * learner_state.train_data_->num_features() + + feature_index) > bests[tid]) { + bests[tid] = learner_state.cegb_->GetSplitInfo( + leaf * learner_state.train_data_->num_features() + feature_index); + should_split_be_worse[tid] = + learner_state.constraints_per_leaf_[leaf] + .AreActualConstraintsWorse(feature_index); + } + } + + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); + + auto best_idx = ArrayArgs<SplitInfo>::ArgMax(bests);
+ // if the best split that was found previously was evaluated with worse + // constraints than the true ones (which were not computed earlier, to save + // time), then we update every split and every constraint that should be + // updated + if (should_split_be_worse[best_idx]) { + std::fill(bests.begin(), bests.end(), SplitInfo()); +#pragma omp parallel for schedule(static, 1024) if (num_features_ >= 2048) + for (int feature_index = 0; feature_index < num_features_; + ++feature_index) { + OMP_LOOP_EX_BEGIN(); + if (!is_feature_used_[feature_index]) + continue; + if (!histogram_array_[feature_index].is_splittable()) { + continue; + } + + const int tid = omp_get_thread_num(); + int real_fidx = + learner_state.train_data_->RealFeatureIndex(feature_index); + + if (learner_state.constraints_per_leaf_[leaf] + .AreActualConstraintsWorse(feature_index)) { + ; + } else { +#ifdef DEBUG + CHECK(!learner_state.constraints_per_leaf_[leaf] + .ToBeUpdated(feature_index)); +#endif + if (learner_state.cegb_->GetSplitInfo( + leaf * learner_state.train_data_->num_features() + + feature_index) > bests[tid]) { + bests[tid] = learner_state.cegb_->GetSplitInfo( + leaf * learner_state.train_data_->num_features() + feature_index); + } + } + + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); + best_idx = ArrayArgs<SplitInfo>::ArgMax(bests); + } + + // note: the gains may differ for the same set of constraints due to the + // non-deterministic OMP reduction. + split = bests[best_idx]; +} + +} // namespace LightGBM diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp new file mode 100644 index 000000000000..a5fa8f2a4cc3 --- /dev/null +++ b/src/treelearner/monotone_constraints.hpp @@ -0,0 +1,639 @@ +#ifndef LIGHTGBM_TREELEARNER_MONOTONE_CONSTRAINTS_H_ +#define LIGHTGBM_TREELEARNER_MONOTONE_CONSTRAINTS_H_ + +#include +#include +#include "split_info.hpp" +#include +#include "data_partition.hpp" + +namespace LightGBM { + +struct CostEfficientGradientBoosting; +struct CurrentConstraints; +class HistogramPool; +struct LeafConstraints; + +struct LearnerState { + const Config *config_; + const DataPartition *data_partition_; + const Dataset *train_data_; + std::vector<LeafConstraints> &constraints_per_leaf_; + const Tree *tree; + CurrentConstraints &current_constraints; + std::unique_ptr<CostEfficientGradientBoosting> &cegb_; + + LearnerState(const Config *config_, + const DataPartition *data_partition_, + const Dataset *train_data_, + std::vector<LeafConstraints> &constraints_per_leaf_, + const Tree *tree, CurrentConstraints &current_constraints, + std::unique_ptr<CostEfficientGradientBoosting> &cegb_) + : config_(config_), data_partition_(data_partition_), + train_data_(train_data_), constraints_per_leaf_(constraints_per_leaf_), + tree(tree), current_constraints(current_constraints), cegb_(cegb_) {}; +}; + +// the purpose of this structure is to store the constraints for one leaf +// when the monotone precise mode is disabled, then it will just store +// one min and one max constraint +// but if the monotone precise mode is enabled, then it may store a +// large number of constraints for different thresholds and features +struct LeafConstraints { + std::vector<std::vector<double> > min_constraints; + std::vector<std::vector<double> > max_constraints; + // the constraint number i is valid on the slice + // [thresholds[i]:thresholds[i+1]) + // if thresholds[i+1] does not exist, then it is valid for thresholds following + // thresholds[i] + std::vector<std::vector<uint32_t> > min_thresholds; + std::vector<std::vector<uint32_t> > max_thresholds; + // These 2 vectors keep track of which constraints over which features + // have to be updated + std::vector<bool> min_to_be_updated; + std::vector<bool> max_to_be_updated; + // This vector keeps track of the
constraints that we didn't update for some + // features, because they could only be worse, and another better split was + // available, so we didn't need to compute them yet, but we may need to in the + // future + std::vector<bool> are_actual_constraints_worse; + + static void SetChildrenConstraintsFastMethod( + std::vector<LeafConstraints> &constraints_per_leaf, int *right_leaf, + int *left_leaf, int8_t monotone_type, double right_output, + double left_output, bool is_numerical_split); + + static void GoUpToFindLeavesToUpdate( + int node_idx, std::vector<int> &features, + std::vector<uint32_t> &thresholds, std::vector<bool> &is_in_right_split, + int split_feature, const SplitInfo &split_info, + double previous_leaf_output, uint32_t split_threshold, + std::vector<SplitInfo> &best_split_per_leaf_, + const std::vector<int8_t> &is_feature_used_, int num_threads_, + int num_features_, HistogramPool &histogram_pool_, + LearnerState &learner_state); + + static void GoUpToFindLeavesToUpdate( + int node_idx, int split_feature, const SplitInfo &split_info, + double previous_leaf_output, uint32_t split_threshold, + std::vector<SplitInfo> &best_split_per_leaf_, + const std::vector<int8_t> &is_feature_used_, int num_threads_, + int num_features_, HistogramPool &histogram_pool_, + LearnerState &learner_state) { + int depth = learner_state.tree->leaf_depth( + ~learner_state.tree->left_child(node_idx)) - + 1; + + std::vector<int> features; + std::vector<uint32_t> thresholds; + std::vector<bool> is_in_right_split; + + features.reserve(depth); + thresholds.reserve(depth); + is_in_right_split.reserve(depth); + + GoUpToFindLeavesToUpdate(node_idx, features, thresholds, is_in_right_split, + split_feature, split_info, previous_leaf_output, + split_threshold, best_split_per_leaf_, + is_feature_used_, num_threads_, num_features_, + histogram_pool_, learner_state); + } + + static void GoDownToFindLeavesToUpdate( + int node_idx, const std::vector<int> &features, + const std::vector<uint32_t> &thresholds, + const std::vector<bool> &is_in_right_split, int maximum, + int split_feature, const SplitInfo &split_info, + double previous_leaf_output, bool use_left_leaf, bool use_right_leaf, + uint32_t split_threshold, std::vector<SplitInfo> &best_split_per_leaf_, + const std::vector<int8_t> &is_feature_used_, int num_threads_, + int num_features_, HistogramPool &histogram_pool_, + LearnerState &learner_state); + + static std::pair<bool, bool> ShouldKeepGoingLeftRight( + const Tree *tree, int node_idx, const std::vector<int> &features, + const std::vector<uint32_t> &thresholds, + const std::vector<bool> &is_in_right_split, const Dataset *train_data_); + + static void + UpdateBestSplitsFromHistograms(SplitInfo &split, int leaf, int depth, + const std::vector<int8_t> &is_feature_used_, + int num_threads_, int num_features_, + HistogramPool &histogram_pool_, + LearnerState &learner_state); + + bool IsInConstraints(double element, + const std::vector<std::vector<double> > &constraints, + std::vector<bool> &to_be_updated) { + bool ret = false; + for (unsigned int i = 0; i < constraints.size(); i++) { + for (unsigned int j = 0; j < constraints[i].size(); j++) { + if (element == constraints[i][j]) { + ret = true; + to_be_updated[i] = true; + are_actual_constraints_worse[i] = false; + } + } + } + return ret; + } + + bool IsInMinConstraints(double min) { + return IsInConstraints(min, min_constraints, min_to_be_updated); + } + + bool IsInMaxConstraints(double max) { + return IsInConstraints(max, max_constraints, max_to_be_updated); + } + + void SetConstraint(double element, + std::vector<std::vector<double> > &constraints, + bool is_operator_greater) const { + for (unsigned int i = 0; i < constraints.size(); i++) { + for (unsigned int j = 0; j <
constraints[i].size(); j++) { + if ((is_operator_greater && element > constraints[i][j]) || + (!is_operator_greater && element < constraints[i][j])) { + constraints[i][j] = element; + } + } + } + } + + // this function is the same as the previous one, but it also returns + // whether it actually modified something or not + bool + SetConstraintAndReturnChange(double element, + std::vector<std::vector<double> > &constraints, + bool is_operator_greater) const { + bool something_changed = false; + for (unsigned int i = 0; i < constraints.size(); i++) { + for (unsigned int j = 0; j < constraints[i].size(); j++) { + if ((is_operator_greater && element > constraints[i][j]) || + (!is_operator_greater && element < constraints[i][j])) { + constraints[i][j] = element; + something_changed = true; + } + } + } + return something_changed; + } + + // this function checks if the element passed as a parameter would actually update + // the constraints if it were to be set as an additional constraint + bool CrossesConstraint(double element, + std::vector<std::vector<double> > &constraints, + bool is_operator_greater, + std::vector<bool> &to_be_updated) { + bool ret = false; + for (unsigned int i = 0; i < constraints.size(); i++) { + for (unsigned int j = 0; j < constraints[i].size(); j++) { + if ((is_operator_greater && element > constraints[i][j]) || + (!is_operator_greater && element < constraints[i][j])) { + ret = true; + to_be_updated[i] = true; + are_actual_constraints_worse[i] = true; + } + } + } + return ret; + } + + bool SetMinConstraintAndReturnChange(double min) { + return SetConstraintAndReturnChange(min, min_constraints, true); + } + + bool SetMaxConstraintAndReturnChange(double max) { + return SetConstraintAndReturnChange(max, max_constraints, false); + } + + void SetMinConstraint(double min) { + SetConstraint(min, min_constraints, true); + } + + void SetMaxConstraint(double max) { + SetConstraint(max, max_constraints, false); + } + + bool CrossesMinConstraint(double min) { + return CrossesConstraint(min, min_constraints, true, min_to_be_updated); + } + + bool CrossesMaxConstraint(double max) { + return CrossesConstraint(max, max_constraints, false, max_to_be_updated); + } + + void ResetUpdates(unsigned int i) { +#ifdef DEBUG + CHECK(i < are_actual_constraints_worse.size()); +#endif + are_actual_constraints_worse[i] = false; + min_to_be_updated[i] = false; + max_to_be_updated[i] = false; + } + + // when the monotone precise mode is disabled, then we can just store + // 1 min and 1 max constraint per leaf, so we call this constructor + LeafConstraints() { + min_constraints.push_back( + std::vector<double>(1, -std::numeric_limits<double>::max())); + max_constraints.push_back( + std::vector<double>(1, std::numeric_limits<double>::max())); + min_thresholds.push_back(std::vector<uint32_t>(1, 0)); + max_thresholds.push_back(std::vector<uint32_t>(1, 0)); + } + + // when the monotone precise mode is enabled, then for each feature, + // we need to sort an array of constraints + LeafConstraints(unsigned int num_features) { + min_constraints.resize(num_features); + max_constraints.resize(num_features); + + min_thresholds.resize(num_features); + max_thresholds.resize(num_features); + + min_to_be_updated.resize(num_features, false); + max_to_be_updated.resize(num_features, false); + are_actual_constraints_worse.resize(num_features, false); + + for (unsigned int i = 0; i < num_features; i++) { + // The number 32 has no real meaning here, but during our experiments, + // we found that the number of constraints per feature was well below 32, so by + // allocating this space, we may save some time because
+
+  // when the monotone precise mode is enabled, then for each feature,
+  // we need to store an array of constraints
+  LeafConstraints(unsigned int num_features) {
+    min_constraints.resize(num_features);
+    max_constraints.resize(num_features);
+
+    min_thresholds.resize(num_features);
+    max_thresholds.resize(num_features);
+
+    min_to_be_updated.resize(num_features, false);
+    max_to_be_updated.resize(num_features, false);
+    are_actual_constraints_worse.resize(num_features, false);
+
+    for (unsigned int i = 0; i < num_features; i++) {
+      // The number 32 has no real meaning here, but during our experiments,
+      // we found that the number of constraints per feature was well below
+      // 32, so by allocating this space, we may save some time because we
+      // won't have to allocate it later
+      min_constraints[i].reserve(32);
+      max_constraints[i].reserve(32);
+
+      min_thresholds[i].reserve(32);
+      max_thresholds[i].reserve(32);
+
+      min_constraints[i].push_back(-std::numeric_limits<double>::max());
+      max_constraints[i].push_back(std::numeric_limits<double>::max());
+
+      min_thresholds[i].push_back(0);
+      max_thresholds[i].push_back(0);
+    }
+  }
+
+  bool AreActualConstraintsWorse(unsigned int feature_idx) const {
+    return are_actual_constraints_worse[feature_idx];
+  }
+
+  bool ToBeUpdated(unsigned int feature_idx) const {
+    return min_to_be_updated[feature_idx] || max_to_be_updated[feature_idx];
+  }
+
+  bool MinToBeUpdated(unsigned int feature_idx) const {
+    return min_to_be_updated[feature_idx];
+  }
+
+  bool MaxToBeUpdated(unsigned int feature_idx) const {
+    return max_to_be_updated[feature_idx];
+  }
+
+  LeafConstraints(const LeafConstraints &constraints)
+      : min_constraints(constraints.min_constraints),
+        max_constraints(constraints.max_constraints),
+        min_thresholds(constraints.min_thresholds),
+        max_thresholds(constraints.max_thresholds),
+        min_to_be_updated(constraints.min_to_be_updated),
+        max_to_be_updated(constraints.max_to_be_updated),
+        are_actual_constraints_worse(constraints.are_actual_constraints_worse) {
+  }
+
+  // When we reset the constraints, we just need to write that the constraints
+  // are +/- inf, starting from the threshold 0
+  void Reset() {
+    for (unsigned int i = 0; i < min_constraints.size(); i++) {
+      min_constraints[i].resize(1);
+      max_constraints[i].resize(1);
+      min_thresholds[i].resize(1);
+      max_thresholds[i].resize(1);
+
+      min_constraints[i][0] = -std::numeric_limits<double>::max();
+      max_constraints[i][0] = std::numeric_limits<double>::max();
+      min_thresholds[i][0] = 0;
+      max_thresholds[i][0] = 0;
+    }
+  }
+
+  static double ComputeMonotoneSplitGainPenalty(int depth, double penalization,
+                                                double epsilon = 1e-10) {
+    if (penalization >= depth + 1.) {
+      return epsilon;
+    }
+    if (penalization <= 1.) {
+      return 1. - penalization / pow(2., depth) + epsilon;
+    }
+    return 1. - pow(2., penalization - 1. - depth) + epsilon;
+  }
+};
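The penalty above shrinks the gain of monotone splits near the root. A minimal standalone Python mirror of `ComputeMonotoneSplitGainPenalty` (a sketch, not the library API) makes the behaviour concrete:

```python
def monotone_split_gain_penalty(depth, penalization, epsilon=1e-10):
    # Mirror of LeafConstraints::ComputeMonotoneSplitGainPenalty:
    # returns the multiplier applied to the gain of a monotone split
    # found at `depth` (the root split is at depth 0).
    if penalization >= depth + 1.0:
        return epsilon  # monotone splits effectively forbidden at this depth
    if penalization <= 1.0:
        return 1.0 - penalization / 2.0 ** depth + epsilon
    return 1.0 - 2.0 ** (penalization - 1.0 - depth) + epsilon

# A penalization of 2 forbids monotone splits at depths 0 and 1,
# halves the gain at depth 2, and fades out deeper in the tree.
for depth in range(4):
    print(depth, monotone_split_gain_penalty(depth, 2.0))
# 0 1e-10
# 1 1e-10
# 2 0.5000000001
# 3 0.7500000001
```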
+
+struct SplittingConstraint {
+  std::vector<double> cumulative_min_constraint;
+  std::vector<double> cumulative_max_constraint;
+
+  unsigned int index_min_constraint;
+  unsigned int index_max_constraint;
+
+  SplittingConstraint() {
+    index_min_constraint = 0;
+    index_max_constraint = 0;
+
+    cumulative_min_constraint =
+        std::vector<double>(1, -std::numeric_limits<double>::max());
+    cumulative_max_constraint =
+        std::vector<double>(1, std::numeric_limits<double>::max());
+  }
+
+  SplittingConstraint(std::vector<double> cumulative_min_constraint,
+                      std::vector<double> cumulative_max_constraint) {
+    this->cumulative_min_constraint = cumulative_min_constraint;
+    this->cumulative_max_constraint = cumulative_max_constraint;
+  }
+
+  double GetCurrentMinConstraint() const {
+    return cumulative_min_constraint[index_min_constraint];
+  }
+
+  double GetCurrentMaxConstraint() const {
+    return cumulative_max_constraint[index_max_constraint];
+  }
+
+  void Reserve(int space_to_reserve) {
+    cumulative_max_constraint.reserve(space_to_reserve);
+    cumulative_min_constraint.reserve(space_to_reserve);
+  }
+
+  void InitializeConstraints() {
+    cumulative_max_constraint.resize(1);
+    cumulative_min_constraint.resize(1);
+    cumulative_min_constraint[0] = -std::numeric_limits<double>::max();
+    cumulative_max_constraint[0] = std::numeric_limits<double>::max();
+  }
+
+  void InitializeIndices(int dir, int min_size, int max_size) {
+    if (dir == -1) {
+      index_min_constraint = min_size;
+      index_max_constraint = max_size;
+    } else {
+      index_min_constraint = 0;
+      index_max_constraint = 0;
+    }
+  }
+
+  void Set(const LeafConstraints &leaf_constraints) {
+    cumulative_min_constraint[0] = leaf_constraints.min_constraints[0][0];
+    cumulative_max_constraint[0] = leaf_constraints.max_constraints[0][0];
+  }
+};
+
+struct SplittingConstraints {
+  SplittingConstraint right;
+  SplittingConstraint left;
+
+  std::vector<double> cumulative_min_constraint_right_to_left;
+  std::vector<double> cumulative_max_constraint_right_to_left;
+  std::vector<double> cumulative_min_constraint_left_to_right;
+  std::vector<double> cumulative_max_constraint_left_to_right;
+
+  std::vector<uint32_t> thresholds_min_constraints;
+  std::vector<uint32_t> thresholds_max_constraints;
+
+  unsigned int index_min_constraint_left_to_right;
+  unsigned int index_min_constraint_right_to_left;
+  unsigned int index_max_constraint_left_to_right;
+  unsigned int index_max_constraint_right_to_left;
+  bool update_is_necessary;
+
+  SplittingConstraints() {}
+
+  SplittingConstraints(
+      std::vector<double> &cumulative_min_constraint_right_to_left,
+      std::vector<double> &cumulative_min_constraint_left_to_right,
+      std::vector<double> &cumulative_max_constraint_right_to_left,
+      std::vector<double> &cumulative_max_constraint_left_to_right,
+      std::vector<uint32_t> &thresholds_min_constraints,
+      std::vector<uint32_t> &thresholds_max_constraints) {
+    right = SplittingConstraint(cumulative_min_constraint_right_to_left,
+                                cumulative_max_constraint_right_to_left);
+    left = SplittingConstraint(cumulative_min_constraint_left_to_right,
+                               cumulative_max_constraint_left_to_right);
+
+    this->thresholds_min_constraints = thresholds_min_constraints;
+    this->thresholds_max_constraints = thresholds_max_constraints;
+  }
+
+  static void CumulativeExtremum(
+      const double &(*extremum_function)(const double &, const double &),
+      bool is_direction_from_left_to_right,
+      std::vector<double> &cumulative_extremum) {
+    if (cumulative_extremum.size() == 1) {
+      return;
+    }
+#ifdef DEBUG
+    CHECK(cumulative_extremum.size() != 0);
+#endif
+
+    std::size_t n_exts = cumulative_extremum.size();
+    int step = is_direction_from_left_to_right ? 1 : -1;
+    std::size_t start = is_direction_from_left_to_right ? 0 : n_exts - 1;
+    std::size_t end = is_direction_from_left_to_right ? n_exts - 1 : 0;
+
+    for (auto i = start; i != end; i = i + step) {
+      cumulative_extremum[i + step] = extremum_function(
+          cumulative_extremum[i + step], cumulative_extremum[i]);
+    }
+  }
+
+  void ComputeCumulativeExtremums() {
+    const double &(*min)(const double &, const double &) = std::min<double>;
+    const double &(*max)(const double &, const double &) = std::max<double>;
+
+    CumulativeExtremum(max, true, left.cumulative_min_constraint);
+    CumulativeExtremum(max, false, right.cumulative_min_constraint);
+    CumulativeExtremum(min, true, left.cumulative_max_constraint);
+    CumulativeExtremum(min, false, right.cumulative_max_constraint);
+  }
+
+  void InitializeIndices(int dir) {
+    right.InitializeIndices(dir, thresholds_min_constraints.size() - 1,
+                            thresholds_max_constraints.size() - 1);
+    left.InitializeIndices(dir, thresholds_min_constraints.size() - 1,
+                           thresholds_max_constraints.size() - 1);
+    if (dir == -1) {
+      update_is_necessary = !(thresholds_max_constraints.size() == 1 &&
+                              thresholds_min_constraints.size() == 1);
+    }
+  }
+
+  void UpdateIndices(int dir, const int8_t bias, int t) {
+    if (dir == -1) {
+      if (update_is_necessary) {
+        while (static_cast<int>(
+                   thresholds_min_constraints[left.index_min_constraint]) >
+               t + bias - 1) {
+          left.index_min_constraint -= 1;
+        }
+        while (static_cast<int>(
+                   thresholds_min_constraints[right.index_min_constraint]) >
+               t + bias) {
+          right.index_min_constraint -= 1;
+        }
+        while (static_cast<int>(
+                   thresholds_max_constraints[left.index_max_constraint]) >
+               t + bias - 1) {
+          left.index_max_constraint -= 1;
+        }
+        while (static_cast<int>(
+                   thresholds_max_constraints[right.index_max_constraint]) >
+               t + bias) {
+          right.index_max_constraint -= 1;
+        }
+      }
+#ifdef DEBUG
+      CHECK(left.index_min_constraint < thresholds_min_constraints.size());
+      CHECK(right.index_min_constraint < thresholds_min_constraints.size());
+      CHECK(left.index_max_constraint < thresholds_max_constraints.size());
+      CHECK(right.index_max_constraint < thresholds_max_constraints.size());
+#endif
+    } else {
+      // current split gain
+#ifdef DEBUG
+      CHECK(left.index_min_constraint < thresholds_min_constraints.size());
+      CHECK(right.index_min_constraint < thresholds_min_constraints.size());
+      CHECK(left.index_max_constraint < thresholds_max_constraints.size());
+      CHECK(right.index_max_constraint < thresholds_max_constraints.size());
+#endif
+    }
+  }
+
+  void Reserve(int space_to_reserve) {
+    right.Reserve(space_to_reserve);
+    left.Reserve(space_to_reserve);
+    thresholds_max_constraints.reserve(space_to_reserve);
+    thresholds_min_constraints.reserve(space_to_reserve);
+  }
+
+  void InitializeConstraints() {
+    thresholds_min_constraints.resize(1);
+    thresholds_max_constraints.resize(1);
+
+    thresholds_min_constraints[0] = 0;
+    thresholds_max_constraints[0] = 0;
+
+    right.InitializeConstraints();
+    left.InitializeConstraints();
+  }
+
+  void Set(const LeafConstraints &leaf_constraints) {
+    right.Set(leaf_constraints);
+    left.Set(leaf_constraints);
+    thresholds_min_constraints[0] = leaf_constraints.min_thresholds[0][0];
+    thresholds_max_constraints[0] = leaf_constraints.max_thresholds[0][0];
+  }
+
+  void CheckCoherenceWithLeafOutput(double leaf_output, double EPS) {
+    CHECK(left.cumulative_min_constraint == right.cumulative_min_constraint);
+    CHECK(left.cumulative_max_constraint == right.cumulative_max_constraint);
+    for (const auto &x : left.cumulative_max_constraint) {
+      CHECK(leaf_output <= EPS + x);
+      CHECK(x > -std::numeric_limits<double>::max());
+    }
+    for (const auto &x : right.cumulative_min_constraint) {
+      CHECK(leaf_output + EPS >= x);
+      CHECK(x < std::numeric_limits<double>::max());
+    }
+  }
+};
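Since precise-mode constraints are stored per threshold, the struct above folds them into cumulative extrema before a split search: scanning thresholds left to right, the binding min constraint for the left child at threshold t is the max of all min constraints up to t, and a symmetric right-to-left pass serves the right child. A small Python sketch of `CumulativeExtremum` (illustrative, not the LightGBM API):

```python
def cumulative_extremum(values, extremum, left_to_right=True):
    # Mirror of SplittingConstraints::CumulativeExtremum (sketch only):
    # accumulate a running extremum over per-threshold constraint values.
    out = list(values)
    if left_to_right:
        for i in range(1, len(out)):
            out[i] = extremum(out[i], out[i - 1])
    else:
        for i in range(len(out) - 2, -1, -1):
            out[i] = extremum(out[i], out[i + 1])
    return out

mins = [0.2, -1.0, 0.5]  # per-threshold min constraints
print(cumulative_extremum(mins, max, left_to_right=True))   # [0.2, 0.2, 0.5]
print(cumulative_extremum(mins, max, left_to_right=False))  # [0.5, 0.5, 0.5]
```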
+
+struct CurrentConstraints {
+  std::vector<SplittingConstraints> splitting_constraints_vector;
+
+  const int space_to_reserve_non_monotone_precise_mode;
+  const int space_to_reserve_monotone_precise_mode;
+
+  // the number 32 has no real meaning here, but during our experiments,
+  // we found that the number of constraints per feature was well below 32, so
+  // by allocating this space, we may save some time because we won't have to
+  // allocate it later
+  CurrentConstraints()
+      : space_to_reserve_non_monotone_precise_mode(1),
+        space_to_reserve_monotone_precise_mode(32) {}
+
+  void Init(int num_threads_, const Config *config_) {
+    splitting_constraints_vector.resize(num_threads_);
+
+    int space_to_reserve = space_to_reserve_monotone_precise_mode;
+    if (!config_->monotone_precise_mode) {
+      space_to_reserve = space_to_reserve_non_monotone_precise_mode;
+    }
+
+    for (int i = 0; i < num_threads_; ++i) {
+      splitting_constraints_vector[i].Reserve(space_to_reserve);
+      InitializeConstraints(i);
+    }
+  }
+
+  SplittingConstraints &operator[](unsigned int i) {
+    return splitting_constraints_vector[i];
+  }
+
+  // initializing constraints is just writing that the constraints should be
+  // +/- inf from threshold 0
+  void InitializeConstraints(unsigned int tid) {
+    splitting_constraints_vector[tid].InitializeConstraints();
+  }
+
+  void Set(const LeafConstraints &leaf_constraints, unsigned int tid) {
+    splitting_constraints_vector[tid].Set(leaf_constraints);
+  }
+
+  void CheckCoherenceWithLeafOutput(double leaf_output, unsigned int tid,
+                                    double EPS) {
+    splitting_constraints_vector[tid]
+        .CheckCoherenceWithLeafOutput(leaf_output, EPS);
+  }
+};
+
+struct BestConstraint {
+  double best_min_constraint;
+  double best_max_constraint;
+
+  BestConstraint() {
+    best_min_constraint = -std::numeric_limits<double>::max();
+    best_max_constraint = std::numeric_limits<double>::max();
+  }
+
+  void Update(const SplittingConstraint &constraint) {
+    best_min_constraint = constraint.GetCurrentMinConstraint();
+    best_max_constraint = constraint.GetCurrentMaxConstraint();
+  }
+
+  // named to match SplittingConstraint so the two can be used in the same
+  // templated code
+  double GetCurrentMinConstraint() const {
+    return best_min_constraint;
+  }
+
+  double GetCurrentMaxConstraint() const {
+    return best_max_constraint;
+  }
+};
+
+struct BestConstraints {
+  BestConstraint right;
+  BestConstraint left;
+
+  void Update(SplittingConstraints *constraints) {
+    right.Update(constraints->right);
+    left.Update(constraints->left);
+  }
+};
+
+}  // namespace LightGBM
+#endif  // LightGBM_TREELEARNER_MONOTONE_CONSTRAINTS_H_
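To make the two storage schemes in `LeafConstraints` concrete: in fast mode a leaf carries a single interval, while in precise mode each feature carries parallel arrays of constraint values and the bin thresholds from which they apply. A hypothetical Python sketch of the data layout (not the library API):

```python
INF = float("inf")  # stand-in for std::numeric_limits<double>::max()

# Fast mode (monotone_precise_mode=false): one global interval per leaf,
# mirroring the default LeafConstraints() constructor.
fast_leaf = {"min": [[-INF]], "max": [[INF]],
             "min_thresholds": [[0]], "max_thresholds": [[0]]}

# Precise mode (monotone_precise_mode=true): one array per feature,
# mirroring LeafConstraints(unsigned int num_features); entry j of
# "min"[f] applies from bin threshold "min_thresholds"[f][j] onwards.
num_features = 3
precise_leaf = {
    "min": [[-INF] for _ in range(num_features)],
    "max": [[INF] for _ in range(num_features)],
    "min_thresholds": [[0] for _ in range(num_features)],
    "max_thresholds": [[0] for _ in range(num_features)],
}

# Example: tighten the min constraint of feature 1 from bin 8 onwards.
precise_leaf["min"][1].append(0.25)
precise_leaf["min_thresholds"][1].append(8)
```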
diff --git a/src/treelearner/parallel_tree_learner.h b/src/treelearner/parallel_tree_learner.h
index c6754b517397..92a67cdbe5ea 100644
--- a/src/treelearner/parallel_tree_learner.h
+++ b/src/treelearner/parallel_tree_learner.h
@@ -31,7 +31,9 @@ class FeatureParallelTreeLearner: public TREELEARNER_T {
 protected:
   void BeforeTrain() override;
-  void FindBestSplitsFromHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) override;
+  void FindBestSplitsFromHistograms(const std::vector<int8_t> &is_feature_used,
+                                    bool use_subtract,
+                                    const Tree *tree) override;
 
 private:
  /*! \brief rank of local machine */
@@ -59,8 +61,10 @@ class DataParallelTreeLearner: public TREELEARNER_T {
 protected:
   void BeforeTrain() override;
-  void FindBestSplits() override;
-  void FindBestSplitsFromHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) override;
+  void FindBestSplits(const Tree *tree) override;
+  void FindBestSplitsFromHistograms(const std::vector<int8_t> &is_feature_used,
+                                    bool use_subtract,
+                                    const Tree *tree) override;
   void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override;
 
   inline data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const override {
@@ -114,8 +118,10 @@ class VotingParallelTreeLearner: public TREELEARNER_T {
 protected:
   void BeforeTrain() override;
   bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) override;
-  void FindBestSplits() override;
-  void FindBestSplitsFromHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) override;
+  void FindBestSplits(const Tree *tree) override;
+  void FindBestSplitsFromHistograms(const std::vector<int8_t> &is_feature_used,
+                                    bool use_subtract,
+                                    const Tree *tree) override;
   void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override;
 
   inline data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const override {
diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp
index e3531039ec9d..1f358ac8aa3e 100644
--- a/src/treelearner/serial_tree_learner.cpp
+++ b/src/treelearner/serial_tree_learner.cpp
@@ -25,6 +25,7 @@ std::chrono::duration<double, std::milli> hist_time;
 std::chrono::duration<double, std::milli> find_split_time;
 std::chrono::duration<double, std::milli> split_time;
 std::chrono::duration<double, std::milli> ordered_bin_time;
+std::chrono::duration<double, std::milli> refit_leaves_time;
 #endif  // TIMETAG
 
 SerialTreeLearner::SerialTreeLearner(const Config* config)
@@ -45,6 +46,7 @@ SerialTreeLearner::~SerialTreeLearner() {
   Log::Info("SerialTreeLearner::find_split costs %f", find_split_time * 1e-3);
   Log::Info("SerialTreeLearner::split costs %f", split_time * 1e-3);
   Log::Info("SerialTreeLearner::ordered_bin costs %f", ordered_bin_time * 1e-3);
+  Log::Info("SerialTreeLearner::refit_leaves costs %f", refit_leaves_time * 1e-3);
 #endif
 }
 
@@ -71,6 +73,16 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
   histogram_pool_.DynamicChangeSize(train_data_, config_, max_cache_size, config_->num_leaves);
   // push split information for all leaves
   best_split_per_leaf_.resize(config_->num_leaves);
+
+  // when the monotone precise mode is enabled, we need to store
+  // more constraints; hence the constructors are different
+  if (config_->monotone_precise_mode) {
+    constraints_per_leaf_.resize(config_->num_leaves,
+                                 LeafConstraints(num_features_));
+  } else {
+    constraints_per_leaf_.resize(config_->num_leaves,
+                                 LeafConstraints());
+  }
+
   // get ordered bin
   train_data_->CreateOrderedBins(&ordered_bins_);
 
@@ -108,6 +120,8 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
     cegb_.reset(new CostEfficientGradientBoosting(this));
     cegb_->Init();
   }
+
+  current_constraints.Init(num_threads_, config_);
 }
 
 void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) {
@@ -208,7 +222,7 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
     init_split_time += std::chrono::steady_clock::now() - start_time;
 #endif
     // find best threshold for every feature
-    FindBestSplits();
+    FindBestSplits(tree.get());
   } else if (aborted_last_force_split) {
     aborted_last_force_split = false;
   }
@@ -236,6 +250,7 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
   return tree.release();
 }
 
+
Tree* SerialTreeLearner::FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t *hessians) const {
  auto tree = std::unique_ptr<Tree>(new Tree(*old_tree));
  CHECK(data_partition_->num_leaves() >= tree->num_leaves());
@@ -337,6 +352,7 @@ void SerialTreeLearner::BeforeTrain() {
   // reset the splits for leaves
   for (int i = 0; i < config_->num_leaves; ++i) {
     best_split_per_leaf_[i].Reset();
+    constraints_per_leaf_[i].Reset();
   }
 
   // Sumup for root
@@ -346,7 +362,7 @@ void SerialTreeLearner::BeforeTrain() {
   } else {
     // use bagging, only use part of data
-    smaller_leaf_splits_->Init(0, data_partition_.get(), gradients_, hessians_);
+    smaller_leaf_splits_->Init(0, data_partition_.get(), gradients_, hessians_, 0.);
   }
 
   larger_leaf_splits_->Init();
@@ -476,7 +492,7 @@ bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int
   return true;
 }
 
-void SerialTreeLearner::FindBestSplits() {
+void SerialTreeLearner::FindBestSplits(const Tree* tree) {
   std::vector<int8_t> is_feature_used(num_features_, 0);
 #pragma omp parallel for schedule(static, 1024) if (num_features_ >= 2048)
   for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
@@ -490,7 +506,7 @@ void SerialTreeLearner::FindBestSplits() {
   }
   bool use_subtract = parent_leaf_histogram_array_ != nullptr;
   ConstructHistograms(is_feature_used, use_subtract);
-  FindBestSplitsFromHistograms(is_feature_used, use_subtract);
+  FindBestSplitsFromHistograms(is_feature_used, use_subtract, tree);
 }
 
 void SerialTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) {
@@ -521,10 +537,15 @@ void SerialTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_featur
 #endif
 }
 
-void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) {
+void SerialTreeLearner::FindBestSplitsFromHistograms(
+    const std::vector<int8_t> &is_feature_used, bool use_subtract,
+    const Tree *tree) {
 #ifdef TIMETAG
   auto start_time = std::chrono::steady_clock::now();
 #endif
+  LearnerState learner_state(config_, data_partition_.get(), train_data_,
+                             constraints_per_leaf_, tree, current_constraints,
+                             cegb_);
   std::vector<SplitInfo> smaller_best(num_threads_);
   std::vector<SplitInfo> larger_best(num_threads_);
   std::vector<int8_t> smaller_node_used_features(num_features_, 1);
@@ -540,26 +561,21 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>&
     OMP_LOOP_EX_BEGIN();
     if (!is_feature_used[feature_index]) { continue; }
     const int tid = omp_get_thread_num();
-    SplitInfo smaller_split;
+
     train_data_->FixHistogram(feature_index,
                               smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(),
                               smaller_leaf_splits_->num_data_in_leaf(),
                               smaller_leaf_histogram_array_[feature_index].RawData());
     int real_fidx = train_data_->RealFeatureIndex(feature_index);
-    smaller_leaf_histogram_array_[feature_index].FindBestThreshold(
-      smaller_leaf_splits_->sum_gradients(),
-      smaller_leaf_splits_->sum_hessians(),
-      smaller_leaf_splits_->num_data_in_leaf(),
-      smaller_leaf_splits_->min_constraint(),
-      smaller_leaf_splits_->max_constraint(),
-      &smaller_split);
-    smaller_split.feature = real_fidx;
-    if (cegb_ != nullptr) {
-      smaller_split.gain -= cegb_->DetlaGain(feature_index, real_fidx, smaller_leaf_splits_->LeafIndex(), smaller_leaf_splits_->num_data_in_leaf(), smaller_split);
-    }
-    if (smaller_split > smaller_best[tid] && smaller_node_used_features[feature_index]) {
-      smaller_best[tid] = smaller_split;
-    }
+
+    ComputeBestSplitForFeature(smaller_leaf_splits_->sum_gradients(),
+                               smaller_leaf_splits_->sum_hessians(),
+                               smaller_leaf_splits_->num_data_in_leaf(),
+                               feature_index, smaller_leaf_histogram_array_,
+                               smaller_best, smaller_leaf_splits_->LeafIndex(),
+                               smaller_leaf_splits_->depth(), tid, real_fidx,
+                               learner_state);
+
     // only has root leaf
     if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) { continue; }
@@ -570,22 +586,15 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>&
                                 larger_leaf_splits_->num_data_in_leaf(),
                                 larger_leaf_histogram_array_[feature_index].RawData());
     }
-    SplitInfo larger_split;
-    // find best threshold for larger child
-    larger_leaf_histogram_array_[feature_index].FindBestThreshold(
-      larger_leaf_splits_->sum_gradients(),
-      larger_leaf_splits_->sum_hessians(),
-      larger_leaf_splits_->num_data_in_leaf(),
-      larger_leaf_splits_->min_constraint(),
-      larger_leaf_splits_->max_constraint(),
-      &larger_split);
-    larger_split.feature = real_fidx;
-    if (cegb_ != nullptr) {
-      larger_split.gain -= cegb_->DetlaGain(feature_index, real_fidx, larger_leaf_splits_->LeafIndex(), larger_leaf_splits_->num_data_in_leaf(), larger_split);
-    }
-    if (larger_split > larger_best[tid] && larger_node_used_features[feature_index]) {
-      larger_best[tid] = larger_split;
-    }
+
+    ComputeBestSplitForFeature(larger_leaf_splits_->sum_gradients(),
+                               larger_leaf_splits_->sum_hessians(),
+                               larger_leaf_splits_->num_data_in_leaf(),
+                               feature_index, larger_leaf_histogram_array_,
+                               larger_best, larger_leaf_splits_->LeafIndex(),
+                               larger_leaf_splits_->depth(), tid, real_fidx,
+                               learner_state);
+
     OMP_LOOP_EX_END();
   }
   OMP_THROW_EX();
@@ -620,7 +629,7 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json
   // before processing next node from queue, store info for current left/right leaf
   // store "best split" for left and right, even if they might be overwritten by forced split
   if (BeforeFindBestSplit(tree, *left_leaf, *right_leaf)) {
-    FindBestSplits();
+    FindBestSplits(tree);
   }
   // then, compute own splits
   SplitInfo left_split;
@@ -679,6 +688,11 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json
     SplitInfo current_split_info = forceSplitMap[current_leaf];
     const int inner_feature_index = train_data_->InnerFeatureIndex(
         current_split_info.feature);
+    // we want to know if the feature has to be monotone
+    bool feature_is_monotone = false;
+    if (!config_->monotone_constraints.empty()) {
+      feature_is_monotone = config_->monotone_constraints[inner_feature_index] != 0;
+    }
     auto threshold_double = train_data_->RealThreshold(
         inner_feature_index, current_split_info.threshold);
@@ -698,7 +712,8 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json
                            static_cast<double>(current_split_info.right_sum_hessian),
                            static_cast<float>(current_split_info.gain),
                            train_data_->FeatureBinMapper(inner_feature_index)->missing_type(),
-                           current_split_info.default_left);
+                           current_split_info.default_left,
+                           feature_is_monotone);
       data_partition_->Split(current_leaf, train_data_, inner_feature_index,
                              &current_split_info.threshold, 1,
                              current_split_info.default_left, *right_leaf);
@@ -726,26 +741,33 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json
                                       static_cast<double>(current_split_info.left_sum_hessian),
                                       static_cast<double>(current_split_info.right_sum_hessian),
                                       static_cast<float>(current_split_info.gain),
-                                      train_data_->FeatureBinMapper(inner_feature_index)->missing_type());
+                                      train_data_->FeatureBinMapper(inner_feature_index)->missing_type(),
+                                      feature_is_monotone);
       data_partition_->Split(current_leaf, train_data_, inner_feature_index,
                              cat_bitset_inner.data(), static_cast<int>(cat_bitset_inner.size()),
                              current_split_info.default_left, *right_leaf);
     }
+    int depth = tree->leaf_depth(*left_leaf);
+#ifdef DEBUG
+    CHECK(depth == tree->leaf_depth(*right_leaf));
+#endif
     if (current_split_info.left_count < current_split_info.right_count) {
       left_smaller = true;
       smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(),
                                  current_split_info.left_sum_gradient,
-                                 current_split_info.left_sum_hessian);
+                                 current_split_info.left_sum_hessian, depth);
       larger_leaf_splits_->Init(*right_leaf, data_partition_.get(),
                                 current_split_info.right_sum_gradient,
-                                current_split_info.right_sum_hessian);
+                                current_split_info.right_sum_hessian, depth);
     } else {
       left_smaller = false;
       smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(),
-                                 current_split_info.right_sum_gradient, current_split_info.right_sum_hessian);
+                                 current_split_info.right_sum_gradient,
+                                 current_split_info.right_sum_hessian, depth);
       larger_leaf_splits_->Init(*left_leaf, data_partition_.get(),
-                                current_split_info.left_sum_gradient, current_split_info.left_sum_hessian);
+                                current_split_info.left_sum_gradient,
+                                current_split_info.left_sum_hessian, depth);
     }
 
     left = Json();
@@ -770,6 +792,7 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json
 void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf) {
   const SplitInfo& best_split_info = best_split_per_leaf_[best_leaf];
+  double previous_leaf_output = tree->LeafOutput(best_leaf);
   const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature);
   if (cegb_ != nullptr) {
     cegb_->UpdateLeafBestSplits(tree, best_leaf, &best_split_info, &best_split_per_leaf_);
@@ -793,7 +816,8 @@ void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* ri
                            static_cast<double>(best_split_info.right_sum_hessian),
                            static_cast<float>(best_split_info.gain),
                            train_data_->FeatureBinMapper(inner_feature_index)->missing_type(),
-                           best_split_info.default_left);
+                           best_split_info.default_left,
+                           best_split_info.monotone_type != 0);
     data_partition_->Split(best_leaf, train_data_, inner_feature_index,
                            &best_split_info.threshold, 1, best_split_info.default_left, *right_leaf);
   } else {
@@ -817,7 +841,8 @@ void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* ri
                                       static_cast<double>(best_split_info.left_sum_hessian),
                                       static_cast<double>(best_split_info.right_sum_hessian),
                                       static_cast<float>(best_split_info.gain),
-                                      train_data_->FeatureBinMapper(inner_feature_index)->missing_type());
+                                      train_data_->FeatureBinMapper(inner_feature_index)->missing_type(),
+                                      best_split_info.monotone_type != 0);
     data_partition_->Split(best_leaf, train_data_, inner_feature_index,
                            cat_bitset_inner.data(), static_cast<int>(cat_bitset_inner.size()), best_split_info.default_left, *right_leaf);
   }
@@ -825,33 +850,52 @@ void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* ri
 #ifdef DEBUG
   CHECK(best_split_info.left_count == data_partition_->leaf_count(best_leaf));
 #endif
-  auto p_left = smaller_leaf_splits_.get();
-  auto p_right = larger_leaf_splits_.get();
   // init the leaves that will be used in the next iteration
+  int depth = tree->leaf_depth(*left_leaf);
+#ifdef DEBUG
+  CHECK(depth == tree->leaf_depth(*right_leaf));
+#endif
   if (best_split_info.left_count < best_split_info.right_count) {
-    smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian);
-    larger_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian);
+    smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(),
+                               best_split_info.left_sum_gradient,
+                               best_split_info.left_sum_hessian, depth);
+    larger_leaf_splits_->Init(*right_leaf, data_partition_.get(),
+                              best_split_info.right_sum_gradient,
+                              best_split_info.right_sum_hessian, depth);
   } else {
-    smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian);
-    larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian);
-    p_right = smaller_leaf_splits_.get();
-    p_left = larger_leaf_splits_.get();
-  }
-  p_left->SetValueConstraint(best_split_info.min_constraint, best_split_info.max_constraint);
-  p_right->SetValueConstraint(best_split_info.min_constraint, best_split_info.max_constraint);
-  if (is_numerical_split) {
-    double mid = (best_split_info.left_output + best_split_info.right_output) / 2.0f;
-    if (best_split_info.monotone_type < 0) {
-      p_left->SetValueConstraint(mid, best_split_info.max_constraint);
-      p_right->SetValueConstraint(best_split_info.min_constraint, mid);
-    } else if (best_split_info.monotone_type > 0) {
-      p_left->SetValueConstraint(best_split_info.min_constraint, mid);
-      p_right->SetValueConstraint(mid, best_split_info.max_constraint);
-    }
+    smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(),
+                               best_split_info.right_sum_gradient,
+                               best_split_info.right_sum_hessian, depth);
+    larger_leaf_splits_->Init(*left_leaf, data_partition_.get(),
+                              best_split_info.left_sum_gradient,
+                              best_split_info.left_sum_hessian, depth);
+  }
+
+  // when the monotone precise mode is disabled, it is very easy to compute
+  // the constraints of the children of a leaf, but when it is enabled, one
+  // needs to go through the tree to do so, and it is done directly before
+  // computing best splits
+  if (!config_->monotone_precise_mode) {
+    LeafConstraints::SetChildrenConstraintsFastMethod(
+        constraints_per_leaf_, right_leaf, left_leaf,
+        best_split_info.monotone_type, best_split_info.right_output,
+        best_split_info.left_output, is_numerical_split);
+  }
+
+  // if there is a monotone split above, we need to make sure the new
+  // values don't clash with existing constraints in the subtree,
+  // and if they do, the existing splits need to be updated
+  if (!config_->monotone_constraints.empty() && tree->leaf_is_in_monotone_subtree(*right_leaf)) {
+    LearnerState learner_state(config_, data_partition_.get(), train_data_,
+                               constraints_per_leaf_, tree, current_constraints,
+                               cegb_);
+    LeafConstraints::GoUpToFindLeavesToUpdate(
+        tree->leaf_parent(*right_leaf), inner_feature_index,
+        best_split_info, previous_leaf_output, best_split_info.threshold,
+        best_split_per_leaf_, is_feature_used_,
+        num_threads_, num_features_, histogram_pool_, learner_state);
+  }
 }
 
-
 void SerialTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function<double(const label_t*, int)> residual_getter,
                                         data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const {
   if (obj != nullptr && obj->IsRenewTreeOutput()) {
@@ -892,4 +936,56 @@ void SerialTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj
   }
 }
 
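The "fast" child-constraint update that this hunk moves out of `Split()` and into `LeafConstraints::SetChildrenConstraintsFastMethod` follows the mid-point rule visible in the removed lines. A Python sketch under illustrative names:

```python
def set_children_constraints_fast(parent_min, parent_max, monotone_type,
                                  left_output, right_output,
                                  is_numerical_split):
    # Both children start from the parent's interval.
    left = {"min": parent_min, "max": parent_max}
    right = {"min": parent_min, "max": parent_max}
    if is_numerical_split and monotone_type != 0:
        mid = (left_output + right_output) / 2.0
        if monotone_type < 0:   # decreasing: left child must stay >= mid
            left["min"], right["max"] = mid, mid
        else:                   # increasing: left child must stay <= mid
            left["max"], right["min"] = mid, mid
    return left, right

# An increasing constraint caps the left child at the mid-point 0.2:
print(set_children_constraints_fast(float("-inf"), float("inf"),
                                    1, -0.3, 0.7, True))
# ({'min': -inf, 'max': 0.2}, {'min': 0.2, 'max': inf})
```

Splitting at the mid-point is cheap but can over-constrain both children, which is exactly what the precise mode trades speed to avoid.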
+// this function computes the best split for a given leaf and a given feature
+void SerialTreeLearner::ComputeBestSplitForFeature(
+    double sum_gradient, double sum_hessian, data_size_t num_data,
+    int feature_index, FeatureHistogram *histogram_array_,
+    std::vector<SplitInfo> &bests, int leaf_index, int depth, const int tid,
+    int real_fidx, LearnerState &learner_state, bool update) {
+
+  // if this is not a subtree stemming from a monotone split, then no
+  // constraint applies
+  if (learner_state.tree->leaf_is_in_monotone_subtree(leaf_index)) {
+    if (!learner_state.config_->monotone_precise_mode) {
+      learner_state.current_constraints.Set(
+          learner_state.constraints_per_leaf_[leaf_index], tid);
+    }
+  }
+
+#ifdef DEBUG
+  learner_state.current_constraints.CheckCoherenceWithLeafOutput(
+      learner_state.tree->LeafOutput(leaf_index), tid, kEpsilon);
+#endif
+
+  SplitInfo new_split;
+
+  SplittingConstraints *constraints;
+  if (learner_state.config_->monotone_constraints.empty()) {
+    constraints = nullptr;
+  } else {
+    constraints = &learner_state.current_constraints[tid];
+  }
+  histogram_array_[feature_index].FindBestThreshold(
+      sum_gradient, sum_hessian, num_data, &new_split, constraints);
+
+  if (learner_state.tree->leaf_is_in_monotone_subtree(leaf_index)) {
+    learner_state.current_constraints.InitializeConstraints(tid);
+  }
+
+  new_split.feature = real_fidx;
+  if (learner_state.cegb_ != nullptr) {
+    new_split.gain -= learner_state.cegb_->DetlaGain(
+        feature_index, real_fidx, leaf_index, num_data, new_split);
+  }
+
+  if (new_split.monotone_type != 0) {
+    double penalty = LeafConstraints::ComputeMonotoneSplitGainPenalty(
+        depth, learner_state.config_->monotone_penalty);
+    new_split.gain *= penalty;
+  }
+
+  if (new_split > bests[tid]) {
+    bests[tid] = new_split;
+  }
+}
+
 }  // namespace LightGBM
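The `GoUpToFindLeavesToUpdate` call a few hunks above walks the tree whenever a fresh monotone split might clash with outputs elsewhere in the subtree. Very roughly (the real traversal prunes with `ShouldKeepGoingLeftRight` and the recorded thresholds, which this sketch omits), the idea is: climb from the new split's parent to the root, and from each ancestor descend into the opposite branch to collect leaves whose constraints may need updating. A hypothetical sketch with an assumed tree interface:

```python
def leaves_to_update(tree, node):
    # Climb towards the root; at each ancestor, the sibling subtree is the
    # region whose leaves may be affected by the new monotone split.
    leaves = []
    while True:
        parent = tree.parent(node)
        if parent is None:
            return leaves
        sibling = (tree.left(parent) if tree.right(parent) == node
                   else tree.right(parent))
        leaves.extend(collect_leaves(tree, sibling))
        node = parent

def collect_leaves(tree, node):
    # The real GoDownToFindLeavesToUpdate prunes branches that the recorded
    # split thresholds prove cannot contain affected leaves; this sketch
    # simply enumerates them all.
    if tree.is_leaf(node):
        return [node]
    return (collect_leaves(tree, tree.left(node)) +
            collect_leaves(tree, tree.right(node)))
```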
diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h
index 31743933a780..1026f0172060 100644
--- a/src/treelearner/serial_tree_learner.h
+++ b/src/treelearner/serial_tree_learner.h
@@ -22,6 +22,7 @@
 #include "feature_histogram.hpp"
 #include "leaf_splits.hpp"
 #include "split_info.hpp"
+#include "monotone_constraints.hpp"
 
 #ifdef USE_GPU
 // Use 4KBytes aligned allocator for ordered gradients and ordered hessians when GPU is enabled.
@@ -79,6 +80,12 @@ class SerialTreeLearner: public TreeLearner {
   void RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function<double(const label_t*, int)> residual_getter,
                        data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const override;
 
+  static void ComputeBestSplitForFeature(
+      double sum_gradient, double sum_hessian, data_size_t num_data,
+      int feature_index, FeatureHistogram *histogram_array_,
+      std::vector<SplitInfo> &bests, int leaf_index, int depth, const int tid,
+      int real_fidx, LearnerState &learner_state, bool update = false);
+
 protected:
   virtual std::vector<int8_t> GetUsedFeatures(bool is_tree_level);
  /*!
@@ -91,11 +98,13 @@ class SerialTreeLearner: public TreeLearner {
  */
   virtual bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf);
 
-  virtual void FindBestSplits();
+  virtual void FindBestSplits(const Tree* tree);
 
   virtual void ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract);
 
-  virtual void FindBestSplitsFromHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract);
+  virtual void
+  FindBestSplitsFromHistograms(const std::vector<int8_t> &is_feature_used,
+                               bool use_subtract, const Tree *tree);
 
  /*!
  * \brief Partition tree and data according best split.
@@ -145,8 +154,8 @@ class SerialTreeLearner: public TreeLearner {
  /*! \brief store best split points for all leaves */
   std::vector<SplitInfo> best_split_per_leaf_;
-  /*! \brief store best split per feature for all leaves */
-  std::vector<SplitInfo> splits_per_leaf_;
+
+  std::vector<LeafConstraints> constraints_per_leaf_;
 
  /*! \brief stores best thresholds for all feature for smaller leaf */
   std::unique_ptr<LeafSplits> smaller_leaf_splits_;
@@ -180,6 +189,8 @@ class SerialTreeLearner: public TreeLearner {
   std::vector<int> ordered_bin_indices_;
   bool is_constant_hessian_;
   std::unique_ptr<CostEfficientGradientBoosting> cegb_;
+
+  CurrentConstraints current_constraints;
 };
 
 inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leaf_idx) const {
diff --git a/src/treelearner/split_info.hpp b/src/treelearner/split_info.hpp
index 3afa72a0f4a3..86653522dd04 100644
--- a/src/treelearner/split_info.hpp
+++ b/src/treelearner/split_info.hpp
@@ -48,8 +48,6 @@ struct SplitInfo {
  /*! \brief True if default split is left */
   bool default_left = true;
   int8_t monotone_type = 0;
-  double min_constraint = -std::numeric_limits<double>::max();
-  double max_constraint = std::numeric_limits<double>::max();
   inline static int Size(int max_cat_threshold) {
     return 2 * sizeof(int) + sizeof(uint32_t) + sizeof(bool) + sizeof(double) * 9 + sizeof(data_size_t) * 2 + max_cat_threshold * sizeof(uint32_t) + sizeof(int8_t);
   }
@@ -81,10 +79,6 @@ struct SplitInfo {
     buffer += sizeof(default_left);
     std::memcpy(buffer, &monotone_type, sizeof(monotone_type));
     buffer += sizeof(monotone_type);
-    std::memcpy(buffer, &min_constraint, sizeof(min_constraint));
-    buffer += sizeof(min_constraint);
-    std::memcpy(buffer, &max_constraint, sizeof(max_constraint));
-    buffer += sizeof(max_constraint);
     std::memcpy(buffer, &num_cat_threshold, sizeof(num_cat_threshold));
     buffer += sizeof(num_cat_threshold);
     std::memcpy(buffer, cat_threshold.data(), sizeof(uint32_t) * num_cat_threshold);
@@ -117,10 +111,6 @@ struct SplitInfo {
     buffer += sizeof(default_left);
     std::memcpy(&monotone_type, buffer, sizeof(monotone_type));
     buffer += sizeof(monotone_type);
-    std::memcpy(&min_constraint, buffer, sizeof(min_constraint));
-    buffer += sizeof(min_constraint);
-    std::memcpy(&max_constraint, buffer, sizeof(max_constraint));
-    buffer += sizeof(max_constraint);
     std::memcpy(&num_cat_threshold, buffer, sizeof(num_cat_threshold));
     buffer += sizeof(num_cat_threshold);
     cat_threshold.resize(num_cat_threshold);
diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp
index cb18e3779ba6..265f2546e726 100644
--- a/src/treelearner/voting_parallel_tree_learner.cpp
+++ b/src/treelearner/voting_parallel_tree_learner.cpp
@@ -149,16 +149,26 @@ bool VotingParallelTreeLearner<TREELEARNER_T>::BeforeFindBestSplit(const Tree* t
   if (TREELEARNER_T::BeforeFindBestSplit(tree, left_leaf, right_leaf)) {
     data_size_t num_data_in_left_child = GetGlobalDataCountInLeaf(left_leaf);
     data_size_t num_data_in_right_child = GetGlobalDataCountInLeaf(right_leaf);
+    int depth = tree->leaf_depth(left_leaf);
+#ifdef DEBUG
+    CHECK(depth == tree->leaf_depth(right_leaf));
+#endif
     if (right_leaf < 0) {
       return true;
     } else if (num_data_in_left_child < num_data_in_right_child) {
       // get local sumup
-      this->smaller_leaf_splits_->Init(left_leaf, this->data_partition_.get(), this->gradients_, this->hessians_);
-      this->larger_leaf_splits_->Init(right_leaf, this->data_partition_.get(), this->gradients_, this->hessians_);
+      this->smaller_leaf_splits_->Init(left_leaf, this->data_partition_.get(),
+                                       this->gradients_, this->hessians_,
+                                       depth);
+      this->larger_leaf_splits_->Init(right_leaf, this->data_partition_.get(),
+                                      this->gradients_, this->hessians_, depth);
     } else {
       // get local sumup
-      this->smaller_leaf_splits_->Init(right_leaf, this->data_partition_.get(), this->gradients_, this->hessians_);
-      this->larger_leaf_splits_->Init(left_leaf, this->data_partition_.get(), this->gradients_, this->hessians_);
+      this->smaller_leaf_splits_->Init(right_leaf, this->data_partition_.get(),
+                                       this->gradients_, this->hessians_,
+                                       depth);
+      this->larger_leaf_splits_->Init(left_leaf, this->data_partition_.get(),
+                                      this->gradients_, this->hessians_, depth);
     }
     return true;
   } else {
@@ -259,7 +269,7 @@ void VotingParallelTreeLearner<TREELEARNER_T>::CopyLocalHistogram(const std::vec
 }
 
 template <typename TREELEARNER_T>
-void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {
+void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplits(const Tree* tree) {
   // use local data to find local best splits
   std::vector<int8_t> is_feature_used(this->num_features_, 0);
 #pragma omp parallel for schedule(static)
@@ -293,13 +303,14 @@ void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {
       this->smaller_leaf_splits_->num_data_in_leaf(),
       this->smaller_leaf_histogram_array_[feature_index].RawData());
 
-    this->smaller_leaf_histogram_array_[feature_index].FindBestThreshold(
-      this->smaller_leaf_splits_->sum_gradients(),
-      this->smaller_leaf_splits_->sum_hessians(),
-      this->smaller_leaf_splits_->num_data_in_leaf(),
-      this->smaller_leaf_splits_->min_constraint(),
-      this->smaller_leaf_splits_->max_constraint(),
-      &smaller_bestsplit_per_features[feature_index]);
+    // FIXME Fill the vectors with the actual constraints and thresholds
+    SplittingConstraints *constraints = nullptr;
+    this->smaller_leaf_histogram_array_[feature_index]
+        .FindBestThreshold(this->smaller_leaf_splits_->sum_gradients(),
+                           this->smaller_leaf_splits_->sum_hessians(),
+                           this->smaller_leaf_splits_->num_data_in_leaf(),
+                           &smaller_bestsplit_per_features[feature_index],
+                           constraints);
     smaller_bestsplit_per_features[feature_index].feature = real_feature_index;
     // only has root leaf
     if (this->larger_leaf_splits_ == nullptr || this->larger_leaf_splits_->LeafIndex() < 0) { continue; }
@@ -312,13 +323,13 @@ void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {
       this->larger_leaf_histogram_array_[feature_index].RawData());
     }
     // find best threshold for larger child
-    this->larger_leaf_histogram_array_[feature_index].FindBestThreshold(
-      this->larger_leaf_splits_->sum_gradients(),
-      this->larger_leaf_splits_->sum_hessians(),
-      this->larger_leaf_splits_->num_data_in_leaf(),
-      this->larger_leaf_splits_->min_constraint(),
-      this->larger_leaf_splits_->max_constraint(),
-      &larger_bestsplit_per_features[feature_index]);
+    // FIXME Fill the vectors with the actual constraints and thresholds
+    this->larger_leaf_histogram_array_[feature_index]
+        .FindBestThreshold(this->larger_leaf_splits_->sum_gradients(),
+                           this->larger_leaf_splits_->sum_hessians(),
+                           this->larger_leaf_splits_->num_data_in_leaf(),
+                           &larger_bestsplit_per_features[feature_index],
+                           constraints);
     larger_bestsplit_per_features[feature_index].feature = real_feature_index;
     OMP_LOOP_EX_END();
   }
@@ -370,11 +381,12 @@ void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {
   Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(HistogramBinEntry), block_start_.data(),
                          block_len_.data(), output_buffer_.data(), static_cast<comm_size_t>(output_buffer_.size()), &HistogramBinEntry::SumReducer);
-  this->FindBestSplitsFromHistograms(is_feature_used, false);
+  this->FindBestSplitsFromHistograms(is_feature_used, false, tree);
 }
 
 template <typename TREELEARNER_T>
-void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const std::vector<int8_t>&, bool) {
+void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(
+    const std::vector<int8_t> &, bool, const Tree *tree) {
   std::vector<SplitInfo> smaller_bests_per_thread(this->num_threads_);
   std::vector<SplitInfo> larger_best_per_thread(this->num_threads_);
   std::vector<int8_t> smaller_node_used_features(this->num_features_, 1);
@@ -403,13 +415,13 @@ void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(cons
       smaller_leaf_histogram_array_global_[feature_index].RawData());
 
     // find best threshold
+    // FIXME Fill the vectors with the actual constraints and thresholds
+    SplittingConstraints *constraints = nullptr;
     smaller_leaf_histogram_array_global_[feature_index].FindBestThreshold(
-      smaller_leaf_splits_global_->sum_gradients(),
-      smaller_leaf_splits_global_->sum_hessians(),
-      GetGlobalDataCountInLeaf(smaller_leaf_splits_global_->LeafIndex()),
-      smaller_leaf_splits_global_->min_constraint(),
-      smaller_leaf_splits_global_->max_constraint(),
-      &smaller_split);
+        smaller_leaf_splits_global_->sum_gradients(),
+        smaller_leaf_splits_global_->sum_hessians(),
+        GetGlobalDataCountInLeaf(smaller_leaf_splits_global_->LeafIndex()),
+        &smaller_split, constraints);
     smaller_split.feature = real_feature_index;
     if (smaller_split > smaller_bests_per_thread[tid] && smaller_node_used_features[feature_index]) {
       smaller_bests_per_thread[tid] = smaller_split;
@@ -427,13 +439,14 @@ void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(cons
       larger_leaf_histogram_array_global_[feature_index].RawData());
 
     // find best threshold
+    // FIXME Fill the vectors with the actual constraints and thresholds
+    SplittingConstraints *constraints = nullptr;
+
     larger_leaf_histogram_array_global_[feature_index].FindBestThreshold(
-      larger_leaf_splits_global_->sum_gradients(),
-      larger_leaf_splits_global_->sum_hessians(),
-      GetGlobalDataCountInLeaf(larger_leaf_splits_global_->LeafIndex()),
-      larger_leaf_splits_global_->min_constraint(),
-      larger_leaf_splits_global_->max_constraint(),
-      &larger_split);
+        larger_leaf_splits_global_->sum_gradients(),
+        larger_leaf_splits_global_->sum_hessians(),
+        GetGlobalDataCountInLeaf(larger_leaf_splits_global_->LeafIndex()),
+        &larger_split, constraints);
     larger_split.feature = real_feature_index;
     if (larger_split > larger_best_per_thread[tid] && larger_node_used_features[feature_index]) {
       larger_best_per_thread[tid] = larger_split;
@@ -477,39 +490,25 @@ void VotingParallelTreeLearner<TREELEARNER_T>::Split(Tree* tree, int best_Leaf,
   // set the global number of data for leaves
   global_data_count_in_leaf_[*left_leaf] = best_split_info.left_count;
   global_data_count_in_leaf_[*right_leaf] = best_split_info.right_count;
-  auto p_left = smaller_leaf_splits_global_.get();
-  auto p_right = larger_leaf_splits_global_.get();
   // init the global sumup info
+  int depth = tree->leaf_depth(*left_leaf);
+#ifdef DEBUG
+  CHECK(depth == tree->leaf_depth(*right_leaf));
+#endif
   if (best_split_info.left_count < best_split_info.right_count) {
     smaller_leaf_splits_global_->Init(*left_leaf, this->data_partition_.get(),
-                                      best_split_info.left_sum_gradient,
-                                      best_split_info.left_sum_hessian);
+                                      best_split_info.left_sum_gradient,
+                                      best_split_info.left_sum_hessian, depth);
     larger_leaf_splits_global_->Init(*right_leaf, this->data_partition_.get(),
-                                     best_split_info.right_sum_gradient,
-                                     best_split_info.right_sum_hessian);
+                                     best_split_info.right_sum_gradient,
+                                     best_split_info.right_sum_hessian, depth);
   } else {
     smaller_leaf_splits_global_->Init(*right_leaf, this->data_partition_.get(),
-                                      best_split_info.right_sum_gradient,
-                                      best_split_info.right_sum_hessian);
+                                      best_split_info.right_sum_gradient,
+                                      best_split_info.right_sum_hessian, depth);
     larger_leaf_splits_global_->Init(*left_leaf, this->data_partition_.get(),
-                                     best_split_info.left_sum_gradient,
-                                     best_split_info.left_sum_hessian);
-    p_left = larger_leaf_splits_global_.get();
-    p_right = smaller_leaf_splits_global_.get();
-  }
-  const int inner_feature_index = this->train_data_->InnerFeatureIndex(best_split_info.feature);
-  bool is_numerical_split = this->train_data_->FeatureBinMapper(inner_feature_index)->bin_type() == BinType::NumericalBin;
-  p_left->SetValueConstraint(best_split_info.min_constraint, best_split_info.max_constraint);
-  p_right->SetValueConstraint(best_split_info.min_constraint, best_split_info.max_constraint);
-  if (is_numerical_split) {
-    double mid = (best_split_info.left_output + best_split_info.right_output) / 2.0f;
-    if (best_split_info.monotone_type < 0) {
-      p_left->SetValueConstraint(mid, best_split_info.max_constraint);
-      p_right->SetValueConstraint(best_split_info.min_constraint, mid);
-    } else if (best_split_info.monotone_type > 0) {
-      p_left->SetValueConstraint(best_split_info.min_constraint, mid);
-      p_right->SetValueConstraint(mid, best_split_info.max_constraint);
-    }
+                                     best_split_info.left_sum_gradient,
+                                     best_split_info.left_sum_hessian, depth);
   }
 }
 
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index 51c99494e68e..e9e47145bc6a 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -856,45 +856,200 @@ def test_init_with_subset(self):
         self.assertEqual(subset_data_3.get_data(), "lgb_train_data.bin")
         self.assertEqual(subset_data_4.get_data(), "lgb_train_data.bin")
 
-    def test_monotone_constraint(self):
+    def test_monotone_constraints_categorical_feature(self):
         def is_increasing(y):
             return (np.diff(y) >= 0.0).all()
 
         def is_decreasing(y):
             return (np.diff(y) <= 0.0).all()
 
-        def is_correctly_constrained(learner):
-            n = 200
+        def is_correctly_constrained(learner, number_categories):
+            n = 1000
+            iterations = 10
             variable_x = np.linspace(0, 1, n).reshape((n, 1))
             fixed_xs_values = np.linspace(0, 1, n)
-            for i in range(n):
+            for i in range(iterations):
                 fixed_x = fixed_xs_values[i] * np.ones((n, 1))
-                monotonically_increasing_x = np.column_stack((variable_x, fixed_x))
+                monotonically_increasing_x = np.column_stack((variable_x, fixed_x,
+                                                              (fixed_x * number_categories).astype(int)))
                 monotonically_increasing_y = learner.predict(monotonically_increasing_x)
-                monotonically_decreasing_x = np.column_stack((fixed_x, variable_x))
+                monotonically_decreasing_x = np.column_stack((fixed_x, variable_x,
+                                                              (fixed_x * number_categories).astype(int)))
                 monotonically_decreasing_y = learner.predict(monotonically_decreasing_x)
-                if not (is_increasing(monotonically_increasing_y) and is_decreasing(monotonically_decreasing_y)):
+                if not (is_increasing(monotonically_increasing_y) and
+                        is_decreasing(monotonically_decreasing_y)):
                     return False
             return True
 
+        number_of_trials = 10
+        for _ in range(number_of_trials):
+            for monotone_precise_mode in [False, True]:
+                number_categories = 2 ** (np.random.randint(1, 12))
+                number_of_dpoints = 3000
+                x1_positively_correlated_with_y = np.random.random(size=number_of_dpoints)
+                x2_negatively_correlated_with_y = np.random.random(size=number_of_dpoints)
+                x3_categorical = (np.random.random(size=number_of_dpoints) * number_categories).astype(int)
+                x = np.column_stack(
+                    (x1_positively_correlated_with_y, x2_negatively_correlated_with_y, x3_categorical))
+                zs = np.random.normal(loc=0.0, scale=0.01, size=number_of_dpoints)
+                scales = 10. * (np.random.random(6) + 0.5)
+                y = (scales[0] * x1_positively_correlated_with_y
+                     + np.sin(scales[1] * np.pi * x1_positively_correlated_with_y)
+                     - scales[2] * x2_negatively_correlated_with_y
+                     - np.cos(scales[3] * np.pi * x2_negatively_correlated_with_y)
+                     - scales[4] * x3_categorical
+                     - np.cos(scales[5] * np.pi * x3_categorical)
+                     + zs)
+                trainset = lgb.Dataset(x, label=y)
+                params = {
+                    'min_data': 20,
+                    'num_leaves': 10,
+                    "num_threads": 1,
+                    'monotone_constraints': '1,-1,0',
+                    "categorical_feature": [2],
+                    "monotone_precise_mode": monotone_precise_mode,
+                    "use_missing": False
+                }
+                constrained_model = lgb.train(params, trainset)
+                self.assertTrue(is_correctly_constrained(constrained_model, number_categories))
+
+    # test if categorical features and monotone features can both be in a dataset without causing issues
+    def generate_trainset_for_monotone_constraints_tests(self):
         number_of_dpoints = 3000
         x1_positively_correlated_with_y = np.random.random(size=number_of_dpoints)
         x2_negatively_correlated_with_y = np.random.random(size=number_of_dpoints)
-        x = np.column_stack((x1_positively_correlated_with_y, x2_negatively_correlated_with_y))
+        x3_negatively_correlated_with_y = np.random.random(size=number_of_dpoints)
+        x = np.column_stack(
+            (x1_positively_correlated_with_y, x2_negatively_correlated_with_y, x3_negatively_correlated_with_y))
         zs = np.random.normal(loc=0.0, scale=0.01, size=number_of_dpoints)
-        y = (5 * x1_positively_correlated_with_y
-             + np.sin(10 * np.pi * x1_positively_correlated_with_y)
-             - 5 * x2_negatively_correlated_with_y
-             - np.cos(10 * np.pi * x2_negatively_correlated_with_y)
+        scales = 10. * (np.random.random(6) + 0.5)
+        y = (scales[0] * x1_positively_correlated_with_y
+             + np.sin(scales[1] * np.pi * x1_positively_correlated_with_y)
+             - scales[2] * x2_negatively_correlated_with_y
+             - np.cos(scales[3] * np.pi * x2_negatively_correlated_with_y)
+             - scales[4] * x3_negatively_correlated_with_y
+             - np.cos(scales[5] * np.pi * x3_negatively_correlated_with_y)
              + zs)
         trainset = lgb.Dataset(x, label=y)
-        params = {
-            'min_data': 20,
-            'num_leaves': 20,
-            'monotone_constraints': '1,-1'
-        }
-        constrained_model = lgb.train(params, trainset)
-        self.assertTrue(is_correctly_constrained(constrained_model))
+        return trainset
+
+    def test_monotone_constraints(self):
+        def is_increasing(y):
+            return (np.diff(y) >= 0.0).all()
+
+        def is_decreasing(y):
+            return (np.diff(y) <= 0.0).all()
+
+        def is_non_monotone(y):
+            return (np.diff(y) < 0.0).any() and (np.diff(y) > 0.0).any()
+
+        def is_correctly_constrained(learner):
+            iterations = 10
+            n = 1000
+            variable_x = np.linspace(0, 1, n).reshape((n, 1))
+            fixed_xs_values = np.linspace(0, 1, n)
+            for i in range(iterations):
+                fixed_x = fixed_xs_values[i] * np.ones((n, 1))
+                monotonically_increasing_x = np.column_stack((variable_x, fixed_x, fixed_x))
+                monotonically_increasing_y = learner.predict(monotonically_increasing_x)
+                monotonically_decreasing_x = np.column_stack((fixed_x, variable_x, fixed_x))
+                monotonically_decreasing_y = learner.predict(monotonically_decreasing_x)
+                non_monotone_x = np.column_stack((fixed_x, fixed_x, variable_x))
+                non_monotone_y = learner.predict(non_monotone_x)
+                if not (is_increasing(monotonically_increasing_y) and
+                        is_decreasing(monotonically_decreasing_y) and
+                        is_non_monotone(non_monotone_y)):
+                    return False
+            return True
+
+        number_of_trials = 10
+        for _ in range(number_of_trials):
+            for monotone_precise_mode in [False, True]:
+                trainset = self.generate_trainset_for_monotone_constraints_tests()
+                params = {
+                    'min_data': 20,
+                    'num_leaves': 20,
+                    'monotone_constraints': '1,-1,0',
+                    "monotone_precise_mode": monotone_precise_mode,
+                    "use_missing": False
+                }
+                constrained_model = lgb.train(params, trainset)
+                self.assertTrue(is_correctly_constrained(constrained_model))
+
+    # test if the monotone penalty is working
+    def test_monotone_penalty(self):
+        def are_first_splits_non_monotone(tree, n, monotone_constraints):
+            if n <= 0:
+                return True
+            if "leaf_value" in tree:
+                return True
+            if monotone_constraints[tree["split_feature"]] != 0:
+                return False
+            return (are_first_splits_non_monotone(tree["left_child"], n - 1, monotone_constraints) and
+                    are_first_splits_non_monotone(tree["right_child"], n - 1, monotone_constraints))
+
+        def are_there_monotone_splits(tree, monotone_constraints):
+            if "leaf_value" in tree:
+                return False
+            if monotone_constraints[tree["split_feature"]] != 0:
+                return True
+            return (are_there_monotone_splits(tree["left_child"], monotone_constraints) or
+                    are_there_monotone_splits(tree["right_child"], monotone_constraints))
+
+        number_of_trials = 10
+        for _ in range(number_of_trials):
+            for monotone_precise_mode in [False, True]:
+                penalization_parameter = np.random.random() * 3
+                trainset = self.generate_trainset_for_monotone_constraints_tests()
+                monotone_constraints = [1, -1, 0]
+                params = {
+                    'min_data': 20,
+                    'num_leaves': 100,
+                    'monotone_constraints': monotone_constraints,
+                    'monotone_penalty': penalization_parameter,
+                    "monotone_precise_mode": monotone_precise_mode,
+                    "use_missing": False
+                }
+                constrained_model = lgb.train(params, trainset, 10)
+                dumped_model = constrained_model.dump_model()["tree_info"]
+                for tree in dumped_model:
+                    self.assertTrue(are_first_splits_non_monotone(tree["tree_structure"], int(penalization_parameter),
+                                                                  monotone_constraints))
+                    self.assertTrue(are_there_monotone_splits(tree["tree_structure"], monotone_constraints))
+
+    # test if a penalty as high as the depth indeed prohibits all monotone splits
+    def test_monotone_penalty_max(self):
+        number_of_trials = 10
+        for _ in range(number_of_trials):
+            for monotone_precise_mode in [False, True]:
+                max_depth = 5
+                penalization_parameter = max_depth - 1e-10
+                trainset_constrained_model = self.generate_trainset_for_monotone_constraints_tests()
+                x = trainset_constrained_model.data
+                y = trainset_constrained_model.label
+                x3_negatively_correlated_with_y = x[:, 2]
+                monotone_constraints = [1, -1, 0]
+                params_constrained_model = {
+                    'min_data': 20,
+                    'num_leaves': 20,
+                    'monotone_constraints': monotone_constraints,
+                    'monotone_penalty': penalization_parameter,
+                    "max_depth": max_depth,
+                    "monotone_precise_mode": monotone_precise_mode,
+                    "use_missing": False
+                }
+                constrained_model = lgb.train(params_constrained_model, trainset_constrained_model, 10)
+
+                trainset_unconstrained_model = lgb.Dataset(x3_negatively_correlated_with_y.reshape(-1, 1), label=y)
+                params_unconstrained_model = {
+                    'min_data': 20,
+                    'num_leaves': 20,
+                    "max_depth": max_depth
+                }
+                unconstrained_model = lgb.train(params_unconstrained_model, trainset_unconstrained_model, 10)
+
+                self.assertTrue((constrained_model.predict(x) ==
+                                 unconstrained_model.predict(x3_negatively_correlated_with_y.reshape(-1, 1))).all())
 
     def test_max_bin_by_feature(self):
         col1 = np.arange(0, 100)[:, np.newaxis]