diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in index 471bfc948cc3..ba9ef054bfab 100644 --- a/R-package/src/Makevars.in +++ b/R-package/src/Makevars.in @@ -48,6 +48,7 @@ OBJECTS = \ treelearner/data_parallel_tree_learner.o \ treelearner/feature_parallel_tree_learner.o \ treelearner/gpu_tree_learner.o \ + treelearner/gradient_discretizer.o \ treelearner/linear_tree_learner.o \ treelearner/serial_tree_learner.o \ treelearner/tree_learner.o \ diff --git a/R-package/src/Makevars.win.in b/R-package/src/Makevars.win.in index 8d39317b4a3a..fe4d31eb7746 100644 --- a/R-package/src/Makevars.win.in +++ b/R-package/src/Makevars.win.in @@ -49,6 +49,7 @@ OBJECTS = \ treelearner/data_parallel_tree_learner.o \ treelearner/feature_parallel_tree_learner.o \ treelearner/gpu_tree_learner.o \ + treelearner/gradient_discretizer.o \ treelearner/linear_tree_learner.o \ treelearner/serial_tree_learner.o \ treelearner/tree_learner.o \ diff --git a/docs/Parameters.rst b/docs/Parameters.rst index abbd8cb14e14..aee1cc4e7f84 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -658,6 +658,38 @@ Learning Control Parameters - **Note**: can be used only in CLI version +- ``use_quantized_grad`` :raw-html:`🔗︎`, default = ``false``, type = bool + + - whether to use gradient quantization when training + + - enabling this will discretize (quantize) the gradients and hessians into ``num_grad_quant_bins`` bins + + - with quantized training, most of the arithmetic in the training process will use integer operations + + - gradient quantization can accelerate training, with little accuracy drop in most cases + + - **Note**: can be used only with ``device_type = cpu`` + +- ``num_grad_quant_bins`` :raw-html:`🔗︎`, default = ``4``, type = int + + - number of bins used to quantize gradients and hessians + + - with more bins, quantized training will be closer to full precision training + + - **Note**: can be used only with ``device_type = cpu`` + +- ``quant_train_renew_leaf`` :raw-html:`🔗︎`, default = ``false``, type = bool + + - whether to renew the leaf values with the original gradients when using quantized training + + - renewing is very helpful for achieving good accuracy with quantized training on ranking objectives + + - **Note**: can be used only with ``device_type = cpu`` + +- ``stochastic_rounding`` :raw-html:`🔗︎`, default = ``true``, type = bool + + - whether to use stochastic rounding in gradient quantization + IO Parameters ------------- diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index a6199bbbcbd2..ffb8f2844843 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -30,11 +30,14 @@ enum MissingType { }; typedef double hist_t; +typedef int32_t int_hist_t; typedef uint64_t hist_cnt_t; // check at compile time static_assert(sizeof(hist_t) == sizeof(hist_cnt_t), "Histogram entry size is not correct"); const size_t kHistEntrySize = 2 * sizeof(hist_t); +const size_t kInt32HistEntrySize = 2 * sizeof(int_hist_t); +const size_t kInt16HistEntrySize = 2 * sizeof(int16_t); const int kHistOffset = 2; const double kSparseThreshold = 0.7; @@ -56,6 +59,28 @@ inline static void HistogramSumReducer(const char* src, char* dst, int type_size } } +inline static void Int32HistogramSumReducer(const char* src, char* dst, int type_size, comm_size_t len) { + const int64_t* src_ptr = reinterpret_cast(src); + int64_t* dst_ptr = reinterpret_cast(dst); + const comm_size_t steps = (len + (type_size * 2) - 1) / (type_size * 2); + const int num_threads = OMP_NUM_THREADS(); + #pragma omp parallel for schedule(static) 
num_threads(num_threads) + for (comm_size_t i = 0; i < steps; ++i) { + dst_ptr[i] += src_ptr[i]; + } +} + +inline static void Int16HistogramSumReducer(const char* src, char* dst, int type_size, comm_size_t len) { + const int32_t* src_ptr = reinterpret_cast(src); + int32_t* dst_ptr = reinterpret_cast(dst); + const comm_size_t steps = (len + (type_size * 2) - 1) / (type_size * 2); + const int num_threads = OMP_NUM_THREADS(); + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (comm_size_t i = 0; i < steps; ++i) { + dst_ptr[i] += src_ptr[i]; + } +} + /*! \brief This class used to convert feature values into bin, * and store some meta information for bin*/ class BinMapper { @@ -332,6 +357,33 @@ class Bin { const score_t* ordered_gradients, const score_t* ordered_hessians, hist_t* out) const = 0; + virtual void ConstructHistogramInt8( + const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const = 0; + + virtual void ConstructHistogramInt8(data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const = 0; + + virtual void ConstructHistogramInt16( + const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const = 0; + + virtual void ConstructHistogramInt16(data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const = 0; + + virtual void ConstructHistogramInt32( + const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const = 0; + + virtual void ConstructHistogramInt32(data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const = 0; + /*! 
* \brief Construct histogram of this feature, * Note: We use ordered_gradients and ordered_hessians to improve cache hit chance @@ -351,6 +403,24 @@ class Bin { virtual void ConstructHistogram(data_size_t start, data_size_t end, const score_t* ordered_gradients, hist_t* out) const = 0; + virtual void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, hist_t* out) const = 0; + + virtual void ConstructHistogramInt8(data_size_t start, data_size_t end, + const score_t* ordered_gradients, hist_t* out) const = 0; + + virtual void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, hist_t* out) const = 0; + + virtual void ConstructHistogramInt16(data_size_t start, data_size_t end, + const score_t* ordered_gradients, hist_t* out) const = 0; + + virtual void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, hist_t* out) const = 0; + + virtual void ConstructHistogramInt32(data_size_t start, data_size_t end, + const score_t* ordered_gradients, hist_t* out) const = 0; + virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, uint32_t most_freq_bin, MissingType missing_type, bool default_left, @@ -464,6 +534,57 @@ class MultiValBin { const score_t* ordered_hessians, hist_t* out) const = 0; + virtual void ConstructHistogramInt32(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* gradients, + const score_t* hessians, + hist_t* out) const = 0; + + virtual void ConstructHistogramInt32(data_size_t start, data_size_t end, + const score_t* gradients, + const score_t* hessians, + hist_t* out) const = 0; + + virtual void ConstructHistogramOrderedInt32(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* ordered_hessians, + hist_t* out) const = 0; + + virtual void ConstructHistogramInt16(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* gradients, + const score_t* hessians, + hist_t* out) const = 0; + + virtual void ConstructHistogramInt16(data_size_t start, data_size_t end, + const score_t* gradients, + const score_t* hessians, + hist_t* out) const = 0; + + virtual void ConstructHistogramOrderedInt16(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* ordered_hessians, + hist_t* out) const = 0; + + virtual void ConstructHistogramInt8(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* gradients, + const score_t* hessians, + hist_t* out) const = 0; + + virtual void ConstructHistogramInt8(data_size_t start, data_size_t end, + const score_t* gradients, + const score_t* hessians, + hist_t* out) const = 0; + + virtual void ConstructHistogramOrderedInt8(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* ordered_hessians, + hist_t* out) const = 0; + virtual void FinishLoad() = 0; virtual bool IsSparse() = 0; diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index cbb2735baeb2..89318a7af246 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -592,6 +592,30 @@ struct Config { // desc = **Note**: can be used only in CLI version int snapshot_freq = -1; + // [no-save] + // desc = whether to use gradient 
quantization when training + // desc = enabling this will discretize (quantize) the gradients and hessians into ``num_grad_quant_bins`` bins + // desc = with quantized training, most of the arithmetic in the training process will use integer operations + // desc = gradient quantization can accelerate training, with little accuracy drop in most cases + // desc = **Note**: can be used only with ``device_type = cpu`` + bool use_quantized_grad = false; + + // [no-save] + // desc = number of bins used to quantize gradients and hessians + // desc = with more bins, quantized training will be closer to full precision training + // desc = **Note**: can be used only with ``device_type = cpu`` + int num_grad_quant_bins = 4; + + // [no-save] + // desc = whether to renew the leaf values with the original gradients when using quantized training + // desc = renewing is very helpful for achieving good accuracy with quantized training on ranking objectives + // desc = **Note**: can be used only with ``device_type = cpu`` + bool quant_train_renew_leaf = false; + + // [no-save] + // desc = whether to use stochastic rounding in gradient quantization + bool stochastic_rounding = true; + #ifndef __NVCC__ #pragma endregion diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 79c4ed196b09..825c5c6ebcf8 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -598,10 +598,11 @@ class Dataset { MultiValBin* GetMultiBinFromAllFeatures(const std::vector& offsets) const; + template TrainingShareStates* GetShareStates( score_t* gradients, score_t* hessians, const std::vector& is_feature_used, bool is_constant_hessian, - bool force_col_wise, bool force_row_wise) const; + bool force_col_wise, bool force_row_wise, const int num_grad_quant_bins) const; LIGHTGBM_EXPORT void FinishLoad(); @@ -636,7 +637,7 @@ class Dataset { void InitTrain(const std::vector& is_feature_used, TrainingShareStates* share_state) const; - template + template void ConstructHistogramsInner(const std::vector& is_feature_used, const data_size_t* data_indices, data_size_t num_data, const score_t* gradients, @@ -646,7 +647,7 @@ class Dataset { TrainingShareStates* share_state, hist_t* hist_data) const; - template + template void ConstructHistogramsMultiVal(const data_size_t* data_indices, data_size_t num_data, const score_t* gradients, @@ -654,6 +655,7 @@ class Dataset { TrainingShareStates* share_state, hist_t* hist_data) const; + template inline void ConstructHistograms( const std::vector& is_feature_used, const data_size_t* data_indices, data_size_t num_data, @@ -666,21 +668,21 @@ class Dataset { bool use_indices = data_indices != nullptr && (num_data < num_data_); if (share_state->is_constant_hessian) { if (use_indices) { - ConstructHistogramsInner( + ConstructHistogramsInner( is_feature_used, data_indices, num_data, gradients, hessians, ordered_gradients, ordered_hessians, share_state, hist_data); } else { - ConstructHistogramsInner( + ConstructHistogramsInner( is_feature_used, data_indices, num_data, gradients, hessians, ordered_gradients, ordered_hessians, share_state, hist_data); } } else { if (use_indices) { - ConstructHistogramsInner( + ConstructHistogramsInner( is_feature_used, data_indices, num_data, gradients, hessians, ordered_gradients, ordered_hessians, share_state, hist_data); } else { - ConstructHistogramsInner( + ConstructHistogramsInner( is_feature_used, data_indices, num_data, gradients, hessians, ordered_gradients, ordered_hessians, share_state, hist_data); } @@ -689,6 +691,9 @@ class Dataset { void FixHistogram(int 
feature_idx, double sum_gradient, double sum_hessian, hist_t* data) const; + template + void FixHistogramInt(int feature_idx, int64_t sum_gradient_and_hessian, hist_t* data) const; + inline data_size_t Split(int feature, const uint32_t* threshold, int num_threshold, bool default_left, const data_size_t* data_indices, diff --git a/include/LightGBM/train_share_states.h b/include/LightGBM/train_share_states.h index 8c50734695b2..f102668edf70 100644 --- a/include/LightGBM/train_share_states.h +++ b/include/LightGBM/train_share_states.h @@ -19,7 +19,7 @@ namespace LightGBM { class MultiValBinWrapper { public: MultiValBinWrapper(MultiValBin* bin, data_size_t num_data, - const std::vector& feature_groups_contained); + const std::vector& feature_groups_contained, const int num_grad_quant_bins); bool IsSparse() { if (multi_val_bin_ != nullptr) { @@ -34,15 +34,17 @@ class MultiValBinWrapper { const data_size_t* bagging_use_indices, data_size_t bagging_indices_cnt); + template void HistMove(const std::vector>& hist_buf); + template void HistMerge(std::vector>* hist_buf); void ResizeHistBuf(std::vector>* hist_buf, MultiValBin* sub_multi_val_bin, hist_t* origin_hist_data); - template + template void ConstructHistograms(const data_size_t* data_indices, data_size_t num_data, const score_t* gradients, @@ -59,55 +61,145 @@ class MultiValBinWrapper { Threading::BlockInfo(num_threads_, num_data, min_block_size_, &n_data_block_, &data_block_size_); ResizeHistBuf(hist_buf, cur_multi_val_bin, origin_hist_data); + const int inner_hist_bits = (data_block_size_ * num_grad_quant_bins_ < 256 && HIST_BITS == 16) ? 8 : HIST_BITS; OMP_INIT_EX(); #pragma omp parallel for schedule(static) num_threads(num_threads_) for (int block_id = 0; block_id < n_data_block_; ++block_id) { OMP_LOOP_EX_BEGIN(); data_size_t start = block_id * data_block_size_; data_size_t end = std::min(start + data_block_size_, num_data); - ConstructHistogramsForBlock( - cur_multi_val_bin, start, end, data_indices, gradients, hessians, - block_id, hist_buf); + if (inner_hist_bits == 8) { + ConstructHistogramsForBlock( + cur_multi_val_bin, start, end, data_indices, gradients, hessians, + block_id, hist_buf); + } else { + ConstructHistogramsForBlock( + cur_multi_val_bin, start, end, data_indices, gradients, hessians, + block_id, hist_buf); + } OMP_LOOP_EX_END(); } OMP_THROW_EX(); global_timer.Stop("Dataset::sparse_bin_histogram"); global_timer.Start("Dataset::sparse_bin_histogram_merge"); - HistMerge(hist_buf); + if (inner_hist_bits == 8) { + HistMerge(hist_buf); + } else { + HistMerge(hist_buf); + } global_timer.Stop("Dataset::sparse_bin_histogram_merge"); global_timer.Start("Dataset::sparse_bin_histogram_move"); - HistMove(*hist_buf); + if (inner_hist_bits == 8) { + HistMove(*hist_buf); + } else { + HistMove(*hist_buf); + } global_timer.Stop("Dataset::sparse_bin_histogram_move"); } } - template + template void ConstructHistogramsForBlock(const MultiValBin* sub_multi_val_bin, data_size_t start, data_size_t end, const data_size_t* data_indices, const score_t* gradients, const score_t* hessians, int block_id, std::vector>* hist_buf) { - hist_t* data_ptr = origin_hist_data_; - if (block_id == 0) { - if (is_use_subcol_) { - data_ptr = hist_buf->data() + hist_buf->size() - 2 * static_cast(num_bin_aligned_); + if (USE_QUANT_GRAD) { + if (HIST_BITS == 8) { + int8_t* hist_buf_ptr = reinterpret_cast(hist_buf->data()); + int8_t* data_ptr = hist_buf_ptr + + static_cast(num_bin_aligned_) * block_id * 2; + std::memset(reinterpret_cast(data_ptr), 0, num_bin_ * 
kInt8HistBufferEntrySize); + if (USE_INDICES) { + if (ORDERED) { + sub_multi_val_bin->ConstructHistogramOrderedInt8(data_indices, start, end, + gradients, hessians, + reinterpret_cast(data_ptr)); + } else { + sub_multi_val_bin->ConstructHistogramInt8(data_indices, start, end, gradients, + hessians, + reinterpret_cast(data_ptr)); + } + } else { + sub_multi_val_bin->ConstructHistogramInt8(start, end, gradients, hessians, + reinterpret_cast(data_ptr)); + } + } else if (HIST_BITS == 16) { + int16_t* data_ptr = reinterpret_cast(origin_hist_data_); + int16_t* hist_buf_ptr = reinterpret_cast(hist_buf->data()); + if (block_id == 0) { + if (is_use_subcol_) { + data_ptr = hist_buf_ptr + hist_buf->size() - 2 * static_cast(num_bin_aligned_); + } + } else { + data_ptr = hist_buf_ptr + + static_cast(num_bin_aligned_) * (block_id - 1) * 2; + } + std::memset(reinterpret_cast(data_ptr), 0, num_bin_ * kInt16HistBufferEntrySize); + if (USE_INDICES) { + if (ORDERED) { + sub_multi_val_bin->ConstructHistogramOrderedInt16(data_indices, start, end, + gradients, hessians, + reinterpret_cast(data_ptr)); + } else { + sub_multi_val_bin->ConstructHistogramInt16(data_indices, start, end, gradients, + hessians, + reinterpret_cast(data_ptr)); + } + } else { + sub_multi_val_bin->ConstructHistogramInt16(start, end, gradients, hessians, + reinterpret_cast(data_ptr)); + } + } else { + int32_t* data_ptr = reinterpret_cast(origin_hist_data_); + int32_t* hist_buf_ptr = reinterpret_cast(hist_buf->data()); + if (block_id == 0) { + if (is_use_subcol_) { + data_ptr = hist_buf_ptr + hist_buf->size() - 2 * static_cast(num_bin_aligned_); + } + } else { + data_ptr = hist_buf_ptr + + static_cast(num_bin_aligned_) * (block_id - 1) * 2; + } + std::memset(reinterpret_cast(data_ptr), 0, num_bin_ * kInt32HistBufferEntrySize); + if (USE_INDICES) { + if (ORDERED) { + sub_multi_val_bin->ConstructHistogramOrderedInt32(data_indices, start, end, + gradients, hessians, + reinterpret_cast(data_ptr)); + } else { + sub_multi_val_bin->ConstructHistogramInt32(data_indices, start, end, gradients, + hessians, + reinterpret_cast(data_ptr)); + } + } else { + sub_multi_val_bin->ConstructHistogramInt32(start, end, gradients, hessians, + reinterpret_cast(data_ptr)); + } } } else { - data_ptr = hist_buf->data() + - static_cast(num_bin_aligned_) * (block_id - 1) * 2; - } - std::memset(reinterpret_cast(data_ptr), 0, num_bin_ * kHistBufferEntrySize); - if (USE_INDICES) { - if (ORDERED) { - sub_multi_val_bin->ConstructHistogramOrdered(data_indices, start, end, - gradients, hessians, data_ptr); + hist_t* data_ptr = origin_hist_data_; + if (block_id == 0) { + if (is_use_subcol_) { + data_ptr = hist_buf->data() + hist_buf->size() - 2 * static_cast(num_bin_aligned_); + } } else { - sub_multi_val_bin->ConstructHistogram(data_indices, start, end, gradients, - hessians, data_ptr); + data_ptr = hist_buf->data() + + static_cast(num_bin_aligned_) * (block_id - 1) * 2; + } + std::memset(reinterpret_cast(data_ptr), 0, num_bin_ * kHistBufferEntrySize); + if (USE_INDICES) { + if (ORDERED) { + sub_multi_val_bin->ConstructHistogramOrdered(data_indices, start, end, + gradients, hessians, data_ptr); + } else { + sub_multi_val_bin->ConstructHistogram(data_indices, start, end, gradients, + hessians, data_ptr); + } + } else { + sub_multi_val_bin->ConstructHistogram(start, end, gradients, hessians, + data_ptr); } - } else { - sub_multi_val_bin->ConstructHistogram(start, end, gradients, hessians, - data_ptr); } } @@ -162,10 +254,14 @@ class MultiValBinWrapper { int data_block_size_; int 
min_block_size_; int num_data_; + int num_grad_quant_bins_; hist_t* origin_hist_data_; const size_t kHistBufferEntrySize = 2 * sizeof(hist_t); + const size_t kInt32HistBufferEntrySize = 2 * sizeof(int32_t); + const size_t kInt16HistBufferEntrySize = 2 * sizeof(int16_t); + const size_t kInt8HistBufferEntrySize = 2 * sizeof(int8_t); }; struct TrainingShareStates { @@ -193,7 +289,7 @@ struct TrainingShareStates { void SetMultiValBin(MultiValBin* bin, data_size_t num_data, const std::vector>& feature_groups, - bool dense_only, bool sparse_only); + bool dense_only, bool sparse_only, const int num_grad_quant_bins); void CalcBinOffsets(const std::vector>& feature_groups, std::vector* offsets, bool is_col_wise); @@ -210,14 +306,14 @@ struct TrainingShareStates { } } - template + template void ConstructHistograms(const data_size_t* data_indices, data_size_t num_data, const score_t* gradients, const score_t* hessians, hist_t* hist_data) { if (multi_val_bin_wrapper_ != nullptr) { - multi_val_bin_wrapper_->ConstructHistograms( + multi_val_bin_wrapper_->ConstructHistograms( data_indices, num_data, gradients, hessians, &hist_buf_, hist_data); } } diff --git a/src/io/config.cpp b/src/io/config.cpp index 86b64a52d105..e8578046960a 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -378,6 +378,10 @@ void Config::CheckParamConflict() { if (deterministic) { Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic."); } + if (use_quantized_grad) { + Log::Warning("Quantized training is not supported by GPU tree learner. Switch to full precision training."); + use_quantized_grad = false; + } } else if (device_type == std::string("cuda")) { // force row-wise for cuda version force_col_wise = false; @@ -385,6 +389,10 @@ void Config::CheckParamConflict() { if (deterministic) { Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic."); } + if (use_quantized_grad) { + Log::Warning("Quantized training is not supported by CUDA tree learner. 
Switch to full precision training."); + use_quantized_grad = false; + } } // linear tree learner must be serial type and run on CPU device if (linear_tree) { diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index b1dbcc378a27..0906ba4b6439 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -251,6 +251,10 @@ const std::unordered_set& Config::parameter_set() { "output_model", "saved_feature_importance_type", "snapshot_freq", + "use_quantized_grad", + "num_grad_quant_bins", + "quant_train_renew_leaf", + "stochastic_rounding", "linear_tree", "max_bin", "max_bin_by_feature", @@ -493,6 +497,14 @@ void Config::GetMembersFromString(const std::unordered_map>& Config::paramet {"output_model", {"model_output", "model_out"}}, {"saved_feature_importance_type", {}}, {"snapshot_freq", {"save_period"}}, + {"use_quantized_grad", {}}, + {"num_grad_quant_bins", {}}, + {"quant_train_renew_leaf", {}}, + {"stochastic_rounding", {}}, {"linear_tree", {"linear_trees"}}, {"max_bin", {"max_bins"}}, {"max_bin_by_feature", {}}, @@ -966,6 +982,10 @@ const std::unordered_map& Config::ParameterTypes() { {"output_model", "string"}, {"saved_feature_importance_type", "int"}, {"snapshot_freq", "int"}, + {"use_quantized_grad", "bool"}, + {"num_grad_quant_bins", "int"}, + {"quant_train_renew_leaf", "bool"}, + {"stochastic_rounding", "bool"}, {"linear_tree", "bool"}, {"max_bin", "int"}, {"max_bin_by_feature", "vector"}, diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index a8f449d3f55b..5b23f01ec3a0 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -608,10 +608,12 @@ MultiValBin* Dataset::GetMultiBinFromAllFeatures(const std::vector& of return ret.release(); } +template TrainingShareStates* Dataset::GetShareStates( score_t* gradients, score_t* hessians, const std::vector& is_feature_used, bool is_constant_hessian, - bool force_col_wise, bool force_row_wise) const { + bool force_col_wise, bool force_row_wise, + const int num_grad_quant_bins) const { Common::FunctionTimer fun_timer("Dataset::TestMultiThreadingMethod", global_timer); if (force_col_wise && force_row_wise) { @@ -631,7 +633,7 @@ TrainingShareStates* Dataset::GetShareStates( share_state->CalcBinOffsets( feature_groups_, &offsets, true); share_state->SetMultiValBin(GetMultiBinFromSparseFeatures(offsets), - num_data_, feature_groups_, false, true); + num_data_, feature_groups_, false, true, num_grad_quant_bins); share_state->is_col_wise = true; share_state->is_constant_hessian = is_constant_hessian; return share_state; @@ -641,7 +643,7 @@ TrainingShareStates* Dataset::GetShareStates( share_state->CalcBinOffsets( feature_groups_, &offsets, false); share_state->SetMultiValBin(GetMultiBinFromAllFeatures(offsets), num_data_, - feature_groups_, false, false); + feature_groups_, false, false, num_grad_quant_bins); share_state->is_col_wise = false; share_state->is_constant_hessian = is_constant_hessian; return share_state; @@ -658,14 +660,14 @@ TrainingShareStates* Dataset::GetShareStates( std::vector col_wise_offsets; col_wise_state->CalcBinOffsets(feature_groups_, &col_wise_offsets, true); col_wise_state->SetMultiValBin(GetMultiBinFromSparseFeatures(col_wise_offsets), num_data_, - feature_groups_, false, true); + feature_groups_, false, true, num_grad_quant_bins); col_wise_init_time = std::chrono::steady_clock::now() - start_time; start_time = std::chrono::steady_clock::now(); std::vector row_wise_offsets; row_wise_state->CalcBinOffsets(feature_groups_, &row_wise_offsets, false); 
row_wise_state->SetMultiValBin(GetMultiBinFromAllFeatures(row_wise_offsets), num_data_, - feature_groups_, false, false); + feature_groups_, false, false, num_grad_quant_bins); row_wise_init_time = std::chrono::steady_clock::now() - start_time; uint64_t max_total_bin = std::max(row_wise_state->num_hist_total_bin(), @@ -685,12 +687,12 @@ TrainingShareStates* Dataset::GetShareStates( InitTrain(is_feature_used, row_wise_state.get()); std::chrono::duration col_wise_time, row_wise_time; start_time = std::chrono::steady_clock::now(); - ConstructHistograms(is_feature_used, nullptr, num_data_, gradients, + ConstructHistograms(is_feature_used, nullptr, num_data_, gradients, hessians, gradients, hessians, col_wise_state.get(), hist_data.data()); col_wise_time = std::chrono::steady_clock::now() - start_time; start_time = std::chrono::steady_clock::now(); - ConstructHistograms(is_feature_used, nullptr, num_data_, gradients, + ConstructHistograms(is_feature_used, nullptr, num_data_, gradients, hessians, gradients, hessians, row_wise_state.get(), hist_data.data()); row_wise_time = std::chrono::steady_clock::now() - start_time; @@ -721,6 +723,24 @@ TrainingShareStates* Dataset::GetShareStates( } } +template TrainingShareStates* Dataset::GetShareStates( + score_t* gradients, score_t* hessians, + const std::vector& is_feature_used, bool is_constant_hessian, + bool force_col_wise, bool force_row_wise, + const int num_grad_quant_bins) const; + +template TrainingShareStates* Dataset::GetShareStates( + score_t* gradients, score_t* hessians, + const std::vector& is_feature_used, bool is_constant_hessian, + bool force_col_wise, bool force_row_wise, + const int num_grad_quant_bins) const; + +template TrainingShareStates* Dataset::GetShareStates( + score_t* gradients, score_t* hessians, + const std::vector& is_feature_used, bool is_constant_hessian, + bool force_col_wise, bool force_row_wise, + const int num_grad_quant_bins) const; + void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) { feature_groups_.clear(); num_features_ = dataset->num_features_; @@ -1203,7 +1223,7 @@ void Dataset::InitTrain(const std::vector& is_feature_used, is_feature_used); } -template +template void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices, data_size_t num_data, const score_t* gradients, @@ -1212,18 +1232,18 @@ void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices, hist_t* hist_data) const { Common::FunctionTimer fun_time("Dataset::ConstructHistogramsMultiVal", global_timer); - share_state->ConstructHistograms( + share_state->ConstructHistograms( data_indices, num_data, gradients, hessians, hist_data); } -template +template void Dataset::ConstructHistogramsInner( const std::vector& is_feature_used, const data_size_t* data_indices, data_size_t num_data, const score_t* gradients, const score_t* hessians, score_t* ordered_gradients, score_t* ordered_hessians, TrainingShareStates* share_state, hist_t* hist_data) const { if (!share_state->is_col_wise) { - return ConstructHistogramsMultiVal( + return ConstructHistogramsMultiVal( data_indices, num_data, gradients, hessians, share_state, hist_data); } std::vector used_dense_group; @@ -1275,30 +1295,80 @@ void Dataset::ConstructHistogramsInner( for (int gi = 0; gi < num_used_dense_group; ++gi) { OMP_LOOP_EX_BEGIN(); int group = used_dense_group[gi]; - auto data_ptr = hist_data + group_bin_boundaries_[group] * 2; const int num_bin = feature_groups_[group]->num_total_bin_; - std::memset(reinterpret_cast(data_ptr), 0, - num_bin * 
kHistEntrySize); - if (USE_HESSIAN) { - if (USE_INDICES) { - feature_groups_[group]->bin_data_->ConstructHistogram( - data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess, - data_ptr); + if (USE_QUANT_GRAD) { + if (HIST_BITS == 16) { + auto data_ptr = reinterpret_cast(reinterpret_cast(hist_data) + group_bin_boundaries_[group]); + std::memset(reinterpret_cast(data_ptr), 0, + num_bin * kInt16HistEntrySize); + if (USE_HESSIAN) { + if (USE_INDICES) { + feature_groups_[group]->bin_data_->ConstructHistogramInt16( + data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess, + data_ptr); + } else { + feature_groups_[group]->bin_data_->ConstructHistogramInt16( + 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); + } + } else { + if (USE_INDICES) { + feature_groups_[group]->bin_data_->ConstructHistogramInt16( + data_indices, 0, num_data, ptr_ordered_grad, + data_ptr); + } else { + feature_groups_[group]->bin_data_->ConstructHistogramInt16( + 0, num_data, ptr_ordered_grad, data_ptr); + } + } } else { - feature_groups_[group]->bin_data_->ConstructHistogram( - 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); + auto data_ptr = hist_data + group_bin_boundaries_[group]; + std::memset(reinterpret_cast(data_ptr), 0, + num_bin * kInt32HistEntrySize); + if (USE_HESSIAN) { + if (USE_INDICES) { + feature_groups_[group]->bin_data_->ConstructHistogramInt32( + data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess, + data_ptr); + } else { + feature_groups_[group]->bin_data_->ConstructHistogramInt32( + 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); + } + } else { + if (USE_INDICES) { + feature_groups_[group]->bin_data_->ConstructHistogramInt32( + data_indices, 0, num_data, ptr_ordered_grad, + data_ptr); + } else { + feature_groups_[group]->bin_data_->ConstructHistogramInt32( + 0, num_data, ptr_ordered_grad, data_ptr); + } + } } } else { - if (USE_INDICES) { - feature_groups_[group]->bin_data_->ConstructHistogram( - data_indices, 0, num_data, ptr_ordered_grad, data_ptr); + auto data_ptr = hist_data + group_bin_boundaries_[group] * 2; + std::memset(reinterpret_cast(data_ptr), 0, + num_bin * kHistEntrySize); + if (USE_HESSIAN) { + if (USE_INDICES) { + feature_groups_[group]->bin_data_->ConstructHistogram( + data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess, + data_ptr); + } else { + feature_groups_[group]->bin_data_->ConstructHistogram( + 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); + } } else { - feature_groups_[group]->bin_data_->ConstructHistogram( - 0, num_data, ptr_ordered_grad, data_ptr); - } - auto cnt_dst = reinterpret_cast(data_ptr + 1); - for (int i = 0; i < num_bin * 2; i += 2) { - data_ptr[i + 1] = static_cast(cnt_dst[i]) * hessians[0]; + if (USE_INDICES) { + feature_groups_[group]->bin_data_->ConstructHistogram( + data_indices, 0, num_data, ptr_ordered_grad, data_ptr); + } else { + feature_groups_[group]->bin_data_->ConstructHistogram( + 0, num_data, ptr_ordered_grad, data_ptr); + } + auto cnt_dst = reinterpret_cast(data_ptr + 1); + for (int i = 0; i < num_bin * 2; i += 2) { + data_ptr[i + 1] = static_cast(cnt_dst[i]) * hessians[0]; + } } } OMP_LOOP_EX_END(); @@ -1307,43 +1377,78 @@ void Dataset::ConstructHistogramsInner( } global_timer.Stop("Dataset::dense_bin_histogram"); if (multi_val_groud_id >= 0) { - if (num_used_dense_group > 0) { - ConstructHistogramsMultiVal( - data_indices, num_data, ptr_ordered_grad, ptr_ordered_hess, - share_state, - hist_data + group_bin_boundaries_[multi_val_groud_id] * 2); + if (USE_QUANT_GRAD) 
{ + if (HIST_BITS == 32) { + int32_t* hist_data_ptr = reinterpret_cast(hist_data); + if (num_used_dense_group > 0) { + ConstructHistogramsMultiVal( + data_indices, num_data, ptr_ordered_grad, ptr_ordered_hess, + share_state, + reinterpret_cast(hist_data_ptr + group_bin_boundaries_[multi_val_groud_id] * 2)); + } else { + ConstructHistogramsMultiVal( + data_indices, num_data, gradients, hessians, share_state, + reinterpret_cast(hist_data_ptr + group_bin_boundaries_[multi_val_groud_id] * 2)); + } + } else if (HIST_BITS == 16) { + int16_t* hist_data_ptr = reinterpret_cast(hist_data); + if (num_used_dense_group > 0) { + ConstructHistogramsMultiVal( + data_indices, num_data, ptr_ordered_grad, ptr_ordered_hess, + share_state, + reinterpret_cast(hist_data_ptr + group_bin_boundaries_[multi_val_groud_id] * 2)); + } else { + ConstructHistogramsMultiVal( + data_indices, num_data, gradients, hessians, share_state, + reinterpret_cast(hist_data_ptr + group_bin_boundaries_[multi_val_groud_id] * 2)); + } + } } else { - ConstructHistogramsMultiVal( - data_indices, num_data, gradients, hessians, share_state, - hist_data + group_bin_boundaries_[multi_val_groud_id] * 2); + if (num_used_dense_group > 0) { + ConstructHistogramsMultiVal( + data_indices, num_data, ptr_ordered_grad, ptr_ordered_hess, + share_state, + hist_data + group_bin_boundaries_[multi_val_groud_id] * 2); + } else { + ConstructHistogramsMultiVal( + data_indices, num_data, gradients, hessians, share_state, + hist_data + group_bin_boundaries_[multi_val_groud_id] * 2); + } } } } // explicitly initialize template methods, for cross module call -template void Dataset::ConstructHistogramsInner( - const std::vector& is_feature_used, const data_size_t* data_indices, - data_size_t num_data, const score_t* gradients, const score_t* hessians, - score_t* ordered_gradients, score_t* ordered_hessians, - TrainingShareStates* share_state, hist_t* hist_data) const; +#define CONSTRUCT_HISTOGRAMS_INNER_PARMA \ + const std::vector& is_feature_used, const data_size_t* data_indices, \ + data_size_t num_data, const score_t* gradients, const score_t* hessians, \ + score_t* ordered_gradients, score_t* ordered_hessians, \ + TrainingShareStates* share_state, hist_t* hist_data -template void Dataset::ConstructHistogramsInner( - const std::vector& is_feature_used, const data_size_t* data_indices, - data_size_t num_data, const score_t* gradients, const score_t* hessians, - score_t* ordered_gradients, score_t* ordered_hessians, - TrainingShareStates* share_state, hist_t* hist_data) const; +// explicitly initialize template methods, for cross module call +template void Dataset::ConstructHistogramsInner(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const; -template void Dataset::ConstructHistogramsInner( - const std::vector& is_feature_used, const data_size_t* data_indices, - data_size_t num_data, const score_t* gradients, const score_t* hessians, - score_t* ordered_gradients, score_t* ordered_hessians, - TrainingShareStates* share_state, hist_t* hist_data) const; +template void Dataset::ConstructHistogramsInner(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const; -template void Dataset::ConstructHistogramsInner( - const std::vector& is_feature_used, const data_size_t* data_indices, - data_size_t num_data, const score_t* gradients, const score_t* hessians, - score_t* ordered_gradients, score_t* ordered_hessians, - TrainingShareStates* share_state, hist_t* hist_data) const; +template void Dataset::ConstructHistogramsInner(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const; + +template void 
Dataset::ConstructHistogramsInner(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const; + +template void Dataset::ConstructHistogramsInner(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const; + +template void Dataset::ConstructHistogramsInner(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const; + +template void Dataset::ConstructHistogramsInner(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const; + +template void Dataset::ConstructHistogramsInner(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const; + +template void Dataset::ConstructHistogramsInner(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const; + +template void Dataset::ConstructHistogramsInner(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const; + +template void Dataset::ConstructHistogramsInner(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const; + +template void Dataset::ConstructHistogramsInner(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const; void Dataset::FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, hist_t* data) const { @@ -1365,6 +1470,49 @@ void Dataset::FixHistogram(int feature_idx, double sum_gradient, } } +template +void Dataset::FixHistogramInt(int feature_idx, int64_t int_sum_gradient_and_hessian, hist_t* data) const { + const int group = feature2group_[feature_idx]; + const int sub_feature = feature2subfeature_[feature_idx]; + const BinMapper* bin_mapper = + feature_groups_[group]->bin_mappers_[sub_feature].get(); + const int most_freq_bin = bin_mapper->GetMostFreqBin(); + PACKED_HIST_BIN_T* data_ptr = reinterpret_cast(data); + PACKED_HIST_ACC_T int_sum_gradient_and_hessian_local = HIST_BITS_ACC == 16 ? + ((static_cast(int_sum_gradient_and_hessian >> 32) << 16) | + static_cast(int_sum_gradient_and_hessian & 0x0000ffff)) : + int_sum_gradient_and_hessian; + if (most_freq_bin > 0) { + const int num_bin = bin_mapper->num_bin(); + if (HIST_BITS_BIN == HIST_BITS_ACC) { + for (int i = 0; i < num_bin; ++i) { + if (i != most_freq_bin) { + int_sum_gradient_and_hessian_local -= data_ptr[i]; + } + } + data_ptr[most_freq_bin] = int_sum_gradient_and_hessian_local; + } else { + CHECK_EQ(HIST_BITS_ACC, 32); + CHECK_EQ(HIST_BITS_BIN, 16); + for (int i = 0; i < num_bin; ++i) { + if (i != most_freq_bin) { + const PACKED_HIST_BIN_T packed_hist = data_ptr[i]; + const PACKED_HIST_ACC_T packed_hist_acc = (static_cast(static_cast(packed_hist >> 16)) << 32) | + static_cast(packed_hist & 0x0000ffff); + int_sum_gradient_and_hessian_local -= packed_hist_acc; + } + } + PACKED_HIST_BIN_T int_sum_gradient_and_hessian_local_bin = + (static_cast(int_sum_gradient_and_hessian_local >> 32) << 16) | static_cast(int_sum_gradient_and_hessian_local & 0x0000ffff); + data_ptr[most_freq_bin] = int_sum_gradient_and_hessian_local_bin; + } + } +} + +template void Dataset::FixHistogramInt(int feature_idx, int64_t int_sum_gradient_and_hessian, hist_t* data) const; + +template void Dataset::FixHistogramInt(int feature_idx, int64_t int_sum_gradient_and_hessian, hist_t* data) const; + template void PushVector(std::vector* dest, const std::vector& src) { dest->reserve(dest->size() + src.size()); diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index 3d0f8db8e549..e612052e47d2 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -171,6 +171,146 @@ class DenseBin : public Bin { } + template + void ConstructHistogramIntInner(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const { + data_size_t i = start; + PACKED_HIST_T* out_ptr = reinterpret_cast(out); + const int16_t* gradients_ptr = reinterpret_cast(ordered_gradients); + const VAL_T* data_ptr_base = 
data_.data(); + if (USE_PREFETCH) { + const data_size_t pf_offset = 64 / sizeof(VAL_T); + const data_size_t pf_end = end - pf_offset; + for (; i < pf_end; ++i) { + const auto idx = USE_INDICES ? data_indices[i] : i; + const auto pf_idx = + USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset; + if (IS_4BIT) { + PREFETCH_T0(data_ptr_base + (pf_idx >> 1)); + } else { + PREFETCH_T0(data_ptr_base + pf_idx); + } + const auto ti = static_cast(data(idx)); + const int16_t gradient_16 = gradients_ptr[i]; + if (USE_HESSIAN) { + const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 : + (static_cast(static_cast(gradient_16 >> 8)) << HIST_BITS) | (gradient_16 & 0xff); + out_ptr[ti] += gradient_packed; + } else { + const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 : + (static_cast(static_cast(gradient_16 >> 8)) << HIST_BITS) | (1); + out_ptr[ti] += gradient_packed; + } + } + } + for (; i < end; ++i) { + const auto idx = USE_INDICES ? data_indices[i] : i; + const auto ti = static_cast(data(idx)); + const int16_t gradient_16 = gradients_ptr[i]; + if (USE_HESSIAN) { + const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 : + (static_cast(static_cast(gradient_16 >> 8)) << HIST_BITS) | (gradient_16 & 0xff); + out_ptr[ti] += gradient_packed; + } else { + const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 : + (static_cast(static_cast(gradient_16 >> 8)) << HIST_BITS) | (1); + out_ptr[ti] += gradient_packed; + } + } + } + + void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + data_indices, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt8(data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const override { + ConstructHistogramIntInner( + data_indices, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt8(data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + data_indices, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt16(data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const override { + ConstructHistogramIntInner( + data_indices, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt16(data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, ordered_gradients, out); + } + + void 
ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + data_indices, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt32(data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const override { + ConstructHistogramIntInner( + data_indices, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt32(data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, ordered_gradients, out); + } + template data_size_t SplitInner(uint32_t min_bin, uint32_t max_bin, diff --git a/src/io/multi_val_dense_bin.hpp b/src/io/multi_val_dense_bin.hpp index b4fbfbe673aa..780272bdc4e1 100644 --- a/src/io/multi_val_dense_bin.hpp +++ b/src/io/multi_val_dense_bin.hpp @@ -124,6 +124,123 @@ class MultiValDenseBin : public MultiValBin { gradients, hessians, out); } + template + void ConstructHistogramIntInner(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* gradients_and_hessians, hist_t* out) const { + data_size_t i = start; + const VAL_T* data_ptr_base = data_.data(); + const int16_t* gradients_and_hessians_ptr = reinterpret_cast(gradients_and_hessians); + PACKED_HIST_T* out_ptr = reinterpret_cast(out); + + if (USE_PREFETCH) { + const data_size_t pf_offset = 32 / sizeof(VAL_T); + const data_size_t pf_end = end - pf_offset; + + for (; i < pf_end; ++i) { + const auto idx = USE_INDICES ? data_indices[i] : i; + const auto pf_idx = USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset; + if (!ORDERED) { + PREFETCH_T0(gradients_and_hessians_ptr + pf_idx); + } + PREFETCH_T0(data_ptr_base + RowPtr(pf_idx)); + const auto j_start = RowPtr(idx); + const VAL_T* data_ptr = data_ptr_base + j_start; + const int16_t gradient_16 = gradients_and_hessians_ptr[idx]; + const PACKED_HIST_T gradient_packed = (HIST_BITS == 8) ? gradient_16 : + ((static_cast(static_cast(gradient_16 >> 8)) << HIST_BITS) | + static_cast(gradient_16 & 0xff)); + for (int j = 0; j < num_feature_; ++j) { + const uint32_t bin = static_cast(data_ptr[j]); + const auto ti = (bin + offsets_[j]); + out_ptr[ti] += gradient_packed; + } + } + } + for (; i < end; ++i) { + const auto idx = USE_INDICES ? data_indices[i] : i; + const auto j_start = RowPtr(idx); + const VAL_T* data_ptr = data_ptr_base + j_start; + const int16_t gradient_16 = gradients_and_hessians_ptr[idx]; + const PACKED_HIST_T gradient_packed = (HIST_BITS == 8) ? 
gradient_16 : + ((static_cast(static_cast(gradient_16 >> 8)) << HIST_BITS) | + static_cast(gradient_16 & 0xff)); + for (int j = 0; j < num_feature_; ++j) { + const uint32_t bin = static_cast(data_ptr[j]); + const auto ti = (bin + offsets_[j]); + out_ptr[ti] += gradient_packed; + } + } + } + + void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* gradients, + const score_t* /*hessians*/, hist_t* out) const override { + ConstructHistogramIntInner(data_indices, start, end, + gradients, out); + } + + void ConstructHistogramInt32(data_size_t start, data_size_t end, + const score_t* gradients, const score_t* /*hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, gradients, out); + } + + void ConstructHistogramOrderedInt32(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* gradients, + const score_t* /*hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner(data_indices, start, end, + gradients, out); + } + + void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* gradients, + const score_t* /*hessians*/, hist_t* out) const override { + ConstructHistogramIntInner(data_indices, start, end, + gradients, out); + } + + void ConstructHistogramInt16(data_size_t start, data_size_t end, + const score_t* gradients, const score_t* /*hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, gradients, out); + } + + void ConstructHistogramOrderedInt16(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* gradients, + const score_t* /*hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner(data_indices, start, end, + gradients, out); + } + + void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* gradients, + const score_t* /*hessians*/, hist_t* out) const override { + ConstructHistogramIntInner(data_indices, start, end, + gradients, out); + } + + void ConstructHistogramInt8(data_size_t start, data_size_t end, + const score_t* gradients, const score_t* /*hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, gradients, out); + } + + void ConstructHistogramOrderedInt8(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* gradients, + const score_t* /*hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner(data_indices, start, end, + gradients, out); + } + MultiValBin* CreateLike(data_size_t num_data, int num_bin, int num_feature, double, const std::vector& offsets) const override { return new MultiValDenseBin(num_data, num_bin, num_feature, offsets); diff --git a/src/io/multi_val_sparse_bin.hpp b/src/io/multi_val_sparse_bin.hpp index eaa30ef0a0cc..32a5a51b4f89 100644 --- a/src/io/multi_val_sparse_bin.hpp +++ b/src/io/multi_val_sparse_bin.hpp @@ -180,6 +180,124 @@ class MultiValSparseBin : public MultiValBin { gradients, hessians, out); } + template + void ConstructHistogramIntInner(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* gradients_and_hessians, hist_t* out) const { + data_size_t i = start; + PACKED_HIST_T* out_ptr = reinterpret_cast(out); + const int16_t* gradients_and_hessians_ptr = reinterpret_cast(gradients_and_hessians); + const VAL_T* data_ptr = data_.data(); + const INDEX_T* row_ptr_base = row_ptr_.data(); 
+ if (USE_PREFETCH) { + const data_size_t pf_offset = 32 / sizeof(VAL_T); + const data_size_t pf_end = end - pf_offset; + + for (; i < pf_end; ++i) { + const auto idx = USE_INDICES ? data_indices[i] : i; + const auto pf_idx = + USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset; + if (!ORDERED) { + PREFETCH_T0(gradients_and_hessians_ptr + pf_idx); + } + PREFETCH_T0(row_ptr_base + pf_idx); + PREFETCH_T0(data_ptr + row_ptr_[pf_idx]); + const auto j_start = RowPtr(idx); + const auto j_end = RowPtr(idx + 1); + const int16_t gradient_16 = ORDERED ? gradients_and_hessians_ptr[i] : gradients_and_hessians_ptr[idx]; + const PACKED_HIST_T gradient_packed = (HIST_BITS == 8) ? gradient_16 : + ((static_cast(static_cast(gradient_16 >> 8)) << HIST_BITS) | + static_cast(gradient_16 & 0xff)); + for (auto j = j_start; j < j_end; ++j) { + const auto ti = static_cast(data_ptr[j]); + out_ptr[ti] += gradient_packed; + } + } + } + for (; i < end; ++i) { + const auto idx = USE_INDICES ? data_indices[i] : i; + const auto j_start = RowPtr(idx); + const auto j_end = RowPtr(idx + 1); + const int16_t gradient_16 = ORDERED ? gradients_and_hessians_ptr[i] : gradients_and_hessians_ptr[idx]; + const PACKED_HIST_T gradient_packed = (HIST_BITS == 8) ? gradient_16 : + ((static_cast(static_cast(gradient_16 >> 8)) << HIST_BITS) | + static_cast(gradient_16 & 0xff)); + for (auto j = j_start; j < j_end; ++j) { + const auto ti = static_cast(data_ptr[j]); + out_ptr[ti] += gradient_packed; + } + } + } + + void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* gradients, + const score_t* /*hessians*/, hist_t* out) const override { + ConstructHistogramIntInner(data_indices, start, end, + gradients, out); + } + + void ConstructHistogramInt32(data_size_t start, data_size_t end, + const score_t* gradients, const score_t* /*hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, gradients, out); + } + + void ConstructHistogramOrderedInt32(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* gradients, + const score_t* /*hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner(data_indices, start, end, + gradients, out); + } + + void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* gradients, + const score_t* /*hessians*/, hist_t* out) const override { + ConstructHistogramIntInner(data_indices, start, end, + gradients, out); + } + + void ConstructHistogramInt16(data_size_t start, data_size_t end, + const score_t* gradients, const score_t* /*hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, gradients, out); + } + + void ConstructHistogramOrderedInt16(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* gradients, + const score_t* /*hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner(data_indices, start, end, + gradients, out); + } + + void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* gradients, + const score_t* /*hessians*/, hist_t* out) const override { + ConstructHistogramIntInner(data_indices, start, end, + gradients, out); + } + + void ConstructHistogramInt8(data_size_t start, data_size_t end, + const score_t* gradients, const score_t* /*hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, gradients, out); + } + + 
void ConstructHistogramOrderedInt8(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* gradients, + const score_t* /*hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner(data_indices, start, end, + gradients, out); + } + MultiValBin* CreateLike(data_size_t num_data, int num_bin, int, double estimate_element_per_row, const std::vector& /*offsets*/) const override { diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp index e01c0afcf5bc..f7137d29ffd9 100644 --- a/src/io/sparse_bin.hpp +++ b/src/io/sparse_bin.hpp @@ -203,6 +203,184 @@ class SparseBin : public Bin { } #undef ACC_GH + template + void ConstructIntHistogramInner(data_size_t start, data_size_t end, + const score_t* ordered_gradients_and_hessians, + hist_t* out) const { + data_size_t i_delta, cur_pos; + InitIndex(start, &i_delta, &cur_pos); + if (USE_HESSIAN) { + PACKED_HIST_T* out_ptr = reinterpret_cast(out); + const int16_t* gradients_and_hessians_ptr = reinterpret_cast(ordered_gradients_and_hessians); + while (cur_pos < start && i_delta < num_vals_) { + cur_pos += deltas_[++i_delta]; + } + while (cur_pos < end && i_delta < num_vals_) { + const VAL_T bin = vals_[i_delta]; + const int16_t gradient_16 = gradients_and_hessians_ptr[cur_pos]; + const PACKED_HIST_T gradient_64 = (static_cast(static_cast(gradient_16 >> 8)) << HIST_BITS) | (gradient_16 & 0xff); + out_ptr[bin] += gradient_64; + cur_pos += deltas_[++i_delta]; + } + } else { + GRAD_HIST_T* grad = reinterpret_cast(out); + HESS_HIST_T* cnt = reinterpret_cast(out) + 1; + const int8_t* gradients_and_hessians_ptr = reinterpret_cast(ordered_gradients_and_hessians); + while (cur_pos < start && i_delta < num_vals_) { + cur_pos += deltas_[++i_delta]; + } + while (cur_pos < end && i_delta < num_vals_) { + const uint32_t ti = static_cast(vals_[i_delta]) << 1; + grad[ti] += gradients_and_hessians_ptr[cur_pos]; + ++cnt[ti]; + cur_pos += deltas_[++i_delta]; + } + } + } + + template + void ConstructIntHistogramInner(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients_and_hessians, + hist_t* out) const { + data_size_t i_delta, cur_pos; + InitIndex(data_indices[start], &i_delta, &cur_pos); + data_size_t i = start; + if (USE_HESSIAN) { + PACKED_HIST_T* out_ptr = reinterpret_cast(out); + const int16_t* gradients_and_hessians_ptr = reinterpret_cast(ordered_gradients_and_hessians); + for (;;) { + if (cur_pos < data_indices[i]) { + cur_pos += deltas_[++i_delta]; + if (i_delta >= num_vals_) { + break; + } + } else if (cur_pos > data_indices[i]) { + if (++i >= end) { + break; + } + } else { + const VAL_T bin = vals_[i_delta]; + const int16_t gradient_16 = gradients_and_hessians_ptr[i]; + const PACKED_HIST_T gradient_packed = (HIST_BITS == 8) ? 
gradient_16 : + (static_cast(static_cast(gradient_16 >> 8)) << HIST_BITS) | (gradient_16 & 0xff); + out_ptr[bin] += gradient_packed; + if (++i >= end) { + break; + } + cur_pos += deltas_[++i_delta]; + if (i_delta >= num_vals_) { + break; + } + } + } + } else { + GRAD_HIST_T* grad = reinterpret_cast(out); + HESS_HIST_T* cnt = reinterpret_cast(out) + 1; + const int8_t* gradients_and_hessians_ptr = reinterpret_cast(ordered_gradients_and_hessians); + for (;;) { + if (cur_pos < data_indices[i]) { + cur_pos += deltas_[++i_delta]; + if (i_delta >= num_vals_) { + break; + } + } else if (cur_pos > data_indices[i]) { + if (++i >= end) { + break; + } + } else { + const uint32_t ti = static_cast(vals_[i_delta]) << 1; + grad[ti] += gradients_and_hessians_ptr[i << 1]; + ++cnt[ti]; + if (++i >= end) { + break; + } + cur_pos += deltas_[++i_delta]; + if (i_delta >= num_vals_) { + break; + } + } + } + } + } + + void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructIntHistogramInner(data_indices, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt32(data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructIntHistogramInner(start, end, ordered_gradients, out); + } + + void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const override { + ConstructIntHistogramInner(data_indices, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt32(data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const override { + ConstructIntHistogramInner(start, end, ordered_gradients, out); + } + + void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructIntHistogramInner(data_indices, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt16(data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructIntHistogramInner(start, end, ordered_gradients, out); + } + + void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const override { + ConstructIntHistogramInner(data_indices, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt16(data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const override { + ConstructIntHistogramInner(start, end, ordered_gradients, out); + } + + void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructIntHistogramInner(data_indices, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt8(data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructIntHistogramInner(start, end, ordered_gradients, out); + } + + void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start, + data_size_t end, const 
score_t* ordered_gradients, + hist_t* out) const override { + ConstructIntHistogramInner(data_indices, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt8(data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const override { + ConstructIntHistogramInner(start, end, ordered_gradients, out); + } + inline void NextNonzeroFast(data_size_t* i_delta, data_size_t* cur_pos) const { *cur_pos += deltas_[++(*i_delta)]; diff --git a/src/io/train_share_states.cpp b/src/io/train_share_states.cpp index f6462697a93d..71b2e097ef1b 100644 --- a/src/io/train_share_states.cpp +++ b/src/io/train_share_states.cpp @@ -9,7 +9,7 @@ namespace LightGBM { MultiValBinWrapper::MultiValBinWrapper(MultiValBin* bin, data_size_t num_data, - const std::vector& feature_groups_contained): + const std::vector& feature_groups_contained, const int num_grad_quant_bins): feature_groups_contained_(feature_groups_contained) { num_threads_ = OMP_NUM_THREADS(); num_data_ = num_data; @@ -19,6 +19,7 @@ MultiValBinWrapper::MultiValBinWrapper(MultiValBin* bin, data_size_t num_data, } num_bin_ = bin->num_bin(); num_bin_aligned_ = (num_bin_ + kAlignedSize - 1) / kAlignedSize * kAlignedSize; + num_grad_quant_bins_ = num_grad_quant_bins; } void MultiValBinWrapper::InitTrain(const std::vector& group_feature_start, @@ -45,43 +46,161 @@ void MultiValBinWrapper::InitTrain(const std::vector& group_feature_start, } } +template void MultiValBinWrapper::HistMove(const std::vector>& hist_buf) { - if (!is_use_subcol_) { + if (!is_use_subcol_ && INNER_HIST_BITS != 8) { return; } - const hist_t* src = hist_buf.data() + hist_buf.size() - - 2 * static_cast(num_bin_aligned_); - #pragma omp parallel for schedule(static) - for (int i = 0; i < static_cast(hist_move_src_.size()); ++i) { - std::copy_n(src + hist_move_src_[i], hist_move_size_[i], - origin_hist_data_ + hist_move_dest_[i]); + if (USE_QUANT_GRAD) { + if (HIST_BITS == 32) { + const int64_t* src = reinterpret_cast(hist_buf.data()) + hist_buf.size() / 2 - + static_cast(num_bin_aligned_); + #pragma omp parallel for schedule(static) + for (int i = 0; i < static_cast(hist_move_src_.size()); ++i) { + std::copy_n(src + hist_move_src_[i] / 2, hist_move_size_[i] / 2, + reinterpret_cast(origin_hist_data_) + hist_move_dest_[i] / 2); + } + } else if (HIST_BITS == 16) { + const int32_t* src = reinterpret_cast(hist_buf.data()) + hist_buf.size() / 2 - + static_cast(num_bin_aligned_); + if (is_use_subcol_) { + #pragma omp parallel for schedule(static) + for (int i = 0; i < static_cast(hist_move_src_.size()); ++i) { + std::copy_n(src + hist_move_src_[i] / 2, hist_move_size_[i] / 2, + reinterpret_cast(origin_hist_data_) + hist_move_dest_[i] / 2); + } + } else { + int32_t* orig_ptr = reinterpret_cast(origin_hist_data_); + #pragma omp parallel for schedule(static) + for (int i = 0; i < num_bin_; ++i) { + orig_ptr[i] = src[i]; + } + } + } + } else { + const hist_t* src = hist_buf.data() + hist_buf.size() - + 2 * static_cast(num_bin_aligned_); + #pragma omp parallel for schedule(static) + for (int i = 0; i < static_cast(hist_move_src_.size()); ++i) { + std::copy_n(src + hist_move_src_[i], hist_move_size_[i], + origin_hist_data_ + hist_move_dest_[i]); + } } } +template void MultiValBinWrapper::HistMove(const std::vector>& hist_buf); + +template void MultiValBinWrapper::HistMove(const std::vector>& hist_buf); + +template void MultiValBinWrapper::HistMove(const std::vector>& hist_buf); + +template void MultiValBinWrapper::HistMove(const std::vector>& hist_buf); + 
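// HistMove relocates the merged histogram from the tail of the shared buffer back into the
// leaf's own storage; under quantized training that buffer holds packed integer entries, so
// the copy reinterprets the data as int64_t (two 32-bit halves) or int32_t (two 16-bit halves)
// instead of hist_t. A rough sketch of the 32-bit case (mirrors the code above, illustrative):
//
//   const int64_t* src = reinterpret_cast<const int64_t*>(hist_buf.data())
//                        + hist_buf.size() / 2 - static_cast<size_t>(num_bin_aligned_);
//   std::copy_n(src + hist_move_src_[i] / 2, hist_move_size_[i] / 2,
//               reinterpret_cast<int64_t*>(origin_hist_data_) + hist_move_dest_[i] / 2);
//
// The surrounding explicit instantiations pre-build HistMove for each
// (USE_QUANT_GRAD, HIST_BITS, INNER_HIST_BITS) combination the tree learners request.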
+template void MultiValBinWrapper::HistMove(const std::vector>& hist_buf); + +template void MultiValBinWrapper::HistMove(const std::vector>& hist_buf); + +template void MultiValBinWrapper::HistMerge(std::vector>* hist_buf) { int n_bin_block = 1; int bin_block_size = num_bin_; Threading::BlockInfo(num_threads_, num_bin_, 512, &n_bin_block, &bin_block_size); - hist_t* dst = origin_hist_data_; - if (is_use_subcol_) { - dst = hist_buf->data() + hist_buf->size() - 2 * static_cast(num_bin_aligned_); - } - #pragma omp parallel for schedule(static, 1) num_threads(num_threads_) - for (int t = 0; t < n_bin_block; ++t) { - const int start = t * bin_block_size; - const int end = std::min(start + bin_block_size, num_bin_); - for (int tid = 1; tid < n_data_block_; ++tid) { - auto src_ptr = hist_buf->data() + static_cast(num_bin_aligned_) * 2 * (tid - 1); - for (int i = start * 2; i < end * 2; ++i) { - dst[i] += src_ptr[i]; + if (USE_QUANT_GRAD) { + if (HIST_BITS == 32) { + int64_t* dst = reinterpret_cast(origin_hist_data_); + if (is_use_subcol_) { + dst = reinterpret_cast(hist_buf->data()) + hist_buf->size() / 2 - static_cast(num_bin_aligned_); + } + #pragma omp parallel for schedule(static, 1) num_threads(num_threads_) + for (int t = 0; t < n_bin_block; ++t) { + const int start = t * bin_block_size; + const int end = std::min(start + bin_block_size, num_bin_); + for (int tid = 1; tid < n_data_block_; ++tid) { + auto src_ptr = reinterpret_cast(hist_buf->data()) + static_cast(num_bin_aligned_) * (tid - 1); + for (int i = start; i < end; ++i) { + dst[i] += src_ptr[i]; + } + } + } + } else if (HIST_BITS == 16 && INNER_HIST_BITS == 16) { + int32_t* dst = reinterpret_cast(origin_hist_data_); + if (is_use_subcol_) { + dst = reinterpret_cast(hist_buf->data()) + hist_buf->size() / 2 - static_cast(num_bin_aligned_); + } + #pragma omp parallel for schedule(static, 1) num_threads(num_threads_) + for (int t = 0; t < n_bin_block; ++t) { + const int start = t * bin_block_size; + const int end = std::min(start + bin_block_size, num_bin_); + for (int tid = 1; tid < n_data_block_; ++tid) { + auto src_ptr = reinterpret_cast(hist_buf->data()) + static_cast(num_bin_aligned_) * (tid - 1); + for (int i = start; i < end; ++i) { + dst[i] += src_ptr[i]; + } + } + } + } else if (HIST_BITS == 16 && INNER_HIST_BITS == 8) { + int32_t* dst = reinterpret_cast(hist_buf->data()) + hist_buf->size() / 2 - static_cast(num_bin_aligned_); + std::memset(reinterpret_cast(dst), 0, num_bin_ * kInt16HistBufferEntrySize); + #pragma omp parallel for schedule(static, 1) num_threads(num_threads_) + for (int t = 0; t < n_bin_block; ++t) { + const int start = t * bin_block_size; + const int end = std::min(start + bin_block_size, num_bin_); + for (int tid = 0; tid < n_data_block_; ++tid) { + auto src_ptr = reinterpret_cast(hist_buf->data()) + static_cast(num_bin_aligned_) * tid; + for (int i = start; i < end; ++i) { + const int16_t packed_hist = src_ptr[i]; + const int32_t packed_hist_int32 = (static_cast(static_cast(packed_hist >> 8)) << 16) | static_cast(packed_hist & 0x00ff); + dst[i] += packed_hist_int32; + } + } + } + } + } else { + hist_t* dst = origin_hist_data_; + if (is_use_subcol_) { + dst = hist_buf->data() + hist_buf->size() - 2 * static_cast(num_bin_aligned_); + } + #pragma omp parallel for schedule(static, 1) num_threads(num_threads_) + for (int t = 0; t < n_bin_block; ++t) { + const int start = t * bin_block_size; + const int end = std::min(start + bin_block_size, num_bin_); + for (int tid = 1; tid < n_data_block_; ++tid) { + auto 
src_ptr = hist_buf->data() + static_cast(num_bin_aligned_) * 2 * (tid - 1); + for (int i = start * 2; i < end * 2; ++i) { + dst[i] += src_ptr[i]; + } } } } } +template void MultiValBinWrapper::HistMerge(std::vector>* hist_buf); + +template void MultiValBinWrapper::HistMerge(std::vector>* hist_buf); + +template void MultiValBinWrapper::HistMerge(std::vector>* hist_buf); + +template void MultiValBinWrapper::HistMerge(std::vector>* hist_buf); + +template void MultiValBinWrapper::HistMerge(std::vector>* hist_buf); + +template void MultiValBinWrapper::HistMerge(std::vector>* hist_buf); + void MultiValBinWrapper::ResizeHistBuf(std::vector>* hist_buf, MultiValBin* sub_multi_val_bin, @@ -389,7 +508,7 @@ void TrainingShareStates::CalcBinOffsets(const std::vector>& feature_groups, - bool dense_only, bool sparse_only) { + bool dense_only, bool sparse_only, const int num_grad_quant_bins) { num_threads = OMP_NUM_THREADS(); if (bin == nullptr) { return; @@ -408,7 +527,7 @@ void TrainingShareStates::SetMultiValBin(MultiValBin* bin, data_size_t num_data, num_total_bin_ += bin->num_bin(); num_elements_per_row_ += bin->num_element_per_row(); multi_val_bin_wrapper_.reset(new MultiValBinWrapper( - bin, num_data, feature_groups_contained)); + bin, num_data, feature_groups_contained, num_grad_quant_bins)); } } // namespace LightGBM diff --git a/src/treelearner/data_parallel_tree_learner.cpp b/src/treelearner/data_parallel_tree_learner.cpp index 677b7dc6eb82..2509db5e722a 100644 --- a/src/treelearner/data_parallel_tree_learner.cpp +++ b/src/treelearner/data_parallel_tree_learner.cpp @@ -30,7 +30,9 @@ void DataParallelTreeLearner::Init(const Dataset* train_data, boo auto max_cat_threshold = this->config_->max_cat_threshold; // need to be able to hold smaller and larger best splits in SyncUpGlobalBestSplit size_t split_info_size = static_cast(SplitInfo::Size(max_cat_threshold) * 2); - size_t histogram_size = static_cast(this->share_state_->num_hist_total_bin() * kHistEntrySize); + size_t histogram_size = this->config_->use_quantized_grad ? 
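/* With quantized training the communication buffer is sized for packed int32 histogram
 * entries (kInt32HistEntrySize, 8 bytes per bin) instead of double-precision entries
 * (kHistEntrySize, 16 bytes per bin), and it still has to fit the split-info payload.
 * A rough sketch of the sizing logic (illustrative only):
 *
 *   size_t entry_size     = use_quantized_grad ? kInt32HistEntrySize : kHistEntrySize;
 *   size_t histogram_size = num_hist_total_bin * entry_size;
 *   size_t buffer_size    = std::max(histogram_size, split_info_size);
 */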
+ static_cast(this->share_state_->num_hist_total_bin() * kInt32HistEntrySize) : + static_cast(this->share_state_->num_hist_total_bin() * kHistEntrySize); // allocate buffer for communication size_t buffer_size = std::max(histogram_size, split_info_size); @@ -43,8 +45,19 @@ void DataParallelTreeLearner::Init(const Dataset* train_data, boo block_start_.resize(num_machines_); block_len_.resize(num_machines_); + if (this->config_->use_quantized_grad) { + block_start_int16_.resize(num_machines_); + block_len_int16_.resize(num_machines_); + } + buffer_write_start_pos_.resize(this->num_features_); buffer_read_start_pos_.resize(this->num_features_); + + if (this->config_->use_quantized_grad) { + buffer_write_start_pos_int16_.resize(this->num_features_); + buffer_read_start_pos_int16_.resize(this->num_features_); + } + global_data_count_in_leaf_.resize(this->config_->num_leaves); } @@ -55,100 +68,155 @@ void DataParallelTreeLearner::ResetConfig(const Config* config) { } template -void DataParallelTreeLearner::BeforeTrain() { - TREELEARNER_T::BeforeTrain(); - // generate feature partition for current tree - std::vector> feature_distribution(num_machines_, std::vector()); - std::vector num_bins_distributed(num_machines_, 0); - for (int i = 0; i < this->train_data_->num_total_features(); ++i) { - int inner_feature_index = this->train_data_->InnerFeatureIndex(i); - if (inner_feature_index == -1) { continue; } - if (this->col_sampler_.is_feature_used_bytree()[inner_feature_index]) { - int cur_min_machine = static_cast(ArrayArgs::ArgMin(num_bins_distributed)); - feature_distribution[cur_min_machine].push_back(inner_feature_index); - auto num_bin = this->train_data_->FeatureNumBin(inner_feature_index); - if (this->train_data_->FeatureBinMapper(inner_feature_index)->GetMostFreqBin() == 0) { - num_bin -= 1; - } - num_bins_distributed[cur_min_machine] += num_bin; - } - is_feature_aggregated_[inner_feature_index] = false; - } - // get local used feature - for (auto fid : feature_distribution[rank_]) { - is_feature_aggregated_[fid] = true; - } - +void DataParallelTreeLearner::PrepareBufferPos( + const std::vector>& feature_distribution, + std::vector* block_start, + std::vector* block_len, + std::vector* buffer_write_start_pos, + std::vector* buffer_read_start_pos, + comm_size_t* reduce_scatter_size, + size_t hist_entry_size) { // get block start and block len for reduce scatter - reduce_scatter_size_ = 0; + *reduce_scatter_size = 0; for (int i = 0; i < num_machines_; ++i) { - block_len_[i] = 0; + (*block_len)[i] = 0; for (auto fid : feature_distribution[i]) { auto num_bin = this->train_data_->FeatureNumBin(fid); if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) { num_bin -= 1; } - block_len_[i] += num_bin * kHistEntrySize; + (*block_len)[i] += num_bin * hist_entry_size; } - reduce_scatter_size_ += block_len_[i]; + *reduce_scatter_size += (*block_len)[i]; } - block_start_[0] = 0; + (*block_start)[0] = 0; for (int i = 1; i < num_machines_; ++i) { - block_start_[i] = block_start_[i - 1] + block_len_[i - 1]; + (*block_start)[i] = (*block_start)[i - 1] + (*block_len)[i - 1]; } - // get buffer_write_start_pos_ + // get buffer_write_start_pos int bin_size = 0; for (int i = 0; i < num_machines_; ++i) { for (auto fid : feature_distribution[i]) { - buffer_write_start_pos_[fid] = bin_size; + (*buffer_write_start_pos)[fid] = bin_size; auto num_bin = this->train_data_->FeatureNumBin(fid); if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) { num_bin -= 1; } - bin_size += num_bin * 
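/* PrepareBufferPos is the previous inline layout code factored into a helper that takes the
 * per-bin entry size, so the same logic can lay out the full-precision reduce-scatter buffer
 * (kHistEntrySize), the int32 quantized buffer (kInt32HistEntrySize) and the int16 quantized
 * buffer (kInt16HistEntrySize). Roughly, the quantized branch of BeforeTrain calls it twice:
 *
 *   PrepareBufferPos(feature_distribution, &block_start_, &block_len_,
 *                    &buffer_write_start_pos_, &buffer_read_start_pos_,
 *                    &reduce_scatter_size_, kInt32HistEntrySize);
 *   PrepareBufferPos(feature_distribution, &block_start_int16_, &block_len_int16_,
 *                    &buffer_write_start_pos_int16_, &buffer_read_start_pos_int16_,
 *                    &reduce_scatter_size_int16_, kInt16HistEntrySize);
 */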
kHistEntrySize; + bin_size += num_bin * hist_entry_size; } } - // get buffer_read_start_pos_ + // get buffer_read_start_pos bin_size = 0; for (auto fid : feature_distribution[rank_]) { - buffer_read_start_pos_[fid] = bin_size; + (*buffer_read_start_pos)[fid] = bin_size; auto num_bin = this->train_data_->FeatureNumBin(fid); if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) { num_bin -= 1; } - bin_size += num_bin * kHistEntrySize; + bin_size += num_bin * hist_entry_size; } +} - // sync global data sumup info - std::tuple data(this->smaller_leaf_splits_->num_data_in_leaf(), - this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians()); - int size = sizeof(data); - std::memcpy(input_buffer_.data(), &data, size); - // global sumup reduce - Network::Allreduce(input_buffer_.data(), size, sizeof(std::tuple), output_buffer_.data(), [](const char *src, char *dst, int type_size, comm_size_t len) { - comm_size_t used_size = 0; - const std::tuple *p1; - std::tuple *p2; - while (used_size < len) { - p1 = reinterpret_cast *>(src); - p2 = reinterpret_cast *>(dst); - std::get<0>(*p2) = std::get<0>(*p2) + std::get<0>(*p1); - std::get<1>(*p2) = std::get<1>(*p2) + std::get<1>(*p1); - std::get<2>(*p2) = std::get<2>(*p2) + std::get<2>(*p1); - src += type_size; - dst += type_size; - used_size += type_size; +template +void DataParallelTreeLearner::BeforeTrain() { + TREELEARNER_T::BeforeTrain(); + // generate feature partition for current tree + std::vector> feature_distribution(num_machines_, std::vector()); + std::vector num_bins_distributed(num_machines_, 0); + for (int i = 0; i < this->train_data_->num_total_features(); ++i) { + int inner_feature_index = this->train_data_->InnerFeatureIndex(i); + if (inner_feature_index == -1) { continue; } + if (this->col_sampler_.is_feature_used_bytree()[inner_feature_index]) { + int cur_min_machine = static_cast(ArrayArgs::ArgMin(num_bins_distributed)); + feature_distribution[cur_min_machine].push_back(inner_feature_index); + auto num_bin = this->train_data_->FeatureNumBin(inner_feature_index); + if (this->train_data_->FeatureBinMapper(inner_feature_index)->GetMostFreqBin() == 0) { + num_bin -= 1; + } + num_bins_distributed[cur_min_machine] += num_bin; } - }); - // copy back - std::memcpy(reinterpret_cast(&data), output_buffer_.data(), size); - // set global sumup info - this->smaller_leaf_splits_->Init(std::get<1>(data), std::get<2>(data)); - // init global data count in leaf - global_data_count_in_leaf_[0] = std::get<0>(data); + is_feature_aggregated_[inner_feature_index] = false; + } + // get local used feature + for (auto fid : feature_distribution[rank_]) { + is_feature_aggregated_[fid] = true; + } + + // get block start and block len for reduce scatter + if (this->config_->use_quantized_grad) { + PrepareBufferPos(feature_distribution, &block_start_, &block_len_, &buffer_write_start_pos_, + &buffer_read_start_pos_, &reduce_scatter_size_, kInt32HistEntrySize); + PrepareBufferPos(feature_distribution, &block_start_int16_, &block_len_int16_, &buffer_write_start_pos_int16_, + &buffer_read_start_pos_int16_, &reduce_scatter_size_int16_, kInt16HistEntrySize); + } else { + PrepareBufferPos(feature_distribution, &block_start_, &block_len_, &buffer_write_start_pos_, + &buffer_read_start_pos_, &reduce_scatter_size_, kHistEntrySize); + } + + if (this->config_->use_quantized_grad) { + // sync global data sumup info + std::tuple data(this->smaller_leaf_splits_->num_data_in_leaf(), + this->smaller_leaf_splits_->sum_gradients(), 
this->smaller_leaf_splits_->sum_hessians(), + this->smaller_leaf_splits_->int_sum_gradients_and_hessians()); + int size = sizeof(data); + std::memcpy(input_buffer_.data(), &data, size); + // global sumup reduce + Network::Allreduce(input_buffer_.data(), size, sizeof(std::tuple), output_buffer_.data(), [](const char *src, char *dst, int type_size, comm_size_t len) { + comm_size_t used_size = 0; + const std::tuple *p1; + std::tuple *p2; + while (used_size < len) { + p1 = reinterpret_cast *>(src); + p2 = reinterpret_cast *>(dst); + std::get<0>(*p2) = std::get<0>(*p2) + std::get<0>(*p1); + std::get<1>(*p2) = std::get<1>(*p2) + std::get<1>(*p1); + std::get<2>(*p2) = std::get<2>(*p2) + std::get<2>(*p1); + std::get<3>(*p2) = std::get<3>(*p2) + std::get<3>(*p1); + src += type_size; + dst += type_size; + used_size += type_size; + } + }); + // copy back + std::memcpy(reinterpret_cast(&data), output_buffer_.data(), size); + // set global sumup info + this->smaller_leaf_splits_->Init(std::get<1>(data), std::get<2>(data), std::get<3>(data)); + // init global data count in leaf + global_data_count_in_leaf_[0] = std::get<0>(data); + // reset hist num bits according to global num data + this->gradient_discretizer_->template SetNumBitsInHistogramBin(0, -1, GetGlobalDataCountInLeaf(0), 0); + } else { + // sync global data sumup info + std::tuple data(this->smaller_leaf_splits_->num_data_in_leaf(), + this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians()); + int size = sizeof(data); + std::memcpy(input_buffer_.data(), &data, size); + // global sumup reduce + Network::Allreduce(input_buffer_.data(), size, sizeof(std::tuple), output_buffer_.data(), [](const char *src, char *dst, int type_size, comm_size_t len) { + comm_size_t used_size = 0; + const std::tuple *p1; + std::tuple *p2; + while (used_size < len) { + p1 = reinterpret_cast *>(src); + p2 = reinterpret_cast *>(dst); + std::get<0>(*p2) = std::get<0>(*p2) + std::get<0>(*p1); + std::get<1>(*p2) = std::get<1>(*p2) + std::get<1>(*p1); + std::get<2>(*p2) = std::get<2>(*p2) + std::get<2>(*p1); + src += type_size; + dst += type_size; + used_size += type_size; + } + }); + // copy back + std::memcpy(reinterpret_cast(&data), output_buffer_.data(), size); + // set global sumup info + this->smaller_leaf_splits_->Init(std::get<1>(data), std::get<2>(data)); + // init global data count in leaf + global_data_count_in_leaf_[0] = std::get<0>(data); + } } template @@ -167,23 +235,66 @@ void DataParallelTreeLearner::FindBestSplits(const Tree* tree) { const BinMapper* feature_bin_mapper = this->train_data_->FeatureBinMapper(feature_index); const int offset = static_cast(feature_bin_mapper->GetMostFreqBin() == 0); const int num_bin = feature_bin_mapper->num_bin(); - hist_t* hist_ptr = this->smaller_leaf_histogram_array_[feature_index].RawData(); - std::memset(reinterpret_cast(hist_ptr), 0, (num_bin - offset) * kHistEntrySize); + if (this->config_->use_quantized_grad) { + int32_t* hist_ptr = this->smaller_leaf_histogram_array_[feature_index].RawDataInt32(); + std::memset(reinterpret_cast(hist_ptr), 0, (num_bin - offset) * kInt32HistEntrySize); + int16_t* hist_ptr_int16 = this->smaller_leaf_histogram_array_[feature_index].RawDataInt16(); + std::memset(reinterpret_cast(hist_ptr_int16), 0, (num_bin - offset) * kInt16HistEntrySize); + } else { + hist_t* hist_ptr = this->smaller_leaf_histogram_array_[feature_index].RawData(); + std::memset(reinterpret_cast(hist_ptr), 0, (num_bin - offset) * kHistEntrySize); + } } } // construct local histograms + 
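// Distributed reduce of quantized histograms: each worker first copies (or widens) its local
// packed histograms into input_buffer_, then a ReduceScatter sums the per-bin integer entries
// across machines. While every leaf still fits in 16-bit histogram halves, the int16 layout
// and Int16HistogramSumReducer are used; otherwise the int32 layout and
// Int32HistogramSumReducer are used, widening any 16-bit local histograms on the way in.
// A rough sketch of the choice (illustrative only):
//
//   if (smaller_leaf_num_bits <= 16) {
//     Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_int16_, sizeof(int16_t),
//                            block_start_int16_.data(), block_len_int16_.data(),
//                            output_buffer_.data(), output_buffer_size, &Int16HistogramSumReducer);
//   } else {
//     Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(int_hist_t),
//                            block_start_.data(), block_len_.data(),
//                            output_buffer_.data(), output_buffer_size, &Int32HistogramSumReducer);
//   }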
global_timer.Start("DataParallelTreeLearner::ReduceHistogram"); + global_timer.Start("DataParallelTreeLearner::ReduceHistogram::Copy"); #pragma omp parallel for schedule(static) for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) { if (this->col_sampler_.is_feature_used_bytree()[feature_index] == false) continue; // copy to buffer - std::memcpy(input_buffer_.data() + buffer_write_start_pos_[feature_index], + if (this->config_->use_quantized_grad) { + const uint8_t local_smaller_leaf_num_bits = this->gradient_discretizer_->template GetHistBitsInLeaf(this->smaller_leaf_splits_->leaf_index()); + const uint8_t smaller_leaf_num_bits = this->gradient_discretizer_->template GetHistBitsInLeaf(this->smaller_leaf_splits_->leaf_index()); + if (smaller_leaf_num_bits <= 16) { + std::memcpy(input_buffer_.data() + buffer_write_start_pos_int16_[feature_index], + this->smaller_leaf_histogram_array_[feature_index].RawDataInt16(), + this->smaller_leaf_histogram_array_[feature_index].SizeOfInt16Histgram()); + } else { + if (local_smaller_leaf_num_bits == 32) { + std::memcpy(input_buffer_.data() + buffer_write_start_pos_[feature_index], + this->smaller_leaf_histogram_array_[feature_index].RawDataInt32(), + this->smaller_leaf_histogram_array_[feature_index].SizeOfInt32Histgram()); + } else { + this->smaller_leaf_histogram_array_[feature_index].CopyFromInt16ToInt32( + input_buffer_.data() + buffer_write_start_pos_[feature_index]); + } + } + } else { + std::memcpy(input_buffer_.data() + buffer_write_start_pos_[feature_index], this->smaller_leaf_histogram_array_[feature_index].RawData(), this->smaller_leaf_histogram_array_[feature_index].SizeOfHistgram()); + } } + global_timer.Stop("DataParallelTreeLearner::ReduceHistogram::Copy"); // Reduce scatter for histogram - Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(hist_t), block_start_.data(), - block_len_.data(), output_buffer_.data(), static_cast(output_buffer_.size()), &HistogramSumReducer); + global_timer.Start("DataParallelTreeLearner::ReduceHistogram::ReduceScatter"); + if (!this->config_->use_quantized_grad) { + Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(hist_t), block_start_.data(), + block_len_.data(), output_buffer_.data(), static_cast(output_buffer_.size()), &HistogramSumReducer); + } else { + const uint8_t smaller_leaf_num_bits = this->gradient_discretizer_->template GetHistBitsInLeaf(this->smaller_leaf_splits_->leaf_index()); + if (smaller_leaf_num_bits <= 16) { + Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_int16_, sizeof(int16_t), block_start_int16_.data(), + block_len_int16_.data(), output_buffer_.data(), static_cast(output_buffer_.size()), &Int16HistogramSumReducer); + } else { + Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(int_hist_t), block_start_.data(), + block_len_.data(), output_buffer_.data(), static_cast(output_buffer_.size()), &Int32HistogramSumReducer); + } + } + global_timer.Stop("DataParallelTreeLearner::ReduceHistogram::ReduceScatter"); + global_timer.Stop("DataParallelTreeLearner::ReduceHistogram"); this->FindBestSplitsFromHistograms( this->col_sampler_.is_feature_used_bytree(), true, tree); } @@ -198,6 +309,26 @@ void DataParallelTreeLearner::FindBestSplitsFromHistograms(const this->col_sampler_.GetByNode(tree, this->larger_leaf_splits_->leaf_index()); double smaller_leaf_parent_output = this->GetParentOutput(tree, this->smaller_leaf_splits_.get()); double larger_leaf_parent_output = 
this->GetParentOutput(tree, this->larger_leaf_splits_.get()); + + if (this->config_->use_quantized_grad && this->larger_leaf_splits_ != nullptr && this->larger_leaf_splits_->leaf_index() >= 0) { + const int parent_index = std::min(this->smaller_leaf_splits_->leaf_index(), this->larger_leaf_splits_->leaf_index()); + const uint8_t parent_num_bits = this->gradient_discretizer_->template GetHistBitsInNode(parent_index); + const uint8_t larger_leaf_num_bits = this->gradient_discretizer_->template GetHistBitsInLeaf(this->larger_leaf_splits_->leaf_index()); + const uint8_t smaller_leaf_num_bits = this->gradient_discretizer_->template GetHistBitsInLeaf(this->smaller_leaf_splits_->leaf_index()); + if (parent_num_bits > 16 && larger_leaf_num_bits <= 16) { + CHECK_LE(smaller_leaf_num_bits, 16); + OMP_INIT_EX(); + #pragma omp parallel for schedule(static) + for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) { + OMP_LOOP_EX_BEGIN(); + if (!is_feature_aggregated_[feature_index]) continue; + this->larger_leaf_histogram_array_[feature_index].CopyToBuffer(this->gradient_discretizer_->GetChangeHistBitsBuffer(feature_index)); + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); + } + } + OMP_INIT_EX(); #pragma omp parallel for schedule(static) for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) { @@ -206,12 +337,39 @@ void DataParallelTreeLearner::FindBestSplitsFromHistograms(const const int tid = omp_get_thread_num(); const int real_feature_index = this->train_data_->RealFeatureIndex(feature_index); // restore global histograms from buffer - this->smaller_leaf_histogram_array_[feature_index].FromMemory( - output_buffer_.data() + buffer_read_start_pos_[feature_index]); + if (this->config_->use_quantized_grad) { + const uint8_t smaller_leaf_num_bits = this->gradient_discretizer_->template GetHistBitsInLeaf(this->smaller_leaf_splits_->leaf_index()); + if (smaller_leaf_num_bits <= 16) { + this->smaller_leaf_histogram_array_[feature_index].FromMemoryInt16( + output_buffer_.data() + buffer_read_start_pos_int16_[feature_index]); + } else { + this->smaller_leaf_histogram_array_[feature_index].FromMemoryInt32( + output_buffer_.data() + buffer_read_start_pos_[feature_index]); + } + } else { + this->smaller_leaf_histogram_array_[feature_index].FromMemory( + output_buffer_.data() + buffer_read_start_pos_[feature_index]); + } - this->train_data_->FixHistogram(feature_index, - this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians(), - this->smaller_leaf_histogram_array_[feature_index].RawData()); + if (this->config_->use_quantized_grad) { + const uint8_t smaller_leaf_num_bits = this->gradient_discretizer_->template GetHistBitsInLeaf(this->smaller_leaf_splits_->leaf_index()); + const int64_t int_sum_gradient_and_hessian = this->smaller_leaf_splits_->int_sum_gradients_and_hessians(); + if (smaller_leaf_num_bits <= 16) { + this->train_data_->template FixHistogramInt( + feature_index, + int_sum_gradient_and_hessian, + reinterpret_cast(this->smaller_leaf_histogram_array_[feature_index].RawDataInt16())); + } else { + this->train_data_->template FixHistogramInt( + feature_index, + int_sum_gradient_and_hessian, + reinterpret_cast(this->smaller_leaf_histogram_array_[feature_index].RawDataInt32())); + } + } else { + this->train_data_->FixHistogram(feature_index, + this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians(), + this->smaller_leaf_histogram_array_[feature_index].RawData()); + } 
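// The FixHistogramInt call above plays the same role as FixHistogram: the most-frequent bin is
// not filled during construction, so it is reconstructed as the leaf total minus the sum of all
// other bins, except that here the totals are packed integer gradient/hessian pairs. The leaf
// total comes from int_sum_gradients_and_hessians() (gradient in the high 32 bits, hessian in
// the low 32 bits), and the 16- vs 32-bit overload follows the leaf's histogram bit width.
// Roughly (variable names illustrative only):
//
//   int64_t total  = smaller_leaf_splits_->int_sum_gradients_and_hessians();
//   int64_t others = 0;
//   for (int b = 0; b < num_bin; ++b) {
//     if (b != most_freq_bin) others += packed_hist[b];   // packed int64 entries
//   }
//   packed_hist[most_freq_bin] = total - others;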
this->ComputeBestSplitForFeature( this->smaller_leaf_histogram_array_, feature_index, real_feature_index, @@ -225,8 +383,31 @@ void DataParallelTreeLearner::FindBestSplitsFromHistograms(const if (this->larger_leaf_splits_ == nullptr || this->larger_leaf_splits_->leaf_index() < 0) continue; // construct histgroms for large leaf, we init larger leaf as the parent, so we can just subtract the smaller leaf's histograms - this->larger_leaf_histogram_array_[feature_index].Subtract( - this->smaller_leaf_histogram_array_[feature_index]); + if (this->config_->use_quantized_grad) { + const int parent_index = std::min(this->smaller_leaf_splits_->leaf_index(), this->larger_leaf_splits_->leaf_index()); + const uint8_t parent_num_bits = this->gradient_discretizer_->template GetHistBitsInNode(parent_index); + const uint8_t larger_leaf_num_bits = this->gradient_discretizer_->template GetHistBitsInLeaf(this->larger_leaf_splits_->leaf_index()); + const uint8_t smaller_leaf_num_bits = this->gradient_discretizer_->template GetHistBitsInLeaf(this->smaller_leaf_splits_->leaf_index()); + if (parent_num_bits <= 16) { + CHECK_LE(smaller_leaf_num_bits, 16); + CHECK_LE(larger_leaf_num_bits, 16); + this->larger_leaf_histogram_array_[feature_index].template Subtract( + this->smaller_leaf_histogram_array_[feature_index]); + } else if (larger_leaf_num_bits <= 16) { + CHECK_LE(smaller_leaf_num_bits, 16); + this->larger_leaf_histogram_array_[feature_index].template Subtract( + this->smaller_leaf_histogram_array_[feature_index], this->gradient_discretizer_->GetChangeHistBitsBuffer(feature_index)); + } else if (smaller_leaf_num_bits <= 16) { + this->larger_leaf_histogram_array_[feature_index].template Subtract( + this->smaller_leaf_histogram_array_[feature_index]); + } else { + this->larger_leaf_histogram_array_[feature_index].template Subtract( + this->smaller_leaf_histogram_array_[feature_index]); + } + } else { + this->larger_leaf_histogram_array_[feature_index].Subtract( + this->smaller_leaf_histogram_array_[feature_index]); + } this->ComputeBestSplitForFeature( this->larger_leaf_histogram_array_, feature_index, real_feature_index, @@ -273,6 +454,10 @@ void DataParallelTreeLearner::Split(Tree* tree, int best_Leaf, in // need update global number of data in leaf global_data_count_in_leaf_[*left_leaf] = best_split_info.left_count; global_data_count_in_leaf_[*right_leaf] = best_split_info.right_count; + // reset hist num bits according to global num data + if (this->config_->use_quantized_grad) { + this->gradient_discretizer_->template SetNumBitsInHistogramBin(*left_leaf, *right_leaf, GetGlobalDataCountInLeaf(*left_leaf), GetGlobalDataCountInLeaf(*right_leaf)); + } } // instantiate template classes, otherwise linker cannot find the code diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 7804292d15d0..d917ed7917ec 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -51,6 +51,18 @@ class FeatureHistogram { /*! \brief Disable copy */ FeatureHistogram(const FeatureHistogram&) = delete; + /*! + * \brief Init the feature histogram + * \param feature the feature data for this histogram + * \param min_num_data_one_leaf minimal number of data in one leaf + */ + void Init(hist_t* data, int16_t* data_int16, const FeatureMetainfo* meta) { + meta_ = meta; + data_ = data; + data_int16_ = data_int16; + ResetFunc(); + } + /*! 
* \brief Init the feature histogram * \param feature the feature data for this histogram @@ -59,6 +71,7 @@ class FeatureHistogram { void Init(hist_t* data, const FeatureMetainfo* meta) { meta_ = meta; data_ = data; + data_int16_ = nullptr; ResetFunc(); } @@ -72,13 +85,80 @@ class FeatureHistogram { hist_t* RawData() { return data_; } + int32_t* RawDataInt32() { return reinterpret_cast(data_); } + + int16_t* RawDataInt16() { return data_int16_; } + /*! * \brief Subtract current histograms with other * \param other The histogram that want to subtract */ - void Subtract(const FeatureHistogram& other) { - for (int i = 0; i < (meta_->num_bin - meta_->offset) * 2; ++i) { - data_[i] -= other.data_[i]; + template + void Subtract(const FeatureHistogram& other, const int32_t* buffer = nullptr) { + if (USE_DIST_GRAD) { + const THIS_HIST_T* this_int_data = THIS_HIST_BITS == 16 ? + reinterpret_cast(data_int16_) : + (RESULT_HIST_BITS == 16 ? + reinterpret_cast(buffer) : + reinterpret_cast(data_)); + const OTHER_HIST_T* other_int_data = OTHER_HIST_BITS == 16 ? + reinterpret_cast(other.data_int16_) : + reinterpret_cast(other.data_); + RESULT_HIST_T* result_int_data = RESULT_HIST_BITS == 16 ? + reinterpret_cast(data_int16_) : + reinterpret_cast(data_); + if (THIS_HIST_BITS == 32 && OTHER_HIST_BITS == 16 && RESULT_HIST_BITS == 32) { + for (int i = 0; i < meta_->num_bin - meta_->offset; ++i) { + const int32_t other_grad_hess = static_cast(other_int_data[i]); + const int64_t this_grad_hess = this_int_data[i]; + const int64_t other_grad_hess_int64 = + (static_cast(static_cast(other_grad_hess >> 16)) << 32) | + (static_cast(other_grad_hess & 0x0000ffff)); + const int64_t result_grad_hess = this_grad_hess - other_grad_hess_int64; + result_int_data[i] = result_grad_hess; + } + } else if (THIS_HIST_BITS == 32 && OTHER_HIST_BITS == 16 && RESULT_HIST_BITS == 16) { + for (int i = 0; i < meta_->num_bin - meta_->offset; ++i) { + const int32_t other_grad_hess = static_cast(other_int_data[i]); + const int64_t this_grad_hess = this_int_data[i]; + const int64_t other_grad_hess_int64 = + (static_cast(static_cast(other_grad_hess >> 16)) << 32) | + (static_cast(other_grad_hess & 0x0000ffff)); + const int64_t result_grad_hess = this_grad_hess - other_grad_hess_int64; + const int32_t result_grad_hess_int32 = + (static_cast(result_grad_hess >> 32) << 16) | + static_cast(result_grad_hess & 0x00000000ffffffff); + result_int_data[i] = result_grad_hess_int32; + } + } else { + for (int i = 0; i < meta_->num_bin - meta_->offset; ++i) { + result_int_data[i] = this_int_data[i] - other_int_data[i]; + } + } + } else { + for (int i = 0; i < (meta_->num_bin - meta_->offset) * 2; ++i) { + data_[i] -= other.data_[i]; + } + } + } + + void CopyToBuffer(int32_t* buffer) { + const int64_t* data_ptr = reinterpret_cast(data_); + int64_t* buffer_ptr = reinterpret_cast(buffer); + for (int i = 0; i < meta_->num_bin - meta_->offset; ++i) { + buffer_ptr[i] = data_ptr[i]; + } + } + + void CopyFromInt16ToInt32(char* buffer) { + const int32_t* int16_data = reinterpret_cast(RawDataInt16()); + int64_t* int32_data = reinterpret_cast(buffer); + for (int i = 0; i < meta_->num_bin - meta_->offset; ++i) { + const int32_t int16_val = int16_data[i]; + int32_data[i] = (static_cast(static_cast(int16_val >> 16)) << 32) | + static_cast(int16_val & 0x0000ffff); } } @@ -94,8 +174,23 @@ class FeatureHistogram { output->gain *= meta_->penalty; } + void FindBestThresholdInt(int64_t sum_gradient_and_hessian, + double grad_scale, double hess_scale, + const uint8_t 
num_bits_bin, + const uint8_t num_bits_acc, + data_size_t num_data, + const FeatureConstraint* constraints, + double parent_output, + SplitInfo* output) { + output->default_left = true; + output->gain = kMinScore; + int_find_best_threshold_fun_(sum_gradient_and_hessian, grad_scale, hess_scale, num_bits_bin, num_bits_acc, num_data, + constraints, parent_output, output); + output->gain *= meta_->penalty; + } + template - double BeforeNumercal(double sum_gradient, double sum_hessian, double parent_output, data_size_t num_data, + double BeforeNumerical(double sum_gradient, double sum_hessian, double parent_output, data_size_t num_data, SplitInfo* output, int* rand_threshold) { is_splittable_ = false; output->monotone_type = meta_->monotone_type; @@ -112,6 +207,27 @@ class FeatureHistogram { return gain_shift + meta_->config->min_gain_to_split; } + template + double BeforeNumericalInt(int64_t sum_gradient_and_hessian, double grad_scale, double hess_scale, double parent_output, data_size_t num_data, + SplitInfo* output, int* rand_threshold) { + is_splittable_ = false; + output->monotone_type = meta_->monotone_type; + const int32_t int_sum_gradient = static_cast(sum_gradient_and_hessian >> 32); + const uint32_t int_sum_hessian = static_cast(sum_gradient_and_hessian & 0x00000000ffffffff); + const double sum_gradient = static_cast(int_sum_gradient) * grad_scale; + const double sum_hessian = static_cast(int_sum_hessian) * hess_scale; + double gain_shift = GetLeafGain( + sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, + meta_->config->max_delta_step, meta_->config->path_smooth, num_data, parent_output); + *rand_threshold = 0; + if (USE_RAND) { + if (meta_->num_bin - 2 > 0) { + *rand_threshold = meta_->rand.NextInt(0, meta_->num_bin - 2); + } + } + return gain_shift + meta_->config->min_gain_to_split; + } + void FuncForNumrical() { if (meta_->config->extra_trees) { if (meta_->config->monotone_constraints.empty()) { @@ -155,6 +271,119 @@ class FeatureHistogram { template void FuncForNumricalL3() { + if (meta_->config->use_quantized_grad) { +#define TEMPLATE_PREFIX_INT USE_RAND, USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING +#define LAMBDA_ARGUMENTS_INT \ + int64_t sum_gradient_and_hessian, double grad_scale, double hess_scale, const uint8_t hist_bits_bin, const uint8_t hist_bits_acc, data_size_t num_data, \ + const FeatureConstraint* constraints, double parent_output, SplitInfo *output +#define BEFORE_ARGUMENTS_INT sum_gradient_and_hessian, grad_scale, hess_scale, parent_output, num_data, output, &rand_threshold +#define FUNC_ARGUMENTS_INT \ + sum_gradient_and_hessian, grad_scale, hess_scale, num_data, constraints, min_gain_shift, \ + output, rand_threshold, parent_output + + if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) { + if (meta_->missing_type == MissingType::Zero) { + int_find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS_INT) { + int rand_threshold = 0; + double min_gain_shift = + BeforeNumericalInt( + BEFORE_ARGUMENTS_INT); + if (hist_bits_acc <= 16) { + CHECK_LE(hist_bits_bin, 16); + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + } else { + if (hist_bits_bin == 32) { + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + } else { + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + } + } + }; + } else { + int_find_best_threshold_fun_ = 
[=](LAMBDA_ARGUMENTS_INT) { + int rand_threshold = 0; + double min_gain_shift = + BeforeNumericalInt( + BEFORE_ARGUMENTS_INT); + if (hist_bits_acc <= 16) { + CHECK_LE(hist_bits_bin, 16); + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + } else { + if (hist_bits_bin == 32) { + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + } else { + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + } + } + }; + } + } else { + if (meta_->missing_type != MissingType::NaN) { + int_find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS_INT) { + int rand_threshold = 0; + double min_gain_shift = + BeforeNumericalInt( + BEFORE_ARGUMENTS_INT); + if (hist_bits_acc <= 16) { + CHECK_LE(hist_bits_bin, 16); + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + } else { + if (hist_bits_bin == 32) { + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + } else { + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + } + } + }; + } else { + int_find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS_INT) { + int rand_threshold = 0; + double min_gain_shift = + BeforeNumericalInt( + BEFORE_ARGUMENTS_INT); + if (hist_bits_acc <= 16) { + CHECK_LE(hist_bits_bin, 16); + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + } else { + if (hist_bits_bin == 32) { + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + } else { + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + } + } + output->default_left = false; + }; + } + } +#undef TEMPLATE_PREFIX_INT +#undef LAMBDA_ARGUMENTS_INT +#undef BEFORE_ARGUMENTS_INT +#undef FUNC_ARGURMENTS_INT + } else { #define TEMPLATE_PREFIX USE_RAND, USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING #define LAMBDA_ARGUMENTS \ double sum_gradient, double sum_hessian, data_size_t num_data, \ @@ -164,56 +393,57 @@ class FeatureHistogram { sum_gradient, sum_hessian, num_data, constraints, min_gain_shift, \ output, rand_threshold, parent_output - if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) { - if (meta_->missing_type == MissingType::Zero) { - find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) { - int rand_threshold = 0; - double min_gain_shift = - BeforeNumercal( - BEFORE_ARGUMENTS); - FindBestThresholdSequentially( - FUNC_ARGUMENTS); - FindBestThresholdSequentially( - FUNC_ARGUMENTS); - }; - } else { - find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) { - int rand_threshold = 0; - double min_gain_shift = - BeforeNumercal( - BEFORE_ARGUMENTS); - FindBestThresholdSequentially( - FUNC_ARGUMENTS); - FindBestThresholdSequentially( - FUNC_ARGUMENTS); - }; - } - } else { - if (meta_->missing_type != MissingType::NaN) { - find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) { - int rand_threshold = 0; - double min_gain_shift = - BeforeNumercal( - BEFORE_ARGUMENTS); - FindBestThresholdSequentially( - FUNC_ARGUMENTS); - }; + if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) { + if (meta_->missing_type == MissingType::Zero) { + find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) { + int rand_threshold = 0; + double min_gain_shift = + BeforeNumerical( + BEFORE_ARGUMENTS); + FindBestThresholdSequentially( + FUNC_ARGUMENTS); + FindBestThresholdSequentially( + FUNC_ARGUMENTS); + }; + } else { + find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) { + int rand_threshold = 0; + double min_gain_shift = + BeforeNumerical( + BEFORE_ARGUMENTS); + 
FindBestThresholdSequentially( + FUNC_ARGUMENTS); + FindBestThresholdSequentially( + FUNC_ARGUMENTS); + }; + } } else { - find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) { - int rand_threshold = 0; - double min_gain_shift = - BeforeNumercal( - BEFORE_ARGUMENTS); - FindBestThresholdSequentially( - FUNC_ARGUMENTS); - output->default_left = false; - }; + if (meta_->missing_type != MissingType::NaN) { + find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) { + int rand_threshold = 0; + double min_gain_shift = + BeforeNumerical( + BEFORE_ARGUMENTS); + FindBestThresholdSequentially( + FUNC_ARGUMENTS); + }; + } else { + find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) { + int rand_threshold = 0; + double min_gain_shift = + BeforeNumerical( + BEFORE_ARGUMENTS); + FindBestThresholdSequentially( + FUNC_ARGUMENTS); + output->default_left = false; + }; + } } - } #undef TEMPLATE_PREFIX #undef LAMBDA_ARGUMENTS #undef BEFORE_ARGUMENTS #undef FUNC_ARGURMENTS + } } void FuncForCategorical() { @@ -716,6 +946,14 @@ class FeatureHistogram { return (meta_->num_bin - meta_->offset) * kHistEntrySize; } + int SizeOfInt32Histgram() const { + return (meta_->num_bin - meta_->offset) * kInt32HistEntrySize; + } + + int SizeOfInt16Histgram() const { + return (meta_->num_bin - meta_->offset) * kInt16HistEntrySize; + } + /*! * \brief Restore histogram from memory */ @@ -724,6 +962,16 @@ class FeatureHistogram { (meta_->num_bin - meta_->offset) * kHistEntrySize); } + void FromMemoryInt32(char* memory_data) { + std::memcpy(data_, memory_data, + (meta_->num_bin - meta_->offset) * kInt32HistEntrySize); + } + + void FromMemoryInt16(char* memory_data) { + std::memcpy(data_int16_, memory_data, + (meta_->num_bin - meta_->offset) * kInt16HistEntrySize); + } + /*! * \brief True if this histogram can be splitted */ @@ -1082,14 +1330,312 @@ class FeatureHistogram { } } + template + void FindBestThresholdSequentiallyInt(int64_t int_sum_gradient_and_hessian, + const double grad_scale, const double hess_scale, + data_size_t num_data, + const FeatureConstraint* constraints, + double min_gain_shift, SplitInfo* output, + int rand_threshold, double parent_output) { + const int8_t offset = meta_->offset; + PACKED_HIST_ACC_T best_sum_left_gradient_and_hessian = 0; + PACKED_HIST_ACC_T local_int_sum_gradient_and_hessian = + HIST_BITS_ACC == 16 ? 
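/* All split finding in FindBestThresholdSequentiallyInt runs on packed integer sums. The
 * leaf's int64 total (gradient in the high 32 bits, hessian in the low 32 bits) is first
 * narrowed to the accumulator width, and real-valued quantities are recovered only where
 * needed via grad_scale / hess_scale; data counts are recovered from integer hessian sums
 * via cnt_factor. A rough numeric sketch (values illustrative; the scale formula is an
 * assumption based on the gradient discretizer, not taken from this hunk):
 *
 *   // num_grad_quant_bins = 4  ->  quantized gradients roughly in {-2, -1, 0, 1, 2}
 *   double grad_scale = max_gradient_abs / 2.0;               // assumed: max_abs / (bins / 2)
 *   int32_t int_sum_gradient = -3;                            // high half of the packed sum
 *   double  sum_gradient = int_sum_gradient * grad_scale;
 *   double  cnt_factor = num_data / static_cast<double>(int_sum_hessian);
 *   data_size_t left_count = Common::RoundInt(int_sum_left_hessian * cnt_factor);
 */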
+ ((static_cast(int_sum_gradient_and_hessian >> 32) << 16) | static_cast(int_sum_gradient_and_hessian & 0x0000ffff)) : + int_sum_gradient_and_hessian; + double best_gain = kMinScore; + uint32_t best_threshold = static_cast(meta_->num_bin); + const double cnt_factor = static_cast(num_data) / + static_cast(static_cast(int_sum_gradient_and_hessian & 0x00000000ffffffff)); + + BasicConstraint best_right_constraints; + BasicConstraint best_left_constraints; + bool constraint_update_necessary = + USE_MC && constraints->ConstraintDifferentDependingOnThreshold(); + + if (USE_MC) { + constraints->InitCumulativeConstraints(REVERSE); + } + + const PACKED_HIST_BIN_T* data_ptr = nullptr; + if (HIST_BITS_BIN == 16) { + data_ptr = reinterpret_cast(data_int16_); + } else { + data_ptr = reinterpret_cast(data_); + } + if (REVERSE) { + PACKED_HIST_ACC_T sum_right_gradient_and_hessian = 0; + + int t = meta_->num_bin - 1 - offset - NA_AS_MISSING; + const int t_end = 1 - offset; + + // from right to left, and we don't need data in bin0 + for (; t >= t_end; --t) { + // need to skip default bin + if (SKIP_DEFAULT_BIN) { + if ((t + offset) == static_cast(meta_->default_bin)) { + continue; + } + } + const PACKED_HIST_BIN_T grad_and_hess = data_ptr[t]; + if (HIST_BITS_ACC != HIST_BITS_BIN) { + const PACKED_HIST_ACC_T grad_and_hess_acc = HIST_BITS_BIN == 16 ? + ((static_cast(static_cast(grad_and_hess >> HIST_BITS_BIN)) << HIST_BITS_ACC) | + (static_cast(grad_and_hess & 0x0000ffff))) : + ((static_cast(static_cast(grad_and_hess >> HIST_BITS_BIN)) << HIST_BITS_ACC) | + (static_cast(grad_and_hess & 0x00000000ffffffff))); + sum_right_gradient_and_hessian += grad_and_hess_acc; + } else { + sum_right_gradient_and_hessian += grad_and_hess; + } + const uint32_t int_sum_right_hessian = HIST_BITS_ACC == 16 ? + static_cast(sum_right_gradient_and_hessian & 0x0000ffff) : + static_cast(sum_right_gradient_and_hessian & 0x00000000ffffffff); + data_size_t right_count = Common::RoundInt(int_sum_right_hessian * cnt_factor); + double sum_right_hessian = int_sum_right_hessian * hess_scale; + // if data not enough, or sum hessian too small + if (right_count < meta_->config->min_data_in_leaf || + sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) { + continue; + } + data_size_t left_count = num_data - right_count; + // if data not enough + if (left_count < meta_->config->min_data_in_leaf) { + break; + } + + const PACKED_HIST_ACC_T sum_left_gradient_and_hessian = local_int_sum_gradient_and_hessian - sum_right_gradient_and_hessian; + const uint32_t int_sum_left_hessian = HIST_BITS_ACC == 16 ? + static_cast(sum_left_gradient_and_hessian & 0x0000ffff) : + static_cast(sum_left_gradient_and_hessian & 0x00000000ffffffff); + double sum_left_hessian = int_sum_left_hessian * hess_scale; + // if sum hessian too small + if (sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) { + break; + } + + double sum_right_gradient = HIST_BITS_ACC == 16 ? + static_cast(static_cast(sum_right_gradient_and_hessian >> 16)) * grad_scale : + static_cast(static_cast(sum_right_gradient_and_hessian >> 32)) * grad_scale; + double sum_left_gradient = HIST_BITS_ACC == 16 ? 
+ static_cast(static_cast(sum_left_gradient_and_hessian >> 16)) * grad_scale : + static_cast(static_cast(sum_left_gradient_and_hessian >> 32)) * grad_scale; + if (USE_RAND) { + if (t - 1 + offset != rand_threshold) { + continue; + } + } + + if (USE_MC && constraint_update_necessary) { + constraints->Update(t + offset); + } + + // current split gain + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian + kEpsilon, sum_right_gradient, + sum_right_hessian + kEpsilon, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + constraints, meta_->monotone_type, meta_->config->path_smooth, + left_count, right_count, parent_output); + // gain with split is worse than without split + if (current_gain <= min_gain_shift) { + continue; + } + + // mark as able to be split + is_splittable_ = true; + // better split point + if (current_gain > best_gain) { + if (USE_MC) { + best_right_constraints = constraints->RightToBasicConstraint(); + best_left_constraints = constraints->LeftToBasicConstraint(); + if (best_right_constraints.min > best_right_constraints.max || + best_left_constraints.min > best_left_constraints.max) { + continue; + } + } + best_sum_left_gradient_and_hessian = sum_left_gradient_and_hessian; + // left is <= threshold, right is > threshold. so this is t-1 + best_threshold = static_cast(t - 1 + offset); + best_gain = current_gain; + } + } + } else { + PACKED_HIST_ACC_T sum_left_gradient_and_hessian = 0; + + int t = 0; + const int t_end = meta_->num_bin - 2 - offset; + + if (NA_AS_MISSING) { + if (offset == 1) { + sum_left_gradient_and_hessian = local_int_sum_gradient_and_hessian; + for (int i = 0; i < meta_->num_bin - offset; ++i) { + const PACKED_HIST_BIN_T grad_and_hess = data_ptr[i]; + if (HIST_BITS_ACC != HIST_BITS_BIN) { + const PACKED_HIST_ACC_T grad_and_hess_acc = HIST_BITS_BIN == 16 ? + ((static_cast(static_cast(grad_and_hess >> HIST_BITS_BIN)) << HIST_BITS_ACC) | + (static_cast(grad_and_hess & 0x0000ffff))) : + ((static_cast(static_cast(grad_and_hess >> HIST_BITS_BIN)) << HIST_BITS_ACC) | + (static_cast(grad_and_hess & 0x00000000ffffffff))); + sum_left_gradient_and_hessian -= grad_and_hess_acc; + } else { + sum_left_gradient_and_hessian -= grad_and_hess; + } + } + t = -1; + } + } + + for (; t <= t_end; ++t) { + if (SKIP_DEFAULT_BIN) { + if ((t + offset) == static_cast(meta_->default_bin)) { + continue; + } + } + if (t >= 0) { + const PACKED_HIST_BIN_T grad_and_hess = data_ptr[t]; + if (HIST_BITS_ACC != HIST_BITS_BIN) { + const PACKED_HIST_ACC_T grad_and_hess_acc = HIST_BITS_BIN == 16 ? + ((static_cast(static_cast(grad_and_hess >> HIST_BITS_BIN)) << HIST_BITS_ACC) | + (static_cast(grad_and_hess & 0x0000ffff))) : + ((static_cast(static_cast(grad_and_hess >> HIST_BITS_BIN)) << HIST_BITS_ACC) | + (static_cast(grad_and_hess & 0x00000000ffffffff))); + sum_left_gradient_and_hessian += grad_and_hess_acc; + } else { + sum_left_gradient_and_hessian += grad_and_hess; + } + } + // if data not enough, or sum hessian too small + const uint32_t int_sum_left_hessian = HIST_BITS_ACC == 16 ? 
+ static_cast(sum_left_gradient_and_hessian & 0x0000ffff) : + static_cast(sum_left_gradient_and_hessian & 0x00000000ffffffff); + const data_size_t left_count = Common::RoundInt(static_cast(int_sum_left_hessian) * cnt_factor); + const double sum_left_hessian = static_cast(int_sum_left_hessian) * hess_scale; + if (left_count < meta_->config->min_data_in_leaf || + sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) { + continue; + } + data_size_t right_count = num_data - left_count; + // if data not enough + if (right_count < meta_->config->min_data_in_leaf) { + break; + } + + const PACKED_HIST_ACC_T sum_right_gradient_and_hessian = local_int_sum_gradient_and_hessian - sum_left_gradient_and_hessian; + const uint32_t int_sum_right_hessian = HIST_BITS_ACC == 16 ? + static_cast(sum_right_gradient_and_hessian & 0x0000ffff) : + static_cast(sum_right_gradient_and_hessian & 0x00000000ffffffff); + const double sum_right_hessian = static_cast(int_sum_right_hessian) * hess_scale; + // if sum Hessian too small + if (sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) { + break; + } + + double sum_right_gradient = HIST_BITS_ACC == 16 ? + static_cast(static_cast(sum_right_gradient_and_hessian >> 16)) * grad_scale : + static_cast(static_cast(sum_right_gradient_and_hessian >> 32)) * grad_scale; + double sum_left_gradient = HIST_BITS_ACC == 16 ? + static_cast(static_cast(sum_left_gradient_and_hessian >> 16)) * grad_scale : + static_cast(static_cast(sum_left_gradient_and_hessian >> 32)) * grad_scale; + if (USE_RAND) { + if (t + offset != rand_threshold) { + continue; + } + } + // current split gain + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian + kEpsilon, sum_right_gradient, + sum_right_hessian + kEpsilon, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + constraints, meta_->monotone_type, meta_->config->path_smooth, left_count, + right_count, parent_output); + // gain with split is worse than without split + if (current_gain <= min_gain_shift) { + continue; + } + + // mark as able to be split + is_splittable_ = true; + // better split point + if (current_gain > best_gain) { + if (USE_MC) { + best_right_constraints = constraints->RightToBasicConstraint(); + best_left_constraints = constraints->LeftToBasicConstraint(); + if (best_right_constraints.min > best_right_constraints.max || + best_left_constraints.min > best_left_constraints.max) { + continue; + } + } + best_sum_left_gradient_and_hessian = sum_left_gradient_and_hessian; + best_threshold = static_cast(t + offset); + best_gain = current_gain; + } + } + } + + if (is_splittable_ && best_gain > output->gain + min_gain_shift) { + const int32_t int_best_sum_left_gradient = HIST_BITS_ACC == 16 ? + static_cast(static_cast(best_sum_left_gradient_and_hessian >> 16)) : + static_cast(best_sum_left_gradient_and_hessian >> 32); + const uint32_t int_best_sum_left_hessian = HIST_BITS_ACC == 16 ? + static_cast(best_sum_left_gradient_and_hessian & 0x0000ffff) : + static_cast(best_sum_left_gradient_and_hessian & 0x00000000ffffffff); + const double best_sum_left_gradient = static_cast(int_best_sum_left_gradient) * grad_scale; + const double best_sum_left_hessian = static_cast(int_best_sum_left_hessian) * hess_scale; + const int64_t best_sum_left_gradient_and_hessian_int64 = HIST_BITS_ACC == 16 ? 
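/* The winning split's packed accumulator sum is widened back to the canonical int64 layout
 * (gradient in the high 32 bits, hessian in the low 32 bits) and stored in SplitInfo next to
 * the rescaled double sums, so downstream consumers of the split (for example child-leaf
 * initialization) can work with integer totals consistent with the reduced histograms.
 * Sketch of the 16-bit to 64-bit widening that mirrors the expression below (names illustrative):
 *
 *   int64_t wide = (static_cast<int64_t>(static_cast<int16_t>(packed32 >> 16)) << 32)
 *                  | static_cast<uint32_t>(packed32 & 0x0000ffff);
 */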
+ ((static_cast(static_cast(best_sum_left_gradient_and_hessian >> 16)) << 32) | + static_cast(best_sum_left_gradient_and_hessian & 0x0000ffff)) : + best_sum_left_gradient_and_hessian; + const int64_t best_sum_right_gradient_and_hessian = int_sum_gradient_and_hessian - best_sum_left_gradient_and_hessian_int64; + const int32_t int_best_sum_right_gradient = static_cast(best_sum_right_gradient_and_hessian >> 32); + const uint32_t int_best_sum_right_hessian = static_cast(best_sum_right_gradient_and_hessian & 0x00000000ffffffff); + const double best_sum_right_gradient = static_cast(int_best_sum_right_gradient) * grad_scale; + const double best_sum_right_hessian = static_cast(int_best_sum_right_hessian) * hess_scale; + const data_size_t best_left_count = Common::RoundInt(static_cast(int_best_sum_left_hessian) * cnt_factor); + const data_size_t best_right_count = Common::RoundInt(static_cast(int_best_sum_right_hessian) * cnt_factor); + // update split information + output->threshold = best_threshold; + output->left_output = + CalculateSplittedLeafOutput( + best_sum_left_gradient, best_sum_left_hessian, + meta_->config->lambda_l1, meta_->config->lambda_l2, + meta_->config->max_delta_step, best_left_constraints, meta_->config->path_smooth, + best_left_count, parent_output); + output->left_count = best_left_count; + output->left_sum_gradient = best_sum_left_gradient; + output->left_sum_hessian = best_sum_left_hessian; + output->left_sum_gradient_and_hessian = best_sum_left_gradient_and_hessian_int64; + output->right_output = + CalculateSplittedLeafOutput( + best_sum_right_gradient, + best_sum_right_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + best_right_constraints, meta_->config->path_smooth, best_right_count, + parent_output); + output->right_count = best_right_count; + output->right_sum_gradient = best_sum_right_gradient; + output->right_sum_hessian = best_sum_right_hessian; + output->right_sum_gradient_and_hessian = best_sum_right_gradient_and_hessian; + output->gain = best_gain - min_gain_shift; + output->default_left = REVERSE; + } + } + const FeatureMetainfo* meta_; /*! 
\brief sum of gradient of each bin */ hist_t* data_; + int16_t* data_int16_; bool is_splittable_ = true; std::function find_best_threshold_fun_; + + std::function + int_find_best_threshold_fun_; }; class HistogramPool { @@ -1200,18 +1746,35 @@ class HistogramPool { pool_.resize(cache_size); data_.resize(cache_size); } - OMP_INIT_EX(); -#pragma omp parallel for schedule(static) - for (int i = old_cache_size; i < cache_size; ++i) { - OMP_LOOP_EX_BEGIN(); - pool_[i].reset(new FeatureHistogram[train_data->num_features()]); - data_[i].resize(num_total_bin * 2); - for (int j = 0; j < train_data->num_features(); ++j) { - pool_[i][j].Init(data_[i].data() + offsets[j] * 2, &feature_metas_[j]); + + if (config->use_quantized_grad) { + OMP_INIT_EX(); + #pragma omp parallel for schedule(static) + for (int i = old_cache_size; i < cache_size; ++i) { + OMP_LOOP_EX_BEGIN(); + pool_[i].reset(new FeatureHistogram[train_data->num_features()]); + data_[i].resize(num_total_bin); + for (int j = 0; j < train_data->num_features(); ++j) { + int16_t* data_ptr = reinterpret_cast(data_[i].data()); + pool_[i][j].Init(data_[i].data() + offsets[j], data_ptr + 2 * offsets[j], &feature_metas_[j]); + } + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); + } else { + OMP_INIT_EX(); + #pragma omp parallel for schedule(static) + for (int i = old_cache_size; i < cache_size; ++i) { + OMP_LOOP_EX_BEGIN(); + pool_[i].reset(new FeatureHistogram[train_data->num_features()]); + data_[i].resize(num_total_bin * 2); + for (int j = 0; j < train_data->num_features(); ++j) { + pool_[i][j].Init(data_[i].data() + offsets[j] * 2, &feature_metas_[j]); + } + OMP_LOOP_EX_END(); } - OMP_LOOP_EX_END(); + OMP_THROW_EX(); } - OMP_THROW_EX(); } void ResetConfig(const Dataset* train_data, const Config* config) { diff --git a/src/treelearner/gpu_tree_learner.cpp b/src/treelearner/gpu_tree_learner.cpp index f92da0fe9f76..294be28b6f86 100644 --- a/src/treelearner/gpu_tree_learner.cpp +++ b/src/treelearner/gpu_tree_learner.cpp @@ -991,7 +991,7 @@ void GPUTreeLearner::ConstructHistograms(const std::vector& is_feature_u nullptr, nullptr, nullptr, nullptr); // then construct sparse features on CPU - train_data_->ConstructHistograms(is_sparse_feature_used, + train_data_->ConstructHistograms(is_sparse_feature_used, smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), @@ -1056,7 +1056,7 @@ void GPUTreeLearner::ConstructHistograms(const std::vector& is_feature_u gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data()); // then construct sparse features on CPU - train_data_->ConstructHistograms(is_sparse_feature_used, + train_data_->ConstructHistograms(is_sparse_feature_used, larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), diff --git a/src/treelearner/gradient_discretizer.cpp b/src/treelearner/gradient_discretizer.cpp new file mode 100644 index 000000000000..4c00f73ab12c --- /dev/null +++ b/src/treelearner/gradient_discretizer.cpp @@ -0,0 +1,262 @@ +/*! + * Copyright (c) 2022 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ + +#include "gradient_discretizer.hpp" +#include + +#include +#include +#include + +namespace LightGBM { + +void GradientDiscretizer::Init( + const data_size_t num_data, const int num_leaves, + const int num_features, const Dataset* train_data) { + discretized_gradients_and_hessians_vector_.resize(num_data * 2); + gradient_random_values_.resize(num_data); + hessian_random_values_.resize(num_data); + random_values_use_start_eng_ = std::mt19937(random_seed_); + random_values_use_start_dist_ = std::uniform_int_distribution(0, num_data); + + const int num_threads = OMP_NUM_THREADS(); + int num_blocks = 0; + data_size_t block_size = 0; + Threading::BlockInfo(num_data, 512, &num_blocks, &block_size); + #pragma omp parallel for schedule(static, 1) num_threads(num_threads) + for (int thread_id = 0; thread_id < num_blocks; ++thread_id) { + const data_size_t start = thread_id * block_size; + const data_size_t end = std::min(start + block_size, num_data); + std::mt19937 gradient_random_values_eng(random_seed_ + thread_id); + std::uniform_real_distribution gradient_random_values_dist(0.0f, 1.0f); + std::mt19937 hessian_random_values_eng(random_seed_ + thread_id + num_threads); + std::uniform_real_distribution hessian_random_values_dist(0.0f, 1.0f); + for (data_size_t i = start; i < end; ++i) { + gradient_random_values_[i] = gradient_random_values_dist(gradient_random_values_eng); + hessian_random_values_[i] = hessian_random_values_dist(hessian_random_values_eng); + } + } + + max_gradient_abs_ = 0.0f; + max_hessian_abs_ = 0.0f; + + gradient_scale_ = 0.0f; + hessian_scale_ = 0.0f; + inverse_gradient_scale_ = 0.0f; + inverse_hessian_scale_ = 0.0f; + + num_leaves_ = num_leaves; + leaf_num_bits_in_histogram_bin_.resize(num_leaves_, 0); + node_num_bits_in_histogram_bin_.resize(num_leaves_, 0); + global_leaf_num_bits_in_histogram_bin_.resize(num_leaves_, 0); + global_node_num_bits_in_histogram_bin_.resize(num_leaves_, 0); + + leaf_grad_hess_stats_.resize(num_leaves_ * 2, 0.0); + change_hist_bits_buffer_.resize(num_features); + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (int feature_index = 0; feature_index < num_features; ++feature_index) { + const BinMapper* bin_mapper = train_data->FeatureBinMapper(feature_index); + change_hist_bits_buffer_[feature_index].resize((bin_mapper->num_bin() - static_cast(bin_mapper->GetMostFreqBin() == 0)) * 2); + } + + ordered_int_gradients_and_hessians_.resize(2 * num_data); +} + +void GradientDiscretizer::DiscretizeGradients( + const data_size_t num_data, + const score_t* input_gradients, + const score_t* input_hessians) { + double max_gradient = std::fabs(input_gradients[0]); + double max_hessian = std::fabs(input_hessians[0]); + const int num_threads = OMP_NUM_THREADS(); + std::vector thread_max_gradient(num_threads, max_gradient); + std::vector thread_max_hessian(num_threads, max_hessian); + Threading::For(0, num_data, 1024, + [input_gradients, input_hessians, &thread_max_gradient, &thread_max_hessian] + (int, data_size_t start, data_size_t end) { + int thread_id = omp_get_thread_num(); + for (data_size_t i = start; i < end; ++i) { + double fabs_grad = std::fabs(input_gradients[i]); + double fabs_hess = std::fabs(input_hessians[i]); + if (fabs_grad > thread_max_gradient[thread_id]) { + thread_max_gradient[thread_id] = fabs_grad; + } + if (fabs_hess > thread_max_hessian[thread_id]) { + thread_max_hessian[thread_id] = fabs_hess; + } + }}); + max_gradient = thread_max_gradient[0]; + max_hessian = thread_max_hessian[0]; + for (int thread_id = 
1; thread_id < num_threads; ++thread_id) { + if (max_gradient < thread_max_gradient[thread_id]) { + max_gradient = thread_max_gradient[thread_id]; + } + if (max_hessian < thread_max_hessian[thread_id]) { + max_hessian = thread_max_hessian[thread_id]; + } + } + if (Network::num_machines() > 1) { + max_gradient = Network::GlobalSyncUpByMax(max_gradient); + max_hessian = Network::GlobalSyncUpByMax(max_hessian); + } + max_gradient_abs_ = max_gradient; + max_hessian_abs_ = max_hessian; + gradient_scale_ = max_gradient_abs_ / static_cast<double>(num_grad_quant_bins_ / 2); + if (is_constant_hessian_) { + hessian_scale_ = max_hessian_abs_; + } else { + hessian_scale_ = max_hessian_abs_ / static_cast<double>(num_grad_quant_bins_); + } + inverse_gradient_scale_ = 1.0f / gradient_scale_; + inverse_hessian_scale_ = 1.0f / hessian_scale_; + + const int random_values_use_start = random_values_use_start_dist_(random_values_use_start_eng_); + int8_t* discretized_int8 = discretized_gradients_and_hessians_vector_.data(); + if (stochastic_rounding_) { + if (is_constant_hessian_) { + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (data_size_t i = 0; i < num_data; ++i) { + const double gradient = input_gradients[i]; + const data_size_t random_value_pos = (i + random_values_use_start) % num_data; + discretized_int8[2 * i + 1] = gradient >= 0.0f ? + static_cast<int8_t>(gradient * inverse_gradient_scale_ + gradient_random_values_[random_value_pos]) : + static_cast<int8_t>(gradient * inverse_gradient_scale_ - gradient_random_values_[random_value_pos]); + discretized_int8[2 * i] = static_cast<int8_t>(1); + } + } else { + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (data_size_t i = 0; i < num_data; ++i) { + const double gradient = input_gradients[i]; + const data_size_t random_value_pos = (i + random_values_use_start) % num_data; + discretized_int8[2 * i + 1] = gradient >= 0.0f ? + static_cast<int8_t>(gradient * inverse_gradient_scale_ + gradient_random_values_[random_value_pos]) : + static_cast<int8_t>(gradient * inverse_gradient_scale_ - gradient_random_values_[random_value_pos]); + discretized_int8[2 * i] = static_cast<int8_t>(input_hessians[i] * inverse_hessian_scale_ + hessian_random_values_[random_value_pos]); + } + } + } else { + if (is_constant_hessian_) { + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (data_size_t i = 0; i < num_data; ++i) { + const double gradient = input_gradients[i]; + discretized_int8[2 * i + 1] = gradient >= 0.0f ? + static_cast<int8_t>(gradient * inverse_gradient_scale_ + 0.5) : + static_cast<int8_t>(gradient * inverse_gradient_scale_ - 0.5); + discretized_int8[2 * i] = static_cast<int8_t>(1); + } + } else { + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (data_size_t i = 0; i < num_data; ++i) { + const double gradient = input_gradients[i]; + discretized_int8[2 * i + 1] = gradient >= 0.0f ? + static_cast<int8_t>(gradient * inverse_gradient_scale_ + 0.5) : + static_cast<int8_t>(gradient * inverse_gradient_scale_ - 0.5); + discretized_int8[2 * i] = static_cast<int8_t>(input_hessians[i] * inverse_hessian_scale_ + 0.5); + } + } + } +} + +template <bool IS_GLOBAL> +void GradientDiscretizer::SetNumBitsInHistogramBin( + const int left_leaf_index, const int right_leaf_index, + const data_size_t num_data_in_left_leaf, const data_size_t num_data_in_right_leaf) { + std::vector<int8_t>& leaf_num_bits_in_histogram_bin = IS_GLOBAL ? + global_leaf_num_bits_in_histogram_bin_ : leaf_num_bits_in_histogram_bin_; + std::vector<int8_t>& node_num_bits_in_histogram_bin = IS_GLOBAL ?
+ global_node_num_bits_in_histogram_bin_ : node_num_bits_in_histogram_bin_; + if (right_leaf_index == -1) { + const uint64_t max_stat_per_bin = static_cast(num_data_in_left_leaf) * static_cast(num_grad_quant_bins_); + if (max_stat_per_bin < 256) { + leaf_num_bits_in_histogram_bin[left_leaf_index] = 8; + } else if (max_stat_per_bin < 65536) { + leaf_num_bits_in_histogram_bin[left_leaf_index] = 16; + } else { + leaf_num_bits_in_histogram_bin[left_leaf_index] = 32; + } + } else { + const uint64_t max_stat_left_per_bin = static_cast(num_data_in_left_leaf) * static_cast(num_grad_quant_bins_); + const uint64_t max_stat_right_per_bin = static_cast(num_data_in_right_leaf) * static_cast(num_grad_quant_bins_); + node_num_bits_in_histogram_bin[left_leaf_index] = leaf_num_bits_in_histogram_bin[left_leaf_index]; + if (max_stat_left_per_bin < 256) { + leaf_num_bits_in_histogram_bin[left_leaf_index] = 8; + } else if (max_stat_left_per_bin < 65536) { + leaf_num_bits_in_histogram_bin[left_leaf_index] = 16; + } else { + leaf_num_bits_in_histogram_bin[left_leaf_index] = 32; + } + if (max_stat_right_per_bin < 256) { + leaf_num_bits_in_histogram_bin[right_leaf_index] = 8; + } else if (max_stat_right_per_bin < 65536) { + leaf_num_bits_in_histogram_bin[right_leaf_index] = 16; + } else { + leaf_num_bits_in_histogram_bin[right_leaf_index] = 32; + } + } +} + +template void GradientDiscretizer::SetNumBitsInHistogramBin( + const int left_leaf_index, const int right_leaf_index, + const data_size_t num_data_in_left_leaf, const data_size_t num_data_in_right_leaf); + +template void GradientDiscretizer::SetNumBitsInHistogramBin( + const int left_leaf_index, const int right_leaf_index, + const data_size_t num_data_in_left_leaf, const data_size_t num_data_in_right_leaf); + +void GradientDiscretizer::RenewIntGradTreeOutput( + Tree* tree, const Config* config, const DataPartition* data_partition, + const score_t* gradients, const score_t* hessians, + const std::function& leaf_index_to_global_num_data) { + global_timer.Start("GradientDiscretizer::RenewIntGradTreeOutput"); + if (config->tree_learner == std::string("data")) { + for (int leaf_id = 0; leaf_id < tree->num_leaves(); ++leaf_id) { + data_size_t leaf_cnt = 0; + const data_size_t* data_indices = data_partition->GetIndexOnLeaf(leaf_id, &leaf_cnt); + double sum_gradient = 0.0f, sum_hessian = 0.0f; + #pragma omp parallel for schedule(static) reduction(+:sum_gradient, sum_hessian) + for (data_size_t i = 0; i < leaf_cnt; ++i) { + const data_size_t index = data_indices[i]; + const score_t grad = gradients[index]; + const score_t hess = hessians[index]; + sum_gradient += grad; + sum_hessian += hess; + } + leaf_grad_hess_stats_[2 * leaf_id] = sum_gradient; + leaf_grad_hess_stats_[2 * leaf_id + 1] = sum_hessian; + } + std::vector global_leaf_grad_hess_stats = Network::GlobalSum(&leaf_grad_hess_stats_); + for (int leaf_id = 0; leaf_id < tree->num_leaves(); ++leaf_id) { + const double sum_gradient = global_leaf_grad_hess_stats[2 * leaf_id]; + const double sum_hessian = global_leaf_grad_hess_stats[2 * leaf_id + 1]; + const double leaf_output = FeatureHistogram::CalculateSplittedLeafOutput( + sum_gradient, sum_hessian, + config->lambda_l1, config->lambda_l2, config->max_delta_step, config->path_smooth, + leaf_index_to_global_num_data(leaf_id), 0.0f); + tree->SetLeafOutput(leaf_id, leaf_output); + } + } else { + for (int leaf_id = 0; leaf_id < tree->num_leaves(); ++leaf_id) { + data_size_t leaf_cnt = 0; + const data_size_t* data_indices = data_partition->GetIndexOnLeaf(leaf_id, 
&leaf_cnt); + double sum_gradient = 0.0f, sum_hessian = 0.0f; + #pragma omp parallel for schedule(static) reduction(+:sum_gradient, sum_hessian) + for (data_size_t i = 0; i < leaf_cnt; ++i) { + const data_size_t index = data_indices[i]; + const score_t grad = gradients[index]; + const score_t hess = hessians[index]; + sum_gradient += grad; + sum_hessian += hess; + } + const double leaf_output = FeatureHistogram::CalculateSplittedLeafOutput(sum_gradient, sum_hessian, + config->lambda_l1, config->lambda_l2, config->max_delta_step, config->path_smooth, + leaf_cnt, 0.0f); + tree->SetLeafOutput(leaf_id, leaf_output); + } + } + global_timer.Stop("GradientDiscretizer::RenewIntGradTreeOutput"); +} + +} // namespace LightGBM diff --git a/src/treelearner/gradient_discretizer.hpp b/src/treelearner/gradient_discretizer.hpp new file mode 100644 index 000000000000..352788f7d093 --- /dev/null +++ b/src/treelearner/gradient_discretizer.hpp @@ -0,0 +1,128 @@ +/*! + * Copyright (c) 2022 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ +#ifndef LIGHTGBM_TREE_LEARNER_GRADIENT_DISCRETIZER_HPP_ +#define LIGHTGBM_TREE_LEARNER_GRADIENT_DISCRETIZER_HPP_ + +#include +#include +#include +#include + +#include +#include + +#include "data_partition.hpp" +#include "feature_histogram.hpp" + +namespace LightGBM { + +class GradientDiscretizer { + public: + GradientDiscretizer(int num_grad_quant_bins, int num_trees, int random_seed, bool is_constant_hessian, const bool stochastic_rounding) { + num_grad_quant_bins_ = num_grad_quant_bins; + iter_ = 0; + num_trees_ = num_trees; + random_seed_ = random_seed; + is_constant_hessian_ = is_constant_hessian; + stochastic_rounding_ = stochastic_rounding; + } + + ~GradientDiscretizer() {} + + virtual void DiscretizeGradients( + const data_size_t num_data, + const score_t* input_gradients, + const score_t* input_hessians); + + virtual const int8_t* discretized_gradients_and_hessians() const { + return discretized_gradients_and_hessians_vector_.data(); + } + + virtual double grad_scale() const { + return gradient_scale_; + } + + virtual double hess_scale() const { + return hessian_scale_; + } + + virtual void Init( + const data_size_t num_data, const int num_leaves, + const int num_features, const Dataset* train_data); + + template + void SetNumBitsInHistogramBin( + const int left_leaf_index, const int right_leaf_index, + const data_size_t num_data_in_left_leaf, const data_size_t num_data_in_right_leaf); + + template + int8_t GetHistBitsInLeaf(const int leaf_index) { + if (IS_GLOBAL) { + return global_leaf_num_bits_in_histogram_bin_[leaf_index]; + } else { + return leaf_num_bits_in_histogram_bin_[leaf_index]; + } + } + + template + int8_t GetHistBitsInNode(const int node_index) { + if (IS_GLOBAL) { + return global_node_num_bits_in_histogram_bin_[node_index]; + } else { + return node_num_bits_in_histogram_bin_[node_index]; + } + } + + int8_t* ordered_int_gradients_and_hessians() { + return ordered_int_gradients_and_hessians_.data(); + } + + void RenewIntGradTreeOutput( + Tree* tree, const Config* config, const DataPartition* data_partition, + const score_t* gradients, const score_t* hessians, + const std::function& leaf_index_to_global_num_data); + + int32_t* GetChangeHistBitsBuffer(const int feature_index) { + return change_hist_bits_buffer_[feature_index].data(); + } + + protected: + int num_grad_quant_bins_; + int iter_; + int num_trees_; + int random_seed_; + bool 
stochastic_rounding_; + + std::vector gradient_random_values_; + std::vector hessian_random_values_; + std::mt19937 random_values_use_start_eng_; + std::uniform_int_distribution random_values_use_start_dist_; + std::vector discretized_gradients_and_hessians_vector_; + std::vector ordered_int_gradients_and_hessians_; + + double max_gradient_abs_; + double max_hessian_abs_; + + double gradient_scale_; + double hessian_scale_; + double inverse_gradient_scale_; + double inverse_hessian_scale_; + + bool is_constant_hessian_; + int num_leaves_; + + std::vector leaf_num_bits_in_histogram_bin_; + std::vector node_num_bits_in_histogram_bin_; + std::vector global_leaf_num_bits_in_histogram_bin_; + std::vector global_node_num_bits_in_histogram_bin_; + + std::vector leaf_grad_hess_stats_; + std::vector> change_hist_bits_buffer_; +}; + +} // namespace LightGBM + +#endif // LIGHTGBM_TREE_LEARNER_GRADIENT_DISCRETIZER_HPP_ diff --git a/src/treelearner/leaf_splits.hpp b/src/treelearner/leaf_splits.hpp index 46d8ce417857..163bfc4df9ca 100644 --- a/src/treelearner/leaf_splits.hpp +++ b/src/treelearner/leaf_splits.hpp @@ -85,6 +85,38 @@ class LeafSplits { sum_hessians_ = tmp_sum_hessians; } + + /*! + * \brief Init splits on the current leaf, it will traverse all data to sum up the results + * \param int_gradients_and_hessians Discretized gradients and hessians + * \param grad_scale Scaling factor to recover original gradients from discretized gradients + * \param hess_scale Scaling factor to recover original hessians from discretized hessians + */ + void Init(const int8_t* int_gradients_and_hessians, + const double grad_scale, const double hess_scale) { + num_data_in_leaf_ = num_data_; + leaf_index_ = 0; + data_indices_ = nullptr; + double tmp_sum_gradients = 0.0f; + double tmp_sum_hessians = 0.0f; + const int16_t* packed_int_gradients_and_hessians = reinterpret_cast(int_gradients_and_hessians); + int64_t tmp_sum_gradients_and_hessians = 0; +#pragma omp parallel for schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians, tmp_sum_gradients_and_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_) + for (data_size_t i = 0; i < num_data_in_leaf_; ++i) { + tmp_sum_gradients += int_gradients_and_hessians[2 * i + 1] * grad_scale; + tmp_sum_hessians += int_gradients_and_hessians[2 * i] * hess_scale; + const int16_t packed_int_grad_and_hess = packed_int_gradients_and_hessians[i]; + const int64_t packed_long_int_grad_and_hess = + (static_cast(static_cast(packed_int_grad_and_hess >> 8)) << 32) | + (static_cast(packed_int_grad_and_hess & 0x00ff)); + tmp_sum_gradients_and_hessians += packed_long_int_grad_and_hess; + } + sum_gradients_ = tmp_sum_gradients; + sum_hessians_ = tmp_sum_hessians; + int_sum_gradients_and_hessians_ = tmp_sum_gradients_and_hessians; + } + + /*! * \brief Init splits on current leaf of partial data. * \param leaf Index of current leaf @@ -109,6 +141,40 @@ class LeafSplits { } + /*! + * \brief Init splits on current leaf of partial data. 
+ * \param leaf Index of current leaf + * \param data_partition current data partition + * \param int_gradients_and_hessians Discretized gradients and hessians + * \param grad_scale Scaling factor to recover original gradients from discretized gradients + * \param hess_scale Scaling factor to recover original hessians from discretized hessians + */ + void Init(int leaf, const DataPartition* data_partition, + const int8_t* int_gradients_and_hessians, + const score_t grad_scale, const score_t hess_scale) { + leaf_index_ = leaf; + data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_); + double tmp_sum_gradients = 0.0f; + double tmp_sum_hessians = 0.0f; + const int16_t* packed_int_gradients_and_hessians = reinterpret_cast(int_gradients_and_hessians); + int64_t tmp_sum_gradients_and_hessians = 0; +#pragma omp parallel for schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians, tmp_sum_gradients_and_hessians) if (num_data_in_leaf_ >= 1024 && deterministic_) + for (data_size_t i = 0; i < num_data_in_leaf_; ++i) { + const data_size_t idx = data_indices_[i]; + tmp_sum_gradients += int_gradients_and_hessians[2 * idx + 1] * grad_scale; + tmp_sum_hessians += int_gradients_and_hessians[2 * idx] * hess_scale; + const int16_t packed_int_grad_and_hess = packed_int_gradients_and_hessians[i]; + const int64_t packed_long_int_grad_and_hess = + (static_cast(static_cast(packed_int_grad_and_hess >> 8)) << 32) | + (static_cast(packed_int_grad_and_hess & 0x00ff)); + tmp_sum_gradients_and_hessians += packed_long_int_grad_and_hess; + } + sum_gradients_ = tmp_sum_gradients; + sum_hessians_ = tmp_sum_hessians; + int_sum_gradients_and_hessians_ = tmp_sum_gradients_and_hessians; + } + + /*! * \brief Init splits on current leaf, only update sum_gradients and sum_hessians * \param sum_gradients @@ -120,6 +186,19 @@ class LeafSplits { sum_hessians_ = sum_hessians; } + /*! + * \brief Init splits on current leaf, only update sum_gradients and sum_hessians + * \param sum_gradients + * \param sum_hessians + * \param int_sum_gradients_and_hessians + */ + void Init(double sum_gradients, double sum_hessians, int64_t int_sum_gradients_and_hessians) { + leaf_index_ = 0; + sum_gradients_ = sum_gradients; + sum_hessians_ = sum_hessians; + int_sum_gradients_and_hessians_ = int_sum_gradients_and_hessians; + } + /*! * \brief Init splits on current leaf */ @@ -142,6 +221,9 @@ class LeafSplits { /*! \brief Get sum of Hessians of current leaf */ double sum_hessians() const { return sum_hessians_; } + /*! \brief Get sum of discretized gradients and Hessians of current leaf */ + int64_t int_sum_gradients_and_hessians() const { return int_sum_gradients_and_hessians_; } + /*! \brief Get indices of data of current leaf */ const data_size_t* data_indices() const { return data_indices_; } @@ -162,6 +244,8 @@ class LeafSplits { double sum_gradients_; /*! \brief sum of Hessians of current leaf */ double sum_hessians_; + /*! \brief sum of discretized gradients and Hessians of current leaf */ + int64_t int_sum_gradients_and_hessians_; /*! \brief indices of data of current leaf */ const data_size_t* data_indices_; /*! 
\brief weight of current leaf */ diff --git a/src/treelearner/parallel_tree_learner.h b/src/treelearner/parallel_tree_learner.h index 29f4e1688b99..b942dceab28b 100644 --- a/src/treelearner/parallel_tree_learner.h +++ b/src/treelearner/parallel_tree_learner.h @@ -71,15 +71,24 @@ class DataParallelTreeLearner: public TREELEARNER_T { } } + void PrepareBufferPos( + const std::vector>& feature_distribution, + std::vector* block_start, + std::vector* block_len, + std::vector* buffer_write_start_pos, + std::vector* buffer_read_start_pos, + comm_size_t* reduce_scatter_size, + size_t hist_entry_size); + private: /*! \brief Rank of local machine */ int rank_; /*! \brief Number of machines of this parallel task */ int num_machines_; /*! \brief Buffer for network send */ - std::vector input_buffer_; + std::vector> input_buffer_; /*! \brief Buffer for network receive */ - std::vector output_buffer_; + std::vector> output_buffer_; /*! \brief different machines will aggregate histograms for different features, use this to mark local aggregate features*/ std::vector is_feature_aggregated_; @@ -87,12 +96,22 @@ class DataParallelTreeLearner: public TREELEARNER_T { std::vector block_start_; /*! \brief Block size for reduce scatter */ std::vector block_len_; + /*! \brief Block start index for reduce scatter with int16 histograms */ + std::vector block_start_int16_; + /*! \brief Block size for reduce scatter with int16 histograms */ + std::vector block_len_int16_; /*! \brief Write positions for feature histograms */ std::vector buffer_write_start_pos_; /*! \brief Read positions for local feature histograms */ std::vector buffer_read_start_pos_; + /*! \brief Write positions for feature histograms with int16 histograms*/ + std::vector buffer_write_start_pos_int16_; + /*! \brief Read positions for local feature histograms with int16 histograms */ + std::vector buffer_read_start_pos_int16_; /*! \brief Size for reduce scatter */ comm_size_t reduce_scatter_size_; + /*! \brief Size for reduce scatter with int16 histogram*/ + comm_size_t reduce_scatter_size_int16_; /*! 
\brief Store global number of data in leaves */ std::vector global_data_count_in_leaf_; }; diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 5ca8a3f047f6..c322c1a796c2 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -21,6 +21,7 @@ namespace LightGBM { SerialTreeLearner::SerialTreeLearner(const Config* config) : config_(config), col_sampler_(config) { + gradient_discretizer_ = nullptr; } SerialTreeLearner::~SerialTreeLearner() { @@ -60,6 +61,11 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian ordered_gradients_.resize(num_data_); ordered_hessians_.resize(num_data_); + if (config_->use_quantized_grad) { + gradient_discretizer_.reset(new GradientDiscretizer(config_->num_grad_quant_bins, config_->num_iterations, config_->seed, is_constant_hessian, config_->stochastic_rounding)); + gradient_discretizer_->Init(num_data_, config_->num_leaves, num_features_, train_data_); + } + GetShareStates(train_data_, is_constant_hessian, true); histogram_pool_.DynamicChangeSize(train_data_, share_state_->num_hist_total_bin(), @@ -76,17 +82,31 @@ void SerialTreeLearner::GetShareStates(const Dataset* dataset, bool is_constant_hessian, bool is_first_time) { if (is_first_time) { - share_state_.reset(dataset->GetShareStates( - ordered_gradients_.data(), ordered_hessians_.data(), + if (config_->use_quantized_grad) { + share_state_.reset(dataset->GetShareStates( + reinterpret_cast(gradient_discretizer_->ordered_int_gradients_and_hessians()), nullptr, col_sampler_.is_feature_used_bytree(), is_constant_hessian, - config_->force_col_wise, config_->force_row_wise)); + config_->force_col_wise, config_->force_row_wise, config_->num_grad_quant_bins)); + } else { + share_state_.reset(dataset->GetShareStates( + ordered_gradients_.data(), ordered_hessians_.data(), + col_sampler_.is_feature_used_bytree(), is_constant_hessian, + config_->force_col_wise, config_->force_row_wise, config_->num_grad_quant_bins)); + } } else { CHECK_NOTNULL(share_state_); // cannot change is_hist_col_wise during training - share_state_.reset(dataset->GetShareStates( - ordered_gradients_.data(), ordered_hessians_.data(), col_sampler_.is_feature_used_bytree(), - is_constant_hessian, share_state_->is_col_wise, - !share_state_->is_col_wise)); + if (config_->use_quantized_grad) { + share_state_.reset(dataset->GetShareStates( + reinterpret_cast(gradient_discretizer_->ordered_int_gradients_and_hessians()), nullptr, + col_sampler_.is_feature_used_bytree(), is_constant_hessian, + share_state_->is_col_wise, !share_state_->is_col_wise, config_->num_grad_quant_bins)); + } else { + share_state_.reset(dataset->GetShareStates( + ordered_gradients_.data(), ordered_hessians_.data(), col_sampler_.is_feature_used_bytree(), + is_constant_hessian, share_state_->is_col_wise, + !share_state_->is_col_wise, config_->num_grad_quant_bins)); + } } CHECK_NOTNULL(share_state_); } @@ -169,6 +189,10 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians } share_state_->num_threads = num_threads; + if (config_->use_quantized_grad) { + gradient_discretizer_->DiscretizeGradients(num_data_, gradients_, hessians_); + } + // some initial works before training BeforeTrain(); @@ -205,6 +229,11 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians cur_depth = std::max(cur_depth, tree->leaf_depth(left_leaf)); } + if (config_->use_quantized_grad && config_->quant_train_renew_leaf) { + 
gradient_discretizer_->RenewIntGradTreeOutput(tree.get(), config_, data_partition_.get(), gradients_, hessians_, + [this] (int leaf_index) { return GetGlobalDataCountInLeaf(leaf_index); }); + } + Log::Debug("Trained a tree with leaves = %d and depth = %d", tree->num_leaves(), cur_depth); return tree.release(); } @@ -270,11 +299,25 @@ void SerialTreeLearner::BeforeTrain() { // Sumup for root if (data_partition_->leaf_count(0) == num_data_) { // use all data - smaller_leaf_splits_->Init(gradients_, hessians_); - + if (!config_->use_quantized_grad) { + smaller_leaf_splits_->Init(gradients_, hessians_); + } else { + smaller_leaf_splits_->Init( + gradient_discretizer_->discretized_gradients_and_hessians(), + gradient_discretizer_->grad_scale(), + gradient_discretizer_->hess_scale()); + } } else { // use bagging, only use part of data - smaller_leaf_splits_->Init(0, data_partition_.get(), gradients_, hessians_); + if (!config_->use_quantized_grad) { + smaller_leaf_splits_->Init(0, data_partition_.get(), gradients_, hessians_); + } else { + smaller_leaf_splits_->Init( + 0, data_partition_.get(), + gradient_discretizer_->discretized_gradients_and_hessians(), + gradient_discretizer_->grad_scale(), + gradient_discretizer_->hess_scale()); + } } larger_leaf_splits_->Init(); @@ -282,6 +325,10 @@ void SerialTreeLearner::BeforeTrain() { if (cegb_ != nullptr) { cegb_->BeforeTrain(); } + + if (config_->use_quantized_grad && config_->tree_learner != std::string("data")) { + gradient_discretizer_->SetNumBitsInHistogramBin(0, -1, data_partition_->leaf_count(0), 0); + } } bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) { @@ -353,22 +400,67 @@ void SerialTreeLearner::ConstructHistograms( Common::FunctionTimer fun_timer("SerialTreeLearner::ConstructHistograms", global_timer); // construct smaller leaf - hist_t* ptr_smaller_leaf_hist_data = - smaller_leaf_histogram_array_[0].RawData() - kHistOffset; - train_data_->ConstructHistograms( - is_feature_used, smaller_leaf_splits_->data_indices(), - smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, - ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(), - ptr_smaller_leaf_hist_data); - if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { - // construct larger leaf - hist_t* ptr_larger_leaf_hist_data = - larger_leaf_histogram_array_[0].RawData() - kHistOffset; - train_data_->ConstructHistograms( - is_feature_used, larger_leaf_splits_->data_indices(), - larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, + if (config_->use_quantized_grad) { + const uint8_t smaller_leaf_num_bits = gradient_discretizer_->GetHistBitsInLeaf(smaller_leaf_splits_->leaf_index()); + hist_t* ptr_smaller_leaf_hist_data = + smaller_leaf_num_bits <= 16 ? 
+ reinterpret_cast(smaller_leaf_histogram_array_[0].RawDataInt16() - kHistOffset) : + reinterpret_cast(smaller_leaf_histogram_array_[0].RawDataInt32() - kHistOffset); + #define SMALLER_LEAF_ARGS \ + is_feature_used, smaller_leaf_splits_->data_indices(), \ + smaller_leaf_splits_->num_data_in_leaf(), \ + reinterpret_cast(gradient_discretizer_->discretized_gradients_and_hessians()), \ + nullptr, \ + reinterpret_cast(gradient_discretizer_->ordered_int_gradients_and_hessians()), \ + nullptr, \ + share_state_.get(), \ + reinterpret_cast(ptr_smaller_leaf_hist_data) + if (smaller_leaf_num_bits <= 16) { + train_data_->ConstructHistograms(SMALLER_LEAF_ARGS); + } else { + train_data_->ConstructHistograms(SMALLER_LEAF_ARGS); + } + #undef SMALLER_LEAF_ARGS + if (larger_leaf_histogram_array_ && !use_subtract) { + const uint8_t larger_leaf_num_bits = gradient_discretizer_->GetHistBitsInLeaf(larger_leaf_splits_->leaf_index()); + hist_t* ptr_larger_leaf_hist_data = + larger_leaf_num_bits <= 16 ? + reinterpret_cast(larger_leaf_histogram_array_[0].RawDataInt16() - kHistOffset) : + reinterpret_cast(larger_leaf_histogram_array_[0].RawDataInt32() - kHistOffset); + #define LARGER_LEAF_ARGS \ + is_feature_used, larger_leaf_splits_->data_indices(), \ + larger_leaf_splits_->num_data_in_leaf(), \ + reinterpret_cast(gradient_discretizer_->discretized_gradients_and_hessians()), \ + nullptr, \ + reinterpret_cast(gradient_discretizer_->ordered_int_gradients_and_hessians()), \ + nullptr, \ + share_state_.get(), \ + reinterpret_cast(ptr_larger_leaf_hist_data) + if (larger_leaf_num_bits <= 16) { + train_data_->ConstructHistograms(LARGER_LEAF_ARGS); + } else { + train_data_->ConstructHistograms(LARGER_LEAF_ARGS); + } + #undef LARGER_LEAF_ARGS + } + } else { + hist_t* ptr_smaller_leaf_hist_data = + smaller_leaf_histogram_array_[0].RawData() - kHistOffset; + train_data_->ConstructHistograms( + is_feature_used, smaller_leaf_splits_->data_indices(), + smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(), - ptr_larger_leaf_hist_data); + ptr_smaller_leaf_hist_data); + if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { + // construct larger leaf + hist_t* ptr_larger_leaf_hist_data = + larger_leaf_histogram_array_[0].RawData() - kHistOffset; + train_data_->ConstructHistograms( + is_feature_used, larger_leaf_splits_->data_indices(), + larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, + ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(), + ptr_larger_leaf_hist_data); + } } } @@ -388,6 +480,26 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( if (larger_leaf_splits_->leaf_index() >= 0) { larger_node_used_features = col_sampler_.GetByNode(tree, larger_leaf_splits_->leaf_index()); } + + if (use_subtract && config_->use_quantized_grad) { + const int parent_index = std::min(smaller_leaf_splits_->leaf_index(), larger_leaf_splits_->leaf_index()); + const uint8_t parent_hist_bits = gradient_discretizer_->GetHistBitsInNode(parent_index); + const uint8_t larger_hist_bits = gradient_discretizer_->GetHistBitsInLeaf(larger_leaf_splits_->leaf_index()); + if (parent_hist_bits > 16 && larger_hist_bits <= 16) { + OMP_INIT_EX(); + #pragma omp parallel for schedule(static) num_threads(share_state_->num_threads) + for (int feature_index = 0; feature_index < num_features_; ++feature_index) { + OMP_LOOP_EX_BEGIN(); + if (!is_feature_used[feature_index]) { + continue; + } + 
larger_leaf_histogram_array_[feature_index].CopyToBuffer(gradient_discretizer_->GetChangeHistBitsBuffer(feature_index)); + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); + } + } + OMP_INIT_EX(); // find splits #pragma omp parallel for schedule(static) num_threads(share_state_->num_threads) @@ -397,10 +509,24 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( continue; } const int tid = omp_get_thread_num(); - train_data_->FixHistogram( - feature_index, smaller_leaf_splits_->sum_gradients(), - smaller_leaf_splits_->sum_hessians(), - smaller_leaf_histogram_array_[feature_index].RawData()); + if (config_->use_quantized_grad) { + const uint8_t hist_bits_bin = gradient_discretizer_->GetHistBitsInLeaf(smaller_leaf_splits_->leaf_index()); + const int64_t int_sum_gradient_and_hessian = smaller_leaf_splits_->int_sum_gradients_and_hessians(); + if (hist_bits_bin <= 16) { + train_data_->FixHistogramInt( + feature_index, int_sum_gradient_and_hessian, + reinterpret_cast(smaller_leaf_histogram_array_[feature_index].RawDataInt16())); + } else { + train_data_->FixHistogramInt( + feature_index, int_sum_gradient_and_hessian, + reinterpret_cast(smaller_leaf_histogram_array_[feature_index].RawDataInt32())); + } + } else { + train_data_->FixHistogram( + feature_index, smaller_leaf_splits_->sum_gradients(), + smaller_leaf_splits_->sum_hessians(), + smaller_leaf_histogram_array_[feature_index].RawData()); + } int real_fidx = train_data_->RealFeatureIndex(feature_index); ComputeBestSplitForFeature(smaller_leaf_histogram_array_, feature_index, @@ -417,13 +543,50 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( } if (use_subtract) { - larger_leaf_histogram_array_[feature_index].Subtract( - smaller_leaf_histogram_array_[feature_index]); + if (config_->use_quantized_grad) { + const int parent_index = std::min(smaller_leaf_splits_->leaf_index(), larger_leaf_splits_->leaf_index()); + const uint8_t parent_hist_bits = gradient_discretizer_->GetHistBitsInNode(parent_index); + const uint8_t smaller_hist_bits = gradient_discretizer_->GetHistBitsInLeaf(smaller_leaf_splits_->leaf_index()); + const uint8_t larger_hist_bits = gradient_discretizer_->GetHistBitsInLeaf(larger_leaf_splits_->leaf_index()); + if (parent_hist_bits <= 16) { + CHECK_LE(smaller_hist_bits, 16); + CHECK_LE(larger_hist_bits, 16); + larger_leaf_histogram_array_[feature_index].Subtract( + smaller_leaf_histogram_array_[feature_index]); + } else if (larger_hist_bits <= 16) { + CHECK_LE(smaller_hist_bits, 16); + larger_leaf_histogram_array_[feature_index].Subtract( + smaller_leaf_histogram_array_[feature_index], gradient_discretizer_->GetChangeHistBitsBuffer(feature_index)); + } else if (smaller_hist_bits <= 16) { + larger_leaf_histogram_array_[feature_index].Subtract( + smaller_leaf_histogram_array_[feature_index]); + } else { + larger_leaf_histogram_array_[feature_index].Subtract( + smaller_leaf_histogram_array_[feature_index]); + } + } else { + larger_leaf_histogram_array_[feature_index].Subtract( + smaller_leaf_histogram_array_[feature_index]); + } } else { - train_data_->FixHistogram( - feature_index, larger_leaf_splits_->sum_gradients(), - larger_leaf_splits_->sum_hessians(), - larger_leaf_histogram_array_[feature_index].RawData()); + if (config_->use_quantized_grad) { + const int64_t int_sum_gradient_and_hessian = larger_leaf_splits_->int_sum_gradients_and_hessians(); + const uint8_t hist_bits_bin = gradient_discretizer_->GetHistBitsInLeaf(larger_leaf_splits_->leaf_index()); + if (hist_bits_bin <= 16) { + train_data_->FixHistogramInt( + 
feature_index, int_sum_gradient_and_hessian, + reinterpret_cast(larger_leaf_histogram_array_[feature_index].RawDataInt16())); + } else { + train_data_->FixHistogramInt( + feature_index, int_sum_gradient_and_hessian, + reinterpret_cast(larger_leaf_histogram_array_[feature_index].RawDataInt32())); + } + } else { + train_data_->FixHistogram( + feature_index, larger_leaf_splits_->sum_gradients(), + larger_leaf_splits_->sum_hessians(), + larger_leaf_histogram_array_[feature_index].RawData()); + } } ComputeBestSplitForFeature(larger_leaf_histogram_array_, feature_index, @@ -699,6 +862,11 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, best_split_info.left_sum_hessian, best_split_info.left_output); } + if (config_->use_quantized_grad && config_->tree_learner != std::string("data")) { + gradient_discretizer_->SetNumBitsInHistogramBin(*left_leaf, *right_leaf, + data_partition_->leaf_count(*left_leaf), + data_partition_->leaf_count(*right_leaf)); + } auto leaves_need_update = constraints_->Update( is_numerical_split, *left_leaf, *right_leaf, best_split_info.monotone_type, best_split_info.right_output, @@ -762,9 +930,21 @@ void SerialTreeLearner::ComputeBestSplitForFeature( train_data_->FeatureNumBin(feature_index)); } SplitInfo new_split; - histogram_array_[feature_index].FindBestThreshold( - leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), num_data, - constraints_->GetFeatureConstraint(leaf_splits->leaf_index(), feature_index), parent_output, &new_split); + if (config_->use_quantized_grad) { + const uint8_t hist_bits_bin = gradient_discretizer_->GetHistBitsInLeaf(leaf_splits->leaf_index()); + histogram_array_[feature_index].FindBestThresholdInt( + leaf_splits->int_sum_gradients_and_hessians(), + gradient_discretizer_->grad_scale(), + gradient_discretizer_->hess_scale(), + hist_bits_bin, + hist_bits_bin, + num_data, + constraints_->GetFeatureConstraint(leaf_splits->leaf_index(), feature_index), parent_output, &new_split); + } else { + histogram_array_[feature_index].FindBestThreshold( + leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), num_data, + constraints_->GetFeatureConstraint(leaf_splits->leaf_index(), feature_index), parent_output, &new_split); + } new_split.feature = real_fidx; if (cegb_ != nullptr) { new_split.gain -= diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 14b78eb6a577..1f8e3add0d8c 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -24,6 +24,7 @@ #include "col_sampler.hpp" #include "data_partition.hpp" #include "feature_histogram.hpp" +#include "gradient_discretizer.hpp" #include "leaf_splits.hpp" #include "monotone_constraints.hpp" #include "split_info.hpp" @@ -170,6 +171,8 @@ class SerialTreeLearner: public TreeLearner { std::set FindAllForceFeatures(Json force_split_leaf_setting); + void CheckSplit(const SplitInfo& best_split_info, const int left_leaf_index, const int right_leaf_index); + /*! 
* \brief Get the number of data in a leaf * \param leaf_idx The index of leaf @@ -230,6 +233,7 @@ class SerialTreeLearner: public TreeLearner { const Json* forced_split_json_; std::unique_ptr share_state_; std::unique_ptr cegb_; + std::unique_ptr gradient_discretizer_; }; inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leaf_idx) const { diff --git a/src/treelearner/split_info.hpp b/src/treelearner/split_info.hpp index 644bd329b3a6..234105eb9a34 100644 --- a/src/treelearner/split_info.hpp +++ b/src/treelearner/split_info.hpp @@ -40,10 +40,14 @@ struct SplitInfo { double left_sum_gradient = 0; /*! \brief Left sum hessian after split */ double left_sum_hessian = 0; + /*! \brief Left sum discretized gradient and hessian after split */ + int64_t left_sum_gradient_and_hessian = 0; /*! \brief Right sum gradient after split */ double right_sum_gradient = 0; /*! \brief Right sum hessian after split */ double right_sum_hessian = 0; + /*! \brief Right sum discretized gradient and hessian after split */ + int64_t right_sum_gradient_and_hessian = 0; std::vector cat_threshold; /*! \brief True if default split is left */ bool default_left = true; @@ -71,10 +75,14 @@ struct SplitInfo { buffer += sizeof(left_sum_gradient); std::memcpy(buffer, &left_sum_hessian, sizeof(left_sum_hessian)); buffer += sizeof(left_sum_hessian); + std::memcpy(buffer, &left_sum_gradient_and_hessian, sizeof(left_sum_gradient_and_hessian)); + buffer += sizeof(left_sum_gradient_and_hessian); std::memcpy(buffer, &right_sum_gradient, sizeof(right_sum_gradient)); buffer += sizeof(right_sum_gradient); std::memcpy(buffer, &right_sum_hessian, sizeof(right_sum_hessian)); buffer += sizeof(right_sum_hessian); + std::memcpy(buffer, &right_sum_gradient_and_hessian, sizeof(right_sum_gradient_and_hessian)); + buffer += sizeof(right_sum_gradient_and_hessian); std::memcpy(buffer, &default_left, sizeof(default_left)); buffer += sizeof(default_left); std::memcpy(buffer, &monotone_type, sizeof(monotone_type)); @@ -103,10 +111,14 @@ struct SplitInfo { buffer += sizeof(left_sum_gradient); std::memcpy(&left_sum_hessian, buffer, sizeof(left_sum_hessian)); buffer += sizeof(left_sum_hessian); + std::memcpy(&left_sum_gradient_and_hessian, buffer, sizeof(left_sum_gradient_and_hessian)); + buffer += sizeof(left_sum_gradient_and_hessian); std::memcpy(&right_sum_gradient, buffer, sizeof(right_sum_gradient)); buffer += sizeof(right_sum_gradient); std::memcpy(&right_sum_hessian, buffer, sizeof(right_sum_hessian)); buffer += sizeof(right_sum_hessian); + std::memcpy(&right_sum_gradient_and_hessian, buffer, sizeof(right_sum_gradient_and_hessian)); + buffer += sizeof(right_sum_gradient_and_hessian); std::memcpy(&default_left, buffer, sizeof(default_left)); buffer += sizeof(default_left); std::memcpy(&monotone_type, buffer, sizeof(monotone_type)); diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 594f88f527ac..bac09fbe45bb 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -1854,3 +1854,44 @@ def test_predict_with_raw_score(task, output, cluster): if task.endswith('classification'): pred_proba_raw = model.predict_proba(dX, raw_score=True).compute() assert_eq(raw_predictions, pred_proba_raw) + + +def test_distributed_quantized_training(cluster): + with Client(cluster) as client: + X, y, w, _, dX, dy, dw, _ = _create_data( + objective='regression', + output='array' + ) + + np.savetxt("data_dask.csv", np.hstack([np.array([y]).T, X]), fmt="%f,%f,%f,%f,%f") 
+ + params = { + "boosting_type": 'gbdt', + "n_estimators": 50, + "num_leaves": 31, + 'use_quantized_grad': True, + 'num_grad_quant_bins': 30, + 'quant_train_renew_leaf': True, + 'verbose': -1, + 'force_row_wise': True, + } + + quant_dask_classifier = lgb.DaskLGBMRegressor( + client=client, + time_out=5, + **params + ) + quant_dask_classifier = quant_dask_classifier.fit(dX, dy, sample_weight=dw) + quant_p1 = quant_dask_classifier.predict(dX) + quant_rmse = np.sqrt(np.mean((quant_p1.compute() - y) ** 2)) + + params["use_quantized_grad"] = False + dask_classifier = lgb.DaskLGBMRegressor( + client=client, + time_out=5, + **params + ) + dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw) + p1 = dask_classifier.predict(dX) + rmse = np.sqrt(np.mean((p1.compute() - y) ** 2)) + assert quant_rmse < rmse + 7.0 diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index d152c2c359d3..a3f724bed00e 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -4055,3 +4055,19 @@ def test_train_raises_informative_error_for_params_of_wrong_type(): dtrain = lgb.Dataset(X, label=y) with pytest.raises(lgb.basic.LightGBMError, match="Parameter early_stopping_round should be of type int, got \"too-many\""): lgb.train(params, dtrain) + + +def test_quantized_training(): + X, y = make_synthetic_regression() + ds = lgb.Dataset(X, label=y) + bst_params = {'num_leaves': 15, 'verbose': -1, 'seed': 0} + bst = lgb.train(bst_params, ds, num_boost_round=10) + rmse = np.sqrt(np.mean((bst.predict(X) - y) ** 2)) + bst_params.update({ + 'use_quantized_grad': True, + 'num_grad_quant_bins': 30, + 'quant_train_renew_leaf': True, + }) + quant_bst = lgb.train(bst_params, ds, num_boost_round=10) + quant_rmse = np.sqrt(np.mean((quant_bst.predict(X) - y) ** 2)) + assert quant_rmse < rmse + 6.0 diff --git a/windows/LightGBM.vcxproj b/windows/LightGBM.vcxproj index 342616d27daa..269bf2ca5955 100644 --- a/windows/LightGBM.vcxproj +++ b/windows/LightGBM.vcxproj @@ -306,6 +306,7 @@ + @@ -341,6 +342,7 @@ + diff --git a/windows/LightGBM.vcxproj.filters b/windows/LightGBM.vcxproj.filters index ed591fc4d87a..27b445893c0f 100644 --- a/windows/LightGBM.vcxproj.filters +++ b/windows/LightGBM.vcxproj.filters @@ -51,6 +51,9 @@ src\treelearner + + src\treelearner + src\application @@ -338,5 +341,8 @@ src\treelearner + + src\treelearner + \ No newline at end of file
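
For readers who want to exercise the new quantized-training path end to end, the sketch below mirrors the new test_quantized_training test added in this patch. It is not part of the patch itself: the dataset comes from scikit-learn's make_regression rather than the test helper make_synthetic_regression, and the boosting-round count and parameter values are illustrative. Only use_quantized_grad, num_grad_quant_bins, quant_train_renew_leaf, and stochastic_rounding are actual parameters introduced here.

# Minimal usage sketch (not part of the patch); mirrors test_quantized_training.
# Dataset, round count, and the exact parameter values are illustrative assumptions.
import numpy as np
import lightgbm as lgb
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=10_000, n_features=20, noise=10.0, random_state=42)
ds = lgb.Dataset(X, label=y)
base_params = {"num_leaves": 15, "verbose": -1, "seed": 0}

# Full-precision baseline.
bst = lgb.train(base_params, ds, num_boost_round=50)
rmse = np.sqrt(np.mean((bst.predict(X) - y) ** 2))

# Quantized training: gradients and hessians are discretized into
# num_grad_quant_bins bins (the new tests in this patch use 30), leaf values are
# recomputed from the original gradients, and stochastic rounding is enabled.
quant_params = dict(base_params)
quant_params.update({
    "use_quantized_grad": True,
    "num_grad_quant_bins": 4,
    "quant_train_renew_leaf": True,
    "stochastic_rounding": True,
})
quant_bst = lgb.train(quant_params, ds, num_boost_round=50)
quant_rmse = np.sqrt(np.mean((quant_bst.predict(X) - y) ** 2))
print(f"full-precision RMSE: {rmse:.3f}  quantized RMSE: {quant_rmse:.3f}")

As in the new tests above, the expectation is that the quantized model's RMSE stays close to the full-precision baseline rather than matching it exactly.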