diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in
index 471bfc948cc3..ba9ef054bfab 100644
--- a/R-package/src/Makevars.in
+++ b/R-package/src/Makevars.in
@@ -48,6 +48,7 @@ OBJECTS = \
treelearner/data_parallel_tree_learner.o \
treelearner/feature_parallel_tree_learner.o \
treelearner/gpu_tree_learner.o \
+ treelearner/gradient_discretizer.o \
treelearner/linear_tree_learner.o \
treelearner/serial_tree_learner.o \
treelearner/tree_learner.o \
diff --git a/R-package/src/Makevars.win.in b/R-package/src/Makevars.win.in
index 8d39317b4a3a..fe4d31eb7746 100644
--- a/R-package/src/Makevars.win.in
+++ b/R-package/src/Makevars.win.in
@@ -49,6 +49,7 @@ OBJECTS = \
treelearner/data_parallel_tree_learner.o \
treelearner/feature_parallel_tree_learner.o \
treelearner/gpu_tree_learner.o \
+ treelearner/gradient_discretizer.o \
treelearner/linear_tree_learner.o \
treelearner/serial_tree_learner.o \
treelearner/tree_learner.o \
diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index abbd8cb14e14..aee1cc4e7f84 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -658,6 +658,38 @@ Learning Control Parameters
- **Note**: can be used only in CLI version
+- ``use_quantized_grad`` :raw-html:`🔗︎`, default = ``false``, type = bool
+
+ - whether to use gradient quantization when training
+
+ - enabling this will discretize (quantize) the gradients and hessians into bins of ``num_grad_quant_bins``
+
+  - with quantized training, most arithmetic in the training process will be done with integer operations
+
+ - gradient quantization can accelerate training, with little accuracy drop in most cases
+
+ - **Note**: can be used only with ``device_type = cpu``
+
+- ``num_grad_quant_bins`` :raw-html:`🔗︎`, default = ``4``, type = int
+
+  - number of bins used to quantize gradients and hessians
+
+ - with more bins, the quantized training will be closer to full precision training
+
+ - **Note**: can be used only with ``device_type = cpu``
+
+- ``quant_train_renew_leaf`` :raw-html:`🔗︎`, default = ``false``, type = bool
+
+  - whether to renew the leaf values with original gradients when using quantized training
+
+ - renewing is very helpful for good quantized training accuracy for ranking objectives
+
+ - **Note**: can be used only with ``device_type = cpu``
+
+- ``stochastic_rounding`` :raw-html:`🔗︎`, default = ``true``, type = bool
+
+ - whether to use stochastic rounding in gradient quantization
+
IO Parameters
-------------
diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h
index a6199bbbcbd2..ffb8f2844843 100644
--- a/include/LightGBM/bin.h
+++ b/include/LightGBM/bin.h
@@ -30,11 +30,14 @@ enum MissingType {
};
typedef double hist_t;
+typedef int32_t int_hist_t;
typedef uint64_t hist_cnt_t;
// check at compile time
static_assert(sizeof(hist_t) == sizeof(hist_cnt_t), "Histogram entry size is not correct");
const size_t kHistEntrySize = 2 * sizeof(hist_t);
+const size_t kInt32HistEntrySize = 2 * sizeof(int_hist_t);
+const size_t kInt16HistEntrySize = 2 * sizeof(int16_t);
const int kHistOffset = 2;
const double kSparseThreshold = 0.7;
@@ -56,6 +59,28 @@ inline static void HistogramSumReducer(const char* src, char* dst, int type_size
}
}
+inline static void Int32HistogramSumReducer(const char* src, char* dst, int type_size, comm_size_t len) {
+ const int64_t* src_ptr = reinterpret_cast(src);
+ int64_t* dst_ptr = reinterpret_cast(dst);
+ const comm_size_t steps = (len + (type_size * 2) - 1) / (type_size * 2);
+ const int num_threads = OMP_NUM_THREADS();
+ #pragma omp parallel for schedule(static) num_threads(num_threads)
+ for (comm_size_t i = 0; i < steps; ++i) {
+ dst_ptr[i] += src_ptr[i];
+ }
+}
+
+inline static void Int16HistogramSumReducer(const char* src, char* dst, int type_size, comm_size_t len) {
+ const int32_t* src_ptr = reinterpret_cast(src);
+ int32_t* dst_ptr = reinterpret_cast(dst);
+ const comm_size_t steps = (len + (type_size * 2) - 1) / (type_size * 2);
+ const int num_threads = OMP_NUM_THREADS();
+ #pragma omp parallel for schedule(static) num_threads(num_threads)
+ for (comm_size_t i = 0; i < steps; ++i) {
+ dst_ptr[i] += src_ptr[i];
+ }
+}
+
/*! \brief This class used to convert feature values into bin,
* and store some meta information for bin*/
class BinMapper {
@@ -332,6 +357,33 @@ class Bin {
const score_t* ordered_gradients, const score_t* ordered_hessians,
hist_t* out) const = 0;
+ virtual void ConstructHistogramInt8(
+ const data_size_t* data_indices, data_size_t start, data_size_t end,
+ const score_t* ordered_gradients, const score_t* ordered_hessians,
+ hist_t* out) const = 0;
+
+ virtual void ConstructHistogramInt8(data_size_t start, data_size_t end,
+ const score_t* ordered_gradients, const score_t* ordered_hessians,
+ hist_t* out) const = 0;
+
+ virtual void ConstructHistogramInt16(
+ const data_size_t* data_indices, data_size_t start, data_size_t end,
+ const score_t* ordered_gradients, const score_t* ordered_hessians,
+ hist_t* out) const = 0;
+
+ virtual void ConstructHistogramInt16(data_size_t start, data_size_t end,
+ const score_t* ordered_gradients, const score_t* ordered_hessians,
+ hist_t* out) const = 0;
+
+ virtual void ConstructHistogramInt32(
+ const data_size_t* data_indices, data_size_t start, data_size_t end,
+ const score_t* ordered_gradients, const score_t* ordered_hessians,
+ hist_t* out) const = 0;
+
+ virtual void ConstructHistogramInt32(data_size_t start, data_size_t end,
+ const score_t* ordered_gradients, const score_t* ordered_hessians,
+ hist_t* out) const = 0;
+
/*!
* \brief Construct histogram of this feature,
* Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
@@ -351,6 +403,24 @@ class Bin {
virtual void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, hist_t* out) const = 0;
+ virtual void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start, data_size_t end,
+ const score_t* ordered_gradients, hist_t* out) const = 0;
+
+ virtual void ConstructHistogramInt8(data_size_t start, data_size_t end,
+ const score_t* ordered_gradients, hist_t* out) const = 0;
+
+ virtual void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start, data_size_t end,
+ const score_t* ordered_gradients, hist_t* out) const = 0;
+
+ virtual void ConstructHistogramInt16(data_size_t start, data_size_t end,
+ const score_t* ordered_gradients, hist_t* out) const = 0;
+
+ virtual void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start, data_size_t end,
+ const score_t* ordered_gradients, hist_t* out) const = 0;
+
+ virtual void ConstructHistogramInt32(data_size_t start, data_size_t end,
+ const score_t* ordered_gradients, hist_t* out) const = 0;
+
virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin,
uint32_t default_bin, uint32_t most_freq_bin,
MissingType missing_type, bool default_left,
@@ -464,6 +534,57 @@ class MultiValBin {
const score_t* ordered_hessians,
hist_t* out) const = 0;
+ virtual void ConstructHistogramInt32(const data_size_t* data_indices,
+ data_size_t start, data_size_t end,
+ const score_t* gradients,
+ const score_t* hessians,
+ hist_t* out) const = 0;
+
+ virtual void ConstructHistogramInt32(data_size_t start, data_size_t end,
+ const score_t* gradients,
+ const score_t* hessians,
+ hist_t* out) const = 0;
+
+ virtual void ConstructHistogramOrderedInt32(const data_size_t* data_indices,
+ data_size_t start, data_size_t end,
+ const score_t* ordered_gradients,
+ const score_t* ordered_hessians,
+ hist_t* out) const = 0;
+
+ virtual void ConstructHistogramInt16(const data_size_t* data_indices,
+ data_size_t start, data_size_t end,
+ const score_t* gradients,
+ const score_t* hessians,
+ hist_t* out) const = 0;
+
+ virtual void ConstructHistogramInt16(data_size_t start, data_size_t end,
+ const score_t* gradients,
+ const score_t* hessians,
+ hist_t* out) const = 0;
+
+ virtual void ConstructHistogramOrderedInt16(const data_size_t* data_indices,
+ data_size_t start, data_size_t end,
+ const score_t* ordered_gradients,
+ const score_t* ordered_hessians,
+ hist_t* out) const = 0;
+
+ virtual void ConstructHistogramInt8(const data_size_t* data_indices,
+ data_size_t start, data_size_t end,
+ const score_t* gradients,
+ const score_t* hessians,
+ hist_t* out) const = 0;
+
+ virtual void ConstructHistogramInt8(data_size_t start, data_size_t end,
+ const score_t* gradients,
+ const score_t* hessians,
+ hist_t* out) const = 0;
+
+ virtual void ConstructHistogramOrderedInt8(const data_size_t* data_indices,
+ data_size_t start, data_size_t end,
+ const score_t* ordered_gradients,
+ const score_t* ordered_hessians,
+ hist_t* out) const = 0;
+
virtual void FinishLoad() = 0;
virtual bool IsSparse() = 0;
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index cbb2735baeb2..89318a7af246 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -592,6 +592,30 @@ struct Config {
// desc = **Note**: can be used only in CLI version
int snapshot_freq = -1;
+ // [no-save]
+ // desc = whether to use gradient quantization when training
+ // desc = enabling this will discretize (quantize) the gradients and hessians into bins of ``num_grad_quant_bins``
+  // desc = with quantized training, most arithmetic in the training process will be done with integer operations
+ // desc = gradient quantization can accelerate training, with little accuracy drop in most cases
+ // desc = **Note**: can be used only with ``device_type = cpu``
+ bool use_quantized_grad = false;
+
+ // [no-save]
+  // desc = number of bins used to quantize gradients and hessians
+ // desc = with more bins, the quantized training will be closer to full precision training
+ // desc = **Note**: can be used only with ``device_type = cpu``
+ int num_grad_quant_bins = 4;
+
+ // [no-save]
+  // desc = whether to renew the leaf values with original gradients when using quantized training
+ // desc = renewing is very helpful for good quantized training accuracy for ranking objectives
+ // desc = **Note**: can be used only with ``device_type = cpu``
+ bool quant_train_renew_leaf = false;
+
+ // [no-save]
+ // desc = whether to use stochastic rounding in gradient quantization
+ bool stochastic_rounding = true;
+
#ifndef __NVCC__
#pragma endregion
diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h
index 79c4ed196b09..825c5c6ebcf8 100644
--- a/include/LightGBM/dataset.h
+++ b/include/LightGBM/dataset.h
@@ -598,10 +598,11 @@ class Dataset {
MultiValBin* GetMultiBinFromAllFeatures(const std::vector& offsets) const;
+ template
TrainingShareStates* GetShareStates(
score_t* gradients, score_t* hessians,
const std::vector& is_feature_used, bool is_constant_hessian,
- bool force_col_wise, bool force_row_wise) const;
+ bool force_col_wise, bool force_row_wise, const int num_grad_quant_bins) const;
LIGHTGBM_EXPORT void FinishLoad();
@@ -636,7 +637,7 @@ class Dataset {
void InitTrain(const std::vector& is_feature_used,
TrainingShareStates* share_state) const;
- template
+ template
void ConstructHistogramsInner(const std::vector& is_feature_used,
const data_size_t* data_indices,
data_size_t num_data, const score_t* gradients,
@@ -646,7 +647,7 @@ class Dataset {
TrainingShareStates* share_state,
hist_t* hist_data) const;
- template
+ template
void ConstructHistogramsMultiVal(const data_size_t* data_indices,
data_size_t num_data,
const score_t* gradients,
@@ -654,6 +655,7 @@ class Dataset {
TrainingShareStates* share_state,
hist_t* hist_data) const;
+ template
inline void ConstructHistograms(
const std::vector& is_feature_used,
const data_size_t* data_indices, data_size_t num_data,
@@ -666,21 +668,21 @@ class Dataset {
bool use_indices = data_indices != nullptr && (num_data < num_data_);
if (share_state->is_constant_hessian) {
if (use_indices) {
- ConstructHistogramsInner(
+ ConstructHistogramsInner(
is_feature_used, data_indices, num_data, gradients, hessians,
ordered_gradients, ordered_hessians, share_state, hist_data);
} else {
- ConstructHistogramsInner(
+ ConstructHistogramsInner(
is_feature_used, data_indices, num_data, gradients, hessians,
ordered_gradients, ordered_hessians, share_state, hist_data);
}
} else {
if (use_indices) {
- ConstructHistogramsInner(
+ ConstructHistogramsInner(
is_feature_used, data_indices, num_data, gradients, hessians,
ordered_gradients, ordered_hessians, share_state, hist_data);
} else {
- ConstructHistogramsInner(
+ ConstructHistogramsInner(
is_feature_used, data_indices, num_data, gradients, hessians,
ordered_gradients, ordered_hessians, share_state, hist_data);
}
@@ -689,6 +691,9 @@ class Dataset {
void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, hist_t* data) const;
+ template
+ void FixHistogramInt(int feature_idx, int64_t sum_gradient_and_hessian, hist_t* data) const;
+
inline data_size_t Split(int feature, const uint32_t* threshold,
int num_threshold, bool default_left,
const data_size_t* data_indices,
diff --git a/include/LightGBM/train_share_states.h b/include/LightGBM/train_share_states.h
index 8c50734695b2..f102668edf70 100644
--- a/include/LightGBM/train_share_states.h
+++ b/include/LightGBM/train_share_states.h
@@ -19,7 +19,7 @@ namespace LightGBM {
class MultiValBinWrapper {
public:
MultiValBinWrapper(MultiValBin* bin, data_size_t num_data,
- const std::vector& feature_groups_contained);
+ const std::vector& feature_groups_contained, const int num_grad_quant_bins);
bool IsSparse() {
if (multi_val_bin_ != nullptr) {
@@ -34,15 +34,17 @@ class MultiValBinWrapper {
const data_size_t* bagging_use_indices,
data_size_t bagging_indices_cnt);
+ template
void HistMove(const std::vector>& hist_buf);
+ template
void HistMerge(std::vector>* hist_buf);
void ResizeHistBuf(std::vector>* hist_buf,
MultiValBin* sub_multi_val_bin,
hist_t* origin_hist_data);
- template
+ template
void ConstructHistograms(const data_size_t* data_indices,
data_size_t num_data,
const score_t* gradients,
@@ -59,55 +61,145 @@ class MultiValBinWrapper {
Threading::BlockInfo(num_threads_, num_data, min_block_size_,
&n_data_block_, &data_block_size_);
ResizeHistBuf(hist_buf, cur_multi_val_bin, origin_hist_data);
+ const int inner_hist_bits = (data_block_size_ * num_grad_quant_bins_ < 256 && HIST_BITS == 16) ? 8 : HIST_BITS;
OMP_INIT_EX();
#pragma omp parallel for schedule(static) num_threads(num_threads_)
for (int block_id = 0; block_id < n_data_block_; ++block_id) {
OMP_LOOP_EX_BEGIN();
data_size_t start = block_id * data_block_size_;
data_size_t end = std::min(start + data_block_size_, num_data);
- ConstructHistogramsForBlock(
- cur_multi_val_bin, start, end, data_indices, gradients, hessians,
- block_id, hist_buf);
+ if (inner_hist_bits == 8) {
+ ConstructHistogramsForBlock(
+ cur_multi_val_bin, start, end, data_indices, gradients, hessians,
+ block_id, hist_buf);
+ } else {
+ ConstructHistogramsForBlock(
+ cur_multi_val_bin, start, end, data_indices, gradients, hessians,
+ block_id, hist_buf);
+ }
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
global_timer.Stop("Dataset::sparse_bin_histogram");
global_timer.Start("Dataset::sparse_bin_histogram_merge");
- HistMerge(hist_buf);
+ if (inner_hist_bits == 8) {
+ HistMerge(hist_buf);
+ } else {
+ HistMerge(hist_buf);
+ }
global_timer.Stop("Dataset::sparse_bin_histogram_merge");
global_timer.Start("Dataset::sparse_bin_histogram_move");
- HistMove(*hist_buf);
+ if (inner_hist_bits == 8) {
+ HistMove(*hist_buf);
+ } else {
+ HistMove(*hist_buf);
+ }
global_timer.Stop("Dataset::sparse_bin_histogram_move");
}
}
- template
+ template
void ConstructHistogramsForBlock(const MultiValBin* sub_multi_val_bin,
data_size_t start, data_size_t end, const data_size_t* data_indices,
const score_t* gradients, const score_t* hessians, int block_id,
std::vector>* hist_buf) {
- hist_t* data_ptr = origin_hist_data_;
- if (block_id == 0) {
- if (is_use_subcol_) {
- data_ptr = hist_buf->data() + hist_buf->size() - 2 * static_cast(num_bin_aligned_);
+ if (USE_QUANT_GRAD) {
+ if (HIST_BITS == 8) {
+ int8_t* hist_buf_ptr = reinterpret_cast(hist_buf->data());
+ int8_t* data_ptr = hist_buf_ptr +
+ static_cast(num_bin_aligned_) * block_id * 2;
+ std::memset(reinterpret_cast(data_ptr), 0, num_bin_ * kInt8HistBufferEntrySize);
+ if (USE_INDICES) {
+ if (ORDERED) {
+ sub_multi_val_bin->ConstructHistogramOrderedInt8(data_indices, start, end,
+ gradients, hessians,
+ reinterpret_cast(data_ptr));
+ } else {
+ sub_multi_val_bin->ConstructHistogramInt8(data_indices, start, end, gradients,
+ hessians,
+ reinterpret_cast(data_ptr));
+ }
+ } else {
+ sub_multi_val_bin->ConstructHistogramInt8(start, end, gradients, hessians,
+ reinterpret_cast(data_ptr));
+ }
+ } else if (HIST_BITS == 16) {
+ int16_t* data_ptr = reinterpret_cast(origin_hist_data_);
+ int16_t* hist_buf_ptr = reinterpret_cast(hist_buf->data());
+ if (block_id == 0) {
+ if (is_use_subcol_) {
+ data_ptr = hist_buf_ptr + hist_buf->size() - 2 * static_cast(num_bin_aligned_);
+ }
+ } else {
+ data_ptr = hist_buf_ptr +
+ static_cast(num_bin_aligned_) * (block_id - 1) * 2;
+ }
+ std::memset(reinterpret_cast(data_ptr), 0, num_bin_ * kInt16HistBufferEntrySize);
+ if (USE_INDICES) {
+ if (ORDERED) {
+ sub_multi_val_bin->ConstructHistogramOrderedInt16(data_indices, start, end,
+ gradients, hessians,
+ reinterpret_cast(data_ptr));
+ } else {
+ sub_multi_val_bin->ConstructHistogramInt16(data_indices, start, end, gradients,
+ hessians,
+ reinterpret_cast(data_ptr));
+ }
+ } else {
+ sub_multi_val_bin->ConstructHistogramInt16(start, end, gradients, hessians,
+ reinterpret_cast(data_ptr));
+ }
+ } else {
+ int32_t* data_ptr = reinterpret_cast(origin_hist_data_);
+ int32_t* hist_buf_ptr = reinterpret_cast(hist_buf->data());
+ if (block_id == 0) {
+ if (is_use_subcol_) {
+ data_ptr = hist_buf_ptr + hist_buf->size() - 2 * static_cast(num_bin_aligned_);
+ }
+ } else {
+ data_ptr = hist_buf_ptr +
+ static_cast(num_bin_aligned_) * (block_id - 1) * 2;
+ }
+ std::memset(reinterpret_cast(data_ptr), 0, num_bin_ * kInt32HistBufferEntrySize);
+ if (USE_INDICES) {
+ if (ORDERED) {
+ sub_multi_val_bin->ConstructHistogramOrderedInt32(data_indices, start, end,
+ gradients, hessians,
+ reinterpret_cast(data_ptr));
+ } else {
+ sub_multi_val_bin->ConstructHistogramInt32(data_indices, start, end, gradients,
+ hessians,
+ reinterpret_cast(data_ptr));
+ }
+ } else {
+ sub_multi_val_bin->ConstructHistogramInt32(start, end, gradients, hessians,
+ reinterpret_cast(data_ptr));
+ }
}
} else {
- data_ptr = hist_buf->data() +
- static_cast(num_bin_aligned_) * (block_id - 1) * 2;
- }
- std::memset(reinterpret_cast(data_ptr), 0, num_bin_ * kHistBufferEntrySize);
- if (USE_INDICES) {
- if (ORDERED) {
- sub_multi_val_bin->ConstructHistogramOrdered(data_indices, start, end,
- gradients, hessians, data_ptr);
+ hist_t* data_ptr = origin_hist_data_;
+ if (block_id == 0) {
+ if (is_use_subcol_) {
+ data_ptr = hist_buf->data() + hist_buf->size() - 2 * static_cast(num_bin_aligned_);
+ }
} else {
- sub_multi_val_bin->ConstructHistogram(data_indices, start, end, gradients,
- hessians, data_ptr);
+ data_ptr = hist_buf->data() +
+ static_cast(num_bin_aligned_) * (block_id - 1) * 2;
+ }
+ std::memset(reinterpret_cast(data_ptr), 0, num_bin_ * kHistBufferEntrySize);
+ if (USE_INDICES) {
+ if (ORDERED) {
+ sub_multi_val_bin->ConstructHistogramOrdered(data_indices, start, end,
+ gradients, hessians, data_ptr);
+ } else {
+ sub_multi_val_bin->ConstructHistogram(data_indices, start, end, gradients,
+ hessians, data_ptr);
+ }
+ } else {
+ sub_multi_val_bin->ConstructHistogram(start, end, gradients, hessians,
+ data_ptr);
}
- } else {
- sub_multi_val_bin->ConstructHistogram(start, end, gradients, hessians,
- data_ptr);
}
}
@@ -162,10 +254,14 @@ class MultiValBinWrapper {
int data_block_size_;
int min_block_size_;
int num_data_;
+ int num_grad_quant_bins_;
hist_t* origin_hist_data_;
const size_t kHistBufferEntrySize = 2 * sizeof(hist_t);
+ const size_t kInt32HistBufferEntrySize = 2 * sizeof(int32_t);
+ const size_t kInt16HistBufferEntrySize = 2 * sizeof(int16_t);
+ const size_t kInt8HistBufferEntrySize = 2 * sizeof(int8_t);
};
struct TrainingShareStates {
@@ -193,7 +289,7 @@ struct TrainingShareStates {
void SetMultiValBin(MultiValBin* bin, data_size_t num_data,
const std::vector>& feature_groups,
- bool dense_only, bool sparse_only);
+ bool dense_only, bool sparse_only, const int num_grad_quant_bins);
void CalcBinOffsets(const std::vector>& feature_groups,
std::vector* offsets, bool is_col_wise);
@@ -210,14 +306,14 @@ struct TrainingShareStates {
}
}
- template
+ template
void ConstructHistograms(const data_size_t* data_indices,
data_size_t num_data,
const score_t* gradients,
const score_t* hessians,
hist_t* hist_data) {
if (multi_val_bin_wrapper_ != nullptr) {
- multi_val_bin_wrapper_->ConstructHistograms(
+ multi_val_bin_wrapper_->ConstructHistograms(
data_indices, num_data, gradients, hessians, &hist_buf_, hist_data);
}
}
diff --git a/src/io/config.cpp b/src/io/config.cpp
index 86b64a52d105..e8578046960a 100644
--- a/src/io/config.cpp
+++ b/src/io/config.cpp
@@ -378,6 +378,10 @@ void Config::CheckParamConflict() {
if (deterministic) {
Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic.");
}
+ if (use_quantized_grad) {
+ Log::Warning("Quantized training is not supported by GPU tree learner. Switch to full precision training.");
+ use_quantized_grad = false;
+ }
} else if (device_type == std::string("cuda")) {
// force row-wise for cuda version
force_col_wise = false;
@@ -385,6 +389,10 @@ void Config::CheckParamConflict() {
if (deterministic) {
Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic.");
}
+ if (use_quantized_grad) {
+ Log::Warning("Quantized training is not supported by CUDA tree learner. Switch to full precision training.");
+ use_quantized_grad = false;
+ }
}
// linear tree learner must be serial type and run on CPU device
if (linear_tree) {
diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp
index b1dbcc378a27..0906ba4b6439 100644
--- a/src/io/config_auto.cpp
+++ b/src/io/config_auto.cpp
@@ -251,6 +251,10 @@ const std::unordered_set& Config::parameter_set() {
"output_model",
"saved_feature_importance_type",
"snapshot_freq",
+ "use_quantized_grad",
+ "num_grad_quant_bins",
+ "quant_train_renew_leaf",
+ "stochastic_rounding",
"linear_tree",
"max_bin",
"max_bin_by_feature",
@@ -493,6 +497,14 @@ void Config::GetMembersFromString(const std::unordered_map>& Config::paramet
{"output_model", {"model_output", "model_out"}},
{"saved_feature_importance_type", {}},
{"snapshot_freq", {"save_period"}},
+ {"use_quantized_grad", {}},
+ {"num_grad_quant_bins", {}},
+ {"quant_train_renew_leaf", {}},
+ {"stochastic_rounding", {}},
{"linear_tree", {"linear_trees"}},
{"max_bin", {"max_bins"}},
{"max_bin_by_feature", {}},
@@ -966,6 +982,10 @@ const std::unordered_map& Config::ParameterTypes() {
{"output_model", "string"},
{"saved_feature_importance_type", "int"},
{"snapshot_freq", "int"},
+ {"use_quantized_grad", "bool"},
+ {"num_grad_quant_bins", "int"},
+ {"quant_train_renew_leaf", "bool"},
+ {"stochastic_rounding", "bool"},
{"linear_tree", "bool"},
{"max_bin", "int"},
{"max_bin_by_feature", "vector"},
diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index a8f449d3f55b..5b23f01ec3a0 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -608,10 +608,12 @@ MultiValBin* Dataset::GetMultiBinFromAllFeatures(const std::vector& of
return ret.release();
}
+template
TrainingShareStates* Dataset::GetShareStates(
score_t* gradients, score_t* hessians,
const std::vector& is_feature_used, bool is_constant_hessian,
- bool force_col_wise, bool force_row_wise) const {
+ bool force_col_wise, bool force_row_wise,
+ const int num_grad_quant_bins) const {
Common::FunctionTimer fun_timer("Dataset::TestMultiThreadingMethod",
global_timer);
if (force_col_wise && force_row_wise) {
@@ -631,7 +633,7 @@ TrainingShareStates* Dataset::GetShareStates(
share_state->CalcBinOffsets(
feature_groups_, &offsets, true);
share_state->SetMultiValBin(GetMultiBinFromSparseFeatures(offsets),
- num_data_, feature_groups_, false, true);
+ num_data_, feature_groups_, false, true, num_grad_quant_bins);
share_state->is_col_wise = true;
share_state->is_constant_hessian = is_constant_hessian;
return share_state;
@@ -641,7 +643,7 @@ TrainingShareStates* Dataset::GetShareStates(
share_state->CalcBinOffsets(
feature_groups_, &offsets, false);
share_state->SetMultiValBin(GetMultiBinFromAllFeatures(offsets), num_data_,
- feature_groups_, false, false);
+ feature_groups_, false, false, num_grad_quant_bins);
share_state->is_col_wise = false;
share_state->is_constant_hessian = is_constant_hessian;
return share_state;
@@ -658,14 +660,14 @@ TrainingShareStates* Dataset::GetShareStates(
std::vector col_wise_offsets;
col_wise_state->CalcBinOffsets(feature_groups_, &col_wise_offsets, true);
col_wise_state->SetMultiValBin(GetMultiBinFromSparseFeatures(col_wise_offsets), num_data_,
- feature_groups_, false, true);
+ feature_groups_, false, true, num_grad_quant_bins);
col_wise_init_time = std::chrono::steady_clock::now() - start_time;
start_time = std::chrono::steady_clock::now();
std::vector row_wise_offsets;
row_wise_state->CalcBinOffsets(feature_groups_, &row_wise_offsets, false);
row_wise_state->SetMultiValBin(GetMultiBinFromAllFeatures(row_wise_offsets), num_data_,
- feature_groups_, false, false);
+ feature_groups_, false, false, num_grad_quant_bins);
row_wise_init_time = std::chrono::steady_clock::now() - start_time;
uint64_t max_total_bin = std::max(row_wise_state->num_hist_total_bin(),
@@ -685,12 +687,12 @@ TrainingShareStates* Dataset::GetShareStates(
InitTrain(is_feature_used, row_wise_state.get());
std::chrono::duration col_wise_time, row_wise_time;
start_time = std::chrono::steady_clock::now();
- ConstructHistograms(is_feature_used, nullptr, num_data_, gradients,
+ ConstructHistograms(is_feature_used, nullptr, num_data_, gradients,
hessians, gradients, hessians, col_wise_state.get(),
hist_data.data());
col_wise_time = std::chrono::steady_clock::now() - start_time;
start_time = std::chrono::steady_clock::now();
- ConstructHistograms(is_feature_used, nullptr, num_data_, gradients,
+ ConstructHistograms(is_feature_used, nullptr, num_data_, gradients,
hessians, gradients, hessians, row_wise_state.get(),
hist_data.data());
row_wise_time = std::chrono::steady_clock::now() - start_time;
@@ -721,6 +723,24 @@ TrainingShareStates* Dataset::GetShareStates(
}
}
+template TrainingShareStates* Dataset::GetShareStates(
+ score_t* gradients, score_t* hessians,
+ const std::vector& is_feature_used, bool is_constant_hessian,
+ bool force_col_wise, bool force_row_wise,
+ const int num_grad_quant_bins) const;
+
+template TrainingShareStates* Dataset::GetShareStates(
+ score_t* gradients, score_t* hessians,
+ const std::vector& is_feature_used, bool is_constant_hessian,
+ bool force_col_wise, bool force_row_wise,
+ const int num_grad_quant_bins) const;
+
+template TrainingShareStates* Dataset::GetShareStates(
+ score_t* gradients, score_t* hessians,
+ const std::vector& is_feature_used, bool is_constant_hessian,
+ bool force_col_wise, bool force_row_wise,
+ const int num_grad_quant_bins) const;
+
void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) {
feature_groups_.clear();
num_features_ = dataset->num_features_;
@@ -1203,7 +1223,7 @@ void Dataset::InitTrain(const std::vector& is_feature_used,
is_feature_used);
}
-template
+template
void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices,
data_size_t num_data,
const score_t* gradients,
@@ -1212,18 +1232,18 @@ void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices,
hist_t* hist_data) const {
Common::FunctionTimer fun_time("Dataset::ConstructHistogramsMultiVal",
global_timer);
- share_state->ConstructHistograms(
+ share_state->ConstructHistograms(
data_indices, num_data, gradients, hessians, hist_data);
}
-template
+template
void Dataset::ConstructHistogramsInner(
const std::vector& is_feature_used, const data_size_t* data_indices,
data_size_t num_data, const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
TrainingShareStates* share_state, hist_t* hist_data) const {
if (!share_state->is_col_wise) {
- return ConstructHistogramsMultiVal(
+ return ConstructHistogramsMultiVal(
data_indices, num_data, gradients, hessians, share_state, hist_data);
}
std::vector used_dense_group;
@@ -1275,30 +1295,80 @@ void Dataset::ConstructHistogramsInner(
for (int gi = 0; gi < num_used_dense_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_dense_group[gi];
- auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
const int num_bin = feature_groups_[group]->num_total_bin_;
- std::memset(reinterpret_cast(data_ptr), 0,
- num_bin * kHistEntrySize);
- if (USE_HESSIAN) {
- if (USE_INDICES) {
- feature_groups_[group]->bin_data_->ConstructHistogram(
- data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess,
- data_ptr);
+ if (USE_QUANT_GRAD) {
+ if (HIST_BITS == 16) {
+ auto data_ptr = reinterpret_cast(reinterpret_cast(hist_data) + group_bin_boundaries_[group]);
+ std::memset(reinterpret_cast(data_ptr), 0,
+ num_bin * kInt16HistEntrySize);
+ if (USE_HESSIAN) {
+ if (USE_INDICES) {
+ feature_groups_[group]->bin_data_->ConstructHistogramInt16(
+ data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess,
+ data_ptr);
+ } else {
+ feature_groups_[group]->bin_data_->ConstructHistogramInt16(
+ 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr);
+ }
+ } else {
+ if (USE_INDICES) {
+ feature_groups_[group]->bin_data_->ConstructHistogramInt16(
+ data_indices, 0, num_data, ptr_ordered_grad,
+ data_ptr);
+ } else {
+ feature_groups_[group]->bin_data_->ConstructHistogramInt16(
+ 0, num_data, ptr_ordered_grad, data_ptr);
+ }
+ }
} else {
- feature_groups_[group]->bin_data_->ConstructHistogram(
- 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr);
+ auto data_ptr = hist_data + group_bin_boundaries_[group];
+ std::memset(reinterpret_cast(data_ptr), 0,
+ num_bin * kInt32HistEntrySize);
+ if (USE_HESSIAN) {
+ if (USE_INDICES) {
+ feature_groups_[group]->bin_data_->ConstructHistogramInt32(
+ data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess,
+ data_ptr);
+ } else {
+ feature_groups_[group]->bin_data_->ConstructHistogramInt32(
+ 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr);
+ }
+ } else {
+ if (USE_INDICES) {
+ feature_groups_[group]->bin_data_->ConstructHistogramInt32(
+ data_indices, 0, num_data, ptr_ordered_grad,
+ data_ptr);
+ } else {
+ feature_groups_[group]->bin_data_->ConstructHistogramInt32(
+ 0, num_data, ptr_ordered_grad, data_ptr);
+ }
+ }
}
} else {
- if (USE_INDICES) {
- feature_groups_[group]->bin_data_->ConstructHistogram(
- data_indices, 0, num_data, ptr_ordered_grad, data_ptr);
+ auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
+ std::memset(reinterpret_cast(data_ptr), 0,
+ num_bin * kHistEntrySize);
+ if (USE_HESSIAN) {
+ if (USE_INDICES) {
+ feature_groups_[group]->bin_data_->ConstructHistogram(
+ data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess,
+ data_ptr);
+ } else {
+ feature_groups_[group]->bin_data_->ConstructHistogram(
+ 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr);
+ }
} else {
- feature_groups_[group]->bin_data_->ConstructHistogram(
- 0, num_data, ptr_ordered_grad, data_ptr);
- }
- auto cnt_dst = reinterpret_cast(data_ptr + 1);
- for (int i = 0; i < num_bin * 2; i += 2) {
- data_ptr[i + 1] = static_cast(cnt_dst[i]) * hessians[0];
+ if (USE_INDICES) {
+ feature_groups_[group]->bin_data_->ConstructHistogram(
+ data_indices, 0, num_data, ptr_ordered_grad, data_ptr);
+ } else {
+ feature_groups_[group]->bin_data_->ConstructHistogram(
+ 0, num_data, ptr_ordered_grad, data_ptr);
+ }
+ auto cnt_dst = reinterpret_cast(data_ptr + 1);
+ for (int i = 0; i < num_bin * 2; i += 2) {
+ data_ptr[i + 1] = static_cast(cnt_dst[i]) * hessians[0];
+ }
}
}
OMP_LOOP_EX_END();
@@ -1307,43 +1377,78 @@ void Dataset::ConstructHistogramsInner(
}
global_timer.Stop("Dataset::dense_bin_histogram");
if (multi_val_groud_id >= 0) {
- if (num_used_dense_group > 0) {
- ConstructHistogramsMultiVal(
- data_indices, num_data, ptr_ordered_grad, ptr_ordered_hess,
- share_state,
- hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
+ if (USE_QUANT_GRAD) {
+ if (HIST_BITS == 32) {
+ int32_t* hist_data_ptr = reinterpret_cast<int32_t*>(hist_data);
+ if (num_used_dense_group > 0) {
+ ConstructHistogramsMultiVal<true, 32>(
+ data_indices, num_data, ptr_ordered_grad, ptr_ordered_hess,
+ share_state,
+ reinterpret_cast<hist_t*>(hist_data_ptr + group_bin_boundaries_[multi_val_groud_id] * 2));
+ } else {
+ ConstructHistogramsMultiVal<true, 32>(
+ data_indices, num_data, gradients, hessians, share_state,
+ reinterpret_cast<hist_t*>(hist_data_ptr + group_bin_boundaries_[multi_val_groud_id] * 2));
+ }
+ } else if (HIST_BITS == 16) {
+ int16_t* hist_data_ptr = reinterpret_cast<int16_t*>(hist_data);
+ if (num_used_dense_group > 0) {
+ ConstructHistogramsMultiVal<true, 16>(
+ data_indices, num_data, ptr_ordered_grad, ptr_ordered_hess,
+ share_state,
+ reinterpret_cast<hist_t*>(hist_data_ptr + group_bin_boundaries_[multi_val_groud_id] * 2));
+ } else {
+ ConstructHistogramsMultiVal<true, 16>(
+ data_indices, num_data, gradients, hessians, share_state,
+ reinterpret_cast<hist_t*>(hist_data_ptr + group_bin_boundaries_[multi_val_groud_id] * 2));
+ }
+ }
} else {
- ConstructHistogramsMultiVal(
- data_indices, num_data, gradients, hessians, share_state,
- hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
+ if (num_used_dense_group > 0) {
+ ConstructHistogramsMultiVal<false, 32>(
+ data_indices, num_data, ptr_ordered_grad, ptr_ordered_hess,
+ share_state,
+ hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
+ } else {
+ ConstructHistogramsMultiVal<false, 32>(
+ data_indices, num_data, gradients, hessians, share_state,
+ hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
+ }
}
}
}
// explicitly initialize template methods, for cross module call
-template void Dataset::ConstructHistogramsInner<true, true>(
- const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
- data_size_t num_data, const score_t* gradients, const score_t* hessians,
- score_t* ordered_gradients, score_t* ordered_hessians,
- TrainingShareStates* share_state, hist_t* hist_data) const;
+#define CONSTRUCT_HISTOGRAMS_INNER_PARMA \
+ const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices, \
+ data_size_t num_data, const score_t* gradients, const score_t* hessians, \
+ score_t* ordered_gradients, score_t* ordered_hessians, \
+ TrainingShareStates* share_state, hist_t* hist_data
-template void Dataset::ConstructHistogramsInner<true, false>(
- const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
- data_size_t num_data, const score_t* gradients, const score_t* hessians,
- score_t* ordered_gradients, score_t* ordered_hessians,
- TrainingShareStates* share_state, hist_t* hist_data) const;
+// explicitly initialize template methods, for cross module call
+template void Dataset::ConstructHistogramsInner<true, true, false, 0>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
-template void Dataset::ConstructHistogramsInner<false, true>(
- const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
- data_size_t num_data, const score_t* gradients, const score_t* hessians,
- score_t* ordered_gradients, score_t* ordered_hessians,
- TrainingShareStates* share_state, hist_t* hist_data) const;
+template void Dataset::ConstructHistogramsInner<true, false, false, 0>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
-template void Dataset::ConstructHistogramsInner<false, false>(
- const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
- data_size_t num_data, const score_t* gradients, const score_t* hessians,
- score_t* ordered_gradients, score_t* ordered_hessians,
- TrainingShareStates* share_state, hist_t* hist_data) const;
+template void Dataset::ConstructHistogramsInner<false, true, false, 0>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
+
+template void Dataset::ConstructHistogramsInner<false, false, false, 0>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
+
+template void Dataset::ConstructHistogramsInner<true, true, true, 16>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
+
+template void Dataset::ConstructHistogramsInner<true, false, true, 16>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
+
+template void Dataset::ConstructHistogramsInner<false, true, true, 16>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
+
+template void Dataset::ConstructHistogramsInner<false, false, true, 16>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
+
+template void Dataset::ConstructHistogramsInner<true, true, true, 32>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
+
+template void Dataset::ConstructHistogramsInner<true, false, true, 32>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
+
+template void Dataset::ConstructHistogramsInner<false, true, true, 32>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
+
+template void Dataset::ConstructHistogramsInner<false, false, true, 32>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
void Dataset::FixHistogram(int feature_idx, double sum_gradient,
double sum_hessian, hist_t* data) const {
@@ -1365,6 +1470,49 @@ void Dataset::FixHistogram(int feature_idx, double sum_gradient,
}
}
+template <typename PACKED_HIST_BIN_T, typename PACKED_HIST_ACC_T, int HIST_BITS_BIN, int HIST_BITS_ACC>
+void Dataset::FixHistogramInt(int feature_idx, int64_t int_sum_gradient_and_hessian, hist_t* data) const {
+ const int group = feature2group_[feature_idx];
+ const int sub_feature = feature2subfeature_[feature_idx];
+ const BinMapper* bin_mapper =
+ feature_groups_[group]->bin_mappers_[sub_feature].get();
+ const int most_freq_bin = bin_mapper->GetMostFreqBin();
+ PACKED_HIST_BIN_T* data_ptr = reinterpret_cast<PACKED_HIST_BIN_T*>(data);
+ PACKED_HIST_ACC_T int_sum_gradient_and_hessian_local = HIST_BITS_ACC == 16 ?
+ ((static_cast<PACKED_HIST_ACC_T>(int_sum_gradient_and_hessian >> 32) << 16) |
+ static_cast<PACKED_HIST_ACC_T>(int_sum_gradient_and_hessian & 0x0000ffff)) :
+ int_sum_gradient_and_hessian;
+ if (most_freq_bin > 0) {
+ const int num_bin = bin_mapper->num_bin();
+ if (HIST_BITS_BIN == HIST_BITS_ACC) {
+ for (int i = 0; i < num_bin; ++i) {
+ if (i != most_freq_bin) {
+ int_sum_gradient_and_hessian_local -= data_ptr[i];
+ }
+ }
+ data_ptr[most_freq_bin] = int_sum_gradient_and_hessian_local;
+ } else {
+ CHECK_EQ(HIST_BITS_ACC, 32);
+ CHECK_EQ(HIST_BITS_BIN, 16);
+ for (int i = 0; i < num_bin; ++i) {
+ if (i != most_freq_bin) {
+ const PACKED_HIST_BIN_T packed_hist = data_ptr[i];
+ const PACKED_HIST_ACC_T packed_hist_acc = (static_cast<PACKED_HIST_ACC_T>(static_cast<int16_t>(packed_hist >> 16)) << 32) |
+ static_cast<PACKED_HIST_ACC_T>(packed_hist & 0x0000ffff);
+ int_sum_gradient_and_hessian_local -= packed_hist_acc;
+ }
+ }
+ PACKED_HIST_BIN_T int_sum_gradient_and_hessian_local_bin =
+ (static_cast<PACKED_HIST_BIN_T>(int_sum_gradient_and_hessian_local >> 32) << 16) | static_cast<PACKED_HIST_BIN_T>(int_sum_gradient_and_hessian_local & 0x0000ffff);
+ data_ptr[most_freq_bin] = int_sum_gradient_and_hessian_local_bin;
+ }
+ }
+}
+
+template void Dataset::FixHistogramInt<int32_t, int64_t, 16, 32>(int feature_idx, int64_t int_sum_gradient_and_hessian, hist_t* data) const;
+
+template void Dataset::FixHistogramInt<int64_t, int64_t, 32, 32>(int feature_idx, int64_t int_sum_gradient_and_hessian, hist_t* data) const;
+
template
void PushVector(std::vector* dest, const std::vector& src) {
dest->reserve(dest->size() + src.size());
diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp
index 3d0f8db8e549..e612052e47d2 100644
--- a/src/io/dense_bin.hpp
+++ b/src/io/dense_bin.hpp
@@ -171,6 +171,146 @@ class DenseBin : public Bin {
}
+ template <bool USE_INDICES, bool USE_PREFETCH, bool USE_HESSIAN, typename PACKED_HIST_T, int HIST_BITS>
+ void ConstructHistogramIntInner(const data_size_t* data_indices,
+ data_size_t start, data_size_t end,
+ const score_t* ordered_gradients,
+ hist_t* out) const {
+ data_size_t i = start;
+ PACKED_HIST_T* out_ptr = reinterpret_cast<PACKED_HIST_T*>(out);
+ const int16_t* gradients_ptr = reinterpret_cast<const int16_t*>(ordered_gradients);
+ const VAL_T* data_ptr_base = data_.data();
+ if (USE_PREFETCH) {
+ const data_size_t pf_offset = 64 / sizeof(VAL_T);
+ const data_size_t pf_end = end - pf_offset;
+ for (; i < pf_end; ++i) {
+ const auto idx = USE_INDICES ? data_indices[i] : i;
+ const auto pf_idx =
+ USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset;
+ if (IS_4BIT) {
+ PREFETCH_T0(data_ptr_base + (pf_idx >> 1));
+ } else {
+ PREFETCH_T0(data_ptr_base + pf_idx);
+ }
+ const auto ti = static_cast<uint32_t>(data(idx));
+ const int16_t gradient_16 = gradients_ptr[i];
+ if (USE_HESSIAN) {
+ const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 :
+ (static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) | (gradient_16 & 0xff);
+ out_ptr[ti] += gradient_packed;
+ } else {
+ const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 :
+ (static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) | (1);
+ out_ptr[ti] += gradient_packed;
+ }
+ }
+ }
+ for (; i < end; ++i) {
+ const auto idx = USE_INDICES ? data_indices[i] : i;
+ const auto ti = static_cast<uint32_t>(data(idx));
+ const int16_t gradient_16 = gradients_ptr[i];
+ if (USE_HESSIAN) {
+ const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 :
+ (static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) | (gradient_16 & 0xff);
+ out_ptr[ti] += gradient_packed;
+ } else {
+ const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 :
+ (static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) | (1);
+ out_ptr[ti] += gradient_packed;
+ }
+ }
+ }
+
+ void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start,
+ data_size_t end, const score_t* ordered_gradients,
+ const score_t* /*ordered_hessians*/,
+ hist_t* out) const override {
+ ConstructHistogramIntInner<true, true, true, int16_t, 8>(
+ data_indices, start, end, ordered_gradients, out);
+ }
+
+ void ConstructHistogramInt8(data_size_t start, data_size_t end,
+ const score_t* ordered_gradients,
+ const score_t* /*ordered_hessians*/,
+ hist_t* out) const override {
+ ConstructHistogramIntInner<false, false, true, int16_t, 8>(
+ nullptr, start, end, ordered_gradients, out);
+ }
+
+ void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start,
+ data_size_t end, const score_t* ordered_gradients,
+ hist_t* out) const override {
+ ConstructHistogramIntInner<true, true, false, int16_t, 8>(
+ data_indices, start, end, ordered_gradients, out);
+ }
+
+ void ConstructHistogramInt8(data_size_t start, data_size_t end,
+ const score_t* ordered_gradients,
+ hist_t* out) const override {
+ ConstructHistogramIntInner<false, false, false, int16_t, 8>(
+ nullptr, start, end, ordered_gradients, out);
+ }
+
+ void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start,
+ data_size_t end, const score_t* ordered_gradients,
+ const score_t* /*ordered_hessians*/,
+ hist_t* out) const override {
+ ConstructHistogramIntInner<true, true, true, int32_t, 16>(
+ data_indices, start, end, ordered_gradients, out);
+ }
+
+ void ConstructHistogramInt16(data_size_t start, data_size_t end,
+ const score_t* ordered_gradients,
+ const score_t* /*ordered_hessians*/,
+ hist_t* out) const override {
+ ConstructHistogramIntInner<false, false, true, int32_t, 16>(
+ nullptr, start, end, ordered_gradients, out);
+ }
+
+ void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start,
+ data_size_t end, const score_t* ordered_gradients,
+ hist_t* out) const override {
+ ConstructHistogramIntInner<true, true, false, int32_t, 16>(
+ data_indices, start, end, ordered_gradients, out);
+ }
+
+ void ConstructHistogramInt16(data_size_t start, data_size_t end,
+ const score_t* ordered_gradients,
+ hist_t* out) const override {
+ ConstructHistogramIntInner<false, false, false, int32_t, 16>(
+ nullptr, start, end, ordered_gradients, out);
+ }
+
+ void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start,
+ data_size_t end, const score_t* ordered_gradients,
+ const score_t* /*ordered_hessians*/,
+ hist_t* out) const override {
+ ConstructHistogramIntInner<true, true, true, int64_t, 32>(
+ data_indices, start, end, ordered_gradients, out);
+ }
+
+ void ConstructHistogramInt32(data_size_t start, data_size_t end,
+ const score_t* ordered_gradients,
+ const score_t* /*ordered_hessians*/,
+ hist_t* out) const override {
+ ConstructHistogramIntInner<false, false, true, int64_t, 32>(
+ nullptr, start, end, ordered_gradients, out);
+ }
+
+ void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start,
+ data_size_t end, const score_t* ordered_gradients,
+ hist_t* out) const override {
+ ConstructHistogramIntInner<true, true, false, int64_t, 32>(
+ data_indices, start, end, ordered_gradients, out);
+ }
+
+ void ConstructHistogramInt32(data_size_t start, data_size_t end,
+ const score_t* ordered_gradients,
+ hist_t* out) const override {
+ ConstructHistogramIntInner<false, false, false, int64_t, 32>(
+ nullptr, start, end, ordered_gradients, out);
+ }
+
template
data_size_t SplitInner(uint32_t min_bin, uint32_t max_bin,
diff --git a/src/io/multi_val_dense_bin.hpp b/src/io/multi_val_dense_bin.hpp
index b4fbfbe673aa..780272bdc4e1 100644
--- a/src/io/multi_val_dense_bin.hpp
+++ b/src/io/multi_val_dense_bin.hpp
@@ -124,6 +124,123 @@ class MultiValDenseBin : public MultiValBin {
gradients, hessians, out);
}
+ template <bool USE_INDICES, bool USE_PREFETCH, bool ORDERED, typename PACKED_HIST_T, int HIST_BITS>
+ void ConstructHistogramIntInner(const data_size_t* data_indices, data_size_t start, data_size_t end,
+ const score_t* gradients_and_hessians, hist_t* out) const {
+ data_size_t i = start;
+ const VAL_T* data_ptr_base = data_.data();
+ const int16_t* gradients_and_hessians_ptr = reinterpret_cast<const int16_t*>(gradients_and_hessians);
+ PACKED_HIST_T* out_ptr = reinterpret_cast<PACKED_HIST_T*>(out);
+
+ if (USE_PREFETCH) {
+ const data_size_t pf_offset = 32 / sizeof(VAL_T);
+ const data_size_t pf_end = end - pf_offset;
+
+ for (; i < pf_end; ++i) {
+ const auto idx = USE_INDICES ? data_indices[i] : i;
+ const auto pf_idx = USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset;
+ if (!ORDERED) {
+ PREFETCH_T0(gradients_and_hessians_ptr + pf_idx);
+ }
+ PREFETCH_T0(data_ptr_base + RowPtr(pf_idx));
+ const auto j_start = RowPtr(idx);
+ const VAL_T* data_ptr = data_ptr_base + j_start;
+ const int16_t gradient_16 = gradients_and_hessians_ptr[idx];
+ const PACKED_HIST_T gradient_packed = (HIST_BITS == 8) ? gradient_16 :
+ ((static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) |
+ static_cast<PACKED_HIST_T>(gradient_16 & 0xff));
+ for (int j = 0; j < num_feature_; ++j) {
+ const uint32_t bin = static_cast<uint32_t>(data_ptr[j]);
+ const auto ti = (bin + offsets_[j]);
+ out_ptr[ti] += gradient_packed;
+ }
+ }
+ }
+ for (; i < end; ++i) {
+ const auto idx = USE_INDICES ? data_indices[i] : i;
+ const auto j_start = RowPtr(idx);
+ const VAL_T* data_ptr = data_ptr_base + j_start;
+ const int16_t gradient_16 = gradients_and_hessians_ptr[idx];
+ const PACKED_HIST_T gradient_packed = (HIST_BITS == 8) ? gradient_16 :
+ ((static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) |
+ static_cast<PACKED_HIST_T>(gradient_16 & 0xff));
+ for (int j = 0; j < num_feature_; ++j) {
+ const uint32_t bin = static_cast<uint32_t>(data_ptr[j]);
+ const auto ti = (bin + offsets_[j]);
+ out_ptr[ti] += gradient_packed;
+ }
+ }
+ }
+
+ void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start,
+ data_size_t end, const score_t* gradients,
+ const score_t* /*hessians*/, hist_t* out) const override {
+ ConstructHistogramIntInner<true, true, false, int64_t, 32>(data_indices, start, end,
+ gradients, out);
+ }
+
+ void ConstructHistogramInt32(data_size_t start, data_size_t end,
+ const score_t* gradients, const score_t* /*hessians*/,
+ hist_t* out) const override {
+ ConstructHistogramIntInner<false, false, false, int64_t, 32>(
+ nullptr, start, end, gradients, out);
+ }
+
+ void ConstructHistogramOrderedInt32(const data_size_t* data_indices,
+ data_size_t start, data_size_t end,
+ const score_t* gradients,
+ const score_t* /*hessians*/,
+ hist_t* out) const override {
+ ConstructHistogramIntInner<true, true, true, int64_t, 32>(data_indices, start, end,
+ gradients, out);
+ }
+
+ void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start,
+ data_size_t end, const score_t* gradients,
+ const score_t* /*hessians*/, hist_t* out) const override {
+ ConstructHistogramIntInner<true, true, false, int32_t, 16>(data_indices, start, end,
+ gradients, out);
+ }
+
+ void ConstructHistogramInt16(data_size_t start, data_size_t end,
+ const score_t* gradients, const score_t* /*hessians*/,
+ hist_t* out) const override {
+ ConstructHistogramIntInner<false, false, false, int32_t, 16>(
+ nullptr, start, end, gradients, out);
+ }
+
+ void ConstructHistogramOrderedInt16(const data_size_t* data_indices,
+ data_size_t start, data_size_t end,
+ const score_t* gradients,
+ const score_t* /*hessians*/,
+ hist_t* out) const override {
+ ConstructHistogramIntInner<true, true, true, int32_t, 16>(data_indices, start, end,
+ gradients, out);
+ }
+
+ void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start,
+ data_size_t end, const score_t* gradients,
+ const score_t* /*hessians*/, hist_t* out) const override {
+ ConstructHistogramIntInner<true, true, false, int16_t, 8>(data_indices, start, end,
+ gradients, out);
+ }
+
+ void ConstructHistogramInt8(data_size_t start, data_size_t end,
+ const score_t* gradients, const score_t* /*hessians*/,
+ hist_t* out) const override {
+ ConstructHistogramIntInner<false, false, false, int16_t, 8>(
+ nullptr, start, end, gradients, out);
+ }
+
+ void ConstructHistogramOrderedInt8(const data_size_t* data_indices,
+ data_size_t start, data_size_t end,
+ const score_t* gradients,
+ const score_t* /*hessians*/,
+ hist_t* out) const override {
+ ConstructHistogramIntInner<true, true, true, int16_t, 8>(data_indices, start, end,
+ gradients, out);
+ }
+
MultiValBin* CreateLike(data_size_t num_data, int num_bin, int num_feature, double,
const std::vector& offsets) const override {
return new MultiValDenseBin(num_data, num_bin, num_feature, offsets);
diff --git a/src/io/multi_val_sparse_bin.hpp b/src/io/multi_val_sparse_bin.hpp
index eaa30ef0a0cc..32a5a51b4f89 100644
--- a/src/io/multi_val_sparse_bin.hpp
+++ b/src/io/multi_val_sparse_bin.hpp
@@ -180,6 +180,124 @@ class MultiValSparseBin : public MultiValBin {
gradients, hessians, out);
}
+ template <bool USE_INDICES, bool USE_PREFETCH, bool ORDERED, typename PACKED_HIST_T, int HIST_BITS>
+ void ConstructHistogramIntInner(const data_size_t* data_indices,
+ data_size_t start, data_size_t end,
+ const score_t* gradients_and_hessians, hist_t* out) const {
+ data_size_t i = start;
+ PACKED_HIST_T* out_ptr = reinterpret_cast<PACKED_HIST_T*>(out);
+ const int16_t* gradients_and_hessians_ptr = reinterpret_cast<const int16_t*>(gradients_and_hessians);
+ const VAL_T* data_ptr = data_.data();
+ const INDEX_T* row_ptr_base = row_ptr_.data();
+ if (USE_PREFETCH) {
+ const data_size_t pf_offset = 32 / sizeof(VAL_T);
+ const data_size_t pf_end = end - pf_offset;
+
+ for (; i < pf_end; ++i) {
+ const auto idx = USE_INDICES ? data_indices[i] : i;
+ const auto pf_idx =
+ USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset;
+ if (!ORDERED) {
+ PREFETCH_T0(gradients_and_hessians_ptr + pf_idx);
+ }
+ PREFETCH_T0(row_ptr_base + pf_idx);
+ PREFETCH_T0(data_ptr + row_ptr_[pf_idx]);
+ const auto j_start = RowPtr(idx);
+ const auto j_end = RowPtr(idx + 1);
+ const int16_t gradient_16 = ORDERED ? gradients_and_hessians_ptr[i] : gradients_and_hessians_ptr[idx];
+ const PACKED_HIST_T gradient_packed = (HIST_BITS == 8) ? gradient_16 :
+ ((static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) |
+ static_cast<PACKED_HIST_T>(gradient_16 & 0xff));
+ for (auto j = j_start; j < j_end; ++j) {
+ const auto ti = static_cast<uint32_t>(data_ptr[j]);
+ out_ptr[ti] += gradient_packed;
+ }
+ }
+ }
+ for (; i < end; ++i) {
+ const auto idx = USE_INDICES ? data_indices[i] : i;
+ const auto j_start = RowPtr(idx);
+ const auto j_end = RowPtr(idx + 1);
+ const int16_t gradient_16 = ORDERED ? gradients_and_hessians_ptr[i] : gradients_and_hessians_ptr[idx];
+ const PACKED_HIST_T gradient_packed = (HIST_BITS == 8) ? gradient_16 :
+ ((static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) |
+ static_cast<PACKED_HIST_T>(gradient_16 & 0xff));
+ for (auto j = j_start; j < j_end; ++j) {
+ const auto ti = static_cast<uint32_t>(data_ptr[j]);
+ out_ptr[ti] += gradient_packed;
+ }
+ }
+ }
+
+ void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start,
+ data_size_t end, const score_t* gradients,
+ const score_t* /*hessians*/, hist_t* out) const override {
+ ConstructHistogramIntInner<true, true, false, int64_t, 32>(data_indices, start, end,
+ gradients, out);
+ }
+
+ void ConstructHistogramInt32(data_size_t start, data_size_t end,
+ const score_t* gradients, const score_t* /*hessians*/,
+ hist_t* out) const override {
+ ConstructHistogramIntInner<false, false, false, int64_t, 32>(
+ nullptr, start, end, gradients, out);
+ }
+
+ void ConstructHistogramOrderedInt32(const data_size_t* data_indices,
+ data_size_t start, data_size_t end,
+ const score_t* gradients,
+ const score_t* /*hessians*/,
+ hist_t* out) const override {
+ ConstructHistogramIntInner<true, true, true, int64_t, 32>(data_indices, start, end,
+ gradients, out);
+ }
+
+ void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start,
+ data_size_t end, const score_t* gradients,
+ const score_t* /*hessians*/, hist_t* out) const override {
+ ConstructHistogramIntInner<true, true, false, int32_t, 16>(data_indices, start, end,
+ gradients, out);
+ }
+
+ void ConstructHistogramInt16(data_size_t start, data_size_t end,
+ const score_t* gradients, const score_t* /*hessians*/,
+ hist_t* out) const override {
+ ConstructHistogramIntInner<false, false, false, int32_t, 16>(
+ nullptr, start, end, gradients, out);
+ }
+
+ void ConstructHistogramOrderedInt16(const data_size_t* data_indices,
+ data_size_t start, data_size_t end,
+ const score_t* gradients,
+ const score_t* /*hessians*/,
+ hist_t* out) const override {
+ ConstructHistogramIntInner<true, true, true, int32_t, 16>(data_indices, start, end,
+ gradients, out);
+ }
+
+ void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start,
+ data_size_t end, const score_t* gradients,
+ const score_t* /*hessians*/, hist_t* out) const override {
+ ConstructHistogramIntInner<true, true, false, int16_t, 8>(data_indices, start, end,
+ gradients, out);
+ }
+
+ void ConstructHistogramInt8(data_size_t start, data_size_t end,
+ const score_t* gradients, const score_t* /*hessians*/,
+ hist_t* out) const override {
+ ConstructHistogramIntInner<false, false, false, int16_t, 8>(
+ nullptr, start, end, gradients, out);
+ }
+
+ void ConstructHistogramOrderedInt8(const data_size_t* data_indices,
+ data_size_t start, data_size_t end,
+ const score_t* gradients,
+ const score_t* /*hessians*/,
+ hist_t* out) const override {
+ ConstructHistogramIntInner<true, true, true, int16_t, 8>(data_indices, start, end,
+ gradients, out);
+ }
+
MultiValBin* CreateLike(data_size_t num_data, int num_bin, int,
double estimate_element_per_row,
const std::vector& /*offsets*/) const override {
diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp
index e01c0afcf5bc..f7137d29ffd9 100644
--- a/src/io/sparse_bin.hpp
+++ b/src/io/sparse_bin.hpp
@@ -203,6 +203,184 @@ class SparseBin : public Bin {
}
#undef ACC_GH
+ template <bool USE_HESSIAN, typename PACKED_HIST_T, typename GRAD_HIST_T, typename HESS_HIST_T, int HIST_BITS>
+ void ConstructIntHistogramInner(data_size_t start, data_size_t end,
+ const score_t* ordered_gradients_and_hessians,
+ hist_t* out) const {
+ data_size_t i_delta, cur_pos;
+ InitIndex(start, &i_delta, &cur_pos);
+ if (USE_HESSIAN) {
+ PACKED_HIST_T* out_ptr = reinterpret_cast<PACKED_HIST_T*>(out);
+ const int16_t* gradients_and_hessians_ptr = reinterpret_cast<const int16_t*>(ordered_gradients_and_hessians);
+ while (cur_pos < start && i_delta < num_vals_) {
+ cur_pos += deltas_[++i_delta];
+ }
+ while (cur_pos < end && i_delta < num_vals_) {
+ const VAL_T bin = vals_[i_delta];
+ const int16_t gradient_16 = gradients_and_hessians_ptr[cur_pos];
+ const PACKED_HIST_T gradient_64 = (static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) | (gradient_16 & 0xff);
+ out_ptr[bin] += gradient_64;
+ cur_pos += deltas_[++i_delta];
+ }
+ } else {
+ GRAD_HIST_T* grad = reinterpret_cast<GRAD_HIST_T*>(out);
+ HESS_HIST_T* cnt = reinterpret_cast<HESS_HIST_T*>(out) + 1;
+ const int8_t* gradients_and_hessians_ptr = reinterpret_cast<const int8_t*>(ordered_gradients_and_hessians);
+ while (cur_pos < start && i_delta < num_vals_) {
+ cur_pos += deltas_[++i_delta];
+ }
+ while (cur_pos < end && i_delta < num_vals_) {
+ const uint32_t ti = static_cast<uint32_t>(vals_[i_delta]) << 1;
+ grad[ti] += gradients_and_hessians_ptr[cur_pos];
+ ++cnt[ti];
+ cur_pos += deltas_[++i_delta];
+ }
+ }
+ }
+
+ template <bool USE_HESSIAN, typename PACKED_HIST_T, typename GRAD_HIST_T, typename HESS_HIST_T, int HIST_BITS>
+ void ConstructIntHistogramInner(const data_size_t* data_indices, data_size_t start,
+ data_size_t end, const score_t* ordered_gradients_and_hessians,
+ hist_t* out) const {
+ data_size_t i_delta, cur_pos;
+ InitIndex(data_indices[start], &i_delta, &cur_pos);
+ data_size_t i = start;
+ if (USE_HESSIAN) {
+ PACKED_HIST_T* out_ptr = reinterpret_cast<PACKED_HIST_T*>(out);
+ const int16_t* gradients_and_hessians_ptr = reinterpret_cast<const int16_t*>(ordered_gradients_and_hessians);
+ for (;;) {
+ if (cur_pos < data_indices[i]) {
+ cur_pos += deltas_[++i_delta];
+ if (i_delta >= num_vals_) {
+ break;
+ }
+ } else if (cur_pos > data_indices[i]) {
+ if (++i >= end) {
+ break;
+ }
+ } else {
+ const VAL_T bin = vals_[i_delta];
+ const int16_t gradient_16 = gradients_and_hessians_ptr[i];
+ const PACKED_HIST_T gradient_packed = (HIST_BITS == 8) ? gradient_16 :
+ (static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) | (gradient_16 & 0xff);
+ out_ptr[bin] += gradient_packed;
+ if (++i >= end) {
+ break;
+ }
+ cur_pos += deltas_[++i_delta];
+ if (i_delta >= num_vals_) {
+ break;
+ }
+ }
+ }
+ } else {
+ GRAD_HIST_T* grad = reinterpret_cast<GRAD_HIST_T*>(out);
+ HESS_HIST_T* cnt = reinterpret_cast<HESS_HIST_T*>(out) + 1;
+ const int8_t* gradients_and_hessians_ptr = reinterpret_cast