Implement sketching with Hessian on GPU. (#9399)
- Prepare for implementing approx on GPU.
- Unify the code path between weighted and uniform sketching on DMatrix.
trivialfis authored Jul 24, 2023
1 parent 851cba9 commit a196443
Showing 14 changed files with 436 additions and 220 deletions.
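The new entry point is `DeviceSketchWithHessian`; the existing `DeviceSketch` becomes a thin inline wrapper that forwards an empty hessian span (see `src/common/hist_util.cuh` below). A minimal sketch of the two call paths, using only signatures declared in this commit; the include paths and the caller itself are illustrative, and a CUDA-enabled build plus an already-constructed `DMatrix` are assumed:

```cpp
#include <vector>

#include "xgboost/context.h"  // Context
#include "xgboost/data.h"     // DMatrix
#include "xgboost/span.h"     // common::Span

#include "src/common/hist_util.cuh"  // DeviceSketch, DeviceSketchWithHessian

namespace xgboost {
void SketchBothWays(Context const* ctx, DMatrix* p_fmat, std::vector<float> const& hess) {
  bst_bin_t constexpr kMaxBin = 256;
  // Uniform sketching: every row carries the same weight.
  auto uniform = common::DeviceSketch(ctx, p_fmat, kMaxBin);
  // Weighted sketching: rows are weighted by the hessian, as the approx method will do.
  auto weighted = common::DeviceSketchWithHessian(
      ctx, p_fmat, kMaxBin, common::Span<float const>{hess.data(), hess.size()});
  (void)uniform;
  (void)weighted;
}
}  // namespace xgboost
```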
12 changes: 6 additions & 6 deletions include/xgboost/data.h
@@ -185,10 +185,10 @@ class MetaInfo {
     return data_split_mode == DataSplitMode::kRow;
   }
 
-  /*! \brief Whether the data is split column-wise. */
-  bool IsColumnSplit() const {
-    return data_split_mode == DataSplitMode::kCol;
-  }
+  /** @brief Whether the data is split column-wise. */
+  bool IsColumnSplit() const { return data_split_mode == DataSplitMode::kCol; }
+  /** @brief Whether this is a learning to rank data. */
+  bool IsRanking() const { return !group_ptr_.empty(); }
 
   /*!
    * \brief A convenient method to check if we are doing vertical federated learning, which requires
@@ -249,7 +249,7 @@ struct BatchParam {
   /**
    * \brief Hessian, used for sketching with future approx implementation.
    */
-  common::Span<float> hess;
+  common::Span<float const> hess;
   /**
    * \brief Whether should we force DMatrix to regenerate the batch. Only used for
    * GHistIndex.
@@ -279,7 +279,7 @@ struct BatchParam {
    * Get batch with sketch weighted by hessian. The batch will be regenerated if the
    * span is changed, so caller should keep the span for each iteration.
    */
-  BatchParam(bst_bin_t max_bin, common::Span<float> hessian, bool regenerate)
+  BatchParam(bst_bin_t max_bin, common::Span<float const> hessian, bool regenerate)
       : max_bin{max_bin}, hess{hessian}, regen{regenerate} {}
 
   [[nodiscard]] bool ParamNotEqual(BatchParam const& other) const {
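The doc comment above is the key contract: the batch is regenerated whenever the hessian span changes, so the caller must keep the span (and its backing storage) alive and stable across iterations. A sketch of the intended usage, assuming the `GHistIndexMatrix` batch type from `src/data/gradient_index.h` and a `GetBatches` overload taking the context, as the `SparsePageDMatrix` methods further below do:

```cpp
#include <vector>

#include "src/data/gradient_index.h"  // GHistIndexMatrix
#include "xgboost/data.h"             // BatchParam, DMatrix

namespace xgboost {
void IterateWeightedBatches(Context const* ctx, DMatrix* p_fmat,
                            std::vector<float> const& hess) {
  // Keep the span stable for the whole loop; a changed span forces regeneration.
  common::Span<float const> hess_span{hess.data(), hess.size()};
  BatchParam param{/*max_bin=*/256, hess_span, /*regenerate=*/false};
  for (auto const& page : p_fmat->GetBatches<GHistIndexMatrix>(ctx, param)) {
    (void)page;  // build histograms from the index here
  }
}
}  // namespace xgboost
```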
8 changes: 5 additions & 3 deletions include/xgboost/host_device_vector.h
@@ -49,11 +49,12 @@
 #ifndef XGBOOST_HOST_DEVICE_VECTOR_H_
 #define XGBOOST_HOST_DEVICE_VECTOR_H_
 
+#include <xgboost/context.h>  // for DeviceOrd
+#include <xgboost/span.h>     // for Span
+
 #include <initializer_list>
-#include <vector>
 #include <type_traits>
-
-#include "span.h"
+#include <vector>
 
 namespace xgboost {
 
@@ -133,6 +134,7 @@ class HostDeviceVector {
   GPUAccess DeviceAccess() const;
 
   void SetDevice(int device) const;
+  void SetDevice(DeviceOrd device) const;
 
   void Resize(size_t new_size, T v = T());
 
6 changes: 3 additions & 3 deletions src/common/hist_util.cc
@@ -12,8 +12,8 @@
 #include "../data/gradient_index.h"  // for GHistIndexMatrix
 #include "quantile.h"
 #include "xgboost/base.h"
-#include "xgboost/context.h"  // Context
-#include "xgboost/data.h"     // SparsePage, SortedCSCPage
+#include "xgboost/context.h"  // for Context
+#include "xgboost/data.h"     // for SparsePage, SortedCSCPage
 
 #if defined(XGBOOST_MM_PREFETCH_PRESENT)
 #include <xmmintrin.h>
@@ -30,7 +30,7 @@ HistogramCuts::HistogramCuts() {
 }
 
 HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins, bool use_sorted,
-                              Span<float> const hessian) {
+                              Span<float const> hessian) {
   HistogramCuts out;
   auto const &info = m->Info();
   auto n_threads = ctx->Threads();
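The CPU sketcher now also takes `Span<float const>`, so the weighted and uniform paths share one calling convention on both devices; an empty span selects uniform weights. A minimal sketch against the updated signature (the wrapper function and include path are illustrative):

```cpp
#include "src/common/hist_util.h"  // HistogramCuts, SketchOnDMatrix

namespace xgboost {
common::HistogramCuts CpuCuts(Context const* ctx, DMatrix* m, common::Span<float const> hess) {
  // Sorted sketching is what approx uses; an empty hessian means uniform weights.
  return common::SketchOnDMatrix(ctx, m, /*max_bins=*/256, /*use_sorted=*/true, hess);
}
}  // namespace xgboost
```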
263 changes: 135 additions & 128 deletions src/common/hist_util.cu

Large diffs are not rendered by default.

44 changes: 33 additions & 11 deletions src/common/hist_util.cuh
@@ -11,14 +11,13 @@
 
 #include <cstddef>  // for size_t
 
-#include "../data/device_adapter.cuh"
+#include "../data/adapter.h"  // for IsValidFunctor
 #include "device_helpers.cuh"
 #include "hist_util.h"
 #include "quantile.cuh"
 #include "timer.h"
+#include "xgboost/span.h"  // for IterSpan
 
-namespace xgboost {
-namespace common {
+namespace xgboost::common {
 namespace cuda {
 /**
  * copy and paste of the host version, we can't make it a __host__ __device__ function as
@@ -246,10 +245,35 @@ void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_r
                                  dh::caching_device_vector<size_t>* p_column_sizes_scan);
 }  // namespace detail
 
-// Compute sketch on DMatrix.
-// sketch_batch_num_elements 0 means autodetect. Only modify this for testing.
-HistogramCuts DeviceSketch(int device, DMatrix* dmat, int max_bins,
-                           size_t sketch_batch_num_elements = 0);
+/**
+ * @brief Compute sketch on DMatrix with GPU and Hessian as weight.
+ *
+ * @param ctx Runtime context
+ * @param p_fmat Training feature matrix
+ * @param max_bin Maximum number of bins for each feature
+ * @param hessian Hessian vector.
+ * @param sketch_batch_num_elements 0 means autodetect. Only modify this for testing.
+ *
+ * @return Quantile cuts
+ */
+HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_bin_t max_bin,
+                                      Span<float const> hessian,
+                                      std::size_t sketch_batch_num_elements = 0);
+
+/**
+ * @brief Compute sketch on DMatrix with GPU.
+ *
+ * @param ctx Runtime context
+ * @param p_fmat Training feature matrix
+ * @param max_bin Maximum number of bins for each feature
+ * @param sketch_batch_num_elements 0 means autodetect. Only modify this for testing.
+ *
+ * @return Quantile cuts
+ */
+inline HistogramCuts DeviceSketch(Context const* ctx, DMatrix* p_fmat, bst_bin_t max_bin,
+                                  std::size_t sketch_batch_num_elements = 0) {
+  return DeviceSketchWithHessian(ctx, p_fmat, max_bin, {}, sketch_batch_num_elements);
+}
 
 template <typename AdapterBatch>
 void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
@@ -417,7 +441,5 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
     }
   }
 }
-}  // namespace common
-}  // namespace xgboost
-
+}  // namespace xgboost::common
 #endif  // COMMON_HIST_UTIL_CUH_
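Since `DeviceSketch` now only forwards an empty span to `DeviceSketchWithHessian`, the uniform path can be checked directly against the weighted one. A test-style sketch, assuming `HistogramCuts::Values()` exposes the flattened cut values and `CHECK` comes from `xgboost/logging.h`:

```cpp
#include "src/common/hist_util.cuh"  // DeviceSketch, DeviceSketchWithHessian
#include "xgboost/logging.h"         // CHECK

namespace xgboost {
void CheckWrapperForwards(Context const* ctx, DMatrix* p_fmat) {
  auto direct = common::DeviceSketchWithHessian(ctx, p_fmat, /*max_bin=*/256, /*hessian=*/{});
  auto wrapped = common::DeviceSketch(ctx, p_fmat, /*max_bin=*/256);
  // The wrapper adds no logic of its own, so the cuts must be identical.
  CHECK(direct.Values() == wrapped.Values());
}
}  // namespace xgboost
```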
2 changes: 1 addition & 1 deletion src/common/hist_util.h
@@ -172,7 +172,7 @@ class HistogramCuts {
  * but consumes more memory.
  */
 HistogramCuts SketchOnDMatrix(Context const* ctx, DMatrix* m, bst_bin_t max_bins,
-                              bool use_sorted = false, Span<float> const hessian = {});
+                              bool use_sorted = false, Span<float const> hessian = {});
 
 enum BinTypeSize : uint8_t {
   kUint8BinsTypeSize = 1,
3 changes: 3 additions & 0 deletions src/common/host_device_vector.cc
@@ -168,6 +168,9 @@ bool HostDeviceVector<T>::DeviceCanWrite() const {
 template <typename T>
 void HostDeviceVector<T>::SetDevice(int) const {}
 
+template <typename T>
+void HostDeviceVector<T>::SetDevice(DeviceOrd) const {}
+
 // explicit instantiations are required, as HostDeviceVector isn't header-only
 template class HostDeviceVector<bst_float>;
 template class HostDeviceVector<double>;
5 changes: 5 additions & 0 deletions src/common/host_device_vector.cu
@@ -394,6 +394,11 @@ void HostDeviceVector<T>::SetDevice(int device) const {
   impl_->SetDevice(device);
 }
 
+template <typename T>
+void HostDeviceVector<T>::SetDevice(DeviceOrd device) const {
+  impl_->SetDevice(device.ordinal);
+}
+
 template <typename T>
 void HostDeviceVector<T>::Resize(size_t new_size, T v) {
   impl_->Resize(new_size, v);
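The new overload accepts the strongly typed `DeviceOrd` and, in the CUDA translation unit, unwraps its `ordinal`; the CPU-only stub above keeps the same call compiling without CUDA. A usage sketch, assuming `DeviceOrd::CUDA` is the named constructor provided by `xgboost/context.h`:

```cpp
#include "xgboost/context.h"             // DeviceOrd
#include "xgboost/host_device_vector.h"  // HostDeviceVector

namespace xgboost {
void PlaceOnDevice(HostDeviceVector<float>* vec) {
  vec->SetDevice(DeviceOrd::CUDA(0));  // new, typed overload
  vec->SetDevice(0);                   // legacy int overload, still available
}
}  // namespace xgboost
```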
2 changes: 1 addition & 1 deletion src/data/ellpack_page.cu
@@ -131,7 +131,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchP
   monitor_.Start("Quantiles");
   // Create the quantile sketches for the dmatrix and initialize HistogramCuts.
   row_stride = GetRowStride(dmat);
-  cuts_ = common::DeviceSketch(ctx->gpu_id, dmat, param.max_bin);
+  cuts_ = common::DeviceSketch(ctx, dmat, param.max_bin);
   monitor_.Stop("Quantiles");
 
   monitor_.Start("InitCompressedData");
2 changes: 1 addition & 1 deletion src/data/gradient_index.cc
@@ -21,7 +21,7 @@ GHistIndexMatrix::GHistIndexMatrix() : columns_{std::make_unique<common::ColumnM
 
 GHistIndexMatrix::GHistIndexMatrix(Context const *ctx, DMatrix *p_fmat, bst_bin_t max_bins_per_feat,
                                    double sparse_thresh, bool sorted_sketch,
-                                   common::Span<float> hess)
+                                   common::Span<float const> hess)
     : max_numeric_bins_per_feat{max_bins_per_feat} {
   CHECK(p_fmat->SingleColBlock());
   // We use sorted sketching for approx tree method since it's more efficient in
2 changes: 1 addition & 1 deletion src/data/gradient_index.h
@@ -160,7 +160,7 @@ class GHistIndexMatrix {
    * \brief Constrcutor for SimpleDMatrix.
    */
   GHistIndexMatrix(Context const* ctx, DMatrix* x, bst_bin_t max_bins_per_feat,
-                   double sparse_thresh, bool sorted_sketch, common::Span<float> hess = {});
+                   double sparse_thresh, bool sorted_sketch, common::Span<float const> hess = {});
   /**
    * \brief Constructor for Iterative DMatrix. Initialize basic information and prepare
    * for push batch.
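On the CPU side the same const span reaches `GHistIndexMatrix`, whose `SimpleDMatrix` constructor threads the hessian into `SketchOnDMatrix`. A sketch of building a weighted index; the `sparse_thresh` value is arbitrary and chosen only for illustration:

```cpp
#include "src/data/gradient_index.h"  // GHistIndexMatrix

namespace xgboost {
void BuildWeightedIndex(Context const* ctx, DMatrix* p_fmat, common::Span<float const> hess) {
  // Sorted sketching plus hessian weights, matching what approx requests.
  // Note: the constructor CHECKs for a single in-core block (SimpleDMatrix).
  GHistIndexMatrix gidx{ctx, p_fmat, /*max_bins_per_feat=*/256,
                        /*sparse_thresh=*/0.2, /*sorted_sketch=*/true, hess};
  (void)gidx;  // feed into histogram building
}
}  // namespace xgboost
```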
4 changes: 2 additions & 2 deletions src/data/sparse_page_dmatrix.cu
@@ -25,8 +25,8 @@ BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const* ctx,
   cache_info_.erase(id);
   MakeCache(this, ".ellpack.page", cache_prefix_, &cache_info_);
   std::unique_ptr<common::HistogramCuts> cuts;
-  cuts = std::make_unique<common::HistogramCuts>(
-      common::DeviceSketch(ctx->gpu_id, this, param.max_bin, 0));
+  cuts =
+      std::make_unique<common::HistogramCuts>(common::DeviceSketch(ctx, this, param.max_bin, 0));
   this->InitializeSparsePage(ctx);  // reset after use.
 
   row_stride = GetRowStride(this);