Implement sketching with Hessian on GPU. (#9399)
- Prepare for implementing approx on GPU.
- Unify the code path between weighted and uniform sketching on DMatrix.
trivialfis authored Jul 24, 2023
1 parent 851cba9 commit a196443
Showing 14 changed files with 436 additions and 220 deletions.
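The new entry point is `DeviceSketchWithHessian`; the existing `DeviceSketch` becomes a thin inline wrapper that forwards an empty hessian span (see `src/common/hist_util.cuh` below). A minimal sketch of the two call paths, using only signatures declared in this commit; the include paths and the caller itself are illustrative, and a CUDA-enabled build plus an already-constructed `DMatrix` are assumed:

```cpp
#include <vector>

#include "xgboost/context.h"  // Context
#include "xgboost/data.h"     // DMatrix
#include "xgboost/span.h"     // common::Span

#include "src/common/hist_util.cuh"  // DeviceSketch, DeviceSketchWithHessian

namespace xgboost {
void SketchBothWays(Context const* ctx, DMatrix* p_fmat, std::vector<float> const& hess) {
  bst_bin_t constexpr kMaxBin = 256;
  // Uniform sketching: every row carries the same weight.
  auto uniform = common::DeviceSketch(ctx, p_fmat, kMaxBin);
  // Weighted sketching: rows are weighted by the hessian, as the approx method will do.
  auto weighted = common::DeviceSketchWithHessian(
      ctx, p_fmat, kMaxBin, common::Span<float const>{hess.data(), hess.size()});
  (void)uniform;
  (void)weighted;
}
}  // namespace xgboost
```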
12 changes: 6 additions & 6 deletions include/xgboost/data.h
@@ -185,10 +185,10 @@ class MetaInfo {
     return data_split_mode == DataSplitMode::kRow;
   }
 
-  /*! \brief Whether the data is split column-wise. */
-  bool IsColumnSplit() const {
-    return data_split_mode == DataSplitMode::kCol;
-  }
+  /** @brief Whether the data is split column-wise. */
+  bool IsColumnSplit() const { return data_split_mode == DataSplitMode::kCol; }
+  /** @brief Whether this is a learning to rank data. */
+  bool IsRanking() const { return !group_ptr_.empty(); }
 
   /*!
    * \brief A convenient method to check if we are doing vertical federated learning, which requires
@@ -249,7 +249,7 @@ struct BatchParam {
   /**
    * \brief Hessian, used for sketching with future approx implementation.
    */
-  common::Span<float> hess;
+  common::Span<float const> hess;
   /**
    * \brief Whether should we force DMatrix to regenerate the batch. Only used for
    * GHistIndex.
@@ -279,7 +279,7 @@ struct BatchParam {
    * Get batch with sketch weighted by hessian. The batch will be regenerated if the
    * span is changed, so caller should keep the span for each iteration.
    */
-  BatchParam(bst_bin_t max_bin, common::Span<float> hessian, bool regenerate)
+  BatchParam(bst_bin_t max_bin, common::Span<float const> hessian, bool regenerate)
       : max_bin{max_bin}, hess{hessian}, regen{regenerate} {}
 
   [[nodiscard]] bool ParamNotEqual(BatchParam const& other) const {
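The doc comment above is the key contract: the batch is regenerated whenever the hessian span changes, so the caller must keep the span (and its backing storage) alive and stable across iterations. A sketch of the intended usage, assuming the `GHistIndexMatrix` batch type from `src/data/gradient_index.h` and a `GetBatches` overload taking the context, as the `SparsePageDMatrix` methods further below do:

```cpp
#include <vector>

#include "src/data/gradient_index.h"  // GHistIndexMatrix
#include "xgboost/data.h"             // BatchParam, DMatrix

namespace xgboost {
void IterateWeightedBatches(Context const* ctx, DMatrix* p_fmat,
                            std::vector<float> const& hess) {
  // Keep the span stable for the whole loop; a changed span forces regeneration.
  common::Span<float const> hess_span{hess.data(), hess.size()};
  BatchParam param{/*max_bin=*/256, hess_span, /*regenerate=*/false};
  for (auto const& page : p_fmat->GetBatches<GHistIndexMatrix>(ctx, param)) {
    (void)page;  // build histograms from the index here
  }
}
}  // namespace xgboost
```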
8 changes: 5 additions & 3 deletions include/xgboost/host_device_vector.h
@@ -49,11 +49,12 @@
 #ifndef XGBOOST_HOST_DEVICE_VECTOR_H_
 #define XGBOOST_HOST_DEVICE_VECTOR_H_
 
+#include <xgboost/context.h>  // for DeviceOrd
+#include <xgboost/span.h>     // for Span
+
 #include <initializer_list>
-#include <vector>
 #include <type_traits>
-
-#include "span.h"
+#include <vector>
 
 namespace xgboost {
 
@@ -133,6 +134,7 @@ class HostDeviceVector {
   GPUAccess DeviceAccess() const;
 
   void SetDevice(int device) const;
+  void SetDevice(DeviceOrd device) const;
 
   void Resize(size_t new_size, T v = T());
 
6 changes: 3 additions & 3 deletions src/common/hist_util.cc
@@ -12,8 +12,8 @@
 #include "../data/gradient_index.h"  // for GHistIndexMatrix
 #include "quantile.h"
 #include "xgboost/base.h"
-#include "xgboost/context.h"  // Context
-#include "xgboost/data.h"     // SparsePage, SortedCSCPage
+#include "xgboost/context.h"  // for Context
+#include "xgboost/data.h"     // for SparsePage, SortedCSCPage
 
 #if defined(XGBOOST_MM_PREFETCH_PRESENT)
 #include <xmmintrin.h>
@@ -30,7 +30,7 @@ HistogramCuts::HistogramCuts() {
 }
 
 HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins, bool use_sorted,
-                              Span<float> const hessian) {
+                              Span<float const> hessian) {
   HistogramCuts out;
   auto const &info = m->Info();
   auto n_threads = ctx->Threads();
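The CPU sketcher now also takes `Span<float const>`, so the weighted and uniform paths share one calling convention on both devices; an empty span selects uniform weights. A minimal sketch against the updated signature (the wrapper function and include path are illustrative):

```cpp
#include "src/common/hist_util.h"  // HistogramCuts, SketchOnDMatrix

namespace xgboost {
common::HistogramCuts CpuCuts(Context const* ctx, DMatrix* m, common::Span<float const> hess) {
  // Sorted sketching is what approx uses; an empty hessian means uniform weights.
  return common::SketchOnDMatrix(ctx, m, /*max_bins=*/256, /*use_sorted=*/true, hess);
}
}  // namespace xgboost
```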
263 changes: 135 additions & 128 deletions src/common/hist_util.cu

Large diffs are not rendered by default.

44 changes: 33 additions & 11 deletions src/common/hist_util.cuh
@@ -11,14 +11,13 @@
 
 #include <cstddef>  // for size_t
 
-#include "../data/device_adapter.cuh"
+#include "../data/adapter.h"  // for IsValidFunctor
 #include "device_helpers.cuh"
 #include "hist_util.h"
 #include "quantile.cuh"
 #include "timer.h"
+#include "xgboost/span.h"  // for IterSpan
 
-namespace xgboost {
-namespace common {
+namespace xgboost::common {
 namespace cuda {
 /**
  * copy and paste of the host version, we can't make it a __host__ __device__ function as
@@ -246,10 +245,35 @@ void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_r
                                  dh::caching_device_vector<size_t>* p_column_sizes_scan);
 }  // namespace detail
 
-// Compute sketch on DMatrix.
-// sketch_batch_num_elements 0 means autodetect. Only modify this for testing.
-HistogramCuts DeviceSketch(int device, DMatrix* dmat, int max_bins,
-                           size_t sketch_batch_num_elements = 0);
+/**
+ * @brief Compute sketch on DMatrix with GPU and Hessian as weight.
+ *
+ * @param ctx Runtime context
+ * @param p_fmat Training feature matrix
+ * @param max_bin Maximum number of bins for each feature
+ * @param hessian Hessian vector.
+ * @param sketch_batch_num_elements 0 means autodetect. Only modify this for testing.
+ *
+ * @return Quantile cuts
+ */
+HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_bin_t max_bin,
+                                      Span<float const> hessian,
+                                      std::size_t sketch_batch_num_elements = 0);
+
+/**
+ * @brief Compute sketch on DMatrix with GPU.
+ *
+ * @param ctx Runtime context
+ * @param p_fmat Training feature matrix
+ * @param max_bin Maximum number of bins for each feature
+ * @param sketch_batch_num_elements 0 means autodetect. Only modify this for testing.
+ *
+ * @return Quantile cuts
+ */
+inline HistogramCuts DeviceSketch(Context const* ctx, DMatrix* p_fmat, bst_bin_t max_bin,
+                                  std::size_t sketch_batch_num_elements = 0) {
+  return DeviceSketchWithHessian(ctx, p_fmat, max_bin, {}, sketch_batch_num_elements);
+}
 
 template <typename AdapterBatch>
 void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
@@ -417,7 +441,5 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
     }
   }
 }
-}  // namespace common
-}  // namespace xgboost
-
+}  // namespace xgboost::common
 #endif  // COMMON_HIST_UTIL_CUH_
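Since `DeviceSketch` now only forwards an empty span to `DeviceSketchWithHessian`, the uniform path can be checked directly against the weighted one. A test-style sketch, assuming `HistogramCuts::Values()` exposes the flattened cut values and `CHECK` comes from `xgboost/logging.h`:

```cpp
#include "src/common/hist_util.cuh"  // DeviceSketch, DeviceSketchWithHessian
#include "xgboost/logging.h"         // CHECK

namespace xgboost {
void CheckWrapperForwards(Context const* ctx, DMatrix* p_fmat) {
  auto direct = common::DeviceSketchWithHessian(ctx, p_fmat, /*max_bin=*/256, /*hessian=*/{});
  auto wrapped = common::DeviceSketch(ctx, p_fmat, /*max_bin=*/256);
  // The wrapper adds no logic of its own, so the cuts must be identical.
  CHECK(direct.Values() == wrapped.Values());
}
}  // namespace xgboost
```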
2 changes: 1 addition & 1 deletion src/common/hist_util.h
@@ -172,7 +172,7 @@ class HistogramCuts {
  * but consumes more memory.
  */
 HistogramCuts SketchOnDMatrix(Context const* ctx, DMatrix* m, bst_bin_t max_bins,
-                              bool use_sorted = false, Span<float> const hessian = {});
+                              bool use_sorted = false, Span<float const> hessian = {});
 
 enum BinTypeSize : uint8_t {
   kUint8BinsTypeSize = 1,
3 changes: 3 additions & 0 deletions src/common/host_device_vector.cc
@@ -168,6 +168,9 @@ bool HostDeviceVector<T>::DeviceCanWrite() const {
 template <typename T>
 void HostDeviceVector<T>::SetDevice(int) const {}
 
+template <typename T>
+void HostDeviceVector<T>::SetDevice(DeviceOrd) const {}
+
 // explicit instantiations are required, as HostDeviceVector isn't header-only
 template class HostDeviceVector<bst_float>;
 template class HostDeviceVector<double>;
5 changes: 5 additions & 0 deletions src/common/host_device_vector.cu
@@ -394,6 +394,11 @@ void HostDeviceVector<T>::SetDevice(int device) const {
   impl_->SetDevice(device);
 }
 
+template <typename T>
+void HostDeviceVector<T>::SetDevice(DeviceOrd device) const {
+  impl_->SetDevice(device.ordinal);
+}
+
 template <typename T>
 void HostDeviceVector<T>::Resize(size_t new_size, T v) {
   impl_->Resize(new_size, v);
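The new overload accepts the strongly typed `DeviceOrd` and, in the CUDA translation unit, unwraps its `ordinal`; the CPU-only stub above keeps the same call compiling without CUDA. A usage sketch, assuming `DeviceOrd::CUDA` is the named constructor provided by `xgboost/context.h`:

```cpp
#include "xgboost/context.h"             // DeviceOrd
#include "xgboost/host_device_vector.h"  // HostDeviceVector

namespace xgboost {
void PlaceOnDevice(HostDeviceVector<float>* vec) {
  vec->SetDevice(DeviceOrd::CUDA(0));  // new, typed overload
  vec->SetDevice(0);                   // legacy int overload, still available
}
}  // namespace xgboost
```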
2 changes: 1 addition & 1 deletion src/data/ellpack_page.cu
@@ -131,7 +131,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchP
   monitor_.Start("Quantiles");
   // Create the quantile sketches for the dmatrix and initialize HistogramCuts.
   row_stride = GetRowStride(dmat);
-  cuts_ = common::DeviceSketch(ctx->gpu_id, dmat, param.max_bin);
+  cuts_ = common::DeviceSketch(ctx, dmat, param.max_bin);
   monitor_.Stop("Quantiles");
 
   monitor_.Start("InitCompressedData");
2 changes: 1 addition & 1 deletion src/data/gradient_index.cc
@@ -21,7 +21,7 @@ GHistIndexMatrix::GHistIndexMatrix() : columns_{std::make_unique<common::ColumnM
 
 GHistIndexMatrix::GHistIndexMatrix(Context const *ctx, DMatrix *p_fmat, bst_bin_t max_bins_per_feat,
                                    double sparse_thresh, bool sorted_sketch,
-                                   common::Span<float> hess)
+                                   common::Span<float const> hess)
     : max_numeric_bins_per_feat{max_bins_per_feat} {
   CHECK(p_fmat->SingleColBlock());
   // We use sorted sketching for approx tree method since it's more efficient in
2 changes: 1 addition & 1 deletion src/data/gradient_index.h
@@ -160,7 +160,7 @@ class GHistIndexMatrix {
    * \brief Constrcutor for SimpleDMatrix.
    */
   GHistIndexMatrix(Context const* ctx, DMatrix* x, bst_bin_t max_bins_per_feat,
-                   double sparse_thresh, bool sorted_sketch, common::Span<float> hess = {});
+                   double sparse_thresh, bool sorted_sketch, common::Span<float const> hess = {});
   /**
    * \brief Constructor for Iterative DMatrix. Initialize basic information and prepare
    * for push batch.
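On the CPU side the same const span reaches `GHistIndexMatrix`, whose `SimpleDMatrix` constructor threads the hessian into `SketchOnDMatrix`. A sketch of building a weighted index; the `sparse_thresh` value is arbitrary and chosen only for illustration:

```cpp
#include "src/data/gradient_index.h"  // GHistIndexMatrix

namespace xgboost {
void BuildWeightedIndex(Context const* ctx, DMatrix* p_fmat, common::Span<float const> hess) {
  // Sorted sketching plus hessian weights, matching what approx requests.
  // Note: the constructor CHECKs for a single in-core block (SimpleDMatrix).
  GHistIndexMatrix gidx{ctx, p_fmat, /*max_bins_per_feat=*/256,
                        /*sparse_thresh=*/0.2, /*sorted_sketch=*/true, hess};
  (void)gidx;  // feed into histogram building
}
}  // namespace xgboost
```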
4 changes: 2 additions & 2 deletions src/data/sparse_page_dmatrix.cu
@@ -25,8 +25,8 @@ BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const* ctx,
   cache_info_.erase(id);
   MakeCache(this, ".ellpack.page", cache_prefix_, &cache_info_);
   std::unique_ptr<common::HistogramCuts> cuts;
-  cuts = std::make_unique<common::HistogramCuts>(
-      common::DeviceSketch(ctx->gpu_id, this, param.max_bin, 0));
+  cuts =
+      std::make_unique<common::HistogramCuts>(common::DeviceSketch(ctx, this, param.max_bin, 0));
   this->InitializeSparsePage(ctx);  // reset after use.
 
   row_stride = GetRowStride(this);