support multiple batches in gpu_hist #5014

Merged: 30 commits merged into master from gpu-hist-batches on Nov 16, 2019
Changes from 23 commits

Commits (30)
4e0824e
get rid of BinCount() method
rongou Oct 28, 2019
f5160cc
pass dmatrix to GPUHistMakerDevice
rongou Oct 29, 2019
397301e
reset row partitioner to n_rows on a page
rongou Oct 29, 2019
ca7f132
Revert "pass dmatrix to GPUHistMakerDevice"
rongou Oct 30, 2019
57121bf
get rid of the todo
rongou Oct 30, 2019
aae1e8f
remove redundant code
rongou Oct 30, 2019
48fabd6
add page size to BatchParam
rongou Oct 30, 2019
a299db1
test multiple ellpack pages
rongou Oct 30, 2019
4365625
add failing test for gpu_hist in external memory mode
rongou Oct 30, 2019
f9f0b7e
handle multiple batches in InitRoot
rongou Oct 31, 2019
20fef95
support multiple batches in gpu_hist
rongou Oct 31, 2019
0b48ab6
debugging failing test
rongou Nov 4, 2019
bb6afd8
add tests for ellpack page content
rongou Nov 5, 2019
3ead65d
add tests for ellpack content
rongou Nov 5, 2019
f0f8b54
test looping through ellpack pages multiple times
rongou Nov 5, 2019
68d08c5
tests passing
rongou Nov 5, 2019
c6b8e8a
fix clang tidy warning
rongou Nov 5, 2019
70e424e
make the ellpack tests more forgiving
rongou Nov 6, 2019
a6cca35
Merge branch 'master' into gpu-hist-batches
rongou Nov 6, 2019
8d8b426
move base_rowid into EllpackMatrix
rongou Nov 6, 2019
14734d9
change row partitioner back to absolute row ids
rongou Nov 7, 2019
2a3c02a
Merge branch 'master' into gpu-hist-batches
rongou Nov 11, 2019
e86edc1
actually verify every row
rongou Nov 11, 2019
5a75451
add a libsvm generator
rongou Nov 12, 2019
0662f8d
libsvm is 0-based
rongou Nov 13, 2019
a894c36
Merge branch 'master' into gpu-hist-batches
rongou Nov 13, 2019
73eec1a
Merge branch 'master' into gpu-hist-batches
trivialfis Nov 14, 2019
3b716ce
Fix merge conflict.
trivialfis Nov 14, 2019
377d043
fix a few issues
rongou Nov 15, 2019
a6bf0dd
minor formatting
rongou Nov 15, 2019
9 changes: 9 additions & 0 deletions include/xgboost/data.h
@@ -166,6 +166,15 @@ struct BatchParam {
int max_bin;
/*! \brief Number of rows in a GPU batch, used for finding quantiles on GPU. */
int gpu_batch_nrows;
/*! \brief Page size for external memory mode. */
size_t gpu_page_size;
Member:

Do we expose this to users?

Contributor Author:

Yes. See below.


inline bool operator!=(const BatchParam& other) const {
return gpu_id != other.gpu_id ||
max_bin != other.max_bin ||
gpu_batch_nrows != other.gpu_batch_nrows ||
gpu_page_size != other.gpu_page_size;
}
};

/*!
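For context, a minimal sketch of how a caller might thread the new field through when requesting ELLPACK batches; this call site and its variable names are assumed for illustration and are not part of the diff:

// Hypothetical call site: build a BatchParam carrying the page size from
// the global parameters, then iterate over the resulting ELLPACK pages.
BatchParam param;
param.gpu_id = generic_param.gpu_id;
param.max_bin = max_bin;
param.gpu_batch_nrows = gpu_batch_nrows;
param.gpu_page_size = generic_param.gpu_page_size;
for (auto& page : dmat->GetBatches<EllpackPage>(param)) {
  // ... build histograms from page.Impl()->matrix ...
}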
6 changes: 6 additions & 0 deletions include/xgboost/generic_parameters.h
@@ -21,6 +21,8 @@ struct GenericParameter : public XGBoostParameter<GenericParameter> {
int nthread;
// primary device, -1 means no gpu.
int gpu_id;
// gpu page size in external memory mode, 0 means using the default.
size_t gpu_page_size;
Member:

A parameter is configurable by users, so please don't define it twice; make one of them a normal variable. If we don't want users to configure it, don't use a parameter at all. Note that pickling might lose some of this information, and dask uses pickle to move the booster around between workers.

Contributor Author:

It's only defined as a configurable parameter once, here; the other one is really just plumbing. For now this is mostly used for testing, but a user may want to set it depending on how much GPU memory they have.

Member:

Em... we want to do parameter validation, like detecting unused parameters, and this may add some extra difficulty. Do you think it's possible to make this a DMatrix parameter instead of a global one? Maybe in another PR? Sorry for nitpicking here.

Member:

Agreed, we need to be careful about adding global parameters due to the upcoming work on serialisation. Unless you see a strong motivation for users to tune this, let's leave it out for now.

Contributor Author:

I'm not quite sure whether this is useful for end users. Is there a way to make a parameter hidden/internal? It's really useful for the tests, since we don't have to build a dataset bigger than 32 MB.


void CheckDeprecated() {
if (this->n_gpus != 0) {
@@ -49,6 +51,10 @@ struct GenericParameter : public XGBoostParameter<GenericParameter> {
.set_default(-1)
.set_lower_bound(-1)
.describe("The primary GPU device ordinal.");
DMLC_DECLARE_FIELD(gpu_page_size)
.set_default(0)
.set_lower_bound(0)
.describe("GPU page size when running in external memory mode.");
DMLC_DECLARE_FIELD(n_gpus)
.set_default(0)
.set_range(0, 1)
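Since the thread above leaves gpu_page_size user-visible for now, here is a sketch of setting it through the C API, mainly to exercise the external-memory path on small test data. The booster setup and the 1 MB value are illustrative, not taken from this PR:

#include <xgboost/c_api.h>

// dmats is assumed to be a previously created array of DMatrixHandle;
// external memory mode also requires constructing the DMatrix with a
// cache prefix.
BoosterHandle booster;
XGBoosterCreate(dmats, 1, &booster);
XGBoosterSetParam(booster, "tree_method", "gpu_hist");
// Force small ELLPACK pages so multiple batches are produced even when the
// dataset is far below the default 32 MB page size.
XGBoosterSetParam(booster, "gpu_page_size", "1048576");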
45 changes: 26 additions & 19 deletions src/data/ellpack_page.cu
@@ -69,6 +69,8 @@ EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param) {
monitor_.Init("ellpack_page");
dh::safe_cuda(cudaSetDevice(param.gpu_id));

matrix.n_rows = dmat->Info().num_row_;

monitor_.StartCuda("Quantiles");
// Create the quantile sketches for the dmatrix and initialize HistogramCuts.
common::HistogramCuts hmat;
@@ -206,52 +208,57 @@ void EllpackPageImpl::CreateHistIndices(int device,

// Return the number of rows contained in this page.
size_t EllpackPageImpl::Size() const {
return n_rows;
return matrix.n_rows;
}

// Clear the current page.
void EllpackPageImpl::Clear() {
ba_.Clear();
gidx_buffer = {};
idx_buffer.clear();
n_rows = 0;
sparse_page_.Clear();
matrix.base_rowid = 0;
matrix.n_rows = 0;
}

// Push a CSR page to the current page.
//
// First compress the CSR page into ELLPACK, then the compressed buffer is copied to host and
// appended to the existing host vector.
// The CSR pages are accumulated in memory until they reach a certain size, then written out as
// compressed ELLPACK.
void EllpackPageImpl::Push(int device, const SparsePage& batch) {
sparse_page_.Push(batch);
matrix.n_rows += batch.Size();
}

// Compress the accumulated SparsePage.
void EllpackPageImpl::CompressSparsePage(int device) {
monitor_.StartCuda("InitCompressedData");
InitCompressedData(device, batch.Size());
InitCompressedData(device, matrix.n_rows);
monitor_.StopCuda("InitCompressedData");

monitor_.StartCuda("BinningCompression");
DeviceHistogramBuilderState hist_builder_row_state(batch.Size());
hist_builder_row_state.BeginBatch(batch);
CreateHistIndices(device, batch, hist_builder_row_state.GetRowStateOnDevice());
DeviceHistogramBuilderState hist_builder_row_state(matrix.n_rows);
hist_builder_row_state.BeginBatch(sparse_page_);
CreateHistIndices(device, sparse_page_, hist_builder_row_state.GetRowStateOnDevice());
hist_builder_row_state.EndBatch();
monitor_.StopCuda("BinningCompression");

monitor_.StartCuda("CopyDeviceToHost");
std::vector<common::CompressedByteT> buffer(gidx_buffer.size());
dh::CopyDeviceSpanToVector(&buffer, gidx_buffer);
int offset = 0;
if (!idx_buffer.empty()) {
offset = ::xgboost::common::detail::kPadding;
}
idx_buffer.reserve(idx_buffer.size() + buffer.size() - offset);
idx_buffer.insert(idx_buffer.end(), buffer.begin() + offset, buffer.end());
idx_buffer.resize(gidx_buffer.size());
dh::CopyDeviceSpanToVector(&idx_buffer, gidx_buffer);
ba_.Clear();
gidx_buffer = {};
monitor_.StopCuda("CopyDeviceToHost");

n_rows += batch.Size();
}

// Return the memory cost for storing the compressed features.
size_t EllpackPageImpl::MemCostBytes() const {
return idx_buffer.size() * sizeof(common::CompressedByteT);
size_t num_symbols = matrix.info.n_bins + 1;

// Required buffer size for storing data matrix in ELLPack format.
size_t compressed_size_bytes = common::CompressedBufferWriter::CalculateBufferSize(
matrix.info.row_stride * matrix.n_rows, num_symbols);
return compressed_size_bytes;
}

// Copy the compressed features to GPU.
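The rewritten MemCostBytes estimates the compressed size analytically from the matrix shape rather than measuring the host buffer. A rough model of that arithmetic (a sketch only; the real CompressedBufferWriter::CalculateBufferSize also rounds and adds padding):

#include <cmath>
#include <cstddef>

// Approximate compressed ELLPACK size: row_stride * n_rows symbols, each
// packed into ceil(log2(num_symbols)) bits, rounded up to whole bytes.
std::size_t ApproxEllpackBytes(std::size_t n_rows, std::size_t row_stride,
                               std::size_t num_symbols) {
  auto symbol_bits =
      static_cast<std::size_t>(std::ceil(std::log2(num_symbols)));
  std::size_t total_bits = n_rows * row_stride * symbol_bits;
  return (total_bits + 7) / 8;
}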
21 changes: 16 additions & 5 deletions src/data/ellpack_page.cuh
@@ -78,13 +78,14 @@ struct EllpackInfo {
* kernels.*/
struct EllpackMatrix {
EllpackInfo info;
size_t base_rowid{};
size_t n_rows{};
common::CompressedIterator<uint32_t> gidx_iter;

XGBOOST_DEVICE size_t BinCount() const { return info.gidx_fvalue_map.size(); }

// Get a matrix element, using binary search for lookup. Returns NaN if missing.
// Given a row index and a feature index, returns the corresponding cut value
__device__ bst_float GetElement(size_t ridx, size_t fidx) const {
ridx -= base_rowid;
auto row_begin = info.row_stride * ridx;
auto row_end = row_begin + info.row_stride;
auto gidx = -1;
@@ -102,6 +103,11 @@
}
return info.gidx_fvalue_map[gidx];
}

// Check if the row id is within the range of the current batch.
__device__ bool IsInRange(size_t row_id) const {
return row_id >= base_rowid && row_id < base_rowid + n_rows;
}
};
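With rows now split across pages, device code that is launched over all rows has to skip the rows a given page does not hold. A minimal sketch of the guard; the launch site and the names total_rows and fidx are assumed:

// Illustrative kernel body: d_matrix is a by-value copy of an EllpackMatrix,
// total_rows spans the whole DMatrix, fidx is some feature index.
auto d_matrix = page.Impl()->matrix;
dh::LaunchN(device, total_rows, [=] __device__(size_t ridx) {
  if (!d_matrix.IsInRange(ridx)) { return; }  // row lives in another page
  bst_float fvalue = d_matrix.GetElement(ridx, fidx);
  // ... consume fvalue ...
});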

// Instances of this type are created while creating the histogram bins for the
@@ -185,7 +191,6 @@ class EllpackPageImpl {
/*! \brief global index of histogram, which is stored in ELLPack format. */
common::Span<common::CompressedByteT> gidx_buffer;
std::vector<common::CompressedByteT> idx_buffer;
size_t n_rows{};

/*!
* \brief Default constructor.
@@ -240,7 +245,7 @@ class EllpackPageImpl {

/*! \brief Set the base row id for this page. */
inline void SetBaseRowId(size_t row_id) {
base_rowid_ = row_id;
matrix.base_rowid = row_id;
}

/*! \brief clear the page. */
@@ -263,11 +268,17 @@
*/
void InitDevice(int device, EllpackInfo info);

/*! \brief Compress the accumulated SparsePage into ELLPACK format.
*
* @param device The GPU device to use.
*/
void CompressSparsePage(int device);

private:
common::Monitor monitor_;
dh::BulkAllocator ba_;
size_t base_rowid_{};
bool device_initialized_{false};
SparsePage sparse_page_{};
};

} // namespace xgboost
6 changes: 3 additions & 3 deletions src/data/ellpack_page_raw_format.cu
@@ -17,21 +17,21 @@ class EllpackPageRawFormat : public SparsePageFormat<EllpackPage> {
public:
bool Read(EllpackPage* page, dmlc::SeekStream* fi) override {
auto* impl = page->Impl();
if (!fi->Read(&impl->n_rows)) return false;
if (!fi->Read(&impl->matrix.n_rows)) return false;
return fi->Read(&impl->idx_buffer);
}

bool Read(EllpackPage* page,
dmlc::SeekStream* fi,
const std::vector<bst_uint>& sorted_index_set) override {
auto* impl = page->Impl();
if (!fi->Read(&impl->n_rows)) return false;
if (!fi->Read(&impl->matrix.n_rows)) return false;
return fi->Read(&page->Impl()->idx_buffer);
}

void Write(const EllpackPage& page, dmlc::Stream* fo) override {
auto* impl = page.Impl();
fo->Write(impl->n_rows);
fo->Write(impl->matrix.n_rows);
auto buffer = impl->idx_buffer;
CHECK(!buffer.empty());
fo->Write(buffer);
22 changes: 16 additions & 6 deletions src/data/ellpack_page_source.cu
@@ -40,11 +40,13 @@ class EllpackPageSourceImpl : public DataSource<EllpackPage> {
const std::string kPageType_{".ellpack.page"};

int device_{-1};
size_t page_size_{DMatrix::kPageSize};
common::Monitor monitor_;
dh::BulkAllocator ba_;
/*! \brief The EllpackInfo, with the underlying GPU memory shared by all pages. */
EllpackInfo ellpack_info_;
std::unique_ptr<SparsePageSource<EllpackPage>> source_;
std::string cache_info_;
};

EllpackPageSource::EllpackPageSource(DMatrix* dmat,
@@ -72,8 +74,12 @@ const EllpackPage& EllpackPageSource::Value() const {
// each CSR page, and write the accumulated ELLPACK pages to disk.
EllpackPageSourceImpl::EllpackPageSourceImpl(DMatrix* dmat,
const std::string& cache_info,
const BatchParam& param) noexcept(false) {
device_ = param.gpu_id;
const BatchParam& param) noexcept(false)
: device_(param.gpu_id), cache_info_(cache_info) {

if (param.gpu_page_size > 0) {
page_size_ = param.gpu_page_size;
}

monitor_.Init("ellpack_page_source");
dh::safe_cuda(cudaSetDevice(device_));
@@ -92,10 +98,11 @@ EllpackPageSourceImpl::EllpackPageSourceImpl(DMatrix* dmat,
WriteEllpackPages(dmat, cache_info);
monitor_.StopCuda("WriteEllpackPages");

source_.reset(new SparsePageSource<EllpackPage>(cache_info, kPageType_));
source_.reset(new SparsePageSource<EllpackPage>(cache_info_, kPageType_));
}

void EllpackPageSourceImpl::BeforeFirst() {
source_.reset(new SparsePageSource<EllpackPage>(cache_info_, kPageType_));
source_->BeforeFirst();
}
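Recreating the source on rewind, rather than only calling BeforeFirst() on the existing reader, appears to be what allows the cached ELLPACK pages to be iterated more than once (compare the "test looping through ellpack pages multiple times" commit); the stored cache_info_ is what makes the rebuild possible.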

@@ -133,20 +140,23 @@ void EllpackPageSourceImpl::WriteEllpackPages(DMatrix* dmat, const std::string&
for (const auto& batch : dmat->GetBatches<SparsePage>()) {
impl->Push(device_, batch);

if (impl->MemCostBytes() >= DMatrix::kPageSize) {
bytes_write += impl->MemCostBytes();
size_t mem_cost_bytes = impl->MemCostBytes();
if (mem_cost_bytes >= page_size_) {
bytes_write += mem_cost_bytes;
impl->CompressSparsePage(device_);
writer.PushWrite(std::move(page));
writer.Alloc(&page);
impl = page->Impl();
impl->matrix.info = ellpack_info_;
impl->Clear();
double tdiff = dmlc::GetTime() - tstart;
LOG(INFO) << "Writing to " << cache_info << " in "
LOG(INFO) << "Writing " << kPageType_ << " to " << cache_info << " in "
<< ((bytes_write >> 20UL) / tdiff) << " MB/s, "
<< (bytes_write >> 20UL) << " written";
}
}
if (impl->Size() != 0) {
impl->CompressSparsePage(device_);
writer.PushWrite(std::move(page));
}
}
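Note that any rows still accumulated after the loop are compressed and written as a final, possibly smaller, page. With the default DMatrix::kPageSize threshold a dataset has to exceed roughly 32 MB before a second page is produced, which is why the review discussion above favours lowering gpu_page_size in tests.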
5 changes: 1 addition & 4 deletions src/data/sparse_page_dmatrix.cc
@@ -81,10 +81,7 @@ BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(const BatchParam& par
CHECK_GE(param.gpu_id, 0);
CHECK_GE(param.max_bin, 2);
// Lazily instantiate
if (!ellpack_source_ ||
batch_param_.gpu_id != param.gpu_id ||
batch_param_.max_bin != param.max_bin ||
batch_param_.gpu_batch_nrows != param.gpu_batch_nrows) {
if (!ellpack_source_ || batch_param_ != param) {
ellpack_source_.reset(new EllpackPageSource(this, cache_info_, param));
batch_param_ = param;
}
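This simplification relies on the BatchParam::operator!= added in include/xgboost/data.h above; since that operator also compares gpu_page_size, a changed page size now invalidates and rebuilds the cached ELLPACK source as well.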
6 changes: 5 additions & 1 deletion src/tree/gpu_hist/row_partitioner.cuh
@@ -33,6 +33,7 @@ class RowPartitioner {
using TreePositionT = int32_t;
using RowIndexT = bst_uint;
struct Segment;
static constexpr TreePositionT kIgnoredTreePosition = -1;

private:
int device_idx;
@@ -124,6 +125,7 @@ class RowPartitioner {
idx += segment.begin;
RowIndexT ridx = d_ridx[idx];
TreePositionT new_position = op(ridx); // new node id
if (new_position == kIgnoredTreePosition) return;
KERNEL_CHECK(new_position == left_nidx || new_position == right_nidx);
AtomicIncrement(d_left_count, new_position == left_nidx);
d_position[idx] = new_position;
@@ -163,7 +165,9 @@ class RowPartitioner {
dh::LaunchN(device_idx, position.Size(), [=] __device__(size_t idx) {
auto position = d_position[idx];
RowIndexT ridx = d_ridx[idx];
d_position[idx] = op(ridx, position);
TreePositionT new_position = op(ridx, position);
if (new_position == kIgnoredTreePosition) return;
d_position[idx] = new_position;
});
}

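Finally, a sketch of how an update operator can use the new sentinel so that rows outside the current ELLPACK page keep their existing position. The split variables and node ids are assumed (left_nidx and right_nidx taken as TreePositionT), and real code would also handle the default direction for missing values:

// Illustrative op passed to UpdatePosition: rows not covered by the current
// page return kIgnoredTreePosition and are left untouched by the kernel.
row_partitioner->UpdatePosition(
    nidx, left_nidx, right_nidx,
    [=] __device__(RowPartitioner::RowIndexT ridx) {
      if (!matrix.IsInRange(ridx)) {
        return RowPartitioner::kIgnoredTreePosition;
      }
      bst_float fvalue = matrix.GetElement(ridx, split_fidx);
      return fvalue <= split_value ? left_nidx : right_nidx;
    });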