From bccebf9afa122323a1f95dbd6fe6b4ab1b85a9a7 Mon Sep 17 00:00:00 2001
From: Divye Gala
Date: Thu, 23 Sep 2021 11:01:05 -0700
Subject: [PATCH] Using pinned host memory for Random Forest and DBSCAN (#4215)

Benchmarks show that RF performs consistently better with pinned host memory,
while DBSCAN is sometimes better and sometimes not (within the margin of
error), so pinned host memory is now used by default for both of these
algorithms.

KMeans and LARS are ignored for now, as both show slightly better performance
with pinned host memory but only as the number of columns increases. Since
this would need more analysis, and a decision on whether a heuristic is needed
for selecting the memory type, it is deferred to 21.12.

Here are the raw numbers:

1. LARS

Normal memory:
```
{'lars': {(100000, 10): 0.12429666519165039, (100000, 100): 0.015396833419799805, (100000, 250): 0.015408039093017578, (250000, 10): 0.00986933708190918, (250000, 100): 0.023822546005249023, (250000, 250): 0.03715157508850098, (500000, 10): 0.013423442840576172, (500000, 100): 0.044762372970581055, (500000, 250): 0.07782578468322754}}
```

Pinned memory:
```
{'lars': {(100000, 10): 0.12958097457885742, (100000, 100): 0.01501011848449707, (100000, 250): 0.016597509384155273, (250000, 10): 0.01801013946533203, (250000, 100): 0.022644996643066406, (250000, 250): 0.037090301513671875, (500000, 10): 0.020437955856323242, (500000, 100): 0.044635772705078125, (500000, 250): 0.07696056365966797}}
```

2. RFR

Normal memory:
```
{'rfr': {(100000, 10): 1.1951744556427002, (100000, 100): 5.099738359451294, (100000, 250): 11.32804536819458, (250000, 10): 2.0097765922546387, (250000, 100): 9.109776496887207, (250000, 250): 21.058837890625, (500000, 10): 3.3387184143066406, (500000, 100): 15.802990436553955, (500000, 250): 36.80855870246887}}
```

Pinned memory:
```
{'rfr': {(100000, 10): 1.1727137565612793, (100000, 100): 4.804195880889893, (100000, 250): 11.621357917785645, (250000, 10): 1.8899295330047607, (250000, 100): 9.16961407661438, (250000, 250): 21.12194561958313, (500000, 10): 3.2937560081481934, (500000, 100): 15.66197681427002, (500000, 250): 36.6080117225647}}
```

3. KMeans

Normal memory:
```
{'kmeans': {(100000, 10): 0.11008882522583008, (100000, 100): 0.15475797653198242, (100000, 250): 0.15683507919311523, (250000, 10): 0.18775177001953125, (250000, 100): 0.25696277618408203, (250000, 250): 0.40389132499694824, (500000, 10): 0.4578282833099365, (500000, 100): 0.3917391300201416, (500000, 250): 0.6426849365234375}}
```

Pinned memory:
```
{'kmeans': {(100000, 10): 0.11982870101928711, (100000, 100): 0.16992664337158203, (100000, 250): 0.1021108627319336, (250000, 10): 0.16021251678466797, (250000, 100): 0.31025242805480957, (250000, 250): 0.298201322555542, (500000, 10): 0.21084189414978027, (500000, 100): 0.50473952293396, (500000, 250): 0.6191830635070801}}
```
4. DBSCAN

Normal memory:
```
{'dbscan': {(100000, 10): 0.4957292079925537, (100000, 100): 0.8680248260498047, (100000, 250): 1.585218906402588, (250000, 10): 4.52524995803833, (250000, 100): 7.175846099853516, (250000, 250): 12.135416269302368, (500000, 10): 26.427770853042603, (500000, 100): 37.57275915145874, (500000, 250): 57.98261737823486}}
```

Pinned memory:
```
{'dbscan': {(100000, 10): 0.49578166007995605, (100000, 100): 0.8678708076477051, (100000, 250): 1.5854766368865967, (250000, 10): 4.526952505111694, (250000, 100): 7.172863006591797, (250000, 250): 12.145166397094727, (500000, 10): 26.422622680664062, (500000, 100): 37.56665277481079, (500000, 250): 58.02563738822937}}
```

Authors:
  - Divye Gala (https://github.com/divyegala)

Approvers:
  - Rory Mitchell (https://github.com/RAMitchell)
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: https://github.com/rapidsai/cuml/pull/4215
---
 .../cuml/common/pinned_host_vector.hpp   | 64 +++++++++++++++++++
 cpp/src/dbscan/adjgraph/naive.cuh         |  9 +--
 .../batched-levelalgo/builder.cuh         |  3 +-
 cpp/src/kmeans/common.cuh                 |  1 +
 cpp/src/solver/lars_impl.cuh              |  1 +
 5 files changed, 73 insertions(+), 5 deletions(-)
 create mode 100644 cpp/include/cuml/common/pinned_host_vector.hpp
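Editorial note, not part of the original patch: the rationale above relies on the fact that `cudaMemcpyAsync` to or from pageable host memory is staged through a driver-owned pinned buffer and is effectively synchronous, while copies into page-locked (pinned) memory can run truly asynchronously and at full bus bandwidth. The following is a minimal, hypothetical sketch of how one might observe that difference for a device-to-host copy; the buffer size, stream usage, and timing approach are illustrative assumptions, not part of the cuML benchmark harness used for the numbers above.

```cpp
// Minimal sketch (not part of this patch): compare a device-to-host copy into
// an ordinary pageable std::vector with one into a page-locked (pinned) buffer.
#include <cuda_runtime.h>

#include <cstddef>
#include <cstdio>
#include <vector>

static float time_d2h(const float* d_src, float* h_dst, std::size_t n, cudaStream_t stream)
{
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start, stream);
  cudaMemcpyAsync(h_dst, d_src, n * sizeof(float), cudaMemcpyDeviceToHost, stream);
  cudaEventRecord(stop, stream);
  cudaEventSynchronize(stop);
  float ms = 0.0f;
  cudaEventElapsedTime(&ms, start, stop);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return ms;
}

int main()
{
  const std::size_t n = std::size_t{1} << 26;  // 64M floats, ~256 MiB

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  float* d_buf = nullptr;
  cudaMalloc(reinterpret_cast<void**>(&d_buf), n * sizeof(float));

  std::vector<float> pageable(n);  // ordinary, pageable host allocation
  float* pinned = nullptr;
  cudaHostAlloc(reinterpret_cast<void**>(&pinned), n * sizeof(float), cudaHostAllocDefault);

  std::printf("pageable D2H: %.2f ms\n", time_d2h(d_buf, pageable.data(), n, stream));
  std::printf("pinned   D2H: %.2f ms\n", time_d2h(d_buf, pinned, n, stream));

  cudaFreeHost(pinned);
  cudaFree(d_buf);
  cudaStreamDestroy(stream);
  return 0;
}
```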
diff --git a/cpp/include/cuml/common/pinned_host_vector.hpp b/cpp/include/cuml/common/pinned_host_vector.hpp
new file mode 100644
index 0000000000..b29527a893
--- /dev/null
+++ b/cpp/include/cuml/common/pinned_host_vector.hpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <rmm/mr/host/pinned_memory_resource.hpp>
+
+namespace ML {
+
+template <typename T>
+class pinned_host_vector {
+ public:
+  pinned_host_vector() = default;
+
+  explicit pinned_host_vector(std::size_t n)
+    : size_{n}, data_{static_cast<T*>(pinned_mr.allocate(n * sizeof(T)))}
+  {
+    std::uninitialized_fill(data_, data_ + n, static_cast<T>(0));
+  }
+  ~pinned_host_vector() { pinned_mr.deallocate(data_, size_ * sizeof(T)); }
+
+  pinned_host_vector(pinned_host_vector const&) = delete;
+  pinned_host_vector(pinned_host_vector&&) = delete;
+  pinned_host_vector& operator=(pinned_host_vector const&) = delete;
+  pinned_host_vector& operator=(pinned_host_vector&&) = delete;
+
+  void resize(std::size_t n)
+  {
+    size_ = n;
+    data_ = static_cast<T*>(pinned_mr.allocate(n * sizeof(T)));
+    std::uninitialized_fill(data_, data_ + n, static_cast<T>(0));
+  }
+
+  T* data() { return data_; }
+
+  T* begin() { return data_; }
+
+  T* end() { return data_ + size_; }
+
+  std::size_t size() { return size_; }
+
+  T operator[](std::size_t idx) const { return *(data_ + idx); }
+  T& operator[](std::size_t idx) { return *(data_ + idx); }
+
+ private:
+  rmm::mr::pinned_memory_resource pinned_mr{};
+  T* data_;
+  std::size_t size_;
+};
+
+}  // namespace ML
\ No newline at end of file
diff --git a/cpp/src/dbscan/adjgraph/naive.cuh b/cpp/src/dbscan/adjgraph/naive.cuh
index 6ef2830c7d..f633a2e4c8 100644
--- a/cpp/src/dbscan/adjgraph/naive.cuh
+++ b/cpp/src/dbscan/adjgraph/naive.cuh
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <...>
+#include <cuml/common/pinned_host_vector.hpp>
 #include <...>
 #include <...>
 #include "../common.cuh"
@@ -35,14 +36,14 @@ void launcher(const raft::handle_t& handle,
 {
   Index_ k = 0;
   Index_ N = data.N;
-  std::vector<Index_> host_vd(batch_size + 1);
-  std::vector<Index_> host_adj(((batch_size * N) / 8) + 1);
-  std::vector<Index_> host_ex_scan(batch_size);
+  ML::pinned_host_vector<Index_> host_vd(batch_size + 1);
+  ML::pinned_host_vector<Index_> host_adj(((batch_size * N) / 8) + 1);
+  ML::pinned_host_vector<Index_> host_ex_scan(batch_size);
   raft::update_host((bool*)host_adj.data(), data.adj, batch_size * N, stream);
   raft::update_host(host_vd.data(), data.vd, batch_size + 1, stream);
   CUDA_CHECK(cudaStreamSynchronize(stream));
   size_t adjgraph_size = size_t(host_vd[batch_size]);
-  std::vector<Index_> host_adj_graph(adjgraph_size);
+  ML::pinned_host_vector<Index_> host_adj_graph(adjgraph_size);
   for (Index_ i = 0; i < batch_size; i++) {
     for (Index_ j = 0; j < N; j++) {
       /// TODO: change layout or remove; cf #3414
diff --git a/cpp/src/decisiontree/batched-levelalgo/builder.cuh b/cpp/src/decisiontree/batched-levelalgo/builder.cuh
index 19bebcfa93..2306ac8a17 100644
--- a/cpp/src/decisiontree/batched-levelalgo/builder.cuh
+++ b/cpp/src/decisiontree/batched-levelalgo/builder.cuh
@@ -27,6 +27,7 @@
 #include <...>
 #include <...>
 #include <...>
+#include <cuml/common/pinned_host_vector.hpp>
 #include <...>
 #include <...>
 #include "input.cuh"
@@ -190,7 +191,7 @@ struct Builder {
   const size_t alignValue = 512;
 
   rmm::device_uvector<char> d_buff;
-  std::vector<char> h_buff;
+  ML::pinned_host_vector<char> h_buff;
 
   Builder(const raft::handle_t& handle,
           IdxT treeid,
diff --git a/cpp/src/kmeans/common.cuh b/cpp/src/kmeans/common.cuh
index 8c8df33c67..4a29941724 100644
--- a/cpp/src/kmeans/common.cuh
+++ b/cpp/src/kmeans/common.cuh
@@ -61,6 +61,7 @@
 #include <...>
 #include <...>
 #include <...>
+#include <cuml/common/pinned_host_vector.hpp>
 
 namespace ML {
 
diff --git a/cpp/src/solver/lars_impl.cuh b/cpp/src/solver/lars_impl.cuh
index 3e8c5d2440..9ab45735fd 100644
--- a/cpp/src/solver/lars_impl.cuh
+++ b/cpp/src/solver/lars_impl.cuh
@@ -19,6 +19,7 @@
 #include <...>
 #include <...>
 #include <...>
+#include <cuml/common/pinned_host_vector.hpp>
 #include <...>
 #include <...>
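Editorial note, not part of the original patch: the diff above shows `ML::pinned_host_vector` replacing `std::vector` as a host staging buffer for `raft::update_host`. The sketch below is a minimal, hypothetical usage example of that pattern; the function name `copy_degrees_to_host` and its arguments are made up for illustration, and the example assumes the raft/rmm headers and `CUDA_CHECK` macro that the patched files already rely on.

```cpp
// Hypothetical sketch: ML::pinned_host_vector<T> as a page-locked staging
// buffer for a device-to-host copy, mirroring the DBSCAN pattern above.
#include <cuml/common/pinned_host_vector.hpp>

#include <raft/cudart_utils.h>
#include <rmm/device_uvector.hpp>

void copy_degrees_to_host(const rmm::device_uvector<int>& d_vd, cudaStream_t stream)
{
  // The allocation comes from rmm::mr::pinned_memory_resource, so the
  // cudaMemcpyAsync issued by raft::update_host can be a true async copy.
  ML::pinned_host_vector<int> h_vd(d_vd.size());
  raft::update_host(h_vd.data(), d_vd.data(), d_vd.size(), stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));  // host data is valid only after sync
  // ... consume h_vd[i] on the CPU ...
}
```

One design note on the header as written in the patch: `resize()` obtains a fresh pinned allocation without copying (or releasing) the previous contents, so the class is best suited to buffers that are sized once per use, which is how the call sites in this patch employ it.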