One cudaStream_t instance per raft::handle_t #291

Merged: 58 commits, merged on Dec 13, 2021.

The diff below shows the changes from 9 of the 58 commits.

Commits (58)
- dc8ce65: checking in with handle changes (divyegala, Jul 12, 2021)
- e9c88df: working handle cpp tests (divyegala, Jul 14, 2021)
- ec5ec5d: working python handle (divyegala, Jul 14, 2021)
- 347b702: Merge branch 'branch-21.08' of https://github.com/rapidsai/raft into … (divyegala, Jul 14, 2021)
- 8b27ba2: styling changes (divyegala, Jul 14, 2021)
- 102ad4e: removing unnecessary TearDown from matrix gtest (divyegala, Jul 14, 2021)
- 6ddb1ff: renaming wrong variable name (divyegala, Jul 14, 2021)
- 52e775e: better doc for handle constructor according to review (divyegala, Jul 15, 2021)
- 54c67d4: review feedback (divyegala, Jul 16, 2021)
- aedfa52: adjusting default handle stream to per thread (divyegala, Jul 19, 2021)
- a502087: adjusting doc (divyegala, Jul 19, 2021)
- 8045f16: handle on knn detail API (divyegala, Jul 19, 2021)
- 8b2ab71: convenience function on handle to get stream from pool (divyegala, Jul 19, 2021)
- fa320dd: correcting build (divyegala, Jul 20, 2021)
- c25ab19: stream from pool at index (divyegala, Jul 20, 2021)
- 1ccc5cc: removing getting stream from pool functionality on handle (divyegala, Jul 20, 2021)
- 240fcf6: passing cpp tests (divyegala, Sep 23, 2021)
- 522c571: per-thread stream tests passing (divyegala, Sep 23, 2021)
- 89a23f6: solving pos argument (divyegala, Oct 4, 2021)
- c24ecc8: merge upstream (divyegala, Oct 4, 2021)
- e8a7856: passing tests (divyegala, Oct 4, 2021)
- 0c9871a: fix for failures in CI (divyegala, Oct 4, 2021)
- 5ab4f7c: Merge branch 'branch-21.12' of https://github.com/rapidsai/raft into … (divyegala, Oct 13, 2021)
- 830db09: review comments (divyegala, Oct 14, 2021)
- 2cf1e51: merging upstream (divyegala, Oct 18, 2021)
- 9a20bbf: resolving bad merge (divyegala, Oct 18, 2021)
- 7288978: changing sync method from cdef to def (divyegala, Oct 22, 2021)
- ed6e4d8: removing cdef sync from handle pxd (divyegala, Oct 28, 2021)
- 9e83e9a: Merge branch 'branch-21.12' of https://github.com/rapidsai/raft into … (divyegala, Oct 28, 2021)
- 865fa7a: trying legacy stream (divyegala, Nov 9, 2021)
- 2044fb2: Merge remote-tracking branch 'upstream/branch-21.12' into imp-21.10-h… (divyegala, Nov 9, 2021)
- 8bdbf81: back to default stream per thread (divyegala, Nov 16, 2021)
- fe05b09: merging branch-22.02 (divyegala, Nov 16, 2021)
- d243eca: fixing bad merge (divyegala, Nov 16, 2021)
- 553453f: merge branch-21.12 (divyegala, Nov 16, 2021)
- 5287e6e: Merge remote-tracking branch 'upstream/branch-21.12' into imp-21.10-h… (divyegala, Nov 17, 2021)
- 2e60f56: Merge remote-tracking branch 'upstream/branch-22.02' into imp-21.10-h… (divyegala, Nov 17, 2021)
- 480ba37: correcting legacy to per-thread (divyegala, Nov 17, 2021)
- 1877061: Merge remote-tracking branch 'upstream/branch-21.12' into imp-21.10-h… (divyegala, Nov 22, 2021)
- 1be9fc7: Merge remote-tracking branch 'upstream/branch-22.02' into imp-21.10-h… (divyegala, Nov 22, 2021)
- 0efbd91: Merge remote-tracking branch 'upstream/branch-21.12' into imp-21.10-h… (divyegala, Nov 22, 2021)
- ceac531: Merge remote-tracking branch 'upstream/branch-22.02' into imp-21.10-h… (divyegala, Nov 22, 2021)
- 239a887: merging upstream (divyegala, Dec 7, 2021)
- 41d0694: merging upstream (divyegala, Dec 7, 2021)
- a89ab29: Merge remote-tracking branch 'upstream/branch-22.02' into imp-21.10-h… (divyegala, Dec 9, 2021)
- d106d6e: fixing compiler error (divyegala, Dec 9, 2021)
- daadd95: merging upstream (divyegala, Dec 9, 2021)
- 7051e39: Reverting fused l2 changes. cuml CI still seems to be broken (cjnolet, Dec 10, 2021)
- 6bb7eeb: Fixing style (cjnolet, Dec 10, 2021)
- 3322ebe: merging corey's fused l2 knn bug revert (divyegala, Dec 10, 2021)
- cbb0540: fixing macro name (divyegala, Dec 10, 2021)
- ea97177: fixing typo with curly brace (divyegala, Dec 10, 2021)
- 8338dcc: Merge remote-tracking branch 'upstream/branch-22.02' into imp-21.10-h… (divyegala, Dec 10, 2021)
- 9659249: Adding no throw macro variants (cjnolet, Dec 10, 2021)
- d12db1c: Fixing typo (cjnolet, Dec 10, 2021)
- 6186ead: pulling corey's macro updates (divyegala, Dec 10, 2021)
- 5ed4289: merging upstream (divyegala, Dec 10, 2021)
- e97a938: merging upstream (divyegala, Dec 13, 2021)
5 changes: 4 additions & 1 deletion cpp/include/raft/comms/test.hpp
@@ -19,6 +19,8 @@
#include <raft/comms/comms.hpp>
#include <raft/handle.hpp>
#include <raft/mr/device/buffer.hpp>
#include <rmm/cuda_stream_pool.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_scalar.hpp>
#include <rmm/device_uvector.hpp>

@@ -513,7 +515,8 @@ bool test_commsplit(const handle_t &h, int n_colors) {
int color = rank % n_colors;
int key = rank / n_colors;

handle_t new_handle(1);
rmm::cuda_stream_pool stream_pool(1);
handle_t new_handle(rmm::cuda_stream_default, stream_pool);
auto shared_comm =
std::make_shared<comms_t>(communicator.comm_split(color, key));
new_handle.set_comms(shared_comm);
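For context, a minimal sketch of the construction pattern this hunk relies on (the function and variable names are illustrative; the constructor signature is the one introduced in handle.hpp below). In this revision the handle stores a reference to the pool, so the caller-owned pool must outlive the handle.

```cpp
#include <raft/handle.hpp>
#include <rmm/cuda_stream_pool.hpp>
#include <rmm/cuda_stream_view.hpp>

void construct_handle_sketch() {
  // Previously, handle_t h(1); asked the handle to create one worker stream.
  // Now the caller owns both the main stream and the worker stream pool.
  rmm::cuda_stream_pool stream_pool(1);                     // one worker stream
  raft::handle_t h(rmm::cuda_stream_default, stream_pool);  // main stream + pool
  // The pool is held by reference, so stream_pool must outlive h.
}
```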
120 changes: 61 additions & 59 deletions cpp/include/raft/handle.hpp
@@ -39,6 +39,7 @@
#include <raft/mr/device/allocator.hpp>
#include <raft/mr/host/allocator.hpp>
#include <rmm/cuda_stream_pool.hpp>
#include <rmm/cuda_stream_view.hpp>
#include "cudart_utils.h"

namespace raft {
@@ -52,56 +53,47 @@ class handle_t {
static constexpr int kNumDefaultWorkerStreams = 0;

public:
// delete copy/move constructors and assignment operators as
// copying and moving underlying resources is unsafe
handle_t(const handle_t&) = delete;
handle_t& operator=(const handle_t&) = delete;
handle_t(handle_t&&) = delete;
handle_t& operator=(handle_t&&) = delete;

/**
* @brief Construct a handle with the specified number of worker streams
* @brief Construct a handle with a stream view and stream pool
*
* @param[in] n_streams number worker streams to be created
* @param[in] stream the default stream (which has the default value of nullptr if unspecified)
* @param[in] stream_pool the stream pool used (which has default pool of size 0 if unspecified)
*/
explicit handle_t(int n_streams = kNumDefaultWorkerStreams)
handle_t(rmm::cuda_stream_view stream = {},
const rmm::cuda_stream_pool& stream_pool = rmm::cuda_stream_pool{0})
: dev_id_([]() -> int {
int cur_dev = -1;
CUDA_CHECK(cudaGetDevice(&cur_dev));
return cur_dev;
}()),
streams_(n_streams),
device_allocator_(std::make_shared<mr::device::default_allocator>()),
host_allocator_(std::make_shared<mr::host::default_allocator>()) {
create_resources();
}

/**
* @brief Construct a light handle copy from another
* user stream, cuda handles, comms and worker pool are not copied
* The user_stream of the returned handle is set to the specified stream
* of the other handle worker pool
* @param[in] stream_id stream id in `other` worker streams
* to be set as user stream in the constructed handle
* @param[in] n_streams number worker streams to be created
*/
handle_t(const handle_t& other, int stream_id,
int n_streams = kNumDefaultWorkerStreams)
: dev_id_(other.get_device()), streams_(n_streams) {
RAFT_EXPECTS(
other.get_num_internal_streams() > 0,
"ERROR: the main handle must have at least one worker stream\n");
prop_ = other.get_device_properties();
device_prop_initialized_ = true;
device_allocator_ = other.get_device_allocator();
host_allocator_ = other.get_host_allocator();
host_allocator_(std::make_shared<mr::host::default_allocator>()),
stream_view_(stream),
stream_pool_(stream_pool) {
create_resources();
set_stream(other.get_internal_stream(stream_id));
}

/** Destroys all held-up resources */
virtual ~handle_t() { destroy_resources(); }

int get_device() const { return dev_id_; }

void set_stream(cudaStream_t stream) { user_stream_ = stream; }
cudaStream_t get_stream() const { return user_stream_; }
rmm::cuda_stream_view get_stream_view() const {
return rmm::cuda_stream_view(user_stream_);
}
/**
* @brief returns main stream on the handle
*/
const rmm::cuda_stream_view& get_stream() const { return stream_view_; }

/**
* @brief returns stream pool on the handle, could be 0 sized
*/
const rmm::cuda_stream_pool& get_stream_pool() const { return stream_pool_; }

void set_device_allocator(std::shared_ptr<mr::device::allocator> allocator) {
device_allocator_ = allocator;
@@ -121,6 +113,7 @@
std::lock_guard<std::mutex> _(mutex_);
if (!cublas_initialized_) {
CUBLAS_CHECK(cublasCreate(&cublas_handle_));
CUBLAS_CHECK(cublasSetStream(cublas_handle_, stream_view_));
cublas_initialized_ = true;
}
return cublas_handle_;
@@ -130,6 +123,7 @@
std::lock_guard<std::mutex> _(mutex_);
if (!cusolver_dn_initialized_) {
CUSOLVER_CHECK(cusolverDnCreate(&cusolver_dn_handle_));
CUSOLVER_CHECK(cusolverDnSetStream(cusolver_dn_handle_, stream_view_));
cusolver_dn_initialized_ = true;
}
return cusolver_dn_handle_;
@@ -139,6 +133,7 @@
std::lock_guard<std::mutex> _(mutex_);
if (!cusolver_sp_initialized_) {
CUSOLVER_CHECK(cusolverSpCreate(&cusolver_sp_handle_));
CUSOLVER_CHECK(cusolverSpSetStream(cusolver_sp_handle_, stream_view_));
cusolver_sp_initialized_ = true;
}
return cusolver_sp_handle_;
@@ -148,40 +143,44 @@
std::lock_guard<std::mutex> _(mutex_);
if (!cusparse_initialized_) {
CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_));
CUSPARSE_CHECK(cusparseSetStream(cusparse_handle_, stream_view_));
cusparse_initialized_ = true;
}
return cusparse_handle_;
}

// legacy compatibility for cuML
cudaStream_t get_internal_stream(int sid) const {
return streams_.get_stream(sid).value();
}
// new accessor return rmm::cuda_stream_view
rmm::cuda_stream_view get_internal_stream_view(int sid) const {
return streams_.get_stream(sid);
}
/**
* @brief synchronize main stream on the handle
*/
void sync_stream() const { stream_view_.synchronize(); }

int get_num_internal_streams() const { return streams_.get_pool_size(); }
std::vector<cudaStream_t> get_internal_streams() const {
std::vector<cudaStream_t> int_streams_vec;
for (int i = 0; i < get_num_internal_streams(); i++) {
int_streams_vec.push_back(get_internal_stream(i));
/**
* @brief synchronize the stream pool on the handle
*/
void sync_stream_pool() const {
for (std::size_t i = 0; i < stream_pool_.get_pool_size(); i++) {
stream_pool_.get_stream(i).synchronize();
}
return int_streams_vec;
}

void wait_on_user_stream() const {
CUDA_CHECK(cudaEventRecord(event_, user_stream_));
for (int i = 0; i < get_num_internal_streams(); i++) {
CUDA_CHECK(cudaStreamWaitEvent(get_internal_stream(i), event_, 0));
/**
* @brief synchronize subset of stream pool
*
* @param[in] stream_indices the indices of the streams in the stream pool to synchronize
*/
void sync_stream_pool(const std::vector<std::size_t> stream_indices) const {
for (const auto& stream_index : stream_indices) {
stream_pool_.get_stream(stream_index).synchronize();
}
}

void wait_on_internal_streams() const {
for (int i = 0; i < get_num_internal_streams(); i++) {
CUDA_CHECK(cudaEventRecord(event_, get_internal_stream(i)));
CUDA_CHECK(cudaStreamWaitEvent(user_stream_, event_, 0));
/**
* @brief ask stream pool to wait on last event in main stream
*/
void wait_stream_pool_on_stream() const {
CUDA_CHECK(cudaEventRecord(event_, stream_view_));
for (std::size_t i = 0; i < stream_pool_.get_pool_size(); i++) {
CUDA_CHECK(cudaStreamWaitEvent(stream_pool_.get_stream(i), event_, 0));
}
}

@@ -227,7 +226,6 @@
std::unordered_map<std::string, std::shared_ptr<comms::comms_t>> subcomms_;

const int dev_id_;
rmm::cuda_stream_pool streams_{0};
mutable cublasHandle_t cublas_handle_;
mutable bool cublas_initialized_{false};
mutable cusolverDnHandle_t cusolver_dn_handle_;
Expand All @@ -238,7 +236,8 @@ class handle_t {
mutable bool cusparse_initialized_{false};
std::shared_ptr<mr::device::allocator> device_allocator_;
std::shared_ptr<mr::host::allocator> host_allocator_;
cudaStream_t user_stream_{nullptr};
rmm::cuda_stream_view stream_view_;
const rmm::cuda_stream_pool& stream_pool_;
cudaEvent_t event_;
mutable cudaDeviceProp prop_;
mutable bool device_prop_initialized_{false};
@@ -277,9 +276,12 @@
class stream_syncer {
public:
explicit stream_syncer(const handle_t& handle) : handle_(handle) {
handle_.wait_on_user_stream();
handle_.sync_stream();
}
~stream_syncer() {
handle_.wait_stream_pool_on_stream();
handle_.sync_stream_pool();
}
~stream_syncer() { handle_.wait_on_internal_streams(); }

stream_syncer(const stream_syncer& other) = delete;
stream_syncer& operator=(const stream_syncer& other) = delete;
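Taken together, the renamed synchronization helpers support the usual fork-join pattern. Below is a hedged sketch (not code from this PR) of how a caller might use them; the function name and the kernel mentioned in the comment are placeholders. The updated stream_syncer follows the same order: it synchronizes the main stream on construction and, on destruction, makes the pool wait on the main stream and then synchronizes the pool.

```cpp
#include <cstddef>
#include <raft/handle.hpp>
#include <rmm/cuda_stream_pool.hpp>
#include <rmm/cuda_stream_view.hpp>

// Fork-join sketch using the renamed handle API (illustrative only).
void fork_join_sketch(const raft::handle_t& handle) {
  const rmm::cuda_stream_pool& pool = handle.get_stream_pool();

  // ... enqueue setup work on handle.get_stream() here ...

  // Fork: every pool stream waits for work already enqueued on the main stream.
  handle.wait_stream_pool_on_stream();
  for (std::size_t i = 0; i < pool.get_pool_size(); ++i) {
    rmm::cuda_stream_view s = pool.get_stream(i);
    (void)s;  // pass s.value() to kernel launches, e.g. my_kernel<<<g, b, 0, s.value()>>>(...)
  }

  // Join: wait for the main stream, then for every stream in the pool
  // (or only a subset, via the overload taking indices).
  handle.sync_stream();
  handle.sync_stream_pool();
  // handle.sync_stream_pool({0});  // subset variant
}
```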
18 changes: 10 additions & 8 deletions cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
@@ -18,6 +18,7 @@

#include <raft/cudart_utils.h>
#include <raft/cuda_utils.cuh>
#include <rmm/cuda_stream_pool.hpp>

#include <faiss/gpu/GpuDistance.h>
#include <faiss/gpu/GpuResources.h>
@@ -200,9 +201,8 @@ void brute_force_knn_impl(std::vector<float *> &input, std::vector<int> &sizes,
int64_t *res_I, float *res_D, IntType k,
std::shared_ptr<deviceAllocator> allocator,
cudaStream_t userStream,
cudaStream_t *internalStreams = nullptr,
int n_int_streams = 0, bool rowMajorIndex = true,
bool rowMajorQuery = true,
const rmm::cuda_stream_pool &internalStreams,
bool rowMajorIndex = true, bool rowMajorQuery = true,
std::vector<int64_t> *translations = nullptr,
raft::distance::DistanceType metric =
raft::distance::DistanceType::L2Expanded,
@@ -263,14 +263,16 @@ void brute_force_knn_impl(std::vector<float *> &input, std::vector<int> &sizes,
}

// Sync user stream only if using other streams to parallelize query
if (n_int_streams > 0) CUDA_CHECK(cudaStreamSynchronize(userStream));
auto n_internal_streams = internalStreams.get_pool_size();
if (n_internal_streams > 0) CUDA_CHECK(cudaStreamSynchronize(userStream));

for (size_t i = 0; i < input.size(); i++) {
float *out_d_ptr = out_D + (i * k * n);
int64_t *out_i_ptr = out_I + (i * k * n);

cudaStream_t stream =
raft::select_stream(userStream, internalStreams, n_int_streams, i);
cudaStream_t stream = n_internal_streams > 0
? internalStreams.get_stream().value()
: userStream;

switch (metric) {
case raft::distance::DistanceType::Haversine:
@@ -318,8 +320,8 @@ void brute_force_knn_impl(std::vector<float *> &input, std::vector<int> &sizes,
// Sync internal streams if used. We don't need to
// sync the user stream because we'll already have
// fully serial execution.
for (int i = 0; i < n_int_streams; i++) {
CUDA_CHECK(cudaStreamSynchronize(internalStreams[i]));
for (std::size_t i = 0; i < internalStreams.get_pool_size(); i++) {
CUDA_CHECK(cudaStreamSynchronize(internalStreams.get_stream(i)));
}

if (input.size() > 1 || translations != nullptr) {
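The old raft::select_stream helper, which took a raw stream array plus a count, is replaced here by querying the pool directly. A small sketch of the selection logic used in the loop above, under my reading of rmm::cuda_stream_pool (get_stream() with no argument hands out pool streams round-robin); pick_stream is an illustrative name, not part of this PR.

```cpp
#include <cuda_runtime_api.h>
#include <rmm/cuda_stream_pool.hpp>

// Pick a stream for the next chunk of work: round-robin over the pool when it
// is non-empty, otherwise fall back to the caller's stream.
inline cudaStream_t pick_stream(const rmm::cuda_stream_pool& pool,
                                cudaStream_t user_stream) {
  return pool.get_pool_size() > 0 ? pool.get_stream().value() : user_stream;
}
```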
8 changes: 3 additions & 5 deletions cpp/include/raft/spatial/knn/knn.hpp
@@ -69,13 +69,11 @@ inline void brute_force_knn(
ASSERT(input.size() == sizes.size(),
"input and sizes vectors must be the same size");

std::vector<cudaStream_t> int_streams = handle.get_internal_streams();

detail::brute_force_knn_impl(input, sizes, D, search_items, n, res_I, res_D,
k, handle.get_device_allocator(),
handle.get_stream(), int_streams.data(),
handle.get_num_internal_streams(), rowMajorIndex,
rowMajorQuery, translations, metric, metric_arg);
handle.get_stream(), handle.get_stream_pool(),
rowMajorIndex, rowMajorQuery, translations,
metric, metric_arg);
}

} // namespace knn
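A hedged usage note: to let brute_force_knn spread the per-partition queries across streams, the handle passed in needs a non-empty stream pool; with the default zero-sized pool everything runs on the handle's main stream. A sketch (the call itself is left as a comment because the full public argument list is not shown in this hunk):

```cpp
#include <raft/handle.hpp>
#include <rmm/cuda_stream_pool.hpp>
#include <rmm/cuda_stream_view.hpp>

void knn_handle_sketch() {
  rmm::cuda_stream_pool pool(4);  // e.g. one stream per index partition
  raft::handle_t handle(rmm::cuda_stream_default, pool);
  // raft::spatial::knn::brute_force_knn(handle, /* inputs, sizes, k, ... */);
}
```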
1 change: 0 additions & 1 deletion cpp/test/cluster_solvers.cu
@@ -59,7 +59,6 @@ TEST(Raft, ModularitySolvers) {
using value_type = double;

handle_t h;
ASSERT_EQ(0, h.get_num_internal_streams());
ASSERT_EQ(0, h.get_device());

index_type neigvs{10};
2 changes: 0 additions & 2 deletions cpp/test/eigen_solvers.cu
@@ -29,7 +29,6 @@ TEST(Raft, EigenSolvers) {
using value_type = double;

handle_t h;
ASSERT_EQ(0, h.get_num_internal_streams());
ASSERT_EQ(0, h.get_device());

index_type* ro{nullptr};
@@ -73,7 +72,6 @@ TEST(Raft, SpectralSolvers) {
using value_type = double;

handle_t h;
ASSERT_EQ(0, h.get_num_internal_streams());
ASSERT_EQ(0, h.get_device());

index_type neigvs{10};
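The assertions removed in these test files checked the old get_num_internal_streams() accessor. If a test still wants an equivalent check against the new API, a sketch along these lines should work (assuming the constructor and get_stream_pool() accessor introduced in handle.hpp above; the test name is illustrative):

```cpp
#include <gtest/gtest.h>
#include <raft/handle.hpp>
#include <rmm/cuda_stream_pool.hpp>
#include <rmm/cuda_stream_view.hpp>

TEST(Raft, HandleExposesCallerStreamPool) {
  rmm::cuda_stream_pool pool(2);
  raft::handle_t h(rmm::cuda_stream_default, pool);
  EXPECT_EQ(2u, h.get_stream_pool().get_pool_size());
  EXPECT_EQ(0, h.get_device());
}
```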