Skip to content

Commit

Permalink
Convert cudaStream_t to rmm::cuda_stream_view (#325)
Browse files Browse the repository at this point in the history
This PR converts all usage of cudaStream_t in cuSpatial to rmm::cuda_stream_view, following on from rapidsai/cudf#6646 and rapidsai/cudf#6648

Also reorders stream parameters to occur before MR parameters in all functions.
  • Loading branch information
harrism authored Nov 24, 2020
1 parent ec8941b commit d4ed2d8
Show file tree
Hide file tree
Showing 26 changed files with 407 additions and 354 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@

## Improvements
- PR #310 Pin cmake policies to cmake 3.17 version
- PR #325 Convert `cudaStream_t` to `rmm::cuda_stream_view`

## Bug Fixes
- PR #320 Fix a quadtree construction bug - needs zero out device_uvector before scatter
- PR #320 Fix quadtree construction bug: zero out `device_uvector` before `scatter`

# cuSpatial 0.16.0 (Date TBD)

Expand Down
34 changes: 16 additions & 18 deletions cpp/benchmarks/synchronization/synchronization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,47 +16,45 @@

#include "synchronization.hpp"

#include <rmm/device_buffer.hpp>
#include <cudf/utilities/error.hpp>

#define RMM_CUDA_ASSERT_OK(expr) \
do { \
cudaError_t const status = (expr); \
assert(cudaSuccess == status); \
} while (0);
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

cuda_event_timer::cuda_event_timer(benchmark::State& state,
bool flush_l2_cache,
cudaStream_t stream)
rmm::cuda_stream_view stream)
: p_state(&state), stream(stream)
{
// flush all of L2$
if (flush_l2_cache) {
int current_device = 0;
RMM_CUDA_TRY(cudaGetDevice(&current_device));
CUDA_TRY(cudaGetDevice(&current_device));

int l2_cache_bytes = 0;
RMM_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device));
CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device));

if (l2_cache_bytes > 0) {
const int memset_value = 0;
rmm::device_buffer l2_cache_buffer(l2_cache_bytes, stream);
RMM_CUDA_TRY(cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream));
CUDA_TRY(
cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream.value()));
}
}

RMM_CUDA_TRY(cudaEventCreate(&start));
RMM_CUDA_TRY(cudaEventCreate(&stop));
RMM_CUDA_TRY(cudaEventRecord(start, stream));
CUDA_TRY(cudaEventCreate(&start));
CUDA_TRY(cudaEventCreate(&stop));
CUDA_TRY(cudaEventRecord(start, stream.value()));
}

cuda_event_timer::~cuda_event_timer()
{
RMM_CUDA_ASSERT_OK(cudaEventRecord(stop, stream));
RMM_CUDA_ASSERT_OK(cudaEventSynchronize(stop));
CUDA_TRY(cudaEventRecord(stop, stream.value()));
CUDA_TRY(cudaEventSynchronize(stop));

float milliseconds = 0.0f;
RMM_CUDA_ASSERT_OK(cudaEventElapsedTime(&milliseconds, start, stop));
CUDA_TRY(cudaEventElapsedTime(&milliseconds, start, stop));
p_state->SetIterationTime(milliseconds / (1000.0f));
RMM_CUDA_ASSERT_OK(cudaEventDestroy(start));
RMM_CUDA_ASSERT_OK(cudaEventDestroy(stop));
CUDA_TRY(cudaEventDestroy(start));
CUDA_TRY(cudaEventDestroy(stop));
}
34 changes: 22 additions & 12 deletions cpp/benchmarks/synchronization/synchronization.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
/**
* @file synchronization.hpp
* @brief This is the header file for `cuda_event_timer`.
*/
**/

/**
* @brief This class serves as a wrapper for using `cudaEvent_t` as the user
Expand All @@ -33,7 +33,7 @@
for (auto _ : state){
cudaStream_t stream = 0;
rmm::cuda_stream_view stream{}; // default stream, could be another stream
// Create (Construct) an object of this class. You HAVE to pass in the
// benchmark::State object you are using. It measures the time from its
Expand All @@ -44,23 +44,29 @@
cuda_event_timer raii(state, true, stream); // flush_l2_cache = true
// Now perform the operations that is to be benchmarked
sample_kernel<<<1, 256, 0, stream>>>(); // Possibly launching a CUDA kernel
sample_kernel<<<1, 256, 0, stream.value()>>>(); // Possibly launching a CUDA kernel
}
}
// Register the function as a benchmark. You will need to set the `UseManualTime()`
// flag in order to use the timer embeded in this class.
// flag in order to use the timer embedded in this class.
BENCHMARK(sample_cuda_benchmark)->UseManualTime();
*/
**/

#pragma once
#ifndef CUDF_BENCH_SYNCHRONIZATION_H
#define CUDF_BENCH_SYNCHRONIZATION_H

// Google Benchmark library
#include <benchmark/benchmark.h>
#include <cuda_runtime_api.h>

#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <driver_types.h>

class cuda_event_timer {
public:
Expand All @@ -73,21 +79,25 @@ class cuda_event_timer {
* @param[in] flush_l2_cache_ whether or not to flush the L2 cache before
* every iteration.
* @param[in] stream_ The CUDA stream we are measuring time on.
*/
cuda_event_timer(benchmark::State& state, bool flush_l2_cache, cudaStream_t stream_ = 0);
**/
cuda_event_timer(benchmark::State& state,
bool flush_l2_cache,
rmm::cuda_stream_view stream = rmm::cuda_stream_default);

// The user will HAVE to provide a benchmark::State object to set
// The user must provide a benchmark::State object to set
// the timer so we disable the default c'tor.
cuda_event_timer() = delete;

// The d'tor stops the timer and performs a synchroniazation.
// The d'tor stops the timer and performs a synchronization.
// Time of the benchmark::State object provided to the c'tor
// will be set to the value given by `cudaEventElapsedTime`.
~cuda_event_timer();

private:
cudaEvent_t start;
cudaEvent_t stop;
cudaStream_t stream;
rmm::cuda_stream_view stream;
benchmark::State* p_state;
};

#endif
26 changes: 17 additions & 9 deletions cpp/include/cuspatial/cubic_spline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

#include <cudf/column/column.hpp>
#include <cudf/table/table.hpp>
#include <cudf/types.hpp>

#include <memory>

namespace cuspatial {
Expand All @@ -43,16 +45,19 @@ namespace cuspatial {
* @param[in] offsets the exclusive scan of the spline sizes, prefixed by
* 0. For example, for 3 splines of 5 vertices each, the offsets input array
* is {0, 5, 10, 15}.
* @param[in] mr The memory resource to use for allocating output
*
* @return cudf::table_view of coefficients for spline interpolation. The size
 * of the table is ((M-n), 4) where M is `t.size()` and n is
* `ids.size()-1`.
**/
std::unique_ptr<cudf::column> cubicspline_interpolate(cudf::column_view const& query_points,
cudf::column_view const& spline_ids,
cudf::column_view const& offsets,
cudf::column_view const& source_points,
cudf::table_view const& coefficients);
std::unique_ptr<cudf::column> cubicspline_interpolate(
cudf::column_view const& query_points,
cudf::column_view const& spline_ids,
cudf::column_view const& offsets,
cudf::column_view const& source_points,
cudf::table_view const& coefficients,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Compute cubic interpolations of a set of points based on their
Expand All @@ -69,11 +74,14 @@ std::unique_ptr<cudf::column> cubicspline_interpolate(cudf::column_view const& q
* identify which specific spline a given query_point is interpolated with.
* @param[in] coefficients table of spline coefficients produced by
* cubicspline_coefficients.
* @param[in] mr The memory resource to use for allocating output
*
* @return cudf::column `y` coordinates interpolated from `x` and `coefs`.
**/
std::unique_ptr<cudf::table> cubicspline_coefficients(cudf::column_view const& t,
cudf::column_view const& y,
cudf::column_view const& ids,
cudf::column_view const& offsets);
std::unique_ptr<cudf::table> cubicspline_coefficients(
cudf::column_view const& t,
cudf::column_view const& y,
cudf::column_view const& ids,
cudf::column_view const& offsets,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
} // namespace cuspatial
39 changes: 21 additions & 18 deletions cpp/src/indexing/construction/detail/phase_1.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <cudf/utilities/type_dispatcher.hpp>

#include <rmm/thrust_rmm_allocator.h>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <thrust/functional.h>
Expand Down Expand Up @@ -57,11 +58,11 @@ compute_point_keys_and_sorted_indices(cudf::column_view const &x,
T y_max,
T scale,
int8_t max_depth,
rmm::mr::device_memory_resource *mr,
cudaStream_t stream)
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource *mr)
{
rmm::device_uvector<uint32_t> keys(x.size(), stream);
thrust::transform(rmm::exec_policy(stream)->on(stream),
thrust::transform(rmm::exec_policy(stream)->on(stream.value()),
make_zip_iterator(x.begin<T>(), y.begin<T>()),
make_zip_iterator(x.begin<T>(), y.begin<T>()) + x.size(),
keys.begin(),
Expand All @@ -77,12 +78,12 @@ compute_point_keys_and_sorted_indices(cudf::column_view const &x,

auto indices = make_fixed_width_column<uint32_t>(keys.size(), stream, mr);

thrust::sequence(rmm::exec_policy(stream)->on(stream),
thrust::sequence(rmm::exec_policy(stream)->on(stream.value()),
indices->mutable_view().begin<uint32_t>(),
indices->mutable_view().end<uint32_t>());

// Sort the codes and point indices
thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream),
thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream.value()),
keys.begin(),
keys.end(),
indices->mutable_view().begin<int32_t>());
Expand All @@ -106,9 +107,9 @@ inline cudf::size_type build_tree_level(InputIterator1 keys_begin,
OutputIterator1 keys_out,
OutputIterator2 vals_out,
BinaryOp binary_op,
cudaStream_t stream)
rmm::cuda_stream_view stream)
{
auto result = thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream),
auto result = thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream.value()),
keys_begin,
keys_end,
vals_in,
Expand All @@ -133,7 +134,7 @@ build_tree_levels(int8_t max_depth,
KeysIterator keys_begin,
ValsIterator quad_point_count_begin,
ValsIterator quad_child_count_begin,
cudaStream_t stream)
rmm::cuda_stream_view stream)
{
// begin/end offsets
cudf::size_type begin{0};
Expand Down Expand Up @@ -193,7 +194,7 @@ reverse_tree_levels(rmm::device_uvector<uint32_t> const &quad_keys_in,
std::vector<cudf::size_type> const &begin_pos,
std::vector<cudf::size_type> const &end_pos,
int8_t max_depth,
cudaStream_t stream)
rmm::cuda_stream_view stream)
{
rmm::device_uvector<uint32_t> quad_keys(quad_keys_in.size(), stream);
rmm::device_uvector<uint8_t> quad_levels(quad_keys_in.size(), stream);
Expand All @@ -205,19 +206,19 @@ reverse_tree_levels(rmm::device_uvector<uint32_t> const &quad_keys_in,
cudf::size_type level_end = end_pos[level];
cudf::size_type level_begin = begin_pos[level];
cudf::size_type num_quads = level_end - level_begin;
thrust::fill(rmm::exec_policy(stream)->on(stream),
thrust::fill(rmm::exec_policy(stream)->on(stream.value()),
quad_levels.begin() + offset,
quad_levels.begin() + offset + num_quads,
level);
thrust::copy(rmm::exec_policy(stream)->on(stream),
thrust::copy(rmm::exec_policy(stream)->on(stream.value()),
quad_keys_in.begin() + level_begin,
quad_keys_in.begin() + level_end,
quad_keys.begin() + offset);
thrust::copy(rmm::exec_policy(stream)->on(stream),
thrust::copy(rmm::exec_policy(stream)->on(stream.value()),
quad_point_count_in.begin() + level_begin,
quad_point_count_in.begin() + level_end,
quad_point_count.begin() + offset);
thrust::copy(rmm::exec_policy(stream)->on(stream),
thrust::copy(rmm::exec_policy(stream)->on(stream.value()),
quad_child_count_in.begin() + level_begin,
quad_child_count_in.begin() + level_end,
quad_child_count.begin() + offset);
Expand Down Expand Up @@ -255,15 +256,15 @@ inline auto make_full_levels(cudf::column_view const &x,
T scale,
int8_t max_depth,
cudf::size_type min_size,
rmm::mr::device_memory_resource *mr,
cudaStream_t stream)
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource *mr)
{
// Compute point keys and sort into bottom-level quadrants
// (i.e. quads at level `max_depth - 1`)

// Compute Morton code (z-order) keys for each point
auto keys_and_indices = compute_point_keys_and_sorted_indices<T>(
x, y, x_min, x_max, y_min, y_max, scale, max_depth, mr, stream);
x, y, x_min, x_max, y_min, y_max, scale, max_depth, stream, mr);

auto &point_keys = keys_and_indices.first;
auto &point_indices = keys_and_indices.second;
Expand Down Expand Up @@ -291,8 +292,10 @@ inline auto make_full_levels(cudf::column_view const &x,
quad_child_count.resize(num_bottom_quads * (max_depth + 1), stream);

// Zero out the quad_child_count vector because we're reusing the point_keys vector
thrust::fill(
rmm::exec_policy(stream)->on(stream), quad_child_count.begin(), quad_child_count.end(), 0);
thrust::fill(rmm::exec_policy(stream)->on(stream.value()),
quad_child_count.begin(),
quad_child_count.end(),
0);

//
// Compute "full" quads for the tree at each level. Starting from the quadrant
Expand Down
Loading

0 comments on commit d4ed2d8

Please sign in to comment.