Skip to content

Commit

Permalink
Replace raw streams with rmm::cuda_stream_view (part 1) (#6646)
Browse files Browse the repository at this point in the history
Converting libcudf to use `rmm::cuda_stream_view` will require a LOT of changes, so I'm splitting it into multiple PRs to ease reviewing. This is the first PR in the series. This series of PRs will

 - Replace usage of `cudaStream_t` with `rmm::cuda_stream_view`
 - Replace usage of `0` or `nullptr` as a stream identifier with `rmm::cuda_stream_default`
 - Ensure all APIs always order the stream parameter *before* the memory resource parameter. #5119

This first PR converts:
 - column.hpp (and source)
 - column_device_view.cuh
 - copying.hpp (and source) : moves functions that had streams in public APIs to `namespace detail` and adds streamless public versions.
 - null_mask.hpp (and source) : moves functions that had streams in public APIs to `namespace detail` and adds streamless public versions.
 - AST (transform)
 - Usages of the above APIs in other source files
 - Some benchmarks

Contributes to #6645 and #5119

~Depends on #6732.~
  • Loading branch information
harrism authored Nov 13, 2020
1 parent 3fc8142 commit dea44d9
Show file tree
Hide file tree
Showing 94 changed files with 1,129 additions and 716 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
- PR #6608 Improve subword tokenizer docs
- PR #6610 Add ability to set scalar values in `cudf.DataFrame`
- PR #6612 Update JNI to new RMM cuda_stream_view API
- PR #6646 Replace `cudaStream_t` with `rmm::cuda_stream_view` (part 1)
- PR #6579 Update scatter APIs to use reference wrapper / const scalar
- PR #6614 Add support for conversion to Pandas nullable dtypes and fix related issue in `cudf.to_json`
- PR #6622 Update `to_pandas` api docs
Expand Down
10 changes: 6 additions & 4 deletions cpp/benchmarks/common/generate_benchmark_input.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

#include <future>
Expand Down Expand Up @@ -296,9 +297,9 @@ std::unique_ptr<cudf::column> create_random_column(data_profile const& profile,
return std::make_unique<cudf::column>(
cudf::data_type{cudf::type_to_id<T>()},
num_rows,
rmm::device_buffer(data.data(), num_rows * sizeof(stored_Type), cudaStream_t(0)),
rmm::device_buffer(data.data(), num_rows * sizeof(stored_Type), rmm::cuda_stream_default),
rmm::device_buffer(
null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), cudaStream_t(0)));
null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), rmm::cuda_stream_default));
}

/**
Expand Down Expand Up @@ -483,15 +484,16 @@ std::unique_ptr<cudf::column> create_random_column<cudf::list_view>(data_profile
auto offsets_column = std::make_unique<cudf::column>(
cudf::data_type{cudf::type_id::INT32},
offsets.size(),
rmm::device_buffer(offsets.data(), offsets.size() * sizeof(int32_t), cudaStream_t(0)));
rmm::device_buffer(
offsets.data(), offsets.size() * sizeof(int32_t), rmm::cuda_stream_default));

list_column = cudf::make_lists_column(
num_rows,
std::move(offsets_column),
std::move(current_child_column),
cudf::UNKNOWN_NULL_COUNT,
rmm::device_buffer(
null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), cudaStream_t(0)));
null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), rmm::cuda_stream_default));
}
return list_column; // return the top-level column
}
Expand Down
44 changes: 27 additions & 17 deletions cpp/benchmarks/copying/shift_benchmark.cu
Original file line number Diff line number Diff line change
@@ -1,30 +1,40 @@
#include <benchmark/benchmark.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/copying.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <memory>

template <typename T, typename ScalarType = cudf::scalar_type_t<T>>
std::unique_ptr<cudf::scalar> make_scalar(
cudaStream_t stream = 0,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
auto s = new ScalarType(0, false, stream, mr);
return std::unique_ptr<cudf::scalar>(s);
}
#include <benchmark/benchmark.h>

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>

#include <memory>

template <typename T, typename ScalarType = cudf::scalar_type_t<T>>
std::unique_ptr<cudf::scalar> make_scalar(
T value,
cudaStream_t stream = 0,
T value = 0,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
auto s = new ScalarType(value, true, stream, mr);
Expand Down
11 changes: 6 additions & 5 deletions cpp/benchmarks/null_mask/set_null_mask_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,13 @@
* limitations under the License.
*/

#include <benchmark/benchmark.h>
#include "../fixture/benchmark_fixture.hpp"
#include "../synchronization/synchronization.hpp"
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/null_mask.hpp>

#include <benchmark/benchmark.h>

class SetNullmask : public cudf::benchmark {
};

Expand All @@ -31,7 +32,7 @@ void BM_setnullmask(benchmark::State& state)

for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
cudf::set_null_mask(static_cast<cudf::bitmask_type*>(mask.data()), begin, end, true, 0);
cudf::set_null_mask(static_cast<cudf::bitmask_type*>(mask.data()), begin, end, true);
}

state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * size / 8);
Expand All @@ -44,4 +45,4 @@ void BM_setnullmask(benchmark::State& state)
->Range(1 << 10, 1 << 30) \
->UseManualTime();

NBM_BENCHMARK_DEFINE(SetNullMaskKernel);
NBM_BENCHMARK_DEFINE(SetNullMaskKernel);
6 changes: 3 additions & 3 deletions cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu
Original file line number Diff line number Diff line change
Expand Up @@ -90,13 +90,13 @@ struct ColumnHandle {
template <typename ColumnType>
void operator()(mutable_column_device_view source_column,
int work_per_thread,
cudaStream_t stream = 0)
rmm::cuda_stream_view stream = rmm::cuda_stream_default)
{
cudf::detail::grid_1d grid_config{source_column.size(), block_size};
int grid_size = grid_config.num_blocks;
// Launch the kernel.
host_dispatching_kernel<functor_type, ColumnType>
<<<grid_size, block_size, 0, stream>>>(source_column);
<<<grid_size, block_size, 0, stream.value()>>>(source_column);
}
};

Expand Down Expand Up @@ -160,7 +160,7 @@ void launch_kernel(mutable_table_view input, T** d_ptr, int work_per_thread)
}

template <class TypeParam, FunctorType functor_type, DispatchingType dispatching_type>
void type_dispatcher_benchmark(benchmark::State& state)
void type_dispatcher_benchmark(::benchmark::State& state)
{
const cudf::size_type source_size = static_cast<cudf::size_type>(state.range(1));

Expand Down
4 changes: 2 additions & 2 deletions cpp/docs/DOCUMENTATION.md
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ You can use the `@copydoc` tag to avoid duplicating the comment block for a func
*/
```

Also, `@copydoc` is useful when documenting a `detail` function that differs only by the `cudaStream_t` parameter.
Also, `@copydoc` is useful when documenting a `detail` function that differs only by the `stream` parameter.

```c++
/**
Expand All @@ -235,7 +235,7 @@ Also, `@copydoc` is useful when documenting a `detail` function that differs onl
*/
std::vector<size_type> segmented_count_set_bits(bitmask_type const* bitmask,
std::vector<size_type> const& indices,
cudaStream_t stream = 0);
rmm::cuda_stream_view stream = rmm::cuda_stream_default);
```
Note, you must specify the whole signature of the function, including optional parameters, so that doxygen will be able to locate it.
Expand Down
10 changes: 5 additions & 5 deletions cpp/docs/TRANSITIONGUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ A *mutable*, non-owning view of a table.
We do not yet expose CUDA streams in external libcudf APIs.
However, in order to ease the transition to future use of streams, all libcudf APIs that allocate device memory or execute a kernel should be implemented using asynchronous APIs on the default stream (e.g., stream 0).
The recommended pattern for doing this is to make the definition of the external API invoke an internal API in the `detail` namespace. The internal `detail` API will have all the same parameters, plus a `cudaStream_t` parameter at the end defaulted to `0`.
The recommended pattern for doing this is to make the definition of the external API invoke an internal API in the `detail` namespace. The internal `detail` API will have all the same parameters, plus a `rmm::cuda_stream_view` parameter at the end defaulted to `rmm::cuda_stream_default`.
The implementation should be wholly contained in the `detail` API definition and use only asynchronous versions of CUDA APIs with the defaulted stream parameter.
In order to make the `detail` API callable from other libcudf functions, it should be exposed in a header placed in the `cudf/cpp/include/detail/` directory.
Expand All @@ -144,19 +144,19 @@ void external_function(...);
// cpp/include/cudf/detail/header.hpp
namespace detail{
void external_function(..., cudaStream_t stream = 0)
void external_function(..., rmm::cuda_stream_view stream = rmm::cuda_stream_default)
} // namespace detail
// cudf/src/implementation.cpp
namespace detail{
// defaulted stream parameter
void external_function(..., cudaStream_t stream){
void external_function(..., rmm::cuda_stream_view stream){
// implementation uses stream w/ async APIs
RMM_ALLOC(...,stream);
CUDA_TRY(cudaMemcpyAsync(...,stream));
CUDA_TRY(cudaMemcpyAsync(...,stream.value()));
kernel<<<..., stream>>>(...);
thrust::algorithm(rmm::exec_policy(stream)->on(stream), ...);
CUDA_TRY(cudaStreamSynchronize(stream));
stream.synchronize();
RMM_FREE(...,stream);
}
} // namespace detail
Expand Down
4 changes: 3 additions & 1 deletion cpp/include/cudf/ast/detail/transform.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <cstring>
#include <numeric>

Expand Down Expand Up @@ -369,7 +371,7 @@ struct ast_plan {
std::unique_ptr<column> compute_column(
table_view const table,
expression const& expr,
cudaStream_t stream = 0,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
} // namespace detail

Expand Down
13 changes: 6 additions & 7 deletions cpp/include/cudf/column/column.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -15,10 +15,12 @@
*/
#pragma once

#include <cudf/column/column_view.hpp>

#include <cudf/null_mask.hpp>
#include <cudf/types.hpp>
#include "column_view.hpp"

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

#include <memory>
Expand Down Expand Up @@ -50,9 +52,6 @@ class column {
/**
* @brief Construct a new column by deep copying the contents of `other`.
*
* All device memory allocation and copying is done using the
* `device_memory_resource` and `stream` from `other`.
*
* @param other The column to copy
**/
column(column const& other);
Expand All @@ -69,7 +68,7 @@ class column {
* @param mr Device memory resource to use for all device memory allocations
*/
column(column const& other,
cudaStream_t stream,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down Expand Up @@ -124,7 +123,7 @@ class column {
* @param mr Device memory resource to use for all device memory allocations
*/
explicit column(column_view view,
cudaStream_t stream = 0,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down
11 changes: 7 additions & 4 deletions cpp/include/cudf/column/column_device_view.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@
*/
#pragma once

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <cudf/column/column_view.hpp>
#include <cudf/fixed_point/fixed_point.hpp>
#include <cudf/lists/list_view.cuh>
Expand All @@ -28,6 +26,11 @@
#include <cudf/utilities/traits.hpp>
#include <cudf/utilities/type_dispatcher.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

/**
* @file column_device_view.cuh
* @brief Column device view class definitons
Expand Down Expand Up @@ -386,7 +389,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
*`source_view` available in device memory.
*/
static std::unique_ptr<column_device_view, std::function<void(column_device_view*)>> create(
column_view source_view, cudaStream_t stream = 0);
column_view source_view, rmm::cuda_stream_view stream = rmm::cuda_stream_default);

/**
* @brief Destroy the `column_device_view` object.
Expand Down Expand Up @@ -480,7 +483,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view
*/
static std::unique_ptr<mutable_column_device_view,
std::function<void(mutable_column_device_view*)>>
create(mutable_column_view source_view, cudaStream_t stream = 0);
create(mutable_column_view source_view, rmm::cuda_stream_view stream = rmm::cuda_stream_default);

/**
* @brief Returns pointer to the base device memory allocation casted to
Expand Down
Loading

0 comments on commit dea44d9

Please sign in to comment.