Replace raw streams with rmm::cuda_stream_view (part 3) (#6744)

Converting libcudf to use `rmm::cuda_stream_view` will require a LOT of changes, so I'm splitting it into multiple PRs to ease reviewing. This is the third PR in the series. This series of PRs will - Replace usage of `cudaStream_t` with `rmm::cuda_stream_view` - Replace usage of `0` or `nullptr` as a stream identifier with `rmm::cuda_stream_default` - Ensure all APIs always order the stream parameter before the memory resource parameter. #5119 Contributes to #6645 and #5119 Depends on #6646 and #6648 so this PR will look much bigger until they are merged. This third PR converts: - remaining dictionary functionality - cuio - lists - scalar - strings - groupby - join - contiguous_split - get_element - datetime_ops - extract - merge - partitioning - minmax reduction - scan - byte_cast - clamp - interleave_columns - is_sorted - groupby - rank - tests - concurrent map classes
rapidsai · Nov 24, 2020 · 8cc23bd · 8cc23bd
1 parent fd72e5f
commit 8cc23bd
Show file tree

Hide file tree

Showing 235 changed files with 3,179 additions and 2,719 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -71,6 +71,7 @@
 - PR #6612 Update JNI to new RMM cuda_stream_view API
 - PR #6646 Replace `cudaStream_t` with `rmm::cuda_stream_view` (part 1)
 - PR #6648 Replace `cudaStream_t` with `rmm::cuda_stream_view` (part 2)
+- PR #6744 Replace `cudaStream_t` with `rmm::cuda_stream_view` (part 3)
 - PR #6579 Update scatter APIs to use reference wrapper / const scalar
 - PR #6614 Add support for conversion to Pandas nullable dtypes and fix related issue in `cudf.to_json`
 - PR #6622 Update `to_pandas` api docs

diff --git a/cpp/benchmarks/synchronization/synchronization.cpp b/cpp/benchmarks/synchronization/synchronization.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,13 +15,15 @@
  */
 
 #include "synchronization.hpp"
+
 #include <cudf/utilities/error.hpp>
 
+#include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 
 cuda_event_timer::cuda_event_timer(benchmark::State& state,
                                    bool flush_l2_cache,
-                                   cudaStream_t stream)
+                                   rmm::cuda_stream_view stream)
   : p_state(&state), stream(stream)
 {
   // flush all of L2$
@@ -35,18 +37,19 @@ cuda_event_timer::cuda_event_timer(benchmark::State& state,
     if (l2_cache_bytes > 0) {
       const int memset_value = 0;
       rmm::device_buffer l2_cache_buffer(l2_cache_bytes, stream);
-      CUDA_TRY(cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream));
+      CUDA_TRY(
+        cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream.value()));
     }
   }
 
   CUDA_TRY(cudaEventCreate(&start));
   CUDA_TRY(cudaEventCreate(&stop));
-  CUDA_TRY(cudaEventRecord(start, stream));
+  CUDA_TRY(cudaEventRecord(start, stream.value()));
 }
 
 cuda_event_timer::~cuda_event_timer()
 {
-  CUDA_TRY(cudaEventRecord(stop, stream));
+  CUDA_TRY(cudaEventRecord(stop, stream.value()));
   CUDA_TRY(cudaEventSynchronize(stop));
 
   float milliseconds = 0.0f;

diff --git a/cpp/benchmarks/synchronization/synchronization.hpp b/cpp/benchmarks/synchronization/synchronization.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -33,7 +33,7 @@
 
       for (auto _ : state){
 
-        cudaStream_t stream = 0;
+        rmm::cuda_stream_view stream{}; // default stream, could be another stream
 
         // Create (Construct) an object of this class. You HAVE to pass in the
         // benchmark::State object you are using. It measures the time from its
@@ -44,7 +44,7 @@
         cuda_event_timer raii(state, true, stream); // flush_l2_cache = true
 
         // Now perform the operations that is to be benchmarked
-        sample_kernel<<<1, 256, 0, stream>>>(); // Possibly launching a CUDA kernel
+        sample_kernel<<<1, 256, 0, stream.value()>>>(); // Possibly launching a CUDA kernel
 
       }
     }
@@ -61,8 +61,11 @@
 
 // Google Benchmark library
 #include <benchmark/benchmark.h>
+
 #include <cudf/types.hpp>
 
+#include <rmm/cuda_stream_view.hpp>
+
 #include <driver_types.h>
 
 class cuda_event_timer {
@@ -77,7 +80,9 @@ class cuda_event_timer {
    *                            every iteration.
    * @param[in] stream_ The CUDA stream we are measuring time on.
    **/
-  cuda_event_timer(benchmark::State& state, bool flush_l2_cache, cudaStream_t stream_ = 0);
+  cuda_event_timer(benchmark::State& state,
+                   bool flush_l2_cache,
+                   rmm::cuda_stream_view stream = rmm::cuda_stream_default);
 
   // The user must provide a benchmark::State object to set
   // the timer so we disable the default c'tor.
@@ -91,7 +96,7 @@ class cuda_event_timer {
  private:
   cudaEvent_t start;
   cudaEvent_t stop;
-  cudaStream_t stream;
+  rmm::cuda_stream_view stream;
   benchmark::State* p_state;
 };
 

diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp
@@ -184,7 +184,7 @@ std::unique_ptr<table> sample(
  */
 std::unique_ptr<scalar> get_element(column_view const& input,
                                     size_type index,
-                                    cudaStream_t stream,
+                                    rmm::cuda_stream_view stream,
                                     rmm::mr::device_memory_resource* mr);
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh
@@ -185,7 +185,7 @@ void copy_range(SourceValueIterator source_value_begin,
       nullptr);
   }
 
-  CHECK_CUDA(stream);
+  CHECK_CUDA(stream.value());
 }
 
 /**

diff --git a/cpp/include/cudf/detail/scatter.cuh b/cpp/include/cudf/detail/scatter.cuh
@@ -171,10 +171,9 @@ struct column_scatterer_impl<dictionary32, MapIterator> {
                  "scatter dictionary keys must be the same type");
 
     // first combine keys so both dictionaries have the same set
-    auto target_matched = dictionary::detail::add_keys(target, source.keys(), mr, stream.value());
+    auto target_matched    = dictionary::detail::add_keys(target, source.keys(), stream, mr);
     auto const target_view = dictionary_column_view(target_matched->view());
-    auto source_matched    = dictionary::detail::set_keys(
-      source, target_view.keys(), rmm::mr::get_current_device_resource(), stream.value());
+    auto source_matched    = dictionary::detail::set_keys(source, target_view.keys(), stream);
     auto const source_view = dictionary_column_view(source_matched->view());
 
     // now build the new indices by doing a scatter on just the matched indices

diff --git a/cpp/include/cudf/dictionary/detail/encode.hpp b/cpp/include/cudf/dictionary/detail/encode.hpp
@@ -19,6 +19,8 @@
 #include <cudf/column/column_view.hpp>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 
+#include <rmm/cuda_stream_view.hpp>
+
 namespace cudf {
 namespace dictionary {
 namespace detail {
@@ -44,15 +46,15 @@ namespace detail {
  *
  * @param column The column to dictionary encode.
  * @param indices_type The integer type to use for the indices.
- * @param mr Device memory resource used to allocate the returned column's device memory.
  * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return Returns a dictionary column.
  */
 std::unique_ptr<column> encode(
   column_view const& column,
   data_type indices_type              = data_type{type_id::UINT32},
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(),
-  cudaStream_t stream                 = 0);
+  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create a column by gathering the keys from the provided
@@ -65,14 +67,14 @@ std::unique_ptr<column> encode(
  * ```
  *
  * @param dictionary_column Existing dictionary column.
- * @param mr Device memory resource used to allocate the returned column's device memory.
  * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return New column with type matching the dictionary_column's keys.
  */
 std::unique_ptr<column> decode(
   dictionary_column_view const& dictionary_column,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(),
-  cudaStream_t stream                 = 0);
+  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Return minimal integer type for the given number of elements.

diff --git a/cpp/include/cudf/dictionary/detail/merge.hpp b/cpp/include/cudf/dictionary/detail/merge.hpp
@@ -18,6 +18,8 @@
 #include <cudf/detail/merge.cuh>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 
+#include <rmm/cuda_stream_view.hpp>
+
 namespace cudf {
 namespace dictionary {
 namespace detail {
@@ -33,15 +35,15 @@ namespace detail {
  * @param lcol First column.
  * @param rcol Second column.
  * @param row_order Indexes for each column.
- * @param mr Device memory resource used to allocate the returned column's device memory.
  * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return New dictionary column.
  */
 std::unique_ptr<column> merge(dictionary_column_view const& lcol,
                               dictionary_column_view const& rcol,
                               cudf::detail::index_vector const& row_order,
-                              rmm::mr::device_memory_resource* mr,
-                              cudaStream_t stream);
+                              rmm::cuda_stream_view stream,
+                              rmm::mr::device_memory_resource* mr);
 
 }  // namespace detail
 }  // namespace dictionary

diff --git a/cpp/include/cudf/dictionary/detail/update_keys.hpp b/cpp/include/cudf/dictionary/detail/update_keys.hpp
@@ -19,6 +19,8 @@
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/table/table_view.hpp>
 
+#include <rmm/cuda_stream_view.hpp>
+
 namespace cudf {
 namespace dictionary {
 namespace detail {
@@ -31,8 +33,8 @@ namespace detail {
 std::unique_ptr<column> add_keys(
   dictionary_column_view const& dictionary_column,
   column_view const& new_keys,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(),
-  cudaStream_t stream                 = 0);
+  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
  * @copydoc cudf::dictionary::remove_keys(dictionary_column_view const&,column_view
@@ -43,8 +45,8 @@ std::unique_ptr<column> add_keys(
 std::unique_ptr<column> remove_keys(
   dictionary_column_view const& dictionary_column,
   column_view const& keys_to_remove,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(),
-  cudaStream_t stream                 = 0);
+  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
  * @copydoc cudf::dictionary::remove_unused_keys(dictionary_column_view
@@ -54,8 +56,8 @@ std::unique_ptr<column> remove_keys(
  */
 std::unique_ptr<column> remove_unused_keys(
   dictionary_column_view const& dictionary_column,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(),
-  cudaStream_t stream                 = 0);
+  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
  * @copydoc cudf::dictionary::set_keys(dictionary_column_view
@@ -66,8 +68,8 @@ std::unique_ptr<column> remove_unused_keys(
 std::unique_ptr<column> set_keys(
   dictionary_column_view const& dictionary_column,
   column_view const& keys,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(),
-  cudaStream_t stream                 = 0);
+  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create new dictionaries that have keys merged from the input dictionaries.
@@ -82,8 +84,8 @@ std::unique_ptr<column> set_keys(
  */
 std::vector<std::unique_ptr<column>> match_dictionaries(
   std::vector<dictionary_column_view> input,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(),
-  cudaStream_t stream                 = 0);
+  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create new dictionaries that have keys merged from dictionary columns
@@ -106,8 +108,8 @@ std::vector<std::unique_ptr<column>> match_dictionaries(
  */
 std::pair<std::vector<std::unique_ptr<column>>, std::vector<table_view>> match_dictionaries(
   std::vector<table_view> tables,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(),
-  cudaStream_t stream                 = 0);
+  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 }  // namespace detail
 }  // namespace dictionary

diff --git a/cpp/include/cudf/dictionary/dictionary_factories.hpp b/cpp/include/cudf/dictionary/dictionary_factories.hpp
@@ -18,6 +18,8 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
 
+#include <rmm/cuda_stream_view.hpp>
+
 namespace cudf {
 /**
  * @addtogroup column_factories Factories
@@ -54,15 +56,15 @@ namespace cudf {
  *
  * @param keys_column Column of unique, ordered values to use as the new dictionary column's keys.
  * @param indices_column Indices to use for the new dictionary column.
- * @param mr Device memory resource used to allocate the returned column's device memory.
  * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return New dictionary column.
  */
 std::unique_ptr<column> make_dictionary_column(
   column_view const& keys_column,
   column_view const& indices_column,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(),
-  cudaStream_t stream                 = 0);
+  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Construct a dictionary column by taking ownership of the provided keys
@@ -106,15 +108,15 @@ std::unique_ptr<column> make_dictionary_column(std::unique_ptr<column> keys_colu
  *
  * @param keys Column of unique, ordered values to use as the new dictionary column's keys.
  * @param indices Indices values and null-mask to use for the new dictionary column.
- * @param mr Device memory resource used to allocate the returned column's device memory.
  * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return New dictionary column.
  */
 std::unique_ptr<column> make_dictionary_column(
   std::unique_ptr<column> keys_column,
   std::unique_ptr<column> indices_column,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(),
-  cudaStream_t stream                 = 0);
+  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,8 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
+#include <rmm/cuda_stream_view.hpp>
+
 #include <utility>
 #include <vector>
 
@@ -222,13 +224,13 @@ class groupby {
    */
   std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> dispatch_aggregation(
     std::vector<aggregation_request> const& requests,
-    cudaStream_t stream,
+    rmm::cuda_stream_view stream,
     rmm::mr::device_memory_resource* mr);
 
   // Sort-based groupby
   std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> sort_aggregate(
     std::vector<aggregation_request> const& requests,
-    cudaStream_t stream,
+    rmm::cuda_stream_view stream,
     rmm::mr::device_memory_resource* mr);
 };
 /** @} */

diff --git a/cpp/include/cudf/io/avro.hpp b/cpp/include/cudf/io/avro.hpp
@@ -18,11 +18,11 @@
 
 #include "types.hpp"
 
-#include <rmm/mr/device/per_device_resource.hpp>
-
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
+#include <rmm/mr/device/per_device_resource.hpp>
+
 #include <memory>
 #include <string>
 #include <vector>

diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp
@@ -17,7 +17,6 @@
 #pragma once
 
 #include <cudf/io/types.hpp>
-
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
-Original file line number
+Diff line change
@@ Expand Up / @@ -185,7 +185,7 @@ void copy_range(SourceValueIterator source_value_begin, @@
           nullptr);
       }
-      CHECK_CUDA(stream);
+      CHECK_CUDA(stream.value());
     }
     /**
@@ Expand Down @@