From ff53e23103f58ebfe0aebf8f4943a64bd958567d Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 29 Oct 2020 18:36:55 +1100 Subject: [PATCH 01/51] Fix cast warning. --- cpp/src/dictionary/detail/merge.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/dictionary/detail/merge.cu b/cpp/src/dictionary/detail/merge.cu index e2d2760642a..6448d711db1 100644 --- a/cpp/src/dictionary/detail/merge.cu +++ b/cpp/src/dictionary/detail/merge.cu @@ -59,7 +59,8 @@ std::unique_ptr merge(dictionary_column_view const& lcol, return make_dictionary_column( std::make_unique(lcol.keys(), stream, mr), std::move(indices_column), - rmm::device_buffer{lcol.has_nulls() || rcol.has_nulls() ? size_t{merged_size} : 0, stream, mr}, + rmm::device_buffer{ + lcol.has_nulls() || rcol.has_nulls() ? static_cast(merged_size) : 0, stream, mr}, lcol.null_count() + rcol.null_count()); } From 8c44adea46e08d1a4987c7a3e0ebc16c142c94e2 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 2 Nov 2020 14:48:19 +1100 Subject: [PATCH 02/51] Initial stream changes --- .../common/generate_benchmark_input.cpp | 10 +- cpp/benchmarks/copying/shift_benchmark.cu | 13 +- .../null_mask/set_null_mask_benchmark.cpp | 4 +- .../type_dispatcher_benchmark.cu | 14 +- cpp/docs/DOCUMENTATION.md | 4 +- cpp/docs/TRANSITIONGUIDE.md | 10 +- cpp/include/cudf/copying.hpp | 81 ++++---- cpp/include/cudf/detail/copy.hpp | 45 +++-- cpp/include/cudf/detail/copy_if.cuh | 2 +- cpp/include/cudf/detail/copy_if_else.cuh | 13 +- cpp/include/cudf/detail/gather.cuh | 8 +- cpp/include/cudf/detail/null_mask.hpp | 67 +++++-- cpp/include/cudf/detail/valid_if.cuh | 9 +- cpp/include/cudf/null_mask.hpp | 17 +- cpp/include/cudf/scalar/scalar.hpp | 62 ++++--- .../cudf/strings/detail/copy_if_else.cuh | 20 +- cpp/include/cudf/strings/detail/merge.cuh | 4 +- .../cudf/strings/detail/modify_strings.cuh | 5 +- cpp/include/cudf/strings/detail/scatter.cuh | 6 +- cpp/src/binaryop/binaryop.cpp | 45 ++--- cpp/src/binaryop/compiled/binary_ops.cu | 3 +- cpp/src/bitmask/null_mask.cu | 174 +++++++++++------- cpp/src/column/column.cu | 29 +-- cpp/src/column/column_factories.cpp | 16 +- cpp/src/copying/concatenate.cu | 5 +- cpp/src/copying/copy.cpp | 15 +- cpp/src/copying/copy.cu | 97 +++++----- cpp/src/copying/copy_range.cu | 3 +- cpp/src/copying/sample.cu | 17 +- cpp/src/copying/scatter.cu | 5 +- cpp/src/copying/shift.cu | 28 ++- cpp/src/copying/slice.cpp | 5 +- cpp/src/datetime/datetime_ops.cu | 16 +- cpp/src/dictionary/add_keys.cu | 11 +- cpp/src/dictionary/decode.cu | 7 +- cpp/src/dictionary/dictionary_factories.cu | 5 +- cpp/src/dictionary/encode.cu | 11 +- cpp/src/dictionary/replace.cu | 4 +- cpp/src/filling/fill.cu | 6 +- cpp/src/groupby/hash/groupby.cu | 3 +- cpp/src/groupby/sort/sort_helper.cu | 3 +- cpp/src/interop/from_arrow.cpp | 33 ++-- cpp/src/io/avro/reader_impl.cu | 3 +- cpp/src/io/csv/durations.cu | 4 +- cpp/src/io/utilities/column_buffer.hpp | 7 +- cpp/src/lists/copying/copying.cu | 4 +- cpp/src/merge/merge.cu | 2 +- cpp/src/quantiles/quantile.cu | 18 +- cpp/src/reductions/scan.cu | 31 ++-- cpp/src/replace/clamp.cu | 2 +- cpp/src/replace/nans.cu | 16 +- cpp/src/replace/nulls.cu | 19 +- cpp/src/replace/replace.cu | 9 +- cpp/src/reshape/byte_cast.cu | 7 +- cpp/src/reshape/interleave_columns.cu | 2 +- cpp/src/scalar/scalar.cpp | 8 +- cpp/src/sort/rank.cu | 11 +- cpp/src/strings/attributes.cu | 6 +- cpp/src/strings/case.cu | 7 +- cpp/src/strings/char_types/char_types.cu | 46 +++-- cpp/src/strings/combine.cu | 6 +- cpp/src/strings/contains.cu | 30 +-- 
cpp/src/strings/convert/convert_booleans.cu | 19 +- cpp/src/strings/convert/convert_datetime.cu | 35 ++-- cpp/src/strings/convert/convert_durations.cu | 22 ++- cpp/src/strings/convert/convert_floats.cu | 21 ++- cpp/src/strings/convert/convert_hex.cu | 29 +-- cpp/src/strings/convert/convert_integers.cu | 19 +- cpp/src/strings/convert/convert_ipv4.cu | 32 ++-- cpp/src/strings/convert/convert_urls.cu | 9 +- cpp/src/strings/copying/concatenate.cu | 4 +- cpp/src/strings/filter_chars.cu | 7 +- cpp/src/strings/find.cu | 46 +++-- cpp/src/strings/findall.cu | 37 ++-- cpp/src/strings/padding.cu | 9 +- cpp/src/strings/replace/replace.cu | 12 +- cpp/src/strings/split/split.cu | 25 +-- cpp/src/strings/strip.cu | 6 +- cpp/src/strings/substring.cu | 4 +- cpp/src/strings/translate.cu | 5 +- cpp/src/strings/wrap.cu | 4 +- cpp/src/text/normalize.cu | 24 ++- cpp/src/text/replace.cu | 7 +- cpp/src/text/stemmer.cu | 27 +-- cpp/src/transform/encode.cu | 2 +- cpp/src/unary/cast_ops.cu | 15 +- cpp/src/unary/math_ops.cu | 68 ++++--- 87 files changed, 993 insertions(+), 668 deletions(-) diff --git a/cpp/benchmarks/common/generate_benchmark_input.cpp b/cpp/benchmarks/common/generate_benchmark_input.cpp index d516c084f03..e82a58c3a5b 100644 --- a/cpp/benchmarks/common/generate_benchmark_input.cpp +++ b/cpp/benchmarks/common/generate_benchmark_input.cpp @@ -24,6 +24,7 @@ #include #include +#include #include #include @@ -296,9 +297,9 @@ std::unique_ptr create_random_column(data_profile const& profile, return std::make_unique( cudf::data_type{cudf::type_to_id()}, num_rows, - rmm::device_buffer(data.data(), num_rows * sizeof(stored_Type), cudaStream_t(0)), + rmm::device_buffer(data.data(), num_rows * sizeof(stored_Type), rmm::cuda_stream_default), rmm::device_buffer( - null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), cudaStream_t(0))); + null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), rmm::cuda_stream_default)); } /** @@ -483,7 +484,8 @@ std::unique_ptr create_random_column(data_profile auto offsets_column = std::make_unique( cudf::data_type{cudf::type_id::INT32}, offsets.size(), - rmm::device_buffer(offsets.data(), offsets.size() * sizeof(int32_t), cudaStream_t(0))); + rmm::device_buffer( + offsets.data(), offsets.size() * sizeof(int32_t), rmm::cuda_stream_default)); list_column = cudf::make_lists_column( num_rows, @@ -491,7 +493,7 @@ std::unique_ptr create_random_column(data_profile std::move(current_child_column), cudf::UNKNOWN_NULL_COUNT, rmm::device_buffer( - null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), cudaStream_t(0))); + null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), rmm::cuda_stream_default)); } return list_column; // return the top-level column } diff --git a/cpp/benchmarks/copying/shift_benchmark.cu b/cpp/benchmarks/copying/shift_benchmark.cu index 648bb699dbf..4cf3455debb 100644 --- a/cpp/benchmarks/copying/shift_benchmark.cu +++ b/cpp/benchmarks/copying/shift_benchmark.cu @@ -14,17 +14,8 @@ template > std::unique_ptr make_scalar( - cudaStream_t stream = 0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - auto s = new ScalarType(0, false, stream, mr); - return std::unique_ptr(s); -} - -template > -std::unique_ptr make_scalar( - T value, - cudaStream_t stream = 0, + T value = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto s = new ScalarType(value, true, stream, mr); diff --git 
a/cpp/benchmarks/null_mask/set_null_mask_benchmark.cpp b/cpp/benchmarks/null_mask/set_null_mask_benchmark.cpp index 2f47393731a..e0a35ff0097 100644 --- a/cpp/benchmarks/null_mask/set_null_mask_benchmark.cpp +++ b/cpp/benchmarks/null_mask/set_null_mask_benchmark.cpp @@ -31,7 +31,7 @@ void BM_setnullmask(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - cudf::set_null_mask(static_cast(mask.data()), begin, end, true, 0); + cudf::set_null_mask(static_cast(mask.data()), begin, end, true); } state.SetBytesProcessed(static_cast(state.iterations()) * size / 8); @@ -44,4 +44,4 @@ void BM_setnullmask(benchmark::State& state) ->Range(1 << 10, 1 << 30) \ ->UseManualTime(); -NBM_BENCHMARK_DEFINE(SetNullMaskKernel); \ No newline at end of file +NBM_BENCHMARK_DEFINE(SetNullMaskKernel); diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu index 7b1068d09dd..222a2c40618 100644 --- a/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu +++ b/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu @@ -90,13 +90,13 @@ struct ColumnHandle { template void operator()(mutable_column_device_view source_column, int work_per_thread, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default) { cudf::detail::grid_1d grid_config{source_column.size(), block_size}; int grid_size = grid_config.num_blocks; // Launch the kernel. host_dispatching_kernel - <<>>(source_column); + <<>>(source_column); } }; @@ -144,14 +144,14 @@ void launch_kernel(mutable_table_view input, T** d_ptr, int work_per_thread) // std::vector v_stream(n_cols); for (int c = 0; c < n_cols; c++) { auto d_column = mutable_column_device_view::create(input.column(c)); - cudf::type_dispatcher( - d_column->type(), ColumnHandle{}, *d_column, work_per_thread); + // cudf::type_dispatcher( + // d_column->type(), ColumnHandle{}, *d_column, work_per_thread); } } else if (dispatching_type == DEVICE_DISPATCHING) { auto d_table_view = mutable_table_device_view::create(input); - auto f = device_dispatching_kernel; + // auto f = device_dispatching_kernel; // Launch the kernel - f<<>>(*d_table_view); + // f<<>>(*d_table_view); } else if (dispatching_type == NO_DISPATCHING) { auto f = no_dispatching_kernel; // Launch the kernel @@ -160,7 +160,7 @@ void launch_kernel(mutable_table_view input, T** d_ptr, int work_per_thread) } template -void type_dispatcher_benchmark(benchmark::State& state) +void type_dispatcher_benchmark(::benchmark::State& state) { const cudf::size_type source_size = static_cast(state.range(1)); diff --git a/cpp/docs/DOCUMENTATION.md b/cpp/docs/DOCUMENTATION.md index 6b0a51dbf1b..b219543e3d6 100644 --- a/cpp/docs/DOCUMENTATION.md +++ b/cpp/docs/DOCUMENTATION.md @@ -225,7 +225,7 @@ You can use the `@copydoc` tag to avoid duplicating the comment block for a func */ ``` -Also, `@copydoc` is useful when documenting a `detail` function that differs only by the `cudaStream_t` parameter. +Also, `@copydoc` is useful when documenting a `detail` function that differs only by the `stream` parameter. 
```c++ /** @@ -235,7 +235,7 @@ Also, `@copydoc` is useful when documenting a `detail` function that differs onl */ std::vector segmented_count_set_bits(bitmask_type const* bitmask, std::vector const& indices, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); ``` Note, you must specify the whole signature of the function, including optional parameters, so that doxygen will be able to locate it. diff --git a/cpp/docs/TRANSITIONGUIDE.md b/cpp/docs/TRANSITIONGUIDE.md index f7de4863952..8786c4c039b 100644 --- a/cpp/docs/TRANSITIONGUIDE.md +++ b/cpp/docs/TRANSITIONGUIDE.md @@ -131,7 +131,7 @@ A *mutable*, non-owning view of a table. We do not yet expose CUDA streams in external libcudf APIs. However, in order to ease the transition to future use of streams, all libcudf APIs that allocate device memory or execute a kernel should be implemented using asynchronous APIs on the default stream (e.g., stream 0). -The recommended pattern for doing this is to make the definition of the external API invoke an internal API in the `detail` namespace. The internal `detail` API will have all the same parameters, plus a `cudaStream_t` parameter at the end defaulted to `0`. +The recommended pattern for doing this is to make the definition of the external API invoke an internal API in the `detail` namespace. The internal `detail` API will have all the same parameters, plus a `rmm::cuda_stream_view` parameter at the end defaulted to `rmm::cuda_stream_default`. The implementation should be wholly contained in the `detail` API definition and use only asynchronous versions of CUDA APIs with the defaulted stream parameter. In order to make the `detail` API callable from other libcudf functions, it should be exposed in a header placed in the `cudf/cpp/include/detail/` directory. @@ -144,19 +144,19 @@ void external_function(...); // cpp/include/cudf/detail/header.hpp namespace detail{ -void external_function(..., cudaStream_t stream = 0) +void external_function(..., rmm::cuda_stream_view stream = rmm::cuda_stream_default) } // namespace detail // cudf/src/implementation.cpp namespace detail{ // defaulted stream parameter - void external_function(..., cudaStream_t stream){ + void external_function(..., rmm::cuda_stream_view stream){ // implementation uses stream w/ async APIs RMM_ALLOC(...,stream); - CUDA_TRY(cudaMemcpyAsync(...,stream)); + CUDA_TRY(cudaMemcpyAsync(...,stream.value())); kernel<<<..., stream>>>(...); thrust::algorithm(rmm::exec_policy(stream)->on(stream), ...); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); RMM_FREE(...,stream); } } // namespace detail diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index a20d9d653ce..b1483fea133 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -286,6 +286,46 @@ std::unique_ptr copy_range( size_type target_begin, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Creates a new column by shifting all values by an offset. + * + * @ingroup copy_shift + * + * Elements will be determined by `output[idx] = input[idx - offset]`. + * Some elements in the output may be indeterminable from the input. For those + * elements, the value will be determined by `fill_values`. 
+ * + * @code{.pseudo} + * Examples + * ------------------------------------------------- + * input = [0, 1, 2, 3, 4] + * offset = 3 + * fill_values = @ + * return = [@, @, @, 0, 1] + * ------------------------------------------------- + * input = [5, 4, 3, 2, 1] + * offset = -2 + * fill_values = 7 + * return = [3, 2, 1, 7, 7] + * @endcode + * + * @note if the input is nullable, the output will be nullable. + * @note if the fill value is null, the output will be nullable. + * + * @param input Column to be shifted. + * @param offset The offset by which to shift the input. + * @param fill_value Fill value for indeterminable outputs. + * @param mr Device memory resource used to allocate the returned result's device memory + * + * @throw cudf::logic_error if @p input dtype is not fixed-with. + * @throw cudf::logic_error if @p fill_value dtype does not match @p input dtype. + */ +std::unique_ptr shift( + column_view const& input, + size_type offset, + scalar const& fill_value, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Slices a `column_view` into a set of `column_view`s according to a set of indices. * @@ -479,7 +519,6 @@ struct contiguous_split_result { * @param input View of a table to split * @param splits A vector of indices where the view will be split * @param[in] mr Device memory resource used to allocate the returned result's device memory - * @param[in] stream CUDA stream used for device memory operations and kernel launches. * @return The set of requested views of `input` indicated by the `splits` and the viewed memory * buffer. */ @@ -513,46 +552,6 @@ std::unique_ptr copy_if_else( column_view const& boolean_mask, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Creates a new column by shifting all values by an offset. - * - * @ingroup copy_shift - * - * Elements will be determined by `output[idx] = input[idx - offset]`. - * Some elements in the output may be indeterminable from the input. For those - * elements, the value will be determined by `fill_values`. - * - * @code{.pseudo} - * Examples - * ------------------------------------------------- - * input = [0, 1, 2, 3, 4] - * offset = 3 - * fill_values = @ - * return = [@, @, @, 0, 1] - * ------------------------------------------------- - * input = [5, 4, 3, 2, 1] - * offset = -2 - * fill_values = 7 - * return = [3, 2, 1, 7, 7] - * @endcode - * - * @note if the input is nullable, the output will be nullable. - * @note if the fill value is null, the output will be nullable. - * - * @param input Column to be shifted. - * @param offset The offset by which to shift the input. - * @param fill_value Fill value for indeterminable outputs. - * - * @throw cudf::logic_error if @p input dtype is not fixed-with. - * @throw cudf::logic_error if @p fill_value dtype does not match @p input dtype. 
- */ -std::unique_ptr shift( - column_view const& input, - size_type offset, - scalar const& fill_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); - /** * @brief Returns a new column, where each element is selected from either @p lhs or * @p rhs based on the value of the corresponding element in @p boolean_mask diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp index 22399043bb2..0312f1ebe75 100644 --- a/cpp/include/cudf/detail/copy.hpp +++ b/cpp/include/cudf/detail/copy.hpp @@ -21,6 +21,8 @@ #include #include +#include + namespace cudf { namespace detail { /** @@ -71,7 +73,20 @@ ColumnView slice(ColumnView const& input, cudf::size_type begin, cudf::size_type */ std::vector slice(column_view const& input, std::vector const& indices, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + +/** + * @copydoc cudf::shift(column_view const&,size_type,scalar const&, + * rmm::mr::device_memory_resource*) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr shift( + column_view const& input, + size_type offset, + scalar const& fill_value, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::contiguous_split @@ -81,8 +96,8 @@ std::vector slice(column_view const& input, std::vector contiguous_split( cudf::table_view const& input, std::vector const& splits, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::allocate_like(column_view const&, size_type, mask_allocation_policy, @@ -94,8 +109,8 @@ std::unique_ptr allocate_like( column_view const& input, size_type size, mask_allocation_policy mask_alloc = mask_allocation_policy::RETAIN, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::copy_if_else( column_view const&, column_view const&, @@ -107,8 +122,8 @@ std::unique_ptr copy_if_else( column_view const& lhs, column_view const& rhs, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::copy_if_else( scalar const&, column_view const&, @@ -120,8 +135,8 @@ std::unique_ptr copy_if_else( scalar const& lhs, column_view const& rhs, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::copy_if_else( column_view const&, scalar const&, @@ -133,8 +148,8 @@ std::unique_ptr copy_if_else( column_view const& lhs, scalar const& rhs, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = 
rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::copy_if_else( scalar const&, scalar const&, @@ -146,8 +161,8 @@ std::unique_ptr copy_if_else( scalar const& lhs, scalar const& rhs, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::sample @@ -159,8 +174,8 @@ std::unique_ptr sample( size_type const n, sample_with_replacement replacement = sample_with_replacement::FALSE, int64_t const seed = 0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index ce581d71ac7..9399df22450 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -213,7 +213,7 @@ struct scatter_gather_functor { cudaStream_t stream = 0) { auto output_column = cudf::detail::allocate_like( - input, output_size, cudf::mask_allocation_policy::RETAIN, mr, stream); + input, output_size, cudf::mask_allocation_policy::RETAIN, stream, mr); auto output = output_column->mutable_view(); bool has_valid = input.nullable(); diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index 0bad8a1a86f..d5be077d27b 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -27,9 +27,10 @@ #include #include -#include #include +#include + namespace cudf { namespace detail { namespace { // anonymous @@ -162,8 +163,8 @@ std::unique_ptr copy_if_else( LeftIter lhs_end, RightIter rhs, FilterFn filter, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { using Element = typename thrust::tuple_element<0, typename thrust::iterator_traits::value_type>::type; @@ -177,7 +178,7 @@ std::unique_ptr copy_if_else( make_fixed_width_column(data_type(type_to_id()), size, nullable ? 
mask_state::UNINITIALIZED : mask_state::UNALLOCATED, - stream, + stream.value(), mr); auto out_v = mutable_column_device_view::create(*out); @@ -188,14 +189,14 @@ std::unique_ptr copy_if_else( // call the kernel copy_if_else_kernel - <<>>( + <<>>( lhs_begin, rhs, filter, *out_v, valid_count.data()); out->set_null_count(size - valid_count.value()); } else { // call the kernel copy_if_else_kernel - <<>>(lhs_begin, rhs, filter, *out_v, nullptr); + <<>>(lhs_begin, rhs, filter, *out_v, nullptr); } return out; diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index dd6266f258b..f20af839916 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -24,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -175,7 +175,7 @@ struct column_gatherer_impl { auto const num_rows = cudf::distance(gather_map_begin, gather_map_end); auto const policy = cudf::mask_allocation_policy::NEVER; auto destination_column = - cudf::detail::allocate_like(source_column, num_rows, policy, mr, stream); + cudf::detail::allocate_like(source_column, num_rows, policy, stream, mr); using Type = device_storage_type_t; @@ -403,7 +403,7 @@ struct column_gatherer_impl { // Perform gather on just the indices column_view indices = dictionary.get_indices_annotated(); auto new_indices = cudf::detail::allocate_like( - indices, output_count, cudf::mask_allocation_policy::NEVER, mr, stream); + indices, output_count, cudf::mask_allocation_policy::NEVER, stream, mr); gather_helper( cudf::detail::indexalator_factory::make_input_iterator(indices), indices.size(), @@ -496,7 +496,7 @@ void gather_bitmask(table_view const& source, not target[i]->nullable()) { auto const state = op == gather_bitmask_op::PASSTHROUGH ? mask_state::ALL_VALID : mask_state::UNINITIALIZED; - auto mask = create_null_mask(target[i]->size(), state, stream, mr); + auto mask = detail::create_null_mask(target[i]->size(), state, stream, mr); target[i]->set_null_mask(std::move(mask), 0); } } diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp index 6319da752bc..4b2c5b0a8d6 100644 --- a/cpp/include/cudf/detail/null_mask.hpp +++ b/cpp/include/cudf/detail/null_mask.hpp @@ -18,9 +18,33 @@ #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { + +/** + * @copydoc cudf::create_null_mask(size_type, mask_state, rmm::mr::device_memory_resource*) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + **/ +rmm::device_buffer create_null_mask( + size_type size, + mask_state state, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +/** + * @copydoc cudf::set_null_mask(bitmask_type*, size_type, size_type, bool) + * + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ **/ +void set_null_mask(bitmask_type *bitmask, + size_type begin_bit, + size_type end_bit, + bool valid, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + /** * @copydoc cudf::segmented_count_set_bits * @@ -28,7 +52,7 @@ namespace detail { */ std::vector segmented_count_set_bits(bitmask_type const *bitmask, std::vector const &indices, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream); /** * @copydoc cudf::segmented_count_unset_bits @@ -37,22 +61,41 @@ std::vector segmented_count_set_bits(bitmask_type const *bitmask, */ std::vector segmented_count_unset_bits(bitmask_type const *bitmask, std::vector const &indices, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream); /** - * @brief Returns a bitwise AND of the specified bitmasks + * @copydoc cudf::copy_bitmask(bitmask_type const*, size_type, size_type, + *rmm::mr::device_memory_resource*) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + **/ +rmm::device_buffer copy_bitmask( + bitmask_type const *mask, + size_type begin_bit, + size_type end_bit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +/** + * @copydoc cudf::copy_bitmask(column_view const& view, rmm::mr::device_memory_resource*) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + **/ +rmm::device_buffer copy_bitmask( + column_view const &view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +/** + * @copydoc bitmask_and(std::vector, std::vector const&, size_type, + * rmm::mr::device_memory_resource *) * - * @param masks The list of data pointers of the bitmasks to be ANDed - * @param begin_bits The bit offsets from which each mask is to be ANDed - * @param mask_size The number of bits to be ANDed in each mask * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned device_buffer - * @return rmm::device_buffer Output bitmask */ rmm::device_buffer bitmask_and(std::vector const &masks, std::vector const &begin_bits, size_type mask_size, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr); /** @@ -61,8 +104,8 @@ rmm::device_buffer bitmask_and(std::vector const &masks, * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
*/ rmm::device_buffer bitmask_and(table_view const &view, - rmm::mr::device_memory_resource *mr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr); /** * @brief Performs a bitwise AND of the specified bitmasks, @@ -80,7 +123,7 @@ void inplace_bitmask_and(bitmask_type *dest_mask, std::vector const &masks, std::vector const &begin_bits, size_type mask_size, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr); } // namespace detail diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index c9719228f87..011a3fa616c 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -16,8 +16,8 @@ #pragma once +#include #include -#include #include #include #include @@ -25,6 +25,7 @@ #include #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { @@ -87,14 +88,14 @@ std::pair valid_if( InputIterator begin, InputIterator end, Predicate p, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(begin <= end, "Invalid range."); size_type size = thrust::distance(begin, end); - auto null_mask = create_null_mask(size, mask_state::UNINITIALIZED, stream, mr); + auto null_mask = detail::create_null_mask(size, mask_state::UNINITIALIZED, stream, mr); size_type null_count{0}; if (size > 0) { @@ -103,7 +104,7 @@ std::pair valid_if( constexpr size_type block_size{256}; grid_1d grid{size, block_size}; - valid_if_kernel<<>>( + valid_if_kernel<<>>( static_cast(null_mask.data()), begin, size, p, valid_count.data()); null_count = size - valid_count.value(stream); diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp index 50ea7ead37d..110fd2b5087 100644 --- a/cpp/include/cudf/null_mask.hpp +++ b/cpp/include/cudf/null_mask.hpp @@ -76,7 +76,6 @@ size_type num_bitmask_words(size_type number_of_bits); * * @param size The number of elements to be represented by the mask * @param state The desired state of the mask - * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned device_buffer. * @return rmm::device_buffer A `device_buffer` for use as a null bitmask * satisfying the desired size and state @@ -84,7 +83,6 @@ size_type num_bitmask_words(size_type number_of_bits); rmm::device_buffer create_null_mask( size_type size, mask_state state, - cudaStream_t stream = 0, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -98,13 +96,8 @@ rmm::device_buffer create_null_mask( * @param begin_bit Index of the first bit to set (inclusive) * @param end_bit Index of the last bit to set (exclusive) * @param valid If true set all entries to valid; otherwise, set all to null. - * @param stream CUDA stream used for device memory operations and kernel launches. 
**/ -void set_null_mask(bitmask_type* bitmask, - size_type begin_bit, - size_type end_bit, - bool valid, - cudaStream_t stream = 0); +void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit, bool valid); /** * @brief Given a bitmask, counts the number of set (1) bits in the range @@ -188,7 +181,6 @@ std::vector segmented_count_unset_bits(bitmask_type const* bitmask, * @param mask Bitmask residing in device memory whose bits will be copied * @param begin_bit Index of the first bit to be copied (inclusive) * @param end_bit Index of the last bit to be copied (exclusive) - * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned device_buffer * @return rmm::device_buffer A `device_buffer` containing the bits * `[begin_bit, end_bit)` from `mask`. @@ -197,7 +189,6 @@ rmm::device_buffer copy_bitmask( bitmask_type const* mask, size_type begin_bit, size_type end_bit, - cudaStream_t stream = 0, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -207,14 +198,12 @@ rmm::device_buffer copy_bitmask( * Returns empty `device_buffer` if the column is not nullable * * @param view Column view whose bitmask needs to be copied - * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned device_buffer * @return rmm::device_buffer A `device_buffer` containing the bits * `[view.offset(), view.offset() + view.size())` from `view`'s bitmask. **/ rmm::device_buffer copy_bitmask( column_view const& view, - cudaStream_t stream = 0, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -224,14 +213,12 @@ rmm::device_buffer copy_bitmask( * If no column in the table is nullable, an empty bitmask is returned. * * @param view The table of columns - * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned device_buffer * @return rmm::device_buffer Output bitmask */ rmm::device_buffer bitmask_and( table_view const& view, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index ed3a6aebf31..dcce9f043e8 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -28,6 +28,7 @@ #include #include #include +#include "rmm/cuda_stream_view.hpp" /** * @file @@ -67,7 +68,10 @@ class scalar { * @param is_valid true: set the value to valid. false: set it to null * @param stream CUDA stream used for device memory operations. 
*/ - void set_valid(bool is_valid, cudaStream_t stream = 0) { _is_valid.set_value(is_valid, stream); } + void set_valid(bool is_valid, rmm::cuda_stream_view stream = rmm::cuda_stream_default) + { + _is_valid.set_value(is_valid, stream); + } /** * @brief Indicates whether the scalar contains a valid value @@ -78,7 +82,10 @@ class scalar { * @return true Value is valid * @return false Value is invalid/null */ - bool is_valid(cudaStream_t stream = 0) const { return _is_valid.value(stream); } + bool is_valid(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const + { + return _is_valid.value(stream); + } /** * @brief Returns a raw pointer to the validity bool in device memory @@ -109,7 +116,7 @@ class scalar { */ scalar(data_type type, bool is_valid = false, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : _type(type), _is_valid(is_valid, stream, mr) { @@ -136,7 +143,7 @@ class fixed_width_scalar : public scalar { * @param value New value of scalar * @param stream CUDA stream used for device memory operations. */ - void set_value(T value, cudaStream_t stream = 0) + void set_value(T value, rmm::cuda_stream_view stream = rmm::cuda_stream_default) { _data.set_value(value, stream); this->set_valid(true, stream); @@ -152,7 +159,10 @@ class fixed_width_scalar : public scalar { * * @param stream CUDA stream used for device memory operations. */ - T value(cudaStream_t stream = 0) const { return _data.value(stream); } + T value(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const + { + return _data.value(stream); + } /** * @brief Returns a raw pointer to the value in device memory @@ -179,7 +189,7 @@ class fixed_width_scalar : public scalar { */ fixed_width_scalar(T value, bool is_valid = true, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : scalar(data_type(type_to_id()), is_valid, stream, mr), _data(value, stream, mr) { @@ -195,7 +205,7 @@ class fixed_width_scalar : public scalar { */ fixed_width_scalar(rmm::device_scalar&& data, bool is_valid = true, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : scalar(data_type(type_to_id()), is_valid, stream, mr), _data{std::forward>(data)} @@ -232,7 +242,7 @@ class numeric_scalar : public detail::fixed_width_scalar { */ numeric_scalar(T value, bool is_valid = true, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : detail::fixed_width_scalar(value, is_valid, stream, mr) { @@ -248,7 +258,7 @@ class numeric_scalar : public detail::fixed_width_scalar { */ numeric_scalar(rmm::device_scalar&& data, bool is_valid = true, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : detail::fixed_width_scalar(std::forward>(data), is_valid, stream, mr) { @@ -286,7 +296,7 @@ class fixed_point_scalar : public scalar { fixed_point_scalar(rep_type value, numeric::scale_type scale, bool is_valid = true, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : scalar{data_type{type_to_id(), 
static_cast(scale)}, is_valid, stream, mr}, _data{value} @@ -303,7 +313,7 @@ class fixed_point_scalar : public scalar { */ fixed_point_scalar(rep_type value, bool is_valid = true, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : scalar{data_type{type_to_id(), 0}, is_valid, stream, mr}, _data{value} { @@ -319,7 +329,7 @@ class fixed_point_scalar : public scalar { */ fixed_point_scalar(T value, bool is_valid = true, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : scalar{data_type{type_to_id(), 0}, is_valid, stream, mr}, _data{numeric::scaled_integer{value}.value} @@ -338,7 +348,7 @@ class fixed_point_scalar : public scalar { */ fixed_point_scalar(rmm::device_scalar&& data, bool is_valid = true, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : scalar{data_type{type_to_id()}, is_valid, stream, mr}, // note that scale is ignored here _data{std::forward>(data)} @@ -350,7 +360,10 @@ class fixed_point_scalar : public scalar { * * @param stream CUDA stream used for device memory operations. */ - rep_type value(cudaStream_t stream = 0) const { return _data.value(stream); } + rep_type value(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const + { + return _data.value(stream); + } /** * @brief Returns a raw pointer to the value in device memory @@ -390,7 +403,7 @@ class string_scalar : public scalar { */ string_scalar(std::string const& string, bool is_valid = true, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : scalar(data_type(type_id::STRING), is_valid), _data(string.data(), string.size(), stream, mr) { @@ -407,7 +420,7 @@ class string_scalar : public scalar { */ string_scalar(value_type const& source, bool is_valid = true, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : scalar(data_type(type_id::STRING), is_valid), _data(source.data(), source.size_bytes(), stream, mr) @@ -425,7 +438,7 @@ class string_scalar : public scalar { */ string_scalar(rmm::device_scalar& data, bool is_valid = true, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : string_scalar(data.value(stream), is_valid, stream, mr) { @@ -441,14 +454,17 @@ class string_scalar : public scalar { * * @param stream CUDA stream used for device memory operations. */ - std::string to_string(cudaStream_t stream = 0) const; + std::string to_string(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; /** * @brief Get the value of the scalar as a string_view * * @param stream CUDA stream used for device memory operations. 
*/ - value_type value(cudaStream_t stream = 0) const { return value_type{data(), size()}; } + value_type value(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const + { + return value_type{data(), size()}; + } /** * @brief Returns the size of the string in bytes @@ -492,7 +508,7 @@ class chrono_scalar : public detail::fixed_width_scalar { */ chrono_scalar(T value, bool is_valid = true, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : detail::fixed_width_scalar(value, is_valid, stream, mr) { @@ -508,7 +524,7 @@ class chrono_scalar : public detail::fixed_width_scalar { */ chrono_scalar(rmm::device_scalar&& data, bool is_valid = true, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : detail::fixed_width_scalar(std::forward>(data), is_valid, stream, mr) { @@ -535,7 +551,7 @@ struct timestamp_scalar : chrono_scalar { template timestamp_scalar(Duration2 const& value, bool is_valid, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : chrono_scalar(T{typename T::duration{value}}, is_valid, stream, mr) { @@ -564,7 +580,7 @@ struct duration_scalar : chrono_scalar { */ duration_scalar(typename T::rep value, bool is_valid, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : chrono_scalar(T{value}, is_valid, stream, mr) { diff --git a/cpp/include/cudf/strings/detail/copy_if_else.cuh b/cpp/include/cudf/strings/detail/copy_if_else.cuh index 21954104d72..7bfe1df4239 100644 --- a/cpp/include/cudf/strings/detail/copy_if_else.cuh +++ b/cpp/include/cudf/strings/detail/copy_if_else.cuh @@ -21,6 +21,7 @@ #include #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace strings { @@ -54,11 +55,11 @@ std::unique_ptr copy_if_else( StringPairIterLeft lhs_end, StringPairIterRight rhs_begin, Filter filter_fn, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = std::distance(lhs_begin, lhs_end); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(mr, stream.value()); auto execpol = rmm::exec_policy(stream); // create null mask @@ -86,16 +87,17 @@ std::unique_ptr copy_if_else( auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), offsets_transformer); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream.value()); auto d_offsets = offsets_column->view().template data(); // build chars column - size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; - auto chars_column = create_chars_child_column(strings_count, null_count, bytes, mr, stream); - auto d_chars = chars_column->mutable_view().template data(); + size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; + auto chars_column = + create_chars_child_column(strings_count, null_count, 
bytes, mr, stream.value()); + auto d_chars = chars_column->mutable_view().template data(); // fill in chars thrust::for_each_n( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [lhs_begin, rhs_begin, filter_fn, d_offsets, d_chars] __device__(size_type idx) { @@ -110,7 +112,7 @@ std::unique_ptr copy_if_else( std::move(chars_column), null_count, std::move(null_mask), - stream, + stream.value(), mr); } diff --git a/cpp/include/cudf/strings/detail/merge.cuh b/cpp/include/cudf/strings/detail/merge.cuh index 114a4195a95..6bdbce3c933 100644 --- a/cpp/include/cudf/strings/detail/merge.cuh +++ b/cpp/include/cudf/strings/detail/merge.cuh @@ -17,8 +17,8 @@ #include #include #include +#include #include -#include #include #include #include @@ -62,7 +62,7 @@ std::unique_ptr merge(strings_column_view const& lhs, rmm::device_buffer null_mask{0, stream, mr}; size_type null_count = lhs.null_count() + rhs.null_count(); if (null_count > 0) - null_mask = create_null_mask(strings_count, mask_state::ALL_VALID, stream, mr); + null_mask = cudf::detail::create_null_mask(strings_count, mask_state::ALL_VALID, stream, mr); // build offsets column auto offsets_transformer = [d_lhs, d_rhs] __device__(auto index_pair) { diff --git a/cpp/include/cudf/strings/detail/modify_strings.cuh b/cpp/include/cudf/strings/detail/modify_strings.cuh index e61a404441b..c90ca4575f8 100644 --- a/cpp/include/cudf/strings/detail/modify_strings.cuh +++ b/cpp/include/cudf/strings/detail/modify_strings.cuh @@ -16,12 +16,14 @@ #pragma once #include +#include #include #include #include #include #include +#include namespace cudf { namespace strings { @@ -65,7 +67,8 @@ std::unique_ptr modify_strings(strings_column_view const& strings, size_type null_count = strings.null_count(); // copy null mask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // get the lookup tables used for case conversion device_probe_functor d_probe_fctr{d_column, std::forward(args)...}; diff --git a/cpp/include/cudf/strings/detail/scatter.cuh b/cpp/include/cudf/strings/detail/scatter.cuh index 53d2310364d..627b9902506 100644 --- a/cpp/include/cudf/strings/detail/scatter.cuh +++ b/cpp/include/cudf/strings/detail/scatter.cuh @@ -17,10 +17,13 @@ #include #include +#include #include #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -62,7 +65,8 @@ std::unique_ptr scatter( // create null mask -- caller must update this rmm::device_buffer null_mask{0, stream, mr}; - if (target.has_nulls()) null_mask = copy_bitmask(target.parent(), stream, mr); + if (target.has_nulls()) + null_mask = cudf::detail::copy_bitmask(target.parent(), rmm::cuda_stream_view{stream}, mr); // create string vectors rmm::device_vector target_vector = create_string_vector_from_column(target, stream); diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index 52075c6d93b..55aabb87d8d 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -17,35 +17,36 @@ * limitations under the License. 
*/ -#include -#include -#include -#include -#include -#include -#include -#include -#include - #include +#include +#include + #include #include #include -#include -#include // replace eventually - -#include "compiled/binary_ops.hpp" -#include "cudf/binaryop.hpp" -#include "cudf/fixed_point/fixed_point.hpp" -#include "cudf/types.hpp" - #include #include #include #include #include #include + +#include +#include +#include // replace eventually +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { @@ -62,9 +63,9 @@ rmm::device_buffer scalar_col_valid_mask_and(column_view const& col, if (col.is_empty()) return rmm::device_buffer{0, stream, mr}; if (not s.is_valid()) { - return create_null_mask(col.size(), mask_state::ALL_NULL, stream, mr); + return cudf::detail::create_null_mask(col.size(), mask_state::ALL_NULL, stream, mr); } else if (s.is_valid() and col.nullable()) { - return copy_bitmask(col, stream, mr); + return cudf::detail::copy_bitmask(col, rmm::cuda_stream_view{stream}, mr); } else { return rmm::device_buffer{0, stream, mr}; } @@ -336,7 +337,7 @@ std::unique_ptr make_fixed_width_column_for_output(column_view const& lh if (binops::is_null_dependent(op)) { return make_fixed_width_column(output_type, rhs.size(), mask_state::ALL_VALID, stream, mr); } else { - auto new_mask = bitmask_and(table_view({lhs, rhs}), mr, stream); + auto new_mask = cudf::detail::bitmask_and(table_view({lhs, rhs}), stream, mr); return make_fixed_width_column( output_type, lhs.size(), std::move(new_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr); } @@ -731,7 +732,7 @@ std::unique_ptr binary_operation(column_view const& lhs, CUDF_EXPECTS((lhs.size() == rhs.size()), "Column sizes don't match"); - auto new_mask = bitmask_and(table_view({lhs, rhs}), mr, stream); + auto new_mask = bitmask_and(table_view({lhs, rhs}), stream, mr); auto out = make_fixed_width_column( output_type, lhs.size(), std::move(new_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr); diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu index 0109e788eb4..e21681a8467 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cu +++ b/cpp/src/binaryop/compiled/binary_ops.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -164,7 +165,7 @@ struct binary_op { rmm::mr::device_memory_resource* mr, cudaStream_t stream) { - auto new_mask = bitmask_and(table_view({lhs, rhs}), mr, stream); + auto new_mask = cudf::detail::bitmask_and(table_view({lhs, rhs}), stream, mr); auto out = make_fixed_width_column( out_type, lhs.size(), std::move(new_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr); diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 429697f64c6..bc464cab372 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -25,17 +25,21 @@ #include #include +#include +#include +#include + #include #include #include #include + #include -#include -#include #include #include #include +#include "rmm/mr/device/device_memory_resource.hpp" namespace cudf { size_type state_null_count(mask_state state, size_type size) @@ -67,10 +71,12 @@ size_type num_bitmask_words(size_type number_of_bits) detail::size_in_bits()); } +namespace detail { + // Create a device_buffer for a null mask rmm::device_buffer create_null_mask(size_type size, mask_state state, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { size_type mask_size{0}; @@ -81,13 
+87,14 @@ rmm::device_buffer create_null_mask(size_type size, if (state != mask_state::UNINITIALIZED) { uint8_t fill_value = (state == mask_state::ALL_VALID) ? 0xff : 0x00; - CUDA_TRY( - cudaMemsetAsync(static_cast(mask.data()), fill_value, mask_size, stream)); + CUDA_TRY(cudaMemsetAsync( + static_cast(mask.data()), fill_value, mask_size, stream.value())); } return mask; } +namespace { __global__ void set_null_mask_kernel(bitmask_type *__restrict__ destination, size_type begin_bit, size_type end_bit, @@ -116,12 +123,15 @@ __global__ void set_null_mask_kernel(bitmask_type *__restrict__ destination, } } } +} // namespace -// Set pre-allocated null mask of given bit range [begin_bit, end_bit) -// to valid, if valid==true, +// Set pre-allocated null mask of given bit range [begin_bit, end_bit) to valid, if valid==true, // or null, otherwise; -void set_null_mask( - bitmask_type *bitmask, size_type begin_bit, size_type end_bit, bool valid, cudaStream_t stream) +void set_null_mask(bitmask_type *bitmask, + size_type begin_bit, + size_type end_bit, + bool valid, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(begin_bit >= 0, "Invalid range."); @@ -130,12 +140,29 @@ void set_null_mask( auto number_of_mask_words = num_bitmask_words(end_bit) - begin_bit / detail::size_in_bits(); cudf::detail::grid_1d config(number_of_mask_words, 256); - set_null_mask_kernel<<>>( + set_null_mask_kernel<<>>( static_cast(bitmask), begin_bit, end_bit, valid, number_of_mask_words); CHECK_CUDA(stream); } } +} // namespace detail + +// Create a device_buffer for a null mask +rmm::device_buffer create_null_mask(size_type size, + mask_state state, + rmm::mr::device_memory_resource *mr) +{ + return detail::create_null_mask(size, state, rmm::cuda_stream_default, mr); +} + +// Set pre-allocated null mask of given bit range [begin_bit, end_bit) to valid, if valid==true, +// or null, otherwise; +void set_null_mask(bitmask_type *bitmask, size_type begin_bit, size_type end_bit, bool valid) +{ + return detail::set_null_mask(bitmask, begin_bit, end_bit, valid); +} + namespace { /** @@ -371,12 +398,56 @@ struct to_word_index : public thrust::unary_function { namespace detail { +// Create a bitmask from a specific range +rmm::device_buffer copy_bitmask(bitmask_type const *mask, + size_type begin_bit, + size_type end_bit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + CUDF_FUNC_RANGE(); + CUDF_EXPECTS(begin_bit >= 0, "Invalid range."); + CUDF_EXPECTS(begin_bit <= end_bit, "Invalid bit range."); + rmm::device_buffer dest_mask{}; + auto num_bytes = bitmask_allocation_size_bytes(end_bit - begin_bit); + if ((mask == nullptr) || (num_bytes == 0)) { return dest_mask; } + if (begin_bit == 0) { + dest_mask = rmm::device_buffer{static_cast(mask), num_bytes, stream, mr}; + } else { + auto number_of_mask_words = num_bitmask_words(end_bit - begin_bit); + dest_mask = rmm::device_buffer{num_bytes, stream, mr}; + cudf::detail::grid_1d config(number_of_mask_words, 256); + copy_offset_bitmask<<>>( + static_cast(dest_mask.data()), + mask, + begin_bit, + end_bit, + number_of_mask_words); + CHECK_CUDA(stream.value()); + } + return dest_mask; +} + +// Create a bitmask from a column view +rmm::device_buffer copy_bitmask(column_view const &view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + CUDF_FUNC_RANGE(); + rmm::device_buffer null_mask{0, stream, mr}; + if (view.nullable()) { + null_mask = + copy_bitmask(view.null_mask(), view.offset(), view.offset() + view.size(), stream, mr); + } 
+ return null_mask; +} + // Inplace Bitwise AND of the masks void inplace_bitmask_and(bitmask_type *dest_mask, std::vector const &masks, std::vector const &begin_bits, size_type mask_size, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { CUDF_EXPECTS(std::all_of(begin_bits.begin(), begin_bits.end(), [](auto b) { return b >= 0; }), @@ -385,15 +456,13 @@ void inplace_bitmask_and(bitmask_type *dest_mask, CUDF_EXPECTS(std::all_of(masks.begin(), masks.end(), [](auto p) { return p != nullptr; }), "Mask pointer cannot be null"); - auto num_bytes = bitmask_allocation_size_bytes(mask_size); - auto number_of_mask_words = num_bitmask_words(mask_size); rmm::device_vector d_masks(masks); rmm::device_vector d_begin_bits(begin_bits); cudf::detail::grid_1d config(number_of_mask_words, 256); - offset_bitmask_and<<>>( + offset_bitmask_and<<>>( dest_mask, d_masks.data().get(), d_begin_bits.data().get(), @@ -401,21 +470,19 @@ void inplace_bitmask_and(bitmask_type *dest_mask, mask_size, number_of_mask_words); - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); } // Bitwise AND of the masks rmm::device_buffer bitmask_and(std::vector const &masks, std::vector const &begin_bits, size_type mask_size, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { rmm::device_buffer dest_mask{}; auto num_bytes = bitmask_allocation_size_bytes(mask_size); - auto number_of_mask_words = num_bitmask_words(mask_size); - dest_mask = rmm::device_buffer{num_bytes, stream, mr}; inplace_bitmask_and( static_cast(dest_mask.data()), masks, begin_bits, mask_size, stream, mr); @@ -426,7 +493,7 @@ rmm::device_buffer bitmask_and(std::vector const &masks, cudf::size_type count_set_bits(bitmask_type const *bitmask, size_type start, size_type stop, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default) { if (nullptr == bitmask) { return 0; } @@ -444,8 +511,9 @@ cudf::size_type count_set_bits(bitmask_type const *bitmask, rmm::device_scalar non_zero_count(0, stream); - count_set_bits_kernel<<>>( - bitmask, start, stop - 1, non_zero_count.data()); + count_set_bits_kernel + <<>>( + bitmask, start, stop - 1, non_zero_count.data()); return non_zero_count.value(); } @@ -453,7 +521,7 @@ cudf::size_type count_set_bits(bitmask_type const *bitmask, cudf::size_type count_unset_bits(bitmask_type const *bitmask, size_type start, size_type stop, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default) { if (nullptr == bitmask) { return 0; } auto num_bits = (stop - start); @@ -462,7 +530,7 @@ cudf::size_type count_unset_bits(bitmask_type const *bitmask, std::vector segmented_count_set_bits(bitmask_type const *bitmask, std::vector const &indices, - cudaStream_t stream) + rmm::cuda_stream_view stream) { CUDF_EXPECTS(indices.size() % 2 == 0, "Array of indices needs to have an even number of elements."); @@ -522,7 +590,7 @@ std::vector segmented_count_set_bits(bitmask_type const *bitmask, num_ranges, first_word_indices, last_word_indices, - stream)); + stream.value())); rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); // second perform segmented reduction @@ -534,7 +602,7 @@ std::vector segmented_count_set_bits(bitmask_type const *bitmask, num_ranges, first_word_indices, last_word_indices, - stream)); + stream.value())); CHECK_CUDA(stream); @@ -548,7 +616,7 @@ std::vector segmented_count_set_bits(bitmask_type const *bitmask, subtract_set_bits_range_boundaries_kerenel<<>>( + stream.value()>>>( bitmask, 
num_ranges, d_first_indices.begin(), d_last_indices.begin(), d_null_counts.begin()); CHECK_CUDA(stream); @@ -558,16 +626,16 @@ std::vector segmented_count_set_bits(bitmask_type const *bitmask, d_null_counts.data().get(), num_ranges * sizeof(size_type), cudaMemcpyDeviceToHost, - stream)); + stream.value())); - CUDA_TRY(cudaStreamSynchronize(stream)); // now ret is valid. + stream.synchronize(); // now ret is valid. return ret; } std::vector segmented_count_unset_bits(bitmask_type const *bitmask, std::vector const &indices, - cudaStream_t stream) + rmm::cuda_stream_view stream) { if (indices.empty()) { return std::vector{}; @@ -587,8 +655,8 @@ std::vector segmented_count_unset_bits(bitmask_type const *bitmask, // Returns the bitwise AND of the null masks of all columns in the table view rmm::device_buffer bitmask_and(table_view const &view, - rmm::mr::device_memory_resource *mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { CUDF_FUNC_RANGE(); rmm::device_buffer null_mask{0, stream, mr}; @@ -631,7 +699,7 @@ std::vector segmented_count_set_bits(bitmask_type const *bitmask, std::vector const &indices) { CUDF_FUNC_RANGE(); - return detail::segmented_count_set_bits(bitmask, indices, 0); + return detail::segmented_count_set_bits(bitmask, indices, rmm::cuda_stream_default); } // Count zero bits in the specified ranges @@ -639,57 +707,27 @@ std::vector segmented_count_unset_bits(bitmask_type const *bitmask, std::vector const &indices) { CUDF_FUNC_RANGE(); - return detail::segmented_count_unset_bits(bitmask, indices, 0); + return detail::segmented_count_unset_bits(bitmask, indices, rmm::cuda_stream_default); } // Create a bitmask from a specific range rmm::device_buffer copy_bitmask(bitmask_type const *mask, size_type begin_bit, size_type end_bit, - cudaStream_t stream, rmm::mr::device_memory_resource *mr) { - CUDF_FUNC_RANGE(); - CUDF_EXPECTS(begin_bit >= 0, "Invalid range."); - CUDF_EXPECTS(begin_bit <= end_bit, "Invalid bit range."); - rmm::device_buffer dest_mask{}; - auto num_bytes = bitmask_allocation_size_bytes(end_bit - begin_bit); - if ((mask == nullptr) || (num_bytes == 0)) { return dest_mask; } - if (begin_bit == 0) { - dest_mask = rmm::device_buffer{static_cast(mask), num_bytes, stream, mr}; - } else { - auto number_of_mask_words = num_bitmask_words(end_bit - begin_bit); - dest_mask = rmm::device_buffer{num_bytes, stream, mr}; - cudf::detail::grid_1d config(number_of_mask_words, 256); - copy_offset_bitmask<<>>( - static_cast(dest_mask.data()), - mask, - begin_bit, - end_bit, - number_of_mask_words); - CHECK_CUDA(stream); - } - return dest_mask; + return detail::copy_bitmask(mask, begin_bit, end_bit, rmm::cuda_stream_default, mr); } // Create a bitmask from a column view -rmm::device_buffer copy_bitmask(column_view const &view, - cudaStream_t stream, - rmm::mr::device_memory_resource *mr) +rmm::device_buffer copy_bitmask(column_view const &view, rmm::mr::device_memory_resource *mr) { - rmm::device_buffer null_mask{0, stream, mr}; - if (view.nullable()) { - null_mask = - copy_bitmask(view.null_mask(), view.offset(), view.offset() + view.size(), stream, mr); - } - return null_mask; + return detail::copy_bitmask(view, rmm::cuda_stream_default, mr); } -rmm::device_buffer bitmask_and(table_view const &view, - rmm::mr::device_memory_resource *mr, - cudaStream_t stream) +rmm::device_buffer bitmask_and(table_view const &view, rmm::mr::device_memory_resource *mr) { - return detail::bitmask_and(view, mr, stream); + return detail::bitmask_and(view, 
rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/column/column.cu b/cpp/src/column/column.cu index 809abe40989..399bc26f786 100644 --- a/cpp/src/column/column.cu +++ b/cpp/src/column/column.cu @@ -38,6 +38,7 @@ #include #include "cudf/structs/structs_column_view.hpp" #include "cudf/types.hpp" +#include "rmm/cuda_stream_view.hpp" namespace cudf { // Copy constructor @@ -207,12 +208,13 @@ struct create_column_from_view { children.emplace_back(std::make_unique(indices_view, stream, mr)); children.emplace_back(std::make_unique(dict_view.keys(), stream, mr)); } - return std::make_unique(view.type(), - view.size(), - rmm::device_buffer{0, stream, mr}, - cudf::copy_bitmask(view, stream, mr), - view.null_count(), - std::move(children)); + return std::make_unique( + view.type(), + view.size(), + rmm::device_buffer{0, stream, mr}, + cudf::detail::copy_bitmask(view, rmm::cuda_stream_view{stream}, mr), + view.null_count(), + std::move(children)); } template ()> * = nullptr> @@ -231,7 +233,7 @@ struct create_column_from_view { view.size() * cudf::size_of(view.type()), stream, mr}, - cudf::copy_bitmask(view, stream, mr), + cudf::detail::copy_bitmask(view, rmm::cuda_stream_view{stream}, mr), view.null_count(), std::move(children)); } @@ -265,12 +267,13 @@ struct create_column_from_view { auto num_rows = children.empty() ? 0 : children.front()->size(); - return make_structs_column(num_rows, - std::move(children), - view.null_count(), - cudf::copy_bitmask(view.null_mask(), begin, end, stream, mr), - stream, - mr); + return make_structs_column( + num_rows, + std::move(children), + view.null_count(), + cudf::detail::copy_bitmask(view.null_mask(), begin, end, rmm::cuda_stream_view{stream}, mr), + stream, + mr); } }; } // anonymous namespace diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp index 648e1a14708..efbfd1de501 100644 --- a/cpp/src/column/column_factories.cpp +++ b/cpp/src/column/column_factories.cpp @@ -16,10 +16,10 @@ #include #include +#include #include #include #include -#include #include #include #include @@ -36,6 +36,7 @@ struct size_of_helper { constexpr int operator()() const { CUDF_FAIL("Invalid, non fixed-width element type."); + return 0; } template make_numeric_column(data_type type, return std::make_unique(type, size, rmm::device_buffer{size * cudf::size_of(type), stream, mr}, - create_null_mask(size, state, stream, mr), + detail::create_null_mask(size, state, stream, mr), state_null_count(state, size), std::vector>{}); } @@ -99,7 +100,7 @@ std::unique_ptr make_fixed_point_column(data_type type, return std::make_unique(type, size, rmm::device_buffer{size * cudf::size_of(type), stream, mr}, - create_null_mask(size, state, stream, mr), + detail::create_null_mask(size, state, stream, mr), state_null_count(state, size), std::vector>{}); } @@ -117,7 +118,7 @@ std::unique_ptr make_timestamp_column(data_type type, return std::make_unique(type, size, rmm::device_buffer{size * cudf::size_of(type), stream, mr}, - create_null_mask(size, state, stream, mr), + detail::create_null_mask(size, state, stream, mr), state_null_count(state, size), std::vector>{}); } @@ -135,7 +136,7 @@ std::unique_ptr make_duration_column(data_type type, return std::make_unique(type, size, rmm::device_buffer{size * cudf::size_of(type), stream, mr}, - create_null_mask(size, state, stream, mr), + detail::create_null_mask(size, state, stream, mr), state_null_count(state, size), std::vector>{}); } @@ -182,17 +183,18 @@ std::unique_ptr 
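// A minimal sketch (not taken from the patch) of the wrapper pattern established above:
// the public overloads drop their stream parameter and forward to the cudf::detail
// overloads on rmm::cuda_stream_default, while stream-aware internal callers use the
// detail overloads directly. Function names here are hypothetical.
#include <cudf/column/column_view.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/null_mask.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

// Internal-style call: explicit stream and memory resource.
rmm::device_buffer mask_on_stream(cudf::column_view const& col, rmm::cuda_stream_view stream)
{
  return cudf::detail::copy_bitmask(col, stream, rmm::mr::get_current_device_resource());
}

// Public-style call: no stream parameter, work is issued on rmm::cuda_stream_default.
rmm::device_buffer mask_on_default_stream(cudf::column_view const& col)
{
  return cudf::copy_bitmask(col);
}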
column_from_scalar_dispatch::operator()(value.type(), size, rmm::device_buffer{0, stream, mr}, - create_null_mask(size, mask_state::ALL_NULL, stream, mr), + null_mask, size); // Create a strings column_view with all nulls and no children. // Since we are setting every row to the scalar, the fill() never needs to access // any of the children in the strings column which would otherwise cause an exception. - auto null_mask = create_null_mask(size, mask_state::ALL_NULL, stream); column_view sc{ data_type{type_id::STRING}, size, nullptr, static_cast(null_mask.data()), size}; auto sv = static_cast const&>(value); diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index 225e08eb1a8..0ab19f5af1a 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -225,7 +226,7 @@ std::unique_ptr fused_concatenate(std::vector const& views, // Allocate output auto const policy = has_nulls ? mask_policy::ALWAYS : mask_policy::NEVER; - auto out_col = detail::allocate_like(views.front(), output_size, policy, mr, stream); + auto out_col = detail::allocate_like(views.front(), output_size, policy, stream, mr); out_col->set_null_count(0); // prevent null count from being materialized auto out_view = out_col->mutable_view(); auto d_out_view = mutable_column_device_view::create(out_view, stream); @@ -386,7 +387,7 @@ rmm::device_buffer concatenate_masks(std::vector const& views, }); rmm::device_buffer null_mask = - create_null_mask(total_element_count, mask_state::UNINITIALIZED, 0, mr); + create_null_mask(total_element_count, mask_state::UNINITIALIZED, mr); detail::concatenate_masks(views, static_cast(null_mask.data()), 0); diff --git a/cpp/src/copying/copy.cpp b/cpp/src/copying/copy.cpp index 9e7211e9757..6c0aeb601c2 100644 --- a/cpp/src/copying/copy.cpp +++ b/cpp/src/copying/copy.cpp @@ -18,13 +18,14 @@ #include #include #include +#include #include #include -#include #include #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { @@ -49,8 +50,8 @@ inline mask_state should_allocate_mask(mask_allocation_policy mask_alloc, bool m std::unique_ptr allocate_like(column_view const& input, size_type size, mask_allocation_policy mask_alloc, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(is_fixed_width(input.type()), "Expects only fixed-width type column"); @@ -59,13 +60,13 @@ std::unique_ptr allocate_like(column_view const& input, std::vector> children{}; children.reserve(input.num_children()); for (size_type index = 0; index < input.num_children(); index++) { - children.emplace_back(allocate_like(input.child(index), size, mask_alloc, mr, stream)); + children.emplace_back(allocate_like(input.child(index), size, mask_alloc, stream, mr)); } return std::make_unique(input.type(), size, rmm::device_buffer(size * size_of(input.type()), stream, mr), - create_null_mask(size, allocate_mask, stream, mr), + detail::create_null_mask(size, allocate_mask, stream, mr), state_null_count(allocate_mask, input.size()), std::move(children)); } @@ -107,7 +108,7 @@ std::unique_ptr allocate_like(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::allocate_like(input, input.size(), mask_alloc, mr); + return detail::allocate_like(input, input.size(), mask_alloc, rmm::cuda_stream_default, mr); } std::unique_ptr 
allocate_like(column_view const& input, @@ -116,7 +117,7 @@ std::unique_ptr allocate_like(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::allocate_like(input, size, mask_alloc, mr); + return detail::allocate_like(input, size, mask_alloc, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/copying/copy.cu b/cpp/src/copying/copy.cu index f4858714705..619d24c1204 100644 --- a/cpp/src/copying/copy.cu +++ b/cpp/src/copying/copy.cu @@ -21,6 +21,7 @@ #include #include #include "cudf/fixed_point/fixed_point.hpp" +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { @@ -36,27 +37,27 @@ struct copy_if_else_functor_impl { bool left_nullable, bool right_nullable, Filter filter, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (left_nullable) { if (right_nullable) { auto lhs_iter = cudf::detail::make_pair_iterator(lhs); auto rhs_iter = cudf::detail::make_pair_iterator(rhs); - return detail::copy_if_else(true, lhs_iter, lhs_iter + size, rhs_iter, filter, mr, stream); + return detail::copy_if_else(true, lhs_iter, lhs_iter + size, rhs_iter, filter, stream, mr); } auto lhs_iter = cudf::detail::make_pair_iterator(lhs); auto rhs_iter = cudf::detail::make_pair_iterator(rhs); - return detail::copy_if_else(true, lhs_iter, lhs_iter + size, rhs_iter, filter, mr, stream); + return detail::copy_if_else(true, lhs_iter, lhs_iter + size, rhs_iter, filter, stream, mr); } if (right_nullable) { auto lhs_iter = cudf::detail::make_pair_iterator(lhs); auto rhs_iter = cudf::detail::make_pair_iterator(rhs); - return detail::copy_if_else(true, lhs_iter, lhs_iter + size, rhs_iter, filter, mr, stream); + return detail::copy_if_else(true, lhs_iter, lhs_iter + size, rhs_iter, filter, stream, mr); } auto lhs_iter = cudf::detail::make_pair_iterator(lhs); auto rhs_iter = cudf::detail::make_pair_iterator(rhs); - return detail::copy_if_else(false, lhs_iter, lhs_iter + size, rhs_iter, filter, mr, stream); + return detail::copy_if_else(false, lhs_iter, lhs_iter + size, rhs_iter, filter, stream, mr); } }; @@ -71,8 +72,8 @@ struct copy_if_else_functor_impl { bool left_nullable, bool right_nullable, Filter filter, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using T = string_view; @@ -81,20 +82,20 @@ struct copy_if_else_functor_impl { auto lhs_iter = cudf::detail::make_pair_iterator(lhs); auto rhs_iter = cudf::detail::make_pair_iterator(rhs); return strings::detail::copy_if_else( - lhs_iter, lhs_iter + size, rhs_iter, filter, mr, stream); + lhs_iter, lhs_iter + size, rhs_iter, filter, stream, mr); } auto lhs_iter = cudf::detail::make_pair_iterator(lhs); auto rhs_iter = cudf::detail::make_pair_iterator(rhs); - return strings::detail::copy_if_else(lhs_iter, lhs_iter + size, rhs_iter, filter, mr, stream); + return strings::detail::copy_if_else(lhs_iter, lhs_iter + size, rhs_iter, filter, stream, mr); } if (right_nullable) { auto lhs_iter = cudf::detail::make_pair_iterator(lhs); auto rhs_iter = cudf::detail::make_pair_iterator(rhs); - return strings::detail::copy_if_else(lhs_iter, lhs_iter + size, rhs_iter, filter, mr, stream); + return strings::detail::copy_if_else(lhs_iter, lhs_iter + size, rhs_iter, filter, stream, mr); } auto lhs_iter = cudf::detail::make_pair_iterator(lhs); auto rhs_iter = cudf::detail::make_pair_iterator(rhs); - return 
strings::detail::copy_if_else(lhs_iter, lhs_iter + size, rhs_iter, filter, mr, stream); + return strings::detail::copy_if_else(lhs_iter, lhs_iter + size, rhs_iter, filter, stream, mr); } }; @@ -109,8 +110,8 @@ struct copy_if_else_functor_impl { bool left_nullable, bool right_nullable, Filter filter, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("copy_if_else not supported for list_view yet"); } @@ -124,8 +125,8 @@ struct copy_if_else_functor_impl { bool left_nullable, bool right_nullable, Filter filter, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("copy_if_else not supported for struct_view yet"); } @@ -142,8 +143,8 @@ struct copy_if_else_functor_impl { bool left_nullable, bool right_nullable, Filter filter, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("copy_if_else not supported for decimal32 yet"); } @@ -160,8 +161,8 @@ struct copy_if_else_functor_impl { bool left_nullable, bool right_nullable, Filter filter, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("copy_if_else not supported for decimal64 yet"); } @@ -179,11 +180,11 @@ struct copy_if_else_functor { bool left_nullable, bool right_nullable, Filter filter, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { copy_if_else_functor_impl copier{}; - return copier(lhs, rhs, size, left_nullable, right_nullable, filter, mr, stream); + return copier(lhs, rhs, size, left_nullable, right_nullable, filter, stream, mr); } }; @@ -194,8 +195,8 @@ std::unique_ptr copy_if_else(Left const& lhs, bool left_nullable, bool right_nullable, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(lhs.type() == rhs.type(), "Both inputs must be of the same type"); CUDF_EXPECTS(boolean_mask.type() == data_type(type_id::BOOL8), @@ -218,8 +219,8 @@ std::unique_ptr copy_if_else(Left const& lhs, left_nullable, right_nullable, filter, - mr, - stream); + stream, + mr); } else { auto filter = [bool_mask_device] __device__(cudf::size_type i) { return bool_mask_device.element(i); @@ -232,8 +233,8 @@ std::unique_ptr copy_if_else(Left const& lhs, left_nullable, right_nullable, filter, - mr, - stream); + stream, + mr); } } @@ -242,8 +243,8 @@ std::unique_ptr copy_if_else(Left const& lhs, std::unique_ptr copy_if_else(column_view const& lhs, column_view const& rhs, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(boolean_mask.size() == lhs.size(), "Boolean mask column must be the same size as lhs and rhs columns"); @@ -253,15 +254,15 @@ std::unique_ptr copy_if_else(column_view const& lhs, lhs.has_nulls(), rhs.has_nulls(), boolean_mask, - mr, - stream); + stream, + mr); } std::unique_ptr copy_if_else(scalar const& lhs, column_view const& rhs, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(boolean_mask.size() == 
rhs.size(), "Boolean mask column must be the same size as rhs column"); @@ -270,15 +271,15 @@ std::unique_ptr copy_if_else(scalar const& lhs, !lhs.is_valid(), rhs.has_nulls(), boolean_mask, - mr, - stream); + stream, + mr); } std::unique_ptr copy_if_else(column_view const& lhs, scalar const& rhs, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(boolean_mask.size() == lhs.size(), "Boolean mask column must be the same size as lhs column"); @@ -287,17 +288,17 @@ std::unique_ptr copy_if_else(column_view const& lhs, lhs.has_nulls(), !rhs.is_valid(), boolean_mask, - mr, - stream); + stream, + mr); } std::unique_ptr copy_if_else(scalar const& lhs, scalar const& rhs, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return copy_if_else(lhs, rhs, !lhs.is_valid(), !rhs.is_valid(), boolean_mask, mr, stream); + return copy_if_else(lhs, rhs, !lhs.is_valid(), !rhs.is_valid(), boolean_mask, stream, mr); } }; // namespace detail @@ -308,7 +309,7 @@ std::unique_ptr copy_if_else(column_view const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::copy_if_else(lhs, rhs, boolean_mask, mr); + return detail::copy_if_else(lhs, rhs, boolean_mask, rmm::cuda_stream_default, mr); } std::unique_ptr copy_if_else(scalar const& lhs, @@ -317,7 +318,7 @@ std::unique_ptr copy_if_else(scalar const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::copy_if_else(lhs, rhs, boolean_mask, mr); + return detail::copy_if_else(lhs, rhs, boolean_mask, rmm::cuda_stream_default, mr); } std::unique_ptr copy_if_else(column_view const& lhs, @@ -326,7 +327,7 @@ std::unique_ptr copy_if_else(column_view const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::copy_if_else(lhs, rhs, boolean_mask, mr); + return detail::copy_if_else(lhs, rhs, boolean_mask, rmm::cuda_stream_default, mr); } std::unique_ptr copy_if_else(scalar const& lhs, @@ -335,7 +336,7 @@ std::unique_ptr copy_if_else(scalar const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::copy_if_else(lhs, rhs, boolean_mask, mr); + return detail::copy_if_else(lhs, rhs, boolean_mask, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/copying/copy_range.cu b/cpp/src/copying/copy_range.cu index 812867ba3ca..daca5900768 100644 --- a/cpp/src/copying/copy_range.cu +++ b/cpp/src/copying/copy_range.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -103,7 +104,7 @@ struct out_of_place_copy_range_dispatch { auto p_ret = std::make_unique(target, stream, mr); if ((!p_ret->nullable()) && source.has_nulls(source_begin, source_end)) { p_ret->set_null_mask( - cudf::create_null_mask(p_ret->size(), cudf::mask_state::ALL_VALID, stream, mr), 0); + cudf::detail::create_null_mask(p_ret->size(), cudf::mask_state::ALL_VALID, stream, mr), 0); } if (source_end != source_begin) { // otherwise no-op diff --git a/cpp/src/copying/sample.cu b/cpp/src/copying/sample.cu index e3be4d4cc13..c270be1ccca 100644 --- a/cpp/src/copying/sample.cu +++ b/cpp/src/copying/sample.cu @@ -21,6 +21,7 @@ #include #include #include +#include "rmm/cuda_stream_view.hpp" #include #include @@ -34,8 +35,8 @@ std::unique_ptr
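// A minimal sketch (not taken from the patch) of the calling convention after the
// copy_if_else hunks above: the detail overloads now take their trailing parameters in
// (stream, mr) order rather than (mr, stream), so the memory resource stays last and can
// keep a default argument. The wrapper below is hypothetical.
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/detail/copy.hpp>
#include <rmm/cuda_stream_view.hpp>

std::unique_ptr<cudf::column> select_rows(cudf::column_view const& lhs,
                                          cudf::column_view const& rhs,
                                          cudf::column_view const& boolean_mask,
                                          rmm::cuda_stream_view stream,
                                          rmm::mr::device_memory_resource* mr)
{
  // Per-row selection: take lhs[i] where boolean_mask[i] is true, rhs[i] otherwise.
  return cudf::detail::copy_if_else(lhs, rhs, boolean_mask, stream, mr);
}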
sample(table_view const& input, size_type const n, sample_with_replacement replacement, int64_t const seed, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(n >= 0, "expected number of samples should be non-negative"); auto const num_rows = input.num_rows(); @@ -58,13 +59,13 @@ std::unique_ptr
sample(table_view const& input, thrust::make_transform_iterator(thrust::counting_iterator(0), RandomGen); auto end = thrust::make_transform_iterator(thrust::counting_iterator(n), RandomGen); - return detail::gather(input, begin, end, false, mr, stream); + return detail::gather(input, begin, end, false, mr, stream.value()); } else { - auto gather_map = - make_numeric_column(data_type{type_id::INT32}, num_rows, mask_state::UNALLOCATED, stream); + auto gather_map = make_numeric_column( + data_type{type_id::INT32}, num_rows, mask_state::UNALLOCATED, stream.value()); auto gather_map_mutable_view = gather_map->mutable_view(); // Shuffle all the row indices - thrust::shuffle_copy(rmm::exec_policy(stream)->on(stream), + thrust::shuffle_copy(rmm::exec_policy(stream)->on(stream.value()), thrust::counting_iterator(0), thrust::counting_iterator(num_rows), gather_map_mutable_view.begin(), @@ -77,7 +78,7 @@ std::unique_ptr
sample(table_view const& input,
                           gather_map_view.end<size_type>(),
                           false,
                           mr,
-                          stream);
+                          stream.value());
   }
 }
 
@@ -91,6 +92,6 @@ std::unique_ptr
sample(table_view const& input, { CUDF_FUNC_RANGE(); - return detail::sample(input, n, replacement, seed, mr); + return detail::sample(input, n, replacement, seed, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 3b893ba4f29..b0f0bbef064 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -75,7 +76,7 @@ void scatter_scalar_bitmask(std::vector> const& source, if (target[i]->nullable() or not source_is_valid) { if (not target[i]->nullable()) { // Target must have a null mask if the source is not valid - auto mask = create_null_mask(target[i]->size(), mask_state::ALL_VALID, stream, mr); + auto mask = detail::create_null_mask(target[i]->size(), mask_state::ALL_VALID, stream, mr); target[i]->set_null_mask(std::move(mask), 0); } @@ -349,7 +350,7 @@ std::unique_ptr boolean_mask_scatter(scalar const& input, rmm::mr::device_memory_resource* mr, cudaStream_t stream) { - return detail::copy_if_else(input, target, boolean_mask, mr, stream); + return detail::copy_if_else(input, target, boolean_mask, stream, mr); } std::unique_ptr
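// A minimal sketch (not taken from the patch) of the interop pattern used in the sample()
// hunks above: while a function now receives an rmm::cuda_stream_view, callees that still
// expect a raw cudaStream_t (Thrust execution policies, not-yet-converted detail APIs,
// CUDA runtime calls) are handed stream.value(). Names below are hypothetical.
#include <rmm/cuda_stream_view.hpp>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/system/cuda/execution_policy.h>

void fill_ascending(thrust::device_vector<int>& values, rmm::cuda_stream_view stream)
{
  // thrust::cuda::par.on() wants a cudaStream_t; cuda_stream_view::value() provides it.
  thrust::sequence(thrust::cuda::par.on(stream.value()), values.begin(), values.end());
  stream.synchronize();
}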
boolean_mask_scatter(table_view const& input, diff --git a/cpp/src/copying/shift.cu b/cpp/src/copying/shift.cu index b024c79ab2d..169b6760985 100644 --- a/cpp/src/copying/shift.cu +++ b/cpp/src/copying/shift.cu @@ -26,6 +26,8 @@ #include #include +#include + #include #include #include @@ -53,8 +55,8 @@ struct shift_functor { column_view const& input, size_type offset, scalar const& fill_value, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using Type = device_storage_type_t; using ScalarType = cudf::scalar_type_t; @@ -62,7 +64,7 @@ struct shift_functor { auto device_input = column_device_view::create(input); auto output = - detail::allocate_like(input, input.size(), mask_allocation_policy::NEVER, mr, stream); + detail::allocate_like(input, input.size(), mask_allocation_policy::NEVER, stream, mr); auto device_output = mutable_column_device_view::create(*output); auto size = input.size(); @@ -103,7 +105,7 @@ struct shift_functor { }; thrust::transform( - rmm::exec_policy(stream)->on(stream), index_begin, index_end, data, func_value); + rmm::exec_policy(stream)->on(stream.value()), index_begin, index_end, data, func_value); return output; } @@ -111,11 +113,13 @@ struct shift_functor { } // anonymous namespace +namespace detail { + std::unique_ptr shift(column_view const& input, size_type offset, scalar const& fill_value, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(input.type() == fill_value.type(), @@ -123,7 +127,17 @@ std::unique_ptr shift(column_view const& input, if (input.is_empty()) { return empty_like(input); } - return type_dispatcher(input.type(), shift_functor{}, input, offset, fill_value, mr, stream); + return type_dispatcher(input.type(), shift_functor{}, input, offset, fill_value, stream, mr); +} + +} // namespace detail + +std::unique_ptr shift(column_view const& input, + size_type offset, + scalar const& fill_value, + rmm::mr::device_memory_resource* mr) +{ + return detail::shift(input, offset, fill_value, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/copying/slice.cpp b/cpp/src/copying/slice.cpp index f202fd6dfb0..a9141b7a48f 100644 --- a/cpp/src/copying/slice.cpp +++ b/cpp/src/copying/slice.cpp @@ -22,12 +22,13 @@ #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { std::vector slice(column_view const& input, std::vector const& indices, - cudaStream_t stream) + rmm::cuda_stream_view stream) { CUDF_EXPECTS(indices.size() % 2 == 0, "indices size must be even"); @@ -63,7 +64,7 @@ std::vector slice(cudf::column_view const& input, std::vector const& indices) { CUDF_FUNC_RANGE(); - return detail::slice(input, indices, 0); + return detail::slice(input, indices, rmm::cuda_stream_default); } std::vector slice(cudf::table_view const& input, diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index 919d1d4eacc..c3e2cc9a2ff 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -19,11 +19,13 @@ #include #include #include +#include #include #include #include #include #include +#include "rmm/cuda_stream_view.hpp" #include @@ -165,8 +167,13 @@ std::unique_ptr apply_datetime_op(column_view const& column, // Return an empty column if source column is empty if (size == 0) return make_empty_column(output_col_type); - auto output = make_fixed_width_column( - 
output_col_type, size, copy_bitmask(column, stream, mr), column.null_count(), stream, mr); + auto output = + make_fixed_width_column(output_col_type, + size, + cudf::detail::copy_bitmask(column, rmm::cuda_stream_view{stream}, mr), + column.null_count(), + stream, + mr); auto launch = launch_functor::type>{ column, static_cast(*output)}; @@ -260,8 +267,9 @@ std::unique_ptr add_calendrical_months(column_view const& timestamp_colu // Return an empty column if source column is empty if (size == 0) return make_empty_column(output_col_type); - auto output_col_mask = bitmask_and(table_view({timestamp_column, months_column}), mr, stream); - auto output = make_fixed_width_column( + auto output_col_mask = + cudf::detail::bitmask_and(table_view({timestamp_column, months_column}), stream, mr); + auto output = make_fixed_width_column( output_col_type, size, std::move(output_col_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr); auto launch = add_calendrical_months_functor{ diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index dbe22acab27..dc18afebb3b 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -28,6 +29,7 @@ #include #include +#include namespace cudf { namespace dictionary { @@ -114,10 +116,11 @@ std::unique_ptr add_keys( // create new dictionary column with keys_column and indices_column // null mask has not changed - return make_dictionary_column(std::move(keys_column), - std::move(indices_column), - copy_bitmask(dictionary_column.parent(), stream, mr), - dictionary_column.null_count()); + return make_dictionary_column( + std::move(keys_column), + std::move(indices_column), + cudf::detail::copy_bitmask(dictionary_column.parent(), rmm::cuda_stream_view{stream}, mr), + dictionary_column.null_count()); } } // namespace detail diff --git a/cpp/src/dictionary/decode.cu b/cpp/src/dictionary/decode.cu index deaff20dc9e..c0bde1c92a5 100644 --- a/cpp/src/dictionary/decode.cu +++ b/cpp/src/dictionary/decode.cu @@ -17,12 +17,15 @@ #include #include #include +#include #include #include #include #include #include +#include + namespace cudf { namespace dictionary { namespace detail { @@ -52,7 +55,9 @@ std::unique_ptr decode(dictionary_column_view const& source, auto output_column = std::unique_ptr(std::move(table_column.front())); // apply any nulls to the output column - output_column->set_null_mask(copy_bitmask(source.parent(), stream, mr), source.null_count()); + output_column->set_null_mask( + cudf::detail::copy_bitmask(source.parent(), rmm::cuda_stream_view{stream}, mr), + source.null_count()); return output_column; } diff --git a/cpp/src/dictionary/dictionary_factories.cu b/cpp/src/dictionary/dictionary_factories.cu index dc52820d848..286f4961946 100644 --- a/cpp/src/dictionary/dictionary_factories.cu +++ b/cpp/src/dictionary/dictionary_factories.cu @@ -16,10 +16,12 @@ #include #include +#include #include #include #include #include +#include namespace cudf { namespace { @@ -57,7 +59,8 @@ std::unique_ptr make_dictionary_column(column_view const& keys_column, type_dispatcher(indices_column.type(), dispatch_create_indices{}, indices_column, mr, stream); rmm::device_buffer null_mask{0, stream, mr}; auto null_count = indices_column.null_count(); - if (null_count) null_mask = copy_bitmask(indices_column, stream, mr); + if (null_count) + null_mask = detail::copy_bitmask(indices_column, rmm::cuda_stream_view{stream}, mr); std::vector> children; 
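// A minimal sketch (not taken from the patch) of the transitional pattern visible in the
// datetime and dictionary hunks above: functions whose own signatures still take a raw
// cudaStream_t wrap it as rmm::cuda_stream_view{stream} when calling the already-converted
// detail APIs. The view is non-owning and trivially cheap to construct at each call site.
// The helper name is hypothetical.
#include <cudf/column/column_view.hpp>
#include <cudf/detail/null_mask.hpp>
#include <rmm/cuda_stream_view.hpp>

rmm::device_buffer copy_mask_from_legacy_signature(cudf::column_view const& col,
                                                   cudaStream_t stream,  // not yet converted
                                                   rmm::mr::device_memory_resource* mr)
{
  return cudf::detail::copy_bitmask(col, rmm::cuda_stream_view{stream}, mr);
}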
children.emplace_back(std::move(indices_copy)); diff --git a/cpp/src/dictionary/encode.cu b/cpp/src/dictionary/encode.cu index ce3062680e3..613974efde7 100644 --- a/cpp/src/dictionary/encode.cu +++ b/cpp/src/dictionary/encode.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -25,6 +26,7 @@ #include #include #include +#include namespace cudf { namespace dictionary { @@ -61,10 +63,11 @@ std::unique_ptr encode(column_view const& input_column, indices_column = cudf::detail::cast(indices_column->view(), indices_type, mr, stream); // create column with keys_column and indices_column - return make_dictionary_column(std::move(keys_column), - std::move(indices_column), - copy_bitmask(input_column, stream, mr), - input_column.null_count()); + return make_dictionary_column( + std::move(keys_column), + std::move(indices_column), + cudf::detail::copy_bitmask(input_column, rmm::cuda_stream_view{stream}, mr), + input_column.null_count()); } /** diff --git a/cpp/src/dictionary/replace.cu b/cpp/src/dictionary/replace.cu index 4b96b66571a..fa3219ef039 100644 --- a/cpp/src/dictionary/replace.cu +++ b/cpp/src/dictionary/replace.cu @@ -117,8 +117,8 @@ std::unique_ptr replace_indices(column_view const& input, input_pair_iterator + input.size(), replacement_iter, predicate, - mr, - stream); + stream, + mr); } } // namespace diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu index a9bd95a2876..de6ab9f7261 100644 --- a/cpp/src/filling/fill.cu +++ b/cpp/src/filling/fill.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -93,7 +94,8 @@ struct out_of_place_fill_range_dispatch { if (end != begin) { // otherwise no fill if (!p_ret->nullable() && !value.is_valid()) { p_ret->set_null_mask( - cudf::create_null_mask(p_ret->size(), cudf::mask_state::ALL_VALID, stream, mr), 0); + cudf::detail::create_null_mask(p_ret->size(), cudf::mask_state::ALL_VALID, stream, mr), + 0); } auto ret_view = p_ret->mutable_view(); @@ -153,7 +155,7 @@ std::unique_ptr out_of_place_fill_range_dispatch::operator()(input, stream, mr); auto mview = result->mutable_view(); - cudf::set_null_mask(mview.null_mask(), begin, end, false, stream); + cudf::detail::set_null_mask(mview.null_mask(), begin, end, false, stream); mview.set_null_count(input.null_count() + (end - begin)); return result; } diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 7cace035d93..5bc7e0d02f0 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -278,7 +278,8 @@ void compute_single_pass_aggs(table_view const& keys, bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; if (skip_key_rows_with_nulls) { - auto row_bitmask{bitmask_and(keys, rmm::mr::get_current_device_resource(), stream)}; + auto row_bitmask{ + cudf::detail::bitmask_and(keys, stream, rmm::mr::get_current_device_resource())}; thrust::for_each_n( rmm::exec_policy(stream)->on(stream), thrust::make_counting_iterator(0), diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index 5476a2011e7..88bdaf829a1 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -239,7 +239,8 @@ column_view sort_groupby_helper::keys_bitmask_column(cudaStream_t stream) { if (_keys_bitmask_column) return _keys_bitmask_column->view(); - auto row_bitmask = bitmask_and(_keys, rmm::mr::get_current_device_resource(), stream); + auto row_bitmask = + cudf::detail::bitmask_and(_keys, stream, 
rmm::mr::get_current_device_resource()); _keys_bitmask_column = make_numeric_column(data_type(type_id::INT8), _keys.num_rows(), diff --git a/cpp/src/interop/from_arrow.cpp b/cpp/src/interop/from_arrow.cpp index 79c95133b91..141c8121dff 100644 --- a/cpp/src/interop/from_arrow.cpp +++ b/cpp/src/interop/from_arrow.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -31,6 +32,8 @@ #include #include +#include + namespace cudf { namespace detail { @@ -132,11 +135,11 @@ struct dispatch_to_cudf_column { // If array is sliced, we have to copy whole mask and then take copy. auto out_mask = (num_rows == static_cast(data_buffer->size() / sizeof(T))) ? *tmp_mask - : copy_bitmask(static_cast(tmp_mask->data()), - array.offset(), - array.offset() + num_rows, - stream, - mr); + : cudf::detail::copy_bitmask(static_cast(tmp_mask->data()), + array.offset(), + array.offset() + num_rows, + rmm::cuda_stream_view{stream}, + mr); col->set_null_mask(std::move(out_mask)); } @@ -186,11 +189,11 @@ std::unique_ptr dispatch_to_cudf_column::operator()( auto const has_nulls = skip_mask ? false : array.null_bitmap_data() != nullptr; if (has_nulls) { auto out_mask = - copy_bitmask(static_cast(get_mask_buffer(array, mr, stream)->data()), - array.offset(), - array.offset() + array.length(), - stream, - mr); + detail::copy_bitmask(static_cast(get_mask_buffer(array, mr, stream)->data()), + array.offset(), + array.offset() + array.length(), + rmm::cuda_stream_view{stream}, + mr); out_col->set_null_mask(std::move(out_mask)); } @@ -286,11 +289,11 @@ std::unique_ptr dispatch_to_cudf_column::operator()( auto out_mask = *(get_mask_buffer(array, mr, stream)); if (struct_array->null_bitmap_data() != nullptr) { - out_mask = copy_bitmask(static_cast(out_mask.data()), - array.offset(), - array.offset() + array.length(), - stream, - mr); + out_mask = detail::copy_bitmask(static_cast(out_mask.data()), + array.offset(), + array.offset() + array.length(), + rmm::cuda_stream_view{stream}, + mr); } return make_structs_column( diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index 1753dff593b..3d6d298de71 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -23,6 +23,7 @@ #include +#include #include #include #include @@ -300,7 +301,7 @@ void reader::impl::decode_data(const rmm::device_buffer &block_data, schema_desc[schema_data_idx].count = dict[i].first; } if (out_buffers[i].null_mask_size()) { - set_null_mask(out_buffers[i].null_mask(), 0, num_rows, true, stream); + cudf::detail::set_null_mask(out_buffers[i].null_mask(), 0, num_rows, true, stream); } } rmm::device_buffer block_list( diff --git a/cpp/src/io/csv/durations.cu b/cpp/src/io/csv/durations.cu index 839a013c784..863e7f0a8b3 100644 --- a/cpp/src/io/csv/durations.cu +++ b/cpp/src/io/csv/durations.cu @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -174,7 +175,8 @@ struct dispatch_from_durations_fn { auto d_column = *column; // copy null mask - rmm::device_buffer null_mask = copy_bitmask(durations, stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(durations, rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), duration_to_string_size_fn{d_column}); diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index 90bdc42804c..cde8a321f8e 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ 
b/cpp/src/io/utilities/column_buffer.hpp @@ -22,6 +22,7 @@ #pragma once #include +#include #include #include #include @@ -31,6 +32,7 @@ #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace io { @@ -117,7 +119,10 @@ struct column_buffer { default: _data = create_data(type, size, stream, mr); break; } - if (is_nullable) { _null_mask = create_null_mask(size, mask_state::ALL_NULL, stream, mr); } + if (is_nullable) { + _null_mask = cudf::detail::create_null_mask( + size, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), mr); + } } auto data() { return _strings.size() ? _strings.data().get() : _data.data(); } diff --git a/cpp/src/lists/copying/copying.cu b/cpp/src/lists/copying/copying.cu index df5495c02e0..c7bf2139a83 100644 --- a/cpp/src/lists/copying/copying.cu +++ b/cpp/src/lists/copying/copying.cu @@ -3,6 +3,7 @@ #include #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace lists { @@ -54,7 +55,8 @@ std::unique_ptr copy_slice(lists_column_view const& lists, cudf::detail::slice(lists.child(), {start_offset, end_offset}, stream).front()); // Compute the null mask of the result: - auto null_mask = cudf::copy_bitmask(lists.null_mask(), start, end, stream, mr); + auto null_mask = + cudf::detail::copy_bitmask(lists.null_mask(), start, end, rmm::cuda_stream_view{stream}, mr); return make_lists_column(lists_count, std::move(offsets), diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index 71ac0865e5e..c22f5afe181 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -262,7 +262,7 @@ struct column_merger { // materialize_merged_bitmask_kernel() // which won't be called anymore (because of the _condition_ below) // - cudf::set_null_mask(merged_view.null_mask(), 0, merged_view.size(), true, stream_); + cudf::detail::set_null_mask(merged_view.null_mask(), 0, merged_view.size(), true, stream_); // set the null count: // diff --git a/cpp/src/quantiles/quantile.cu b/cpp/src/quantiles/quantile.cu index 09a8b714819..280cc0198cf 100644 --- a/cpp/src/quantiles/quantile.cu +++ b/cpp/src/quantiles/quantile.cu @@ -14,17 +14,20 @@ * limitations under the License. 
*/ -#include -#include +#include #include #include +#include #include #include #include #include #include -#include + +#include +#include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { @@ -36,7 +39,7 @@ struct quantile_functor { interpolation interp; bool retain_types; rmm::mr::device_memory_resource* mr; - cudaStream_t stream; + rmm::cuda_stream_view stream; template std::enable_if_t::value, std::unique_ptr> operator()( @@ -51,13 +54,14 @@ struct quantile_functor { { using Result = std::conditional_t; - auto type = data_type{type_to_id()}; - auto output = make_fixed_width_column(type, q.size(), mask_state::UNALLOCATED, stream, mr); + auto type = data_type{type_to_id()}; + auto output = + make_fixed_width_column(type, q.size(), mask_state::UNALLOCATED, stream.value(), mr); if (output->size() == 0) { return output; } if (input.is_empty()) { - auto mask = create_null_mask(output->size(), mask_state::ALL_NULL, stream, mr); + auto mask = cudf::detail::create_null_mask(output->size(), mask_state::ALL_NULL, stream, mr); output->set_null_mask(std::move(mask), output->size()); return output; } diff --git a/cpp/src/reductions/scan.cu b/cpp/src/reductions/scan.cu index 8def1be553c..d5c9527e927 100644 --- a/cpp/src/reductions/scan.cu +++ b/cpp/src/reductions/scan.cu @@ -1,18 +1,17 @@ #include #include #include -#include -#include - -#include -#include -#include - #include #include +#include +#include +#include #include #include #include +#include +#include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { @@ -47,9 +46,11 @@ struct ScanDispatcher { { const size_type size = input_view.size(); auto output_column = - detail::allocate_like(input_view, size, mask_allocation_policy::NEVER, mr, stream); + detail::allocate_like(input_view, size, mask_allocation_policy::NEVER, stream, mr); if (null_handling == null_policy::EXCLUDE) { - output_column->set_null_mask(copy_bitmask(input_view, stream, mr), input_view.null_count()); + output_column->set_null_mask( + detail::copy_bitmask(input_view, rmm::cuda_stream_view{stream}, mr), + input_view.null_count()); } mutable_column_view output = output_column->mutable_view(); auto d_input = column_device_view::create(input_view, stream); @@ -91,7 +92,7 @@ struct ScanDispatcher { cudaStream_t stream) { rmm::device_buffer mask = - create_null_mask(input_view.size(), mask_state::UNINITIALIZED, stream, mr); + detail::create_null_mask(input_view.size(), mask_state::UNINITIALIZED, stream, mr); auto d_input = column_device_view::create(input_view, stream); auto v = detail::make_validity_iterator(*d_input); auto first_null_position = @@ -114,9 +115,11 @@ struct ScanDispatcher { { const size_type size = input_view.size(); auto output_column = - detail::allocate_like(input_view, size, mask_allocation_policy::NEVER, mr, stream); + detail::allocate_like(input_view, size, mask_allocation_policy::NEVER, stream, mr); if (null_handling == null_policy::EXCLUDE) { - output_column->set_null_mask(copy_bitmask(input_view, stream, mr), input_view.null_count()); + output_column->set_null_mask( + detail::copy_bitmask(input_view, rmm::cuda_stream_view{stream}, mr), + input_view.null_count()); } else { if (input_view.nullable()) { output_column->set_null_mask(mask_inclusive_scan(input_view, mr, stream), @@ -166,7 +169,9 @@ struct ScanDispatcher { auto output_column = make_strings_column(result, Op::template identity(), stream, mr); if (null_handling == null_policy::EXCLUDE) { - output_column->set_null_mask(copy_bitmask(input_view, stream, mr), 
input_view.null_count()); + output_column->set_null_mask( + detail::copy_bitmask(input_view, rmm::cuda_stream_view{stream}, mr), + input_view.null_count()); } else { if (input_view.nullable()) { output_column->set_null_mask(mask_inclusive_scan(input_view, mr, stream), diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu index 38af16ed5e2..fff063b269a 100644 --- a/cpp/src/replace/clamp.cu +++ b/cpp/src/replace/clamp.cu @@ -158,7 +158,7 @@ std::enable_if_t(), std::unique_ptr> clamp cudaStream_t stream) { auto output = - detail::allocate_like(input, input.size(), mask_allocation_policy::NEVER, mr, stream); + detail::allocate_like(input, input.size(), mask_allocation_policy::NEVER, stream, mr); // mask will not change if (input.nullable()) { output->set_null_mask(copy_bitmask(input), input.null_count()); } diff --git a/cpp/src/replace/nans.cu b/cpp/src/replace/nans.cu index 4ce992ec9ee..6232da34f06 100644 --- a/cpp/src/replace/nans.cu +++ b/cpp/src/replace/nans.cu @@ -61,8 +61,8 @@ struct replace_nans_functor { input_pair_iterator + size, replacement_pair_iterator, predicate, - mr, - stream); + stream, + mr); } else { auto replacement_pair_iterator = make_pair_iterator(replacement); return copy_if_else(true, @@ -70,8 +70,8 @@ struct replace_nans_functor { input_pair_iterator + size, replacement_pair_iterator, predicate, - mr, - stream); + stream, + mr); } } else { auto input_pair_iterator = make_pair_iterator(*input_device_view); @@ -82,8 +82,8 @@ struct replace_nans_functor { input_pair_iterator + size, replacement_pair_iterator, predicate, - mr, - stream); + stream, + mr); } else { auto replacement_pair_iterator = make_pair_iterator(replacement); return copy_if_else(false, @@ -91,8 +91,8 @@ struct replace_nans_functor { input_pair_iterator + size, replacement_pair_iterator, predicate, - mr, - stream); + stream, + mr); } } } diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index c7bb01d3ecd..2a8fea154e5 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -32,6 +33,7 @@ #include #include #include +#include "cudf/copying.hpp" #include @@ -152,13 +154,14 @@ struct replace_nulls_column_kernel_forwarder { cudf::size_type nrows = input.size(); cudf::detail::grid_1d grid{nrows, BLOCK_SIZE}; - std::unique_ptr output; - if (replacement.has_nulls()) - output = cudf::detail::allocate_like( - input, input.size(), cudf::mask_allocation_policy::ALWAYS, mr, stream); - else - output = cudf::detail::allocate_like( - input, input.size(), cudf::mask_allocation_policy::NEVER, mr, stream); + auto output = + cudf::detail::allocate_like(input, + input.size(), + replacement.has_nulls() ? 
cudf::mask_allocation_policy::ALWAYS + : cudf::mask_allocation_policy::NEVER, + stream, + mr); + auto output_view = output->mutable_view(); auto replace = replace_nulls; @@ -217,7 +220,7 @@ std::unique_ptr replace_nulls_column_kernel_forwarder::operator()< auto device_replacement = cudf::column_device_view::create(replacement); rmm::device_buffer valid_bits = - cudf::create_null_mask(input.size(), cudf::mask_state::UNINITIALIZED, stream, mr); + cudf::detail::create_null_mask(input.size(), cudf::mask_state::UNINITIALIZED, stream, mr); // Call first pass kernel to get sizes in offsets cudf::detail::grid_1d grid{input.size(), BLOCK_SIZE, 1}; diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 07a9f0fab9f..21b583cddbe 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -37,13 +37,13 @@ #include #include #include +#include #include #include #include #include #include #include -#include #include #include #include @@ -52,6 +52,7 @@ #include #include +#include "rmm/cuda_stream_view.hpp" namespace { // anonymous @@ -317,7 +318,7 @@ struct replace_kernel_forwarder { ? cudf::mask_allocation_policy::ALWAYS : cudf::mask_allocation_policy::NEVER; return cudf::detail::allocate_like( - input_col, input_col.size(), mask_allocation_policy, mr, stream); + input_col, input_col.size(), mask_allocation_policy, stream, mr); }(); auto output_view = output->mutable_view(); @@ -395,8 +396,8 @@ std::unique_ptr replace_kernel_forwarder::operator() #include +#include #include #include #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { @@ -68,7 +70,8 @@ struct byte_list_conversion { auto offsets_column = cudf::strings::detail::make_offsets_child_column( begin, begin + input_column.size(), mr, stream); - rmm::device_buffer null_mask = copy_bitmask(input_column, stream, mr); + rmm::device_buffer null_mask = + detail::copy_bitmask(input_column, rmm::cuda_stream_view{stream}, mr); return make_lists_column(input_column.size(), std::move(offsets_column), @@ -97,7 +100,7 @@ std::unique_ptr byte_list_conversion::operator()( std::move(contents.children[cudf::strings_column_view::offsets_column_index]), std::move(contents.children[cudf::strings_column_view::chars_column_index]), input_column.null_count(), - copy_bitmask(input_column, stream, mr), + detail::copy_bitmask(input_column, rmm::cuda_stream_view{stream}, mr), stream, mr); } diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu index d6c68d56797..ef2ef8858ea 100644 --- a/cpp/src/reshape/interleave_columns.cu +++ b/cpp/src/reshape/interleave_columns.cu @@ -127,7 +127,7 @@ struct interleave_columns_functor { auto arch_column = input.column(0); auto output_size = input.num_columns() * input.num_rows(); auto output = - allocate_like(arch_column, output_size, mask_allocation_policy::NEVER, mr, stream); + allocate_like(arch_column, output_size, mask_allocation_policy::NEVER, stream, mr); auto device_input = table_device_view::create(input); auto device_output = mutable_column_device_view::create(*output); auto index_begin = thrust::make_counting_iterator(0); diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp index b228136ad1b..89d3534a41f 100644 --- a/cpp/src/scalar/scalar.cpp +++ b/cpp/src/scalar/scalar.cpp @@ -19,14 +19,16 @@ #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { -std::string string_scalar::to_string(cudaStream_t stream) const +std::string string_scalar::to_string(rmm::cuda_stream_view stream) const { 
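// A minimal sketch (not taken from the patch) of calling string_scalar::to_string() after
// the signature change above: the method now accepts an rmm::cuda_stream_view (presumably
// defaulted to rmm::cuda_stream_default in the header) and synchronizes via
// stream.synchronize() before returning, so the std::string is valid on return.
// Names below are hypothetical.
#include <cudf/scalar/scalar.hpp>
#include <rmm/cuda_stream.hpp>
#include <string>

std::string host_value(cudf::string_scalar const& s)
{
  rmm::cuda_stream stream;            // worker stream owned by the caller
  return s.to_string(stream.view());  // device-to-host copy is issued and synchronized here
}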
std::string result; result.resize(_data.size()); - CUDA_TRY(cudaMemcpyAsync(&result[0], _data.data(), _data.size(), cudaMemcpyDeviceToHost, stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + CUDA_TRY(cudaMemcpyAsync( + &result[0], _data.data(), _data.size(), cudaMemcpyDeviceToHost, stream.value())); + stream.synchronize(); return result; } diff --git a/cpp/src/sort/rank.cu b/cpp/src/sort/rank.cu index 652a90809a3..a3a16130dfb 100644 --- a/cpp/src/sort/rank.cu +++ b/cpp/src/sort/rank.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -25,6 +26,8 @@ #include #include +#include + #include #include #include @@ -242,8 +245,12 @@ std::unique_ptr rank(column_view const &input, std::unique_ptr rank_column = [&null_handling, &output_type, &input, &mr, &stream] { // na_option=keep assign NA to NA values if (null_handling == null_policy::EXCLUDE) - return make_numeric_column( - output_type, input.size(), copy_bitmask(input, stream, mr), input.null_count(), stream, mr); + return make_numeric_column(output_type, + input.size(), + detail::copy_bitmask(input, stream, mr), + input.null_count(), + stream, + mr); else return make_numeric_column(output_type, input.size(), mask_state::UNALLOCATED, stream, mr); }(); diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu index 1bad00f49a4..e16291b6aa2 100644 --- a/cpp/src/strings/attributes.cu +++ b/cpp/src/strings/attributes.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -24,6 +25,8 @@ #include #include +#include + #include #include @@ -59,7 +62,8 @@ std::unique_ptr counts_fn(strings_column_view const& strings, cudf::data_type{type_id::INT32}, strings_count, rmm::device_buffer(strings_count * sizeof(int32_t), stream, mr), - copy_bitmask(strings.parent(), stream, mr), // copy the null mask + cudf::detail::copy_bitmask( + strings.parent(), rmm::cuda_stream_view{stream}, mr), // copy the null mask strings.null_count()); auto results_view = results->mutable_view(); auto d_lengths = results_view.data(); diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index b3bf684f33a..48306ce4e11 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -19,15 +19,19 @@ #include #include #include +#include #include #include #include #include #include #include + #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -142,7 +146,8 @@ std::unique_ptr convert_case(strings_column_view const& strings, size_type null_count = strings.null_count(); // copy null mask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // get the lookup tables used for case conversion auto d_flags = get_character_flags_table(); diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index d3ac5229dd4..6e63e756c2e 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -26,9 +27,10 @@ #include #include +#include + #include -// namespace cudf { namespace strings { namespace detail { @@ -45,12 +47,13 @@ std::unique_ptr all_characters_of_type( auto d_column = *strings_column; // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings_count, - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - 
stream, - mr); + auto results = make_numeric_column( + data_type{type_id::BOOL8}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); auto d_results = results_view.data(); // get the static character types table @@ -168,7 +171,8 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str d_replacement}; // copy null mask from input column - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // this utility calls filterer to build the offsets and chars columns auto children = cudf::strings::detail::make_strings_children( @@ -192,12 +196,13 @@ std::unique_ptr is_integer( auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); thrust::transform(rmm::exec_policy(stream)->on(stream), thrust::make_counting_iterator(0), @@ -234,12 +239,13 @@ std::unique_ptr is_float( auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); // check strings for valid float chars thrust::transform(rmm::exec_policy(stream)->on(stream), diff --git a/cpp/src/strings/combine.cu b/cpp/src/strings/combine.cu index 60f7e13d866..57bd7abef2f 100644 --- a/cpp/src/strings/combine.cu +++ b/cpp/src/strings/combine.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -26,12 +27,15 @@ #include #include #include + #include #include + #include #include #include + #include namespace cudf { @@ -210,7 +214,7 @@ std::unique_ptr join_strings(strings_column_view const& strings, size_type null_count = 0; rmm::device_buffer null_mask{0, stream, mr}; // init to null null-mask if (strings.null_count() == strings_count && !narep.is_valid()) { - null_mask = create_null_mask(1, cudf::mask_state::ALL_NULL, stream, mr); + null_mask = cudf::detail::create_null_mask(1, cudf::mask_state::ALL_NULL, stream, mr); null_count = 1; } auto chars_column = diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu index 6b441b29c47..96c87f554b5 100644 --- a/cpp/src/strings/contains.cu +++ b/cpp/src/strings/contains.cu @@ -17,8 +17,8 @@ #include #include #include +#include #include -#include #include #include #include @@ -26,6 +26,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -79,12 +81,13 @@ std::unique_ptr contains_util( auto d_prog = *prog; // create the output column - auto results = 
make_numeric_column(data_type{type_id::BOOL8}, - strings_count, - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + data_type{type_id::BOOL8}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); // fill the output column @@ -200,12 +203,13 @@ std::unique_ptr count_re( auto d_prog = *prog; // create the output column - auto results = make_numeric_column(data_type{type_id::INT32}, - strings_count, - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + data_type{type_id::INT32}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); // fill the output column diff --git a/cpp/src/strings/convert/convert_booleans.cu b/cpp/src/strings/convert/convert_booleans.cu index 18fdf68aa23..1ba2151c0a7 100644 --- a/cpp/src/strings/convert/convert_booleans.cu +++ b/cpp/src/strings/convert/convert_booleans.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -27,6 +28,8 @@ #include #include +#include + #include #include @@ -49,12 +52,13 @@ std::unique_ptr to_booleans(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // create output column copying the strings' null-mask - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings_count, - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + data_type{type_id::BOOL8}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); auto d_results = results_view.data(); @@ -106,7 +110,8 @@ std::unique_ptr from_booleans(column_view const& booleans, auto d_column = *column; // copy null mask - rmm::device_buffer null_mask = copy_bitmask(booleans, stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(booleans, rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator(thrust::make_counting_iterator(0), diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index d44e8a7ec13..f716b1500c6 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -28,9 +29,12 @@ #include #include +#include +#include + #include + #include -#include #include namespace cudf { @@ -414,12 +418,13 @@ std::unique_ptr to_timestamps(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; - auto results = make_timestamp_column(timestamp_type, - strings_count, - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_timestamp_column( + timestamp_type, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); cudf::type_dispatcher( 
timestamp_type, dispatch_to_timestamps_fn(), d_column, format, units, results_view, stream); @@ -564,12 +569,13 @@ std::unique_ptr is_timestamp(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings_count, - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + data_type{type_id::BOOL8}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); format_compiler compiler(format.c_str(), stream); @@ -886,7 +892,8 @@ std::unique_ptr from_timestamps(column_view const& timestamps, auto d_column = *column; // copy null mask - rmm::device_buffer null_mask = copy_bitmask(timestamps, stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(timestamps, rmm::cuda_stream_view{stream}, mr); // Each string will be the same number of bytes which can be determined // directly from the format string. auto d_str_bytes = compiler.template_bytes(); // size in bytes of each string diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index ba444c4ebe0..d2709e2ebe1 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -15,14 +15,18 @@ */ #include #include +#include #include #include #include #include +#include +#include + #include + #include -#include #include namespace cudf { @@ -409,7 +413,8 @@ struct dispatch_from_durations_fn { auto d_column = *column; // copy null mask - rmm::device_buffer null_mask = copy_bitmask(durations, stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(durations, rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), @@ -723,12 +728,13 @@ std::unique_ptr to_durations(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; - auto results = make_duration_column(duration_type, - strings_count, - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_duration_column( + duration_type, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); cudf::type_dispatcher( duration_type, dispatch_to_durations_fn(), d_column, format, results_view, stream); diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index 4ba347dbd50..8abf49c5dca 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -26,10 +27,12 @@ #include #include -#include #include +#include + #include #include + #include #include @@ -175,12 +178,13 @@ std::unique_ptr to_floats(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // create float output column copying the strings null-mask - auto results = make_numeric_column(output_type, - strings_count, - copy_bitmask(strings.parent(), stream, mr), - 
strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + output_type, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); // fill output column with floats type_dispatcher(output_type, dispatch_to_floats_fn{}, d_strings, results_view, stream); @@ -467,7 +471,8 @@ struct dispatch_from_floats_fn { auto d_column = *column; // copy the null mask - rmm::device_buffer null_mask = copy_bitmask(floats, stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(floats, rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), float_to_string_size_fn{d_column}); diff --git a/cpp/src/strings/convert/convert_hex.cu b/cpp/src/strings/convert/convert_hex.cu index 60fe3a80d79..a8ea7cf3ab9 100644 --- a/cpp/src/strings/convert/convert_hex.cu +++ b/cpp/src/strings/convert/convert_hex.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -26,6 +27,8 @@ #include #include +#include + #include #include #include @@ -129,12 +132,13 @@ std::unique_ptr hex_to_integers( auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // create integer output column copying the strings null-mask - auto results = make_numeric_column(output_type, - strings_count, - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + output_type, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); // fill output column with integers type_dispatcher(output_type, dispatch_hex_to_integers_fn{}, d_strings, results_view, stream); @@ -149,12 +153,13 @@ std::unique_ptr is_hex(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); thrust::transform(rmm::exec_policy(stream)->on(stream), thrust::make_counting_iterator(0), diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 248f2f9a717..42bd70899a9 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -29,6 +30,8 @@ #include #include +#include + #include #include @@ -101,12 +104,13 @@ std::unique_ptr to_integers(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // create integer output column copying the strings null-mask - auto results = make_numeric_column(output_type, - strings_count, - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + output_type, + strings_count, + 
cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); // fill output column with integers type_dispatcher(output_type, dispatch_to_integers_fn{}, d_strings, results_view, stream); @@ -180,7 +184,8 @@ struct dispatch_from_integers_fn { auto d_column = *column; // copy the null mask - rmm::device_buffer null_mask = copy_bitmask(integers, stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(integers, rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), integer_to_string_size_fn{d_column}); diff --git a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu index 3a18480c866..dcccad30f30 100644 --- a/cpp/src/strings/convert/convert_ipv4.cu +++ b/cpp/src/strings/convert/convert_ipv4.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -23,6 +24,8 @@ #include #include +#include + #include #include @@ -78,12 +81,13 @@ std::unique_ptr ipv4_to_integers( auto strings_column = column_device_view::create(strings.parent(), stream); // create output column copying the strings' null-mask - auto results = make_numeric_column(data_type{type_id::INT64}, - strings_count, - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + data_type{type_id::INT64}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); // fill output column with ipv4 integers thrust::transform(rmm::exec_policy(stream)->on(stream), @@ -168,7 +172,8 @@ std::unique_ptr integers_to_ipv4( auto d_column = *column; // copy null mask - rmm::device_buffer null_mask = copy_bitmask(integers, stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(integers, rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), [d_column] __device__(size_type idx) { @@ -212,12 +217,13 @@ std::unique_ptr is_ipv4(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); thrust::transform(rmm::exec_policy(stream)->on(stream), thrust::make_counting_iterator(0), diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index 5030e49a23a..9b5c142511f 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -23,6 +24,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -120,7 +123,8 @@ std::unique_ptr url_encode( auto d_strings = *strings_column; // copy null mask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + 
rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), url_encoder_fn{d_strings}); @@ -222,7 +226,8 @@ std::unique_ptr url_decode( auto d_strings = *strings_column; // copy null mask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), url_decoder_fn{d_strings}); diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 38f43a8bb5d..9a8a64f2f99 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -249,7 +250,8 @@ std::unique_ptr concatenate(std::vector const& columns, rmm::device_buffer null_mask{0, stream, mr}; size_type null_count{}; if (has_nulls) { - null_mask = create_null_mask(strings_count, mask_state::UNINITIALIZED, stream, mr); + null_mask = + cudf::detail::create_null_mask(strings_count, mask_state::UNINITIALIZED, stream, mr); } { // Copy offsets columns with single kernel launch diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu index 3db0017f55f..975d84c7875 100644 --- a/cpp/src/strings/filter_chars.cu +++ b/cpp/src/strings/filter_chars.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -25,7 +26,10 @@ #include #include +#include + #include + #include namespace cudf { @@ -123,7 +127,8 @@ std::unique_ptr filter_characters( auto d_strings = *strings_column; // create null mask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // create offsets column filter_fn ffn{d_strings, keep_characters, table.begin(), table.end(), d_replacement}; diff --git a/cpp/src/strings/find.cu b/cpp/src/strings/find.cu index 7e401146d9f..1b3ede7c88c 100644 --- a/cpp/src/strings/find.cu +++ b/cpp/src/strings/find.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -24,6 +25,8 @@ #include #include +#include + #include namespace cudf { @@ -65,12 +68,13 @@ std::unique_ptr find_fn(strings_column_view const& strings, auto d_strings = *strings_column; auto strings_count = strings.size(); // create output column - auto results = make_numeric_column(data_type{type_id::INT32}, - strings_count, - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + data_type{type_id::INT32}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); auto d_results = results_view.data(); // set the position values by evaluating the passed function @@ -187,7 +191,9 @@ std::unique_ptr contains_fn(strings_column_view const& strings, { auto const true_scalar = make_fixed_width_scalar(true, stream); auto results = make_column_from_scalar(*true_scalar, strings.size(), mr, stream); - results->set_null_mask(copy_bitmask(strings.parent(), stream, mr), strings.null_count()); + results->set_null_mask( + 
cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count()); return results; } @@ -195,12 +201,13 @@ std::unique_ptr contains_fn(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings_count, - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + data_type{type_id::BOOL8}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); auto d_results = results_view.data(); // set the bool values by evaluating the passed function @@ -250,12 +257,13 @@ std::unique_ptr contains_fn(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings_count, - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + data_type{type_id::BOOL8}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); auto d_results = results_view.data(); // set the bool values by evaluating the passed function diff --git a/cpp/src/strings/findall.cu b/cpp/src/strings/findall.cu index 5c0904c2cb8..d7e695c0a3a 100644 --- a/cpp/src/strings/findall.cu +++ b/cpp/src/strings/findall.cu @@ -17,14 +17,15 @@ #include #include #include +#include #include -#include #include #include #include #include #include #include +#include "rmm/cuda_stream_view.hpp" #include @@ -112,15 +113,15 @@ std::unique_ptr
findall_re( strings_column_view const& strings, std::string const& pattern, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default) { auto strings_count = strings.size(); - auto strings_column = column_device_view::create(strings.parent(), stream); + auto strings_column = column_device_view::create(strings.parent(), stream.value()); auto d_strings = *strings_column; auto d_flags = detail::get_character_flags_table(); // compile regex into device object - auto prog = reprog_device::create(pattern, d_flags, strings_count, stream); + auto prog = reprog_device::create(pattern, d_flags, strings_count, stream.value()); auto d_prog = *prog; auto execpol = rmm::exec_policy(stream); int regex_insts = prog->insts_counts(); @@ -129,19 +130,19 @@ std::unique_ptr
findall_re( auto d_find_counts = find_counts.data().get(); if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS)) - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_find_counts, findall_count_fn{d_strings, d_prog}); else if (regex_insts <= RX_MEDIUM_INSTS) - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_find_counts, findall_count_fn{d_strings, d_prog}); else - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_find_counts, @@ -150,41 +151,41 @@ std::unique_ptr
findall_re( std::vector> results; size_type columns = - *thrust::max_element(execpol->on(stream), find_counts.begin(), find_counts.end()); + *thrust::max_element(execpol->on(stream.value()), find_counts.begin(), find_counts.end()); // boundary case: if no columns, return all nulls column (issue #119) if (columns == 0) - results.emplace_back( - std::make_unique(data_type{type_id::STRING}, - strings_count, - rmm::device_buffer{0, stream, mr}, // no data - create_null_mask(strings_count, mask_state::ALL_NULL, stream, mr), - strings_count)); + results.emplace_back(std::make_unique( + data_type{type_id::STRING}, + strings_count, + rmm::device_buffer{0, stream, mr}, // no data + cudf::detail::create_null_mask(strings_count, mask_state::ALL_NULL, stream, mr), + strings_count)); for (int32_t column_index = 0; column_index < columns; ++column_index) { rmm::device_vector indices(strings_count); string_index_pair* d_indices = indices.data().get(); if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS)) - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_indices, findall_fn{d_strings, d_prog, column_index, d_find_counts}); else if (regex_insts <= RX_MEDIUM_INSTS) thrust::transform( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_indices, findall_fn{d_strings, d_prog, column_index, d_find_counts}); else - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_indices, findall_fn{d_strings, d_prog, column_index, d_find_counts}); // - results.emplace_back(make_strings_column(indices, stream, mr)); + results.emplace_back(make_strings_column(indices, stream.value(), mr)); } return std::make_unique
(std::move(results)); } diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index 8e9951a7bf8..05b5293e432 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -24,6 +25,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -67,7 +70,8 @@ std::unique_ptr pad( auto d_strings = *strings_column; // create null_mask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_transformer_itr = @@ -155,7 +159,8 @@ std::unique_ptr zfill( auto d_strings = *strings_column; // copy bitmask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index e94d01f0268..f373c97b1ef 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -25,6 +26,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -108,7 +111,8 @@ std::unique_ptr replace(strings_column_view const& strings, auto d_strings = *strings_column; // copy the null mask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), @@ -194,7 +198,8 @@ std::unique_ptr replace_slice(strings_column_view const& strings, auto d_strings = *strings_column; // copy the null mask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), @@ -303,7 +308,8 @@ std::unique_ptr replace(strings_column_view const& strings, auto d_repls = *repls_column; // copy the null mask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu index a8fcd4adbd3..4ef46b289e2 100644 --- a/cpp/src/strings/split/split.cu +++ b/cpp/src/strings/split/split.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -493,12 +494,12 @@ std::unique_ptr
split_fn(strings_column_view const& strings_column, *thrust::max_element(execpol->on(stream), token_counts.begin(), token_counts.end()); // boundary case: if no columns, return one null column (custrings issue #119) if (columns_count == 0) { - results.push_back( - std::make_unique(data_type{type_id::STRING}, - strings_count, - rmm::device_buffer{0, stream, mr}, // no data - create_null_mask(strings_count, mask_state::ALL_NULL, stream, mr), - strings_count)); + results.push_back(std::make_unique( + data_type{type_id::STRING}, + strings_count, + rmm::device_buffer{0, stream, mr}, // no data + cudf::detail::create_null_mask(strings_count, mask_state::ALL_NULL, stream, mr), + strings_count)); } // create working area to hold all token positions @@ -764,12 +765,12 @@ std::unique_ptr
whitespace_split_fn(size_type strings_count, std::vector> results; // boundary case: if no columns, return one null column (issue #119) if (columns_count == 0) { - results.push_back( - std::make_unique(data_type{type_id::STRING}, - strings_count, - rmm::device_buffer{0, stream, mr}, // no data - create_null_mask(strings_count, mask_state::ALL_NULL, stream, mr), - strings_count)); + results.push_back(std::make_unique( + data_type{type_id::STRING}, + strings_count, + rmm::device_buffer{0, stream, mr}, // no data + cudf::detail::create_null_mask(strings_count, mask_state::ALL_NULL, stream, mr), + strings_count)); } // get the positions for every token diff --git a/cpp/src/strings/strip.cu b/cpp/src/strings/strip.cu index 18df9d5e48c..ea5a2d8ef69 100644 --- a/cpp/src/strings/strip.cu +++ b/cpp/src/strings/strip.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -24,6 +25,8 @@ #include #include +#include + #include #include @@ -118,7 +121,8 @@ std::unique_ptr strip( size_type null_count = strings.null_count(); // copy null mask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // build offsets column -- calculate the size of each output string auto offsets_transformer_itr = thrust::make_transform_iterator( diff --git a/cpp/src/strings/substring.cu b/cpp/src/strings/substring.cu index d5695bddb31..1d4656ffa8f 100644 --- a/cpp/src/strings/substring.cu +++ b/cpp/src/strings/substring.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -111,7 +112,8 @@ std::unique_ptr slice_strings( auto d_step = get_scalar_device_view(const_cast&>(step)); // copy the null mask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index e61a40d655f..1fc9ff7f813 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -25,6 +26,7 @@ #include #include + #include namespace cudf { @@ -92,7 +94,8 @@ std::unique_ptr translate( auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // create null mask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // create offsets column auto offsets_transformer_itr = thrust::make_transform_iterator(thrust::make_counting_iterator(0), diff --git a/cpp/src/strings/wrap.cu b/cpp/src/strings/wrap.cu index 5864cc1f2c7..181283c5e34 100644 --- a/cpp/src/strings/wrap.cu +++ b/cpp/src/strings/wrap.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -103,7 +104,8 @@ std::unique_ptr wrap( size_type null_count = strings.null_count(); // copy null mask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_column = std::make_unique(strings.offsets(), stream, mr); // makes a copy diff --git 
a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index cc006760519..ac67b08eba0 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -19,18 +19,24 @@ #include #include #include +#include #include #include #include #include + #include #include + #include #include +#include + #include #include + #include namespace nvtext { @@ -161,7 +167,8 @@ std::unique_ptr normalize_spaces( auto strings_column = cudf::column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // copy bitmask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // create offsets by calculating size of each string for output auto offsets_transformer_itr = @@ -247,13 +254,14 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con codepoint_to_utf8_fn{*strings_column, cp_chars, cp_offsets, d_offsets, d_chars}); chars_column->set_null_count(0); // reset null count for child column - return cudf::make_strings_column(strings_count, - std::move(offsets_column), - std::move(chars_column), - strings.null_count(), - copy_bitmask(strings.parent(), stream, mr), - stream, - mr); + return cudf::make_strings_column( + strings_count, + std::move(offsets_column), + std::move(chars_column), + strings.null_count(), + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + stream, + mr); } } // namespace detail diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu index 7733e521b04..4263c5f1864 100644 --- a/cpp/src/text/replace.cu +++ b/cpp/src/text/replace.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -214,7 +215,8 @@ std::unique_ptr replace_tokens(cudf::strings_column_view const& st *replacements_column}; // copy null mask from input column - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // this utility calls replacer to build the offsets and chars columns auto children = cudf::strings::detail::make_strings_children( @@ -249,7 +251,8 @@ std::unique_ptr filter_tokens(cudf::strings_column_view const& str remove_small_tokens_fn filterer{*strings_column, d_delimiter, min_token_length, d_replacement}; // copy null mask from input column - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // this utility calls filterer to build the offsets and chars columns auto children = cudf::strings::detail::make_strings_children( diff --git a/cpp/src/text/stemmer.cu b/cpp/src/text/stemmer.cu index ec4ad17448b..1521dc90dae 100644 --- a/cpp/src/text/stemmer.cu +++ b/cpp/src/text/stemmer.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -97,12 +98,13 @@ std::unique_ptr is_letter(cudf::strings_column_view const& strings if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::BOOL8}); // create empty output column - auto results = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, - strings.size(), - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = cudf::make_fixed_width_column( + cudf::data_type{cudf::type_id::BOOL8}, + strings.size(), + 
cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); // set values into output column auto strings_column = cudf::column_device_view::create(strings.parent(), stream); thrust::transform(rmm::exec_policy(stream)->on(stream), @@ -204,12 +206,13 @@ std::unique_ptr porter_stemmer_measure(cudf::strings_column_view c if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32}); // create empty output column - auto results = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, - strings.size(), - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = cudf::make_fixed_width_column( + cudf::data_type{cudf::type_id::INT32}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); // compute measures into output column auto strings_column = cudf::column_device_view::create(strings.parent(), stream); thrust::transform(rmm::exec_policy(stream)->on(stream), diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu index 7729d17aadc..c8e7bd2fd5e 100644 --- a/cpp/src/transform/encode.cu +++ b/cpp/src/transform/encode.cu @@ -53,7 +53,7 @@ std::pair, std::unique_ptr> encode( auto num_rows = keys_table->num_rows(); auto mask = - cudf::detail::bitmask_and(keys_table->view(), rmm::mr::get_current_device_resource(), stream); + cudf::detail::bitmask_and(keys_table->view(), stream, rmm::mr::get_current_device_resource()); auto num_rows_with_nulls = cudf::count_unset_bits(reinterpret_cast(mask.data()), 0, num_rows); diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index 338bb481606..e96f6e4f004 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -15,6 +15,7 @@ */ #include +#include #include #include #include @@ -22,6 +23,7 @@ #include #include +#include namespace cudf { namespace detail { @@ -110,12 +112,13 @@ struct dispatch_unary_cast_to { rmm::mr::device_memory_resource* mr, cudaStream_t stream) { - auto size = input.size(); - auto output = std::make_unique(type, - size, - rmm::device_buffer{size * cudf::size_of(type), 0, mr}, - copy_bitmask(input, 0, mr), - input.null_count()); + auto size = input.size(); + auto output = + std::make_unique(type, + size, + rmm::device_buffer{size * cudf::size_of(type), stream, mr}, + detail::copy_bitmask(input, stream, mr), + input.null_count()); mutable_column_view output_mutable = *output; diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu index 1b4f91ad10f..08b653c7353 100644 --- a/cpp/src/unary/math_ops.cu +++ b/cpp/src/unary/math_ops.cu @@ -16,12 +16,15 @@ #include #include +#include #include #include #include #include #include +#include + #include #include @@ -261,12 +264,13 @@ std::unique_ptr transform_fn(cudf::dictionary_column_view const& i auto dictionary_itr = dictionary::detail::make_dictionary_iterator(*dictionary_view); auto default_mr = rmm::mr::get_current_device_resource(); // call unary-op using temporary output buffer - auto output = transform_fn(dictionary_itr, - dictionary_itr + input.size(), - copy_bitmask(input.parent(), stream, default_mr), - input.null_count(), - default_mr, - stream); + auto output = transform_fn( + dictionary_itr, + dictionary_itr + input.size(), + detail::copy_bitmask(input.parent(), rmm::cuda_stream_view{stream}, default_mr), + input.null_count(), + default_mr, + stream); return cudf::dictionary::detail::encode( 
output->view(), dictionary::detail::get_indices_type_for_size(output->size()), mr, stream); } @@ -278,12 +282,13 @@ struct MathOpDispatcher { rmm::mr::device_memory_resource* mr, cudaStream_t stream) { - return transform_fn(input.begin(), - input.end(), - copy_bitmask(input, stream, mr), - input.null_count(), - mr, - stream); + return transform_fn( + input.begin(), + input.end(), + cudf::detail::copy_bitmask(input, rmm::cuda_stream_view{stream}, mr), + input.null_count(), + mr, + stream); } struct dictionary_dispatch { @@ -335,12 +340,13 @@ struct BitwiseOpDispatcher { rmm::mr::device_memory_resource* mr, cudaStream_t stream) { - return transform_fn(input.begin(), - input.end(), - copy_bitmask(input, stream, mr), - input.null_count(), - mr, - stream); + return transform_fn( + input.begin(), + input.end(), + cudf::detail::copy_bitmask(input, rmm::cuda_stream_view{stream}, mr), + input.null_count(), + mr, + stream); } struct dictionary_dispatch { @@ -400,12 +406,13 @@ struct LogicalOpDispatcher { rmm::mr::device_memory_resource* mr, cudaStream_t stream) { - return transform_fn(input.begin(), - input.end(), - copy_bitmask(input, stream, mr), - input.null_count(), - mr, - stream); + return transform_fn( + input.begin(), + input.end(), + cudf::detail::copy_bitmask(input, rmm::cuda_stream_view{stream}, mr), + input.null_count(), + mr, + stream); } struct dictionary_dispatch { @@ -416,12 +423,13 @@ struct LogicalOpDispatcher { { auto dictionary_view = cudf::column_device_view::create(input.parent(), stream); auto dictionary_itr = dictionary::detail::make_dictionary_iterator(*dictionary_view); - return transform_fn(dictionary_itr, - dictionary_itr + input.size(), - copy_bitmask(input.parent(), stream, mr), - input.null_count(), - mr, - stream); + return transform_fn( + dictionary_itr, + dictionary_itr + input.size(), + cudf::detail::copy_bitmask(input.parent(), rmm::cuda_stream_view{stream}, mr), + input.null_count(), + mr, + stream); } template ()>* = nullptr> From 7802d5246694b27c64efc61e68d0043cb882ad55 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 2 Nov 2020 17:04:06 +1100 Subject: [PATCH 03/51] Revert commented out stuff. 
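
Restores the host- and device-side dispatch calls in
type_dispatcher_benchmark.cu that were left commented out in the previous
commit, so the benchmark exercises cudf::type_dispatcher again. For context,
below is a minimal sketch of the host-side dispatch pattern the benchmark
measures; the is_int32_fn functor and is_int32 helper here are illustrative
stand-ins only, not code from the benchmark:

    #include <cudf/types.hpp>
    #include <cudf/utilities/type_dispatcher.hpp>

    #include <cstdint>
    #include <type_traits>

    // Functor with a templated call operator. cudf::type_dispatcher maps a
    // runtime cudf::data_type to the corresponding compile-time type T and
    // invokes operator()<T>().
    struct is_int32_fn {
      template <typename T>
      bool operator()() const
      {
        return std::is_same<T, int32_t>::value;
      }
    };

    // Example usage: returns true only for data_type{type_id::INT32}.
    bool is_int32(cudf::data_type t)
    {
      return cudf::type_dispatcher(t, is_int32_fn{});
    }

Note that type_dispatcher instantiates the functor's templated operator() for
every dispatched type_id, so the body must compile for all of them; this is
why the benchmark keeps its per-type work inside a single templated call
operator.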
--- .../type_dispatcher/type_dispatcher_benchmark.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu index 222a2c40618..56b6ead120e 100644 --- a/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu +++ b/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu @@ -144,14 +144,14 @@ void launch_kernel(mutable_table_view input, T** d_ptr, int work_per_thread) // std::vector v_stream(n_cols); for (int c = 0; c < n_cols; c++) { auto d_column = mutable_column_device_view::create(input.column(c)); - // cudf::type_dispatcher( - // d_column->type(), ColumnHandle{}, *d_column, work_per_thread); + cudf::type_dispatcher( + d_column->type(), ColumnHandle{}, *d_column, work_per_thread); } } else if (dispatching_type == DEVICE_DISPATCHING) { auto d_table_view = mutable_table_device_view::create(input); - // auto f = device_dispatching_kernel; + auto f = device_dispatching_kernel; // Launch the kernel - // f<<>>(*d_table_view); + f<<>>(*d_table_view); } else if (dispatching_type == NO_DISPATCHING) { auto f = no_dispatching_kernel; // Launch the kernel From f0ca10c0453b91eba0ff739ec433b13eae6f85d0 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 2 Nov 2020 17:23:52 +1100 Subject: [PATCH 04/51] Convert AST to cuda_stream_view --- cpp/include/cudf/ast/detail/transform.cuh | 3 ++- cpp/src/ast/transform.cu | 16 +++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/cpp/include/cudf/ast/detail/transform.cuh b/cpp/include/cudf/ast/detail/transform.cuh index 3366acefe35..454085ff9bd 100644 --- a/cpp/include/cudf/ast/detail/transform.cuh +++ b/cpp/include/cudf/ast/detail/transform.cuh @@ -27,6 +27,7 @@ #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { @@ -369,7 +370,7 @@ struct ast_plan { std::unique_ptr compute_column( table_view const table, expression const& expr, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/src/ast/transform.cu b/cpp/src/ast/transform.cu index b8906a36121..ffc80a926fb 100644 --- a/cpp/src/ast/transform.cu +++ b/cpp/src/ast/transform.cu @@ -29,6 +29,8 @@ #include #include #include + +#include #include #include @@ -87,7 +89,7 @@ __launch_bounds__(max_block_size) __global__ std::unique_ptr compute_column(table_view const table, expression const& expr, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { // Linearize the AST @@ -126,14 +128,14 @@ std::unique_ptr compute_column(table_view const table, reinterpret_cast(device_data_buffer_ptr + buffer_offsets[3]); // Create table device view - auto table_device = table_device_view::create(table, stream); + auto table_device = table_device_view::create(table, stream.value()); auto const table_num_rows = table.num_rows(); // Prepare output column auto output_column = cudf::make_fixed_width_column( - expr_data_type, table_num_rows, mask_state::UNALLOCATED, stream, mr); + expr_data_type, table_num_rows, mask_state::UNALLOCATED, stream.value(), mr); auto mutable_output_device = - cudf::mutable_column_device_view::create(output_column->mutable_view(), stream); + cudf::mutable_column_device_view::create(output_column->mutable_view(), stream.value()); // Configure kernel parameters auto const num_intermediates = expr_linearizer.get_intermediate_count(); @@ -153,7 +155,7 @@ 
std::unique_ptr compute_column(table_view const table, // Execute the kernel cudf::ast::detail::compute_column_kernel - <<>>( + <<>>( *table_device, device_literals, *mutable_output_device, @@ -162,7 +164,7 @@ std::unique_ptr compute_column(table_view const table, device_operator_source_indices, num_operators, num_intermediates); - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); return output_column; } @@ -173,7 +175,7 @@ std::unique_ptr compute_column(table_view const table, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::compute_column(table, expr, 0, mr); + return detail::compute_column(table, expr, rmm::cuda_stream_default, mr); } } // namespace ast From 22a14ddb83b2350e58c9d64f3df0c076d08bd975 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 2 Nov 2020 17:52:23 +1100 Subject: [PATCH 05/51] Convert column_device_view to rmm::cuda_stream_view --- cpp/include/cudf/column/column_device_view.cuh | 6 ++++-- cpp/src/column/column_device_view.cu | 11 ++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 5446d9b2f29..5118db2364e 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -28,6 +28,8 @@ #include #include +#include + /** * @file column_device_view.cuh * @brief Column device view class definitons @@ -386,7 +388,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { *`source_view` available in device memory. */ static std::unique_ptr> create( - column_view source_view, cudaStream_t stream = 0); + column_view source_view, rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Destroy the `column_device_view` object. 
@@ -480,7 +482,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view */ static std::unique_ptr> - create(mutable_column_view source_view, cudaStream_t stream = 0); + create(mutable_column_view source_view, rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Returns pointer to the base device memory allocation casted to diff --git a/cpp/src/column/column_device_view.cu b/cpp/src/column/column_device_view.cu index 8e61f776e39..fb3bab68446 100644 --- a/cpp/src/column/column_device_view.cu +++ b/cpp/src/column/column_device_view.cu @@ -18,6 +18,7 @@ #include #include #include +#include "rmm/cuda_stream_view.hpp" #include @@ -92,7 +93,7 @@ ColumnDeviceView* child_columns_to_device_array(ColumnView const& source, void* // helper function for column_device_view::create and mutable_column_device::create methods template std::unique_ptr> -create_device_view_from_view(ColumnView const& source, cudaStream_t stream) +create_device_view_from_view(ColumnView const& source, rmm::cuda_stream_view stream) { size_type num_children = source.num_children(); // First calculate the size of memory needed to hold the @@ -129,9 +130,9 @@ create_device_view_from_view(ColumnView const& source, cudaStream_t stream) staging_buffer.data(), descendant_storage->size(), cudaMemcpyDefault, - stream)); + stream.value())); - CUDA_TRY(cudaStreamSynchronize(stream)); + CUDA_TRY(cudaStreamSynchronize(stream.value())); return result; } @@ -153,7 +154,7 @@ column_device_view::column_device_view(column_view source, void* h_ptr, void* d_ // Construct a unique_ptr that invokes `destroy()` as it's deleter std::unique_ptr> -column_device_view::create(column_view source, cudaStream_t stream) +column_device_view::create(column_view source, rmm::cuda_stream_view stream) { size_type num_children = source.num_children(); if (num_children == 0) { @@ -203,7 +204,7 @@ void mutable_column_device_view::destroy() { delete this; } // Construct a unique_ptr that invokes `destroy()` as it's deleter std::unique_ptr> -mutable_column_device_view::create(mutable_column_view source, cudaStream_t stream) +mutable_column_device_view::create(mutable_column_view source, rmm::cuda_stream_view stream) { return source.num_children() == 0 ? std::unique_ptr(new mutable_column_device_view(source)) From 2fab2ad9a2d305afc955dfa83ac33ca83bdca449 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 2 Nov 2020 18:39:15 +1100 Subject: [PATCH 06/51] Convert column to rmm::cuda_stream_view --- cpp/include/cudf/column/column.hpp | 11 ++++----- cpp/src/column/column.cu | 37 +++++++++++++++--------------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/cpp/include/cudf/column/column.hpp b/cpp/include/cudf/column/column.hpp index ce0ed412b27..b94a2f13e1d 100644 --- a/cpp/include/cudf/column/column.hpp +++ b/cpp/include/cudf/column/column.hpp @@ -15,10 +15,12 @@ */ #pragma once +#include "column_view.hpp" + #include #include -#include "column_view.hpp" +#include #include #include @@ -50,9 +52,6 @@ class column { /** * @brief Construct a new column by deep copying the contents of `other`. * - * All device memory allocation and copying is done using the - * `device_memory_resource` and `stream` from `other`. 
- * * @param other The column to copy **/ column(column const& other); @@ -69,7 +68,7 @@ class column { * @param mr Device memory resource to use for all device memory allocations */ column(column const& other, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -124,7 +123,7 @@ class column { * @param mr Device memory resource to use for all device memory allocations */ explicit column(column_view view, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/src/column/column.cu b/cpp/src/column/column.cu index 399bc26f786..b64f88291b7 100644 --- a/cpp/src/column/column.cu +++ b/cpp/src/column/column.cu @@ -26,19 +26,19 @@ #include #include #include +#include +#include #include #include #include -#include +#include #include #include +#include #include #include -#include "cudf/structs/structs_column_view.hpp" -#include "cudf/types.hpp" -#include "rmm/cuda_stream_view.hpp" namespace cudf { // Copy constructor @@ -54,7 +54,9 @@ column::column(column const &other) } // Copy ctor w/ explicit stream/mr -column::column(column const &other, cudaStream_t stream, rmm::mr::device_memory_resource *mr) +column::column(column const &other, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) : _type{other._type}, _size{other._size}, _data{other._data, stream, mr}, @@ -181,7 +183,7 @@ void column::set_null_count(size_type new_null_count) namespace { struct create_column_from_view { cudf::column_view view; - cudaStream_t stream; + rmm::cuda_stream_view stream{}; rmm::mr::device_memory_resource *mr; template operator()() { cudf::strings_column_view sview(view); - return cudf::strings::detail::copy_slice(sview, 0, view.size(), 1, stream, mr); + return cudf::strings::detail::copy_slice(sview, 0, view.size(), 1, stream.value(), mr); } template (indices_view, stream, mr)); children.emplace_back(std::make_unique(dict_view.keys(), stream, mr)); } - return std::make_unique( - view.type(), - view.size(), - rmm::device_buffer{0, stream, mr}, - cudf::detail::copy_bitmask(view, rmm::cuda_stream_view{stream}, mr), - view.null_count(), - std::move(children)); + return std::make_unique(view.type(), + view.size(), + rmm::device_buffer{0, stream, mr}, + cudf::detail::copy_bitmask(view, stream, mr), + view.null_count(), + std::move(children)); } template ()> * = nullptr> @@ -233,7 +234,7 @@ struct create_column_from_view { view.size() * cudf::size_of(view.type()), stream, mr}, - cudf::detail::copy_bitmask(view, rmm::cuda_stream_view{stream}, mr), + cudf::detail::copy_bitmask(view, stream, mr), view.null_count(), std::move(children)); } @@ -243,7 +244,7 @@ struct create_column_from_view { std::unique_ptr operator()() { auto lists_view = lists_column_view(view); - return cudf::lists::detail::copy_slice(lists_view, 0, view.size(), stream, mr); + return cudf::lists::detail::copy_slice(lists_view, 0, view.size(), stream.value(), mr); } template Date: Tue, 3 Nov 2020 12:27:12 +1100 Subject: [PATCH 07/51] Changelog for #6646 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 017d9f35806..08186bb4408 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,6 +38,7 @@ - PR #6514 Initial work for decimal type in Java/JNI - PR #6608 Improve subword tokenizer docs - PR #6612 Update JNI to new RMM cuda_stream_view API +- PR #6646 Replace `cudaStream_t` with 
`rmm::cuda_stream_view` (part 1) ## Bug Fixes From 0ebf99e00572a55ab83f9c244ef6ee4eb75e973b Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Tue, 3 Nov 2020 14:52:16 +1100 Subject: [PATCH 08/51] Convert column factories to cuda_stream_view --- cpp/include/cudf/column/column_factories.hpp | 152 ++++++++++--------- cpp/src/column/column_factories.cpp | 48 +++--- cpp/src/copying/scatter.cu | 2 +- cpp/src/dictionary/replace.cu | 2 +- cpp/src/filling/fill.cu | 2 +- cpp/src/hash/hashing.cu | 2 +- cpp/src/lists/lists_column_factories.cu | 4 +- cpp/src/replace/clamp.cu | 2 +- cpp/src/strings/find.cu | 33 ++-- cpp/src/strings/strings_column_factories.cu | 30 ++-- cpp/src/strings/substring.cu | 8 +- cpp/src/structs/structs_column_factories.cu | 14 +- 12 files changed, 153 insertions(+), 146 deletions(-) diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index b40089f0929..7665cd8ca86 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -15,11 +15,13 @@ */ #pragma once -#include #include #include #include +#include +#include + namespace cudf { /** * @addtogroup column_factories @@ -31,9 +33,9 @@ namespace cudf { /** * @brief Creates an empty column of the specified @p type * - * An empty column does not contain any elements or a validity mask. + * An empty column contains zero elements and no validity mask. * - * @param type The desired type + * @param[in] type The column data type * @return Empty column with desired type */ std::unique_ptr make_empty_column(data_type type); @@ -59,7 +61,7 @@ std::unique_ptr make_numeric_column( data_type type, size_type size, mask_state state = mask_state::UNALLOCATED, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -85,7 +87,7 @@ std::unique_ptr make_numeric_column( size_type size, B&& null_mask, size_type null_count = cudf::UNKNOWN_NULL_COUNT, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(is_numeric(type), "Invalid, non-numeric type."); @@ -115,7 +117,7 @@ std::unique_ptr make_fixed_point_column( data_type type, size_type size, mask_state state = mask_state::UNALLOCATED, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -139,7 +141,7 @@ std::unique_ptr make_fixed_point_column( size_type size, B&& null_mask, size_type null_count = cudf::UNKNOWN_NULL_COUNT, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(is_fixed_point(type), "Invalid, non-fixed_point type."); @@ -171,7 +173,7 @@ std::unique_ptr make_timestamp_column( data_type type, size_type size, mask_state state = mask_state::UNALLOCATED, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -197,7 +199,7 @@ std::unique_ptr make_timestamp_column( size_type size, B&& null_mask, size_type null_count = cudf::UNKNOWN_NULL_COUNT, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) 
{ CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type."); @@ -229,7 +231,7 @@ std::unique_ptr make_duration_column( data_type type, size_type size, mask_state state = mask_state::UNALLOCATED, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -255,7 +257,7 @@ std::unique_ptr make_duration_column( size_type size, B&& null_mask, size_type null_count = cudf::UNKNOWN_NULL_COUNT, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type."); @@ -287,7 +289,7 @@ std::unique_ptr make_fixed_width_column( data_type type, size_type size, mask_state state = mask_state::UNALLOCATED, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -313,7 +315,7 @@ std::unique_ptr make_fixed_width_column( size_type size, B&& null_mask, size_type null_count = cudf::UNKNOWN_NULL_COUNT, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(is_fixed_width(type), "Invalid, non-fixed-width type."); @@ -342,16 +344,16 @@ std::unique_ptr make_fixed_width_column( * * @throws std::bad_alloc if device memory allocation fails * - * @param strings The vector of pointer/size pairs. + * @param[in] strings The vector of pointer/size pairs. * Each pointer must be a device memory address or `nullptr` * (indicating a null string). The size must be the number of bytes. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used for allocation of the column's `null_mask` and children + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @param[in] mr Device memory resource used for allocation of the column's `null_mask` and children * columns' device memory. */ std::unique_ptr make_strings_column( const rmm::device_vector>& strings, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -370,20 +372,20 @@ std::unique_ptr make_strings_column( * * @throws std::bad_alloc if device memory allocation fails * - * @param string_views The vector of string_view. + * @param[in] string_views The vector of string_view. * Each string_view must point to a device memory address or * `null_placeholder` (indicating a null string). The size must be the number of * bytes. - * @param null_placeholder string_view indicating null string in given list of + * @param[in] null_placeholder string_view indicating null string in given list of * string_views. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used for allocation of the column's `null_mask` and children + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @param[in] mr Device memory resource used for allocation of the column's `null_mask` and children * columns' device memory. 
*/ std::unique_ptr make_strings_column( const rmm::device_vector& string_views, const string_view null_placeholder, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -401,21 +403,21 @@ std::unique_ptr make_strings_column( * * @throws std::bad_alloc if device memory allocation fails * - * @param strings The vector of chars in device memory. + * @param[in] strings The vector of chars in device memory. * This char vector is expected to be UTF-8 encoded characters. - * @param offsets The vector of byte offsets in device memory. + * @param[in] offsets The vector of byte offsets in device memory. * The number of elements is one more than the total number * of strings so the `offsets.back()` is the total * number of bytes in the strings array. * `offsets.front()` must always be 0 to point to the beginning * of `strings`. - * @param null_mask Device vector containing the null element indicator bitmask. + * @param[in] null_mask Device vector containing the null element indicator bitmask. * Arrow format for nulls is used for interpeting this bitmask. - * @param null_count The number of null string entries. If equal to + * @param[in] null_count The number of null string entries. If equal to * `UNKNOWN_NULL_COUNT`, the null count will be computed dynamically on the * first invocation of `column::null_count()` - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used for allocation of the column's `null_mask` and children + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @param[in] mr Device memory resource used for allocation of the column's `null_mask` and children * columns' device memory. */ std::unique_ptr make_strings_column( @@ -423,7 +425,7 @@ std::unique_ptr make_strings_column( const rmm::device_vector& offsets, const rmm::device_vector& null_mask = {}, size_type null_count = cudf::UNKNOWN_NULL_COUNT, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -441,21 +443,21 @@ std::unique_ptr make_strings_column( * * @throws std::bad_alloc if device memory allocation fails * - * @param strings The contiguous array of chars in host memory. + * @param[in] strings The contiguous array of chars in host memory. * This char array is expected to be UTF-8 encoded characters. - * @param offsets The array of byte offsets in host memory. + * @param[in] offsets The array of byte offsets in host memory. * The number of elements is one more than the total number * of strings so the `offsets.back()` is the total * number of bytes in the strings array. * `offsets.front()` must always be 0 to point to the beginning * of `strings`. - * @param null_mask Host vector containing the null element indicator bitmask. + * @param[in] null_mask Host vector containing the null element indicator bitmask. * Arrow format for nulls is used for interpeting this bitmask. - * @param null_count The number of null string entries. If equal to + * @param[in] null_count The number of null string entries. If equal to * `UNKNOWN_NULL_COUNT`, the null count will be computed dynamically on the * first invocation of `column::null_count()` - * @param stream CUDA stream used for device memory operations and kernel launches. 
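A hedged illustration of the host-vector `make_strings_column` overload documented here. The template arguments were lost when the diff was rendered; `std::vector<char>` and `std::vector<cudf::size_type>` are assumed below, and the helper name is made up.

```cpp
#include <cudf/column/column_factories.hpp>

#include <vector>

// Sketch only: build the three strings "ab", "" and "cde" from host buffers.
std::unique_ptr<cudf::column> tiny_strings_column()
{
  std::vector<char> chars{'a', 'b', 'c', 'd', 'e'};    // concatenated UTF-8 bytes
  std::vector<cudf::size_type> offsets{0, 2, 2, 5};    // one more entry than strings
  // null_mask, null_count, stream and mr all take their defaults.
  return cudf::make_strings_column(chars, offsets);
}
```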
- * @param mr Device memory resource used for allocation of the column's `null_mask` and children + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @param[in] mr Device memory resource used for allocation of the column's `null_mask` and children * columns' device memory. */ std::unique_ptr make_strings_column( @@ -463,7 +465,7 @@ std::unique_ptr make_strings_column( const std::vector& offsets, const std::vector& null_mask = {}, size_type null_count = cudf::UNKNOWN_NULL_COUNT, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -471,19 +473,19 @@ std::unique_ptr make_strings_column( * and null mask and null count. The columns and mask are moved into the * resulting strings column. * - * @param num_strings The number of strings the column represents. - * @param offsets_column The column of offset values for this column. + * @param[in] num_strings The number of strings the column represents. + * @param[in] offsets_column The column of offset values for this column. * The number of elements is one more than the total number * of strings so the offset[last] - offset[0] is the total * number of bytes in the strings vector. - * @param chars_column The column of char bytes for all the strings for this column. + * @param[in] chars_column The column of char bytes for all the strings for this column. * Individual strings are identified by the offsets and the * nullmask. - * @param null_count The number of null string entries. - * @param null_mask The bits specifying the null strings in device memory. + * @param[in] null_count The number of null string entries. + * @param[in] null_mask The bits specifying the null strings in device memory. * Arrow format for nulls is used for interpeting this bitmask. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used for allocation of the column's `null_mask` and children + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @param[in] mr Device memory resource used for allocation of the column's `null_mask` and children * columns' device memory. */ std::unique_ptr make_strings_column( @@ -492,7 +494,7 @@ std::unique_ptr make_strings_column( std::unique_ptr chars_column, size_type null_count, rmm::device_buffer&& null_mask, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -534,21 +536,21 @@ std::unique_ptr make_strings_column( * data (depth 1) {1, 2, 3, 4, 5, 6, 7} * @endcode * - * @param num_lists The number of lists the column represents. - * @param offsets_column The column of offset values for this column. Each value should represent - * the starting offset into the child elements that corresponds to the - * beginning of the row, with the first row starting at 0. The length of row - * N can be determined by subtracting offsets[N+1] - offsets[N]. The total - * number of offsets should be 1 longer than the # of rows in the column. - * @param child_column The column of nested data referenced by the lists represented by the + * @param[in] num_lists The number of lists the column represents. + * @param[in] offsets_column The column of offset values for this column. 
Each value should + * represent the starting offset into the child elements that corresponds to the beginning of the + * row, with the first row starting at 0. The length of row N can be determined by subtracting + * offsets[N+1] - offsets[N]. The total number of offsets should be 1 longer than the # of rows in + * the column. + * @param[in] child_column The column of nested data referenced by the lists represented by the * offsets_column. Note: the child column may itself be * further nested. - * @param null_count The number of null list entries. - * @param null_mask The bits specifying the null lists in device memory. + * @param[in] null_count The number of null list entries. + * @param[in] null_mask The bits specifying the null lists in device memory. * Arrow format for nulls is used for interpeting this bitmask. - * @param stream Optional stream for use with all memory allocation + * @param[in] stream Optional stream for use with all memory allocation * and device kernels - * @param mr Optional resource to use for device memory + * @param[in] mr Optional resource to use for device memory * allocation of the column's `null_mask` and children. */ std::unique_ptr make_lists_column( @@ -557,7 +559,7 @@ std::unique_ptr make_lists_column( std::unique_ptr child_column, size_type null_count, rmm::device_buffer&& null_mask, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -575,12 +577,12 @@ std::unique_ptr make_lists_column( * The specified null mask governs which struct row has a null value. This * is orthogonal to the null values of individual child columns. * - * @param num_rows The number of struct values in the struct column. - * @param child_columns The list of child/members that the struct is comprised of. - * @param null_count The number of null values in the struct column. - * @param null_mask The bits specifying the null struct values in the column. - * @param stream Optional stream for use with all memory allocation and device kernels. - * @param mr Optional resource to use for device memory allocation. + * @param[in] num_rows The number of struct values in the struct column. + * @param[in] child_columns The list of child/members that the struct is comprised of. + * @param[in] null_count The number of null values in the struct column. + * @param[in] null_mask The bits specifying the null struct values in the column. + * @param[in] stream Optional stream for use with all memory allocation and device kernels. + * @param[in] mr Optional resource to use for device memory allocation. * */ std::unique_ptr make_structs_column( @@ -588,7 +590,7 @@ std::unique_ptr make_structs_column( std::vector>&& child_columns, size_type null_count, rmm::device_buffer&& null_mask, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -599,16 +601,16 @@ std::unique_ptr make_structs_column( * The output column will contain all null rows if `s.invalid()==false` * The output column will be empty if `size==0`. * - * @param s The scalar to use for values in the column. - * @param size The number of rows for the output column. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned column's device memory. + * @param[in] s The scalar to use for values in the column. 
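A sketch of a caller adapting to the reordered `make_column_from_scalar` below: the stream view now comes before the memory resource, matching the rest of libcudf. `fill_with_value` is a hypothetical wrapper, and it assumes the scalar constructors already accept a stream view at this point in the series.

```cpp
#include <cudf/column/column_factories.hpp>
#include <cudf/scalar/scalar.hpp>

#include <rmm/cuda_stream_view.hpp>

// Sketch only: materialize `size` copies of `value` on the given stream.
std::unique_ptr<cudf::column> fill_with_value(int32_t value,
                                              cudf::size_type size,
                                              rmm::cuda_stream_view stream)
{
  cudf::numeric_scalar<int32_t> s(value, true, stream);
  return cudf::make_column_from_scalar(s, size, stream);  // was (s, size, mr, stream)
}
```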
+ * @param[in] size The number of rows for the output column. + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @param[in] mr Device memory resource used to allocate the returned column's device memory. */ std::unique_ptr make_column_from_scalar( scalar const& s, size_type size, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Return a dictionary column with size elements that are all equal to the @@ -619,16 +621,16 @@ std::unique_ptr make_column_from_scalar( * * @throw cudf::logic_error if `s.is_valid()==false` * - * @param s The scalar to use for values in the column. - * @param size The number of rows for the output column. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned column's device memory. + * @param[in] s The scalar to use for values in the column. + * @param[in] size The number of rows for the output column. + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @param[in] mr Device memory resource used to allocate the returned column's device memory. */ std::unique_ptr make_dictionary_from_scalar( scalar const& s, size_type size, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp index efbfd1de501..72943313dc2 100644 --- a/cpp/src/column/column_factories.cpp +++ b/cpp/src/column/column_factories.cpp @@ -73,7 +73,7 @@ std::unique_ptr make_empty_column(data_type type) std::unique_ptr make_numeric_column(data_type type, size_type size, mask_state state, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); @@ -91,7 +91,7 @@ std::unique_ptr make_numeric_column(data_type type, std::unique_ptr make_fixed_point_column(data_type type, size_type size, mask_state state, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); @@ -109,7 +109,7 @@ std::unique_ptr make_fixed_point_column(data_type type, std::unique_ptr make_timestamp_column(data_type type, size_type size, mask_state state, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); @@ -127,7 +127,7 @@ std::unique_ptr make_timestamp_column(data_type type, std::unique_ptr make_duration_column(data_type type, size_type size, mask_state state, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); @@ -145,7 +145,7 @@ std::unique_ptr make_duration_column(data_type type, std::unique_ptr make_fixed_width_column(data_type type, size_type size, mask_state state, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); @@ -163,15 +163,15 @@ struct column_from_scalar_dispatch { template std::unique_ptr operator()(scalar const& value, size_type size, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view 
stream, + rmm::mr::device_memory_resource* mr) const { if (!value.is_valid()) return make_fixed_width_column(value.type(), size, mask_state::ALL_NULL, stream, mr); auto output_column = make_fixed_width_column(value.type(), size, mask_state::UNALLOCATED, stream, mr); auto view = output_column->mutable_view(); - detail::fill_in_place(view, 0, size, value, stream); + detail::fill_in_place(view, 0, size, value, stream.value()); return output_column; } }; @@ -180,8 +180,8 @@ template <> std::unique_ptr column_from_scalar_dispatch::operator()( scalar const& value, size_type size, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { auto null_mask = detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); @@ -199,7 +199,7 @@ std::unique_ptr column_from_scalar_dispatch::operator()(null_mask.data()), size}; auto sv = static_cast const&>(value); // fill the column with the scalar - auto output = strings::detail::fill(strings_column_view(sc), 0, size, sv, mr, stream); + auto output = strings::detail::fill(strings_column_view(sc), 0, size, sv, mr, stream.value()); output->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); // should be no nulls return output; } @@ -208,8 +208,8 @@ template <> std::unique_ptr column_from_scalar_dispatch::operator()( scalar const& value, size_type size, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FAIL("dictionary not supported when creating from scalar"); } @@ -218,8 +218,8 @@ template <> std::unique_ptr column_from_scalar_dispatch::operator()( scalar const& value, size_type size, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FAIL("TODO"); } @@ -228,31 +228,31 @@ template <> std::unique_ptr column_from_scalar_dispatch::operator()( scalar const& value, size_type size, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FAIL("TODO. 
struct_view currently not supported."); } std::unique_ptr make_column_from_scalar(scalar const& s, size_type size, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (size == 0) return make_empty_column(s.type()); - return type_dispatcher(s.type(), column_from_scalar_dispatch{}, s, size, mr, stream); + return type_dispatcher(s.type(), column_from_scalar_dispatch{}, s, size, stream, mr); } std::unique_ptr make_dictionary_from_scalar(scalar const& s, size_type size, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (size == 0) return make_empty_column(data_type{type_id::DICTIONARY32}); CUDF_EXPECTS(s.is_valid(), "cannot create a dictionary with a null key"); return make_dictionary_column( - make_column_from_scalar(s, 1, mr, stream), - make_column_from_scalar(numeric_scalar(0), size, mr, stream), + make_column_from_scalar(s, 1, stream, mr), + make_column_from_scalar(numeric_scalar(0), size, stream, mr), rmm::device_buffer{0, stream, mr}, 0); } diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 39ef1237ad9..373ed224f99 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -177,7 +177,7 @@ struct column_scalar_scatterer_impl { { auto dict_target = dictionary::detail::add_keys( dictionary_column_view(target), - make_column_from_scalar(source.get(), 1, rmm::mr::get_current_device_resource(), stream) + make_column_from_scalar(source.get(), 1, stream, rmm::mr::get_current_device_resource()) ->view(), mr, stream); diff --git a/cpp/src/dictionary/replace.cu b/cpp/src/dictionary/replace.cu index fa3219ef039..918063ac508 100644 --- a/cpp/src/dictionary/replace.cu +++ b/cpp/src/dictionary/replace.cu @@ -173,7 +173,7 @@ std::unique_ptr replace_nulls(dictionary_column_view const& input, // first add the replacment to the keys so only the indices need to be processed auto const default_mr = rmm::mr::get_current_device_resource(); auto input_matched = dictionary::detail::add_keys( - input, make_column_from_scalar(replacement, 1, default_mr, stream)->view(), mr, stream); + input, make_column_from_scalar(replacement, 1, stream, default_mr)->view(), mr, stream); auto const input_view = dictionary_column_view(input_matched->view()); auto const scalar_index = get_index(input_view, replacement, default_mr, stream); diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu index de6ab9f7261..a711482c1ac 100644 --- a/cpp/src/filling/fill.cu +++ b/cpp/src/filling/fill.cu @@ -162,7 +162,7 @@ std::unique_ptr out_of_place_fill_range_dispatch::operator()view(), mr, stream); cudf::column_view const target_indices = diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu index 63401ad823a..2066b889dd4 100644 --- a/cpp/src/hash/hashing.cu +++ b/cpp/src/hash/hashing.cu @@ -658,7 +658,7 @@ std::unique_ptr md5_hash(table_view const& input, { if (input.num_columns() == 0 || input.num_rows() == 0) { const string_scalar string_128bit("d41d8cd98f00b204e9orig98ecf8427e"); - auto output = make_column_from_scalar(string_128bit, input.num_rows(), mr, stream); + auto output = make_column_from_scalar(string_128bit, input.num_rows(), stream, mr); return output; } diff --git a/cpp/src/lists/lists_column_factories.cu b/cpp/src/lists/lists_column_factories.cu index 54ae7cfd5f5..baee0e82b72 100644 --- a/cpp/src/lists/lists_column_factories.cu +++ b/cpp/src/lists/lists_column_factories.cu @@ -17,6 +17,8 @@ 
#include #include +#include + namespace cudf { /** @@ -28,7 +30,7 @@ std::unique_ptr make_lists_column(size_type num_rows, std::unique_ptr child_column, size_type null_count, rmm::device_buffer&& null_mask, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (null_count > 0) { CUDF_EXPECTS(null_mask.size() > 0, "Column with nulls must be nullable."); } diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu index fff063b269a..a2fd8c91bc7 100644 --- a/cpp/src/replace/clamp.cu +++ b/cpp/src/replace/clamp.cu @@ -323,7 +323,7 @@ std::unique_ptr dispatch_clamp::operator()( if (key.is_valid()) { result = dictionary::detail::add_keys( matched_view, - make_column_from_scalar(key_replace, 1, rmm::mr::get_current_device_resource(), stream) + make_column_from_scalar(key_replace, 1, stream, rmm::mr::get_current_device_resource()) ->view(), mr, stream); diff --git a/cpp/src/strings/find.cu b/cpp/src/strings/find.cu index 5a017570b59..d5a6356e3f1 100644 --- a/cpp/src/strings/find.cu +++ b/cpp/src/strings/find.cu @@ -68,13 +68,12 @@ std::unique_ptr find_fn(strings_column_view const& strings, auto d_strings = *strings_column; auto strings_count = strings.size(); // create output column - auto results = make_numeric_column( - data_type{type_id::INT32}, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column(data_type{type_id::INT32}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); auto d_results = results_view.data(); // set the position values by evaluating the passed function @@ -190,10 +189,9 @@ std::unique_ptr contains_fn(strings_column_view const& strings, if (target.size() == 0) // empty target string returns true { auto const true_scalar = make_fixed_width_scalar(true, stream); - auto results = make_column_from_scalar(*true_scalar, strings.size(), mr, stream); - results->set_null_mask( - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count()); + auto results = make_column_from_scalar(*true_scalar, strings.size(), stream, mr); + results->set_null_mask(cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count()); return results; } @@ -201,13 +199,12 @@ std::unique_ptr contains_fn(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // create output column - auto results = make_numeric_column( - data_type{type_id::BOOL8}, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); auto d_results = results_view.data(); // set the bool values by evaluating the passed function diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu index 6a97f30e5a0..60da9b682ec 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ b/cpp/src/strings/strings_column_factories.cu @@ -23,6 +23,8 @@ #include #include +#include + #include #include @@ -32,18 +34,18 @@ namespace cudf { // Create a strings-type column from vector 
of pointer/size pairs std::unique_ptr make_strings_column( const rmm::device_vector>& strings, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); size_type strings_count = strings.size(); - if (strings_count == 0) return strings::detail::make_empty_strings_column(mr, stream); + if (strings_count == 0) return strings::detail::make_empty_strings_column(mr, stream.value()); auto execpol = rmm::exec_policy(stream); auto d_strings = strings.data().get(); // check total size is not too large for cudf column size_t bytes = thrust::transform_reduce( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), [d_strings] __device__(size_t idx) { @@ -63,7 +65,7 @@ std::unique_ptr make_strings_column( auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), offsets_transformer); auto offsets_column = strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream.value()); auto offsets_view = offsets_column->view(); auto d_offsets = offsets_view.data(); @@ -80,10 +82,10 @@ std::unique_ptr make_strings_column( // build chars column auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, mr, stream); + strings::detail::create_chars_child_column(strings_count, null_count, bytes, mr, stream.value()); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.data(); - thrust::for_each_n(execpol->on(stream), + thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [d_strings, d_offsets, d_chars] __device__(size_type idx) { @@ -115,7 +117,7 @@ struct string_view_to_pair { // Create a strings-type column from vector of string_view std::unique_ptr make_strings_column(const rmm::device_vector& string_views, const string_view null_placeholder, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto it_pair = thrust::make_transform_iterator(string_views.begin(), string_view_to_pair{null_placeholder}); @@ -129,11 +131,11 @@ std::unique_ptr make_strings_column(const rmm::device_vector& stri const rmm::device_vector& offsets, const rmm::device_vector& valid_mask, size_type null_count, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); size_type num_strings = offsets.size() - 1; - if (num_strings == 0) return strings::detail::make_empty_strings_column(mr, stream); + if (num_strings == 0) return strings::detail::make_empty_strings_column(mr, stream.value()); CUDF_EXPECTS(null_count < num_strings, "null strings column not yet supported"); if (null_count > 0) { @@ -152,7 +154,7 @@ std::unique_ptr make_strings_column(const rmm::device_vector& stri offsets.data().get(), (num_strings + 1) * sizeof(int32_t), cudaMemcpyDeviceToDevice, - stream)); + stream.value())); // build null bitmask rmm::device_buffer null_mask{ valid_mask.data().get(), @@ -164,10 +166,10 @@ std::unique_ptr make_strings_column(const rmm::device_vector& stri // build chars column auto chars_column = - strings::detail::create_chars_child_column(num_strings, null_count, bytes, mr, stream); + strings::detail::create_chars_child_column(num_strings, null_count, bytes, mr, stream.value()); auto chars_view = chars_column->mutable_view(); 
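The pattern used throughout this file is worth calling out: the `rmm::cuda_stream_view` travels through the new interfaces, and `.value()` recovers the raw `cudaStream_t` for Thrust execution policies, `cudaMemcpyAsync`, and detail functions not yet converted. A small standalone sketch of that interop, with assumed helper name and header paths:

```cpp
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/thrust_rmm_allocator.h>  // rmm::exec_policy in this era of RMM (assumed path)

#include <thrust/fill.h>

// Sketch only: zero a device vector asynchronously on the given stream.
void zero_fill(rmm::device_uvector<int>& v, rmm::cuda_stream_view stream)
{
  thrust::fill(rmm::exec_policy(stream)->on(stream.value()), v.begin(), v.end(), 0);
}
```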
CUDA_TRY(cudaMemcpyAsync( - chars_view.data(), strings.data().get(), bytes, cudaMemcpyDeviceToDevice, stream)); + chars_view.data(), strings.data().get(), bytes, cudaMemcpyDeviceToDevice, stream.value())); return make_strings_column(num_strings, std::move(offsets_column), @@ -183,7 +185,7 @@ std::unique_ptr make_strings_column(const std::vector& strings, const std::vector& offsets, const std::vector& null_mask, size_type null_count, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { rmm::device_vector d_strings{strings}; rmm::device_vector d_offsets{offsets}; @@ -198,7 +200,7 @@ std::unique_ptr make_strings_column(size_type num_strings, std::unique_ptr chars_column, size_type null_count, rmm::device_buffer&& null_mask, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (null_count > 0) CUDF_EXPECTS(null_mask.size() > 0, "Column with nulls must be nullable."); CUDF_EXPECTS(num_strings == offsets_column->size() - 1, diff --git a/cpp/src/strings/substring.cu b/cpp/src/strings/substring.cu index 1d4656ffa8f..493e773adb9 100644 --- a/cpp/src/strings/substring.cu +++ b/cpp/src/strings/substring.cu @@ -350,12 +350,12 @@ std::unique_ptr slice_strings(strings_column_view const& strings, // Compute the substring indices first auto start_chars_pos_vec = make_column_from_scalar(numeric_scalar(0, true, stream), strings_count, - rmm::mr::get_current_device_resource(), - stream); + stream, + rmm::mr::get_current_device_resource()); auto stop_chars_pos_vec = make_column_from_scalar(numeric_scalar(0, true, stream), strings_count, - rmm::mr::get_current_device_resource(), - stream); + stream, + rmm::mr::get_current_device_resource()); auto start_char_pos = start_chars_pos_vec->mutable_view().data(); auto end_char_pos = stop_chars_pos_vec->mutable_view().data(); diff --git a/cpp/src/structs/structs_column_factories.cu b/cpp/src/structs/structs_column_factories.cu index 2e239fce5f3..5f92fea76f5 100644 --- a/cpp/src/structs/structs_column_factories.cu +++ b/cpp/src/structs/structs_column_factories.cu @@ -14,12 +14,16 @@ * limitations under the License. 
*/ -#include #include #include +#include + +#include + +#include + +#include #include -#include "cudf/types.hpp" -#include "thrust/iterator/counting_iterator.h" namespace cudf { namespace { @@ -29,7 +33,7 @@ void superimpose_parent_nullmask(bitmask_type const* parent_null_mask, std::size_t parent_null_mask_size, size_type parent_null_count, column& child, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (!child.nullable()) { @@ -78,7 +82,7 @@ std::unique_ptr make_structs_column( std::vector>&& child_columns, size_type null_count, rmm::device_buffer&& null_mask, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(null_count <= 0 || !null_mask.is_empty(), From 31716db65511f0de03af26c1a5643802e073338d Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Tue, 3 Nov 2020 15:22:40 +1100 Subject: [PATCH 09/51] convert detail/aggregation headers and source to cuda_stream_view --- .../cudf/detail/aggregation/aggregation.cuh | 15 ++++++++++----- cpp/src/aggregation/aggregation.cu | 4 +++- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index 95e5d74b8a5..51cdb7e5841 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -23,6 +23,8 @@ #include #include +#include + namespace cudf { namespace detail { /** @@ -409,15 +411,17 @@ struct identity_initializer { public: template std::enable_if_t(), void> operator()(mutable_column_view const& col, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream) { - thrust::fill( - rmm::exec_policy(stream)->on(stream), col.begin(), col.end(), get_identity()); + thrust::fill(rmm::exec_policy(stream)->on(stream.value()), + col.begin(), + col.end(), + get_identity()); } template std::enable_if_t(), void> operator()(mutable_column_view const& col, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream) { CUDF_FAIL("Unsupported aggregation for initializing values"); } @@ -436,10 +440,11 @@ struct identity_initializer { * @param table The table of columns to initialize. * @param aggs A vector of aggregation operations corresponding to the table * columns. The aggregations determine the identity value for each column. + * @param stream CUDA stream used for device memory operations and kernel launches. 
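To make the initializer above concrete, here is an illustrative reduction for a SUM aggregation, whose identity element is 0: initialization collapses to a stream-ordered fill. The function name and the explicit template parameter are stand-ins, not cudf API.

```cpp
#include <cudf/column/column_view.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/thrust_rmm_allocator.h>  // rmm::exec_policy (assumed path)

#include <thrust/fill.h>

// Sketch only: fill a fixed-width column with the SUM identity (0) on `stream`.
template <typename T>
void initialize_sum_identity(cudf::mutable_column_view col, rmm::cuda_stream_view stream)
{
  thrust::fill(rmm::exec_policy(stream)->on(stream.value()),
               col.begin<T>(),
               col.end<T>(),
               T{0});  // identity element of SUM
}
```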
*/ void initialize_with_identity(mutable_table_view& table, std::vector const& aggs, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream); } // namespace detail } // namespace cudf diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu index fb4a30299fc..564713e959b 100644 --- a/cpp/src/aggregation/aggregation.cu +++ b/cpp/src/aggregation/aggregation.cu @@ -16,11 +16,13 @@ #include +#include + namespace cudf { namespace detail { void initialize_with_identity(mutable_table_view& table, std::vector const& aggs, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // TODO: Initialize all the columns in a single kernel instead of invoking one // kernel per column From ba2fc0fd3662a1d63bfc055e8b112b6e392b52f9 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Tue, 3 Nov 2020 15:46:33 +1100 Subject: [PATCH 10/51] Use cuda_stream_view in groupby sort_helper --- .../cudf/detail/groupby/sort_helper.hpp | 29 +++++----- cpp/include/cudf/detail/utilities/cuda.cuh | 7 ++- cpp/src/groupby/groupby.cu | 3 +- cpp/src/groupby/sort/groupby.cu | 2 +- cpp/src/groupby/sort/sort_helper.cu | 58 ++++++++++--------- 5 files changed, 53 insertions(+), 46 deletions(-) diff --git a/cpp/include/cudf/detail/groupby/sort_helper.hpp b/cpp/include/cudf/detail/groupby/sort_helper.hpp index 8024de7a6af..5d14b8dd8b6 100644 --- a/cpp/include/cudf/detail/groupby/sort_helper.hpp +++ b/cpp/include/cudf/detail/groupby/sort_helper.hpp @@ -22,6 +22,7 @@ #include #include +#include namespace cudf { namespace groupby { @@ -92,8 +93,8 @@ struct sort_groupby_helper { */ std::unique_ptr sorted_values( column_view const& values, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Groups a column of values according to `keys` @@ -107,8 +108,8 @@ struct sort_groupby_helper { */ std::unique_ptr grouped_values( column_view const& values, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Get a table of sorted unique keys @@ -116,8 +117,8 @@ struct sort_groupby_helper { * @return a new table in which each row is a unique row in the sorted key table. */ std::unique_ptr
unique_keys( - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Get a table of sorted keys @@ -125,8 +126,8 @@ struct sort_groupby_helper { * @return a new table containing the sorted keys. */ std::unique_ptr
sorted_keys( - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Get the number of groups in `keys` @@ -140,7 +141,7 @@ struct sort_groupby_helper { * When include_null_keys = NO, returned value is the number of rows in `keys` * in which no element is null */ - size_type num_keys(cudaStream_t stream = 0); + size_type num_keys(rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Get the sorted order of `keys`. @@ -155,7 +156,7 @@ struct sort_groupby_helper { * * @return the sort order indices for `keys`. */ - column_view key_sort_order(cudaStream_t stream = 0); + column_view key_sort_order(rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Get each group's offset into the sorted order of `keys`. @@ -168,7 +169,7 @@ struct sort_groupby_helper { * @return vector of offsets of the starting point of each group in the sorted * key table */ - index_vector const& group_offsets(cudaStream_t stream = 0); + index_vector const& group_offsets(rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Get the group labels corresponding to the sorted order of `keys`. @@ -183,7 +184,7 @@ struct sort_groupby_helper { * * @return vector of group labels for each row in the sorted key column */ - index_vector const& group_labels(cudaStream_t stream = 0); + index_vector const& group_labels(rmm::cuda_stream_view stream = rmm::cuda_stream_default); private: /** @@ -200,7 +201,7 @@ struct sort_groupby_helper { * @return A nullable column of `INT32` containing group labels in the order * of the unsorted key table */ - column_view unsorted_keys_labels(cudaStream_t stream = 0); + column_view unsorted_keys_labels(rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Get the column representing the row bitmask for the `keys` @@ -214,7 +215,7 @@ struct sort_groupby_helper { * Computes and stores bitmask on first invocation and returns stored column * on subsequent calls. 
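A sketch of a caller adapting to the new parameter order on the sort helper: the stream view leads and the memory resource follows, both defaulted. `grouped_keys_of` is a hypothetical wrapper, not part of the patch.

```cpp
#include <cudf/detail/groupby/sort_helper.hpp>
#include <cudf/table/table.hpp>

#include <rmm/cuda_stream_view.hpp>

// Sketch only: return the sorted, grouped keys using the reordered interface.
std::unique_ptr<cudf::table> grouped_keys_of(
  cudf::groupby::detail::sort::sort_groupby_helper& helper,
  rmm::cuda_stream_view stream,
  rmm::mr::device_memory_resource* mr)
{
  return helper.sorted_keys(stream, mr);  // previously sorted_keys(mr, stream)
}
```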
*/ - column_view keys_bitmask_column(cudaStream_t stream = 0); + column_view keys_bitmask_column(rmm::cuda_stream_view stream = rmm::cuda_stream_default); private: column_ptr _key_sorted_order; ///< Indices to produce _keys in sorted order diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh index 77d12663f20..33c61414a1c 100644 --- a/cpp/include/cudf/detail/utilities/cuda.cuh +++ b/cpp/include/cudf/detail/utilities/cuda.cuh @@ -21,9 +21,10 @@ #include #include +#include + #include -#include #include namespace cudf { @@ -168,9 +169,9 @@ __global__ void single_thread_kernel(F f) * @param stream CUDA stream used for the kernel launch */ template -void device_single_thread(Functor functor, cudaStream_t stream = 0) +void device_single_thread(Functor functor, rmm::cuda_stream_view stream = rmm::cuda_stream_default) { - single_thread_kernel<<<1, 1, 0, stream>>>(functor); + single_thread_kernel<<<1, 1, 0, stream.value()>>>(functor); } } // namespace detail diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 22d41f938c5..90bbf6490ac 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -33,6 +33,7 @@ #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace groupby { @@ -137,7 +138,7 @@ std::pair, std::vector> groupby::aggr groupby::groups groupby::get_groups(table_view values, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - auto grouped_keys = helper().sorted_keys(mr, 0); + auto grouped_keys = helper().sorted_keys(rmm::cuda_stream_default, mr); auto group_offsets = helper().group_offsets(0); std::vector group_offsets_vector(group_offsets.size()); diff --git a/cpp/src/groupby/sort/groupby.cu b/cpp/src/groupby/sort/groupby.cu index 27c1a659b91..8e924f65d73 100644 --- a/cpp/src/groupby/sort/groupby.cu +++ b/cpp/src/groupby/sort/groupby.cu @@ -434,7 +434,7 @@ std::pair, std::vector> groupby::sort auto results = detail::extract_results(requests, cache); - return std::make_pair(helper().unique_keys(mr, stream), std::move(results)); + return std::make_pair(helper().unique_keys(stream, mr), std::move(results)); } } // namespace groupby } // namespace cudf diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index 88bdaf829a1..b6a07a86af7 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -25,6 +25,8 @@ #include #include +#include + #include #include #include @@ -84,7 +86,7 @@ namespace cudf { namespace groupby { namespace detail { namespace sort { -size_type sort_groupby_helper::num_keys(cudaStream_t stream) +size_type sort_groupby_helper::num_keys(rmm::cuda_stream_view stream) { if (_num_keys > -1) return _num_keys; @@ -100,7 +102,7 @@ size_type sort_groupby_helper::num_keys(cudaStream_t stream) return _num_keys; } -column_view sort_groupby_helper::key_sort_order(cudaStream_t stream) +column_view sort_groupby_helper::key_sort_order(rmm::cuda_stream_view stream) { auto sliced_key_sorted_order = [stream, this]() { return cudf::detail::slice(this->_key_sorted_order->view(), 0, this->num_keys(stream)); @@ -117,7 +119,7 @@ column_view sort_groupby_helper::key_sort_order(cudaStream_t stream) auto d_key_sorted_order = _key_sorted_order->mutable_view().data(); - thrust::sequence(rmm::exec_policy(stream)->on(stream), + thrust::sequence(rmm::exec_policy(stream)->on(stream.value()), d_key_sorted_order, d_key_sorted_order + _key_sorted_order->size(), 0); @@ -131,7 +133,7 @@ column_view 
sort_groupby_helper::key_sort_order(cudaStream_t stream) {}, std::vector(_keys.num_columns(), null_order::AFTER), rmm::mr::get_current_device_resource(), - stream); + stream.value()); } else { // Pandas style // Temporarily prepend the keys table with a column that indicates the // presence of a null value within a row. This allows moving all rows that @@ -144,7 +146,7 @@ column_view sort_groupby_helper::key_sort_order(cudaStream_t stream) {}, std::vector(_keys.num_columns() + 1, null_order::AFTER), rmm::mr::get_current_device_resource(), - stream); + stream.value()); // All rows with one or more null values are at the end of the resulting sorted order. } @@ -152,27 +154,28 @@ column_view sort_groupby_helper::key_sort_order(cudaStream_t stream) return sliced_key_sorted_order(); } -sort_groupby_helper::index_vector const& sort_groupby_helper::group_offsets(cudaStream_t stream) +sort_groupby_helper::index_vector const& sort_groupby_helper::group_offsets( + rmm::cuda_stream_view stream) { if (_group_offsets) return *_group_offsets; _group_offsets = std::make_unique(num_keys(stream) + 1); - auto device_input_table = table_device_view::create(_keys, stream); + auto device_input_table = table_device_view::create(_keys, stream.value()); auto sorted_order = key_sort_order().data(); decltype(_group_offsets->begin()) result_end; auto exec = rmm::exec_policy(stream); if (has_nulls(_keys)) { result_end = thrust::unique_copy( - exec->on(stream), + exec->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_keys(stream)), _group_offsets->begin(), permuted_row_equality_comparator(*device_input_table, sorted_order)); } else { result_end = thrust::unique_copy( - exec->on(stream), + exec->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_keys(stream)), _group_offsets->begin(), @@ -186,7 +189,8 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_offsets(cuda return *_group_offsets; } -sort_groupby_helper::index_vector const& sort_groupby_helper::group_labels(cudaStream_t stream) +sort_groupby_helper::index_vector const& sort_groupby_helper::group_labels( + rmm::cuda_stream_view stream) { if (_group_labels) return *_group_labels; @@ -198,19 +202,19 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_labels(cudaS if (num_keys(stream) == 0) return group_labels; auto exec = rmm::exec_policy(stream); - thrust::scatter(exec->on(stream), + thrust::scatter(exec->on(stream.value()), thrust::make_constant_iterator(1, decltype(num_groups())(1)), thrust::make_constant_iterator(1, num_groups()), group_offsets().begin() + 1, group_labels.begin()); thrust::inclusive_scan( - exec->on(stream), group_labels.begin(), group_labels.end(), group_labels.begin()); + exec->on(stream.value()), group_labels.begin(), group_labels.end(), group_labels.begin()); return group_labels; } -column_view sort_groupby_helper::unsorted_keys_labels(cudaStream_t stream) +column_view sort_groupby_helper::unsorted_keys_labels(rmm::cuda_stream_view stream) { if (_unsorted_keys_labels) return _unsorted_keys_labels->view(); @@ -228,14 +232,14 @@ column_view sort_groupby_helper::unsorted_keys_labels(cudaStream_t stream) table_view({temp_labels->view()}), false, rmm::mr::get_current_device_resource(), - stream); + stream.value()); _unsorted_keys_labels = std::move(t_unsorted_keys_labels->release()[0]); return _unsorted_keys_labels->view(); } -column_view sort_groupby_helper::keys_bitmask_column(cudaStream_t stream) +column_view 
sort_groupby_helper::keys_bitmask_column(rmm::cuda_stream_view stream) { if (_keys_bitmask_column) return _keys_bitmask_column->view(); @@ -250,7 +254,7 @@ column_view sort_groupby_helper::keys_bitmask_column(cudaStream_t stream) auto keys_bitmask_view = _keys_bitmask_column->mutable_view(); using T = id_to_type; - thrust::fill(rmm::exec_policy(stream)->on(stream), + thrust::fill(rmm::exec_policy(stream)->on(stream.value()), keys_bitmask_view.begin(), keys_bitmask_view.end(), 0); @@ -259,14 +263,14 @@ column_view sort_groupby_helper::keys_bitmask_column(cudaStream_t stream) } sort_groupby_helper::column_ptr sort_groupby_helper::sorted_values( - column_view const& values, rmm::mr::device_memory_resource* mr, cudaStream_t stream) + column_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { column_ptr values_sort_order = cudf::detail::stable_sorted_order(table_view({unsorted_keys_labels(), values}), {}, std::vector(2, null_order::AFTER), mr, - stream); + stream.value()); // Zero-copy slice this sort order so that its new size is num_keys() column_view gather_map = cudf::detail::slice(values_sort_order->view(), 0, num_keys(stream)); @@ -276,13 +280,13 @@ sort_groupby_helper::column_ptr sort_groupby_helper::sorted_values( cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, mr, - stream); + stream.value()); return std::move(sorted_values_table->release()[0]); } sort_groupby_helper::column_ptr sort_groupby_helper::grouped_values( - column_view const& values, rmm::mr::device_memory_resource* mr, cudaStream_t stream) + column_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto gather_map = key_sort_order(); @@ -291,13 +295,13 @@ sort_groupby_helper::column_ptr sort_groupby_helper::grouped_values( cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, mr, - stream); + stream.value()); return std::move(grouped_values_table->release()[0]); } -std::unique_ptr
sort_groupby_helper::unique_keys(rmm::mr::device_memory_resource* mr, - cudaStream_t stream) +std::unique_ptr
sort_groupby_helper::unique_keys(rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto idx_data = key_sort_order().data(); @@ -305,18 +309,18 @@ std::unique_ptr
sort_groupby_helper::unique_keys(rmm::mr::device_memory_r group_offsets().begin(), [idx_data] __device__(size_type i) { return idx_data[i]; }); return cudf::detail::gather( - _keys, gather_map_it, gather_map_it + num_groups(), false, mr, stream); + _keys, gather_map_it, gather_map_it + num_groups(), false, mr, stream.value()); } -std::unique_ptr
sort_groupby_helper::sorted_keys(rmm::mr::device_memory_resource* mr, - cudaStream_t stream) +std::unique_ptr
sort_groupby_helper::sorted_keys(rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return cudf::detail::gather(_keys, key_sort_order(), cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, mr, - stream); + stream.value()); } } // namespace sort From d8827e909a161a026060dcf74e770b9980e8ad15 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Tue, 3 Nov 2020 16:14:04 +1100 Subject: [PATCH 11/51] Convert binops to cuda_stream_view --- cpp/include/cudf/detail/binaryop.hpp | 21 ++-- cpp/src/binaryop/binaryop.cpp | 117 ++++++++++++----------- cpp/src/binaryop/compiled/binary_ops.cu | 63 ++++++------ cpp/src/binaryop/compiled/binary_ops.hpp | 16 ++-- cpp/src/groupby/sort/groupby.cu | 4 +- 5 files changed, 113 insertions(+), 108 deletions(-) diff --git a/cpp/include/cudf/detail/binaryop.hpp b/cpp/include/cudf/detail/binaryop.hpp index 23dccc70414..c12482967e1 100644 --- a/cpp/include/cudf/detail/binaryop.hpp +++ b/cpp/include/cudf/detail/binaryop.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019, NVIDIA CORPORATION. + * Copyright (c) 2018-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,11 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #pragma once #include +#include + namespace cudf { //! Inner interfaces and implementations namespace detail { @@ -32,8 +33,8 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::binary_operation(column_view const&, scalar const&, binary_operator, @@ -46,8 +47,8 @@ std::unique_ptr binary_operation( scalar const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::binary_operation(column_view const&, column_view const&, @@ -60,8 +61,8 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::binary_operation(column_view const&, column_view const&, @@ -74,8 +75,8 @@ std::unique_ptr binary_operation( column_view const& rhs, std::string const& ptx, data_type output_type, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index 55aabb87d8d..fde4caa068d 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -45,8 +45,9 @@ #include #include +#include + #include -#include "rmm/cuda_stream_view.hpp" namespace 
cudf { @@ -57,7 +58,7 @@ namespace detail { */ rmm::device_buffer scalar_col_valid_mask_and(column_view const& col, scalar const& s, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (col.is_empty()) return rmm::device_buffer{0, stream, mr}; @@ -65,7 +66,7 @@ rmm::device_buffer scalar_col_valid_mask_and(column_view const& col, if (not s.is_valid()) { return cudf::detail::create_null_mask(col.size(), mask_state::ALL_NULL, stream, mr); } else if (s.is_valid() and col.nullable()) { - return cudf::detail::copy_bitmask(col, rmm::cuda_stream_view{stream}, mr); + return cudf::detail::copy_bitmask(col, stream, mr); } else { return rmm::device_buffer{0, stream, mr}; } @@ -105,11 +106,11 @@ void binary_operation(mutable_column_view& out, scalar const& lhs, column_view const& rhs, binary_operator op, - cudaStream_t stream) + rmm::cuda_stream_view stream) { if (is_null_dependent(op)) { cudf::jit::launcher( - hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream) + hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream.value()) .set_kernel_inst("kernel_v_s_with_validity", // name of the kernel we are // launching {cudf::jit::get_type_name(out.type()), // list of template arguments @@ -126,7 +127,7 @@ void binary_operation(mutable_column_view& out, lhs.is_valid()); } else { cudf::jit::launcher( - hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream) + hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream.value()) .set_kernel_inst("kernel_v_s", // name of the kernel we are // launching {cudf::jit::get_type_name(out.type()), // list of template arguments @@ -144,11 +145,11 @@ void binary_operation(mutable_column_view& out, column_view const& lhs, scalar const& rhs, binary_operator op, - cudaStream_t stream) + rmm::cuda_stream_view stream) { if (is_null_dependent(op)) { cudf::jit::launcher( - hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream) + hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream.value()) .set_kernel_inst("kernel_v_s_with_validity", // name of the kernel we are // launching {cudf::jit::get_type_name(out.type()), // list of template arguments @@ -165,7 +166,7 @@ void binary_operation(mutable_column_view& out, rhs.is_valid()); } else { cudf::jit::launcher( - hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream) + hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream.value()) .set_kernel_inst("kernel_v_s", // name of the kernel we are // launching {cudf::jit::get_type_name(out.type()), // list of template arguments @@ -183,11 +184,11 @@ void binary_operation(mutable_column_view& out, column_view const& lhs, column_view const& rhs, binary_operator op, - cudaStream_t stream) + rmm::cuda_stream_view stream) { if (is_null_dependent(op)) { cudf::jit::launcher( - hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream) + hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream.value()) .set_kernel_inst("kernel_v_v_with_validity", // name of the kernel we are // launching {cudf::jit::get_type_name(out.type()), // list of template arguments @@ -205,7 +206,7 @@ void binary_operation(mutable_column_view& out, rhs.offset()); } else { cudf::jit::launcher( - hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream) + hash, code::kernel, header_names, 
cudf::jit::compiler_flags, headers_code, stream.value()) .set_kernel_inst("kernel_v_v", // name of the kernel we are // launching {cudf::jit::get_type_name(out.type()), // list of template arguments @@ -223,7 +224,7 @@ void binary_operation(mutable_column_view& out, column_view const& lhs, column_view const& rhs, const std::string& ptx, - cudaStream_t stream) + rmm::cuda_stream_view stream) { std::string const output_type_name = cudf::jit::get_type_name(out.type()); @@ -234,7 +235,7 @@ void binary_operation(mutable_column_view& out, cudf::jit::parse_single_function_ptx(ptx, "GENERIC_BINARY_OP", output_type_name) + code::kernel; cudf::jit::launcher( - ptx_hash, cuda_source, header_names, cudf::jit::compiler_flags, headers_code, stream) + ptx_hash, cuda_source, header_names, cudf::jit::compiler_flags, headers_code, stream.value()) .set_kernel_inst("kernel_v_v", // name of the kernel // we are launching {output_type_name, // list of template arguments @@ -277,8 +278,8 @@ std::unique_ptr make_fixed_width_column_for_output(scalar const& lhs, column_view const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (binops::is_null_dependent(op)) { return make_fixed_width_column(output_type, rhs.size(), mask_state::ALL_VALID, stream, mr); @@ -304,8 +305,8 @@ std::unique_ptr make_fixed_width_column_for_output(column_view const& lh scalar const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (binops::is_null_dependent(op)) { return make_fixed_width_column(output_type, lhs.size(), mask_state::ALL_VALID, stream, mr); @@ -331,8 +332,8 @@ std::unique_ptr make_fixed_width_column_for_output(column_view const& lh column_view const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (binops::is_null_dependent(op)) { return make_fixed_width_column(output_type, rhs.size(), mask_state::ALL_VALID, stream, mr); @@ -415,8 +416,8 @@ bool is_same_scale_necessary(binary_operator op) std::unique_ptr fixed_point_binary_operation(scalar const& lhs, column_view const& rhs, binary_operator op, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using namespace numeric; @@ -427,7 +428,7 @@ std::unique_ptr fixed_point_binary_operation(scalar const& lhs, auto const scale = compute_scale_for_binop(op, lhs.type().scale(), rhs.type().scale()); auto const output_type = is_comparison_binop(op) ? 
data_type{type_id::BOOL8} // : data_type{lhs.type().id(), scale}; - auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, mr, stream); + auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr); if (rhs.is_empty()) return out; @@ -460,13 +461,13 @@ std::unique_ptr fixed_point_binary_operation(scalar const& lhs, auto const factor = numeric::detail::ipow(diff); auto const scalar = make_fixed_point_scalar(factor, scale_type{rhs.type().scale()}); - return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), mr, stream); + return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr); } else { CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE"); auto const factor = numeric::detail::ipow(diff); auto const scalar = make_fixed_point_scalar(factor, scale_type{rhs.type().scale()}); - return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), mr, stream); + return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr); } }(); binops::jit::binary_operation(out_view, lhs, result->view(), op, stream); @@ -491,8 +492,8 @@ std::unique_ptr fixed_point_binary_operation(scalar const& lhs, std::unique_ptr fixed_point_binary_operation(column_view const& lhs, scalar const& rhs, binary_operator op, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using namespace numeric; @@ -503,7 +504,7 @@ std::unique_ptr fixed_point_binary_operation(column_view const& lhs, auto const scale = compute_scale_for_binop(op, lhs.type().scale(), rhs.type().scale()); auto const output_type = is_comparison_binop(op) ? data_type{type_id::BOOL8} // : data_type{lhs.type().id(), scale}; - auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, mr, stream); + auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr); if (lhs.is_empty()) return out; @@ -536,13 +537,13 @@ std::unique_ptr fixed_point_binary_operation(column_view const& lhs, auto const factor = numeric::detail::ipow(diff); auto const scalar = make_fixed_point_scalar(factor, scale_type{lhs.type().scale()}); - return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), mr, stream); + return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr); } else { CUDF_EXPECTS(rhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE"); auto const factor = numeric::detail::ipow(diff); auto const scalar = make_fixed_point_scalar(factor, scale_type{lhs.type().scale()}); - return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), mr, stream); + return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr); } }(); binops::jit::binary_operation(out_view, result->view(), rhs, op, stream); @@ -567,8 +568,8 @@ std::unique_ptr fixed_point_binary_operation(column_view const& lhs, std::unique_ptr fixed_point_binary_operation(column_view const& lhs, column_view const& rhs, binary_operator op, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using namespace numeric; @@ -579,7 +580,7 @@ std::unique_ptr fixed_point_binary_operation(column_view const& lhs, auto const scale = compute_scale_for_binop(op, lhs.type().scale(), rhs.type().scale()); auto const output_type = is_comparison_binop(op) ? 
data_type{type_id::BOOL8} // : data_type{lhs.type().id(), scale}; - auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, mr, stream); + auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr); if (lhs.is_empty() or rhs.is_empty()) return out; @@ -594,13 +595,13 @@ std::unique_ptr fixed_point_binary_operation(column_view const& lhs, auto const factor = numeric::detail::ipow(diff); auto const scalar = make_fixed_point_scalar(factor, scale_type{lhs.type().scale()}); - return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), mr, stream); + return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr); } else { CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE"); auto const factor = numeric::detail::ipow(diff); auto const scalar = make_fixed_point_scalar(factor, scale_type{lhs.type().scale()}); - return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), mr, stream); + return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr); } }(); binops::jit::binary_operation(out_view, result->view(), rhs, op, stream); @@ -612,13 +613,13 @@ std::unique_ptr fixed_point_binary_operation(column_view const& lhs, auto const factor = numeric::detail::ipow(diff); auto const scalar = make_fixed_point_scalar(factor, scale_type{rhs.type().scale()}); - return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), mr, stream); + return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr); } else { CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE"); auto const factor = numeric::detail::ipow(diff); auto const scalar = make_fixed_point_scalar(factor, scale_type{rhs.type().scale()}); - return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), mr, stream); + return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr); } }(); binops::jit::binary_operation(out_view, lhs, result->view(), op, stream); @@ -634,21 +635,21 @@ std::unique_ptr binary_operation(scalar const& lhs, column_view const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING) - return binops::compiled::binary_operation(lhs, rhs, op, output_type, mr, stream); + return binops::compiled::binary_operation(lhs, rhs, op, output_type, stream, mr); if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type())) - return fixed_point_binary_operation(lhs, rhs, op, mr, stream); + return fixed_point_binary_operation(lhs, rhs, op, stream, mr); // Check for datatype CUDF_EXPECTS(is_fixed_width(output_type), "Invalid/Unsupported output datatype"); CUDF_EXPECTS(is_fixed_width(lhs.type()), "Invalid/Unsupported lhs datatype"); CUDF_EXPECTS(is_fixed_width(rhs.type()), "Invalid/Unsupported rhs datatype"); - auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, mr, stream); + auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr); if (rhs.is_empty()) return out; @@ -661,21 +662,21 @@ std::unique_ptr binary_operation(column_view const& lhs, scalar const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (lhs.type().id() == type_id::STRING and rhs.type().id() == 
type_id::STRING) - return binops::compiled::binary_operation(lhs, rhs, op, output_type, mr, stream); + return binops::compiled::binary_operation(lhs, rhs, op, output_type, stream, mr); if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type())) - return fixed_point_binary_operation(lhs, rhs, op, mr, stream); + return fixed_point_binary_operation(lhs, rhs, op, stream, mr); // Check for datatype CUDF_EXPECTS(is_fixed_width(output_type), "Invalid/Unsupported output datatype"); CUDF_EXPECTS(is_fixed_width(lhs.type()), "Invalid/Unsupported lhs datatype"); CUDF_EXPECTS(is_fixed_width(rhs.type()), "Invalid/Unsupported rhs datatype"); - auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, mr, stream); + auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr); if (lhs.is_empty()) return out; @@ -688,23 +689,23 @@ std::unique_ptr binary_operation(column_view const& lhs, column_view const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match"); if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING) - return binops::compiled::binary_operation(lhs, rhs, op, output_type, mr, stream); + return binops::compiled::binary_operation(lhs, rhs, op, output_type, stream, mr); if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type())) - return fixed_point_binary_operation(lhs, rhs, op, mr, stream); + return fixed_point_binary_operation(lhs, rhs, op, stream, mr); // Check for datatype CUDF_EXPECTS(is_fixed_width(output_type), "Invalid/Unsupported output datatype"); CUDF_EXPECTS(is_fixed_width(lhs.type()), "Invalid/Unsupported lhs datatype"); CUDF_EXPECTS(is_fixed_width(rhs.type()), "Invalid/Unsupported rhs datatype"); - auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, mr, stream); + auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr); if (lhs.is_empty() or rhs.is_empty()) return out; @@ -717,8 +718,8 @@ std::unique_ptr binary_operation(column_view const& lhs, column_view const& rhs, std::string const& ptx, data_type output_type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Check for datatype auto is_type_supported_ptx = [](data_type type) -> bool { @@ -753,7 +754,7 @@ std::unique_ptr binary_operation(scalar const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, mr); + return detail::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr); } std::unique_ptr binary_operation(column_view const& lhs, @@ -763,7 +764,7 @@ std::unique_ptr binary_operation(column_view const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, mr); + return detail::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr); } std::unique_ptr binary_operation(column_view const& lhs, @@ -773,7 +774,7 @@ std::unique_ptr binary_operation(column_view const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, mr); + return detail::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr); } std::unique_ptr binary_operation(column_view const& lhs, @@ -783,7 +784,7 @@ 
std::unique_ptr binary_operation(column_view const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, ptx, output_type, mr); + return detail::binary_operation(lhs, rhs, ptx, output_type, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu index e21681a8467..a466a66f74f 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cu +++ b/cpp/src/binaryop/compiled/binary_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include +#include #include "binary_ops.hpp" @@ -105,8 +106,8 @@ struct binary_op { binary_operator op, data_type out_type, bool const reversed, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto new_mask = binops::detail::scalar_col_valid_mask_and(lhs, rhs, stream, mr); auto out = make_fixed_width_column(out_type, @@ -125,12 +126,12 @@ struct binary_op { if (lhs.has_nulls()) { auto lhs_itr = cudf::detail::make_null_replacement_iterator(*lhs_device_view, Lhs{}); reversed - ? thrust::transform(rmm::exec_policy(stream)->on(stream), + ? thrust::transform(rmm::exec_policy(stream)->on(stream.value()), lhs_itr, lhs_itr + lhs.size(), out_itr, apply_binop_scalar_rhs_lhs{op, rhs_scalar_view}) - : thrust::transform(rmm::exec_policy(stream)->on(stream), + : thrust::transform(rmm::exec_policy(stream)->on(stream.value()), lhs_itr, lhs_itr + lhs.size(), out_itr, @@ -140,12 +141,12 @@ struct binary_op { thrust::make_counting_iterator(size_type{0}), [col = *lhs_device_view] __device__(size_type i) { return col.element(i); }); reversed - ? thrust::transform(rmm::exec_policy(stream)->on(stream), + ? 
thrust::transform(rmm::exec_policy(stream)->on(stream.value()), lhs_itr, lhs_itr + lhs.size(), out_itr, apply_binop_scalar_rhs_lhs{op, rhs_scalar_view}) - : thrust::transform(rmm::exec_policy(stream)->on(stream), + : thrust::transform(rmm::exec_policy(stream)->on(stream.value()), lhs_itr, lhs_itr + lhs.size(), out_itr, @@ -153,7 +154,7 @@ struct binary_op { } } - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); return out; } @@ -162,8 +163,8 @@ struct binary_op { column_view const& rhs, binary_operator op, data_type out_type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto new_mask = cudf::detail::bitmask_and(table_view({lhs, rhs}), stream, mr); auto out = make_fixed_width_column( @@ -177,7 +178,7 @@ struct binary_op { if (lhs.has_nulls() && rhs.has_nulls()) { auto lhs_itr = cudf::detail::make_null_replacement_iterator(*lhs_device_view, Lhs{}); auto rhs_itr = cudf::detail::make_null_replacement_iterator(*rhs_device_view, Rhs{}); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), lhs_itr, lhs_itr + lhs.size(), rhs_itr, @@ -188,7 +189,7 @@ struct binary_op { auto rhs_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(size_type{0}), [col = *rhs_device_view] __device__(size_type i) { return col.element(i); }); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), lhs_itr, lhs_itr + lhs.size(), rhs_itr, @@ -199,7 +200,7 @@ struct binary_op { thrust::make_counting_iterator(size_type{0}), [col = *lhs_device_view] __device__(size_type i) { return col.element(i); }); auto rhs_itr = cudf::detail::make_null_replacement_iterator(*rhs_device_view, Rhs{}); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), lhs_itr, lhs_itr + lhs.size(), rhs_itr, @@ -212,7 +213,7 @@ struct binary_op { auto rhs_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(size_type{0}), [col = *rhs_device_view] __device__(size_type i) { return col.element(i); }); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), lhs_itr, lhs_itr + lhs.size(), rhs_itr, @@ -221,7 +222,7 @@ struct binary_op { } } - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); return out; } @@ -304,7 +305,7 @@ struct null_considering_binop { void populate_out_col(LhsViewT const& lhsv, RhsViewT const& rhsv, cudf::size_type col_size, - cudaStream_t stream, + rmm::cuda_stream_view stream, CompareFunc cfunc, OutT* out_col) const { @@ -312,7 +313,7 @@ struct null_considering_binop { compare_functor binop_func{lhsv, rhsv, cfunc}; // Execute it on every element - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(col_size), out_col, @@ -326,8 +327,8 @@ struct null_considering_binop { binary_operator op, data_type output_type, cudf::size_type col_size, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { std::unique_ptr out; // Create device views for inputs @@ -418,8 +419,8 @@ std::unique_ptr binary_operation(scalar const& lhs, column_view const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr, 
- cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // hard-coded to only work with cudf::string_view so we don't explode compile times CUDF_EXPECTS(lhs.type().id() == cudf::type_id::STRING, "Invalid/Unsupported lhs datatype"); @@ -427,12 +428,12 @@ std::unique_ptr binary_operation(scalar const& lhs, if (is_null_dependent(op)) { if (rhs.is_empty()) return cudf::make_empty_column(output_type); auto rhs_device_view = cudf::column_device_view::create(rhs, stream); - return null_considering_binop{}(lhs, *rhs_device_view, op, output_type, rhs.size(), mr, stream); + return null_considering_binop{}(lhs, *rhs_device_view, op, output_type, rhs.size(), stream, mr); } else { CUDF_EXPECTS(is_boolean(output_type), "Invalid/Unsupported output datatype"); // Should pass the right type of scalar and column_view when specializing binary_op return binary_op{}( - rhs, lhs, op, output_type, true, mr, stream); + rhs, lhs, op, output_type, true, stream, mr); } } @@ -440,8 +441,8 @@ std::unique_ptr binary_operation(column_view const& lhs, scalar const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // hard-coded to only work with cudf::string_view so we don't explode compile times CUDF_EXPECTS(lhs.type().id() == cudf::type_id::STRING, "Invalid/Unsupported lhs datatype"); @@ -449,11 +450,11 @@ std::unique_ptr binary_operation(column_view const& lhs, if (is_null_dependent(op)) { if (lhs.is_empty()) return cudf::make_empty_column(output_type); auto lhs_device_view = cudf::column_device_view::create(lhs, stream); - return null_considering_binop{}(*lhs_device_view, rhs, op, output_type, lhs.size(), mr, stream); + return null_considering_binop{}(*lhs_device_view, rhs, op, output_type, lhs.size(), stream, mr); } else { CUDF_EXPECTS(is_boolean(output_type), "Invalid/Unsupported output datatype"); return binary_op{}( - lhs, rhs, op, output_type, false, mr, stream); + lhs, rhs, op, output_type, false, stream, mr); } } @@ -461,8 +462,8 @@ std::unique_ptr binary_operation(column_view const& lhs, column_view const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // hard-coded to only work with cudf::string_view so we don't explode compile times CUDF_EXPECTS(lhs.type().id() == cudf::type_id::STRING, "Invalid/Unsupported lhs datatype"); @@ -473,11 +474,11 @@ std::unique_ptr binary_operation(column_view const& lhs, auto lhs_device_view = cudf::column_device_view::create(lhs, stream); auto rhs_device_view = cudf::column_device_view::create(rhs, stream); return null_considering_binop{}( - *lhs_device_view, *rhs_device_view, op, output_type, lhs.size(), mr, stream); + *lhs_device_view, *rhs_device_view, op, output_type, lhs.size(), stream, mr); } else { CUDF_EXPECTS(is_boolean(output_type), "Invalid/Unsupported output datatype"); return binary_op{}( - lhs, rhs, op, output_type, mr, stream); + lhs, rhs, op, output_type, stream, mr); } } diff --git a/cpp/src/binaryop/compiled/binary_ops.hpp b/cpp/src/binaryop/compiled/binary_ops.hpp index 3e6203ce8dd..a3f62f5018e 100644 --- a/cpp/src/binaryop/compiled/binary_ops.hpp +++ b/cpp/src/binaryop/compiled/binary_ops.hpp @@ -19,6 +19,8 @@ #include #include +#include + namespace cudf { namespace binops { namespace detail { @@ -27,7 +29,7 @@ namespace detail { */ rmm::device_buffer 
scalar_col_valid_mask_and(column_view const& col, scalar const& s, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); } // namespace detail @@ -66,8 +68,8 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Performs a binary operation between a string column and a string @@ -92,8 +94,8 @@ std::unique_ptr binary_operation( scalar const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Performs a binary operation between two string columns. @@ -118,8 +120,8 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace compiled } // namespace binops diff --git a/cpp/src/groupby/sort/groupby.cu b/cpp/src/groupby/sort/groupby.cu index 8e924f65d73..c9038082d88 100644 --- a/cpp/src/groupby/sort/groupby.cu +++ b/cpp/src/groupby/sort/groupby.cu @@ -269,8 +269,8 @@ void store_result_functor::operator()(aggregation const& agg) count_result, binary_operator::DIV, cudf::detail::target_type(values.type(), aggregation::MEAN), - mr, - stream); + stream, + mr); cache.add_result(col_idx, agg, std::move(result)); }; From 6c88b596dc55483303bb4ee24ef52f8b1f2675d0 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Wed, 4 Nov 2020 17:27:20 +1100 Subject: [PATCH 12/51] Fix includes and copyright dates --- cpp/benchmarks/copying/shift_benchmark.cu | 31 +++++++++++++++---- .../null_mask/set_null_mask_benchmark.cpp | 7 +++-- cpp/include/cudf/ast/detail/transform.cuh | 3 +- cpp/include/cudf/column/column.hpp | 4 +-- .../cudf/column/column_device_view.cuh | 5 +-- cpp/include/cudf/detail/copy.hpp | 2 +- cpp/include/cudf/detail/copy_if.cuh | 5 +-- cpp/include/cudf/detail/copy_if_else.cuh | 2 +- cpp/include/cudf/detail/null_mask.hpp | 5 +-- cpp/include/cudf/detail/valid_if.cuh | 7 +++-- cpp/include/cudf/null_mask.hpp | 2 +- cpp/include/cudf/scalar/scalar.hpp | 2 +- .../cudf/strings/detail/copy_if_else.cuh | 5 +-- cpp/include/cudf/strings/detail/scatter.cuh | 2 +- cpp/src/binaryop/binaryop.cpp | 3 +- cpp/src/binaryop/compiled/binary_ops.cu | 6 ++-- cpp/src/bitmask/null_mask.cu | 4 +-- cpp/src/column/column_device_view.cu | 5 +-- cpp/src/copying/copy.cpp | 5 +-- cpp/src/copying/copy.cu | 5 +-- cpp/src/copying/copy_range.cu | 2 -- cpp/src/copying/sample.cu | 3 +- cpp/src/copying/shift.cu | 1 + cpp/src/copying/slice.cpp | 5 +-- cpp/src/datetime/datetime_ops.cu | 2 +- cpp/src/dictionary/dictionary_factories.cu | 1 + cpp/src/filling/fill.cu | 2 -- cpp/src/groupby/sort/sort_helper.cu | 2 +- cpp/src/io/csv/durations.cu | 1 + cpp/src/io/utilities/column_buffer.hpp | 2 +- cpp/src/lists/copying/copying.cu | 22 +++++++++++-- cpp/src/merge/merge.cu | 1 + cpp/src/quantiles/quantile.cu | 3 +- cpp/src/reductions/scan.cu | 18 ++++++++++- 
cpp/src/replace/nulls.cu | 2 +- cpp/src/replace/replace.cu | 7 +++-- cpp/src/reshape/byte_cast.cu | 3 +- cpp/src/reshape/interleave_columns.cu | 1 + cpp/src/scalar/scalar.cpp | 4 +-- cpp/src/strings/findall.cu | 4 ++- cpp/src/strings/split/split.cu | 1 + cpp/src/strings/substring.cu | 1 + cpp/src/strings/translate.cu | 1 + cpp/src/strings/wrap.cu | 3 +- cpp/src/text/replace.cu | 3 ++ cpp/src/text/stemmer.cu | 1 + cpp/src/unary/cast_ops.cu | 2 +- 47 files changed, 144 insertions(+), 64 deletions(-) diff --git a/cpp/benchmarks/copying/shift_benchmark.cu b/cpp/benchmarks/copying/shift_benchmark.cu index 4cf3455debb..291c0ef6777 100644 --- a/cpp/benchmarks/copying/shift_benchmark.cu +++ b/cpp/benchmarks/copying/shift_benchmark.cu @@ -1,15 +1,34 @@ -#include -#include -#include -#include -#include -#include +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #include #include + #include #include #include #include + +#include + +#include +#include +#include +#include +#include + #include template > diff --git a/cpp/benchmarks/null_mask/set_null_mask_benchmark.cpp b/cpp/benchmarks/null_mask/set_null_mask_benchmark.cpp index e0a35ff0097..7f663700e02 100644 --- a/cpp/benchmarks/null_mask/set_null_mask_benchmark.cpp +++ b/cpp/benchmarks/null_mask/set_null_mask_benchmark.cpp @@ -14,12 +14,13 @@ * limitations under the License. */ -#include -#include "../fixture/benchmark_fixture.hpp" -#include "../synchronization/synchronization.hpp" +#include +#include #include +#include + class SetNullmask : public cudf::benchmark { }; diff --git a/cpp/include/cudf/ast/detail/transform.cuh b/cpp/include/cudf/ast/detail/transform.cuh index 454085ff9bd..96c8abe6c66 100644 --- a/cpp/include/cudf/ast/detail/transform.cuh +++ b/cpp/include/cudf/ast/detail/transform.cuh @@ -25,9 +25,10 @@ #include #include +#include + #include #include -#include "rmm/cuda_stream_view.hpp" namespace cudf { diff --git a/cpp/include/cudf/column/column.hpp b/cpp/include/cudf/column/column.hpp index b94a2f13e1d..7966b6a1472 100644 --- a/cpp/include/cudf/column/column.hpp +++ b/cpp/include/cudf/column/column.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,7 +15,7 @@ */ #pragma once -#include "column_view.hpp" +#include #include #include diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 5118db2364e..046a8069d1f 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -15,8 +15,6 @@ */ #pragma once -#include -#include #include #include #include @@ -30,6 +28,9 @@ #include +#include +#include + /** * @file column_device_view.cuh * @brief Column device view class definitons diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp index 0312f1ebe75..719323b6045 100644 --- a/cpp/include/cudf/detail/copy.hpp +++ b/cpp/include/cudf/detail/copy.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019, NVIDIA CORPORATION. + * Copyright (c) 2018-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index 9399df22450..abae1c33d4f 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,9 +36,10 @@ #include #include -#include #include +#include + namespace { // Compute the count of elements that pass the mask within each block template diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index d5be077d27b..763da179639 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp index 4b2c5b0a8d6..50a2424e86c 100644 --- a/cpp/include/cudf/detail/null_mask.hpp +++ b/cpp/include/cudf/detail/null_mask.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,8 +17,9 @@ #include +#include + #include -#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index 011a3fa616c..f8f3ba51468 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,10 +22,11 @@ #include #include +#include +#include + #include #include -#include -#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp index 110fd2b5087..690f4cdbbb0 100644 --- a/cpp/include/cudf/null_mask.hpp +++ b/cpp/include/cudf/null_mask.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index dcce9f043e8..1f960c21197 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -22,13 +22,13 @@ #include #include +#include #include #include #include #include #include -#include "rmm/cuda_stream_view.hpp" /** * @file diff --git a/cpp/include/cudf/strings/detail/copy_if_else.cuh b/cpp/include/cudf/strings/detail/copy_if_else.cuh index 7bfe1df4239..3433ab7d210 100644 --- a/cpp/include/cudf/strings/detail/copy_if_else.cuh +++ b/cpp/include/cudf/strings/detail/copy_if_else.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,8 @@ #include #include #include -#include "rmm/cuda_stream_view.hpp" + +#include namespace cudf { namespace strings { diff --git a/cpp/include/cudf/strings/detail/scatter.cuh b/cpp/include/cudf/strings/detail/scatter.cuh index 627b9902506..4f495afa099 100644 --- a/cpp/include/cudf/strings/detail/scatter.cuh +++ b/cpp/include/cudf/strings/detail/scatter.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index 55aabb87d8d..8322cc9cfae 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -45,8 +45,9 @@ #include #include +#include + #include -#include "rmm/cuda_stream_view.hpp" namespace cudf { diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu index e21681a8467..94096158fab 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cu +++ b/cpp/src/binaryop/compiled/binary_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "binary_ops.hpp" + #include #include #include @@ -22,8 +24,6 @@ #include -#include "binary_ops.hpp" - namespace cudf { namespace binops { namespace compiled { diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 2a61c343b05..06f969a9d43 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -39,7 +40,6 @@ #include #include #include -#include "rmm/mr/device/device_memory_resource.hpp" namespace cudf { size_type state_null_count(mask_state state, size_type size) diff --git a/cpp/src/column/column_device_view.cu b/cpp/src/column/column_device_view.cu index fb3bab68446..4250d63761f 100644 --- a/cpp/src/column/column_device_view.cu +++ b/cpp/src/column/column_device_view.cu @@ -17,10 +17,11 @@ #include #include #include -#include -#include "rmm/cuda_stream_view.hpp" #include +#include + +#include namespace cudf { // Trivially copy all members but the children diff --git a/cpp/src/copying/copy.cpp b/cpp/src/copying/copy.cpp index 6c0aeb601c2..811c9b6e42d 100644 --- a/cpp/src/copying/copy.cpp +++ b/cpp/src/copying/copy.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,8 +24,9 @@ #include #include +#include + #include -#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { diff --git a/cpp/src/copying/copy.cu b/cpp/src/copying/copy.cu index 619d24c1204..91244af2d13 100644 --- a/cpp/src/copying/copy.cu +++ b/cpp/src/copying/copy.cu @@ -19,9 +19,10 @@ #include #include #include +#include #include -#include "cudf/fixed_point/fixed_point.hpp" -#include "rmm/cuda_stream_view.hpp" + +#include namespace cudf { namespace detail { diff --git a/cpp/src/copying/copy_range.cu b/cpp/src/copying/copy_range.cu index daca5900768..95be6cb8bbc 100644 --- a/cpp/src/copying/copy_range.cu +++ b/cpp/src/copying/copy_range.cu @@ -34,8 +34,6 @@ #include -#include - #include namespace { diff --git a/cpp/src/copying/sample.cu b/cpp/src/copying/sample.cu index c270be1ccca..e9e2e1d6340 100644 --- a/cpp/src/copying/sample.cu +++ b/cpp/src/copying/sample.cu @@ -21,7 +21,8 @@ #include #include #include -#include "rmm/cuda_stream_view.hpp" + +#include #include #include diff --git a/cpp/src/copying/shift.cu b/cpp/src/copying/shift.cu index 169b6760985..2dc3b04f2f2 100644 --- a/cpp/src/copying/shift.cu +++ b/cpp/src/copying/shift.cu @@ -31,6 +31,7 @@ #include #include #include + #include #include #include diff --git a/cpp/src/copying/slice.cpp b/cpp/src/copying/slice.cpp index a9141b7a48f..017dc37d002 100644 --- a/cpp/src/copying/slice.cpp +++ b/cpp/src/copying/slice.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,8 +21,9 @@ #include #include +#include + #include -#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index c3e2cc9a2ff..ce2df92efc0 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -25,9 +25,9 @@ #include #include #include -#include "rmm/cuda_stream_view.hpp" #include +#include namespace cudf { namespace datetime { diff --git a/cpp/src/dictionary/dictionary_factories.cu b/cpp/src/dictionary/dictionary_factories.cu index 286f4961946..ec598b71f88 100644 --- a/cpp/src/dictionary/dictionary_factories.cu +++ b/cpp/src/dictionary/dictionary_factories.cu @@ -21,6 +21,7 @@ #include #include #include + #include namespace cudf { diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu index de6ab9f7261..a564eae5f01 100644 --- a/cpp/src/filling/fill.cu +++ b/cpp/src/filling/fill.cu @@ -32,8 +32,6 @@ #include #include -#include - #include namespace { diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index 88bdaf829a1..f219084bfc7 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/csv/durations.cu b/cpp/src/io/csv/durations.cu index 863e7f0a8b3..15dfb5f5534 100644 --- a/cpp/src/io/csv/durations.cu +++ b/cpp/src/io/csv/durations.cu @@ -20,6 +20,7 @@ #include #include #include + #include #include diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index cde8a321f8e..0290857119b 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -31,8 +31,8 @@ #include #include +#include #include -#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace io { diff --git a/cpp/src/lists/copying/copying.cu b/cpp/src/lists/copying/copying.cu index c7bf2139a83..ccf57a09d52 100644 --- a/cpp/src/lists/copying/copying.cu +++ b/cpp/src/lists/copying/copying.cu @@ -1,9 +1,27 @@ -#include +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ #include #include #include + +#include + +#include + #include -#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace lists { diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index c22f5afe181..265a20bcbb7 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -24,6 +24,7 @@ #include #include + #include #include #include diff --git a/cpp/src/quantiles/quantile.cu b/cpp/src/quantiles/quantile.cu index 280cc0198cf..31205f292c0 100644 --- a/cpp/src/quantiles/quantile.cu +++ b/cpp/src/quantiles/quantile.cu @@ -25,9 +25,10 @@ #include #include +#include + #include #include -#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { diff --git a/cpp/src/reductions/scan.cu b/cpp/src/reductions/scan.cu index d5c9527e927..6d90124db36 100644 --- a/cpp/src/reductions/scan.cu +++ b/cpp/src/reductions/scan.cu @@ -1,3 +1,18 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #include #include #include @@ -11,7 +26,8 @@ #include #include #include -#include "rmm/cuda_stream_view.hpp" + +#include namespace cudf { namespace detail { diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 2a8fea154e5..d13d729536b 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -33,7 +34,6 @@ #include #include #include -#include "cudf/copying.hpp" #include diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 21b583cddbe..01f75f41cfc 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -17,7 +17,7 @@ * limitations under the License. */ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -50,9 +50,10 @@ #include #include -#include +#include #include -#include "rmm/cuda_stream_view.hpp" + +#include namespace { // anonymous diff --git a/cpp/src/reshape/byte_cast.cu b/cpp/src/reshape/byte_cast.cu index 841a8879aa6..0f5c7595cd0 100644 --- a/cpp/src/reshape/byte_cast.cu +++ b/cpp/src/reshape/byte_cast.cu @@ -21,7 +21,8 @@ #include #include #include -#include "rmm/cuda_stream_view.hpp" + +#include namespace cudf { namespace detail { diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu index ef2ef8858ea..9e6197afe0f 100644 --- a/cpp/src/reshape/interleave_columns.cu +++ b/cpp/src/reshape/interleave_columns.cu @@ -19,6 +19,7 @@ #include #include #include + #include namespace cudf { diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp index 89d3534a41f..052c2aaedc7 100644 --- a/cpp/src/scalar/scalar.cpp +++ b/cpp/src/scalar/scalar.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,10 +16,10 @@ #include +#include #include #include -#include "rmm/cuda_stream_view.hpp" namespace cudf { std::string string_scalar::to_string(rmm::cuda_stream_view stream) const diff --git a/cpp/src/strings/findall.cu b/cpp/src/strings/findall.cu index d7e695c0a3a..7830ab13dbb 100644 --- a/cpp/src/strings/findall.cu +++ b/cpp/src/strings/findall.cu @@ -23,9 +23,11 @@ #include #include #include + #include #include -#include "rmm/cuda_stream_view.hpp" + +#include #include diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu index 4ef46b289e2..fb0efa1131c 100644 --- a/cpp/src/strings/split/split.cu +++ b/cpp/src/strings/split/split.cu @@ -25,6 +25,7 @@ #include #include #include + #include #include // upper_bound() diff --git a/cpp/src/strings/substring.cu b/cpp/src/strings/substring.cu index 1d4656ffa8f..af068e2997e 100644 --- a/cpp/src/strings/substring.cu +++ b/cpp/src/strings/substring.cu @@ -27,6 +27,7 @@ #include #include #include + #include namespace cudf { diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index 1fc9ff7f813..4cc5d2bcba8 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -23,6 +23,7 @@ #include #include #include + #include #include diff --git a/cpp/src/strings/wrap.cu b/cpp/src/strings/wrap.cu index 181283c5e34..c61fd0797a4 100644 --- a/cpp/src/strings/wrap.cu +++ b/cpp/src/strings/wrap.cu @@ -15,6 +15,8 @@ */ #include +#include + #include #include #include @@ -25,7 +27,6 @@ #include #include #include -#include namespace cudf { namespace strings { diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu index 4263c5f1864..8da94e69da9 100644 --- a/cpp/src/text/replace.cu +++ b/cpp/src/text/replace.cu @@ -23,9 +23,12 @@ #include #include #include + #include #include + #include + #include namespace nvtext { diff --git a/cpp/src/text/stemmer.cu b/cpp/src/text/stemmer.cu index 1521dc90dae..8810ea759e7 100644 --- a/cpp/src/text/stemmer.cu +++ b/cpp/src/text/stemmer.cu @@ -23,6 +23,7 @@ #include #include #include + #include #include diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index e96f6e4f004..e8cc606865b 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
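The patches that follow continue the same mechanical conversion applied above: internal detail functions take an rmm::cuda_stream_view placed before the rmm::mr::device_memory_resource* argument, detail headers default the stream to rmm::cuda_stream_default, stream.synchronize() replaces cudaStreamSynchronize(), and stream.value() is used only where a raw cudaStream_t is still required (kernel launches, Thrust execution policies, CUDA runtime calls). A minimal sketch of that convention is shown below; the function detail::do_work and the kernel call are hypothetical illustrations, not part of the patch series.

    #include <rmm/cuda_stream_view.hpp>
    #include <rmm/mr/device/per_device_resource.hpp>

    namespace cudf {
    namespace detail {

    // Internal (detail) API: stream parameter first, then memory resource, both defaulted.
    void do_work(
      rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
      rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

    }  // namespace detail
    }  // namespace cudf

    // Inside an implementation, the raw stream is extracted only at CUDA/Thrust boundaries:
    //   some_kernel<<<grid, block, 0, stream.value()>>>(...);
    //   thrust::transform(rmm::exec_policy(stream)->on(stream.value()), ...);
    //   stream.synchronize();  // instead of cudaStreamSynchronize(stream.value())
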
From 325ca52a79aec8488e4e3c89a9823279960817c8 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 5 Nov 2020 13:14:41 +1100 Subject: [PATCH 13/51] Update round to use detail::copy_bitmask --- cpp/src/round/round.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu index 9bc95175d9f..362ed4e8da0 100644 --- a/cpp/src/round/round.cu +++ b/cpp/src/round/round.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -124,7 +125,7 @@ struct round_fn { auto result = cudf::make_fixed_width_column(input.type(), // input.size(), - copy_bitmask(input, stream, mr), + detail::copy_bitmask(input, stream, mr), input.null_count(), stream, mr); From ad24fb7dd6a054b2c5f68ada49b6c21c12c786c7 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 5 Nov 2020 13:22:58 +1100 Subject: [PATCH 14/51] Use stream.synchronize() --- cpp/src/column/column_device_view.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/column/column_device_view.cu b/cpp/src/column/column_device_view.cu index 4250d63761f..fb54c9b0bcc 100644 --- a/cpp/src/column/column_device_view.cu +++ b/cpp/src/column/column_device_view.cu @@ -133,7 +133,7 @@ create_device_view_from_view(ColumnView const& source, rmm::cuda_stream_view str cudaMemcpyDefault, stream.value())); - CUDA_TRY(cudaStreamSynchronize(stream.value())); + stream.synchronize(); return result; } From 3b6b0aabbed5faea6c7eea22744f378cc826e378 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 5 Nov 2020 14:21:31 +1100 Subject: [PATCH 15/51] Convert concatenate to cuda_stream_view --- cpp/include/cudf/detail/concatenate.cuh | 6 +- cpp/include/cudf/detail/concatenate.hpp | 10 ++-- .../cudf/dictionary/detail/concatenate.hpp | 4 +- cpp/include/cudf/lists/detail/concatenate.hpp | 4 +- .../cudf/strings/detail/concatenate.hpp | 6 +- cpp/src/copying/concatenate.cu | 57 ++++++++++--------- cpp/src/dictionary/add_keys.cu | 4 +- cpp/src/dictionary/detail/concatenate.cu | 27 +++++---- cpp/src/dictionary/set_keys.cu | 2 +- cpp/src/interop/from_arrow.cpp | 2 +- cpp/src/join/hash_join.cu | 2 +- cpp/src/lists/copying/concatenate.cu | 23 ++++---- cpp/src/replace/replace.cu | 2 +- cpp/src/strings/copying/concatenate.cu | 22 ++++--- 14 files changed, 97 insertions(+), 74 deletions(-) diff --git a/cpp/include/cudf/detail/concatenate.cuh b/cpp/include/cudf/detail/concatenate.cuh index b379a5b81a2..a30ad6e853d 100644 --- a/cpp/include/cudf/detail/concatenate.cuh +++ b/cpp/include/cudf/detail/concatenate.cuh @@ -21,6 +21,8 @@ #include #include +#include + #include namespace cudf { @@ -36,7 +38,7 @@ void concatenate_masks(rmm::device_vector const& d_views, rmm::device_vector const& d_offsets, bitmask_type* dest_mask, size_type output_size, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @copydoc cudf::concatenate_masks(std::vector const&,bitmask_type*) @@ -45,7 +47,7 @@ void concatenate_masks(rmm::device_vector const& d_views, */ void concatenate_masks(std::vector const& views, bitmask_type* dest_mask, - cudaStream_t stream); + rmm::cuda_stream_view stream); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/concatenate.hpp b/cpp/include/cudf/detail/concatenate.hpp index 04faa0b11b5..43eb5203b37 100644 --- a/cpp/include/cudf/detail/concatenate.hpp +++ b/cpp/include/cudf/detail/concatenate.hpp @@ -19,6 +19,8 @@ #include #include +#include + #include namespace cudf { @@ -31,8 +33,8 @@ namespace detail { */ std::unique_ptr concatenate( std::vector const& 
columns_to_concat,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(),
-  cudaStream_t stream = 0);
+  rmm::cuda_stream_view stream = rmm::cuda_stream_default,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
  * @copydoc cudf::concatenate(std::vector<table_view> const&,rmm::mr::device_memory_resource*)
@@ -41,8 +43,8 @@
  */
 std::unique_ptr<table>
concatenate( std::vector const& tables_to_concat, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/dictionary/detail/concatenate.hpp b/cpp/include/cudf/dictionary/detail/concatenate.hpp index 82467c7cda3..ae2e0f0ba38 100644 --- a/cpp/include/cudf/dictionary/detail/concatenate.hpp +++ b/cpp/include/cudf/dictionary/detail/concatenate.hpp @@ -18,6 +18,8 @@ #include #include +#include + namespace cudf { namespace dictionary { namespace detail { @@ -35,7 +37,7 @@ namespace detail { */ std::unique_ptr concatenate( std::vector const& columns, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/lists/detail/concatenate.hpp b/cpp/include/cudf/lists/detail/concatenate.hpp index 580e37dcf6a..f9adc893b8e 100644 --- a/cpp/include/cudf/lists/detail/concatenate.hpp +++ b/cpp/include/cudf/lists/detail/concatenate.hpp @@ -19,6 +19,8 @@ #include #include +#include + namespace cudf { namespace lists { namespace detail { @@ -41,7 +43,7 @@ namespace detail { */ std::unique_ptr concatenate( std::vector const& columns, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/strings/detail/concatenate.hpp b/cpp/include/cudf/strings/detail/concatenate.hpp index a544e66a197..3e6fc6d67fc 100644 --- a/cpp/include/cudf/strings/detail/concatenate.hpp +++ b/cpp/include/cudf/strings/detail/concatenate.hpp @@ -19,6 +19,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -40,8 +42,8 @@ namespace detail { */ std::unique_ptr concatenate( std::vector const& columns, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace strings diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index 95a4d16673e..1063422bf73 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -33,6 +33,7 @@ #include #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { @@ -44,11 +45,11 @@ constexpr bool use_fused_kernel_heuristic(bool const has_nulls, size_t const num return has_nulls || num_columns > 4; } -auto create_device_views(std::vector const& views, cudaStream_t stream) +auto create_device_views(std::vector const& views, rmm::cuda_stream_view stream) { // Create device views for each input view - using CDViewPtr = - decltype(column_device_view::create(std::declval(), std::declval())); + using CDViewPtr = decltype( + column_device_view::create(std::declval(), std::declval())); auto device_view_owners = std::vector(views.size()); std::transform( views.cbegin(), views.cend(), device_view_owners.begin(), [stream](auto const& col) { @@ -130,11 +131,11 @@ void concatenate_masks(rmm::device_vector const& d_views, rmm::device_vector const& d_offsets, bitmask_type* dest_mask, size_type output_size, - cudaStream_t 
stream) + rmm::cuda_stream_view stream) { constexpr size_type block_size{256}; cudf::detail::grid_1d config(output_size, block_size); - concatenate_masks_kernel<<>>( + concatenate_masks_kernel<<>>( d_views.data().get(), d_offsets.data().get(), static_cast(d_views.size()), @@ -144,7 +145,7 @@ void concatenate_masks(rmm::device_vector const& d_views, void concatenate_masks(std::vector const& views, bitmask_type* dest_mask, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // Preprocess and upload inputs to device memory auto const device_views = create_device_views(views, stream); @@ -210,8 +211,8 @@ __global__ void fused_concatenate_kernel(column_device_view const* input_views, template std::unique_ptr fused_concatenate(std::vector const& views, bool const has_nulls, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using mask_policy = cudf::mask_allocation_policy; @@ -238,7 +239,7 @@ std::unique_ptr fused_concatenate(std::vector const& views, cudf::detail::grid_1d config(output_size, block_size); auto const kernel = has_nulls ? fused_concatenate_kernel : fused_concatenate_kernel; - kernel<<>>( + kernel<<>>( d_views.data().get(), d_offsets.data().get(), static_cast(d_views.size()), @@ -253,8 +254,8 @@ std::unique_ptr fused_concatenate(std::vector const& views, template std::unique_ptr for_each_concatenate(std::vector const& views, bool const has_nulls, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { size_type const total_element_count = std::accumulate(views.begin(), views.end(), 0, [](auto accumulator, auto const& v) { @@ -263,15 +264,17 @@ std::unique_ptr for_each_concatenate(std::vector const& vie using mask_policy = cudf::mask_allocation_policy; auto const policy = has_nulls ? mask_policy::ALWAYS : mask_policy::NEVER; - auto col = cudf::allocate_like(views.front(), total_element_count, policy, mr); + auto col = cudf::detail::allocate_like(views.front(), total_element_count, policy, stream, mr); col->set_null_count(0); // prevent null count from being materialized... 
auto m_view = col->mutable_view(); // ...when we take a mutable view auto count = 0; for (auto& v : views) { - thrust::copy( - rmm::exec_policy()->on(stream), v.begin(), v.end(), m_view.begin() + count); + thrust::copy(rmm::exec_policy(stream)->on(stream.value()), + v.begin(), + v.end(), + m_view.begin() + count); count += v.size(); } @@ -285,8 +288,8 @@ std::unique_ptr for_each_concatenate(std::vector const& vie struct concatenate_dispatch { std::vector const& views; + rmm::cuda_stream_view stream; rmm::mr::device_memory_resource* mr; - cudaStream_t stream; // fixed width template @@ -299,9 +302,9 @@ struct concatenate_dispatch { // Use a heuristic to guess when the fused kernel will be faster if (use_fused_kernel_heuristic(has_nulls, views.size())) { - return fused_concatenate(views, has_nulls, mr, stream); + return fused_concatenate(views, has_nulls, stream, mr); } else { - return for_each_concatenate(views, has_nulls, mr, stream); + return for_each_concatenate(views, has_nulls, stream, mr); } } }; @@ -315,7 +318,7 @@ std::unique_ptr concatenate_dispatch::operator()() template <> std::unique_ptr concatenate_dispatch::operator()() { - return cudf::strings::detail::concatenate(views, mr, stream); + return cudf::strings::detail::concatenate(views, stream, mr); } template <> @@ -326,8 +329,8 @@ std::unique_ptr concatenate_dispatch::operator()() // Concatenates the elements from a vector of column_views std::unique_ptr concatenate(std::vector const& columns_to_concat, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(not columns_to_concat.empty(), "Unexpected empty list of columns to concatenate."); @@ -343,12 +346,12 @@ std::unique_ptr concatenate(std::vector const& columns_to_c return empty_like(columns_to_concat.front()); } - return type_dispatcher(type, concatenate_dispatch{columns_to_concat, mr, stream}); + return type_dispatcher(type, concatenate_dispatch{columns_to_concat, stream, mr}); } std::unique_ptr
concatenate(std::vector<table_view> const& tables_to_concat,
-                                   rmm::mr::device_memory_resource* mr,
-                                   cudaStream_t stream)
+                                   rmm::cuda_stream_view stream,
+                                   rmm::mr::device_memory_resource* mr)
 {
   if (tables_to_concat.empty()) { return std::make_unique<table>(); }
@@ -368,7 +371,7 @@ std::unique_ptr<table> concatenate(std::vector<table_view> const& tables_to_conc
                    tables_to_concat.cend(),
                    std::back_inserter(cols),
                    [i](auto const& t) { return t.column(i); });
-    concat_columns.emplace_back(detail::concatenate(cols, mr, stream));
+    concat_columns.emplace_back(detail::concatenate(cols, stream, mr));
   }
   return std::make_unique<table>(std::move(concat_columns));
 }
@@ -394,7 +397,7 @@ rmm::device_buffer concatenate_masks(std::vector<column_view> const& views,
     return null_mask;
   }
   // no nulls, so return an empty device buffer
-  return rmm::device_buffer{0, (cudaStream_t)0, mr};
+  return rmm::device_buffer{0, rmm::cuda_stream_default, mr};
 }
 
 // Concatenates the elements from a vector of column_views
@@ -402,14 +405,14 @@ std::unique_ptr<column> concatenate(std::vector<column_view> const& columns_to_c
                                     rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::concatenate(columns_to_concat, mr, 0);
+  return detail::concatenate(columns_to_concat, rmm::cuda_stream_default, mr);
 }
 
 std::unique_ptr<table>
concatenate(std::vector const& tables_to_concat, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate(tables_to_concat, mr, 0); + return detail::concatenate(tables_to_concat, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index dc18afebb3b..5633dcfbc30 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -58,10 +58,10 @@ std::unique_ptr add_keys( // first, concatenate the keys together // [a,b,c,d,f] + [d,b,e] = [a,b,c,d,f,d,b,e] auto combined_keys = cudf::detail::concatenate( - std::vector{old_keys, new_keys}, rmm::mr::get_current_device_resource(), stream); + std::vector{old_keys, new_keys}, stream, rmm::mr::get_current_device_resource()); // sort and remove any duplicates from the combined keys // drop_duplicates([a,b,c,d,f,d,b,e]) = [a,b,c,d,e,f] - auto table_keys = cudf::detail::drop_duplicates(table_view{{*combined_keys}}, + auto table_keys = cudf::detail::drop_duplicates(table_view{{combined_keys->view()}}, std::vector{0}, // only one key column duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL, diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index 0baec216c55..b83de6575e8 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -24,10 +24,13 @@ #include #include +#include +#include + #include #include + #include -#include #include namespace cudf { @@ -84,7 +87,7 @@ struct compute_children_offsets_fn { * @param stream Stream used for allocating the output rmm::device_uvector. * @return Vector of offsets_pair objects for keys and indices. */ - rmm::device_uvector create_children_offsets(cudaStream_t stream) + rmm::device_uvector create_children_offsets(rmm::cuda_stream_view stream) { std::vector offsets(columns_ptrs.size()); thrust::transform_exclusive_scan( @@ -105,8 +108,8 @@ struct compute_children_offsets_fn { offsets.data(), offsets.size() * sizeof(offsets_pair), cudaMemcpyHostToDevice, - stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.value())); + stream.synchronize(); return d_offsets; } @@ -130,7 +133,7 @@ struct dispatch_compute_indices { column_view const& new_keys, offsets_pair const* d_offsets, size_type const* d_map_to_keys, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto keys_view = column_device_view::create(all_keys, stream); @@ -155,7 +158,7 @@ struct dispatch_compute_indices { auto result_itr = cudf::detail::indexalator_factory::make_output_iterator(result->mutable_view()); // new indices values are computed by matching the concatenated keys to the new key set - thrust::lower_bound(rmm::exec_policy(stream)->on(stream), + thrust::lower_bound(rmm::exec_policy(stream)->on(stream.value()), new_keys_view->begin(), new_keys_view->end(), all_itr, @@ -173,7 +176,7 @@ struct dispatch_compute_indices { column_view const&, offsets_pair const*, size_type const*, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource*) { CUDF_FAIL("list_view as keys for dictionary not supported"); @@ -183,7 +186,7 @@ struct dispatch_compute_indices { } // namespace std::unique_ptr concatenate(std::vector const& columns, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { // exception here is the same behavior as in cudf::concatenate @@ -202,7 +205,7 @@ std::unique_ptr concatenate(std::vector const& columns, return 
keys; }); auto all_keys = - cudf::detail::concatenate(keys_views, rmm::mr::get_current_device_resource(), stream); + cudf::detail::concatenate(keys_views, stream, rmm::mr::get_current_device_resource()); // sort keys and remove duplicates; // this becomes the keys child for the output dictionary column @@ -211,7 +214,7 @@ std::unique_ptr concatenate(std::vector const& columns, duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL, mr, - stream) + stream.value()) ->release(); std::unique_ptr keys_column(std::move(table_keys.front())); @@ -222,7 +225,7 @@ std::unique_ptr concatenate(std::vector const& columns, if (dict_view.is_empty()) return column_view{data_type{type_id::UINT32}, 0, nullptr}; return dict_view.get_indices_annotated(); // nicely includes validity mask and view offset }); - auto all_indices = cudf::detail::concatenate(indices_views, mr, stream); + auto all_indices = cudf::detail::concatenate(indices_views, stream, mr); auto const indices_size = all_indices->size(); // build a vector of values to map the old indices to the concatenated keys @@ -234,7 +237,7 @@ std::unique_ptr concatenate(std::vector const& columns, }); // the indices offsets (pair.second) are for building the map thrust::lower_bound( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), children_offsets.begin() + 1, children_offsets.end(), indices_itr, diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index e5be253fd1d..d95fdefe153 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -156,7 +156,7 @@ std::vector> match_dictionaries(std::vector keys(input.size()); std::transform(input.begin(), input.end(), keys.begin(), [](auto& col) { return col.keys(); }); - auto new_keys = cudf::detail::concatenate(keys, rmm::mr::get_current_device_resource(), stream); + auto new_keys = cudf::detail::concatenate(keys, stream, rmm::mr::get_current_device_resource()); auto keys_view = new_keys->view(); std::vector> result(input.size()); std::transform(input.begin(), input.end(), result.begin(), [keys_view, mr, stream](auto& col) { diff --git a/cpp/src/interop/from_arrow.cpp b/cpp/src/interop/from_arrow.cpp index 141c8121dff..045c1174b08 100644 --- a/cpp/src/interop/from_arrow.cpp +++ b/cpp/src/interop/from_arrow.cpp @@ -380,7 +380,7 @@ std::unique_ptr
from_arrow(arrow::Table const& input_table, concat_columns.end(), std::back_inserter(column_views), [](auto const& col) { return col->view(); }); - return cudf::detail::concatenate(column_views, mr, stream); + return cudf::detail::concatenate(column_views, stream, mr); }); return std::make_unique
(std::move(columns)); diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 120777abf96..456e26a7cae 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -425,7 +425,7 @@ std::pair, std::unique_ptr
> construct_join_output_ rmm::mr::get_current_device_resource(), stream); common_table = cudf::detail::concatenate( - {common_from_build->view(), common_from_probe->view()}, mr, stream); + {common_from_build->view(), common_from_probe->view()}, stream, mr); } joined_indices = concatenate_vector_pairs(complement_indices, joined_indices); } else { diff --git a/cpp/src/lists/copying/concatenate.cu b/cpp/src/lists/copying/concatenate.cu index 4fddf8f3ce9..4fc1ffce1ec 100644 --- a/cpp/src/lists/copying/concatenate.cu +++ b/cpp/src/lists/copying/concatenate.cu @@ -23,6 +23,9 @@ #include #include #include + +#include + #include namespace cudf { @@ -46,7 +49,7 @@ namespace { */ std::unique_ptr merge_offsets(std::vector const& columns, size_type total_list_count, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { // outgoing offsets @@ -61,18 +64,18 @@ std::unique_ptr merge_offsets(std::vector const& colu std::for_each(columns.begin(), columns.end(), [&](lists_column_view const& c) { if (c.size() > 0) { // handle sliced columns - int const local_shift = - shift - - (c.offset() > 0 ? cudf::detail::get_value(c.offsets(), c.offset(), stream) : 0); - column_device_view offsets(c.offsets(), 0, 0); + int const local_shift = shift - (c.offset() > 0 ? cudf::detail::get_value( + c.offsets(), c.offset(), stream.value()) + : 0); + column_device_view offsets(c.offsets(), nullptr, nullptr); thrust::transform( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), offsets.begin() + c.offset(), offsets.begin() + c.offset() + c.size() + 1, d_merged_offsets.begin() + count, [local_shift] __device__(size_type offset) { return offset + local_shift; }); - shift += c.get_sliced_child(stream).size(); + shift += c.get_sliced_child(stream.value()).size(); count += c.size(); } }); @@ -88,7 +91,7 @@ std::unique_ptr merge_offsets(std::vector const& colu */ std::unique_ptr concatenate( std::vector const& columns, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { std::vector lists_columns; @@ -107,9 +110,9 @@ std::unique_ptr concatenate( [&total_list_count, &children, stream](lists_column_view const& l) { // count total # of lists total_list_count += l.size(); - children.push_back(l.get_sliced_child(stream)); + children.push_back(l.get_sliced_child(stream.value())); }); - auto data = cudf::detail::concatenate(children, mr, stream); + auto data = cudf::detail::concatenate(children, stream, mr); // merge offsets auto offsets = merge_offsets(lists_columns, total_list_count, stream, mr); diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 01f75f41cfc..eef397b6a13 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -453,7 +453,7 @@ std::unique_ptr replace_kernel_forwarder::operator()view(), mr, stream); }(); auto matched_view = cudf::dictionary_column_view(matched_input->view()); diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 9a8a64f2f99..be56c256bfa 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -24,6 +24,8 @@ #include #include +#include + #include #include #include @@ -61,7 +63,8 @@ struct chars_size_transform { } }; -auto create_strings_device_views(std::vector const& views, cudaStream_t stream) +auto create_strings_device_views(std::vector const& views, + rmm::cuda_stream_view stream) { // Create device views for 
each input view using CDViewPtr = @@ -101,12 +104,12 @@ auto create_strings_device_views(std::vector const& views, cudaStre // error: the default constructor of "cudf::column_device_view" cannot be // referenced -- it is a deleted function auto d_partition_offsets = rmm::device_vector(views.size() + 1); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), d_views.cbegin(), d_views.cend(), std::next(d_partition_offsets.begin()), chars_size_transform{}); - thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream.value()), d_partition_offsets.cbegin(), d_partition_offsets.cend(), d_partition_offsets.begin()); @@ -213,8 +216,8 @@ __global__ void fused_concatenate_string_chars_kernel(column_device_view const* } std::unique_ptr concatenate(std::vector const& columns, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Compute output sizes auto const device_views = create_strings_device_views(columns, stream); @@ -225,7 +228,7 @@ std::unique_ptr concatenate(std::vector const& columns, auto const total_bytes = std::get<5>(device_views); auto const offsets_count = strings_count + 1; - if (strings_count == 0) { return make_empty_strings_column(mr, stream); } + if (strings_count == 0) { return make_empty_strings_column(mr, stream.value()); } CUDF_EXPECTS(offsets_count <= std::numeric_limits::max(), "total number of strings is too large for cudf column"); @@ -261,7 +264,7 @@ std::unique_ptr concatenate(std::vector const& columns, cudf::detail::grid_1d config(offsets_count, block_size); auto const kernel = has_nulls ? fused_concatenate_string_offset_kernel : fused_concatenate_string_offset_kernel; - kernel<<>>( + kernel<<>>( d_views.data().get(), d_input_offsets.data().get(), d_partition_offsets.data().get(), @@ -281,7 +284,7 @@ std::unique_ptr concatenate(std::vector const& columns, constexpr size_type block_size{256}; cudf::detail::grid_1d config(total_bytes, block_size); auto const kernel = fused_concatenate_string_chars_kernel; - kernel<<>>( + kernel<<>>( d_views.data().get(), d_partition_offsets.data().get(), static_cast(d_views.size()), @@ -303,7 +306,8 @@ std::unique_ptr concatenate(std::vector const& columns, // copy the chars column data auto d_chars = chars_child.data() + bytes_offset; size_type bytes = thrust::device_pointer_cast(d_offsets)[column_size] - bytes_offset; - CUDA_TRY(cudaMemcpyAsync(d_new_chars, d_chars, bytes, cudaMemcpyDeviceToDevice, stream)); + CUDA_TRY( + cudaMemcpyAsync(d_new_chars, d_chars, bytes, cudaMemcpyDeviceToDevice, stream.value())); // get ready for the next column d_new_chars += bytes; From a2edf78e0154ad011b449862376f771d29e3f133 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Fri, 6 Nov 2020 13:13:43 +1100 Subject: [PATCH 16/51] Convert table and copy_if to cuda_stream_view --- cpp/include/cudf/detail/copy_if.cuh | 53 ++++++++++--------- cpp/include/cudf/table/table.hpp | 6 ++- cpp/src/dictionary/remove_keys.cu | 2 +- .../stream_compaction/apply_boolean_mask.cu | 4 +- cpp/src/stream_compaction/drop_nans.cu | 2 +- cpp/src/stream_compaction/drop_nulls.cu | 2 +- cpp/src/table/table.cpp | 6 ++- cpp/src/text/generate_ngrams.cu | 4 +- 8 files changed, 42 insertions(+), 37 deletions(-) diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index abae1c33d4f..05a84a238ff 100644 --- a/cpp/include/cudf/detail/copy_if.cuh 
+++ b/cpp/include/cudf/detail/copy_if.cuh @@ -32,6 +32,7 @@ #include #include +#include #include #include #include @@ -210,8 +211,8 @@ struct scatter_gather_functor { cudf::size_type const* block_offsets, Filter filter, cudf::size_type per_thread, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto output_column = cudf::detail::allocate_like( input, output_size, cudf::mask_allocation_policy::RETAIN, stream, mr); @@ -231,18 +232,18 @@ struct scatter_gather_functor { CUDA_TRY(cudaMemsetAsync(static_cast(output.null_mask()), 0, cudf::bitmask_allocation_size_bytes(output.size()), - stream)); + stream.value())); } auto output_device_view = cudf::mutable_column_device_view::create(output, stream); auto input_device_view = cudf::column_device_view::create(input, stream); - scatter<<>>(*output_device_view, - null_count.data(), - *input_device_view, - block_offsets, - input.size(), - per_thread, - filter); + scatter<<>>(*output_device_view, + null_count.data(), + *input_device_view, + block_offsets, + input.size(), + per_thread, + filter); if (has_valid) { output_column->set_null_count(null_count.value(stream)); } return output_column; @@ -256,19 +257,19 @@ struct scatter_gather_functor { cudf::size_type const* block_offsets, Filter filter, cudf::size_type per_thread, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { rmm::device_uvector indices(output_size, stream); - thrust::copy_if(rmm::exec_policy(stream)->on(stream), + thrust::copy_if(rmm::exec_policy(stream)->on(stream.value()), thrust::counting_iterator(0), thrust::counting_iterator(input.size()), indices.begin(), filter); auto output_table = cudf::detail::gather( - cudf::table_view{{input}}, indices.begin(), indices.end(), false, mr, stream); + cudf::table_view{{input}}, indices.begin(), indices.end(), false, mr, stream.value()); // There will be only one column return std::make_unique(std::move(output_table->get_column(0))); @@ -281,8 +282,8 @@ struct scatter_gather_functor { cudf::size_type const* block_offsets, Filter filter, cudf::size_type per_thread, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_FAIL("fixed_point type not supported for this operation yet"); } @@ -309,8 +310,8 @@ template std::unique_ptr
copy_if( table_view const& input, Filter filter, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_FUNC_RANGE(); @@ -326,12 +327,12 @@ std::unique_ptr
copy_if( rmm::device_uvector block_offsets(grid.num_blocks + 1, stream); // 1. Find the count of elements in each block that "pass" the mask - compute_block_counts<<>>( + compute_block_counts<<>>( block_counts.begin(), input.num_rows(), per_thread, filter); // initialize just the first element of block_offsets to 0 since the InclusiveSum below // starts at the second element. - CUDA_TRY(cudaMemsetAsync(block_offsets.begin(), 0, sizeof(cudf::size_type), stream)); + CUDA_TRY(cudaMemsetAsync(block_offsets.begin(), 0, sizeof(cudf::size_type), stream.value())); // 2. Find the offset for each block's output using a scan of block counts if (grid.num_blocks > 1) { @@ -342,7 +343,7 @@ std::unique_ptr
copy_if( block_counts.begin(), block_offsets.begin() + 1, grid.num_blocks, - stream); + stream.value()); rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); // Run exclusive prefix sum @@ -351,7 +352,7 @@ std::unique_ptr
copy_if( block_counts.begin(), block_offsets.begin() + 1, grid.num_blocks, - stream); + stream.value()); } // As it is InclusiveSum, last value in block_offsets will be output_size @@ -362,9 +363,9 @@ std::unique_ptr
copy_if( grid.num_blocks > 1 ? block_offsets.begin() + grid.num_blocks : block_counts.begin(), sizeof(cudf::size_type), cudaMemcpyDefault, - stream)); + stream.value())); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); if (output_size == input.num_rows()) { return std::make_unique
(input, stream, mr); @@ -378,8 +379,8 @@ std::unique_ptr
copy_if( block_offsets.begin(), filter, per_thread, - mr, - stream); + stream, + mr); }); return std::make_unique
(std::move(out_columns)); diff --git a/cpp/include/cudf/table/table.hpp b/cpp/include/cudf/table/table.hpp index afce337303f..e760e18c6d6 100644 --- a/cpp/include/cudf/table/table.hpp +++ b/cpp/include/cudf/table/table.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,8 @@ #include #include +#include + #include #include @@ -63,7 +65,7 @@ class table { * @param mr Device memory resource used for allocating the device memory for the new columns **/ table(table_view view, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu index e6b179c0a7d..3913d68b10f 100644 --- a/cpp/src/dictionary/remove_keys.cu +++ b/cpp/src/dictionary/remove_keys.cu @@ -85,7 +85,7 @@ std::unique_ptr remove_keys_fn( // copy the non-removed keys ( keys_to_keep_fn(idx)==true ) auto table_keys = cudf::detail::copy_if( - table_view{{keys_view, keys_positions->view()}}, keys_to_keep_fn, mr, stream) + table_view{{keys_view, keys_positions->view()}}, keys_to_keep_fn, stream, mr) ->release(); auto const filtered_view = table_keys[1]->view(); auto filtered_itr = cudf::detail::indexalator_factory::make_input_iterator(filtered_view); diff --git a/cpp/src/stream_compaction/apply_boolean_mask.cu b/cpp/src/stream_compaction/apply_boolean_mask.cu index a6f548b7c5d..ccb31898e95 100644 --- a/cpp/src/stream_compaction/apply_boolean_mask.cu +++ b/cpp/src/stream_compaction/apply_boolean_mask.cu @@ -74,9 +74,9 @@ std::unique_ptr
apply_boolean_mask(table_view const& input, auto device_boolean_mask = cudf::column_device_view::create(boolean_mask, stream); if (boolean_mask.has_nulls()) { - return detail::copy_if(input, boolean_mask_filter{*device_boolean_mask}, mr, stream); + return detail::copy_if(input, boolean_mask_filter{*device_boolean_mask}, stream, mr); } else { - return detail::copy_if(input, boolean_mask_filter{*device_boolean_mask}, mr, stream); + return detail::copy_if(input, boolean_mask_filter{*device_boolean_mask}, stream, mr); } } diff --git a/cpp/src/stream_compaction/drop_nans.cu b/cpp/src/stream_compaction/drop_nans.cu index 2ba0f05b45a..ddd5d0c9934 100644 --- a/cpp/src/stream_compaction/drop_nans.cu +++ b/cpp/src/stream_compaction/drop_nans.cu @@ -99,7 +99,7 @@ std::unique_ptr
drop_nans(table_view const& input, auto keys_device_view = cudf::table_device_view::create(keys_view, stream); return cudf::detail::copy_if( - input, valid_table_filter{*keys_device_view, keep_threshold}, mr, stream); + input, valid_table_filter{*keys_device_view, keep_threshold}, stream, mr); } } // namespace detail diff --git a/cpp/src/stream_compaction/drop_nulls.cu b/cpp/src/stream_compaction/drop_nulls.cu index fb487e6d6e4..49708b635d8 100644 --- a/cpp/src/stream_compaction/drop_nulls.cu +++ b/cpp/src/stream_compaction/drop_nulls.cu @@ -72,7 +72,7 @@ std::unique_ptr
drop_nulls(table_view const& input, auto keys_device_view = cudf::table_device_view::create(keys_view, stream); return cudf::detail::copy_if( - input, valid_table_filter{*keys_device_view, keep_threshold}, mr, stream); + input, valid_table_filter{*keys_device_view, keep_threshold}, stream, mr); } } // namespace detail diff --git a/cpp/src/table/table.cpp b/cpp/src/table/table.cpp index aca414e0df4..afda6313254 100644 --- a/cpp/src/table/table.cpp +++ b/cpp/src/table/table.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,8 @@ #include #include +#include + namespace cudf { // Copy the columns from another table @@ -46,7 +48,7 @@ table::table(std::vector>&& columns) : _columns{std::mov } // Copy the contents of a `table_view` -table::table(table_view view, cudaStream_t stream, rmm::mr::device_memory_resource* mr) +table::table(table_view view, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) : _num_rows{view.num_rows()} { CUDF_FUNC_RANGE(); diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index c3e338b59d8..792b94aaee6 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -104,8 +104,8 @@ std::unique_ptr generate_ngrams( if (d_strings.is_null(idx)) return false; return !d_strings.element(idx).empty(); }, - mr, - stream) + stream, + mr) ->release(); strings_count = table_offsets.front()->size() - 1; return std::move(table_offsets.front()); From 94b1627ec7c170e6523961b2d8d689a4b23d5194 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Fri, 6 Nov 2020 13:23:51 +1100 Subject: [PATCH 17/51] Changelog for #6648 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e46bd535dc7..4cf4cfae1b1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -49,6 +49,7 @@ - PR #6610 Add ability to set scalar values in `cudf.DataFrame` - PR #6612 Update JNI to new RMM cuda_stream_view API - PR #6646 Replace `cudaStream_t` with `rmm::cuda_stream_view` (part 1) +- PR #6648 Replace `cudaStream_t` with `rmm::cuda_stream_view` (part 2) - PR #6579 Update scatter APIs to use reference wrapper / const scalar - PR #6614 Add support for conversion to Pandas nullable dtypes and fix related issue in `cudf.to_json` - PR #6622 Update `to_pandas` api docs From c497fcc749dad1c87bc108b516406cad6837fafe Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Fri, 6 Nov 2020 13:27:12 +1100 Subject: [PATCH 18/51] Convert copy_range to cuda_stream_view --- cpp/include/cudf/detail/copy_range.cuh | 16 ++--- .../cudf/strings/detail/copy_range.cuh | 15 ++--- cpp/src/copying/copy_range.cu | 59 ++++++++++--------- 3 files changed, 48 insertions(+), 42 deletions(-) diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh index 599b7de358d..afe67540c42 100644 --- a/cpp/include/cudf/detail/copy_range.cuh +++ b/cpp/include/cudf/detail/copy_range.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
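As a point of reference, a minimal caller-side sketch of the convention this copy_range patch (and the series as a whole) adopts: the stream parameter moves directly in front of the memory resource, and rmm::cuda_stream_default replaces the literal 0. The wrapper name shift_into and the assumption that target has at least source.size() rows are illustrative only; the detail::copy_range argument order is the one introduced in this patch.

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/detail/copy_range.cuh>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <memory>

// Copies all rows of `source` over the first source.size() rows of `target`,
// returning a new column; assumes target.size() >= source.size().
std::unique_ptr<cudf::column> shift_into(cudf::column_view const& source,
                                         cudf::column_view const& target,
                                         rmm::cuda_stream_view stream)
{
  // stream now precedes mr in the converted detail API
  return cudf::detail::copy_range(source,
                                  target,
                                  0,              // source_begin
                                  source.size(),  // source_end (exclusive)
                                  0,              // target_begin
                                  stream,
                                  rmm::mr::get_current_device_resource());
}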
@@ -22,6 +22,8 @@ #include #include #include + +#include #include #include @@ -134,7 +136,7 @@ void copy_range(SourceValueIterator source_value_begin, mutable_column_view& target, size_type target_begin, size_type target_end, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default) { CUDF_EXPECTS((target_begin <= target_end) && (target_begin >= 0) && (target_begin < target.size()) && (target_end <= target.size()), @@ -162,7 +164,7 @@ void copy_range(SourceValueIterator source_value_begin, auto kernel = copy_range_kernel; - kernel<<>>( + kernel<<>>( source_value_begin, source_validity_begin, *mutable_column_device_view::create(target, stream), @@ -174,7 +176,7 @@ void copy_range(SourceValueIterator source_value_begin, } else { auto kernel = copy_range_kernel; - kernel<<>>( + kernel<<>>( source_value_begin, source_validity_begin, *mutable_column_device_view::create(target, stream), @@ -195,7 +197,7 @@ void copy_range_in_place(column_view const& source, size_type source_begin, size_type source_end, size_type target_begin, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @copydoc cudf::copy_range @@ -208,8 +210,8 @@ std::unique_ptr copy_range( size_type source_begin, size_type source_end, size_type target_begin, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/strings/detail/copy_range.cuh b/cpp/include/cudf/strings/detail/copy_range.cuh index b4b586d2b19..fe0d1dcf2a7 100644 --- a/cpp/include/cudf/strings/detail/copy_range.cuh +++ b/cpp/include/cudf/strings/detail/copy_range.cuh @@ -22,6 +22,7 @@ #include #include #include +#include "rmm/cuda_stream_view.hpp" #include #include @@ -100,8 +101,8 @@ std::unique_ptr copy_range( strings_column_view const& target, size_type target_begin, size_type target_end, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS( (target_begin >= 0) && (target_begin < target.size()) && (target_end <= target.size()), @@ -154,7 +155,7 @@ std::unique_ptr copy_range( source_value_begin, source_validity_begin, d_target, target_begin, target_end}); p_offsets_column = detail::make_offsets_child_column( - string_size_begin, string_size_begin + target.size(), mr, stream); + string_size_begin, string_size_begin + target.size(), mr, stream.value()); } else if (null_count > 0) { // check validities for source only auto string_size_begin = thrust::make_transform_iterator( thrust::make_counting_iterator(0), @@ -162,7 +163,7 @@ std::unique_ptr copy_range( source_value_begin, source_validity_begin, d_target, target_begin, target_end}); p_offsets_column = detail::make_offsets_child_column( - string_size_begin, string_size_begin + target.size(), mr, stream); + string_size_begin, string_size_begin + target.size(), mr, stream.value()); } else { // no need to check validities auto string_size_begin = thrust::make_transform_iterator( thrust::make_counting_iterator(0), @@ -170,7 +171,7 @@ std::unique_ptr copy_range( source_value_begin, source_validity_begin, d_target, target_begin, target_end}); p_offsets_column = detail::make_offsets_child_column( - string_size_begin, 
string_size_begin + target.size(), mr, stream); + string_size_begin, string_size_begin + target.size(), mr, stream.value()); } // create the chars column @@ -180,12 +181,12 @@ std::unique_ptr copy_range( auto chars_bytes = p_offsets[target.size()]; auto p_chars_column = strings::detail::create_chars_child_column( - target.size(), null_count, chars_bytes, mr, stream); + target.size(), null_count, chars_bytes, mr, stream.value()); // copy to the chars column auto p_chars = (p_chars_column->mutable_view()).template data(); - thrust::for_each(rmm::exec_policy(stream)->on(stream), + thrust::for_each(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(target.size()), [source_value_begin, diff --git a/cpp/src/copying/copy_range.cu b/cpp/src/copying/copy_range.cu index 95be6cb8bbc..1df9fc78aa2 100644 --- a/cpp/src/copying/copy_range.cu +++ b/cpp/src/copying/copy_range.cu @@ -35,6 +35,7 @@ #include #include +#include "rmm/cuda_stream_view.hpp" namespace { template @@ -43,7 +44,7 @@ void in_place_copy_range(cudf::column_view const& source, cudf::size_type source_begin, cudf::size_type source_end, cudf::size_type target_begin, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream) { auto p_source_device_view = cudf::column_device_view::create(source, stream); if (source.has_nulls()) { @@ -72,7 +73,7 @@ struct in_place_copy_range_dispatch { std::enable_if_t(), void> operator()(cudf::size_type source_begin, cudf::size_type source_end, cudf::size_type target_begin, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream) { in_place_copy_range(source, target, source_begin, source_end, target_begin, stream); } @@ -81,7 +82,7 @@ struct in_place_copy_range_dispatch { std::enable_if_t(), void> operator()(cudf::size_type source_begin, cudf::size_type source_end, cudf::size_type target_begin, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream) { CUDF_FAIL("in-place copy does not work for variable width types."); } @@ -96,8 +97,8 @@ struct out_of_place_copy_range_dispatch { cudf::size_type source_begin, cudf::size_type source_end, cudf::size_type target_begin, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto p_ret = std::make_unique(target, stream, mr); if ((!p_ret->nullable()) && source.has_nulls(source_begin, source_end)) { @@ -119,8 +120,8 @@ std::unique_ptr out_of_place_copy_range_dispatch::operator() out_of_place_copy_range_dispatch::operator()begin() + source_begin, @@ -142,8 +143,8 @@ std::unique_ptr out_of_place_copy_range_dispatch::operator() out_of_place_copy_range_dispatch::operator() out_of_place_copy_range_dispatch::operator() out_of_place_copy_range_dispatch::operator() out_of_place_copy_range_dispatch::operator()view()); auto source_matched = cudf::dictionary::detail::set_keys( - dict_source, target_view.keys(), rmm::mr::get_current_device_resource(), stream); + dict_source, target_view.keys(), rmm::mr::get_current_device_resource(), stream.value()); auto const source_view = cudf::dictionary_column_view(source_matched->view()); // build the new indices by calling in_place_copy_range on just the indices @@ -230,8 +231,8 @@ std::unique_ptr out_of_place_copy_range_dispatch::operator() copy_range(column_view const& source, size_type source_begin, size_type source_end, size_type target_begin, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + 
rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS((source_begin >= 0) && (source_end <= source.size()) && (source_begin <= source_end) && (target_begin >= 0) && @@ -286,8 +287,8 @@ std::unique_ptr copy_range(column_view const& source, source_begin, source_end, target_begin, - mr, - stream); + stream, + mr); } } // namespace detail @@ -299,7 +300,8 @@ void copy_range_in_place(column_view const& source, size_type target_begin) { CUDF_FUNC_RANGE(); - return detail::copy_range_in_place(source, target, source_begin, source_end, target_begin, 0); + return detail::copy_range_in_place( + source, target, source_begin, source_end, target_begin, rmm::cuda_stream_default); } std::unique_ptr copy_range(column_view const& source, @@ -310,7 +312,8 @@ std::unique_ptr copy_range(column_view const& source, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::copy_range(source, target, source_begin, source_end, target_begin, mr, 0); + return detail::copy_range( + source, target, source_begin, source_end, target_begin, rmm::cuda_stream_default, mr); } } // namespace cudf From 03aec6aaa698b0994aa528efa7caeff652c32a7f Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Fri, 6 Nov 2020 13:38:04 +1100 Subject: [PATCH 19/51] Convert fill to cuda_stream_view --- cpp/include/cudf/detail/fill.hpp | 10 +++-- cpp/include/cudf/strings/detail/fill.hpp | 8 ++-- cpp/src/column/column_factories.cpp | 2 +- cpp/src/filling/fill.cu | 51 +++++++++++++----------- cpp/src/strings/filling/fill.cu | 20 +++++----- 5 files changed, 50 insertions(+), 41 deletions(-) diff --git a/cpp/include/cudf/detail/fill.hpp b/cpp/include/cudf/detail/fill.hpp index 24438c3af06..cfaf323ab12 100644 --- a/cpp/include/cudf/detail/fill.hpp +++ b/cpp/include/cudf/detail/fill.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,8 @@ #include #include +#include + #include namespace cudf { @@ -33,7 +35,7 @@ void fill_in_place(mutable_column_view& destination, size_type begin, size_type end, scalar const& value, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @copydoc cudf::fill @@ -45,8 +47,8 @@ std::unique_ptr fill( size_type begin, size_type end, scalar const& value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/strings/detail/fill.hpp b/cpp/include/cudf/strings/detail/fill.hpp index 519880df561..1ddf0ad5cdf 100644 --- a/cpp/include/cudf/strings/detail/fill.hpp +++ b/cpp/include/cudf/strings/detail/fill.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
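The fill conversion follows the same shape, and because the new stream parameter is defaulted to rmm::cuda_stream_default ahead of the defaulted mr, internal callers can pass just a stream. A small sketch under those assumptions; zero_prefix and the INT32 input column are hypothetical, while the detail::fill and scalar constructor argument orders come from this series.

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/detail/fill.hpp>
#include <cudf/scalar/scalar.hpp>
#include <rmm/cuda_stream_view.hpp>

#include <memory>

// Returns a copy of `input` (assumed to be an INT32 column) with rows [0, n) set to zero.
std::unique_ptr<cudf::column> zero_prefix(cudf::column_view const& input,
                                          cudf::size_type n,
                                          rmm::cuda_stream_view stream)
{
  cudf::numeric_scalar<int32_t> zero{0, true, stream};  // scalar ctors also take the stream
  // stream is passed explicitly; mr falls back to its default argument
  return cudf::detail::fill(input, 0, n, zero, stream);
}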
@@ -19,6 +19,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -44,8 +46,8 @@ std::unique_ptr fill( size_type begin, size_type end, string_scalar const& value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace strings diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp index 72943313dc2..a79277ca21e 100644 --- a/cpp/src/column/column_factories.cpp +++ b/cpp/src/column/column_factories.cpp @@ -199,7 +199,7 @@ std::unique_ptr column_from_scalar_dispatch::operator()(null_mask.data()), size}; auto sv = static_cast const&>(value); // fill the column with the scalar - auto output = strings::detail::fill(strings_column_view(sc), 0, size, sv, mr, stream.value()); + auto output = strings::detail::fill(strings_column_view(sc), 0, size, sv, stream, mr); output->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); // should be no nulls return output; } diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu index f801ba0eab4..6fba9bc01a5 100644 --- a/cpp/src/filling/fill.cu +++ b/cpp/src/filling/fill.cu @@ -32,6 +32,9 @@ #include #include +#include +#include + #include namespace { @@ -40,7 +43,7 @@ void in_place_fill(cudf::mutable_column_view& destination, cudf::size_type begin, cudf::size_type end, cudf::scalar const& value, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream) { using ScalarType = cudf::scalar_type_t; auto p_scalar = static_cast(&value); @@ -61,7 +64,7 @@ struct in_place_fill_range_dispatch { template std::enable_if_t(), void> operator()(cudf::size_type begin, cudf::size_type end, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream) { in_place_fill(destination, begin, end, value, stream); } @@ -69,7 +72,7 @@ struct in_place_fill_range_dispatch { template std::enable_if_t(), void> operator()(cudf::size_type begin, cudf::size_type end, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream) { CUDF_FAIL("in-place fill does not work for variable width types."); } @@ -83,8 +86,8 @@ struct out_of_place_fill_range_dispatch { std::unique_ptr operator()( cudf::size_type begin, cudf::size_type end, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(input.type() == value.type(), "Data type mismatch."); auto p_ret = std::make_unique(input, stream, mr); @@ -108,8 +111,8 @@ template <> std::unique_ptr out_of_place_fill_range_dispatch::operator()( cudf::size_type begin, cudf::size_type end, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("list_view not supported yet"); } @@ -118,8 +121,8 @@ template <> std::unique_ptr out_of_place_fill_range_dispatch::operator()( cudf::size_type begin, cudf::size_type end, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("struct_view not supported yet"); } @@ -128,22 +131,22 @@ template <> std::unique_ptr out_of_place_fill_range_dispatch::operator()( cudf::size_type begin, cudf::size_type end, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + 
rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(input.type() == value.type(), "Data type mismatch."); using ScalarType = cudf::scalar_type_t; auto p_scalar = static_cast(&value); return cudf::strings::detail::fill( - cudf::strings_column_view(input), begin, end, *p_scalar, mr, stream); + cudf::strings_column_view(input), begin, end, *p_scalar, stream, mr); } template <> std::unique_ptr out_of_place_fill_range_dispatch::operator()( cudf::size_type begin, cudf::size_type end, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.is_empty()) return std::make_unique(input, stream, mr); cudf::dictionary_column_view const target(input); @@ -162,21 +165,21 @@ std::unique_ptr out_of_place_fill_range_dispatch::operator()view(), mr, stream); + cudf::dictionary::detail::add_keys(target, scalar_column->view(), mr, stream.value()); cudf::column_view const target_indices = cudf::dictionary_column_view(target_matched->view()).get_indices_annotated(); // get the index of the key just added auto index_of_value = cudf::dictionary::detail::get_index( - target_matched->view(), value, rmm::mr::get_current_device_resource(), stream); + target_matched->view(), value, rmm::mr::get_current_device_resource(), stream.value()); // now call fill using just the indices column and the new index auto new_indices = cudf::type_dispatcher(target_indices.type(), out_of_place_fill_range_dispatch{*index_of_value, target_indices}, begin, end, - mr, - stream); + stream, + mr); auto const indices_type = new_indices->type(); auto const output_size = new_indices->size(); // record these auto const null_count = new_indices->null_count(); // before the release() @@ -206,7 +209,7 @@ void fill_in_place(mutable_column_view& destination, size_type begin, size_type end, scalar const& value, - cudaStream_t stream) + rmm::cuda_stream_view stream) { CUDF_EXPECTS(cudf::is_fixed_width(destination.type()) == true, "In-place fill does not support variable-sized types."); @@ -228,13 +231,13 @@ std::unique_ptr fill(column_view const& input, size_type begin, size_type end, scalar const& value, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS((begin >= 0) && (end <= input.size()) && (begin <= end), "Range is out of bounds."); return cudf::type_dispatcher( - input.type(), out_of_place_fill_range_dispatch{value, input}, begin, end, mr, stream); + input.type(), out_of_place_fill_range_dispatch{value, input}, begin, end, stream, mr); } } // namespace detail @@ -245,7 +248,7 @@ void fill_in_place(mutable_column_view& destination, scalar const& value) { CUDF_FUNC_RANGE(); - return detail::fill_in_place(destination, begin, end, value, 0); + return detail::fill_in_place(destination, begin, end, value, rmm::cuda_stream_default); } std::unique_ptr fill(column_view const& input, @@ -255,7 +258,7 @@ std::unique_ptr fill(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::fill(input, begin, end, value, mr, 0); + return detail::fill(input, begin, end, value, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/strings/filling/fill.cu b/cpp/src/strings/filling/fill.cu index fa36de38e22..5ed3de2c888 100644 --- a/cpp/src/strings/filling/fill.cu +++ b/cpp/src/strings/filling/fill.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,6 +26,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -34,11 +36,11 @@ std::unique_ptr fill( size_type begin, size_type end, string_scalar const& value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = strings.size(); - if (strings_count == 0) return detail::make_empty_strings_column(mr, stream); + if (strings_count == 0) return detail::make_empty_strings_column(mr, stream.value()); CUDF_EXPECTS((begin >= 0) && (end <= strings_count), "Parameters [begin,end) are outside the range of the provided strings column"); CUDF_EXPECTS(begin <= end, "Parameters [begin,end) have invalid range values"); @@ -72,17 +74,17 @@ std::unique_ptr fill( auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), offsets_transformer); auto offsets_column = detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream.value()); auto d_offsets = offsets_column->view().data(); // create the chars column - size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; - auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, mr, stream); + size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; + auto chars_column = strings::detail::create_chars_child_column( + strings_count, null_count, bytes, mr, stream.value()); // fill the chars column auto d_chars = chars_column->mutable_view().data(); thrust::for_each_n( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [d_strings, begin, end, d_value, d_offsets, d_chars] __device__(size_type idx) { From 927378b27e6d437bb79c8786fe9c379ca4931806 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Fri, 6 Nov 2020 15:35:57 +1100 Subject: [PATCH 20/51] Convert gather to cuda_stream_view --- cpp/include/cudf/detail/copy_if.cuh | 2 +- cpp/include/cudf/detail/gather.cuh | 41 ++++++++++---------- cpp/include/cudf/detail/gather.hpp | 21 +++++++++- cpp/include/cudf/detail/get_value.cuh | 11 ++++-- cpp/include/cudf/detail/indexalator.cuh | 2 +- cpp/include/cudf/detail/scatter.cuh | 2 +- cpp/include/cudf/lists/detail/gather.cuh | 22 ++++++----- cpp/include/cudf/lists/lists_column_view.hpp | 4 +- cpp/include/cudf/strings/detail/gather.cuh | 25 ++++++------ cpp/include/cudf/table/table_device_view.cuh | 14 ++++--- cpp/src/copying/gather.cu | 15 ++++--- cpp/src/copying/sample.cu | 6 +-- cpp/src/dictionary/add_keys.cu | 4 +- cpp/src/dictionary/decode.cu | 4 +- cpp/src/dictionary/remove_keys.cu | 4 +- cpp/src/filling/repeat.cu | 4 +- cpp/src/groupby/groupby.cu | 1 + cpp/src/groupby/hash/groupby.cu | 8 ++-- cpp/src/groupby/sort/group_argmax.cu | 4 +- cpp/src/groupby/sort/group_argmin.cu | 4 +- cpp/src/groupby/sort/group_nth_element.cu | 2 +- cpp/src/groupby/sort/groupby.cu | 8 ++-- cpp/src/groupby/sort/sort_helper.cu | 14 +++---- cpp/src/hash/hashing.cu | 2 +- cpp/src/join/hash_join.cu | 20 +++++----- cpp/src/join/semi_join.cu | 3 +- 
cpp/src/lists/copying/gather.cu | 8 ++-- cpp/src/lists/extract.cu | 4 +- cpp/src/lists/lists_column_view.cu | 4 +- cpp/src/partitioning/partitioning.cu | 2 +- cpp/src/partitioning/round_robin.cu | 10 ++--- cpp/src/quantiles/quantiles.cu | 11 ++++-- cpp/src/reshape/tile.cu | 2 +- cpp/src/rolling/rolling.cu | 4 +- cpp/src/sort/sort.cu | 4 +- cpp/src/stream_compaction/drop_duplicates.cu | 4 +- cpp/src/strings/copying/copying.cu | 4 +- cpp/src/strings/sorting/sorting.cu | 4 +- cpp/src/table/table_device_view.cu | 14 ++++--- cpp/src/transform/encode.cu | 4 +- 40 files changed, 187 insertions(+), 139 deletions(-) diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index 05a84a238ff..9daec13d578 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -269,7 +269,7 @@ struct scatter_gather_functor { filter); auto output_table = cudf::detail::gather( - cudf::table_view{{input}}, indices.begin(), indices.end(), false, mr, stream.value()); + cudf::table_view{{input}}, indices.begin(), indices.end(), false, stream, mr); // There will be only one column return std::make_unique(std::move(output_table->get_column(0))); diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index f20af839916..9a115772a0c 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -37,6 +37,7 @@ #include #include +#include "rmm/cuda_stream_view.hpp" #include #include @@ -120,11 +121,11 @@ void gather_helper(InputItr source_itr, MapIterator gather_map_begin, MapIterator gather_map_end, bool nullify_out_of_bounds, - cudaStream_t stream) + rmm::cuda_stream_view stream) { using map_type = typename std::iterator_traits::value_type; if (nullify_out_of_bounds) { - thrust::gather_if(rmm::exec_policy(stream)->on(stream), + thrust::gather_if(rmm::exec_policy(stream)->on(stream.value()), gather_map_begin, gather_map_end, gather_map_begin, @@ -132,7 +133,7 @@ void gather_helper(InputItr source_itr, target_itr, bounds_checker{0, source_size}); } else { - thrust::gather(rmm::exec_policy(stream)->on(stream), + thrust::gather(rmm::exec_policy(stream)->on(stream.value()), gather_map_begin, gather_map_end, source_itr, @@ -169,7 +170,7 @@ struct column_gatherer_impl { MapIterator gather_map_begin, MapIterator gather_map_end, bool nullify_out_of_bounds, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto const num_rows = cudf::distance(gather_map_begin, gather_map_end); @@ -216,15 +217,15 @@ struct column_gatherer_impl { MapItType gather_map_begin, MapItType gather_map_end, bool nullify_out_of_bounds, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (true == nullify_out_of_bounds) { return cudf::strings::detail::gather( - strings_column_view(source_column), gather_map_begin, gather_map_end, mr, stream); + strings_column_view(source_column), gather_map_begin, gather_map_end, stream, mr); } else { return cudf::strings::detail::gather( - strings_column_view(source_column), gather_map_begin, gather_map_end, mr, stream); + strings_column_view(source_column), gather_map_begin, gather_map_end, stream, mr); } } }; @@ -289,7 +290,7 @@ struct column_gatherer_impl { MapItRoot gather_map_begin, MapItRoot gather_map_end, bool nullify_out_of_bounds, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { lists_column_view list(column); @@ -354,7 +355,7 @@ struct column_gatherer { MapIterator 
gather_map_begin, MapIterator gather_map_end, bool nullify_out_of_bounds, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { column_gatherer_impl gatherer{}; @@ -386,7 +387,7 @@ struct column_gatherer_impl { MapItType gather_map_begin, MapItType gather_map_end, bool nullify_out_of_bounds, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { dictionary_column_view dictionary(source_column); @@ -457,7 +458,7 @@ void gather_bitmask(table_device_view input, size_type mask_count, size_type mask_size, size_type* valid_counts, - cudaStream_t stream) + rmm::cuda_stream_view stream) { if (mask_size == 0) { return; } @@ -469,7 +470,7 @@ void gather_bitmask(table_device_view input, valid_if_n_kernel; cudf::detail::grid_1d grid{mask_size, block_size, 1}; - kernel<<>>( + kernel<<>>( counting_it, counting_it, selector, masks, mask_count, mask_size, valid_counts); } @@ -478,8 +479,8 @@ void gather_bitmask(table_view const& source, MapIterator gather_map, std::vector>& target, gather_bitmask_op op, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (target.empty()) { return; } @@ -548,7 +549,7 @@ struct column_gatherer_impl { MapItRoot gather_map_begin, MapItRoot gather_map_end, bool nullify_out_of_bounds, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { structs_column_view structs_column(column); @@ -578,8 +579,8 @@ struct column_gatherer_impl { gather_map_begin, output_struct_members, nullify_out_of_bounds ? gather_bitmask_op::NULLIFY : gather_bitmask_op::DONT_CHECK, - mr, - stream); + stream, + mr); return cudf::make_structs_column( gather_map_size, @@ -620,8 +621,8 @@ std::unique_ptr
gather( MapIterator gather_map_begin, MapIterator gather_map_end, bool nullify_out_of_bounds = false, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { std::vector> destination_columns; @@ -641,7 +642,7 @@ std::unique_ptr
gather( auto const op = nullify_out_of_bounds ? gather_bitmask_op::NULLIFY : gather_bitmask_op::DONT_CHECK; - gather_bitmask(source_table, gather_map_begin, destination_columns, op, mr, stream); + gather_bitmask(source_table, gather_map_begin, destination_columns, op, stream, mr); return std::make_unique
(std::move(destination_columns)); } diff --git a/cpp/include/cudf/detail/gather.hpp b/cpp/include/cudf/detail/gather.hpp index 0f9b01c53d5..adace7e27f8 100644 --- a/cpp/include/cudf/detail/gather.hpp +++ b/cpp/include/cudf/detail/gather.hpp @@ -1,3 +1,18 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #pragma once #include @@ -5,6 +20,8 @@ #include +#include + #include namespace cudf { @@ -48,7 +65,7 @@ std::unique_ptr
gather( column_view const& gather_map, out_of_bounds_policy bounds, negative_index_policy neg_indices, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/get_value.cuh b/cpp/include/cudf/detail/get_value.cuh index 25ca123fb16..eeff6fb2d9b 100644 --- a/cpp/include/cudf/detail/get_value.cuh +++ b/cpp/include/cudf/detail/get_value.cuh @@ -20,6 +20,8 @@ #include #include +#include + namespace cudf { namespace detail { @@ -38,15 +40,18 @@ namespace detail { * @return Value from the `col_view[element_index]` */ template -T get_value(column_view const& col_view, size_type element_index, cudaStream_t stream) +T get_value(column_view const& col_view, size_type element_index, rmm::cuda_stream_view stream) { CUDF_EXPECTS(cudf::is_fixed_width(col_view.type()), "get_value supports only fixed-width types"); CUDF_EXPECTS(data_type(type_to_id()) == col_view.type(), "get_value data type mismatch"); CUDF_EXPECTS(element_index >= 0 && element_index < col_view.size(), "invalid element_index value"); T result; - CUDA_TRY(cudaMemcpyAsync( - &result, col_view.data() + element_index, sizeof(T), cudaMemcpyDeviceToHost, stream)); + CUDA_TRY(cudaMemcpyAsync(&result, + col_view.data() + element_index, + sizeof(T), + cudaMemcpyDeviceToHost, + stream.value())); return result; } diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh index adee8fd84e0..32ac19518d2 100644 --- a/cpp/include/cudf/detail/indexalator.cuh +++ b/cpp/include/cudf/detail/indexalator.cuh @@ -229,7 +229,7 @@ struct base_indexalator { * @code * auto begin = indexalator_factory::create_input_iterator(gather_map); * auto end = begin + gather_map.size(); - * auto result = detail::gather( source, begin, end, IGNORE, mr, stream ); + * auto result = detail::gather( source, begin, end, IGNORE, stream, mr ); * @endcode * * @code diff --git a/cpp/include/cudf/detail/scatter.cuh b/cpp/include/cudf/detail/scatter.cuh index 0e30ce603cf..6d93c78fd3e 100644 --- a/cpp/include/cudf/detail/scatter.cuh +++ b/cpp/include/cudf/detail/scatter.cuh @@ -283,7 +283,7 @@ std::unique_ptr
scatter( auto gather_map = scatter_to_gather( updated_scatter_map_begin, updated_scatter_map_end, target.num_rows(), stream); - gather_bitmask(source, gather_map.begin(), result, gather_bitmask_op::PASSTHROUGH, mr, stream); + gather_bitmask(source, gather_map.begin(), result, gather_bitmask_op::PASSTHROUGH, stream, mr); return std::make_unique
(std::move(result)); } diff --git a/cpp/include/cudf/lists/detail/gather.cuh b/cpp/include/cudf/lists/detail/gather.cuh index 51291e69b6b..8ea84780fc4 100644 --- a/cpp/include/cudf/lists/detail/gather.cuh +++ b/cpp/include/cudf/lists/detail/gather.cuh @@ -19,7 +19,9 @@ #include #include #include + #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace lists { @@ -59,9 +61,9 @@ template gather_data make_gather_data(cudf::lists_column_view const& source_column, MapItType gather_map, size_type gather_map_size, - cudaStream_t stream, - rmm::mr::device_memory_resource* mr, - rmm::device_uvector&& prev_base_offsets) + rmm::device_uvector&& prev_base_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // size of the gather map is the # of output rows size_type output_count = gather_map_size; @@ -79,7 +81,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, // generate the compacted outgoing offsets. auto count_iter = thrust::make_counting_iterator(0); thrust::transform_exclusive_scan( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), count_iter, count_iter + offset_count, dst_offsets_v.begin(), @@ -103,7 +105,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, // generate the base offsets rmm::device_uvector base_offsets = rmm::device_uvector(output_count, stream); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), gather_map, gather_map + offset_count, base_offsets.data(), @@ -234,16 +236,16 @@ template gather_data make_gather_data(cudf::lists_column_view const& source_column, MapItType gather_map, size_type gather_map_size, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { return make_gather_data( source_column, gather_map, gather_map_size, + rmm::device_uvector{0, stream, mr}, stream, - mr, - rmm::device_uvector{0, stream, mr}); + mr); } /** @@ -262,7 +264,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, std::unique_ptr gather_list_nested( lists_column_view const& list, gather_data& gd, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -281,7 +283,7 @@ std::unique_ptr gather_list_nested( std::unique_ptr gather_list_leaf( column_view const& column, gather_data const& gd, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/lists/lists_column_view.hpp b/cpp/include/cudf/lists/lists_column_view.hpp index cc1463f3f91..d494ee445b3 100644 --- a/cpp/include/cudf/lists/lists_column_view.hpp +++ b/cpp/include/cudf/lists/lists_column_view.hpp @@ -18,6 +18,8 @@ #include #include +#include + /** * @file * @brief Class definition for cudf::lists_column_view @@ -84,7 +86,7 @@ class lists_column_view : private column_view { * * @throw cudf::logic error if this is an empty column */ - column_view get_sliced_child(cudaStream_t stream) const; + column_view get_sliced_child(rmm::cuda_stream_view stream) const; }; /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 25d47288013..8ca70db74a6 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -22,6 
+22,7 @@ #include #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { @@ -61,12 +62,12 @@ std::unique_ptr gather( strings_column_view const& strings, MapIterator begin, MapIterator end, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto output_count = std::distance(begin, end); auto strings_count = strings.size(); - if (output_count == 0) return make_empty_strings_column(mr, stream); + if (output_count == 0) return make_empty_strings_column(mr, stream.value()); auto execpol = rmm::exec_policy(stream); auto strings_column = column_device_view::create(strings.parent(), stream); @@ -80,13 +81,13 @@ std::unique_ptr gather( }; auto offsets_transformer_itr = thrust::make_transform_iterator(begin, offsets_transformer); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + output_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + output_count, mr, stream.value()); auto offsets_view = offsets_column->view(); auto d_offsets = offsets_view.template data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[output_count]; - auto chars_column = create_chars_child_column(output_count, 0, bytes, mr, stream); + auto chars_column = create_chars_child_column(output_count, 0, bytes, mr, stream.value()); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.template data(); // fill in chars @@ -102,8 +103,10 @@ std::unique_ptr gather( string_view d_str = d_strings.element(index); memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes()); }; - thrust::for_each_n( - execpol->on(stream), thrust::make_counting_iterator(0), output_count, gather_chars); + thrust::for_each_n(execpol->on(stream.value()), + thrust::make_counting_iterator(0), + output_count, + gather_chars); return make_strings_column(output_count, std::move(offsets_column), @@ -143,11 +146,11 @@ std::unique_ptr gather( MapIterator begin, MapIterator end, bool nullify_out_of_bounds, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - if (nullify_out_of_bounds) return gather(strings, begin, end, mr, stream); - return gather(strings, begin, end, mr, stream); + if (nullify_out_of_bounds) return gather(strings, begin, end, stream, mr); + return gather(strings, begin, end, stream, mr); } } // namespace detail diff --git a/cpp/include/cudf/table/table_device_view.cuh b/cpp/include/cudf/table/table_device_view.cuh index f60f5f9fe57..8a1938423f0 100644 --- a/cpp/include/cudf/table/table_device_view.cuh +++ b/cpp/include/cudf/table/table_device_view.cuh @@ -21,6 +21,7 @@ #include #include +#include "rmm/cuda_stream_view.hpp" /** * @file table_device_view.cuh @@ -67,10 +68,9 @@ class table_device_view_base { ColumnDeviceView* _columns{}; ///< Array of view objects in device memory size_type _num_rows{}; size_type _num_columns{}; - cudaStream_t _stream{}; protected: - table_device_view_base(HostTableView source_view, cudaStream_t stream); + table_device_view_base(HostTableView source_view, rmm::cuda_stream_view stream); rmm::device_buffer* _descendant_storage{}; }; @@ -78,7 +78,8 @@ class table_device_view_base { class table_device_view : public detail::table_device_view_base { 
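The device-view changes in this patch apply the same idea one level down: views are built on an explicit rmm::cuda_stream_view, and the raw cudaStream_t is recovered with stream.value() only at the CUDA launch boundary. A rough sketch under those assumptions; inspect_table and the launch configuration are made up, while table_device_view::create, stream.value(), and stream.synchronize() are the calls used throughout this series.

#include <cudf/table/table_device_view.cuh>
#include <cudf/table/table_view.hpp>
#include <rmm/cuda_stream_view.hpp>

// Hypothetical no-op kernel; a real kernel would index the columns of d_table here.
__global__ void inspect_table(cudf::table_device_view d_table) {}

void launch_inspect(cudf::table_view const& tv, rmm::cuda_stream_view stream)
{
  // device-view creation is now stream-aware
  auto d_table = cudf::table_device_view::create(tv, stream);
  // unwrap the view only where a raw cudaStream_t is required
  inspect_table<<<1, 256, 0, stream.value()>>>(*d_table);
  stream.synchronize();  // replaces cudaStreamSynchronize(stream)
}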
public: - static auto create(table_view source_view, cudaStream_t stream = 0) + static auto create(table_view source_view, + rmm::cuda_stream_view stream = rmm::cuda_stream_default) { auto deleter = [](table_device_view* t) { t->destroy(); }; return std::unique_ptr{ @@ -86,7 +87,7 @@ class table_device_view : public detail::table_device_view_base(source_view, stream) { } @@ -95,7 +96,8 @@ class table_device_view : public detail::table_device_view_base { public: - static auto create(mutable_table_view source_view, cudaStream_t stream = 0) + static auto create(mutable_table_view source_view, + rmm::cuda_stream_view stream = rmm::cuda_stream_default) { auto deleter = [](mutable_table_device_view* t) { t->destroy(); }; return std::unique_ptr{ @@ -103,7 +105,7 @@ class mutable_table_device_view } private: - mutable_table_device_view(mutable_table_view source_view, cudaStream_t stream) + mutable_table_device_view(mutable_table_view source_view, rmm::cuda_stream_view stream) : detail::table_device_view_base(source_view, stream) { diff --git a/cpp/src/copying/gather.cu b/cpp/src/copying/gather.cu index f8e11500603..4e186c00ac3 100644 --- a/cpp/src/copying/gather.cu +++ b/cpp/src/copying/gather.cu @@ -23,6 +23,8 @@ #include #include +#include + #include namespace cudf { @@ -32,8 +34,8 @@ std::unique_ptr
gather(table_view const& source_table, column_view const& gather_map, out_of_bounds_policy bounds, negative_index_policy neg_indices, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(gather_map.has_nulls() == false, "gather_map contains nulls"); @@ -45,7 +47,7 @@ std::unique_ptr
gather(table_view const& source_table, cudf::size_type begin = neg_indices == negative_index_policy::ALLOWED ? -source_table.num_rows() : 0; cudf::size_type end = source_table.num_rows(); - CUDF_EXPECTS(gather_map.size() == thrust::count_if(rmm::exec_policy(stream)->on(stream), + CUDF_EXPECTS(gather_map.size() == thrust::count_if(rmm::exec_policy(stream)->on(stream.value()), map_begin, map_end, [begin, end] __device__(size_type index) { @@ -63,11 +65,11 @@ std::unique_ptr
gather(table_view const& source_table, thrust::make_transform_iterator(map_begin, idx_converter), thrust::make_transform_iterator(map_end, idx_converter), bounds == out_of_bounds_policy::IGNORE, - mr, - stream); + stream, + mr); } return gather( - source_table, map_begin, map_end, bounds == out_of_bounds_policy::IGNORE, mr, stream); + source_table, map_begin, map_end, bounds == out_of_bounds_policy::IGNORE, stream, mr); } } // namespace detail @@ -87,6 +89,7 @@ std::unique_ptr
gather(table_view const& source_table, gather_map, check_bounds ? detail::out_of_bounds_policy::FAIL : detail::out_of_bounds_policy::NULLIFY, index_policy, + rmm::cuda_stream_default, mr); } diff --git a/cpp/src/copying/sample.cu b/cpp/src/copying/sample.cu index e9e2e1d6340..15dc3565da4 100644 --- a/cpp/src/copying/sample.cu +++ b/cpp/src/copying/sample.cu @@ -60,7 +60,7 @@ std::unique_ptr
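
The hunks in this commit converge on one calling convention for the detail layer: the stream parameter moves ahead of the memory resource, and public entry points forward rmm::cuda_stream_default explicitly. A minimal sketch of that convention, using a hypothetical detail::do_work function rather than any real cuDF API:

    #include <cudf/table/table.hpp>
    #include <cudf/table/table_view.hpp>

    #include <rmm/cuda_stream_view.hpp>
    #include <rmm/mr/device/per_device_resource.hpp>

    #include <memory>

    namespace cudf {
    namespace detail {
    // Detail layer: stream comes before mr; both may be defaulted in headers.
    std::unique_ptr<table> do_work(
      table_view const& input,
      rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
      rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
    }  // namespace detail

    // Public layer: no stream parameter; the default stream is forwarded explicitly.
    std::unique_ptr<table> do_work(table_view const& input,
                                   rmm::mr::device_memory_resource* mr)
    {
      return detail::do_work(input, rmm::cuda_stream_default, mr);
    }
    }  // namespace cudf
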
sample(table_view const& input, thrust::make_transform_iterator(thrust::counting_iterator(0), RandomGen); auto end = thrust::make_transform_iterator(thrust::counting_iterator(n), RandomGen); - return detail::gather(input, begin, end, false, mr, stream.value()); + return detail::gather(input, begin, end, false, stream, mr); } else { auto gather_map = make_numeric_column( data_type{type_id::INT32}, num_rows, mask_state::UNALLOCATED, stream.value()); @@ -78,8 +78,8 @@ std::unique_ptr
sample(table_view const& input, gather_map_view.begin(), gather_map_view.end(), false, - mr, - stream.value()); + stream, + mr); } } diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index 5633dcfbc30..c02f38e2a0e 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -92,8 +92,8 @@ std::unique_ptr add_keys( indices_view, cudf::detail::out_of_bounds_policy::IGNORE, cudf::detail::negative_index_policy::NOT_ALLOWED, - mr, - stream) + stream, + mr) ->release(); // The output of lower_bound is INT32 but we need to convert to unsigned indices. auto const indices_type = get_indices_type_for_size(keys_column->size()); diff --git a/cpp/src/dictionary/decode.cu b/cpp/src/dictionary/decode.cu index c0bde1c92a5..913da30df16 100644 --- a/cpp/src/dictionary/decode.cu +++ b/cpp/src/dictionary/decode.cu @@ -49,8 +49,8 @@ std::unique_ptr decode(dictionary_column_view const& source, indices, cudf::detail::out_of_bounds_policy::IGNORE, cudf::detail::negative_index_policy::NOT_ALLOWED, - mr, - stream) + stream, + mr) ->release(); auto output_column = std::unique_ptr(std::move(table_column.front())); diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu index 3913d68b10f..e04c6257692 100644 --- a/cpp/src/dictionary/remove_keys.cu +++ b/cpp/src/dictionary/remove_keys.cu @@ -114,8 +114,8 @@ std::unique_ptr remove_keys_fn( indices_view, cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, - mr, - stream) + stream, + mr) ->release(); std::unique_ptr indices_column(std::move(table_indices.front())); diff --git a/cpp/src/filling/repeat.cu b/cpp/src/filling/repeat.cu index 8191fd179c7..96e2e15f262 100644 --- a/cpp/src/filling/repeat.cu +++ b/cpp/src/filling/repeat.cu @@ -131,7 +131,7 @@ std::unique_ptr
repeat(table_view const& input_table, thrust::make_counting_iterator(output_size), indices.begin()); - return gather(input_table, indices.begin(), indices.end(), false, mr, stream); + return gather(input_table, indices.begin(), indices.end(), false, stream, mr); } std::unique_ptr
repeat(table_view const& input_table, @@ -151,7 +151,7 @@ std::unique_ptr
repeat(table_view const& input_table, thrust::make_counting_iterator(0), [count] __device__(auto i) { return i / count; }); auto map_end = map_begin + output_size; - return gather(input_table, map_begin, map_end, false, mr, stream); + return gather(input_table, map_begin, map_end, false, stream, mr); } } // namespace detail diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 90bbf6490ac..4c391852386 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -150,6 +150,7 @@ groupby::groups groupby::get_groups(table_view values, rmm::mr::device_memory_re helper().key_sort_order(), cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, + rmm::cuda_stream_default, mr); return groupby::groups{ std::move(grouped_keys), std::move(group_offsets_vector), std::move(grouped_values)}; diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 5bc7e0d02f0..14f813ae142 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -150,7 +150,7 @@ void sparse_to_dense_results(std::vector const& requests, [&sparse_results, &gather_map, map_size, i, mr, stream](auto const& agg) { auto s = sparse_results.get_result(i, agg); auto dense_result_table = cudf::detail::gather( - table_view({s}), gather_map.begin(), gather_map.begin() + map_size, false, mr, stream); + table_view({s}), gather_map.begin(), gather_map.begin() + map_size, false, stream, mr); return std::move(dense_result_table->release()[0]); }; @@ -173,8 +173,8 @@ void sparse_to_dense_results(std::vector const& requests, arg_result->nullable() ? cudf::detail::out_of_bounds_policy::IGNORE : cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, - mr, - stream); + stream, + mr); return std::move(transformed_result->release()[0]); }; @@ -396,7 +396,7 @@ std::unique_ptr
groupby_null_templated(table_view const& keys, sparse_to_dense_results(requests, sparse_results, cache, gather_map, map_size, stream, mr); return cudf::detail::gather( - keys, gather_map.begin(), gather_map.begin() + map_size, false, mr, stream); + keys, gather_map.begin(), gather_map.begin() + map_size, false, stream, mr); } } // namespace diff --git a/cpp/src/groupby/sort/group_argmax.cu b/cpp/src/groupby/sort/group_argmax.cu index 3d9d490e669..b49fbeb7387 100644 --- a/cpp/src/groupby/sort/group_argmax.cu +++ b/cpp/src/groupby/sort/group_argmax.cu @@ -54,8 +54,8 @@ std::unique_ptr group_argmax(column_view const& values, indices->nullable() ? cudf::detail::out_of_bounds_policy::IGNORE : cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, - mr, - stream); + stream, + mr); return std::move(result_table->release()[0]); } diff --git a/cpp/src/groupby/sort/group_argmin.cu b/cpp/src/groupby/sort/group_argmin.cu index 1beaab58fe3..5ae11ba0506 100644 --- a/cpp/src/groupby/sort/group_argmin.cu +++ b/cpp/src/groupby/sort/group_argmin.cu @@ -54,8 +54,8 @@ std::unique_ptr group_argmin(column_view const& values, indices->nullable() ? cudf::detail::out_of_bounds_policy::IGNORE : cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, - mr, - stream); + stream, + mr); return std::move(result_table->release()[0]); } diff --git a/cpp/src/groupby/sort/group_nth_element.cu b/cpp/src/groupby/sort/group_nth_element.cu index e33ec34b92e..bc9d0016207 100644 --- a/cpp/src/groupby/sort/group_nth_element.cu +++ b/cpp/src/groupby/sort/group_nth_element.cu @@ -104,7 +104,7 @@ std::unique_ptr group_nth_element(column_view const &values, }); } auto output_table = cudf::detail::gather( - table_view{{values}}, nth_index.begin(), nth_index.end(), true, mr, stream); + table_view{{values}}, nth_index.begin(), nth_index.end(), true, stream, mr); if (!output_table->get_column(0).has_nulls()) output_table->get_column(0).set_null_mask({}, 0); return std::make_unique(std::move(output_table->get_column(0))); } diff --git a/cpp/src/groupby/sort/groupby.cu b/cpp/src/groupby/sort/groupby.cu index c9038082d88..7077e6f089c 100644 --- a/cpp/src/groupby/sort/groupby.cu +++ b/cpp/src/groupby/sort/groupby.cu @@ -204,8 +204,8 @@ void store_result_functor::operator()(aggregation const& agg) argmin_result.nullable() ? cudf::detail::out_of_bounds_policy::IGNORE : cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, - mr, - stream); + stream, + mr); return std::move(transformed_result->release()[0]); } }(); @@ -241,8 +241,8 @@ void store_result_functor::operator()(aggregation const& agg) argmax_result.nullable() ? 
cudf::detail::out_of_bounds_policy::IGNORE : cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, - mr, - stream); + stream, + mr); return std::move(transformed_result->release()[0]); } }(); diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index e63a3a61015..064c3e97b20 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -279,8 +279,8 @@ sort_groupby_helper::column_ptr sort_groupby_helper::sorted_values( gather_map, cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, - mr, - stream.value()); + stream, + mr); return std::move(sorted_values_table->release()[0]); } @@ -294,8 +294,8 @@ sort_groupby_helper::column_ptr sort_groupby_helper::grouped_values( gather_map, cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, - mr, - stream.value()); + stream, + mr); return std::move(grouped_values_table->release()[0]); } @@ -309,7 +309,7 @@ std::unique_ptr
sort_groupby_helper::unique_keys(rmm::cuda_stream_view st group_offsets().begin(), [idx_data] __device__(size_type i) { return idx_data[i]; }); return cudf::detail::gather( - _keys, gather_map_it, gather_map_it + num_groups(), false, mr, stream.value()); + _keys, gather_map_it, gather_map_it + num_groups(), false, stream, mr); } std::unique_ptr
sort_groupby_helper::sorted_keys(rmm::cuda_stream_view stream, @@ -319,8 +319,8 @@ std::unique_ptr
sort_groupby_helper::sorted_keys(rmm::cuda_stream_view st key_sort_order(), cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, - mr, - stream.value()); + stream, + mr); } } // namespace sort diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu index 2066b889dd4..03b6248f35a 100644 --- a/cpp/src/hash/hashing.cu +++ b/cpp/src/hash/hashing.cu @@ -583,7 +583,7 @@ std::pair, std::vector> hash_partition_table( // Handle bitmask using gather to take advantage of ballot_sync detail::gather_bitmask( - input, gather_map.begin(), output_cols, detail::gather_bitmask_op::DONT_CHECK, mr, stream); + input, gather_map.begin(), output_cols, detail::gather_bitmask_op::DONT_CHECK, stream, mr); } auto output{std::make_unique
(std::move(output_cols))}; diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 456e26a7cae..91188539790 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -416,14 +416,14 @@ std::pair, std::unique_ptr
> construct_join_output_ complement_indices.second.begin(), complement_indices.second.end(), nullify_out_of_bounds, - rmm::mr::get_current_device_resource(), - stream); + stream, + rmm::mr::get_current_device_resource()); auto common_from_probe = detail::gather(probe.select(probe_common_col), joined_indices.first.begin(), joined_indices.first.end(), nullify_out_of_bounds, - rmm::mr::get_current_device_resource(), - stream); + stream, + rmm::mr::get_current_device_resource()); common_table = cudf::detail::concatenate( {common_from_build->view(), common_from_probe->view()}, stream, mr); } @@ -434,8 +434,8 @@ std::pair, std::unique_ptr
> construct_join_output_ joined_indices.first.begin(), joined_indices.first.end(), nullify_out_of_bounds, - mr, - stream); + stream, + mr); } } @@ -444,15 +444,15 @@ std::pair, std::unique_ptr
> construct_join_output_ joined_indices.first.begin(), joined_indices.first.end(), nullify_out_of_bounds, - mr, - stream); + stream, + mr); std::unique_ptr
build_table = detail::gather(build.select(build_noncommon_col), joined_indices.second.begin(), joined_indices.second.end(), nullify_out_of_bounds, - mr, - stream); + stream, + mr); return combine_join_columns(probe_table->release(), probe_noncommon_col, diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index 318bd30fe0f..9d6dd55ec03 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -24,6 +24,7 @@ #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { @@ -155,7 +156,7 @@ std::unique_ptr left_semi_anti_join( // rebuild left table for call to gather auto const left_updated = scatter_columns(left_selected, left_on, left); return cudf::detail::gather( - left_updated.select(return_columns), gather_map.begin(), gather_map_end, false, mr); + left_updated.select(return_columns), gather_map.begin(), gather_map_end, false, stream, mr); } } // namespace detail diff --git a/cpp/src/lists/copying/gather.cu b/cpp/src/lists/copying/gather.cu index 1f6155b3167..96c20fd93ad 100644 --- a/cpp/src/lists/copying/gather.cu +++ b/cpp/src/lists/copying/gather.cu @@ -18,6 +18,8 @@ #include #include +#include + namespace cudf { namespace lists { namespace detail { @@ -86,7 +88,7 @@ struct list_gatherer { */ std::unique_ptr gather_list_leaf(column_view const& column, gather_data const& gd, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { // gather map iterator for this level (N) @@ -135,7 +137,7 @@ std::unique_ptr gather_list_leaf(column_view const& column, */ std::unique_ptr gather_list_nested(cudf::lists_column_view const& list, gather_data& gd, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { // gather map iterator for this level (N) @@ -164,7 +166,7 @@ std::unique_ptr gather_list_nested(cudf::lists_column_view const& list, // generate gather_data for next level (N+1), potentially recycling the temporary // base_offsets buffer. gather_data child_gd = make_gather_data( - list, gather_map_begin, gather_map_size, stream, mr, std::move(gd.base_offsets)); + list, gather_map_begin, gather_map_size, std::move(gd.base_offsets), stream, mr); // the nesting case. 
if (list.child().type() == cudf::data_type{type_id::LIST}) { diff --git a/cpp/src/lists/extract.cu b/cpp/src/lists/extract.cu index e1fbc74b818..5adb21a47f1 100644 --- a/cpp/src/lists/extract.cu +++ b/cpp/src/lists/extract.cu @@ -97,8 +97,8 @@ std::unique_ptr extract_list_element(lists_column_view lists_column, d_gather_map, d_gather_map + gather_map->size(), true, // nullify-out-of-bounds - mr, - stream) + stream, + mr) ->release(); if (result.front()->null_count() == 0) result.front()->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); diff --git a/cpp/src/lists/lists_column_view.cu b/cpp/src/lists/lists_column_view.cu index e4d52d74a13..5c717487951 100644 --- a/cpp/src/lists/lists_column_view.cu +++ b/cpp/src/lists/lists_column_view.cu @@ -20,6 +20,8 @@ #include #include +#include + namespace cudf { lists_column_view::lists_column_view(column_view const& lists_column) : column_view(lists_column) @@ -41,7 +43,7 @@ column_view lists_column_view::child() const return column_view::child(child_column_index); } -column_view lists_column_view::get_sliced_child(cudaStream_t stream) const +column_view lists_column_view::get_sliced_child(rmm::cuda_stream_view stream) const { // if I have a positive offset, I need to slice my child if (offset() > 0) { diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index b18c231b309..c63b7079a07 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -596,7 +596,7 @@ std::pair, std::vector> hash_partition_table( // Handle bitmask using gather to take advantage of ballot_sync detail::gather_bitmask( - input, gather_map.begin(), output_cols, detail::gather_bitmask_op::DONT_CHECK, mr, stream); + input, gather_map.begin(), output_cols, detail::gather_bitmask_op::DONT_CHECK, stream, mr); } auto output{std::make_unique
(std::move(output_cols))}; diff --git a/cpp/src/partitioning/round_robin.cu b/cpp/src/partitioning/round_robin.cu index 4d9d67e8dd8..aadcaa6d51f 100644 --- a/cpp/src/partitioning/round_robin.cu +++ b/cpp/src/partitioning/round_robin.cu @@ -99,8 +99,8 @@ std::pair, std::vector> degenerate rotated_iter_begin, rotated_iter_begin + nrows, // map false, - mr, - stream); + stream, + mr); auto ret_pair = std::make_pair(std::move(uniq_tbl), std::vector(num_partitions)); @@ -134,8 +134,8 @@ std::pair, std::vector> degenerate d_row_indices.begin(), d_row_indices.end(), // map false, - mr, - stream); + stream, + mr); auto ret_pair = std::make_pair(std::move(uniq_tbl), std::vector(num_partitions)); @@ -251,7 +251,7 @@ std::pair, std::vector> round_robin_part return num_partitions * index_within_partition + partition_index; }); - auto uniq_tbl = cudf::detail::gather(input, iter_begin, iter_begin + nrows, false, mr, stream); + auto uniq_tbl = cudf::detail::gather(input, iter_begin, iter_begin + nrows, false, stream, mr); auto ret_pair = std::make_pair(std::move(uniq_tbl), std::vector(num_partitions)); // this has the effect of rotating the set of partition sizes diff --git a/cpp/src/quantiles/quantiles.cu b/cpp/src/quantiles/quantiles.cu index 96377223a19..51e71104cac 100644 --- a/cpp/src/quantiles/quantiles.cu +++ b/cpp/src/quantiles/quantiles.cu @@ -23,6 +23,8 @@ #include #include +#include + #include #include @@ -33,6 +35,7 @@ std::unique_ptr
quantiles(table_view const& input, SortMapIterator sortmap, std::vector const& q, interpolation interp, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto quantile_idx_lookup = [sortmap, interp, size = input.num_rows()] __device__(double q) { @@ -44,7 +47,7 @@ std::unique_ptr
quantiles(table_view const& input, auto quantile_idx_iter = thrust::make_transform_iterator(q_device.begin(), quantile_idx_lookup); - return detail::gather(input, quantile_idx_iter, quantile_idx_iter + q.size(), false, mr); + return detail::gather(input, quantile_idx_iter, quantile_idx_iter + q.size(), false, stream, mr); } } // namespace detail @@ -67,10 +70,12 @@ std::unique_ptr
quantiles(table_view const& input, CUDF_EXPECTS(input.num_rows() > 0, "multi-column quantiles require at least one input row."); if (is_input_sorted == sorted::YES) { - return detail::quantiles(input, thrust::make_counting_iterator(0), q, interp, mr); + return detail::quantiles( + input, thrust::make_counting_iterator(0), q, interp, rmm::cuda_stream_default, mr); } else { auto sorted_idx = detail::sorted_order(input, column_order, null_precedence); - return detail::quantiles(input, sorted_idx->view().data(), q, interp, mr); + return detail::quantiles( + input, sorted_idx->view().data(), q, interp, rmm::cuda_stream_default, mr); } } diff --git a/cpp/src/reshape/tile.cu b/cpp/src/reshape/tile.cu index 73b20f5ba3f..2803ee1bab3 100644 --- a/cpp/src/reshape/tile.cu +++ b/cpp/src/reshape/tile.cu @@ -52,7 +52,7 @@ std::unique_ptr
tile(const table_view &in, auto counting_it = thrust::make_counting_iterator(0); auto tiled_it = thrust::make_transform_iterator(counting_it, tile_functor{in_num_rows}); - return detail::gather(in, tiled_it, tiled_it + out_num_rows, false, mr, stream); + return detail::gather(in, tiled_it, tiled_it + out_num_rows, false, stream, mr); } } // namespace detail diff --git a/cpp/src/rolling/rolling.cu b/cpp/src/rolling/rolling.cu index b0d63c19e9a..a31eabe3964 100644 --- a/cpp/src/rolling/rolling.cu +++ b/cpp/src/rolling/rolling.cu @@ -701,8 +701,8 @@ struct rolling_window_launcher { output->view(), detail::out_of_bounds_policy::IGNORE, detail::negative_index_policy::NOT_ALLOWED, - mr, - stream); + stream, + mr); return std::make_unique(std::move(output_table->get_column(0))); } diff --git a/cpp/src/sort/sort.cu b/cpp/src/sort/sort.cu index ce92aad9859..d163c4e5be8 100644 --- a/cpp/src/sort/sort.cu +++ b/cpp/src/sort/sort.cu @@ -49,8 +49,8 @@ std::unique_ptr
sort_by_key(table_view const& values, sorted_order->view(), detail::out_of_bounds_policy::NULLIFY, detail::negative_index_policy::NOT_ALLOWED, - mr, - stream); + stream, + mr); } } // namespace detail diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/drop_duplicates.cu index e9206f60c8b..970ce7eb198 100644 --- a/cpp/src/stream_compaction/drop_duplicates.cu +++ b/cpp/src/stream_compaction/drop_duplicates.cu @@ -213,8 +213,8 @@ std::unique_ptr
drop_duplicates(table_view const& input, unique_indices_view, detail::out_of_bounds_policy::NULLIFY, detail::negative_index_policy::NOT_ALLOWED, - mr, - stream); + stream, + mr); } /** diff --git a/cpp/src/strings/copying/copying.cu b/cpp/src/strings/copying/copying.cu index f2cf4e29e50..4c99b45f5ce 100644 --- a/cpp/src/strings/copying/copying.cu +++ b/cpp/src/strings/copying/copying.cu @@ -55,8 +55,8 @@ std::unique_ptr copy_slice(strings_column_view const& strings, indices_view, cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, - mr, - stream) + stream, + mr) ->release(); std::unique_ptr output_column(std::move(sliced_table.front())); if (output_column->null_count() == 0) diff --git a/cpp/src/strings/sorting/sorting.cu b/cpp/src/strings/sorting/sorting.cu index 64b78475541..0a5a2238d9b 100644 --- a/cpp/src/strings/sorting/sorting.cu +++ b/cpp/src/strings/sorting/sorting.cu @@ -67,8 +67,8 @@ std::unique_ptr sort(strings_column_view strings, indices_view, cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, - mr, - stream) + stream, + mr) ->release(); return std::move(table_sorted.front()); } diff --git a/cpp/src/table/table_device_view.cu b/cpp/src/table/table_device_view.cu index 5ddab572225..a2cb69044ed 100644 --- a/cpp/src/table/table_device_view.cu +++ b/cpp/src/table/table_device_view.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,8 @@ #include #include +#include + #include #include #include @@ -34,8 +36,8 @@ void table_device_view_base::destroy() template table_device_view_base::table_device_view_base( - HostTableView source_view, cudaStream_t stream) - : _num_rows{source_view.num_rows()}, _num_columns{source_view.num_columns()}, _stream{stream} + HostTableView source_view, rmm::cuda_stream_view stream) + : _num_rows{source_view.num_rows()}, _num_columns{source_view.num_columns()} { // The table's columns must be converted to ColumnDeviceView // objects and copied into device memory for the table_device_view's @@ -82,9 +84,9 @@ table_device_view_base::table_device_view_base( d_end += col_child_data_size; } - CUDA_TRY( - cudaMemcpyAsync(_columns, h_buffer.data(), views_size_bytes, cudaMemcpyDefault, stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + CUDA_TRY(cudaMemcpyAsync( + _columns, h_buffer.data(), views_size_bytes, cudaMemcpyDefault, stream.value())); + stream.synchronize(); } } diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu index c8e7bd2fd5e..57475e0f59e 100644 --- a/cpp/src/transform/encode.cu +++ b/cpp/src/transform/encode.cu @@ -78,8 +78,8 @@ std::pair, std::unique_ptr> encode( gather_map_column, cudf::detail::out_of_bounds_policy::FAIL, cudf::detail::negative_index_policy::NOT_ALLOWED, - mr, - stream); + stream, + mr); } auto indices_column = From 243d2a153a56c80e63c3c2c875e9fe79b7421a4c Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Fri, 6 Nov 2020 15:37:28 +1100 Subject: [PATCH 21/51] rename type_conversion .cu->.cpp --- cpp/src/io/utilities/{type_conversion.cu => type_conversion.cpp} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename cpp/src/io/utilities/{type_conversion.cu => type_conversion.cpp} (100%) diff --git a/cpp/src/io/utilities/type_conversion.cu b/cpp/src/io/utilities/type_conversion.cpp similarity index 100% rename from 
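
Where a raw cudaStream_t is still required (cudaMemcpyAsync, the ->on(...) form of the Thrust execution policy), the converted code unwraps the view with stream.value() and synchronizes through the view itself. A small sketch of that pattern, with made-up buffer names:

    #include <rmm/cuda_stream_view.hpp>

    #include <cuda_runtime.h>

    #include <cstddef>

    // Copy host data to device on a caller-provided stream, then block until done.
    void copy_to_device(void* d_dst,
                        void const* h_src,
                        std::size_t bytes,
                        rmm::cuda_stream_view stream)
    {
      // Raw CUDA APIs take the underlying cudaStream_t via value().
      cudaMemcpyAsync(d_dst, h_src, bytes, cudaMemcpyHostToDevice, stream.value());
      // Equivalent to cudaStreamSynchronize(stream.value()).
      stream.synchronize();
    }
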
cpp/src/io/utilities/type_conversion.cu rename to cpp/src/io/utilities/type_conversion.cpp From 554a370eee4651bf9abd218c70669fdb4834f535 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Fri, 6 Nov 2020 15:39:17 +1100 Subject: [PATCH 22/51] Rename structs_column_view .cu->.cpp --- .../structs/{structs_column_view.cu => structs_column_view.cpp} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename cpp/src/structs/{structs_column_view.cu => structs_column_view.cpp} (97%) diff --git a/cpp/src/structs/structs_column_view.cu b/cpp/src/structs/structs_column_view.cpp similarity index 97% rename from cpp/src/structs/structs_column_view.cu rename to cpp/src/structs/structs_column_view.cpp index f9cb345de6f..ff7f6516cef 100644 --- a/cpp/src/structs/structs_column_view.cu +++ b/cpp/src/structs/structs_column_view.cpp @@ -25,4 +25,4 @@ structs_column_view::structs_column_view(column_view const& rhs) : column_view{r CUDF_EXPECTS(type().id() == type_id::STRUCT, "structs_column_view only supports struct columns"); } -} // namespace cudf \ No newline at end of file +} // namespace cudf From a76e9ec270228248537219e2fbbbfc8d947b1965 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Fri, 6 Nov 2020 15:44:20 +1100 Subject: [PATCH 23/51] Convert hash groupby to cuda_stream_view --- cpp/include/cudf/detail/groupby.hpp | 3 ++- cpp/src/groupby/hash/groupby.cu | 27 ++++++++++++++------------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/cpp/include/cudf/detail/groupby.hpp b/cpp/include/cudf/detail/groupby.hpp index 3091b5c121e..c616a2c8d50 100644 --- a/cpp/include/cudf/detail/groupby.hpp +++ b/cpp/include/cudf/detail/groupby.hpp @@ -19,6 +19,7 @@ #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace groupby { @@ -41,7 +42,7 @@ std::pair, std::vector> groupby( table_view const& keys, std::vector const& requests, null_policy include_null_keys, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); } // namespace hash diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 14f813ae142..e0c9d92fd30 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-20, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
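
By contrast, RMM's own containers accept the stream view directly, which is why the converted code passes stream rather than stream.value() to rmm::device_buffer and rmm::device_uvector. A minimal illustration (the sizes and names here are arbitrary):

    #include <rmm/cuda_stream_view.hpp>
    #include <rmm/device_buffer.hpp>
    #include <rmm/device_uvector.hpp>
    #include <rmm/mr/device/per_device_resource.hpp>

    #include <cstdint>

    void make_temporaries(rmm::cuda_stream_view stream)
    {
      auto* mr = rmm::mr::get_current_device_resource();

      // Stream-ordered, untyped allocation of 1024 bytes.
      rmm::device_buffer scratch{1024, stream, mr};

      // Typed, uninitialized storage for 256 int32_t elements.
      rmm::device_uvector<int32_t> offsets{256, stream, mr};
    }
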
@@ -40,6 +40,8 @@ #include #include +#include + #include #include @@ -137,7 +139,7 @@ void sparse_to_dense_results(std::vector const& requests, cudf::detail::result_cache* dense_results, rmm::device_vector const& gather_map, size_type map_size, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { for (size_t i = 0; i < requests.size(); i++) { @@ -203,7 +205,7 @@ void sparse_to_dense_results(std::vector const& requests, template auto create_hash_map(table_device_view const& d_keys, null_policy include_null_keys, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream) { size_type constexpr unused_key{std::numeric_limits::max()}; size_type constexpr unused_value{std::numeric_limits::max()}; @@ -226,7 +228,7 @@ auto create_hash_map(table_device_view const& d_keys, hasher, rows_equal, allocator_type(), - stream); + stream.value()); } /** @@ -241,7 +243,7 @@ void compute_single_pass_aggs(table_view const& keys, cudf::detail::result_cache* sparse_results, Map& map, null_policy include_null_keys, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // flatten the aggs to a table that can be operated on by aggregate_row table_view flattened_values; @@ -281,7 +283,7 @@ void compute_single_pass_aggs(table_view const& keys, auto row_bitmask{ cudf::detail::bitmask_and(keys, stream, rmm::mr::get_current_device_resource())}; thrust::for_each_n( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), keys.num_rows(), hash::compute_single_pass_aggs{map, @@ -292,7 +294,7 @@ void compute_single_pass_aggs(table_view const& keys, static_cast(row_bitmask.data())}); } else { thrust::for_each_n( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), keys.num_rows(), hash::compute_single_pass_aggs{ @@ -313,9 +315,8 @@ void compute_single_pass_aggs(table_view const& keys, * `map`. */ template -std::pair, size_type> extract_populated_keys(Map map, - size_type num_keys, - cudaStream_t stream = 0) +std::pair, size_type> extract_populated_keys( + Map map, size_type num_keys, rmm::cuda_stream_view stream) { rmm::device_vector populated_keys(num_keys); @@ -326,7 +327,7 @@ std::pair, size_type> extract_populated_keys(Map m }; auto end_it = thrust::copy_if( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_transform_iterator(map.data(), get_key), thrust::make_transform_iterator(map.data() + map.capacity(), get_key), populated_keys.begin(), @@ -369,7 +370,7 @@ std::unique_ptr
groupby_null_templated(table_view const& keys, std::vector const& requests, cudf::detail::result_cache* cache, null_policy include_null_keys, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto d_keys = table_device_view::create(keys); @@ -425,7 +426,7 @@ std::pair, std::vector> groupby( table_view const& keys, std::vector const& requests, null_policy include_null_keys, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { cudf::detail::result_cache cache(requests.size()); From 817c7156b05a429f766011c9581339d0dc67bfc3 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 08:30:11 +1100 Subject: [PATCH 24/51] Convert hashing to cuda_stream_view --- cpp/include/cudf/detail/hashing.hpp | 18 +++-- cpp/src/hash/hashing.cu | 119 +++++++++++++++------------- 2 files changed, 73 insertions(+), 64 deletions(-) diff --git a/cpp/include/cudf/detail/hashing.hpp b/cpp/include/cudf/detail/hashing.hpp index c5600f0af18..445affb37f7 100644 --- a/cpp/include/cudf/detail/hashing.hpp +++ b/cpp/include/cudf/detail/hashing.hpp @@ -17,6 +17,8 @@ #include +#include + namespace cudf { namespace detail { /** @@ -28,8 +30,8 @@ std::pair, std::vector> hash_partition( table_view const& input, std::vector const& columns_to_hash, int num_partitions, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::hash @@ -40,19 +42,19 @@ std::unique_ptr hash( table_view const& input, hash_id hash_function = hash_id::HASH_MURMUR3, std::vector const& initial_hash = {}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); std::unique_ptr murmur_hash3_32( table_view const& input, std::vector const& initial_hash = {}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); std::unique_ptr md5_hash( table_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu index 03b6248f35a..ab703c78261 100644 --- a/cpp/src/hash/hashing.cu +++ b/cpp/src/hash/hashing.cu @@ -27,6 +27,8 @@ #include #include +#include + namespace cudf { namespace { // Launch configuration for optimized hash partition @@ -330,7 +332,7 @@ void copy_block_partitions_impl(InputIter const input, size_type const* block_partition_sizes, size_type const* scanned_block_partition_sizes, size_type grid_size, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // We need 3 chunks of shared memory: // 1. 
BLOCK_SIZE * ROWS_PER_THREAD elements of size_type for copying to output @@ -339,7 +341,7 @@ void copy_block_partitions_impl(InputIter const input, int const smem = OPTIMIZED_BLOCK_SIZE * OPTIMIZED_ROWS_PER_THREAD * sizeof(*output) + (num_partitions + 1) * sizeof(size_type) * 2; - copy_block_partitions<<>>( + copy_block_partitions<<>>( input, output, num_rows, @@ -357,7 +359,7 @@ rmm::device_vector compute_gather_map(size_type num_rows, size_type const* block_partition_sizes, size_type const* scanned_block_partition_sizes, size_type grid_size, - cudaStream_t stream) + rmm::cuda_stream_view stream) { auto sequence = thrust::make_counting_iterator(0); rmm::device_vector gather_map(num_rows); @@ -385,8 +387,8 @@ struct copy_block_partitions_dispatcher { size_type const* block_partition_sizes, size_type const* scanned_block_partition_sizes, size_type grid_size, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { rmm::device_buffer output(input.size() * sizeof(DataType), stream, mr); @@ -412,8 +414,8 @@ struct copy_block_partitions_dispatcher { size_type const* block_partition_sizes, size_type const* scanned_block_partition_sizes, size_type grid_size, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Use move_to_output_buffer to create an equivalent gather map auto gather_map = compute_gather_map(input.size(), @@ -443,8 +445,8 @@ std::pair, std::vector> hash_partition_table( table_view const& input, table_view const& table_to_hash, size_type num_partitions, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto const num_rows = table_to_hash.num_rows(); @@ -490,14 +492,14 @@ std::pair, std::vector> hash_partition_table( compute_row_partition_numbers<<>>(hasher, - num_rows, - num_partitions, - partitioner_type(num_partitions), - row_partition_numbers.data().get(), - row_partition_offset.data().get(), - block_partition_sizes.data().get(), - global_partition_sizes.data().get()); + stream.value()>>>(hasher, + num_rows, + num_partitions, + partitioner_type(num_partitions), + row_partition_numbers.data().get(), + row_partition_offset.data().get(), + block_partition_sizes.data().get(), + global_partition_sizes.data().get()); } else { // Determines how the mapping between hash value and partition number is computed using partitioner_type = modulo_partitioner; @@ -508,19 +510,19 @@ std::pair, std::vector> hash_partition_table( compute_row_partition_numbers<<>>(hasher, - num_rows, - num_partitions, - partitioner_type(num_partitions), - row_partition_numbers.data().get(), - row_partition_offset.data().get(), - block_partition_sizes.data().get(), - global_partition_sizes.data().get()); + stream.value()>>>(hasher, + num_rows, + num_partitions, + partitioner_type(num_partitions), + row_partition_numbers.data().get(), + row_partition_offset.data().get(), + block_partition_sizes.data().get(), + global_partition_sizes.data().get()); } // Compute exclusive scan of all blocks' partition sizes in-place to determine // the starting point for each blocks portion of each partition in the output - thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream.value()), block_partition_sizes.begin(), block_partition_sizes.end(), scanned_block_partition_sizes.data().get()); @@ -529,7 +531,7 @@ std::pair, 
std::vector> hash_partition_table( // of each partition in final output. // TODO This can be done independently on a separate stream size_type* scanned_global_partition_sizes{global_partition_sizes.data().get()}; - thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream.value()), global_partition_sizes.begin(), global_partition_sizes.end(), scanned_global_partition_sizes); @@ -541,7 +543,7 @@ std::pair, std::vector> hash_partition_table( scanned_global_partition_sizes, num_partitions * sizeof(size_type), cudaMemcpyDeviceToHost, - stream)); + stream.value())); // When the number of partitions is less than a threshold, we can apply an // optimization using shared memory to copy values to the output buffer. @@ -566,8 +568,8 @@ std::pair, std::vector> hash_partition_table( block_partition_sizes_ptr, scanned_block_partition_sizes_ptr, grid_size, - mr, - stream); + stream, + mr); }); if (has_nulls(input)) { @@ -596,12 +598,17 @@ std::pair, std::vector> hash_partition_table( compute_row_output_locations<<>>( + stream.value()>>>( row_output_locations, num_rows, num_partitions, scanned_block_partition_sizes_ptr); // Use the resulting scatter map to materialize the output - auto output = detail::scatter( - input, row_partition_numbers.begin(), row_partition_numbers.end(), input, false, mr, stream); + auto output = detail::scatter(input, + row_partition_numbers.begin(), + row_partition_numbers.end(), + input, + false, + mr, + stream.value()); return std::make_pair(std::move(output), std::move(partition_offsets)); } @@ -620,8 +627,8 @@ std::pair, std::vector> hash_partition( table_view const& input, std::vector const& columns_to_hash, int num_partitions, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); @@ -633,28 +640,28 @@ std::pair, std::vector> hash_partition( } if (has_nulls(table_to_hash)) { - return hash_partition_table(input, table_to_hash, num_partitions, mr, stream); + return hash_partition_table(input, table_to_hash, num_partitions, stream, mr); } else { - return hash_partition_table(input, table_to_hash, num_partitions, mr, stream); + return hash_partition_table(input, table_to_hash, num_partitions, stream, mr); } } std::unique_ptr hash(table_view const& input, hash_id hash_function, std::vector const& initial_hash, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { switch (hash_function) { - case (hash_id::HASH_MURMUR3): return murmur_hash3_32(input, initial_hash, mr, stream); - case (hash_id::HASH_MD5): return md5_hash(input, mr, stream); + case (hash_id::HASH_MURMUR3): return murmur_hash3_32(input, initial_hash, stream, mr); + case (hash_id::HASH_MD5): return md5_hash(input, stream, mr); default: return nullptr; } } std::unique_ptr md5_hash(table_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.num_columns() == 0 || input.num_rows() == 0) { const string_scalar string_128bit("d41d8cd98f00b204e9orig98ecf8427e"); @@ -673,14 +680,14 @@ std::unique_ptr md5_hash(table_view const& input, "MD5 unsupported column type"); // Result column allocation and creation - auto begin = thrust::make_constant_iterator(32); - auto offsets_column = - cudf::strings::detail::make_offsets_child_column(begin, begin + input.num_rows(), mr, stream); + 
auto begin = thrust::make_constant_iterator(32); + auto offsets_column = cudf::strings::detail::make_offsets_child_column( + begin, begin + input.num_rows(), mr, stream.value()); auto offsets_view = offsets_column->view(); auto d_new_offsets = offsets_view.data(); auto chars_column = strings::detail::create_chars_child_column( - input.num_rows(), 0, input.num_rows() * 32, mr, stream); + input.num_rows(), 0, input.num_rows() * 32, mr, stream.value()); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.data(); @@ -689,7 +696,7 @@ std::unique_ptr md5_hash(table_view const& input, auto const device_input = table_device_view::create(input, stream); // Hash each row, hashing each element sequentially left to right - thrust::for_each(rmm::exec_policy(stream)->on(stream), + thrust::for_each(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(input.num_rows()), [d_chars, device_input = *device_input] __device__(auto row_index) { @@ -718,8 +725,8 @@ std::unique_ptr md5_hash(table_view const& input, std::unique_ptr murmur_hash3_32(table_view const& input, std::vector const& initial_hash, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // TODO this should be UINT32 auto output = make_numeric_column( @@ -739,13 +746,13 @@ std::unique_ptr murmur_hash3_32(table_view const& input, auto device_initial_hash = rmm::device_vector(initial_hash); if (nullable) { - thrust::tabulate(rmm::exec_policy(stream)->on(stream), + thrust::tabulate(rmm::exec_policy(stream)->on(stream.value()), output_view.begin(), output_view.end(), row_hasher_initial_values( *device_input, device_initial_hash.data().get())); } else { - thrust::tabulate(rmm::exec_policy(stream)->on(stream), + thrust::tabulate(rmm::exec_policy(stream)->on(stream.value()), output_view.begin(), output_view.end(), row_hasher_initial_values( @@ -753,12 +760,12 @@ std::unique_ptr murmur_hash3_32(table_view const& input, } } else { if (nullable) { - thrust::tabulate(rmm::exec_policy(stream)->on(stream), + thrust::tabulate(rmm::exec_policy(stream)->on(stream.value()), output_view.begin(), output_view.end(), row_hasher(*device_input)); } else { - thrust::tabulate(rmm::exec_policy(stream)->on(stream), + thrust::tabulate(rmm::exec_policy(stream)->on(stream.value()), output_view.begin(), output_view.end(), row_hasher(*device_input)); @@ -776,7 +783,7 @@ std::unique_ptr hash(table_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::hash(input, hash_function, initial_hash, mr); + return detail::hash(input, hash_function, initial_hash, rmm::cuda_stream_default, mr); } std::unique_ptr murmur_hash3_32(table_view const& input, @@ -784,7 +791,7 @@ std::unique_ptr murmur_hash3_32(table_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::murmur_hash3_32(input, initial_hash, mr); + return detail::murmur_hash3_32(input, initial_hash, rmm::cuda_stream_default, mr); } } // namespace cudf From 2da7bb140261672aa751862c1a53d86f754d4fad Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 11:07:56 +1100 Subject: [PATCH 25/51] Add conda_include_dirs to benchmarks cmakelists --- cpp/benchmarks/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 667498fa965..893ca87e169 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ 
b/cpp/benchmarks/CMakeLists.txt @@ -58,6 +58,11 @@ include_directories("${CMAKE_BINARY_DIR}/include" "${RMM_INCLUDE}" "${CMAKE_CURRENT_SOURCE_DIR}") +if(CONDA_INCLUDE_DIRS) + include_directories("${CONDA_INCLUDE_DIRS}") +endif(CONDA_INCLUDE_DIRS) + + ################################################################################################### # - library paths --------------------------------------------------------------------------------- From be38bdaf1c6ddbe46ce92cd028b8cd9e457c85fc Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 11:39:55 +1100 Subject: [PATCH 26/51] Convert interop to rmm::cuda_stream_view --- cpp/include/cudf/detail/interop.hpp | 18 +++--- cpp/src/interop/dlpack.cpp | 17 ++--- cpp/src/interop/from_arrow.cpp | 99 +++++++++++++++-------------- cpp/src/interop/to_arrow.cpp | 35 +++++----- 4 files changed, 88 insertions(+), 81 deletions(-) diff --git a/cpp/include/cudf/detail/interop.hpp b/cpp/include/cudf/detail/interop.hpp index 8271a04265d..c6d2014f80e 100644 --- a/cpp/include/cudf/detail/interop.hpp +++ b/cpp/include/cudf/detail/interop.hpp @@ -21,6 +21,8 @@ #include #include +#include + namespace cudf { namespace detail { @@ -31,8 +33,8 @@ namespace detail { */ std::unique_ptr
from_dlpack( DLManagedTensor const* managed_tensor, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::to_dlpack @@ -41,8 +43,8 @@ std::unique_ptr
from_dlpack( */ DLManagedTensor* to_dlpack( table_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); // Creating arrow as per given type_id and buffer arguments template @@ -101,8 +103,8 @@ data_type arrow_to_cudf_type(arrow::DataType const& arrow_type); **/ std::shared_ptr to_arrow(table_view input, std::vector const& metadata = {}, - arrow::MemoryPool* ar_mr = arrow::default_memory_pool(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); /** * @copydoc cudf::arrow_to_cudf @@ -111,8 +113,8 @@ std::shared_ptr to_arrow(table_view input, **/ std::unique_ptr
from_arrow( arrow::Table const& input_table, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp index 714e95f28b8..bb79a1d437e 100644 --- a/cpp/src/interop/dlpack.cpp +++ b/cpp/src/interop/dlpack.cpp @@ -21,6 +21,7 @@ #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace { @@ -113,8 +114,8 @@ struct dltensor_context { namespace detail { std::unique_ptr
from_dlpack(DLManagedTensor const* managed_tensor, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(nullptr != managed_tensor, "managed_tensor is null"); auto const& tensor = managed_tensor->dl_tensor; @@ -171,7 +172,7 @@ std::unique_ptr
from_dlpack(DLManagedTensor const* managed_tensor, reinterpret_cast(tensor_data), bytes, cudaMemcpyDefault, - stream)); + stream.value())); tensor_data += col_stride; } @@ -180,8 +181,8 @@ std::unique_ptr
from_dlpack(DLManagedTensor const* managed_tensor, } DLManagedTensor* to_dlpack(table_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto const num_rows = input.num_rows(); auto const num_cols = input.num_columns(); @@ -241,7 +242,7 @@ DLManagedTensor* to_dlpack(table_view const& input, get_column_data(col), stride_bytes, cudaMemcpyDefault, - stream)); + stream.value())); tensor_data += stride_bytes; } @@ -256,12 +257,12 @@ DLManagedTensor* to_dlpack(table_view const& input, std::unique_ptr
from_dlpack(DLManagedTensor const* managed_tensor, rmm::mr::device_memory_resource* mr) { - return detail::from_dlpack(managed_tensor, mr); + return detail::from_dlpack(managed_tensor, rmm::cuda_stream_default, mr); } DLManagedTensor* to_dlpack(table_view const& input, rmm::mr::device_memory_resource* mr) { - return detail::to_dlpack(input, mr); + return detail::to_dlpack(input, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/interop/from_arrow.cpp b/cpp/src/interop/from_arrow.cpp index 045c1174b08..690647d9306 100644 --- a/cpp/src/interop/from_arrow.cpp +++ b/cpp/src/interop/from_arrow.cpp @@ -92,8 +92,8 @@ struct dispatch_to_cudf_column { * @brief Returns mask from an array withut any offsets. */ std::unique_ptr get_mask_buffer(arrow::Array const& array, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (array.null_bitmap_data() == nullptr) { return std::make_unique(0, stream, mr); @@ -107,7 +107,7 @@ struct dispatch_to_cudf_column { reinterpret_cast(mask_buffer->address()), array.null_bitmap()->size(), cudaMemcpyDefault, - stream)); + stream.value())); return mask; } @@ -115,8 +115,8 @@ struct dispatch_to_cudf_column { std::unique_ptr operator()(arrow::Array const& array, data_type type, bool skip_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto data_buffer = array.data()->buffers[1]; size_type const num_rows = array.length(); @@ -128,9 +128,9 @@ struct dispatch_to_cudf_column { reinterpret_cast(data_buffer->address()) + array.offset() * sizeof(T), sizeof(T) * num_rows, cudaMemcpyDefault, - stream)); + stream.value())); if (has_nulls) { - auto tmp_mask = get_mask_buffer(array, mr, stream); + auto tmp_mask = get_mask_buffer(array, stream, mr); // If array is sliced, we have to copy whole mask and then take copy. auto out_mask = (num_rows == static_cast(data_buffer->size() / sizeof(T))) @@ -138,7 +138,7 @@ struct dispatch_to_cudf_column { : cudf::detail::copy_bitmask(static_cast(tmp_mask->data()), array.offset(), array.offset() + num_rows, - rmm::cuda_stream_view{stream}, + stream, mr); col->set_null_mask(std::move(out_mask)); @@ -162,16 +162,16 @@ std::unique_ptr get_empty_type_column(size_type size) std::unique_ptr get_column(arrow::Array const& array, data_type type, bool skip_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); template <> std::unique_ptr dispatch_to_cudf_column::operator()( arrow::Array const& array, data_type type, bool skip_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto data_buffer = array.data()->buffers[1]; auto data = rmm::device_buffer(data_buffer->size(), stream, mr); @@ -179,20 +179,20 @@ std::unique_ptr dispatch_to_cudf_column::operator()( reinterpret_cast(data_buffer->address()), data_buffer->size(), cudaMemcpyDefault, - stream)); + stream.value())); auto out_col = mask_to_bools(static_cast(data.data()), array.offset(), array.offset() + array.length(), - stream, + stream.value(), mr); auto const has_nulls = skip_mask ? 
false : array.null_bitmap_data() != nullptr; if (has_nulls) { auto out_mask = - detail::copy_bitmask(static_cast(get_mask_buffer(array, mr, stream)->data()), + detail::copy_bitmask(static_cast(get_mask_buffer(array, stream, mr)->data()), array.offset(), array.offset() + array.length(), - rmm::cuda_stream_view{stream}, + stream, mr); out_col->set_null_mask(std::move(out_mask)); @@ -206,10 +206,12 @@ std::unique_ptr dispatch_to_cudf_column::operator()( arrow::Array const& array, data_type type, bool skip_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - if (array.length() == 0) { return cudf::strings::detail::make_empty_strings_column(mr, stream); } + if (array.length() == 0) { + return cudf::strings::detail::make_empty_strings_column(mr, stream.value()); + } auto str_array = static_cast(&array); auto offset_array = std::make_unique( str_array->value_offsets()->size() / sizeof(int32_t), str_array->value_offsets(), nullptr); @@ -217,16 +219,16 @@ std::unique_ptr dispatch_to_cudf_column::operator()( str_array->value_data()->size(), str_array->value_data(), nullptr); auto offsets_column = dispatch_to_cudf_column{}.operator()( - *offset_array, data_type(type_id::INT32), true, mr, stream); + *offset_array, data_type(type_id::INT32), true, stream, mr); auto chars_column = dispatch_to_cudf_column{}.operator()( - *char_array, data_type(type_id::INT8), true, mr, stream); + *char_array, data_type(type_id::INT8), true, stream, mr); auto const num_rows = offsets_column->size() - 1; auto out_col = make_strings_column(num_rows, std::move(offsets_column), std::move(chars_column), UNKNOWN_NULL_COUNT, - std::move(*get_mask_buffer(array, mr, stream)), + std::move(*get_mask_buffer(array, stream, mr)), stream, mr); @@ -242,19 +244,20 @@ std::unique_ptr dispatch_to_cudf_column::operator()( arrow::Array const& array, data_type type, bool skip_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto dict_array = static_cast(&array); auto dict_type = arrow_to_cudf_type(*(dict_array->dictionary()->type())); - auto keys_column = get_column(*(dict_array->dictionary()), dict_type, true, mr, stream); + auto keys_column = get_column(*(dict_array->dictionary()), dict_type, true, stream, mr); auto ind_type = arrow_to_cudf_type(*(dict_array->indices()->type())); - auto indices_column = get_column(*(dict_array->indices()), ind_type, false, mr, stream); + auto indices_column = get_column(*(dict_array->indices()), ind_type, false, stream, mr); // If index type is not of type uint32_t, then cast it to uint32_t auto const dict_indices_type = data_type{type_id::UINT32}; if (indices_column->type().id() != dict_indices_type.id()) - indices_column = cudf::detail::cast(indices_column->view(), dict_indices_type, mr, stream); + indices_column = + cudf::detail::cast(indices_column->view(), dict_indices_type, mr, stream.value()); // Child columns shouldn't have masks and we need the mask in main column auto column_contents = indices_column->release(); @@ -272,8 +275,8 @@ std::unique_ptr dispatch_to_cudf_column::operator()( arrow::Array const& array, data_type type, bool skip_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto struct_array = static_cast(&array); std::vector> child_columns; @@ -284,10 +287,10 @@ std::unique_ptr dispatch_to_cudf_column::operator()( 
std::back_inserter(child_columns), [&mr, &stream](auto const& child_array) { auto type = arrow_to_cudf_type(*(child_array->type())); - return get_column(*child_array, type, false, mr, stream); + return get_column(*child_array, type, false, stream, mr); }); - auto out_mask = *(get_mask_buffer(array, mr, stream)); + auto out_mask = *(get_mask_buffer(array, stream, mr)); if (struct_array->null_bitmap_data() != nullptr) { out_mask = detail::copy_bitmask(static_cast(out_mask.data()), array.offset(), @@ -305,24 +308,24 @@ std::unique_ptr dispatch_to_cudf_column::operator()( arrow::Array const& array, data_type type, bool skip_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto list_array = static_cast(&array); auto offset_array = std::make_unique( list_array->value_offsets()->size() / sizeof(int32_t), list_array->value_offsets(), nullptr); auto offsets_column = dispatch_to_cudf_column{}.operator()( - *offset_array, data_type(type_id::INT32), true, mr, stream); + *offset_array, data_type(type_id::INT32), true, stream, mr); auto child_type = arrow_to_cudf_type(*(list_array->values()->type())); - auto child_column = get_column(*(list_array->values()), child_type, false, mr, stream); + auto child_column = get_column(*(list_array->values()), child_type, false, stream, mr); auto const num_rows = offsets_column->size() - 1; auto out_col = make_lists_column(num_rows, std::move(offsets_column), std::move(child_column), UNKNOWN_NULL_COUNT, - std::move(*get_mask_buffer(array, mr, stream)), + std::move(*get_mask_buffer(array, stream, mr)), stream, mr); @@ -336,19 +339,19 @@ std::unique_ptr dispatch_to_cudf_column::operator()( std::unique_ptr get_column(arrow::Array const& array, data_type type, bool skip_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return type.id() != type_id::EMPTY - ? type_dispatcher(type, dispatch_to_cudf_column{}, array, type, skip_mask, mr, stream) + ? type_dispatcher(type, dispatch_to_cudf_column{}, array, type, skip_mask, stream, mr) : get_empty_type_column(array.length()); } } // namespace std::unique_ptr
from_arrow(arrow::Table const& input_table,
-                                  rmm::mr::device_memory_resource* mr,
-                                  cudaStream_t stream)
+                                  rmm::cuda_stream_view stream,
+                                  rmm::mr::device_memory_resource* mr)
 {
   if (input_table.num_columns() == 0) { return std::make_unique<table>
(); }

   std::vector<std::unique_ptr<column>> columns;
@@ -363,12 +366,12 @@ std::unique_ptr<table>
from_arrow(arrow::Table const& input_table, if (cudf_type.id() == type_id::EMPTY) { return get_empty_type_column(chunked_array->length()); } - transform(array_chunks.begin(), - array_chunks.end(), - std::back_inserter(concat_columns), - [&cudf_type, &mr, &stream](auto const& array_chunk) { - return get_column(*array_chunk, cudf_type, false, mr, stream); - }); + std::transform(array_chunks.begin(), + array_chunks.end(), + std::back_inserter(concat_columns), + [&cudf_type, &mr, &stream](auto const& array_chunk) { + return get_column(*array_chunk, cudf_type, false, stream, mr); + }); if (concat_columns.empty()) { return std::make_unique(cudf_type, 0, rmm::device_buffer(0)); } else if (concat_columns.size() == 1) { @@ -393,7 +396,7 @@ std::unique_ptr
from_arrow(arrow::Table const& input_table, { CUDF_FUNC_RANGE(); - return detail::from_arrow(input_table, mr); + return detail::from_arrow(input_table, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/interop/to_arrow.cpp b/cpp/src/interop/to_arrow.cpp index 13afde2ee4b..4f7a939b055 100644 --- a/cpp/src/interop/to_arrow.cpp +++ b/cpp/src/interop/to_arrow.cpp @@ -27,6 +27,7 @@ #include #include +#include #include namespace cudf { @@ -39,7 +40,7 @@ namespace { template std::shared_ptr fetch_data_buffer(column_view input_view, arrow::MemoryPool* ar_mr, - cudaStream_t stream) + rmm::cuda_stream_view stream) { const int64_t data_size_in_bytes = sizeof(T) * input_view.size(); @@ -52,7 +53,7 @@ std::shared_ptr fetch_data_buffer(column_view input_view, input_view.data(), data_size_in_bytes, cudaMemcpyDeviceToHost, - stream)); + stream.value())); return data_buffer; } @@ -62,7 +63,7 @@ std::shared_ptr fetch_data_buffer(column_view input_view, */ std::shared_ptr fetch_mask_buffer(column_view input_view, arrow::MemoryPool* ar_mr, - cudaStream_t stream) + rmm::cuda_stream_view stream) { const int64_t mask_size_in_bytes = cudf::bitmask_allocation_size_bytes(input_view.size()); @@ -75,7 +76,7 @@ std::shared_ptr fetch_mask_buffer(column_view input_view, (input_view.offset() > 0) ? cudf::copy_bitmask(input_view).data() : input_view.null_mask(), mask_size_in_bytes, cudaMemcpyDeviceToHost, - stream)); + stream.value())); // Resets all padded bits to 0 mask_buffer->ZeroPadding(); @@ -97,7 +98,7 @@ struct dispatch_to_arrow { column_view input_view, std::vector const& metadata, arrow::MemoryPool* ar_mr, - cudaStream_t stream) + rmm::cuda_stream_view stream) { std::vector> child_arrays; std::vector child_indices(input_view.num_children()); @@ -119,7 +120,7 @@ struct dispatch_to_arrow { cudf::type_id id, column_metadata const& metadata, arrow::MemoryPool* ar_mr, - cudaStream_t stream) + rmm::cuda_stream_view stream) { return to_arrow_array(id, static_cast(input_view.size()), @@ -134,9 +135,9 @@ std::shared_ptr dispatch_to_arrow::operator()(column_view in cudf::type_id id, column_metadata const& metadata, arrow::MemoryPool* ar_mr, - cudaStream_t stream) + rmm::cuda_stream_view stream) { - auto bitmask = bools_to_mask(input, rmm::mr::get_current_device_resource(), stream); + auto bitmask = bools_to_mask(input, rmm::mr::get_current_device_resource(), stream.value()); auto result = arrow::AllocateBuffer(static_cast(bitmask.first->size()), ar_mr); CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer for data"); @@ -147,7 +148,7 @@ std::shared_ptr dispatch_to_arrow::operator()(column_view in bitmask.first->data(), bitmask.first->size(), cudaMemcpyDeviceToHost, - stream)); + stream.value())); return to_arrow_array(id, static_cast(input.size()), data_buffer, @@ -161,7 +162,7 @@ std::shared_ptr dispatch_to_arrow::operator()( cudf::type_id id, column_metadata const& metadata, arrow::MemoryPool* ar_mr, - cudaStream_t stream) + rmm::cuda_stream_view stream) { std::unique_ptr tmp_column = ((input.offset() != 0) or @@ -201,7 +202,7 @@ std::shared_ptr dispatch_to_arrow::operator()( cudf::type_id id, column_metadata const& metadata, arrow::MemoryPool* ar_mr, - cudaStream_t stream) + rmm::cuda_stream_view stream) { CUDF_EXPECTS(metadata.children_meta.size() == input.num_children(), "Number of field names and number of children doesn't match\n"); @@ -237,7 +238,7 @@ std::shared_ptr dispatch_to_arrow::operator()( cudf::type_id id, column_metadata const& metadata, arrow::MemoryPool* ar_mr, - 
cudaStream_t stream) + rmm::cuda_stream_view stream) { std::unique_ptr tmp_column = nullptr; if ((input.offset() != 0) or @@ -269,14 +270,14 @@ std::shared_ptr dispatch_to_arrow::operator()( cudf::type_id id, column_metadata const& metadata, arrow::MemoryPool* ar_mr, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // Arrow dictionary requires indices to be signed integer std::unique_ptr dict_indices = cast(cudf::dictionary_column_view(input).get_indices_annotated(), cudf::data_type{type_id::INT32}, rmm::mr::get_current_device_resource(), - stream); + stream.value()); auto indices = dispatch_to_arrow{}.operator()( dict_indices->view(), dict_indices->type().id(), {}, ar_mr, stream); auto dict_keys = cudf::dictionary_column_view(input).keys(); @@ -296,8 +297,8 @@ std::shared_ptr dispatch_to_arrow::operator()( std::shared_ptr to_arrow(table_view input, std::vector const& metadata, - arrow::MemoryPool* ar_mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + arrow::MemoryPool* ar_mr) { CUDF_EXPECTS((metadata.size() == input.num_columns()), "columns' metadata should be equal to number of columns in table"); @@ -335,7 +336,7 @@ std::shared_ptr to_arrow(table_view input, { CUDF_FUNC_RANGE(); - return detail::to_arrow(input, metadata, ar_mr); + return detail::to_arrow(input, metadata, rmm::cuda_stream_default, ar_mr); } } // namespace cudf From a546bcc8c576c67e5caa39f73d0b44cfa1562981 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 11:55:36 +1100 Subject: [PATCH 27/51] Add missing dlpack and to_arrow synchronization. --- cpp/src/interop/dlpack.cpp | 6 ++++++ cpp/src/interop/to_arrow.cpp | 9 ++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp index bb79a1d437e..1ae6119aefd 100644 --- a/cpp/src/interop/dlpack.cpp +++ b/cpp/src/interop/dlpack.cpp @@ -249,6 +249,12 @@ DLManagedTensor* to_dlpack(table_view const& input, // Defer ownership of managed tensor to caller managed_tensor->deleter = dltensor_context::deleter; managed_tensor->manager_ctx = context.release(); + + // synchronize the stream because after the return the data may be accessed from the host before + // the above `cudaMemcpyAsync` calls have completed their copies (especially if pinned host + // memory is used). + stream.synchronize(); + return managed_tensor.release(); } diff --git a/cpp/src/interop/to_arrow.cpp b/cpp/src/interop/to_arrow.cpp index 4f7a939b055..5f270597403 100644 --- a/cpp/src/interop/to_arrow.cpp +++ b/cpp/src/interop/to_arrow.cpp @@ -326,7 +326,14 @@ std::shared_ptr to_arrow(table_view input, std::back_inserter(fields), [](auto const& array, auto const& meta) { return arrow::field(meta.name, array->type()); }); - return arrow::Table::Make(arrow::schema(fields), arrays); + auto result = arrow::Table::Make(arrow::schema(fields), arrays); + + // synchronize the stream because after the return the data may be accessed from the host before + // the above `cudaMemcpyAsync` calls have completed their copies (especially if pinned host + // memory is used). 
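+  // As an illustration only (a hedged sketch, not code from this patch; `host_buf`, `d_ptr`
+  // and `nbytes` are hypothetical names), the hazard is a host read racing an asynchronous
+  // device-to-host copy:
+  //   cudaMemcpyAsync(host_buf, d_ptr, nbytes, cudaMemcpyDeviceToHost, stream.value());
+  //   stream.synchronize();         // without this, reading host_buf below is a data race
+  //   auto first_byte = host_buf[0];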
+ stream.synchronize(); + + return result; } } // namespace detail From 7d863dc439e145e9bc53797e6ee2ce00acb5e59b Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 12:44:41 +1100 Subject: [PATCH 28/51] Convert reductions, quantiles to cuda_stream_view --- cpp/include/cudf/detail/quantiles.hpp | 56 +++++++++++++++ cpp/include/cudf/detail/reduction.cuh | 33 ++++----- .../cudf/detail/reduction_functions.hpp | 51 +++++++------ cpp/src/quantiles/quantile.cu | 71 ++++++++++++++----- cpp/src/reductions/all.cu | 13 ++-- cpp/src/reductions/any.cu | 13 ++-- cpp/src/reductions/compound.cuh | 36 +++++----- cpp/src/reductions/max.cu | 13 ++-- cpp/src/reductions/mean.cu | 13 ++-- cpp/src/reductions/min.cu | 13 ++-- cpp/src/reductions/nth_element.cu | 13 ++-- cpp/src/reductions/product.cu | 13 ++-- cpp/src/reductions/reductions.cpp | 71 ++++++++++--------- cpp/src/reductions/simple.cuh | 33 ++++----- cpp/src/reductions/std.cu | 13 ++-- cpp/src/reductions/sum.cu | 13 ++-- cpp/src/reductions/sum_of_squares.cu | 11 +-- cpp/src/reductions/var.cu | 13 ++-- 18 files changed, 315 insertions(+), 177 deletions(-) create mode 100644 cpp/include/cudf/detail/quantiles.hpp diff --git a/cpp/include/cudf/detail/quantiles.hpp b/cpp/include/cudf/detail/quantiles.hpp new file mode 100644 index 00000000000..e93886c4f11 --- /dev/null +++ b/cpp/include/cudf/detail/quantiles.hpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +namespace cudf { +namespace detail { + +/** @copydoc cudf::quantile(column_view const&, std::vector const&, interpolation, + column_view const&, bool, rmm::mr::device_memory_resource*) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr quantile( + column_view const& input, + std::vector const& q, + interpolation interp = interpolation::LINEAR, + column_view const& ordered_indices = {}, + bool exact = true, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @copydoc cudf::quantiles(table_view const&, std::vector const&, interpolation, + cudf::sorted, std::vector const&, std::vector const&, + rmm::mr::device_memory_resource*) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr
quantiles( + table_view const& input, + std::vector const& q, + interpolation interp = interpolation::NEAREST, + cudf::sorted is_input_sorted = sorted::NO, + std::vector const& column_order = {}, + std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace detail +} // namespace cudf diff --git a/cpp/include/cudf/detail/reduction.cuh b/cpp/include/cudf/detail/reduction.cuh index 84cde38fab8..063114adbc3 100644 --- a/cpp/include/cudf/detail/reduction.cuh +++ b/cpp/include/cudf/detail/reduction.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ #include #include #include "reduction_operators.cuh" +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace reduction { @@ -49,8 +50,8 @@ template reduce(InputIterator d_in, cudf::size_type num_items, op::simple_op sop, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto binary_op = sop.get_binary_op(); OutputType identity = sop.template get_identity(); @@ -66,7 +67,7 @@ std::unique_ptr reduce(InputIterator d_in, num_items, binary_op, identity, - stream); + stream.value()); d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream}; // Run reduction @@ -77,7 +78,7 @@ std::unique_ptr reduce(InputIterator d_in, num_items, binary_op, identity, - stream); + stream.value()); // only for string_view, data is copied auto s = new cudf::scalar_type_t(std::move(dev_result), true, stream, mr); @@ -92,8 +93,8 @@ template reduce(InputIterator d_in, cudf::size_type num_items, op::simple_op sop, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto binary_op = sop.get_binary_op(); OutputType identity = sop.template get_identity(); @@ -109,7 +110,7 @@ std::unique_ptr reduce(InputIterator d_in, num_items, binary_op, identity, - stream); + stream.value()); d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream}; // Run reduction @@ -120,7 +121,7 @@ std::unique_ptr reduce(InputIterator d_in, num_items, binary_op, identity, - stream); + stream.value()); using ScalarType = cudf::scalar_type_t; auto s = new ScalarType(dev_result, true, stream, mr); // only for string_view, data is copied @@ -135,8 +136,8 @@ template reduce(InputIterator d_in, cudf::size_type num_items, op::simple_op sop, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("dictionary type not supported"); } @@ -169,8 +170,8 @@ std::unique_ptr reduce(InputIterator d_in, op::compound_op cop, cudf::size_type valid_count, cudf::size_type ddof, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto binary_op = cop.get_binary_op(); IntermediateType identity = cop.template get_identity(); @@ -186,7 +187,7 @@ std::unique_ptr reduce(InputIterator d_in, num_items, binary_op, identity, - stream); + stream.value()); d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream}; // Run reduction @@ -197,12 +198,12 @@ std::unique_ptr reduce(InputIterator d_in, num_items, binary_op, identity, - 
stream); + stream.value()); // compute the result value from intermediate value in device using ScalarType = cudf::scalar_type_t; auto result = new ScalarType(OutputType{0}, true, stream, mr); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), intermediate_result.data(), 1, [dres = result->data(), cop, valid_count, ddof] __device__(auto i) { diff --git a/cpp/include/cudf/detail/reduction_functions.hpp b/cpp/include/cudf/detail/reduction_functions.hpp index 99c7a679600..01df55dea05 100644 --- a/cpp/include/cudf/detail/reduction_functions.hpp +++ b/cpp/include/cudf/detail/reduction_functions.hpp @@ -19,6 +19,8 @@ #include #include +#include + namespace cudf { namespace reduction { /** @@ -38,8 +40,9 @@ namespace reduction { std::unique_ptr sum( column_view const& col, data_type const output_dtype, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Computes minimum of elements in input column * @@ -56,8 +59,9 @@ std::unique_ptr sum( std::unique_ptr min( column_view const& col, data_type const output_dtype, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Computes maximum of elements in input column * @@ -74,8 +78,9 @@ std::unique_ptr min( std::unique_ptr max( column_view const& col, data_type const output_dtype, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Computes any of elements in input column is true when typecasted to bool * @@ -93,8 +98,9 @@ std::unique_ptr max( std::unique_ptr any( column_view const& col, data_type const output_dtype, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Computes all of elements in input column is true when typecasted to bool * @@ -112,8 +118,9 @@ std::unique_ptr any( std::unique_ptr all( column_view const& col, data_type const output_dtype, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Computes product of elements in input column * @@ -131,8 +138,8 @@ std::unique_ptr all( std::unique_ptr product( column_view const& col, data_type const output_dtype, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Computes sum of squares of elements in input column @@ -151,8 +158,8 @@ std::unique_ptr product( std::unique_ptr sum_of_squares( column_view const& col, data_type const output_dtype, - rmm::mr::device_memory_resource* mr = 
rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Computes mean of elements in input column @@ -171,8 +178,8 @@ std::unique_ptr sum_of_squares( std::unique_ptr mean( column_view const& col, data_type const output_dtype, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Computes variance of elements in input column @@ -192,8 +199,8 @@ std::unique_ptr variance( column_view const& col, data_type const output_dtype, cudf::size_type ddof, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Computes standard deviation of elements in input column @@ -213,8 +220,8 @@ std::unique_ptr standard_deviation( column_view const& col, data_type const output_dtype, cudf::size_type ddof, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Returns nth element in input column @@ -244,8 +251,8 @@ std::unique_ptr nth_element( column_view const& col, size_type n, null_policy null_handling, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace reduction } // namespace cudf diff --git a/cpp/src/quantiles/quantile.cu b/cpp/src/quantiles/quantile.cu index 31205f292c0..d4241157817 100644 --- a/cpp/src/quantiles/quantile.cu +++ b/cpp/src/quantiles/quantile.cu @@ -39,8 +39,8 @@ struct quantile_functor { std::vector const& q; interpolation interp; bool retain_types; - rmm::mr::device_memory_resource* mr; rmm::cuda_stream_view stream; + rmm::mr::device_memory_resource* mr; template std::enable_if_t::value, std::unique_ptr> operator()( @@ -55,9 +55,8 @@ struct quantile_functor { { using Result = std::conditional_t; - auto type = data_type{type_to_id()}; - auto output = - make_fixed_width_column(type, q.size(), mask_state::UNALLOCATED, stream.value(), mr); + auto type = data_type{type_to_id()}; + auto output = make_fixed_width_column(type, q.size(), mask_state::UNALLOCATED, stream, mr); if (output->size() == 0) { return output; } @@ -112,33 +111,42 @@ std::unique_ptr quantile(column_view const& input, std::vector const& q, interpolation interp, bool retain_types, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto functor = quantile_functor{ - ordered_indices, size, q, interp, retain_types, mr, stream}; + ordered_indices, size, q, interp, retain_types, stream, mr}; return type_dispatcher(input.type(), functor, input); } -} // namespace detail - std::unique_ptr quantile(column_view const& input, std::vector const& q, interpolation interp, column_view const& ordered_indices, bool exact, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* 
mr) { - CUDF_FUNC_RANGE(); - if (ordered_indices.is_empty()) { if (exact) { - return detail::quantile( - input, thrust::make_counting_iterator(0), input.size(), q, interp, exact, mr, 0); + return detail::quantile(input, + thrust::make_counting_iterator(0), + input.size(), + q, + interp, + exact, + stream, + mr); } else { - return detail::quantile( - input, thrust::make_counting_iterator(0), input.size(), q, interp, exact, mr, 0); + return detail::quantile(input, + thrust::make_counting_iterator(0), + input.size(), + q, + interp, + exact, + stream, + mr); } } else { @@ -146,13 +154,38 @@ std::unique_ptr quantile(column_view const& input, "`ordered_indicies` type must be `INT32`."); if (exact) { - return detail::quantile( - input, ordered_indices.data(), ordered_indices.size(), q, interp, exact, mr, 0); + return detail::quantile(input, + ordered_indices.data(), + ordered_indices.size(), + q, + interp, + exact, + stream, + mr); } else { - return detail::quantile( - input, ordered_indices.data(), ordered_indices.size(), q, interp, exact, mr, 0); + return detail::quantile(input, + ordered_indices.data(), + ordered_indices.size(), + q, + interp, + exact, + stream, + mr); } } } +} // namespace detail + +std::unique_ptr quantile(column_view const& input, + std::vector const& q, + interpolation interp, + column_view const& ordered_indices, + bool exact, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::quantile(input, q, interp, ordered_indices, exact, rmm::cuda_stream_default, mr); +} + } // namespace cudf diff --git a/cpp/src/reductions/all.cu b/cpp/src/reductions/all.cu index 80a0b25176e..496ea822e92 100644 --- a/cpp/src/reductions/all.cu +++ b/cpp/src/reductions/all.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,15 +15,18 @@ */ // The translation unit for reduction `max` -#include #include "simple.cuh" +#include + +#include + std::unique_ptr cudf::reduction::all(column_view const& col, cudf::data_type const output_dtype, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(output_dtype == cudf::data_type(cudf::type_id::BOOL8), "all() operation can be applied with output type `bool8` only"); - return cudf::reduction::min(col, cudf::data_type(cudf::type_id::BOOL8), mr, stream); + return cudf::reduction::min(col, cudf::data_type(cudf::type_id::BOOL8), stream, mr); } diff --git a/cpp/src/reductions/any.cu b/cpp/src/reductions/any.cu index ff04714190b..91d2c2f767a 100644 --- a/cpp/src/reductions/any.cu +++ b/cpp/src/reductions/any.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,15 +15,18 @@ */ // The translation unit for reduction `max` -#include #include "simple.cuh" +#include + +#include + std::unique_ptr cudf::reduction::any(column_view const& col, cudf::data_type const output_dtype, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(output_dtype == cudf::data_type(cudf::type_id::BOOL8), "any() operation can be applied with output type `bool8` only"); - return cudf::reduction::max(col, cudf::data_type(cudf::type_id::BOOL8), mr, stream); + return cudf::reduction::max(col, cudf::data_type(cudf::type_id::BOOL8), stream, mr); } diff --git a/cpp/src/reductions/compound.cuh b/cpp/src/reductions/compound.cuh index 4bda26409fd..18baa37bd21 100644 --- a/cpp/src/reductions/compound.cuh +++ b/cpp/src/reductions/compound.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,8 @@ #include #include + +#include #include namespace cudf { @@ -33,9 +35,9 @@ namespace compound { * @param[in] ddof `Delta Degrees of Freedom` used for `std`, `var`. * The divisor used in calculations is N - ddof, where N * represents the number of elements. - * @param[in] mr Device memory resource used to allocate the returned scalar's device memory * @param[in] stream CUDA stream used for device memory operations and kernel launches. - * @returns Output scalar in device memory + * @param[in] mr Device memory resource used to allocate the returned scalar's device memory + * @return Output scalar in device memory * * @tparam ElementType the input column cudf dtype * @tparam ResultType the output cudf dtype @@ -46,8 +48,8 @@ template std::unique_ptr compound_reduction(column_view const& col, data_type const output_dtype, cudf::size_type ddof, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { cudf::size_type valid_count = col.size() - col.null_count(); @@ -61,12 +63,12 @@ std::unique_ptr compound_reduction(column_view const& col, dcol->pair_begin(), compound_op.template get_null_replacing_element_transformer()); result = detail::reduce( - it, col.size(), compound_op, valid_count, ddof, mr, stream); + it, col.size(), compound_op, valid_count, ddof, stream, mr); } else { auto it = thrust::make_transform_iterator( dcol->begin(), compound_op.template get_element_transformer()); result = detail::reduce( - it, col.size(), compound_op, valid_count, ddof, mr, stream); + it, col.size(), compound_op, valid_count, ddof, stream, mr); } // set scalar is valid if (col.null_count() < col.size()) @@ -93,18 +95,18 @@ struct result_type_dispatcher { std::unique_ptr operator()(column_view const& col, cudf::data_type const output_dtype, cudf::size_type ddof, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return compound_reduction(col, output_dtype, ddof, mr, stream); + return compound_reduction(col, output_dtype, ddof, stream, mr); } template ()>* = nullptr> std::unique_ptr operator()(column_view const& col, cudf::data_type const output_dtype, cudf::size_type ddof, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("Unsupported output data type"); } 
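For reference, a minimal sketch (not part of this patch) of a caller driving a compound reduction through the new argument order, where the stream precedes the memory resource and `mr` keeps its default; `sample_stddev` and `input` are hypothetical names, and the call matches the `cudf::reduction::standard_deviation` declaration in reduction_functions.hpp above:

  #include <cudf/column/column_view.hpp>
  #include <cudf/detail/reduction_functions.hpp>
  #include <rmm/cuda_stream_view.hpp>
  #include <memory>

  std::unique_ptr<cudf::scalar> sample_stddev(cudf::column_view const& input,
                                              rmm::cuda_stream_view stream)
  {
    // ddof = 1 selects the sample statistic (divisor N - 1); the memory resource argument
    // is omitted so it keeps its default of rmm::mr::get_current_device_resource().
    return cudf::reduction::standard_deviation(
      input, cudf::data_type{cudf::type_id::FLOAT64}, 1, stream);
  }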
@@ -126,19 +128,19 @@ struct element_type_dispatcher { std::unique_ptr operator()(column_view const& col, cudf::data_type const output_dtype, cudf::size_type ddof, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return cudf::type_dispatcher( - output_dtype, result_type_dispatcher(), col, output_dtype, ddof, mr, stream); + output_dtype, result_type_dispatcher(), col, output_dtype, ddof, stream, mr); } template ()>* = nullptr> std::unique_ptr operator()(column_view const& col, cudf::data_type const output_dtype, cudf::size_type ddof, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL( "Reduction operators other than `min` and `max`" diff --git a/cpp/src/reductions/max.cu b/cpp/src/reductions/max.cu index 74084091d5b..88819783af7 100644 --- a/cpp/src/reductions/max.cu +++ b/cpp/src/reductions/max.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,14 +15,17 @@ */ // The translation unit for reduction `max` -#include #include "simple.cuh" +#include + +#include + std::unique_ptr cudf::reduction::max(column_view const& col, cudf::data_type const output_dtype, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using reducer = cudf::reduction::simple::element_type_dispatcher; - return cudf::type_dispatcher(col.type(), reducer(), col, output_dtype, mr, stream); + return cudf::type_dispatcher(col.type(), reducer(), col, output_dtype, stream, mr); } diff --git a/cpp/src/reductions/mean.cu b/cpp/src/reductions/mean.cu index 5d2d3b17b2c..b05016ed257 100644 --- a/cpp/src/reductions/mean.cu +++ b/cpp/src/reductions/mean.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,15 +15,18 @@ */ // The translation unit for reduction `mean` -#include #include "compound.cuh" +#include + +#include + std::unique_ptr cudf::reduction::mean(column_view const& col, cudf::data_type const output_dtype, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using reducer = cudf::reduction::compound::element_type_dispatcher; return cudf::type_dispatcher( - col.type(), reducer(), col, output_dtype, /* ddof is not used for mean*/ 1, mr, stream); + col.type(), reducer(), col, output_dtype, /* ddof is not used for mean*/ 1, stream, mr); } diff --git a/cpp/src/reductions/min.cu b/cpp/src/reductions/min.cu index 67c2e714a52..fcbdf456de2 100644 --- a/cpp/src/reductions/min.cu +++ b/cpp/src/reductions/min.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,14 +15,17 @@ */ // The translation unit for reduction `min` -#include #include "simple.cuh" +#include + +#include + std::unique_ptr cudf::reduction::min(column_view const& col, data_type const output_dtype, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using reducer = cudf::reduction::simple::element_type_dispatcher; - return cudf::type_dispatcher(col.type(), reducer(), col, output_dtype, mr, stream); + return cudf::type_dispatcher(col.type(), reducer(), col, output_dtype, stream, mr); } diff --git a/cpp/src/reductions/nth_element.cu b/cpp/src/reductions/nth_element.cu index f68270cf8b3..85e0b8afde9 100644 --- a/cpp/src/reductions/nth_element.cu +++ b/cpp/src/reductions/nth_element.cu @@ -20,14 +20,16 @@ #include #include -#include +#include #include +#include + std::unique_ptr cudf::reduction::nth_element(column_view const& col, size_type n, null_policy null_handling, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(n >= -col.size() and n < col.size(), "Index out of bounds"); auto wrap_n = [n](size_type size) { return (n < 0 ? size + n : n); }; @@ -41,11 +43,12 @@ std::unique_ptr cudf::reduction::nth_element(column_view const& co [] __device__(auto b) { return static_cast(b); }); rmm::device_uvector null_skipped_index(col.size(), stream); // null skipped index for valids only. - thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream.value()), bitmask_iterator, bitmask_iterator + col.size(), null_skipped_index.begin()); - auto n_pos = thrust::upper_bound(rmm::exec_policy(stream)->on(stream), + + auto n_pos = thrust::upper_bound(rmm::exec_policy(stream)->on(stream.value()), null_skipped_index.begin(), null_skipped_index.end(), n); diff --git a/cpp/src/reductions/product.cu b/cpp/src/reductions/product.cu index 5b9b78ec2ce..8f23bbb88cc 100644 --- a/cpp/src/reductions/product.cu +++ b/cpp/src/reductions/product.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,14 +15,17 @@ */ // The translation unit for reduction `product` -#include #include "simple.cuh" +#include + +#include + std::unique_ptr cudf::reduction::product(column_view const& col, cudf::data_type const output_dtype, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using reducer = cudf::reduction::simple::element_type_dispatcher; - return cudf::type_dispatcher(col.type(), reducer(), col, output_dtype, mr, stream); + return cudf::type_dispatcher(col.type(), reducer(), col, output_dtype, stream, mr); } diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index de4608ed391..7afebaab154 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -17,27 +17,28 @@ #include #include #include -#include -#include -#include - #include +#include #include +#include +#include #include #include +#include + namespace cudf { namespace detail { struct reduce_dispatch_functor { column_view const col; data_type output_dtype; rmm::mr::device_memory_resource *mr; - cudaStream_t stream; + rmm::cuda_stream_view stream; reduce_dispatch_functor(column_view const &col, data_type output_dtype, - rmm::mr::device_memory_resource *mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) : col(col), output_dtype(output_dtype), mr(mr), stream(stream) { } @@ -46,55 +47,59 @@ struct reduce_dispatch_functor { std::unique_ptr operator()(std::unique_ptr const &agg) { switch (k) { - case aggregation::SUM: return reduction::sum(col, output_dtype, mr, stream); break; - case aggregation::PRODUCT: return reduction::product(col, output_dtype, mr, stream); break; - case aggregation::MIN: return reduction::min(col, output_dtype, mr, stream); break; - case aggregation::MAX: return reduction::max(col, output_dtype, mr, stream); break; - case aggregation::ANY: return reduction::any(col, output_dtype, mr, stream); break; - case aggregation::ALL: return reduction::all(col, output_dtype, mr, stream); break; + case aggregation::SUM: return reduction::sum(col, output_dtype, stream, mr); break; + case aggregation::PRODUCT: return reduction::product(col, output_dtype, stream, mr); break; + case aggregation::MIN: return reduction::min(col, output_dtype, stream, mr); break; + case aggregation::MAX: return reduction::max(col, output_dtype, stream, mr); break; + case aggregation::ANY: return reduction::any(col, output_dtype, stream, mr); break; + case aggregation::ALL: return reduction::all(col, output_dtype, stream, mr); break; case aggregation::SUM_OF_SQUARES: - return reduction::sum_of_squares(col, output_dtype, mr, stream); + return reduction::sum_of_squares(col, output_dtype, stream, mr); break; - case aggregation::MEAN: return reduction::mean(col, output_dtype, mr, stream); break; + case aggregation::MEAN: return reduction::mean(col, output_dtype, stream, mr); break; case aggregation::VARIANCE: { auto var_agg = static_cast(agg.get()); - return reduction::variance(col, output_dtype, var_agg->_ddof, mr, stream); + return reduction::variance(col, output_dtype, var_agg->_ddof, stream, mr); } break; case aggregation::STD: { auto var_agg = static_cast(agg.get()); - return reduction::standard_deviation(col, output_dtype, var_agg->_ddof, mr, stream); + return reduction::standard_deviation(col, output_dtype, var_agg->_ddof, stream, mr); } break; case aggregation::MEDIAN: { - auto sorted_indices = sorted_order(table_view{{col}}, {}, {null_order::AFTER}, 
mr); + auto sorted_indices = + detail::sorted_order(table_view{{col}}, {}, {null_order::AFTER}, mr, stream.value()); auto valid_sorted_indices = split(*sorted_indices, {col.size() - col.null_count()})[0]; - auto col_ptr = quantile(col, {0.5}, interpolation::LINEAR, valid_sorted_indices, true, mr); + auto col_ptr = detail::quantile( + col, {0.5}, interpolation::LINEAR, valid_sorted_indices, true, stream, mr); return get_element(*col_ptr, 0, mr); } break; case aggregation::QUANTILE: { auto quantile_agg = static_cast(agg.get()); CUDF_EXPECTS(quantile_agg->_quantiles.size() == 1, "Reduction quantile accepts only one quantile value"); - auto sorted_indices = sorted_order(table_view{{col}}, {}, {null_order::AFTER}, mr); + auto sorted_indices = + detail::sorted_order(table_view{{col}}, {}, {null_order::AFTER}, mr, stream.value()); auto valid_sorted_indices = split(*sorted_indices, {col.size() - col.null_count()})[0]; - auto col_ptr = quantile(col, - quantile_agg->_quantiles, - quantile_agg->_interpolation, - valid_sorted_indices, - true, - mr); + auto col_ptr = detail::quantile(col, + quantile_agg->_quantiles, + quantile_agg->_interpolation, + valid_sorted_indices, + true, + stream, + mr); return get_element(*col_ptr, 0, mr); } break; case aggregation::NUNIQUE: { auto nunique_agg = static_cast(agg.get()); return make_fixed_width_scalar( detail::distinct_count( - col, nunique_agg->_null_handling, nan_policy::NAN_IS_VALID, stream), - stream, + col, nunique_agg->_null_handling, nan_policy::NAN_IS_VALID, stream.value()), + stream.value(), mr); } break; case aggregation::NTH_ELEMENT: { auto nth_agg = static_cast(agg.get()); - return reduction::nth_element(col, nth_agg->_n, nth_agg->_null_handling, mr, stream); + return reduction::nth_element(col, nth_agg->_n, nth_agg->_null_handling, stream, mr); } break; default: CUDF_FAIL("Unsupported reduction operator"); } @@ -105,8 +110,8 @@ std::unique_ptr reduce( column_view const &col, std::unique_ptr const &agg, data_type output_dtype, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { std::unique_ptr result = make_default_constructed_scalar(output_dtype); result->set_valid(false, stream); @@ -115,7 +120,7 @@ std::unique_ptr reduce( if (col.size() <= col.null_count()) return result; result = - aggregation_dispatcher(agg->kind, reduce_dispatch_functor{col, output_dtype, mr, stream}, agg); + aggregation_dispatcher(agg->kind, reduce_dispatch_functor{col, output_dtype, stream, mr}, agg); return result; } } // namespace detail @@ -126,7 +131,7 @@ std::unique_ptr reduce(column_view const &col, rmm::mr::device_memory_resource *mr) { CUDF_FUNC_RANGE(); - return detail::reduce(col, agg, output_dtype, mr); + return detail::reduce(col, agg, output_dtype, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/reductions/simple.cuh b/cpp/src/reductions/simple.cuh index 980b709e241..c10c163d0c4 100644 --- a/cpp/src/reductions/simple.cuh +++ b/cpp/src/reductions/simple.cuh @@ -17,11 +17,12 @@ #pragma once #include - #include +#include #include #include -#include "cudf/structs/struct_view.hpp" + +#include namespace cudf { namespace reduction { @@ -42,8 +43,8 @@ namespace simple { template std::unique_ptr simple_reduction(column_view const& col, data_type const output_dtype, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view 
stream, + rmm::mr::device_memory_resource* mr) { // reduction by iterator auto dcol = cudf::column_device_view::create(col, stream); @@ -54,11 +55,11 @@ std::unique_ptr simple_reduction(column_view const& col, auto it = thrust::make_transform_iterator( dcol->pair_begin(), simple_op.template get_null_replacing_element_transformer()); - result = detail::reduce(it, col.size(), Op{}, mr, stream); + result = detail::reduce(it, col.size(), Op{}, stream, mr); } else { auto it = thrust::make_transform_iterator( dcol->begin(), simple_op.template get_element_transformer()); - result = detail::reduce(it, col.size(), Op{}, mr, stream); + result = detail::reduce(it, col.size(), Op{}, stream, mr); } // set scalar is valid result->set_valid((col.null_count() < col.size()), stream); @@ -91,17 +92,17 @@ struct result_type_dispatcher { template ()>* = nullptr> std::unique_ptr operator()(column_view const& col, data_type const output_dtype, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return simple_reduction(col, output_dtype, mr, stream); + return simple_reduction(col, output_dtype, stream, mr); } template ()>* = nullptr> std::unique_ptr operator()(column_view const& col, data_type const output_dtype, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("input data type is not convertible to output data type"); } @@ -129,18 +130,18 @@ struct element_type_dispatcher { template ()>* = nullptr> std::unique_ptr operator()(column_view const& col, data_type const output_dtype, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return cudf::type_dispatcher( - output_dtype, result_type_dispatcher(), col, output_dtype, mr, stream); + output_dtype, result_type_dispatcher(), col, output_dtype, stream, mr); } template ()>* = nullptr> std::unique_ptr operator()(column_view const& col, data_type const output_dtype, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL( "Reduction operators other than `min` and `max`" diff --git a/cpp/src/reductions/std.cu b/cpp/src/reductions/std.cu index 39ba7e8292c..a3f410f1407 100644 --- a/cpp/src/reductions/std.cu +++ b/cpp/src/reductions/std.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,9 +15,12 @@ */ // The translation unit for reduction `standard deviation` -#include #include "compound.cuh" +#include + +#include + // @param[in] ddof Delta Degrees of Freedom used for `std`, `var`. // The divisor used in calculations is N - ddof, where N // represents the number of elements. 
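// Illustrative arithmetic (not part of this patch): with N = 10 input rows and ddof = 1,
// the divisor is 10 - 1 = 9, so var = sum((x - mean)^2) / 9 and std = sqrt(var);
// ddof = 0 would give the population statistics with divisor N = 10.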
@@ -26,14 +29,14 @@ std::unique_ptr cudf::reduction::standard_deviation( column_view const& col, cudf::data_type const output_dtype, cudf::size_type ddof, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // TODO: add cuda version check when the fix is available #if !defined(__CUDACC_DEBUG__) using reducer = cudf::reduction::compound::element_type_dispatcher; - return cudf::type_dispatcher(col.type(), reducer(), col, output_dtype, ddof, mr, stream); + return cudf::type_dispatcher(col.type(), reducer(), col, output_dtype, ddof, stream, mr); #else // workaround for bug 200529165 which causes compilation error only at device // debug build the bug will be fixed at cuda 10.2 diff --git a/cpp/src/reductions/sum.cu b/cpp/src/reductions/sum.cu index f75002e1eba..d295dfe3706 100644 --- a/cpp/src/reductions/sum.cu +++ b/cpp/src/reductions/sum.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,14 +15,17 @@ */ // The translation unit for reduction `sum` -#include #include "simple.cuh" +#include + +#include + std::unique_ptr cudf::reduction::sum(column_view const& col, cudf::data_type const output_dtype, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using reducer = cudf::reduction::simple::element_type_dispatcher; - return cudf::type_dispatcher(col.type(), reducer(), col, output_dtype, mr, stream); + return cudf::type_dispatcher(col.type(), reducer(), col, output_dtype, stream, mr); } diff --git a/cpp/src/reductions/sum_of_squares.cu b/cpp/src/reductions/sum_of_squares.cu index a989eb7ad48..ca898bf9bce 100644 --- a/cpp/src/reductions/sum_of_squares.cu +++ b/cpp/src/reductions/sum_of_squares.cu @@ -15,15 +15,18 @@ */ // The translation unit for reduction `sum of squares` -#include #include "simple.cuh" +#include + +#include + std::unique_ptr cudf::reduction::sum_of_squares(column_view const& col, cudf::data_type const output_dtype, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using reducer = cudf::reduction::simple::element_type_dispatcher; - return cudf::type_dispatcher(col.type(), reducer(), col, output_dtype, mr, stream); + return cudf::type_dispatcher(col.type(), reducer(), col, output_dtype, stream, mr); } diff --git a/cpp/src/reductions/var.cu b/cpp/src/reductions/var.cu index 4d180c118c3..eab57344cc6 100644 --- a/cpp/src/reductions/var.cu +++ b/cpp/src/reductions/var.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,9 +16,12 @@ // The translation unit for reduction `variance` -#include #include "compound.cuh" +#include + +#include + // @param[in] ddof Delta Degrees of Freedom used for `std`, `var`. // The divisor used in calculations is N - ddof, where N // represents the number of elements. 
@@ -26,13 +29,13 @@ std::unique_ptr cudf::reduction::variance(column_view const& col, cudf::data_type const output_dtype, cudf::size_type ddof, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // TODO: add cuda version check when the fix is available #if !defined(__CUDACC_DEBUG__) using reducer = cudf::reduction::compound::element_type_dispatcher; - return cudf::type_dispatcher(col.type(), reducer(), col, output_dtype, ddof, mr, stream); + return cudf::type_dispatcher(col.type(), reducer(), col, output_dtype, ddof, stream, mr); #else // workaround for bug 200529165 which causes compilation error only at device // debug build the bug will be fixed at cuda 10.2 From 84a200ea44274a1467313dc5c835df9ee19950f2 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 12:51:19 +1100 Subject: [PATCH 29/51] Convert repeat to cuda_stream_view --- cpp/include/cudf/detail/repeat.hpp | 12 +++++----- cpp/src/filling/repeat.cu | 36 ++++++++++++++++-------------- cpp/src/join/cross_join.cu | 2 +- 3 files changed, 27 insertions(+), 23 deletions(-) diff --git a/cpp/include/cudf/detail/repeat.hpp b/cpp/include/cudf/detail/repeat.hpp index afd6c0b5d5a..1c358b3da71 100644 --- a/cpp/include/cudf/detail/repeat.hpp +++ b/cpp/include/cudf/detail/repeat.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,8 @@ #include +#include + #include namespace cudf { @@ -33,8 +35,8 @@ std::unique_ptr
repeat( table_view const& input_table, column_view const& count, bool check_count, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::repeat(table_view const&, size_type, @@ -45,8 +47,8 @@ std::unique_ptr
repeat( std::unique_ptr
repeat( table_view const& input_table, size_type count, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/src/filling/repeat.cu b/cpp/src/filling/repeat.cu index 96e2e15f262..224f6dfe3a0 100644 --- a/cpp/src/filling/repeat.cu +++ b/cpp/src/filling/repeat.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include @@ -29,14 +28,16 @@ #include #include +#include +#include +#include + #include #include #include #include #include -#include - #include #include @@ -73,7 +74,7 @@ struct compute_offsets { template std::enable_if_t::value, rmm::device_vector> operator()( - bool check_count, cudaStream_t stream = 0) + bool check_count, rmm::cuda_stream_view stream) { // static_cast is necessary due to bool if (check_count && static_cast(std::numeric_limits::max()) > @@ -83,14 +84,15 @@ struct compute_offsets { "count should not have values larger than size_type's limit."); } rmm::device_vector offsets(p_column->size()); - thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream.value()), p_column->begin(), p_column->end(), offsets.begin()); if (check_count == true) { - CUDF_EXPECTS(thrust::is_sorted( - rmm::exec_policy(stream)->on(stream), offsets.begin(), offsets.end()) == true, - "count has negative values or the resulting table has more \ + CUDF_EXPECTS( + thrust::is_sorted( + rmm::exec_policy(stream)->on(stream.value()), offsets.begin(), offsets.end()) == true, + "count has negative values or the resulting table has more \ rows than size_type's limit."); } @@ -99,7 +101,7 @@ struct compute_offsets { template std::enable_if_t::value, rmm::device_vector> operator()( - bool check_count, cudaStream_t stream) + bool check_count, rmm::cuda_stream_view stream) { CUDF_FAIL("count value should be a integral type."); } @@ -112,8 +114,8 @@ namespace detail { std::unique_ptr
repeat(table_view const& input_table, column_view const& count, bool check_count, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(input_table.num_rows() == count.size(), "in and count must have equal size"); CUDF_EXPECTS(count.has_nulls() == false, "count cannot contain nulls"); @@ -124,7 +126,7 @@ std::unique_ptr
repeat(table_view const& input_table, size_type output_size{offsets.back()}; rmm::device_vector indices(output_size); - thrust::upper_bound(rmm::exec_policy(stream)->on(stream), + thrust::upper_bound(rmm::exec_policy(stream)->on(stream.value()), offsets.begin(), offsets.end(), thrust::make_counting_iterator(0), @@ -136,8 +138,8 @@ std::unique_ptr
repeat(table_view const& input_table, std::unique_ptr
repeat(table_view const& input_table, size_type count, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(count >= 0, "count value should be non-negative"); CUDF_EXPECTS( @@ -162,7 +164,7 @@ std::unique_ptr
repeat(table_view const& input_table, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::repeat(input_table, count, check_count, mr, 0); + return detail::repeat(input_table, count, check_count, rmm::cuda_stream_default, mr); } std::unique_ptr
repeat(table_view const& input_table, @@ -170,7 +172,7 @@ std::unique_ptr
repeat(table_view const& input_table, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::repeat(input_table, count, mr, 0); + return detail::repeat(input_table, count, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/join/cross_join.cu b/cpp/src/join/cross_join.cu index 27aa7672825..5a2dc32e27a 100644 --- a/cpp/src/join/cross_join.cu +++ b/cpp/src/join/cross_join.cu @@ -54,7 +54,7 @@ std::unique_ptr cross_join( } // Repeat left table - auto left_repeated = detail::repeat(left, right.num_rows(), mr, stream); + auto left_repeated = detail::repeat(left, right.num_rows(), stream, mr); // Tile right table auto right_tiled = detail::tile(right, left.num_rows(), stream, mr); From 876d9efe999da86a7c2be7680a901d42b4c7a494 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 14:20:15 +1100 Subject: [PATCH 30/51] Add quantiles.hpp to meta.yaml --- conda/recipes/libcudf/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 6cb9ce2adff..b017940eee7 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -75,6 +75,7 @@ test: - test -f $PREFIX/include/cudf/detail/replace.hpp - test -f $PREFIX/include/cudf/detail/reshape.hpp - test -f $PREFIX/include/cudf/detail/round.hpp + - test -f $PREFIX/include/cudf/detail/quantiles.hpp - test -f $PREFIX/include/cudf/detail/scatter.hpp - test -f $PREFIX/include/cudf/detail/search.hpp - test -f $PREFIX/include/cudf/detail/sequence.hpp From 6a7d15ca241d766f012c9c5a1e8b1edaacf90d6c Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 14:20:56 +1100 Subject: [PATCH 31/51] Convert replace to cuda_stream_view --- cpp/include/cudf/detail/replace.hpp | 25 +++--- .../cudf/dictionary/detail/replace.hpp | 14 ++-- cpp/include/cudf/strings/detail/replace.hpp | 18 +++-- cpp/src/dictionary/replace.cu | 37 +++++---- cpp/src/io/json/reader_impl.cu | 2 +- cpp/src/replace/nans.cu | 33 ++++---- cpp/src/replace/nulls.cu | 76 ++++++++++--------- cpp/src/replace/replace.cu | 59 +++++++------- cpp/src/strings/replace/replace.cu | 66 ++++++++-------- 9 files changed, 172 insertions(+), 158 deletions(-) diff --git a/cpp/include/cudf/detail/replace.hpp b/cpp/include/cudf/detail/replace.hpp index 989ea3f7e0f..d872f3edbcd 100644 --- a/cpp/include/cudf/detail/replace.hpp +++ b/cpp/include/cudf/detail/replace.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019, NVIDIA CORPORATION. + * Copyright (c) 2018-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,13 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - #pragma once #include -#include -// Forward declaration +#include + +#include namespace cudf { namespace detail { @@ -32,8 +32,8 @@ namespace detail { std::unique_ptr replace_nulls( column_view const& input, cudf::column_view const& replacement, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::replace_nulls(column_view const&, scalar const&, @@ -44,8 +44,8 @@ std::unique_ptr replace_nulls( std::unique_ptr replace_nulls( column_view const& input, scalar const& replacement, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::replace_nans(column_view const&, column_view const&, @@ -56,7 +56,7 @@ std::unique_ptr replace_nulls( std::unique_ptr replace_nans( column_view const& input, column_view const& replacement, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -68,7 +68,7 @@ std::unique_ptr replace_nans( std::unique_ptr replace_nans( column_view const& input, scalar const& replacement, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -80,7 +80,8 @@ std::unique_ptr find_and_replace_all( column_view const& input_col, column_view const& values_to_replace, column_view const& replacement_values, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/dictionary/detail/replace.hpp b/cpp/include/cudf/dictionary/detail/replace.hpp index 040f71a5751..7166633c378 100644 --- a/cpp/include/cudf/dictionary/detail/replace.hpp +++ b/cpp/include/cudf/dictionary/detail/replace.hpp @@ -19,6 +19,8 @@ #include #include +#include + namespace cudf { namespace dictionary { namespace detail { @@ -32,15 +34,15 @@ namespace detail { * * @param input Column with nulls to replace. * @param replacement Column with values to use for replacing. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary column with null rows replaced. */ std::unique_ptr replace_nulls( dictionary_column_view const& input, dictionary_column_view const& replacement, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Create a new dictionary column by replacing nulls with a @@ -50,15 +52,15 @@ std::unique_ptr replace_nulls( * * @param input Column with nulls to replace. * @param replacement Value to use for replacing. 
- * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary column with null rows replaced. */ std::unique_ptr replace_nulls( dictionary_column_view const& input, scalar const& replacement, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace dictionary diff --git a/cpp/include/cudf/strings/detail/replace.hpp b/cpp/include/cudf/strings/detail/replace.hpp index 3a665492102..64e626794e7 100644 --- a/cpp/include/cudf/strings/detail/replace.hpp +++ b/cpp/include/cudf/strings/detail/replace.hpp @@ -19,6 +19,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -34,8 +36,8 @@ std::unique_ptr replace( string_scalar const& target, string_scalar const& repl, int32_t maxrepl = -1, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::strings::replace_slice(strings_column_view const&, string_scalar const&, @@ -48,8 +50,8 @@ std::unique_ptr replace_slice( string_scalar const& repl = string_scalar(""), size_type start = 0, size_type stop = -1, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::strings::replace(strings_column_view const&, strings_column_view const&, @@ -61,8 +63,8 @@ std::unique_ptr replace( strings_column_view const& strings, strings_column_view const& targets, strings_column_view const& repls, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::strings::replace(strings_column_view const&, string_scalar const&, @@ -73,8 +75,8 @@ std::unique_ptr replace( std::unique_ptr replace_nulls( strings_column_view const& strings, string_scalar const& repl = string_scalar(""), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace strings diff --git a/cpp/src/dictionary/replace.cu b/cpp/src/dictionary/replace.cu index 918063ac508..097490c4ff3 100644 --- a/cpp/src/dictionary/replace.cu +++ b/cpp/src/dictionary/replace.cu @@ -23,6 +23,7 @@ #include #include #include +#include "rmm/cuda_stream_view.hpp" #include #include @@ -104,8 +105,8 @@ auto make_scalar_iterator(scalar const& input) template std::unique_ptr replace_indices(column_view const& input, ReplacementIter replacement_iter, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto const input_view = 
column_device_view::create(input, stream); auto const d_input = *input_view; @@ -129,8 +130,8 @@ std::unique_ptr replace_indices(column_view const& input, */ std::unique_ptr replace_nulls(dictionary_column_view const& input, dictionary_column_view const& replacement, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.is_empty()) { return cudf::empty_like(input.parent()); } if (!input.has_nulls()) { return std::make_unique(input.parent()); } @@ -138,7 +139,7 @@ std::unique_ptr replace_nulls(dictionary_column_view const& input, CUDF_EXPECTS(replacement.size() == input.size(), "column sizes must match"); // first combine the keys so both input dictionaries have the same set - auto matched = match_dictionaries({input, replacement}, mr, stream); + auto matched = match_dictionaries({input, replacement}, mr, stream.value()); // now build the new indices by doing replace-null using the updated input indices auto const input_indices = @@ -146,13 +147,15 @@ std::unique_ptr replace_nulls(dictionary_column_view const& input, auto const repl_indices = dictionary_column_view(matched.back()->view()).get_indices_annotated(); auto new_indices = repl_indices.has_nulls() - ? replace_indices(input_indices, make_nullable_index_iterator(repl_indices), mr, stream) + ? replace_indices(input_indices, make_nullable_index_iterator(repl_indices), stream, mr) : replace_indices( - input_indices, make_nullable_index_iterator(repl_indices), mr, stream); + input_indices, make_nullable_index_iterator(repl_indices), stream, mr); // auto keys_column = ; - return make_dictionary_column( - std::move(matched.front()->release().children.back()), std::move(new_indices), mr, stream); + return make_dictionary_column(std::move(matched.front()->release().children.back()), + std::move(new_indices), + mr, + stream.value()); } /** @@ -161,8 +164,8 @@ std::unique_ptr replace_nulls(dictionary_column_view const& input, */ std::unique_ptr replace_nulls(dictionary_column_view const& input, scalar const& replacement, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.is_empty()) { return cudf::empty_like(input.parent()); } if (!input.has_nulls() || !replacement.is_valid()) { @@ -173,18 +176,20 @@ std::unique_ptr replace_nulls(dictionary_column_view const& input, // first add the replacment to the keys so only the indices need to be processed auto const default_mr = rmm::mr::get_current_device_resource(); auto input_matched = dictionary::detail::add_keys( - input, make_column_from_scalar(replacement, 1, stream, default_mr)->view(), mr, stream); + input, make_column_from_scalar(replacement, 1, stream, default_mr)->view(), mr, stream.value()); auto const input_view = dictionary_column_view(input_matched->view()); - auto const scalar_index = get_index(input_view, replacement, default_mr, stream); + auto const scalar_index = get_index(input_view, replacement, default_mr, stream.value()); // now build the new indices by doing replace-null on the updated indices auto const input_indices = input_view.get_indices_annotated(); auto new_indices = - replace_indices(input_indices, make_scalar_iterator(*scalar_index), mr, stream); + replace_indices(input_indices, make_scalar_iterator(*scalar_index), stream, mr); new_indices->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); - return make_dictionary_column( - std::move(input_matched->release().children.back()), 
std::move(new_indices), mr, stream); + return make_dictionary_column(std::move(input_matched->release().children.back()), + std::move(new_indices), + mr, + stream.value()); } } // namespace detail diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index ae0cb40e522..3246f7e9ed0 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -609,7 +609,7 @@ table_with_metadata reader::impl::convert_data_to_table(cudaStream_t stream) if (out_column->type().id() == type_id::STRING) { // Need to remove escape character in case of '\"' and '\\' out_columns.emplace_back(cudf::strings::detail::replace( - out_column->view(), target->view(), repl->view(), mr_, stream)); + out_column->view(), target->view(), repl->view(), stream, mr_)); } else { out_columns.emplace_back(std::move(out_column)); } diff --git a/cpp/src/replace/nans.cu b/cpp/src/replace/nans.cu index 6232da34f06..d26be0ad47f 100644 --- a/cpp/src/replace/nans.cu +++ b/cpp/src/replace/nans.cu @@ -25,6 +25,8 @@ #include #include +#include + #include namespace cudf { @@ -37,8 +39,8 @@ struct replace_nans_functor { column_view const& input, Replacement const& replacement, bool replacement_nullable, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(input.type() == replacement.type(), "Input and replacement must be of the same type"); @@ -106,9 +108,10 @@ struct replace_nans_functor { }; } // namespace + std::unique_ptr replace_nans(column_view const& input, column_view const& replacement, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(input.size() == replacement.size(), @@ -119,17 +122,17 @@ std::unique_ptr replace_nans(column_view const& input, input, *column_device_view::create(replacement), replacement.nullable(), - mr, - stream); + stream, + mr); } std::unique_ptr replace_nans(column_view const& input, scalar const& replacement, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { return type_dispatcher( - input.type(), replace_nans_functor{}, input, replacement, true, mr, stream); + input.type(), replace_nans_functor{}, input, replacement, true, stream, mr); } } // namespace detail @@ -147,7 +150,7 @@ std::unique_ptr replace_nans(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_nans(input, replacement, 0, mr); + return detail::replace_nans(input, replacement, rmm::cuda_stream_default, mr); } } // namespace cudf @@ -175,9 +178,9 @@ struct normalize_nans_and_zeros_kernel_forwarder { template ::value>* = nullptr> void operator()(cudf::column_device_view in, cudf::mutable_column_device_view out, - cudaStream_t stream) + rmm::cuda_stream_view stream) { - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(in.size()), out.head(), @@ -188,7 +191,7 @@ struct normalize_nans_and_zeros_kernel_forwarder { template ::value>* = nullptr> void operator()(cudf::column_device_view in, cudf::mutable_column_device_view out, - cudaStream_t stream) + rmm::cuda_stream_view stream) { CUDF_FAIL("Unexpected non floating-point type."); } @@ -198,7 +201,7 @@ struct normalize_nans_and_zeros_kernel_forwarder { namespace cudf { namespace detail { -void normalize_nans_and_zeros(mutable_column_view in_out, cudaStream_t stream = 0) +void 
normalize_nans_and_zeros(mutable_column_view in_out, rmm::cuda_stream_view stream) { if (in_out.is_empty()) { return; } CUDF_EXPECTS( @@ -240,11 +243,11 @@ std::unique_ptr normalize_nans_and_zeros(column_view const& input, { CUDF_FUNC_RANGE(); // output. copies the input - std::unique_ptr out = std::make_unique(input, (cudaStream_t)0, mr); + std::unique_ptr out = std::make_unique(input, rmm::cuda_stream_default, mr); // from device. unique_ptr which gets automatically cleaned up when we leave. auto out_view = out->mutable_view(); - detail::normalize_nans_and_zeros(out_view, 0); + detail::normalize_nans_and_zeros(out_view, rmm::cuda_stream_default); return out; } @@ -262,7 +265,7 @@ std::unique_ptr normalize_nans_and_zeros(column_view const& input, void normalize_nans_and_zeros(mutable_column_view& in_out) { CUDF_FUNC_RANGE(); - detail::normalize_nans_and_zeros(in_out, 0); + detail::normalize_nans_and_zeros(in_out, rmm::cuda_stream_default); } } // namespace cudf diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index d13d729536b..6f860dfd60d 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #include #include #include @@ -29,11 +30,12 @@ #include #include #include +#include #include #include -#include #include #include +#include "rmm/cuda_stream_view.hpp" #include @@ -148,8 +150,8 @@ struct replace_nulls_column_kernel_forwarder { template ()>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, cudf::column_view const& replacement, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { cudf::size_type nrows = input.size(); cudf::detail::grid_1d grid{nrows, BLOCK_SIZE}; @@ -174,7 +176,7 @@ struct replace_nulls_column_kernel_forwarder { rmm::device_scalar valid_counter(0, stream); cudf::size_type* valid_count = valid_counter.data(); - replace<<>>( + replace<<>>( *device_in, *device_replacement, *device_out, valid_count); if (output_view.nullable()) { @@ -187,8 +189,8 @@ struct replace_nulls_column_kernel_forwarder { template ()>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, cudf::column_view const& replacement, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("No specialization exists for the given type."); } @@ -198,8 +200,8 @@ template <> std::unique_ptr replace_nulls_column_kernel_forwarder::operator()( cudf::column_view const& input, cudf::column_view const& replacement, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { rmm::device_scalar valid_counter(0, stream); cudf::size_type* valid_count = valid_counter.data(); @@ -224,7 +226,7 @@ std::unique_ptr replace_nulls_column_kernel_forwarder::operator()< // Call first pass kernel to get sizes in offsets cudf::detail::grid_1d grid{input.size(), BLOCK_SIZE, 1}; - replace_first<<>>( + replace_first<<>>( *device_in, *device_replacement, reinterpret_cast(valid_bits.data()), @@ -233,21 +235,21 @@ std::unique_ptr replace_nulls_column_kernel_forwarder::operator()< valid_count); std::unique_ptr offsets = cudf::strings::detail::make_offsets_child_column( - sizes_view.begin(), sizes_view.end(), mr, stream); + sizes_view.begin(), sizes_view.end(), mr, stream.value()); auto offsets_view = 
offsets->mutable_view(); int32_t size; CUDA_TRY(cudaMemcpyAsync( - &size, offsets_view.end() - 1, sizeof(int32_t), cudaMemcpyDefault, stream)); + &size, offsets_view.end() - 1, sizeof(int32_t), cudaMemcpyDefault, stream.value())); // Allocate chars array and output null mask - cudf::size_type null_count = input.size() - valid_counter.value(stream); - std::unique_ptr output_chars = - cudf::strings::detail::create_chars_child_column(input.size(), null_count, size, mr, stream); + cudf::size_type null_count = input.size() - valid_counter.value(stream); + std::unique_ptr output_chars = cudf::strings::detail::create_chars_child_column( + input.size(), null_count, size, mr, stream.value()); auto output_chars_view = output_chars->mutable_view(); - replace_second<<>>( + replace_second<<>>( *device_in, *device_replacement, reinterpret_cast(valid_bits.data()), @@ -268,12 +270,12 @@ template <> std::unique_ptr replace_nulls_column_kernel_forwarder::operator()( cudf::column_view const& input, cudf::column_view const& replacement, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { cudf::dictionary_column_view dict_input(input); cudf::dictionary_column_view dict_repl(replacement); - return cudf::dictionary::detail::replace_nulls(dict_input, dict_repl, mr, stream); + return cudf::dictionary::detail::replace_nulls(dict_input, dict_repl, stream, mr); } template @@ -292,8 +294,8 @@ struct replace_nulls_scalar_kernel_forwarder { typename std::enable_if_t()>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, cudf::scalar const& replacement, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(input.type() == replacement.type(), "Data type mismatch"); std::unique_ptr output = @@ -306,7 +308,7 @@ struct replace_nulls_scalar_kernel_forwarder { auto device_in = cudf::column_device_view::create(input); auto func = replace_nulls_functor{s1.data()}; - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), input.data(), input.data() + input.size(), cudf::detail::make_validity_iterator(*device_in), @@ -318,8 +320,8 @@ struct replace_nulls_scalar_kernel_forwarder { template ()>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, cudf::scalar const& replacement, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("No specialization exists for the given type."); } @@ -329,24 +331,24 @@ template <> std::unique_ptr replace_nulls_scalar_kernel_forwarder::operator()( cudf::column_view const& input, cudf::scalar const& replacement, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(input.type() == replacement.type(), "Data type mismatch"); cudf::strings_column_view input_s(input); const cudf::string_scalar& repl = static_cast(replacement); - return cudf::strings::replace_nulls(input_s, repl, mr); + return cudf::strings::detail::replace_nulls(input_s, repl, stream, mr); } template <> std::unique_ptr replace_nulls_scalar_kernel_forwarder::operator()( cudf::column_view const& input, cudf::scalar const& replacement, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { 
cudf::dictionary_column_view dict_input(input); - return cudf::dictionary::detail::replace_nulls(dict_input, replacement, mr, stream); + return cudf::dictionary::detail::replace_nulls(dict_input, replacement, stream, mr); } } // end anonymous namespace @@ -355,8 +357,8 @@ namespace cudf { namespace detail { std::unique_ptr replace_nulls(cudf::column_view const& input, cudf::column_view const& replacement, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(input.type() == replacement.type(), "Data type mismatch"); CUDF_EXPECTS(replacement.size() == input.size(), "Column size mismatch"); @@ -366,13 +368,13 @@ std::unique_ptr replace_nulls(cudf::column_view const& input, if (!input.has_nulls()) { return std::make_unique(input); } return cudf::type_dispatcher( - input.type(), replace_nulls_column_kernel_forwarder{}, input, replacement, mr, stream); + input.type(), replace_nulls_column_kernel_forwarder{}, input, replacement, stream, mr); } std::unique_ptr replace_nulls(cudf::column_view const& input, cudf::scalar const& replacement, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.is_empty()) { return cudf::empty_like(input); } @@ -381,7 +383,7 @@ std::unique_ptr replace_nulls(cudf::column_view const& input, } return cudf::type_dispatcher( - input.type(), replace_nulls_scalar_kernel_forwarder{}, input, replacement, mr, stream); + input.type(), replace_nulls_scalar_kernel_forwarder{}, input, replacement, stream, mr); } } // namespace detail @@ -391,7 +393,7 @@ std::unique_ptr replace_nulls(cudf::column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return cudf::detail::replace_nulls(input, replacement, mr, 0); + return cudf::detail::replace_nulls(input, replacement, rmm::cuda_stream_default, mr); } std::unique_ptr replace_nulls(cudf::column_view const& input, @@ -399,6 +401,6 @@ std::unique_ptr replace_nulls(cudf::column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return cudf::detail::replace_nulls(input, replacement, mr, 0); + return cudf::detail::replace_nulls(input, replacement, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index eef397b6a13..6ca894ac186 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -299,8 +299,8 @@ struct replace_kernel_forwarder { std::unique_ptr operator()(cudf::column_view const& input_col, cudf::column_view const& values_to_replace, cudf::column_view const& replacement_values, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { rmm::device_scalar valid_counter(0, stream); cudf::size_type* valid_count = valid_counter.data(); @@ -330,12 +330,12 @@ struct replace_kernel_forwarder { auto device_values_to_replace = cudf::column_device_view::create(values_to_replace); auto device_replacement_values = cudf::column_device_view::create(replacement_values); - replace<<>>(*device_in, - *device_out, - valid_count, - output_view.size(), - *device_values_to_replace, - *device_replacement_values); + replace<<>>(*device_in, + *device_out, + valid_count, + output_view.size(), + *device_values_to_replace, + *device_replacement_values); if (output_view.nullable()) { output->set_null_count(output->size() - valid_counter.value(stream)); @@ -347,8 
+347,8 @@ struct replace_kernel_forwarder { std::unique_ptr operator()(cudf::column_view const& input_col, cudf::column_view const& values_to_replace, cudf::column_view const& replacement_values, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("No specialization exists for this type"); } @@ -359,8 +359,8 @@ std::unique_ptr replace_kernel_forwarder::operator() valid_counter(0, stream); cudf::size_type* valid_count = valid_counter.data(); @@ -402,7 +402,7 @@ std::unique_ptr replace_kernel_forwarder::operator()>>( + replace_first<<>>( *device_in, *device_values_to_replace, *device_replacement, @@ -412,22 +412,23 @@ std::unique_ptr replace_kernel_forwarder::operator() offsets = cudf::strings::detail::make_offsets_child_column( - sizes_view.begin(), sizes_view.end(), mr, stream); + sizes_view.begin(), sizes_view.end(), mr, stream.value()); auto offsets_view = offsets->mutable_view(); auto device_offsets = cudf::mutable_column_device_view::create(offsets_view); int32_t size; CUDA_TRY(cudaMemcpyAsync( - &size, offsets_view.end() - 1, sizeof(int32_t), cudaMemcpyDefault, stream)); + &size, offsets_view.end() - 1, sizeof(int32_t), cudaMemcpyDefault, stream.value())); + stream.synchronize(); // Allocate chars array and output null mask cudf::size_type null_count = input_col.size() - valid_counter.value(stream); std::unique_ptr output_chars = cudf::strings::detail::create_chars_child_column( - input_col.size(), null_count, size, mr, stream); + input_col.size(), null_count, size, mr, stream.value()); auto output_chars_view = output_chars->mutable_view(); auto device_chars = cudf::mutable_column_device_view::create(output_chars_view); - replace_second<<>>( + replace_second<<>>( *device_in, *device_replacement, *device_offsets, *device_chars, *device_indices); return cudf::make_strings_column(input_col.size(), @@ -444,8 +445,8 @@ std::unique_ptr replace_kernel_forwarder::operator() replace_kernel_forwarder::operator()view(), mr, stream); + return cudf::dictionary::detail::add_keys(input, new_keys->view(), mr, stream.value()); }(); auto matched_view = cudf::dictionary_column_view(matched_input->view()); auto matched_values = cudf::dictionary::detail::set_keys( - values, matched_view.keys(), rmm::mr::get_current_device_resource(), stream); + values, matched_view.keys(), rmm::mr::get_current_device_resource(), stream.value()); auto matched_replacements = cudf::dictionary::detail::set_keys( - replacements, matched_view.keys(), rmm::mr::get_current_device_resource(), stream); + replacements, matched_view.keys(), rmm::mr::get_current_device_resource(), stream.value()); auto indices_type = matched_view.indices().type(); auto new_indices = cudf::type_dispatcher( @@ -469,8 +470,8 @@ std::unique_ptr replace_kernel_forwarder::operator()view()).indices(), cudf::dictionary_column_view(matched_replacements->view()).get_indices_annotated(), - mr, - stream); + stream, + mr); auto null_count = new_indices->null_count(); auto contents = new_indices->release(); auto indices_column = std::make_unique( @@ -489,8 +490,8 @@ namespace detail { std::unique_ptr find_and_replace_all(cudf::column_view const& input_col, cudf::column_view const& values_to_replace, cudf::column_view const& replacement_values, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(values_to_replace.size() == replacement_values.size(), "values_to_replace and 
replacement_values size mismatch."); @@ -509,8 +510,8 @@ std::unique_ptr find_and_replace_all(cudf::column_view const& inpu input_col, values_to_replace, replacement_values, - mr, - stream); + stream, + mr); } } // namespace detail @@ -532,6 +533,6 @@ std::unique_ptr find_and_replace_all(cudf::column_view const& inpu rmm::mr::device_memory_resource* mr) { return cudf::detail::find_and_replace_all( - input_col, values_to_replace, replacement_values, mr, 0); + input_col, values_to_replace, replacement_values, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index f373c97b1ef..a1aca664e25 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -95,11 +95,11 @@ std::unique_ptr replace(strings_column_view const& strings, string_scalar const& target, string_scalar const& repl, int32_t maxrepl, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(mr, stream.value()); CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid."); CUDF_EXPECTS(target.is_valid(), "Parameter target must be valid."); CUDF_EXPECTS(target.size() > 0, "Parameter target must not be empty string."); @@ -111,23 +111,22 @@ std::unique_ptr replace(strings_column_view const& strings, auto d_strings = *strings_column; // copy the null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), replace_fn{d_strings, d_target, d_repl, maxrepl}); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream.value()); auto d_offsets = offsets_column->view().data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = - create_chars_child_column(strings_count, strings.null_count(), bytes, mr, stream); + create_chars_child_column(strings_count, strings.null_count(), bytes, mr, stream.value()); auto d_chars = chars_column->mutable_view().data(); thrust::for_each_n( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, replace_fn{d_strings, d_target, d_repl, maxrepl, d_offsets, d_chars}); @@ -184,11 +183,11 @@ std::unique_ptr replace_slice(strings_column_view const& strings, string_scalar const& repl, size_type start, size_type stop, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(mr, stream.value()); CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid."); if (stop > 0) CUDF_EXPECTS(start <= stop, "Parameter start must be less than or equal to stop."); @@ -205,22 +204,22 @@ std::unique_ptr replace_slice(strings_column_view const& strings, 
thrust::make_counting_iterator(0), replace_slice_fn{d_strings, d_repl, start, stop}); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream.value()); auto offsets_view = offsets_column->view(); auto d_offsets = offsets_view.data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = - create_chars_child_column(strings_count, strings.null_count(), bytes, mr, stream); + create_chars_child_column(strings_count, strings.null_count(), bytes, mr, stream.value()); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.data(); thrust::for_each_n( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, replace_slice_fn{d_strings, d_repl, start, stop, d_offsets, d_chars}); - // + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -288,11 +287,11 @@ struct replace_multi_fn { std::unique_ptr replace(strings_column_view const& strings, strings_column_view const& targets, strings_column_view const& repls, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(mr, stream.value()); CUDF_EXPECTS(((targets.size() > 0) && (targets.null_count() == 0)), "Parameters targets must not be empty and must not have nulls"); CUDF_EXPECTS(((repls.size() > 0) && (repls.null_count() == 0)), @@ -308,23 +307,22 @@ std::unique_ptr replace(strings_column_view const& strings, auto d_repls = *repls_column; // copy the null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), replace_multi_fn{d_strings, d_targets, d_repls}); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream.value()); auto d_offsets = offsets_column->view().data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = - create_chars_child_column(strings_count, strings.null_count(), bytes, mr, stream); + create_chars_child_column(strings_count, strings.null_count(), bytes, mr, stream.value()); auto d_chars = chars_column->mutable_view().data(); thrust::for_each_n( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, replace_multi_fn{d_strings, d_targets, d_repls, d_offsets, d_chars}); @@ -340,11 +338,11 @@ std::unique_ptr replace(strings_column_view const& strings, std::unique_ptr replace_nulls(strings_column_view const& strings, string_scalar const& repl, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); - if (strings_count == 0) return 
make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(mr, stream.value()); CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid."); string_view d_repl(repl.data(), repl.size()); @@ -359,15 +357,15 @@ std::unique_ptr replace_nulls(strings_column_view const& strings, : d_strings.element(idx).size_bytes(); }); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream.value()); auto d_offsets = offsets_column->view().data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = strings::detail::create_chars_child_column( - strings_count, strings.null_count(), bytes, mr, stream); + strings_count, strings.null_count(), bytes, mr, stream.value()); auto d_chars = chars_column->mutable_view().data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [d_strings, d_repl, d_offsets, d_chars] __device__(size_type idx) { @@ -375,7 +373,7 @@ std::unique_ptr replace_nulls(strings_column_view const& strings, if (!d_strings.is_null(idx)) d_str = d_strings.element(idx); memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes()); }); - // + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -396,7 +394,7 @@ std::unique_ptr replace(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace(strings, target, repl, maxrepl, mr); + return detail::replace(strings, target, repl, maxrepl, rmm::cuda_stream_default, mr); } std::unique_ptr replace_slice(strings_column_view const& strings, @@ -406,7 +404,7 @@ std::unique_ptr replace_slice(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_slice(strings, repl, start, stop, mr); + return detail::replace_slice(strings, repl, start, stop, rmm::cuda_stream_default, mr); } std::unique_ptr replace(strings_column_view const& strings, @@ -415,7 +413,7 @@ std::unique_ptr replace(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace(strings, targets, repls, mr); + return detail::replace(strings, targets, repls, rmm::cuda_stream_default, mr); } std::unique_ptr replace_nulls(strings_column_view const& strings, @@ -423,7 +421,7 @@ std::unique_ptr replace_nulls(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_nulls(strings, repl, mr); + return detail::replace_nulls(strings, repl, rmm::cuda_stream_default, mr); } } // namespace strings From 1340241c74a825d9fad939460733f375e7866873 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 14:26:12 +1100 Subject: [PATCH 32/51] Convert reshape/tile to cuda_stream_view --- cpp/include/cudf/detail/reshape.hpp | 4 +++- cpp/src/reshape/tile.cu | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/detail/reshape.hpp b/cpp/include/cudf/detail/reshape.hpp index fa56254b998..fb24b7669d7 100644 --- a/cpp/include/cudf/detail/reshape.hpp +++ b/cpp/include/cudf/detail/reshape.hpp @@ -18,6 +18,8 @@ #include +#include + #include namespace cudf { @@ -30,7 +32,7 @@ namespace detail { std::unique_ptr
tile( table_view const& input, size_type count, - cudaStream_t stream = 0, + rmm::cuda_stream_view = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/src/reshape/tile.cu b/cpp/src/reshape/tile.cu index 2803ee1bab3..c912143f6d7 100644 --- a/cpp/src/reshape/tile.cu +++ b/cpp/src/reshape/tile.cu @@ -26,6 +26,7 @@ #include #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace { @@ -39,7 +40,7 @@ struct tile_functor { namespace detail { std::unique_ptr
tile(const table_view &in, size_type count, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { CUDF_EXPECTS(count >= 0, "Count cannot be negative"); @@ -61,7 +62,7 @@ std::unique_ptr
tile(const table_view &in, rmm::mr::device_memory_resource *mr) { CUDF_FUNC_RANGE(); - return detail::tile(in, count, 0, mr); + return detail::tile(in, count, rmm::cuda_stream_default, mr); } } // namespace cudf From 7a0c0f2834201ef392fe754e4defd55baba225d9 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 14:28:33 +1100 Subject: [PATCH 33/51] Convert round to cuda_stream_view --- cpp/include/cudf/detail/round.hpp | 4 +++- cpp/src/round/round.cu | 16 +++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/cpp/include/cudf/detail/round.hpp b/cpp/include/cudf/detail/round.hpp index 7e9fb03e0b0..c56686fa113 100644 --- a/cpp/include/cudf/detail/round.hpp +++ b/cpp/include/cudf/detail/round.hpp @@ -18,6 +18,8 @@ #include +#include + namespace cudf { //! Inner interfaces and implementations namespace detail { @@ -32,7 +34,7 @@ std::unique_ptr round( column_view const& input, int32_t decimal_places, rounding_method method, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu index dab1dce1a35..701fa35d262 100644 --- a/cpp/src/round/round.cu +++ b/cpp/src/round/round.cu @@ -28,6 +28,8 @@ #include #include +#include + #include namespace cudf { @@ -201,7 +203,7 @@ template ()>* = nullptr> std::unique_ptr round_with(column_view const& input, int32_t decimal_places, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { using Functor = RoundFunctor; @@ -215,7 +217,7 @@ std::unique_ptr round_with(column_view const& input, auto out_view = result->mutable_view(); T const n = std::pow(10, std::abs(decimal_places)); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), input.begin(), input.end(), out_view.begin(), @@ -230,7 +232,7 @@ template ()>* = nullptr> std::unique_ptr round_with(column_view const& input, int32_t decimal_places, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { using namespace numeric; @@ -254,7 +256,7 @@ std::unique_ptr round_with(column_view const& input, auto out_view = result->mutable_view(); Type const n = std::pow(10, std::abs(decimal_places + input.type().scale())); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), input.begin(), input.end(), out_view.begin(), @@ -276,7 +278,7 @@ struct round_type_dispatcher { column_view const& input, int32_t decimal_places, cudf::rounding_method method, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { // clang-format off @@ -302,7 +304,7 @@ struct round_type_dispatcher { std::unique_ptr round(column_view const& input, int32_t decimal_places, cudf::rounding_method method, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(cudf::is_numeric(input.type()) || cudf::is_fixed_point(input.type()), @@ -328,7 +330,7 @@ std::unique_ptr round(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return cudf::detail::round(input, decimal_places, method, 0, mr); + return cudf::detail::round(input, decimal_places, method, rmm::cuda_stream_default, mr); } } // namespace cudf From 61446f81170a2fd23b2913b4edc3bd3b1e3263d1 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 
Nov 2020 14:54:30 +1100 Subject: [PATCH 34/51] Convert scatter to cuda_stream_view --- cpp/include/cudf/detail/scatter.cuh | 48 ++++----- cpp/include/cudf/detail/scatter.hpp | 32 +++--- cpp/include/cudf/strings/detail/scatter.cuh | 18 ++-- cpp/src/copying/scatter.cu | 104 +++++++++++--------- cpp/src/groupby/sort/sort_helper.cu | 4 +- cpp/src/hash/hashing.cu | 9 +- cpp/src/partitioning/partitioning.cu | 4 +- 7 files changed, 113 insertions(+), 106 deletions(-) diff --git a/cpp/include/cudf/detail/scatter.cuh b/cpp/include/cudf/detail/scatter.cuh index 6d93c78fd3e..da5814933fa 100644 --- a/cpp/include/cudf/detail/scatter.cuh +++ b/cpp/include/cudf/detail/scatter.cuh @@ -28,6 +28,8 @@ #include #include +#include + namespace cudf { namespace detail { @@ -54,7 +56,7 @@ template auto scatter_to_gather(MapIterator scatter_map_begin, MapIterator scatter_map_end, size_type gather_rows, - cudaStream_t stream) + rmm::cuda_stream_view stream) { using MapValueType = typename thrust::iterator_traits::value_type; @@ -66,7 +68,7 @@ auto scatter_to_gather(MapIterator scatter_map_begin, // Convert scatter map to a gather map thrust::scatter( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(std::distance(scatter_map_begin, scatter_map_end)), scatter_map_begin, @@ -81,8 +83,8 @@ struct column_scatterer_impl { MapIterator scatter_map_begin, MapIterator scatter_map_end, column_view const& target, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { auto result = std::make_unique(target, stream, mr); auto result_view = result->mutable_view(); @@ -91,7 +93,7 @@ struct column_scatterer_impl { // NOTE use source.begin + scatter rows rather than source.end in case the // scatter map is smaller than the number of source rows - thrust::scatter(rmm::exec_policy(stream)->on(stream), + thrust::scatter(rmm::exec_policy(stream)->on(stream.value()), source.begin(), source.begin() + cudf::distance(scatter_map_begin, scatter_map_end), scatter_map_begin, @@ -107,14 +109,14 @@ struct column_scatterer_impl { MapIterator scatter_map_begin, MapIterator scatter_map_end, column_view const& target, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { using strings::detail::create_string_vector_from_column; - auto const source_vector = create_string_vector_from_column(source, stream); + auto const source_vector = create_string_vector_from_column(source, stream.value()); auto const begin = source_vector.begin(); auto const end = begin + std::distance(scatter_map_begin, scatter_map_end); - return strings::detail::scatter(begin, end, scatter_map_begin, target, mr, stream); + return strings::detail::scatter(begin, end, scatter_map_begin, target, stream, mr); } }; @@ -125,11 +127,11 @@ struct column_scatterer { MapIterator scatter_map_begin, MapIterator scatter_map_end, column_view const& target, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { column_scatterer_impl scatterer{}; return scatterer(source, scatter_map_begin, scatter_map_end, target, stream, mr); } }; @@ -139,8 +141,8 @@ struct column_scatterer_impl { MapIterator scatter_map_begin, MapIterator 
scatter_map_end, column_view const& target_in, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { if (target_in.is_empty()) // empty begets empty return make_empty_column(data_type{type_id::DICTIONARY32}); @@ -154,17 +156,17 @@ struct column_scatterer_impl { "scatter dictionary keys must be the same type"); // first combine keys so both dictionaries have the same set - auto target_matched = dictionary::detail::add_keys(target, source.keys(), mr, stream); + auto target_matched = dictionary::detail::add_keys(target, source.keys(), mr, stream.value()); auto const target_view = dictionary_column_view(target_matched->view()); auto source_matched = dictionary::detail::set_keys( - source, target_view.keys(), rmm::mr::get_current_device_resource(), stream); + source, target_view.keys(), rmm::mr::get_current_device_resource(), stream.value()); auto const source_view = dictionary_column_view(source_matched->view()); // now build the new indices by doing a scatter on just the matched indices auto source_itr = indexalator_factory::make_input_iterator(source_view.indices()); auto new_indices = std::make_unique(target_view.get_indices_annotated(), stream, mr); auto target_itr = indexalator_factory::make_output_iterator(new_indices->mutable_view()); - thrust::scatter(rmm::exec_policy(stream)->on(stream), + thrust::scatter(rmm::exec_policy(stream)->on(stream.value()), source_itr, source_itr + std::distance(scatter_map_begin, scatter_map_end), scatter_map_begin, @@ -221,8 +223,8 @@ struct column_scatterer_impl { * are to be scattered * @param[in] check_bounds Optionally perform bounds checking on the values of * `scatter_map` and throw an error if any of its values are out of bounds. - * @param[in] mr Device memory resource used to allocate the returned table's device memory * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @param[in] mr Device memory resource used to allocate the returned table's device memory * * @return Result of scattering values from source to target **/ @@ -233,8 +235,8 @@ std::unique_ptr
scatter( MapIterator scatter_map_end, table_view const& target, bool check_bounds = false, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_FUNC_RANGE(); @@ -247,7 +249,7 @@ std::unique_ptr
scatter( CUDF_EXPECTS( std::distance(scatter_map_begin, scatter_map_end) == thrust::count_if( - rmm::exec_policy(stream)->on(stream), scatter_map_begin, scatter_map_end, bounds), + rmm::exec_policy(stream)->on(stream.value()), scatter_map_begin, scatter_map_end, bounds), "Scatter map index out of bounds"); } @@ -276,8 +278,8 @@ std::unique_ptr
scatter( updated_scatter_map_begin, updated_scatter_map_end, target_col, - mr, - stream); + stream, + mr); }); auto gather_map = scatter_to_gather( diff --git a/cpp/include/cudf/detail/scatter.hpp b/cpp/include/cudf/detail/scatter.hpp index 6f92ae3b553..a5676c86f49 100644 --- a/cpp/include/cudf/detail/scatter.hpp +++ b/cpp/include/cudf/detail/scatter.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,9 @@ #include #include #include +#include + +#include #include @@ -55,8 +58,8 @@ namespace detail { * are to be scattered * @param check_bounds Optionally perform bounds checking on the values of * `scatter_map` and throw an error if any of its values are out of bounds. - * @param mr Device memory resource used to allocate the returned table's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned table's device memory * @return Result of scattering values from source to target **/ std::unique_ptr
scatter( @@ -64,8 +67,8 @@ std::unique_ptr
scatter( column_view const& scatter_map, table_view const& target, bool check_bounds = false, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Scatters a row of scalar values into a copy of the target table @@ -95,8 +98,8 @@ std::unique_ptr
scatter( * are to be scattered * @param check_bounds Optionally perform bounds checking on the values of * `scatter_map` and throw an error if any of its values are out of bounds. - * @param mr Device memory resource used to allocate the returned table's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned table's device memory * @return Result of scattering values from source to target **/ std::unique_ptr
scatter( @@ -104,8 +107,8 @@ std::unique_ptr
scatter( column_view const& indices, table_view const& target, bool check_bounds = false, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::boolean_mask_scatter( @@ -115,11 +118,12 @@ std::unique_ptr
scatter( * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr
boolean_mask_scatter(table_view const& source, - table_view const& target, - column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0); +std::unique_ptr
boolean_mask_scatter( + table_view const& source, + table_view const& target, + column_view const& boolean_mask, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::boolean_mask_scatter( @@ -134,8 +138,8 @@ std::unique_ptr
boolean_mask_scatter( std::vector> const& source, table_view const& target, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/strings/detail/scatter.cuh b/cpp/include/cudf/strings/detail/scatter.cuh index 4f495afa099..9e0497052a6 100644 --- a/cpp/include/cudf/strings/detail/scatter.cuh +++ b/cpp/include/cudf/strings/detail/scatter.cuh @@ -57,28 +57,28 @@ std::unique_ptr scatter( SourceIterator end, MapIterator scatter_map, strings_column_view const& target, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = target.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(mr, stream.value()); // create null mask -- caller must update this rmm::device_buffer null_mask{0, stream, mr}; - if (target.has_nulls()) - null_mask = cudf::detail::copy_bitmask(target.parent(), rmm::cuda_stream_view{stream}, mr); + if (target.has_nulls()) null_mask = cudf::detail::copy_bitmask(target.parent(), stream, mr); // create string vectors - rmm::device_vector target_vector = create_string_vector_from_column(target, stream); + rmm::device_vector target_vector = + create_string_vector_from_column(target, stream.value()); // do the scatter thrust::scatter( - rmm::exec_policy(stream)->on(stream), begin, end, scatter_map, target_vector.begin()); + rmm::exec_policy(stream)->on(stream.value()), begin, end, scatter_map, target_vector.begin()); // build offsets column - auto offsets_column = child_offsets_from_string_vector(target_vector, mr, stream); + auto offsets_column = child_offsets_from_string_vector(target_vector, mr, stream.value()); // build chars column auto chars_column = child_chars_from_string_vector( - target_vector, offsets_column->view().data(), 0, mr, stream); + target_vector, offsets_column->view().data(), 0, mr, stream.value()); return make_strings_column(strings_count, std::move(offsets_column), diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 373ed224f99..7b50477fc20 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -32,6 +32,8 @@ #include #include +#include + #include #include #include @@ -65,8 +67,8 @@ void scatter_scalar_bitmask(std::vector> co MapIterator scatter_map, size_type num_scatter_rows, std::vector>& target, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { constexpr size_type block_size = 256; size_type const grid_size = grid_1d(num_scatter_rows, block_size).num_blocks; @@ -84,7 +86,7 @@ void scatter_scalar_bitmask(std::vector> co auto bitmask_kernel = source_is_valid ? 
marking_bitmask_kernel : marking_bitmask_kernel; - bitmask_kernel<<>>( + bitmask_kernel<<>>( *target_view, scatter_map, num_scatter_rows); } } @@ -96,8 +98,8 @@ struct column_scalar_scatterer_impl { MapIterator scatter_iter, size_type scatter_rows, column_view const& target, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_EXPECTS(source.get().type() == target.type(), "scalar and column types must match"); @@ -111,7 +113,7 @@ struct column_scalar_scatterer_impl { auto scalar_iter = thrust::make_permutation_iterator(scalar_impl->data(), thrust::make_constant_iterator(0)); - thrust::scatter(rmm::exec_policy(stream)->on(stream), + thrust::scatter(rmm::exec_policy(stream)->on(stream.value()), scalar_iter, scalar_iter + scatter_rows, scatter_iter, @@ -127,8 +129,8 @@ struct column_scalar_scatterer_impl { MapIterator scatter_iter, size_type scatter_rows, column_view const& target, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_EXPECTS(source.get().type() == target.type(), "scalar and column types must match"); @@ -136,7 +138,7 @@ struct column_scalar_scatterer_impl { auto const source_view = string_view(scalar_impl->data(), scalar_impl->size()); auto const begin = thrust::make_constant_iterator(source_view); auto const end = begin + scatter_rows; - return strings::detail::scatter(begin, end, scatter_iter, target, mr, stream); + return strings::detail::scatter(begin, end, scatter_iter, target, stream, mr); } }; @@ -146,8 +148,8 @@ struct column_scalar_scatterer_impl { MapIterator scatter_iter, size_type scatter_rows, column_view const& target, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FAIL("scatter scalar to list_view not implemented"); } @@ -159,8 +161,8 @@ struct column_scalar_scatterer_impl { MapIterator scatter_iter, size_type scatter_rows, column_view const& target, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FAIL("scatter scalar to struct_view not implemented"); } @@ -172,27 +174,29 @@ struct column_scalar_scatterer_impl { MapIterator scatter_iter, size_type scatter_rows, column_view const& target, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { auto dict_target = dictionary::detail::add_keys( dictionary_column_view(target), make_column_from_scalar(source.get(), 1, stream, rmm::mr::get_current_device_resource()) ->view(), mr, - stream); + stream.value()); auto dict_view = dictionary_column_view(dict_target->view()); auto scalar_index = dictionary::detail::get_index( - dict_view, source.get(), rmm::mr::get_current_device_resource(), stream); + dict_view, source.get(), rmm::mr::get_current_device_resource(), stream.value()); auto scalar_iter = thrust::make_permutation_iterator( indexalator_factory::make_input_iterator(*scalar_index), thrust::make_constant_iterator(0)); auto new_indices = std::make_unique(dict_view.get_indices_annotated(), stream, mr); auto target_iter = indexalator_factory::make_output_iterator(new_indices->mutable_view()); - thrust::scatter(rmm::exec_policy(stream)->on(stream), + + thrust::scatter(rmm::exec_policy(stream)->on(stream.value()), scalar_iter, 
scalar_iter + scatter_rows, scatter_iter, target_iter); + // build the dictionary indices column from the result auto const indices_type = new_indices->type(); auto const output_size = new_indices->size(); @@ -220,11 +224,11 @@ struct column_scalar_scatterer { MapIterator scatter_iter, size_type scatter_rows, column_view const& target, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { column_scalar_scatterer_impl scatterer{}; - return scatterer(source, scatter_iter, scatter_rows, target, mr, stream); + return scatterer(source, scatter_iter, scatter_rows, target, stream, mr); } }; @@ -234,8 +238,8 @@ std::unique_ptr
scatter(table_view const& source, column_view const& scatter_map, table_view const& target, bool check_bounds, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(source.num_columns() == target.num_columns(), "Number of columns in source and target not equal"); @@ -255,15 +259,15 @@ std::unique_ptr
scatter(table_view const& source, // create index type normalizing iterator for the scatter_map auto map_begin = indexalator_factory::make_input_iterator(scatter_map); auto map_end = map_begin + scatter_map.size(); - return detail::scatter(source, map_begin, map_end, target, check_bounds, mr, stream); + return detail::scatter(source, map_begin, map_end, target, check_bounds, stream, mr); } std::unique_ptr
scatter(std::vector> const& source, column_view const& indices, table_view const& target, bool check_bounds, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(source.size() == static_cast(target.num_columns()), "Number of columns in source and target not equal"); @@ -279,7 +283,7 @@ std::unique_ptr
scatter(std::vector> auto const n_rows = target.num_rows(); if (check_bounds) { CUDF_EXPECTS( - indices.size() == thrust::count_if(rmm::exec_policy(stream)->on(stream), + indices.size() == thrust::count_if(rmm::exec_policy(stream)->on(stream.value()), map_begin, map_end, [n_rows] __device__(size_type index) { @@ -307,11 +311,11 @@ std::unique_ptr
scatter(std::vector> scatter_iter, scatter_rows, target_col, - mr, - stream); + stream, + mr); }); - scatter_scalar_bitmask(source, scatter_iter, scatter_rows, result, mr, stream); + scatter_scalar_bitmask(source, scatter_iter, scatter_rows, result, stream, mr); return std::make_unique
<table>(std::move(result)); } @@ -319,27 +323,29 @@ std::unique_ptr<column>
scatter(std::vector> std::unique_ptr boolean_mask_scatter(column_view const& input, column_view const& target, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto indices = cudf::make_numeric_column( data_type{type_id::INT32}, target.size(), mask_state::UNALLOCATED, stream); auto mutable_indices = indices->mutable_view(); - thrust::sequence(rmm::exec_policy(stream)->on(stream), + thrust::sequence(rmm::exec_policy(stream)->on(stream.value()), mutable_indices.begin(), mutable_indices.end(), 0); // The scatter map is actually a table with only one column, which is scatter map. - auto scatter_map = detail::apply_boolean_mask( - table_view{{indices->view()}}, boolean_mask, rmm::mr::get_current_device_resource(), stream); + auto scatter_map = detail::apply_boolean_mask(table_view{{indices->view()}}, + boolean_mask, + rmm::mr::get_current_device_resource(), + stream.value()); auto output_table = detail::scatter(table_view{{input}}, scatter_map->get_column(0).view(), table_view{{target}}, false, - mr, - stream); + stream, + mr); // There is only one column in output_table return std::make_unique(std::move(output_table->get_column(0))); @@ -348,8 +354,8 @@ std::unique_ptr boolean_mask_scatter(column_view const& input, std::unique_ptr boolean_mask_scatter(scalar const& input, column_view const& target, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return detail::copy_if_else(input, target, boolean_mask, stream, mr); } @@ -357,8 +363,8 @@ std::unique_ptr boolean_mask_scatter(scalar const& input, std::unique_ptr
boolean_mask_scatter(table_view const& input, table_view const& target, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(input.num_columns() == target.num_columns(), "Mismatch in number of input columns and target columns"); @@ -382,7 +388,7 @@ std::unique_ptr
boolean_mask_scatter(table_view const& input, target.begin(), out_columns.begin(), [&boolean_mask, mr, stream](auto const& input_column, auto const& target_column) { - return boolean_mask_scatter(input_column, target_column, boolean_mask, mr, stream); + return boolean_mask_scatter(input_column, target_column, boolean_mask, stream, mr); }); return std::make_unique
<table>(std::move(out_columns)); @@ -395,8 +401,8 @@ std::unique_ptr<table>
boolean_mask_scatter( std::vector> const& input, table_view const& target, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(static_cast(input.size()) == target.num_columns(), "Mismatch in number of scalars and target columns"); @@ -421,7 +427,7 @@ std::unique_ptr
boolean_mask_scatter( out_columns.begin(), [&boolean_mask, mr, stream](auto const& scalar, auto const& target_column) { return boolean_mask_scatter( - scalar.get(), target_column, boolean_mask, mr, stream); + scalar.get(), target_column, boolean_mask, stream, mr); }); return std::make_unique
<table>(std::move(out_columns)); @@ -439,7 +445,7 @@ std::unique_ptr<table>
scatter(table_view const& source, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::scatter(source, scatter_map, target, check_bounds, mr); + return detail::scatter(source, scatter_map, target, check_bounds, rmm::cuda_stream_default, mr); } std::unique_ptr
scatter(std::vector> const& source, @@ -449,7 +455,7 @@ std::unique_ptr
scatter(std::vector> rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::scatter(source, indices, target, check_bounds, mr); + return detail::scatter(source, indices, target, check_bounds, rmm::cuda_stream_default, mr); } std::unique_ptr
boolean_mask_scatter(table_view const& input, @@ -458,7 +464,7 @@ std::unique_ptr
boolean_mask_scatter(table_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::boolean_mask_scatter(input, target, boolean_mask, mr); + return detail::boolean_mask_scatter(input, target, boolean_mask, rmm::cuda_stream_default, mr); } std::unique_ptr
boolean_mask_scatter( @@ -468,7 +474,7 @@ std::unique_ptr
boolean_mask_scatter( rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::boolean_mask_scatter(input, target, boolean_mask, mr); + return detail::boolean_mask_scatter(input, target, boolean_mask, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index 064c3e97b20..93d785b78f0 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -231,8 +231,8 @@ column_view sort_groupby_helper::unsorted_keys_labels(rmm::cuda_stream_view stre scatter_map, table_view({temp_labels->view()}), false, - rmm::mr::get_current_device_resource(), - stream.value()); + stream, + rmm::mr::get_current_device_resource()); _unsorted_keys_labels = std::move(t_unsorted_keys_labels->release()[0]); diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu index ab703c78261..8e91de9707f 100644 --- a/cpp/src/hash/hashing.cu +++ b/cpp/src/hash/hashing.cu @@ -602,13 +602,8 @@ std::pair, std::vector> hash_partition_table( row_output_locations, num_rows, num_partitions, scanned_block_partition_sizes_ptr); // Use the resulting scatter map to materialize the output - auto output = detail::scatter(input, - row_partition_numbers.begin(), - row_partition_numbers.end(), - input, - false, - mr, - stream.value()); + auto output = detail::scatter( + input, row_partition_numbers.begin(), row_partition_numbers.end(), input, false, stream, mr); return std::make_pair(std::move(output), std::move(partition_offsets)); } diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index c63b7079a07..3d0f35568f4 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -614,7 +614,7 @@ std::pair, std::vector> hash_partition_table( // Use the resulting scatter map to materialize the output auto output = detail::scatter( - input, row_partition_numbers.begin(), row_partition_numbers.end(), input, false, mr, stream); + input, row_partition_numbers.begin(), row_partition_numbers.end(), input, false, stream, mr); return std::make_pair(std::move(output), std::move(partition_offsets)); } @@ -702,7 +702,7 @@ struct dispatch_map_type { // Scatter the rows into their partitions auto scattered = - cudf::detail::scatter(t, scatter_map.begin(), scatter_map.end(), t, false, mr, stream); + cudf::detail::scatter(t, scatter_map.begin(), scatter_map.end(), t, false, stream, mr); return std::make_pair(std::move(scattered), std::move(partition_offsets)); } From 091aa274dbd91e46a81d5894a95a02cc89e95921 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 15:17:11 +1100 Subject: [PATCH 35/51] Convert search to cuda_stream_view --- cpp/include/cudf/detail/search.hpp | 18 ++-- cpp/include/cudf/dictionary/detail/search.hpp | 12 ++- cpp/src/copying/scatter.cu | 2 +- cpp/src/dictionary/add_keys.cu | 4 +- cpp/src/dictionary/remove_keys.cu | 4 +- cpp/src/dictionary/replace.cu | 2 +- cpp/src/dictionary/search.cu | 44 +++++---- cpp/src/dictionary/set_keys.cu | 2 +- cpp/src/filling/fill.cu | 2 +- cpp/src/replace/clamp.cu | 8 +- cpp/src/search/search.cu | 96 ++++++++++--------- cpp/src/transform/encode.cu | 4 +- 12 files changed, 106 insertions(+), 92 deletions(-) diff --git a/cpp/include/cudf/detail/search.hpp b/cpp/include/cudf/detail/search.hpp index 3eca864ab52..c986418c790 100644 --- a/cpp/include/cudf/detail/search.hpp +++ b/cpp/include/cudf/detail/search.hpp @@ -21,6 +21,8 @@ #include #include +#include + #include namespace cudf { @@ -35,8 
+37,8 @@ std::unique_ptr lower_bound( table_view const& values, std::vector const& column_order, std::vector const& null_precedence, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t steam = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::upper_bound @@ -48,8 +50,8 @@ std::unique_ptr upper_bound( table_view const& values, std::vector const& column_order, std::vector const& null_precedence, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::contains(column_view const&, scalar const&, @@ -57,7 +59,9 @@ std::unique_ptr upper_bound( * * @param stream CUDA stream used for device memory operations and kernel launches. */ -bool contains(column_view const& col, scalar const& value, cudaStream_t stream = 0); +bool contains(column_view const& col, + scalar const& value, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @copydoc cudf::contains(column_view const&, column_view const&, @@ -68,8 +72,8 @@ bool contains(column_view const& col, scalar const& value, cudaStream_t stream = std::unique_ptr contains( column_view const& haystack, column_view const& needles, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/dictionary/detail/search.hpp b/cpp/include/cudf/dictionary/detail/search.hpp index 21ab0c92acd..cc0e8d0319b 100644 --- a/cpp/include/cudf/dictionary/detail/search.hpp +++ b/cpp/include/cudf/dictionary/detail/search.hpp @@ -17,6 +17,8 @@ #include #include +#include + namespace cudf { namespace dictionary { namespace detail { @@ -30,8 +32,8 @@ namespace detail { std::unique_ptr get_index( dictionary_column_view const& dictionary, scalar const& key, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Get the index for a key if it were added to the given dictionary. @@ -48,15 +50,15 @@ std::unique_ptr get_index( * * @param dictionary The dictionary to search for the key. * @param key The value to search for in the dictionary keyset. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
* @return Numeric scalar index value of the key within the dictionary */ std::unique_ptr get_insert_index( dictionary_column_view const& dictionary, scalar const& key, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace dictionary diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 7b50477fc20..887972f2dd6 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -185,7 +185,7 @@ struct column_scalar_scatterer_impl { stream.value()); auto dict_view = dictionary_column_view(dict_target->view()); auto scalar_index = dictionary::detail::get_index( - dict_view, source.get(), rmm::mr::get_current_device_resource(), stream.value()); + dict_view, source.get(), stream, rmm::mr::get_current_device_resource()); auto scalar_iter = thrust::make_permutation_iterator( indexalator_factory::make_input_iterator(*scalar_index), thrust::make_constant_iterator(0)); auto new_indices = std::make_unique(dict_view.get_indices_annotated(), stream, mr); diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index c02f38e2a0e..64ce8d1e07e 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -76,8 +76,8 @@ std::unique_ptr add_keys( table_view{{old_keys}}, std::vector{order::ASCENDING}, std::vector{null_order::AFTER}, // should be no nulls here - mr, - stream); + stream, + mr); // now create the indices column -- map old values to the new ones // gather([4,0,3,1,2,2,2,4,0],[0,1,2,3,5]) = [5,0,3,1,2,2,2,5,0] column_view indices_view(dictionary_column.indices().type(), diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu index e04c6257692..f0f86a3dd1a 100644 --- a/cpp/src/dictionary/remove_keys.cu +++ b/cpp/src/dictionary/remove_keys.cu @@ -153,7 +153,7 @@ std::unique_ptr remove_keys( CUDF_EXPECTS(keys_view.type() == keys_to_remove.type(), "keys types must match"); // locate keys to remove by searching the keys column - auto const matches = cudf::detail::contains(keys_view, keys_to_remove, mr, stream); + auto const matches = cudf::detail::contains(keys_view, keys_to_remove, stream, mr); auto d_matches = matches->view().data(); // call common utility method to keep the keys not matched to keys_to_remove auto key_matcher = [d_matches] __device__(size_type idx) { return !d_matches[idx]; }; @@ -177,7 +177,7 @@ std::unique_ptr remove_unused_keys( rmm::exec_policy(stream)->on(stream), keys_positions.begin(), keys_positions.end()); // wrap the indices for comparison in contains() column_view keys_positions_view(data_type{type_id::UINT32}, keys_size, keys_positions.data()); - return cudf::detail::contains(keys_positions_view, indices_view, mr, stream); + return cudf::detail::contains(keys_positions_view, indices_view, stream, mr); }(); auto d_matches = matches->view().data(); diff --git a/cpp/src/dictionary/replace.cu b/cpp/src/dictionary/replace.cu index 097490c4ff3..27a85c03898 100644 --- a/cpp/src/dictionary/replace.cu +++ b/cpp/src/dictionary/replace.cu @@ -178,7 +178,7 @@ std::unique_ptr replace_nulls(dictionary_column_view const& input, auto input_matched = dictionary::detail::add_keys( input, make_column_from_scalar(replacement, 1, stream, default_mr)->view(), mr, stream.value()); auto const input_view = dictionary_column_view(input_matched->view()); - auto const 
scalar_index = get_index(input_view, replacement, default_mr, stream.value()); + auto const scalar_index = get_index(input_view, replacement, stream, default_mr); // now build the new indices by doing replace-null on the updated indices auto const input_indices = input_view.get_indices_annotated(); diff --git a/cpp/src/dictionary/search.cu b/cpp/src/dictionary/search.cu index 942415ffb77..5f82fc8a36d 100644 --- a/cpp/src/dictionary/search.cu +++ b/cpp/src/dictionary/search.cu @@ -21,6 +21,8 @@ #include #include +#include + #include #include @@ -34,7 +36,7 @@ struct dispatch_scalar_index { template ()>* = nullptr> std::unique_ptr operator()(size_type index, bool is_valid, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { return std::make_unique>(index, is_valid, stream, mr); @@ -63,8 +65,8 @@ struct find_index_fn { not std::is_same::value>* = nullptr> std::unique_ptr operator()(dictionary_column_view const& input, scalar const& key, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { if (!key.is_valid()) return type_dispatcher(input.indices().type(), dispatch_scalar_index{}, 0, false, stream, mr); @@ -92,8 +94,8 @@ struct find_index_fn { std::enable_if_t::value>* = nullptr> std::unique_ptr operator()(dictionary_column_view const& input, scalar const& key, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FAIL("dictionary column cannot be the keys column of another dictionary"); } @@ -101,8 +103,8 @@ struct find_index_fn { template ::value>* = nullptr> std::unique_ptr operator()(dictionary_column_view const& input, scalar const& key, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FAIL("list_view column cannot be the keys column of a dictionary"); } @@ -111,8 +113,8 @@ struct find_index_fn { std::enable_if_t::value>* = nullptr> std::unique_ptr operator()(dictionary_column_view const& input, scalar const& key, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FAIL("struct_view column cannot be the keys column of a dictionary"); } @@ -125,8 +127,8 @@ struct find_insert_index_fn { not std::is_same::value>* = nullptr> std::unique_ptr operator()(dictionary_column_view const& input, scalar const& key, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { if (!key.is_valid()) return type_dispatcher(input.indices().type(), dispatch_scalar_index{}, 0, false, stream, mr); @@ -137,7 +139,7 @@ struct find_insert_index_fn { using ScalarType = cudf::scalar_type_t; auto find_key = static_cast(key).value(stream); auto keys_view = column_device_view::create(input.keys(), stream); - auto iter = thrust::lower_bound(rmm::exec_policy(stream)->on(stream), + auto iter = thrust::lower_bound(rmm::exec_policy(stream)->on(stream.value()), keys_view->begin(), keys_view->end(), find_key); @@ -155,8 +157,8 @@ struct find_insert_index_fn { std::is_same::value>* = nullptr> std::unique_ptr operator()(dictionary_column_view const& input, scalar const& key, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) 
const { CUDF_FAIL("column cannot be the keys for dictionary"); } @@ -166,23 +168,23 @@ struct find_insert_index_fn { std::unique_ptr get_index(dictionary_column_view const& dictionary, scalar const& key, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (dictionary.is_empty()) return std::make_unique>(0, false, stream, mr); - return type_dispatcher(dictionary.keys().type(), find_index_fn(), dictionary, key, mr, stream); + return type_dispatcher(dictionary.keys().type(), find_index_fn(), dictionary, key, stream, mr); } std::unique_ptr get_insert_index(dictionary_column_view const& dictionary, scalar const& key, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (dictionary.is_empty()) return std::make_unique>(0, false, stream, mr); return type_dispatcher( - dictionary.keys().type(), find_insert_index_fn(), dictionary, key, mr, stream); + dictionary.keys().type(), find_insert_index_fn(), dictionary, key, stream, mr); } } // namespace detail @@ -194,7 +196,7 @@ std::unique_ptr get_index(dictionary_column_view const& dictionary, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::get_index(dictionary, key, mr); + return detail::get_index(dictionary, key, rmm::cuda_stream_default, mr); } } // namespace dictionary diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index d95fdefe153..ae4a817f182 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -119,7 +119,7 @@ std::unique_ptr set_keys( std::unique_ptr keys_column(std::move(table_keys.front())); // compute the new nulls - auto matches = cudf::detail::contains(keys, keys_column->view(), mr, stream); + auto matches = cudf::detail::contains(keys, keys_column->view(), stream, mr); auto d_matches = matches->view().data(); auto indices_itr = cudf::detail::indexalator_factory::make_input_iterator(dictionary_column.indices()); diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu index 6fba9bc01a5..d4fd526ff4b 100644 --- a/cpp/src/filling/fill.cu +++ b/cpp/src/filling/fill.cu @@ -171,7 +171,7 @@ std::unique_ptr out_of_place_fill_range_dispatch::operator()view(), value, rmm::mr::get_current_device_resource(), stream.value()); + target_matched->view(), value, stream, rmm::mr::get_current_device_resource()); // now call fill using just the indices column and the new index auto new_indices = cudf::type_dispatcher(target_indices.type(), diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu index a2fd8c91bc7..cdd8d78fdef 100644 --- a/cpp/src/replace/clamp.cu +++ b/cpp/src/replace/clamp.cu @@ -338,15 +338,15 @@ std::unique_ptr dispatch_clamp::operator()( // get the indexes for lo_replace and for hi_replace auto lo_replace_index = dictionary::detail::get_index( - matched_view, lo_replace, rmm::mr::get_current_device_resource(), stream); + matched_view, lo_replace, stream, rmm::mr::get_current_device_resource()); auto hi_replace_index = dictionary::detail::get_index( - matched_view, hi_replace, rmm::mr::get_current_device_resource(), stream); + matched_view, hi_replace, stream, rmm::mr::get_current_device_resource()); // get the closest indexes for lo and for hi auto lo_index = dictionary::detail::get_insert_index( - matched_view, lo, rmm::mr::get_current_device_resource(), stream); + matched_view, lo, stream, rmm::mr::get_current_device_resource()); auto hi_index = 
dictionary::detail::get_insert_index( - matched_view, hi, rmm::mr::get_current_device_resource(), stream); + matched_view, hi, stream, rmm::mr::get_current_device_resource()); // call clamp with the scalar indexes and the matched indices auto matched_indices = matched_view.get_indices_annotated(); diff --git a/cpp/src/search/search.cu b/cpp/src/search/search.cu index 92b7ea49fd3..e8d776d0d2a 100644 --- a/cpp/src/search/search.cu +++ b/cpp/src/search/search.cu @@ -26,9 +26,12 @@ #include #include -#include #include +#include + +#include + namespace cudf { namespace { template on(stream), + thrust::lower_bound(rmm::exec_policy(stream)->on(stream.value()), it_data, it_data + data_size, it_vals, @@ -53,7 +56,7 @@ void launch_search(DataIterator it_data, it_output, comp); } else { - thrust::upper_bound(rmm::exec_policy(stream)->on(stream), + thrust::upper_bound(rmm::exec_policy(stream)->on(stream.value()), it_data, it_data + data_size, it_vals, @@ -68,8 +71,8 @@ std::unique_ptr search_ordered(table_view const& t, bool find_first, std::vector const& column_order, std::vector const& null_precedence, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Allocate result column std::unique_ptr result = make_numeric_column( @@ -79,7 +82,8 @@ std::unique_ptr search_ordered(table_view const& t, // Handle empty inputs if (t.num_rows() == 0) { - CUDA_TRY(cudaMemset(result_view.data(), 0, values.num_rows() * sizeof(size_type))); + CUDA_TRY(cudaMemsetAsync( + result_view.data(), 0, values.num_rows() * sizeof(size_type), stream.value())); return result; } @@ -96,7 +100,7 @@ std::unique_ptr search_ordered(table_view const& t, // This utility will ensure all corresponding dictionary columns have matching keys. // It will return any new dictionary columns created as well as updated table_views. 
auto matched = dictionary::detail::match_dictionaries( - {t, values}, rmm::mr::get_current_device_resource(), stream); + {t, values}, rmm::mr::get_current_device_resource(), stream.value()); auto d_t = table_device_view::create(matched.second.front(), stream); auto d_values = table_device_view::create(matched.second.back(), stream); auto count_it = thrust::make_counting_iterator(0); @@ -143,7 +147,7 @@ std::unique_ptr search_ordered(table_view const& t, struct contains_scalar_dispatch { template - bool operator()(column_view const& col, scalar const& value, cudaStream_t stream) + bool operator()(column_view const& col, scalar const& value, rmm::cuda_stream_view stream) { CUDF_EXPECTS(col.type() == value.type(), "scalar and column types must match"); @@ -153,14 +157,14 @@ struct contains_scalar_dispatch { auto s = static_cast(&value); if (col.has_nulls()) { - auto found_iter = thrust::find(rmm::exec_policy(stream)->on(stream), + auto found_iter = thrust::find(rmm::exec_policy(stream)->on(stream.value()), d_col->pair_begin(), d_col->pair_end(), thrust::make_pair(s->value(), true)); return found_iter != d_col->pair_end(); } else { - auto found_iter = thrust::find(rmm::exec_policy(stream)->on(stream), // + auto found_iter = thrust::find(rmm::exec_policy(stream)->on(stream.value()), // d_col->begin(), d_col->end(), s->value()); @@ -173,7 +177,7 @@ struct contains_scalar_dispatch { template <> bool contains_scalar_dispatch::operator()(column_view const& col, scalar const& value, - cudaStream_t stream) + rmm::cuda_stream_view stream) { CUDF_FAIL("list_view type not supported yet"); } @@ -181,7 +185,7 @@ bool contains_scalar_dispatch::operator()(column_view const& co template <> bool contains_scalar_dispatch::operator()(column_view const& col, scalar const& value, - cudaStream_t stream) + rmm::cuda_stream_view stream) { CUDF_FAIL("struct_view type not supported yet"); } @@ -189,12 +193,12 @@ bool contains_scalar_dispatch::operator()(column_view const& template <> bool contains_scalar_dispatch::operator()(column_view const& col, scalar const& value, - cudaStream_t stream) + rmm::cuda_stream_view stream) { auto dict_col = cudf::dictionary_column_view(col); // first, find the value in the dictionary's key set auto index = cudf::dictionary::detail::get_index( - dict_col, value, rmm::mr::get_current_device_resource(), stream); + dict_col, value, stream, rmm::mr::get_current_device_resource()); // if found, check the index is actually in the indices column return index->is_valid() ? 
cudf::type_dispatcher(dict_col.indices().type(), contains_scalar_dispatch{}, @@ -207,7 +211,7 @@ bool contains_scalar_dispatch::operator()(column_view const& } // namespace namespace detail { -bool contains(column_view const& col, scalar const& value, cudaStream_t stream) +bool contains(column_view const& col, scalar const& value, rmm::cuda_stream_view stream) { if (col.is_empty()) { return false; } @@ -220,8 +224,8 @@ struct multi_contains_dispatch { template std::unique_ptr operator()(column_view const& haystack, column_view const& needles, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { std::unique_ptr result = make_numeric_column(data_type{type_to_id()}, haystack.size(), @@ -235,21 +239,21 @@ struct multi_contains_dispatch { mutable_column_view result_view = result.get()->mutable_view(); if (needles.is_empty()) { - thrust::fill(rmm::exec_policy(stream)->on(stream), + thrust::fill(rmm::exec_policy(stream)->on(stream.value()), result_view.begin(), result_view.end(), false); return result; } - auto hash_set = cudf::detail::unordered_multiset::create(needles, stream); + auto hash_set = cudf::detail::unordered_multiset::create(needles, stream.value()); auto device_hash_set = hash_set.to_device(); auto d_haystack_ptr = column_device_view::create(haystack, stream); auto d_haystack = *d_haystack_ptr; if (haystack.has_nulls()) { - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(haystack.size()), result_view.begin(), @@ -258,7 +262,7 @@ struct multi_contains_dispatch { device_hash_set.contains(d_haystack.element(index)); }); } else { - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(haystack.size()), result_view.begin(), @@ -275,8 +279,8 @@ template <> std::unique_ptr multi_contains_dispatch::operator()( column_view const& haystack, column_view const& needles, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("list_view type not supported"); } @@ -285,8 +289,8 @@ template <> std::unique_ptr multi_contains_dispatch::operator()( column_view const& haystack, column_view const& needles, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("struct_view type not supported"); } @@ -295,17 +299,17 @@ template <> std::unique_ptr multi_contains_dispatch::operator()( column_view const& haystack_in, column_view const& needles_in, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { dictionary_column_view const haystack(haystack_in); dictionary_column_view const needles(needles_in); // first combine keys so both dictionaries have the same set auto haystack_matched = dictionary::detail::add_keys( - haystack, needles.keys(), rmm::mr::get_current_device_resource(), stream); + haystack, needles.keys(), rmm::mr::get_current_device_resource(), stream.value()); auto const haystack_view = dictionary_column_view(haystack_matched->view()); auto needles_matched = dictionary::detail::set_keys( - needles, haystack_view.keys(), rmm::mr::get_current_device_resource(), stream); + needles, 
haystack_view.keys(), rmm::mr::get_current_device_resource(), stream.value()); auto const needles_view = dictionary_column_view(needles_matched->view()); // now just use the indices for the contains @@ -315,39 +319,39 @@ std::unique_ptr multi_contains_dispatch::operator()( multi_contains_dispatch{}, haystack_indices, needles_indices, - mr, - stream); + stream, + mr); } std::unique_ptr contains(column_view const& haystack, column_view const& needles, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(haystack.type() == needles.type(), "DTYPE mismatch"); return cudf::type_dispatcher( - haystack.type(), multi_contains_dispatch{}, haystack, needles, mr, stream); + haystack.type(), multi_contains_dispatch{}, haystack, needles, stream, mr); } std::unique_ptr lower_bound(table_view const& t, table_view const& values, std::vector const& column_order, std::vector const& null_precedence, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return search_ordered(t, values, true, column_order, null_precedence, mr, stream); + return search_ordered(t, values, true, column_order, null_precedence, stream, mr); } std::unique_ptr upper_bound(table_view const& t, table_view const& values, std::vector const& column_order, std::vector const& null_precedence, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return search_ordered(t, values, false, column_order, null_precedence, mr, stream); + return search_ordered(t, values, false, column_order, null_precedence, stream, mr); } } // namespace detail @@ -361,7 +365,8 @@ std::unique_ptr lower_bound(table_view const& t, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::lower_bound(t, values, column_order, null_precedence, mr); + return detail::lower_bound( + t, values, column_order, null_precedence, rmm::cuda_stream_default, mr); } std::unique_ptr upper_bound(table_view const& t, @@ -371,13 +376,14 @@ std::unique_ptr upper_bound(table_view const& t, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::upper_bound(t, values, column_order, null_precedence, mr); + return detail::upper_bound( + t, values, column_order, null_precedence, rmm::cuda_stream_default, mr); } bool contains(column_view const& col, scalar const& value) { CUDF_FUNC_RANGE(); - return detail::contains(col, value); + return detail::contains(col, value, rmm::cuda_stream_default); } std::unique_ptr contains(column_view const& haystack, @@ -385,7 +391,7 @@ std::unique_ptr contains(column_view const& haystack, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(haystack, needles, mr); + return detail::contains(haystack, needles, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu index 57475e0f59e..a9bb84bd1c3 100644 --- a/cpp/src/transform/encode.cu +++ b/cpp/src/transform/encode.cu @@ -87,8 +87,8 @@ std::pair, std::unique_ptr> encode( input_table, std::vector(input_table.num_columns(), order::ASCENDING), std::vector(input_table.num_columns(), null_order::AFTER), - mr, - stream); + stream, + mr); return std::make_pair(std::move(keys_table), std::move(indices_column)); } From 9087cf537f43f3dc714a4679d53858385250f6f3 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 15:21:14 
+1100 Subject: [PATCH 36/51] Convert sequence to cuda_stream_view --- cpp/include/cudf/detail/sequence.hpp | 10 ++++--- cpp/src/filling/sequence.cu | 43 ++++++++++++++-------------- 2 files changed, 28 insertions(+), 25 deletions(-) diff --git a/cpp/include/cudf/detail/sequence.hpp b/cpp/include/cudf/detail/sequence.hpp index c71e97fe79b..c3bbb734476 100644 --- a/cpp/include/cudf/detail/sequence.hpp +++ b/cpp/include/cudf/detail/sequence.hpp @@ -20,6 +20,8 @@ #include #include +#include + namespace cudf { namespace detail { /** @@ -33,8 +35,8 @@ std::unique_ptr sequence( size_type size, scalar const& init, scalar const& step, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::sequence(size_type size, scalar const& init, @@ -46,8 +48,8 @@ std::unique_ptr sequence( std::unique_ptr sequence( size_type size, scalar const& init, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/src/filling/sequence.cu b/cpp/src/filling/sequence.cu index 50b00c74882..c09eebd8f5a 100644 --- a/cpp/src/filling/sequence.cu +++ b/cpp/src/filling/sequence.cu @@ -14,15 +14,16 @@ * limitations under the License. */ +#include +#include +#include #include #include #include #include #include -#include -#include -#include +#include namespace cudf { namespace detail { @@ -59,8 +60,8 @@ struct sequence_functor { std::unique_ptr operator()(size_type size, scalar const& init, scalar const& step, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto result = make_fixed_width_column(init.type(), size, mask_state::UNALLOCATED, stream, mr); auto result_device_view = mutable_column_device_view::create(*result, stream); @@ -73,7 +74,7 @@ struct sequence_functor { // not using thrust::sequence because it requires init and step to be passed as // constants, not iterators. to do that we would have to retrieve the scalar values off the gpu, // which is undesirable from a performance perspective. 
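The comment above is the key design point: init and step are read through scalar device views inside the functor, so the host never has to copy the scalar values back before launching the fill. A rough sketch of that pattern (helper names are assumed, this is not the exact functor used in this file):

#include <cudf/scalar/scalar_device_view.cuh>
#include <cudf/types.hpp>

template <typename T>
struct tabulator_sketch {
  cudf::numeric_scalar_device_view<T> init;
  cudf::numeric_scalar_device_view<T> step;
  // Invoked once per output row on the device; no host-side scalar reads are needed.
  __device__ T operator()(cudf::size_type idx) const
  {
    return init.value() + static_cast<T>(idx) * step.value();
  }
};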
- thrust::tabulate(rmm::exec_policy(stream)->on(stream), + thrust::tabulate(rmm::exec_policy(stream)->on(stream.value()), result_device_view->begin(), result_device_view->end(), tabulator{n_init, n_step}); @@ -87,8 +88,8 @@ struct sequence_functor { std::unique_ptr operator()(size_type size, scalar const& init, scalar const& step, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("Unsupported sequence scalar type"); } @@ -98,8 +99,8 @@ struct sequence_functor { typename std::enable_if_t() and not cudf::is_boolean()>* = nullptr> std::unique_ptr operator()(size_type size, scalar const& init, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto result = make_fixed_width_column(init.type(), size, mask_state::UNALLOCATED, stream, mr); auto result_device_view = mutable_column_device_view::create(*result, stream); @@ -110,7 +111,7 @@ struct sequence_functor { // not using thrust::sequence because it requires init and step to be passed as // constants, not iterators. to do that we would have to retrieve the scalar values off the gpu, // which is undesirable from a performance perspective. - thrust::tabulate(rmm::exec_policy(stream)->on(stream), + thrust::tabulate(rmm::exec_policy(stream)->on(stream.value()), result_device_view->begin(), result_device_view->end(), const_tabulator{n_init}); @@ -123,8 +124,8 @@ struct sequence_functor { typename std::enable_if_t() or cudf::is_boolean()>* = nullptr> std::unique_ptr operator()(size_type size, scalar const& init, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("Unsupported sequence scalar type"); } @@ -135,26 +136,26 @@ struct sequence_functor { std::unique_ptr sequence(size_type size, scalar const& init, scalar const& step, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(init.type() == step.type(), "init and step must be of the same type."); CUDF_EXPECTS(size >= 0, "size must be >= 0"); CUDF_EXPECTS(is_numeric(init.type()), "Input scalar types must be numeric"); - return type_dispatcher(init.type(), sequence_functor{}, size, init, step, mr, stream); + return type_dispatcher(init.type(), sequence_functor{}, size, init, step, stream, mr); } std::unique_ptr sequence( size_type size, scalar const& init, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(size >= 0, "size must be >= 0"); CUDF_EXPECTS(is_numeric(init.type()), "init scalar type must be numeric"); - return type_dispatcher(init.type(), sequence_functor{}, size, init, mr, stream); + return type_dispatcher(init.type(), sequence_functor{}, size, init, stream, mr); } } // namespace detail @@ -164,14 +165,14 @@ std::unique_ptr sequence(size_type size, scalar const& step, rmm::mr::device_memory_resource* mr) { - return detail::sequence(size, init, step, mr, 0); + return detail::sequence(size, init, step, rmm::cuda_stream_default, mr); } std::unique_ptr sequence(size_type size, scalar const& init, rmm::mr::device_memory_resource* mr) { - return detail::sequence(size, init, mr, 0); + return detail::sequence(size, init, rmm::cuda_stream_default, mr); } 
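The public cudf::sequence overloads above now always forward rmm::cuda_stream_default, while internal callers may pass any rmm::cuda_stream_view, for example one wrapping an existing raw stream. A hedged sketch of such a detail-layer call (function and variable names are illustrative, not taken from the patch):

#include <cudf/detail/sequence.hpp>
#include <cudf/scalar/scalar.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <cstdint>
#include <memory>

std::unique_ptr<cudf::column> make_even_sequence(cudf::size_type n, cudaStream_t raw_stream)
{
  rmm::cuda_stream_view stream{raw_stream};  // non-owning view over the caller's stream
  cudf::numeric_scalar<int32_t> init(0);
  cudf::numeric_scalar<int32_t> step(2);
  return cudf::detail::sequence(n, init, step, stream, rmm::mr::get_current_device_resource());
}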
} // namespace cudf From 576dae1afcf1cb747c1fe8031fd66e794e875688 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 15:51:38 +1100 Subject: [PATCH 37/51] Convert sorting and stream compaction to cuda_stream_view --- cpp/include/cudf/detail/sorting.hpp | 14 ++++--- cpp/include/cudf/detail/stream_compaction.hpp | 26 ++++++------ cpp/include/cudf/strings/sorting.hpp | 6 ++- cpp/src/copying/scatter.cu | 6 +-- cpp/src/dictionary/add_keys.cu | 4 +- cpp/src/dictionary/detail/concatenate.cu | 4 +- cpp/src/dictionary/set_keys.cu | 4 +- cpp/src/groupby/sort/sort_helper.cu | 12 +++--- cpp/src/reductions/reductions.cpp | 4 +- cpp/src/sort/rank.cu | 4 +- cpp/src/sort/sort.cu | 21 ++++++---- cpp/src/sort/sort_impl.cuh | 16 ++++---- cpp/src/sort/stable_sort.cu | 13 +++--- .../stream_compaction/apply_boolean_mask.cu | 8 ++-- cpp/src/stream_compaction/drop_duplicates.cu | 41 ++++++++++--------- cpp/src/stream_compaction/drop_nans.cu | 10 +++-- cpp/src/stream_compaction/drop_nulls.cu | 9 ++-- cpp/src/strings/sorting/sorting.cu | 8 ++-- cpp/src/transform/encode.cu | 2 +- 19 files changed, 117 insertions(+), 95 deletions(-) diff --git a/cpp/include/cudf/detail/sorting.hpp b/cpp/include/cudf/detail/sorting.hpp index 635678fa813..0ac20ed3c94 100644 --- a/cpp/include/cudf/detail/sorting.hpp +++ b/cpp/include/cudf/detail/sorting.hpp @@ -18,6 +18,8 @@ #include +#include + #include #include @@ -33,8 +35,8 @@ std::unique_ptr sorted_order( table_view input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::stable_sorted_order @@ -45,8 +47,8 @@ std::unique_ptr stable_sorted_order( table_view input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::sort_by_key @@ -58,8 +60,8 @@ std::unique_ptr
sort_by_key( table_view const& keys, std::vector const& column_order = {}, std::vector const& null_precedence = {}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index 46068b64a93..5bc12fb0713 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,8 @@ #include #include +#include + namespace cudf { namespace detail { /** @@ -32,8 +34,8 @@ std::unique_ptr
drop_nulls( table_view const& input, std::vector const& keys, cudf::size_type keep_threshold, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::drop_nans(table_view const&, std::vector const&, @@ -45,8 +47,8 @@ std::unique_ptr
drop_nans( table_view const& input, std::vector const& keys, cudf::size_type keep_threshold, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::apply_boolean_mask @@ -56,8 +58,8 @@ std::unique_ptr
drop_nans( std::unique_ptr
apply_boolean_mask( table_view const& input, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::drop_duplicates @@ -69,8 +71,8 @@ std::unique_ptr
drop_duplicates( std::vector const& keys, duplicate_keep_option keep, null_equality nulls_equal = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::distinct_count(column_view const&, null_policy, nan_policy) @@ -80,7 +82,7 @@ std::unique_ptr
drop_duplicates( cudf::size_type distinct_count(column_view const& input, null_policy null_handling, nan_policy nan_handling, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @copydoc cudf::distinct_count(table_view const&, null_equality) @@ -88,8 +90,8 @@ cudf::size_type distinct_count(column_view const& input, * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ cudf::size_type distinct_count(table_view const& input, - null_equality nulls_equal = null_equality::EQUAL, - cudaStream_t stream = 0); + null_equality nulls_equal = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/strings/sorting.hpp b/cpp/include/cudf/strings/sorting.hpp index 8ce5d43ca12..84ce2e4ec2b 100644 --- a/cpp/include/cudf/strings/sorting.hpp +++ b/cpp/include/cudf/strings/sorting.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -48,7 +50,7 @@ std::unique_ptr sort( sort_type stype, cudf::order order = cudf::order::ASCENDING, cudf::null_order null_order = cudf::null_order::BEFORE, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 887972f2dd6..036962ab744 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -336,10 +336,8 @@ std::unique_ptr boolean_mask_scatter(column_view const& input, 0); // The scatter map is actually a table with only one column, which is scatter map. 
- auto scatter_map = detail::apply_boolean_mask(table_view{{indices->view()}}, - boolean_mask, - rmm::mr::get_current_device_resource(), - stream.value()); + auto scatter_map = detail::apply_boolean_mask( + table_view{{indices->view()}}, boolean_mask, stream, rmm::mr::get_current_device_resource()); auto output_table = detail::scatter(table_view{{input}}, scatter_map->get_column(0).view(), table_view{{target}}, diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index 64ce8d1e07e..b72b4f38a56 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -65,8 +65,8 @@ std::unique_ptr add_keys( std::vector{0}, // only one key column duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL, - mr, - stream) + stream, + mr) ->release(); std::unique_ptr keys_column(std::move(table_keys.front())); // create a map for the indices diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index b83de6575e8..223e2d7c331 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -213,8 +213,8 @@ std::unique_ptr concatenate(std::vector const& columns, std::vector{0}, duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL, - mr, - stream.value()) + stream, + mr) ->release(); std::unique_ptr keys_column(std::move(table_keys.front())); diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index ae4a817f182..c934e495de3 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -113,8 +113,8 @@ std::unique_ptr set_keys( std::vector{0}, duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL, - mr, - stream) + stream, + mr) ->release(); std::unique_ptr keys_column(std::move(table_keys.front())); diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index 93d785b78f0..4b4c6a96688 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -132,8 +132,8 @@ column_view sort_groupby_helper::key_sort_order(rmm::cuda_stream_view stream) _keys, {}, std::vector(_keys.num_columns(), null_order::AFTER), - rmm::mr::get_current_device_resource(), - stream.value()); + stream, + rmm::mr::get_current_device_resource()); } else { // Pandas style // Temporarily prepend the keys table with a column that indicates the // presence of a null value within a row. This allows moving all rows that @@ -145,8 +145,8 @@ column_view sort_groupby_helper::key_sort_order(rmm::cuda_stream_view stream) augmented_keys, {}, std::vector(_keys.num_columns() + 1, null_order::AFTER), - rmm::mr::get_current_device_resource(), - stream.value()); + stream, + rmm::mr::get_current_device_resource()); // All rows with one or more null values are at the end of the resulting sorted order. 
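Call sites like the groupby helper above now hand their stream straight to detail::sorted_order and detail::stable_sorted_order. A small sketch of that call shape (illustrative only; the keys table is a placeholder and the vectors assume a single key column):

#include <cudf/detail/sorting.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <memory>

std::unique_ptr<cudf::column> ascending_permutation(cudf::table_view const& keys,
                                                    rmm::cuda_stream_view stream)
{
  // Stream first, memory resource last, matching the converted detail signature.
  return cudf::detail::sorted_order(keys,
                                    {cudf::order::ASCENDING},
                                    {cudf::null_order::AFTER},
                                    stream,
                                    rmm::mr::get_current_device_resource());
}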
} @@ -269,8 +269,8 @@ sort_groupby_helper::column_ptr sort_groupby_helper::sorted_values( cudf::detail::stable_sorted_order(table_view({unsorted_keys_labels(), values}), {}, std::vector(2, null_order::AFTER), - mr, - stream.value()); + stream, + mr); // Zero-copy slice this sort order so that its new size is num_keys() column_view gather_map = cudf::detail::slice(values_sort_order->view(), 0, num_keys(stream)); diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index 7afebaab154..8677065a74a 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -67,7 +67,7 @@ struct reduce_dispatch_functor { } break; case aggregation::MEDIAN: { auto sorted_indices = - detail::sorted_order(table_view{{col}}, {}, {null_order::AFTER}, mr, stream.value()); + detail::sorted_order(table_view{{col}}, {}, {null_order::AFTER}, stream, mr); auto valid_sorted_indices = split(*sorted_indices, {col.size() - col.null_count()})[0]; auto col_ptr = detail::quantile( col, {0.5}, interpolation::LINEAR, valid_sorted_indices, true, stream, mr); @@ -78,7 +78,7 @@ struct reduce_dispatch_functor { CUDF_EXPECTS(quantile_agg->_quantiles.size() == 1, "Reduction quantile accepts only one quantile value"); auto sorted_indices = - detail::sorted_order(table_view{{col}}, {}, {null_order::AFTER}, mr, stream.value()); + detail::sorted_order(table_view{{col}}, {}, {null_order::AFTER}, stream, mr); auto valid_sorted_indices = split(*sorted_indices, {col.size() - col.null_count()})[0]; auto col_ptr = detail::quantile(col, quantile_agg->_quantiles, diff --git a/cpp/src/sort/rank.cu b/cpp/src/sort/rank.cu index a3a16130dfb..50f8155313f 100644 --- a/cpp/src/sort/rank.cu +++ b/cpp/src/sort/rank.cu @@ -259,8 +259,8 @@ std::unique_ptr rank(column_view const &input, std::unique_ptr sorted_order = (method == rank_method::FIRST) ? detail::stable_sorted_order( - table_view{{input}}, {column_order}, {null_precedence}, mr, stream) - : detail::sorted_order(table_view{{input}}, {column_order}, {null_precedence}, mr, stream); + table_view{{input}}, {column_order}, {null_precedence}, stream, mr) + : detail::sorted_order(table_view{{input}}, {column_order}, {null_precedence}, stream, mr); column_view sorted_order_view = sorted_order->view(); // dense: All equal values have same rank and rank always increases by 1 between groups diff --git a/cpp/src/sort/sort.cu b/cpp/src/sort/sort.cu index d163c4e5be8..18d6839e2a2 100644 --- a/cpp/src/sort/sort.cu +++ b/cpp/src/sort/sort.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "rmm/cuda_stream_view.hpp" #include "sort_impl.cuh" #include @@ -27,23 +28,23 @@ namespace detail { std::unique_ptr sorted_order(table_view input, std::vector const& column_order, std::vector const& null_precedence, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return sorted_order(input, column_order, null_precedence, mr, stream); + return sorted_order(input, column_order, null_precedence, stream, mr); } std::unique_ptr
sort_by_key(table_view const& values, table_view const& keys, std::vector const& column_order, std::vector const& null_precedence, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(values.num_rows() == keys.num_rows(), "Mismatch in number of rows for values and keys"); - auto sorted_order = detail::sorted_order(keys, column_order, null_precedence, mr, stream); + auto sorted_order = detail::sorted_order(keys, column_order, null_precedence, stream, mr); return detail::gather(values, sorted_order->view(), @@ -61,7 +62,7 @@ std::unique_ptr sorted_order(table_view input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sorted_order(input, column_order, null_precedence, mr); + return detail::sorted_order(input, column_order, null_precedence, rmm::cuda_stream_default, mr); } std::unique_ptr
sort(table_view input, @@ -70,7 +71,8 @@ std::unique_ptr
sort(table_view input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sort_by_key(input, input, column_order, null_precedence, mr); + return detail::sort_by_key( + input, input, column_order, null_precedence, rmm::cuda_stream_default, mr); } std::unique_ptr
sort_by_key(table_view const& values, @@ -80,7 +82,8 @@ std::unique_ptr
sort_by_key(table_view const& values, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sort_by_key(values, keys, column_order, null_precedence, mr); + return detail::sort_by_key( + values, keys, column_order, null_precedence, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/sort/sort_impl.cuh b/cpp/src/sort/sort_impl.cuh index d043f2e8947..97de42d805d 100644 --- a/cpp/src/sort/sort_impl.cuh +++ b/cpp/src/sort/sort_impl.cuh @@ -23,6 +23,8 @@ #include #include +#include + #include namespace cudf { @@ -32,8 +34,8 @@ template std::unique_ptr sorted_order(table_view input, std::vector const& column_order, std::vector const& null_precedence, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.num_rows() == 0 or input.num_columns() == 0) { return cudf::make_numeric_column(data_type(type_to_id()), 0); @@ -56,7 +58,7 @@ std::unique_ptr sorted_order(table_view input, auto device_table = table_device_view::create(input, stream); - thrust::sequence(rmm::exec_policy(stream)->on(stream), + thrust::sequence(rmm::exec_policy(stream)->on(stream.value()), mutable_indices_view.begin(), mutable_indices_view.end(), 0); @@ -68,12 +70,12 @@ std::unique_ptr sorted_order(table_view input, auto comparator = row_lexicographic_comparator( *device_table, *device_table, d_column_order.data().get(), d_null_precedence.data().get()); if (stable) { - thrust::stable_sort(rmm::exec_policy(stream)->on(stream), + thrust::stable_sort(rmm::exec_policy(stream)->on(stream.value()), mutable_indices_view.begin(), mutable_indices_view.end(), comparator); } else { - thrust::sort(rmm::exec_policy(stream)->on(stream), + thrust::sort(rmm::exec_policy(stream)->on(stream.value()), mutable_indices_view.begin(), mutable_indices_view.end(), comparator); @@ -82,12 +84,12 @@ std::unique_ptr sorted_order(table_view input, auto comparator = row_lexicographic_comparator( *device_table, *device_table, d_column_order.data().get()); if (stable) { - thrust::stable_sort(rmm::exec_policy(stream)->on(stream), + thrust::stable_sort(rmm::exec_policy(stream)->on(stream.value()), mutable_indices_view.begin(), mutable_indices_view.end(), comparator); } else { - thrust::sort(rmm::exec_policy(stream)->on(stream), + thrust::sort(rmm::exec_policy(stream)->on(stream.value()), mutable_indices_view.begin(), mutable_indices_view.end(), comparator); diff --git a/cpp/src/sort/stable_sort.cu b/cpp/src/sort/stable_sort.cu index 982d5df6a9a..860e88ae76e 100644 --- a/cpp/src/sort/stable_sort.cu +++ b/cpp/src/sort/stable_sort.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-20, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,15 +21,17 @@ #include #include +#include + namespace cudf { namespace detail { std::unique_ptr stable_sorted_order(table_view input, std::vector const& column_order, std::vector const& null_precedence, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return sorted_order(input, column_order, null_precedence, mr, stream); + return sorted_order(input, column_order, null_precedence, stream, mr); } } // namespace detail @@ -39,7 +41,8 @@ std::unique_ptr stable_sorted_order(table_view input, std::vector const& null_precedence, rmm::mr::device_memory_resource* mr) { - return detail::stable_sorted_order(input, column_order, null_precedence, mr); + return detail::stable_sorted_order( + input, column_order, null_precedence, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/stream_compaction/apply_boolean_mask.cu b/cpp/src/stream_compaction/apply_boolean_mask.cu index ccb31898e95..3eb10f9f717 100644 --- a/cpp/src/stream_compaction/apply_boolean_mask.cu +++ b/cpp/src/stream_compaction/apply_boolean_mask.cu @@ -25,6 +25,8 @@ #include #include +#include + #include namespace { @@ -61,8 +63,8 @@ namespace detail { */ std::unique_ptr
apply_boolean_mask(table_view const& input, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (boolean_mask.is_empty()) { return empty_like(input); } @@ -90,6 +92,6 @@ std::unique_ptr
apply_boolean_mask(table_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::apply_boolean_mask(input, boolean_mask, mr); + return detail::apply_boolean_mask(input, boolean_mask, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/drop_duplicates.cu index 970ce7eb198..0208272a1d4 100644 --- a/cpp/src/stream_compaction/drop_duplicates.cu +++ b/cpp/src/stream_compaction/drop_duplicates.cu @@ -29,9 +29,12 @@ #include #include +#include + #include #include #include + #include #include @@ -105,14 +108,14 @@ column_view get_unique_ordered_indices(cudf::table_view const& keys, cudf::mutable_column_view& unique_indices, duplicate_keep_option keep, null_equality nulls_equal, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream) { // sort only indices auto sorted_indices = sorted_order(keys, std::vector{}, std::vector{}, - rmm::mr::get_current_device_resource(), - stream); + stream, + rmm::mr::get_current_device_resource()); // extract unique indices auto device_input_table = cudf::table_device_view::create(keys, stream); @@ -120,7 +123,7 @@ column_view get_unique_ordered_indices(cudf::table_view const& keys, if (cudf::has_nulls(keys)) { auto comp = row_equality_comparator( *device_input_table, *device_input_table, nulls_equal == null_equality::EQUAL); - auto result_end = unique_copy(rmm::exec_policy(stream)->on(stream), + auto result_end = unique_copy(rmm::exec_policy(stream)->on(stream.value()), sorted_indices->view().begin(), sorted_indices->view().end(), unique_indices.begin(), @@ -134,7 +137,7 @@ column_view get_unique_ordered_indices(cudf::table_view const& keys, } else { auto comp = row_equality_comparator( *device_input_table, *device_input_table, nulls_equal == null_equality::EQUAL); - auto result_end = unique_copy(rmm::exec_policy(stream)->on(stream), + auto result_end = unique_copy(rmm::exec_policy(stream)->on(stream.value()), sorted_indices->view().begin(), sorted_indices->view().end(), unique_indices.begin(), @@ -150,14 +153,14 @@ column_view get_unique_ordered_indices(cudf::table_view const& keys, cudf::size_type distinct_count(table_view const& keys, null_equality nulls_equal, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // sort only indices auto sorted_indices = sorted_order(keys, std::vector{}, std::vector{}, - rmm::mr::get_current_device_resource(), - stream); + stream, + rmm::mr::get_current_device_resource()); // count unique elements auto sorted_row_index = sorted_indices->view().data(); @@ -167,7 +170,7 @@ cudf::size_type distinct_count(table_view const& keys, row_equality_comparator comp( *device_input_table, *device_input_table, nulls_equal == null_equality::EQUAL); return thrust::count_if( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::counting_iterator(0), thrust::counting_iterator(keys.num_rows()), [sorted_row_index, comp] __device__(cudf::size_type i) { @@ -177,7 +180,7 @@ cudf::size_type distinct_count(table_view const& keys, row_equality_comparator comp( *device_input_table, *device_input_table, nulls_equal == null_equality::EQUAL); return thrust::count_if( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::counting_iterator(0), thrust::counting_iterator(keys.num_rows()), [sorted_row_index, comp] __device__(cudf::size_type i) { @@ -190,8 +193,8 @@ std::unique_ptr
drop_duplicates(table_view const& input, std::vector const& keys, duplicate_keep_option keep, null_equality nulls_equal, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (0 == input.num_rows() || 0 == input.num_columns() || 0 == keys.size()) { return empty_like(input); @@ -263,11 +266,11 @@ struct has_nans { * @returns bool true if `input` has `NAN` else false */ template ::value>* = nullptr> - bool operator()(column_view const& input, cudaStream_t stream) + bool operator()(column_view const& input, rmm::cuda_stream_view stream) { auto input_device_view = cudf::column_device_view::create(input, stream); auto device_view = *input_device_view; - auto count = thrust::count_if(rmm::exec_policy(stream)->on(stream), + auto count = thrust::count_if(rmm::exec_policy(stream)->on(stream.value()), thrust::counting_iterator(0), thrust::counting_iterator(input.size()), check_for_nan(device_view)); @@ -287,7 +290,7 @@ struct has_nans { * @returns bool Always false as non-floating point columns can't have `NAN` */ template ::value>* = nullptr> - bool operator()(column_view const& input, cudaStream_t stream) + bool operator()(column_view const& input, rmm::cuda_stream_view stream) { return false; } @@ -296,7 +299,7 @@ struct has_nans { cudf::size_type distinct_count(column_view const& input, null_policy null_handling, nan_policy nan_handling, - cudaStream_t stream) + rmm::cuda_stream_view stream) { if (0 == input.size() || input.null_count() == input.size()) { return 0; } @@ -332,7 +335,7 @@ std::unique_ptr
drop_duplicates(table_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::drop_duplicates(input, keys, keep, nulls_equal, mr); + return detail::drop_duplicates(input, keys, keep, nulls_equal, rmm::cuda_stream_default, mr); } cudf::size_type distinct_count(column_view const& input, @@ -340,13 +343,13 @@ cudf::size_type distinct_count(column_view const& input, nan_policy nan_handling) { CUDF_FUNC_RANGE(); - return detail::distinct_count(input, null_handling, nan_handling); + return detail::distinct_count(input, null_handling, nan_handling, rmm::cuda_stream_default); } cudf::size_type distinct_count(table_view const& input, null_equality nulls_equal) { CUDF_FUNC_RANGE(); - return detail::distinct_count(input, nulls_equal); + return detail::distinct_count(input, nulls_equal, rmm::cuda_stream_default); } } // namespace cudf diff --git a/cpp/src/stream_compaction/drop_nans.cu b/cpp/src/stream_compaction/drop_nans.cu index ddd5d0c9934..80d92669344 100644 --- a/cpp/src/stream_compaction/drop_nans.cu +++ b/cpp/src/stream_compaction/drop_nans.cu @@ -23,6 +23,8 @@ #include #include +#include + namespace { struct dispatch_is_not_nan { @@ -82,8 +84,8 @@ namespace detail { std::unique_ptr
drop_nans(table_view const& input, std::vector const& keys, cudf::size_type keep_threshold, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto keys_view = input.select(keys); if (keys_view.num_columns() == 0 || keys_view.num_rows() == 0) { @@ -113,7 +115,7 @@ std::unique_ptr
<table> drop_nans(table_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return cudf::detail::drop_nans(input, keys, keep_threshold, mr); + return cudf::detail::drop_nans(input, keys, keep_threshold, rmm::cuda_stream_default, mr); } /* * Filters a table to remove nan null elements. @@ -123,7 +125,7 @@ std::unique_ptr<table>
drop_nans(table_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return cudf::detail::drop_nans(input, keys, keys.size(), mr); + return cudf::detail::drop_nans(input, keys, keys.size(), rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/stream_compaction/drop_nulls.cu b/cpp/src/stream_compaction/drop_nulls.cu index 49708b635d8..71aa8f6c63c 100644 --- a/cpp/src/stream_compaction/drop_nulls.cu +++ b/cpp/src/stream_compaction/drop_nulls.cu @@ -21,6 +21,7 @@ #include #include #include +#include "rmm/cuda_stream_view.hpp" namespace { // Returns true if the mask is true for index i in at least keep_threshold @@ -61,8 +62,8 @@ namespace detail { std::unique_ptr
drop_nulls(table_view const& input, std::vector const& keys, cudf::size_type keep_threshold, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto keys_view = input.select(keys); if (keys_view.num_columns() == 0 || keys_view.num_rows() == 0 || not cudf::has_nulls(keys_view)) { @@ -86,7 +87,7 @@ std::unique_ptr
<table> drop_nulls(table_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return cudf::detail::drop_nulls(input, keys, keep_threshold, mr); + return cudf::detail::drop_nulls(input, keys, keep_threshold, rmm::cuda_stream_default, mr); } /* * Filters a table to remove null elements. @@ -96,7 +97,7 @@ std::unique_ptr<table>
drop_nulls(table_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return cudf::detail::drop_nulls(input, keys, keys.size(), mr); + return cudf::detail::drop_nulls(input, keys, keys.size(), rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/strings/sorting/sorting.cu b/cpp/src/strings/sorting/sorting.cu index 0a5a2238d9b..3d78024064e 100644 --- a/cpp/src/strings/sorting/sorting.cu +++ b/cpp/src/strings/sorting/sorting.cu @@ -21,6 +21,8 @@ #include #include +#include + #include #include @@ -32,7 +34,7 @@ std::unique_ptr sort(strings_column_view strings, sort_type stype, cudf::order order, cudf::null_order null_order, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto execpol = rmm::exec_policy(stream); @@ -42,8 +44,8 @@ std::unique_ptr sort(strings_column_view strings, // sort the indices of the strings size_type num_strings = strings.size(); rmm::device_vector indices(num_strings); - thrust::sequence(execpol->on(stream), indices.begin(), indices.end()); - thrust::sort(execpol->on(stream), + thrust::sequence(execpol->on(stream.value()), indices.begin(), indices.end()); + thrust::sort(execpol->on(stream.value()), indices.begin(), indices.end(), [d_column, stype, order, null_order] __device__(size_type lhs, size_type rhs) { diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu index a9bb84bd1c3..895f6309886 100644 --- a/cpp/src/transform/encode.cu +++ b/cpp/src/transform/encode.cu @@ -43,7 +43,7 @@ std::pair, std::unique_ptr> encode( // - resulting column elements are sorted ascending // - nulls are sorted to the beginning auto keys_table = cudf::detail::drop_duplicates( - input_table, drop_keys, duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL, mr, stream); + input_table, drop_keys, duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL, stream, mr); if (cudf::has_nulls(keys_table->view())) { // Rows with nulls appear at the top of `keys_table`, but we want them to appear at From 6ab4384fd7e12d755a085df66ddbcea0001fb5fd Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 17:53:00 +1100 Subject: [PATCH 38/51] convert transform to cuda_stream_view --- cpp/include/cudf/detail/transform.hpp | 22 ++++++++++++---------- cpp/src/dictionary/encode.cu | 2 +- cpp/src/interop/to_arrow.cpp | 2 +- cpp/src/jit/launcher.cpp | 7 +++++-- cpp/src/jit/launcher.h | 14 +++++++++----- cpp/src/transform/bools_to_mask.cu | 8 +++++--- cpp/src/transform/encode.cu | 7 ++++--- cpp/src/transform/mask_to_bools.cu | 8 +++++--- cpp/src/transform/nans_to_nulls.cu | 18 ++++++++++++------ cpp/src/transform/transform.cpp | 9 +++++---- 10 files changed, 59 insertions(+), 38 deletions(-) diff --git a/cpp/include/cudf/detail/transform.hpp b/cpp/include/cudf/detail/transform.hpp index 9cffbd3be70..0309542d01f 100644 --- a/cpp/include/cudf/detail/transform.hpp +++ b/cpp/include/cudf/detail/transform.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
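The patches in this series all apply the same conversion pattern: a detail-namespace function takes an rmm::cuda_stream_view placed ahead of the rmm::mr::device_memory_resource* and defaulted to rmm::cuda_stream_default, calls that reach raw CUDA or Thrust entry points pass stream.value(), and the public (non-detail) API keeps its existing signature and forwards the default stream to the detail overload. The sketch below illustrates that pattern in isolation; make_zeroed_buffer is a hypothetical helper invented for illustration, not code from these patches.

// Illustrative sketch of the cudaStream_t -> rmm::cuda_stream_view pattern.
// make_zeroed_buffer is hypothetical; only the parameter conventions mirror the patches.
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <cuda_runtime.h>

#include <cstddef>

namespace detail {
// Stream first (strongly typed, non-owning), memory resource last; both keep their defaults.
rmm::device_buffer make_zeroed_buffer(
  std::size_t size,
  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
  rmm::device_buffer out(size, stream, mr);              // RMM APIs accept the view directly
  cudaMemsetAsync(out.data(), 0, size, stream.value());  // raw CUDA APIs take the wrapped cudaStream_t
  return out;
}
}  // namespace detail

// The public API keeps its signature and forwards the default stream to the detail overload.
rmm::device_buffer make_zeroed_buffer(std::size_t size, rmm::mr::device_memory_resource* mr)
{
  return detail::make_zeroed_buffer(size, rmm::cuda_stream_default, mr);
}

Placing the stream parameter ahead of the memory resource keeps both defaults usable at call sites, and cuda_stream_view is a cheap, non-owning value type, so it is passed by value rather than by reference.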
@@ -18,6 +18,8 @@ #include +#include + namespace cudf { namespace detail { /** @@ -30,8 +32,8 @@ std::unique_ptr transform( std::string const& unary_udf, data_type output_type, bool is_ptx, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::nans_to_nulls @@ -40,8 +42,8 @@ std::unique_ptr transform( **/ std::pair, size_type> nans_to_nulls( column_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::bools_to_mask @@ -50,8 +52,8 @@ std::pair, size_type> nans_to_nulls( **/ std::pair, cudf::size_type> bools_to_mask( column_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::encode @@ -60,8 +62,8 @@ std::pair, cudf::size_type> bools_to_mask( **/ std::pair, std::unique_ptr> encode( cudf::table_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::mask_to_bools @@ -72,7 +74,7 @@ std::unique_ptr mask_to_bools( bitmask_type const* null_mask, size_type begin_bit, size_type end_bit, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/src/dictionary/encode.cu b/cpp/src/dictionary/encode.cu index 613974efde7..9c20d5006bf 100644 --- a/cpp/src/dictionary/encode.cu +++ b/cpp/src/dictionary/encode.cu @@ -45,7 +45,7 @@ std::unique_ptr encode(column_view const& input_column, CUDF_EXPECTS(input_column.type().id() != type_id::DICTIONARY32, "cannot encode a dictionary from a dictionary"); - auto codified = cudf::detail::encode(cudf::table_view({input_column}), mr, stream); + auto codified = cudf::detail::encode(cudf::table_view({input_column}), stream, mr); auto keys_table = std::move(codified.first); auto indices_column = std::move(codified.second); auto keys_column = std::move(keys_table->release().front()); diff --git a/cpp/src/interop/to_arrow.cpp b/cpp/src/interop/to_arrow.cpp index 5f270597403..ec58da6bf0b 100644 --- a/cpp/src/interop/to_arrow.cpp +++ b/cpp/src/interop/to_arrow.cpp @@ -137,7 +137,7 @@ std::shared_ptr dispatch_to_arrow::operator()(column_view in arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - auto bitmask = bools_to_mask(input, rmm::mr::get_current_device_resource(), stream.value()); + auto bitmask = bools_to_mask(input, stream, rmm::mr::get_current_device_resource()); auto result = arrow::AllocateBuffer(static_cast(bitmask.first->size()), ar_mr); CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer for data"); diff --git a/cpp/src/jit/launcher.cpp b/cpp/src/jit/launcher.cpp index 704379ee82e..65bb55f9102 100644 --- a/cpp/src/jit/launcher.cpp +++ b/cpp/src/jit/launcher.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, 
NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Copyright 2018-2019 BlazingDB, Inc. * Copyright 2018 Christian Noboa Mardini @@ -22,14 +22,17 @@ #include #include +#include + namespace cudf { namespace jit { + launcher::launcher(const std::string& hash, const std::string& cuda_source, const std::vector& header_names, const std::vector& compiler_flags, jitify::experimental::file_callback_type file_callback, - cudaStream_t stream) + rmm::cuda_stream_view stream) : cache_instance{cudf::jit::cudfJitCache::Instance()}, stream(stream) { program = cache_instance.getProgram( diff --git a/cpp/src/jit/launcher.h b/cpp/src/jit/launcher.h index 3745854e277..60720816bc1 100644 --- a/cpp/src/jit/launcher.h +++ b/cpp/src/jit/launcher.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Copyright 2018-2019 BlazingDB, Inc. * Copyright 2018 Christian Noboa Mardini @@ -20,9 +20,13 @@ #pragma once #include + +#include + +#include + #include #include -#include #include #include #include @@ -58,7 +62,7 @@ class launcher { const std::vector& header_names, const std::vector& compiler_flags, jitify::experimental::file_callback_type file_callback, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); launcher(launcher&&); launcher(const launcher&) = delete; launcher& operator=(launcher&&) = delete; @@ -91,14 +95,14 @@ class launcher { template void launch(Args... args) { - get_kernel().configure_1d_max_occupancy(0, 0, 0, stream).launch(args...); + get_kernel().configure_1d_max_occupancy(0, 0, 0, stream.value()).launch(args...); } private: cudf::jit::cudfJitCache& cache_instance; cudf::jit::named_prog program; cudf::jit::named_prog kernel_inst; - cudaStream_t stream; + rmm::cuda_stream_view stream; jitify::experimental::KernelInstantiation& get_kernel() { return *std::get<1>(kernel_inst); } }; diff --git a/cpp/src/transform/bools_to_mask.cu b/cpp/src/transform/bools_to_mask.cu index f7e029f5ed7..2cf4771890b 100644 --- a/cpp/src/transform/bools_to_mask.cu +++ b/cpp/src/transform/bools_to_mask.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -25,10 +25,12 @@ #include #include +#include + namespace cudf { namespace detail { std::pair, cudf::size_type> bools_to_mask( - column_view const& input, rmm::mr::device_memory_resource* mr, cudaStream_t stream) + column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(input.type().id() == type_id::BOOL8, "Input is not of type bool"); @@ -58,7 +60,7 @@ std::pair, cudf::size_type> bools_to_mask( column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::bools_to_mask(input, mr); + return detail::bools_to_mask(input, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu index 895f6309886..1ecf8a7814a 100644 --- a/cpp/src/transform/encode.cu +++ b/cpp/src/transform/encode.cu @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -34,7 +35,7 @@ namespace cudf { namespace detail { std::pair, std::unique_ptr> encode( - table_view const& input_table, rmm::mr::device_memory_resource* mr, cudaStream_t stream) + table_view const& input_table, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { std::vector drop_keys(input_table.num_columns()); std::iota(drop_keys.begin(), drop_keys.end(), 0); @@ -59,7 +60,7 @@ std::pair, std::unique_ptr> encode( rmm::device_vector gather_map(num_rows); auto execpol = rmm::exec_policy(stream); - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_rows), gather_map.begin(), @@ -98,7 +99,7 @@ std::pair, std::unique_ptr> encode( std::pair, std::unique_ptr> encode( cudf::table_view const& input, rmm::mr::device_memory_resource* mr) { - return detail::encode(input, mr, 0); + return detail::encode(input, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/transform/mask_to_bools.cu b/cpp/src/transform/mask_to_bools.cu index fb6e0a7148c..1202c754287 100644 --- a/cpp/src/transform/mask_to_bools.cu +++ b/cpp/src/transform/mask_to_bools.cu @@ -23,12 +23,14 @@ #include #include +#include + namespace cudf { namespace detail { std::unique_ptr mask_to_bools(bitmask_type const* bitmask, size_type begin_bit, size_type end_bit, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto const length = end_bit - begin_bit; @@ -41,7 +43,7 @@ std::unique_ptr mask_to_bools(bitmask_type const* bitmask, if (length > 0) { auto mutable_view = out_col->mutable_view(); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(begin_bit), thrust::make_counting_iterator(end_bit), mutable_view.begin(), @@ -57,6 +59,6 @@ std::unique_ptr mask_to_bools(bitmask_type const* bitmask, size_type end_bit, rmm::mr::device_memory_resource* mr) { - return detail::mask_to_bools(bitmask, begin_bit, end_bit, 0, mr); + return detail::mask_to_bools(bitmask, begin_bit, end_bit, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/transform/nans_to_nulls.cu b/cpp/src/transform/nans_to_nulls.cu index 977073ce48f..93a7521546d 100644 --- a/cpp/src/transform/nans_to_nulls.cu +++ b/cpp/src/transform/nans_to_nulls.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,13 +24,17 @@ #include #include +#include + namespace cudf { namespace detail { struct dispatch_nan_to_null { template std::enable_if_t::value, std::pair, cudf::size_type>> - operator()(column_view const& input, rmm::mr::device_memory_resource* mr, cudaStream_t stream) + operator()(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto input_device_view_ptr = column_device_view::create(input, stream); auto input_device_view = *input_device_view_ptr; @@ -68,18 +72,20 @@ struct dispatch_nan_to_null { template std::enable_if_t::value, std::pair, cudf::size_type>> - operator()(column_view const& input, rmm::mr::device_memory_resource* mr, cudaStream_t stream) + operator()(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("Input column can't be a non-floating type"); } }; std::pair, cudf::size_type> nans_to_nulls( - column_view const& input, rmm::mr::device_memory_resource* mr, cudaStream_t stream) + column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (input.is_empty()) { return std::make_pair(std::make_unique(), 0); } - return cudf::type_dispatcher(input.type(), dispatch_nan_to_null{}, input, mr, stream); + return cudf::type_dispatcher(input.type(), dispatch_nan_to_null{}, input, stream, mr); } } // namespace detail @@ -88,7 +94,7 @@ std::pair, cudf::size_type> nans_to_nulls( column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::nans_to_nulls(input, mr); + return detail::nans_to_nulls(input, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp index 587b29201e9..2372382d178 100644 --- a/cpp/src/transform/transform.cpp +++ b/cpp/src/transform/transform.cpp @@ -26,6 +26,7 @@ #include #include #include "jit/code/code.h" +#include "rmm/cuda_stream_view.hpp" #include #include @@ -52,7 +53,7 @@ void unary_operation(mutable_column_view output, const std::string& udf, data_type output_type, bool is_ptx, - cudaStream_t stream) + rmm::cuda_stream_view stream) { std::string hash = "prog_transform" + std::to_string(std::hash{}(udf)); @@ -86,8 +87,8 @@ std::unique_ptr transform(column_view const& input, std::string const& unary_udf, data_type output_type, bool is_ptx, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(is_fixed_width(input.type()), "Unexpected non-fixed-width type."); @@ -113,7 +114,7 @@ std::unique_ptr transform(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::transform(input, unary_udf, output_type, is_ptx, mr); + return detail::transform(input, unary_udf, output_type, is_ptx, rmm::cuda_stream_default, mr); } } // namespace cudf From d08b2d061b8888907bda8be899204f0678c056dc Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 17:55:01 +1100 Subject: [PATCH 39/51] Convert transpose to cuda_stream_view --- cpp/include/cudf/detail/transpose.hpp | 8 +++++--- cpp/src/transpose/transpose.cu | 8 +++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/cpp/include/cudf/detail/transpose.hpp b/cpp/include/cudf/detail/transpose.hpp index 468409c1443..be2c567df35 100644 --- a/cpp/include/cudf/detail/transpose.hpp 
+++ b/cpp/include/cudf/detail/transpose.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,8 @@ #include #include +#include + namespace cudf { namespace detail { /** @@ -27,8 +29,8 @@ namespace detail { */ std::pair, table_view> transpose( table_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/src/transpose/transpose.cu b/cpp/src/transpose/transpose.cu index 3b4439f63f3..67a06e60dd3 100644 --- a/cpp/src/transpose/transpose.cu +++ b/cpp/src/transpose/transpose.cu @@ -24,11 +24,13 @@ #include #include +#include + namespace cudf { namespace detail { std::pair, table_view> transpose(table_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // If there are no rows in the input, return successfully if (input.num_columns() == 0 || input.num_rows() == 0) { @@ -57,7 +59,7 @@ std::pair, table_view> transpose(table_view const& input rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::transpose(input, mr); + return detail::transpose(input, rmm::cuda_stream_default, mr); } } // namespace cudf From 1715b80859b89873d797632b29d2efeb770a62c0 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Tue, 10 Nov 2020 08:49:13 +1100 Subject: [PATCH 40/51] Convert unary ops to cuda_stream_view --- cpp/include/cudf/detail/unary.hpp | 26 +-- cpp/src/dictionary/add_keys.cu | 2 +- cpp/src/dictionary/dictionary_factories.cu | 2 +- cpp/src/dictionary/encode.cu | 2 +- cpp/src/groupby/sort/groupby.cu | 2 +- cpp/src/interop/from_arrow.cpp | 3 +- cpp/src/interop/to_arrow.cpp | 4 +- cpp/src/unary/cast_ops.cu | 32 ++-- cpp/src/unary/math_ops.cu | 197 +++++++++++---------- cpp/src/unary/nan_ops.cu | 41 +++-- cpp/src/unary/null_ops.cu | 3 + cpp/src/unary/unary_ops.cuh | 12 +- 12 files changed, 172 insertions(+), 154 deletions(-) diff --git a/cpp/include/cudf/detail/unary.hpp b/cpp/include/cudf/detail/unary.hpp index cd8749cae0b..fb5416fe750 100644 --- a/cpp/include/cudf/detail/unary.hpp +++ b/cpp/include/cudf/detail/unary.hpp @@ -19,6 +19,8 @@ #include #include +#include + namespace cudf { namespace detail { /** @@ -32,8 +34,8 @@ namespace detail { * @param begin Beginning of the sequence of elements * @param end End of the sequence of elements * @param p Predicate to be applied to each element in `[begin,end)` - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory * * @returns A column of type `type_id::BOOL8,` with `true` representing predicate is satisfied. 
*/ @@ -44,15 +46,15 @@ std::unique_ptr true_if( InputIterator end, size_type size, Predicate p, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto output = make_numeric_column(data_type(type_id::BOOL8), size, mask_state::UNALLOCATED, stream, mr); auto output_mutable_view = output->mutable_view(); auto output_data = output_mutable_view.data(); - thrust::transform(rmm::exec_policy(stream)->on(stream), begin, end, output_data, p); + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), begin, end, output_data, p); return output; } @@ -65,8 +67,8 @@ std::unique_ptr true_if( std::unique_ptr unary_operation( cudf::column_view const& input, cudf::unary_op op, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::cast @@ -76,8 +78,8 @@ std::unique_ptr unary_operation( std::unique_ptr cast( column_view const& input, data_type type, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::is_nan @@ -86,8 +88,8 @@ std::unique_ptr cast( */ std::unique_ptr is_nan( cudf::column_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::is_not_nan @@ -96,8 +98,8 @@ std::unique_ptr is_nan( */ std::unique_ptr is_not_nan( cudf::column_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index b72b4f38a56..6a9b294758d 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -111,7 +111,7 @@ std::unique_ptr add_keys( } // otherwise we need to convert the gather result column_view cast_view(gather_result.type(), indices_size, gather_result.head(), nullptr, 0); - return cudf::detail::cast(cast_view, indices_type, mr, stream); + return cudf::detail::cast(cast_view, indices_type, stream, mr); }(); // create new dictionary column with keys_column and indices_column diff --git a/cpp/src/dictionary/dictionary_factories.cu b/cpp/src/dictionary/dictionary_factories.cu index ec598b71f88..17a09e26f7b 100644 --- a/cpp/src/dictionary/dictionary_factories.cu +++ b/cpp/src/dictionary/dictionary_factories.cu @@ -139,7 +139,7 @@ std::unique_ptr make_dictionary_column(std::unique_ptr keys, } // If the new type does not match, then convert the data. 
cudf::column_view cast_view{cudf::data_type{indices_type}, indices_size, contents.data->data()}; - return cudf::detail::cast(cast_view, new_type, mr, stream); + return cudf::detail::cast(cast_view, new_type, stream, mr); }(); return make_dictionary_column(std::move(keys), diff --git a/cpp/src/dictionary/encode.cu b/cpp/src/dictionary/encode.cu index 9c20d5006bf..129c9345d4b 100644 --- a/cpp/src/dictionary/encode.cu +++ b/cpp/src/dictionary/encode.cu @@ -60,7 +60,7 @@ std::unique_ptr encode(column_view const& input_column, // the encode() returns INT32 for indices if (indices_column->type().id() != indices_type.id()) - indices_column = cudf::detail::cast(indices_column->view(), indices_type, mr, stream); + indices_column = cudf::detail::cast(indices_column->view(), indices_type, stream, mr); // create column with keys_column and indices_column return make_dictionary_column( diff --git a/cpp/src/groupby/sort/groupby.cu b/cpp/src/groupby/sort/groupby.cu index 7077e6f089c..cc77e9b8af8 100644 --- a/cpp/src/groupby/sort/groupby.cu +++ b/cpp/src/groupby/sort/groupby.cu @@ -307,7 +307,7 @@ void store_result_functor::operator()(aggregation const& agg) operator()(*var_agg); column_view var_result = cache.get_result(col_idx, *var_agg); - auto result = cudf::detail::unary_operation(var_result, unary_op::SQRT, mr, stream); + auto result = cudf::detail::unary_operation(var_result, unary_op::SQRT, stream, mr); cache.add_result(col_idx, agg, std::move(result)); }; diff --git a/cpp/src/interop/from_arrow.cpp b/cpp/src/interop/from_arrow.cpp index 690647d9306..4f208d8985c 100644 --- a/cpp/src/interop/from_arrow.cpp +++ b/cpp/src/interop/from_arrow.cpp @@ -256,8 +256,7 @@ std::unique_ptr dispatch_to_cudf_column::operator()( // If index type is not of type uint32_t, then cast it to uint32_t auto const dict_indices_type = data_type{type_id::UINT32}; if (indices_column->type().id() != dict_indices_type.id()) - indices_column = - cudf::detail::cast(indices_column->view(), dict_indices_type, mr, stream.value()); + indices_column = cudf::detail::cast(indices_column->view(), dict_indices_type, stream, mr); // Child columns shouldn't have masks and we need the mask in main column auto column_contents = indices_column->release(); diff --git a/cpp/src/interop/to_arrow.cpp b/cpp/src/interop/to_arrow.cpp index ec58da6bf0b..c36b2be77e8 100644 --- a/cpp/src/interop/to_arrow.cpp +++ b/cpp/src/interop/to_arrow.cpp @@ -276,8 +276,8 @@ std::shared_ptr dispatch_to_arrow::operator()( std::unique_ptr dict_indices = cast(cudf::dictionary_column_view(input).get_indices_annotated(), cudf::data_type{type_id::INT32}, - rmm::mr::get_current_device_resource(), - stream.value()); + stream, + rmm::mr::get_current_device_resource()); auto indices = dispatch_to_arrow{}.operator()( dict_indices->view(), dict_indices->type().id(), {}, ar_mr, stream); auto dict_keys = cudf::dictionary_column_view(input).keys(); diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index e8cc606865b..4f006527bbc 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -109,8 +109,8 @@ struct dispatch_unary_cast_to { !(cudf::is_timestamp() && is_numeric()) && !(cudf::is_timestamp() && is_numeric())>* = nullptr> std::unique_ptr operator()(data_type type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto size = input.size(); auto output = @@ -122,7 +122,7 @@ struct dispatch_unary_cast_to { mutable_column_view output_mutable = *output; - 
thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), input.begin(), input.end(), output_mutable.begin(), @@ -137,8 +137,8 @@ struct dispatch_unary_cast_to { (cudf::is_timestamp() && is_numeric()) || (cudf::is_timestamp() && is_numeric())>* = nullptr> std::unique_ptr operator()(data_type type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (!cudf::is_fixed_width()) CUDF_FAIL("Column type must be numeric or chrono"); @@ -160,24 +160,24 @@ struct dispatch_unary_cast_from { typename T, typename std::enable_if_t() && !cudf::is_fixed_point()>* = nullptr> std::unique_ptr operator()(data_type type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return type_dispatcher(type, dispatch_unary_cast_to{input}, type, mr, stream); + return type_dispatcher(type, dispatch_unary_cast_to{input}, type, stream, mr); } template ()>* = nullptr> std::unique_ptr operator()(data_type type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("Fixed point unary ops not supported yet"); } template ()>* = nullptr> std::unique_ptr operator()(data_type type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("Column type must be numeric or chrono"); } @@ -185,12 +185,12 @@ struct dispatch_unary_cast_from { std::unique_ptr cast(column_view const& input, data_type type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(is_fixed_width(type), "Unary cast type must be fixed-width."); - return type_dispatcher(input.type(), detail::dispatch_unary_cast_from{input}, type, mr, stream); + return type_dispatcher(input.type(), detail::dispatch_unary_cast_from{input}, type, stream, mr); } } // namespace detail @@ -200,7 +200,7 @@ std::unique_ptr cast(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::cast(input, type, mr); + return detail::cast(input, type, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu index 08b653c7353..348f829d192 100644 --- a/cpp/src/unary/math_ops.cu +++ b/cpp/src/unary/math_ops.cu @@ -235,8 +235,8 @@ std::unique_ptr transform_fn(InputIterator begin, InputIterator end, rmm::device_buffer&& null_mask, size_type null_count, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto const size = cudf::distance(begin, end); @@ -250,60 +250,65 @@ std::unique_ptr transform_fn(InputIterator begin, if (size == 0) return output; auto output_view = output->mutable_view(); - thrust::transform( - rmm::exec_policy(stream)->on(stream), begin, end, output_view.begin(), UFN{}); + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), + begin, + end, + output_view.begin(), + UFN{}); return output; } template std::unique_ptr transform_fn(cudf::dictionary_column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto dictionary_view = cudf::column_device_view::create(input.parent(), stream); auto 
dictionary_itr = dictionary::detail::make_dictionary_iterator(*dictionary_view); auto default_mr = rmm::mr::get_current_device_resource(); // call unary-op using temporary output buffer - auto output = transform_fn( - dictionary_itr, - dictionary_itr + input.size(), - detail::copy_bitmask(input.parent(), rmm::cuda_stream_view{stream}, default_mr), - input.null_count(), - default_mr, - stream); + auto output = transform_fn(dictionary_itr, + dictionary_itr + input.size(), + detail::copy_bitmask(input.parent(), stream, default_mr), + input.null_count(), + stream, + default_mr); return cudf::dictionary::detail::encode( - output->view(), dictionary::detail::get_indices_type_for_size(output->size()), mr, stream); + output->view(), + dictionary::detail::get_indices_type_for_size(output->size()), + mr, + stream.value()); } template struct MathOpDispatcher { template ::value>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return transform_fn( input.begin(), input.end(), cudf::detail::copy_bitmask(input, rmm::cuda_stream_view{stream}, mr), input.null_count(), - mr, - stream); + stream, + mr); } struct dictionary_dispatch { template ::value>* = nullptr> std::unique_ptr operator()(cudf::dictionary_column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return transform_fn(input, mr, stream); + return transform_fn(input, stream, mr); } template ::value>* = nullptr> std::unique_ptr operator()(cudf::dictionary_column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("dictionary keys must be numeric for this operation"); } @@ -313,21 +318,21 @@ struct MathOpDispatcher { typename std::enable_if_t::value and std::is_same::value>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.is_empty()) return empty_like(input); auto dictionary_col = dictionary_column_view(input); return type_dispatcher( - dictionary_col.keys().type(), dictionary_dispatch{}, dictionary_col, mr, stream); + dictionary_col.keys().type(), dictionary_dispatch{}, dictionary_col, stream, mr); } template ::value and !std::is_same::value>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("Unsupported data type for operation"); } @@ -337,31 +342,30 @@ template struct BitwiseOpDispatcher { template ::value>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return transform_fn( - input.begin(), - input.end(), - cudf::detail::copy_bitmask(input, rmm::cuda_stream_view{stream}, mr), - input.null_count(), - mr, - stream); + return transform_fn(input.begin(), + input.end(), + cudf::detail::copy_bitmask(input, stream, mr), + input.null_count(), + stream, + mr); } struct dictionary_dispatch { template ::value>* = nullptr> std::unique_ptr operator()(cudf::dictionary_column_view const& input, - 
rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return transform_fn(input, mr, stream); + return transform_fn(input, stream, mr); } template ::value>* = nullptr> std::unique_ptr operator()(cudf::dictionary_column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("dictionary keys type not supported for this operation"); } @@ -371,21 +375,21 @@ struct BitwiseOpDispatcher { typename std::enable_if_t::value and std::is_same::value>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.is_empty()) return empty_like(input); auto dictionary_col = dictionary_column_view(input); return type_dispatcher( - dictionary_col.keys().type(), dictionary_dispatch{}, dictionary_col, mr, stream); + dictionary_col.keys().type(), dictionary_dispatch{}, dictionary_col, stream, mr); } template ::value and !std::is_same::value>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("Unsupported datatype for operation"); } @@ -403,39 +407,38 @@ struct LogicalOpDispatcher { public: template ()>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return transform_fn( - input.begin(), - input.end(), - cudf::detail::copy_bitmask(input, rmm::cuda_stream_view{stream}, mr), - input.null_count(), - mr, - stream); + return transform_fn(input.begin(), + input.end(), + cudf::detail::copy_bitmask(input, stream, mr), + input.null_count(), + + stream, + mr); } struct dictionary_dispatch { template ()>* = nullptr> std::unique_ptr operator()(cudf::dictionary_column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto dictionary_view = cudf::column_device_view::create(input.parent(), stream); auto dictionary_itr = dictionary::detail::make_dictionary_iterator(*dictionary_view); - return transform_fn( - dictionary_itr, - dictionary_itr + input.size(), - cudf::detail::copy_bitmask(input.parent(), rmm::cuda_stream_view{stream}, mr), - input.null_count(), - mr, - stream); + return transform_fn(dictionary_itr, + dictionary_itr + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), + stream, + mr); } template ()>* = nullptr> std::unique_ptr operator()(cudf::dictionary_column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("dictionary keys type not supported for this operation"); } @@ -445,13 +448,13 @@ struct LogicalOpDispatcher { typename std::enable_if_t() and std::is_same::value>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.is_empty()) return make_empty_column(cudf::data_type{cudf::type_id::BOOL8}); auto dictionary_col = 
dictionary_column_view(input); return type_dispatcher( - dictionary_col.keys().type(), dictionary_dispatch{}, dictionary_col, mr, stream); + dictionary_col.keys().type(), dictionary_dispatch{}, dictionary_col, stream, mr); } // template ()>* = nullptr> @@ -459,8 +462,8 @@ struct LogicalOpDispatcher { typename std::enable_if_t() and !std::is_same::value>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("Unsupported datatype for operation"); } @@ -470,79 +473,79 @@ struct LogicalOpDispatcher { std::unique_ptr unary_operation(cudf::column_view const& input, cudf::unary_op op, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { switch (op) { case cudf::unary_op::SIN: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::COS: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::TAN: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::ARCSIN: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::ARCCOS: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::ARCTAN: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::SINH: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::COSH: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::TANH: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::ARCSINH: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::ARCCOSH: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::ARCTANH: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::EXP: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::LOG: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::SQRT: return cudf::type_dispatcher( - 
input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::CBRT: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::CEIL: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::FLOOR: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::ABS: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::RINT: CUDF_EXPECTS( (input.type().id() == type_id::FLOAT32) or (input.type().id() == type_id::FLOAT64), "rint expects floating point values"); return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::BIT_INVERT: return cudf::type_dispatcher( - input.type(), detail::BitwiseOpDispatcher{}, input, mr, stream); + input.type(), detail::BitwiseOpDispatcher{}, input, stream, mr); case cudf::unary_op::NOT: return cudf::type_dispatcher( - input.type(), detail::LogicalOpDispatcher{}, input, mr, stream); + input.type(), detail::LogicalOpDispatcher{}, input, stream, mr); default: CUDF_FAIL("Undefined unary operation"); } } @@ -554,7 +557,7 @@ std::unique_ptr unary_operation(cudf::column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::unary_operation(input, op, mr); + return detail::unary_operation(input, op, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/unary/nan_ops.cu b/cpp/src/unary/nan_ops.cu index 33600e83530..9f8f0e53cb2 100644 --- a/cpp/src/unary/nan_ops.cu +++ b/cpp/src/unary/nan_ops.cu @@ -21,6 +21,7 @@ #include #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { @@ -29,19 +30,27 @@ struct nan_dispatcher { std::enable_if_t::value, std::unique_ptr> operator()( cudf::column_view const& input, Predicate predicate, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto input_device_view = column_device_view::create(input); if (input.has_nulls()) { auto input_pair_iterator = make_pair_iterator(*input_device_view); - return true_if( - input_pair_iterator, input_pair_iterator + input.size(), input.size(), predicate, mr); + return true_if(input_pair_iterator, + input_pair_iterator + input.size(), + input.size(), + predicate, + stream, + mr); } else { auto input_pair_iterator = make_pair_iterator(*input_device_view); - return true_if( - input_pair_iterator, input_pair_iterator + input.size(), input.size(), predicate, mr); + return true_if(input_pair_iterator, + input_pair_iterator + input.size(), + input.size(), + predicate, + stream, + mr); } } @@ -49,33 +58,33 @@ struct nan_dispatcher { std::enable_if_t::value, std::unique_ptr> operator()( cudf::column_view const& input, Predicate predicate, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("NAN is not supported in a Non-floating point type column"); } }; std::unique_ptr 
is_nan(cudf::column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto predicate = [] __device__(auto element_validity_pair) { return element_validity_pair.second and std::isnan(element_validity_pair.first); }; - return cudf::type_dispatcher(input.type(), nan_dispatcher{}, input, predicate, mr, stream); + return cudf::type_dispatcher(input.type(), nan_dispatcher{}, input, predicate, stream, mr); } std::unique_ptr is_not_nan(cudf::column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto predicate = [] __device__(auto element_validity_pair) { return !element_validity_pair.second or !std::isnan(element_validity_pair.first); }; - return cudf::type_dispatcher(input.type(), nan_dispatcher{}, input, predicate, mr, stream); + return cudf::type_dispatcher(input.type(), nan_dispatcher{}, input, predicate, stream, mr); } } // namespace detail @@ -83,14 +92,14 @@ std::unique_ptr is_not_nan(cudf::column_view const& input, std::unique_ptr is_nan(cudf::column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_nan(input, mr); + return detail::is_nan(input, rmm::cuda_stream_default, mr); } std::unique_ptr is_not_nan(cudf::column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_not_nan(input, mr); + return detail::is_not_nan(input, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/unary/null_ops.cu b/cpp/src/unary/null_ops.cu index 3355cfc348d..699439da1c9 100644 --- a/cpp/src/unary/null_ops.cu +++ b/cpp/src/unary/null_ops.cu @@ -20,6 +20,7 @@ #include #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { std::unique_ptr is_null(cudf::column_view const& input, rmm::mr::device_memory_resource* mr) @@ -32,6 +33,7 @@ std::unique_ptr is_null(cudf::column_view const& input, rmm::mr::device_ thrust::make_counting_iterator(input.size()), input.size(), predicate, + rmm::cuda_stream_default, mr); } @@ -46,6 +48,7 @@ std::unique_ptr is_valid(cudf::column_view const& input, thrust::make_counting_iterator(input.size()), input.size(), predicate, + rmm::cuda_stream_default, mr); } diff --git a/cpp/src/unary/unary_ops.cuh b/cpp/src/unary/unary_ops.cuh index 51b63806cfa..a74a05437be 100644 --- a/cpp/src/unary/unary_ops.cuh +++ b/cpp/src/unary/unary_ops.cuh @@ -20,8 +20,10 @@ #include #include #include +#include #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace unary { @@ -29,8 +31,8 @@ template struct launcher { static std::unique_ptr launch(cudf::column_view const& input, cudf::unary_op op, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { std::unique_ptr output = [&] { if (op == cudf::unary_op::NOT) { @@ -40,12 +42,12 @@ struct launcher { return std::make_unique(type, size, rmm::device_buffer{size * cudf::size_of(type), 0, mr}, - copy_bitmask(input, 0, mr), + cudf::detail::copy_bitmask(input, stream, mr), input.null_count()); } else { return cudf::detail::allocate_like( - input, input.size(), mask_allocation_policy::NEVER, mr, stream); + input, input.size(), mask_allocation_policy::NEVER, stream, mr); } }(); @@ -62,7 +64,7 @@ struct launcher { rmm::device_buffer{input.null_mask(), bitmask_allocation_size_bytes(input.size())}, input.null_count()); - 
thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), input.begin(), input.end(), output_view.begin(), From 6250687520808c5f86e67fc895d8f0cedbbe3e8a Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Tue, 10 Nov 2020 15:53:53 +1100 Subject: [PATCH 41/51] Fix JNI build after cuda_stream_view changes --- java/pom.xml | 1 + java/src/main/native/src/TableJni.cpp | 319 +++--- java/src/main/native/src/map_lookup.cu | 77 +- java/src/main/native/src/map_lookup.hpp | 61 +- java/src/main/native/src/row_conversion.cu | 1012 +++++++++---------- java/src/main/native/src/row_conversion.hpp | 25 +- 6 files changed, 690 insertions(+), 805 deletions(-) diff --git a/java/pom.xml b/java/pom.xml index d14d4202e21..8894d9eae46 100755 --- a/java/pom.xml +++ b/java/pom.xml @@ -351,6 +351,7 @@ + #include #include -#include #include +#include #include #include #include @@ -34,10 +34,11 @@ #include #include #include +#include #include "cudf_jni_apis.hpp" -#include "row_conversion.hpp" #include "dtype_utils.hpp" +#include "row_conversion.hpp" namespace cudf { namespace jni { @@ -108,7 +109,7 @@ class jni_writer_data_sink final : public cudf::io::data_sink { bool supports_device_write() const override { return true; } - void device_write(void const *gpu_data, size_t size, cudaStream_t stream) { + void device_write(void const *gpu_data, size_t size, rmm::cuda_stream_view stream) { JNIEnv *env = cudf::jni::get_jni_env(jvm); size_t left_to_copy = size; const char *copy_from = static_cast(gpu_data); @@ -116,7 +117,7 @@ class jni_writer_data_sink final : public cudf::io::data_sink { long buffer_amount_available = current_buffer_len - current_buffer_written; if (buffer_amount_available <= 0) { // should never be < 0, but just to be safe - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); rotate_buffer(env); buffer_amount_available = current_buffer_len - current_buffer_written; } @@ -124,14 +125,15 @@ class jni_writer_data_sink final : public cudf::io::data_sink { left_to_copy < buffer_amount_available ? 
left_to_copy : buffer_amount_available; char *copy_to = current_buffer_data + current_buffer_written; - CUDA_TRY(cudaMemcpyAsync(copy_to, copy_from, amount_to_copy, cudaMemcpyDeviceToHost, stream)); + CUDA_TRY(cudaMemcpyAsync(copy_to, copy_from, amount_to_copy, cudaMemcpyDeviceToHost, + stream.value())); copy_from = copy_from + amount_to_copy; current_buffer_written += amount_to_copy; total_written += amount_to_copy; left_to_copy -= amount_to_copy; } - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); } void flush() override { @@ -195,26 +197,18 @@ template class jni_table_writer_handle final { std::unique_ptr sink; }; -typedef jni_table_writer_handle - native_parquet_writer_handle; +typedef jni_table_writer_handle native_parquet_writer_handle; typedef jni_table_writer_handle native_orc_writer_handle; class native_arrow_ipc_writer_handle final { public: - explicit native_arrow_ipc_writer_handle( - const std::vector& col_names, - const std::string& file_name): - initialized(false), - column_names(col_names), - file_name(file_name) {} - - explicit native_arrow_ipc_writer_handle( - const std::vector& col_names, - const std::shared_ptr& sink): - initialized(false), - column_names(col_names), - sink(sink), - file_name("") {} + explicit native_arrow_ipc_writer_handle(const std::vector &col_names, + const std::string &file_name) + : initialized(false), column_names(col_names), file_name(file_name) {} + + explicit native_arrow_ipc_writer_handle(const std::vector &col_names, + const std::shared_ptr &sink) + : initialized(false), column_names(col_names), sink(sink), file_name("") {} bool initialized; std::vector column_names; @@ -222,7 +216,7 @@ class native_arrow_ipc_writer_handle final { std::shared_ptr sink; std::shared_ptr writer; - void write(std::shared_ptr& arrow_tab, int64_t max_chunk) { + void write(std::shared_ptr &arrow_tab, int64_t max_chunk) { if (!initialized) { if (!sink) { auto tmp_sink = arrow::io::FileOutputStream::Open(file_name); @@ -252,7 +246,6 @@ class native_arrow_ipc_writer_handle final { } }; - class jni_arrow_output_stream final : public arrow::io::OutputStream { public: explicit jni_arrow_output_stream(JNIEnv *env, jobject callback) { @@ -292,11 +285,11 @@ class jni_arrow_output_stream final : public arrow::io::OutputStream { current_buffer = nullptr; } - arrow::Status Write(const std::shared_ptr & data) override { + arrow::Status Write(const std::shared_ptr &data) override { return Write(data->data(), data->size()); } - arrow::Status Write(const void* data, int64_t nbytes) override { + arrow::Status Write(const void *data, int64_t nbytes) override { JNIEnv *env = cudf::jni::get_jni_env(jvm); int64_t left_to_copy = nbytes; const char *copy_from = static_cast(data); @@ -346,13 +339,9 @@ class jni_arrow_output_stream final : public arrow::io::OutputStream { return arrow::Status::OK(); } - arrow::Result Tell() const override { - return total_written; - } + arrow::Result Tell() const override { return total_written; } - bool closed() const override { - return is_closed; - } + bool closed() const override { return is_closed; } private: void rotate_buffer(JNIEnv *env) { @@ -389,8 +378,8 @@ class jni_arrow_output_stream final : public arrow::io::OutputStream { class jni_arrow_input_stream final : public arrow::io::InputStream { public: - explicit jni_arrow_input_stream(JNIEnv *env, jobject callback) : - mm(arrow::default_cpu_memory_manager()) { + explicit jni_arrow_input_stream(JNIEnv *env, jobject callback) + : mm(arrow::default_cpu_memory_manager()) { if 
(env->GetJavaVM(&jvm) < 0) { throw std::runtime_error("GetJavaVM failed"); } @@ -400,8 +389,7 @@ class jni_arrow_input_stream final : public arrow::io::InputStream { throw cudf::jni::jni_exception("class not found"); } - read_into_method = - env->GetMethodID(cls, "readInto", "(JJ)J"); + read_into_method = env->GetMethodID(cls, "readInto", "(JJ)J"); if (read_into_method == nullptr) { throw cudf::jni::jni_exception("readInto method"); } @@ -423,7 +411,7 @@ class jni_arrow_input_stream final : public arrow::io::InputStream { callback = nullptr; } - arrow::Result Read(int64_t nbytes, void* out) override { + arrow::Result Read(int64_t nbytes, void *out) override { JNIEnv *env = cudf::jni::get_jni_env(jvm); jlong ret = read_into(env, reinterpret_cast(out), nbytes); total_read += ret; @@ -432,7 +420,7 @@ class jni_arrow_input_stream final : public arrow::io::InputStream { arrow::Result> Read(int64_t nbytes) override { JNIEnv *env = cudf::jni::get_jni_env(jvm); - arrow::Result> tmp_buffer = + arrow::Result> tmp_buffer = arrow::AllocateResizableBuffer(nbytes); if (!tmp_buffer.ok()) { return tmp_buffer; @@ -444,7 +432,7 @@ class jni_arrow_input_stream final : public arrow::io::InputStream { } return tmp_buffer; } - + arrow::Status Close() override { is_closed = true; return arrow::Status::OK(); @@ -455,13 +443,9 @@ class jni_arrow_input_stream final : public arrow::io::InputStream { return arrow::Status::OK(); } - arrow::Result Tell() const override { - return total_read; - } + arrow::Result Tell() const override { return total_read; } - bool closed() const override { - return is_closed; - } + bool closed() const override { return is_closed; } private: jlong read_into(JNIEnv *env, jlong addr, jlong len) { @@ -483,8 +467,7 @@ class jni_arrow_input_stream final : public arrow::io::InputStream { class native_arrow_ipc_reader_handle final { public: - explicit native_arrow_ipc_reader_handle( - const std::string& file_name) { + explicit native_arrow_ipc_reader_handle(const std::string &file_name) { auto tmp_source = arrow::io::ReadableFile::Open(file_name); if (!tmp_source.ok()) { throw std::runtime_error(tmp_source.status().message()); @@ -497,9 +480,8 @@ class native_arrow_ipc_reader_handle final { reader = *tmp_reader; } - explicit native_arrow_ipc_reader_handle( - std::shared_ptr source): - source(source) { + explicit native_arrow_ipc_reader_handle(std::shared_ptr source) + : source(source) { auto tmp_reader = arrow::ipc::RecordBatchStreamReader::Open(source); if (!tmp_reader.ok()) { throw std::runtime_error(tmp_reader.status().message()); @@ -528,7 +510,7 @@ class native_arrow_ipc_reader_handle final { // EOF return std::unique_ptr(); } - arrow::Result> tmp = + arrow::Result> tmp = arrow::Table::FromRecordBatches(reader->schema(), batches); if (!tmp.ok()) { throw std::runtime_error(tmp.status().message()); @@ -539,9 +521,7 @@ class native_arrow_ipc_reader_handle final { std::shared_ptr source; std::shared_ptr reader; - void close() { - source->Close(); - } + void close() { source->Close(); } }; /** @@ -584,8 +564,8 @@ bool valid_window_parameters(native_jintArray const &values, // Check that time-range window parameters are valid. 
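The jni_writer_data_sink::device_write change earlier in this file shows the interop pattern used throughout the patch wherever a raw CUDA API or a kernel launch still needs a cudaStream_t: pass stream.value(), and replace cudaStreamSynchronize with the view's own synchronize(). A minimal sketch, assuming h_buf, d_buf and nbytes are already set up (illustrative names, not taken from the patch):

rmm::cuda_stream_view stream = rmm::cuda_stream_default;
// Raw CUDA calls (and <<<...>>> launches) take the underlying cudaStream_t via value().
CUDA_TRY(cudaMemcpyAsync(h_buf, d_buf, nbytes, cudaMemcpyDeviceToHost, stream.value()));
// Host-side waits go through the view instead of CUDA_TRY(cudaStreamSynchronize(stream)).
stream.synchronize();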
bool valid_window_parameters(native_jintArray const &values, native_jintArray const ×tamps, native_jpointerArray const &ops, - native_jintArray const &min_periods, - native_jintArray const &preceding, native_jintArray const &following) { + native_jintArray const &min_periods, native_jintArray const &preceding, + native_jintArray const &following) { return values.size() == timestamps.size() && valid_window_parameters(values, ops, min_periods, preceding, following); } @@ -697,8 +677,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_merge(JNIEnv *env, jclass try { cudf::jni::auto_set_device(env); - cudf::jni::native_jpointerArray n_table_handles(env, - j_table_handles); + cudf::jni::native_jpointerArray n_table_handles(env, j_table_handles); const cudf::jni::native_jintArray n_sort_key_indexes(env, j_sort_key_indexes); jsize num_columns = n_sort_key_indexes.size(); @@ -738,10 +717,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_merge(JNIEnv *env, jclass tables.push_back(*n_table_handles[i]); } - std::unique_ptr result = cudf::merge(tables, - indexes, - order, - null_order); + std::unique_ptr result = cudf::merge(tables, indexes, order, null_order); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, NULL); @@ -790,19 +766,19 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV( } cudf::io::csv_reader_options opts = cudf::io::csv_reader_options::builder(*source) - .delimiter(delim) - .header(header_row) - .names(n_col_names.as_cpp_vector()) - .dtypes(n_data_types.as_cpp_vector()) - .use_cols_names(n_filter_col_names.as_cpp_vector()) - .true_values(n_true_values.as_cpp_vector()) - .false_values(n_false_values.as_cpp_vector()) - .na_values(n_null_values.as_cpp_vector()) - .keep_default_na(false) - .na_filter(n_null_values.size() > 0) - .quotechar(quote) - .comment(comment) - .build(); + .delimiter(delim) + .header(header_row) + .names(n_col_names.as_cpp_vector()) + .dtypes(n_data_types.as_cpp_vector()) + .use_cols_names(n_filter_col_names.as_cpp_vector()) + .true_values(n_true_values.as_cpp_vector()) + .false_values(n_false_values.as_cpp_vector()) + .na_values(n_null_values.as_cpp_vector()) + .keep_default_na(false) + .na_filter(n_null_values.size() > 0) + .quotechar(quote) + .comment(comment) + .build(); cudf::io::table_with_metadata result = cudf::io::read_csv(opts); return cudf::jni::convert_table_for_return(env, result.tbl); } @@ -842,11 +818,11 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet( } cudf::io::parquet_reader_options opts = - cudf::io::parquet_reader_options::builder(*source) - .columns(n_filter_col_names.as_cpp_vector()) - .convert_strings_to_categories(false) - .timestamp_type(cudf::data_type(static_cast(unit))) - .build(); + cudf::io::parquet_reader_options::builder(*source) + .columns(n_filter_col_names.as_cpp_vector()) + .convert_strings_to_categories(false) + .timestamp_type(cudf::data_type(static_cast(unit))) + .build(); cudf::io::table_with_metadata result = cudf::io::read_parquet(opts); return cudf::jni::convert_table_for_return(env, result.tbl); } @@ -883,11 +859,11 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin( new cudf::jni::jni_writer_data_sink(env, consumer)); sink_info sink{data_sink.get()}; chunked_parquet_writer_options opts = - chunked_parquet_writer_options::builder(sink) - .nullable_metadata(&metadata) - .compression(static_cast(j_compression)) - .stats_level(static_cast(j_stats_freq)) - .build(); + chunked_parquet_writer_options::builder(sink) + 
.nullable_metadata(&metadata) + .compression(static_cast(j_compression)) + .stats_level(static_cast(j_stats_freq)) + .build(); std::shared_ptr state = write_parquet_chunked_begin(opts); cudf::jni::native_parquet_writer_handle *ret = new cudf::jni::native_parquet_writer_handle(state, data_sink); @@ -925,11 +901,11 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetFileBegin( sink_info sink{output_path.get()}; chunked_parquet_writer_options opts = - chunked_parquet_writer_options::builder(sink) - .nullable_metadata(&metadata) - .compression(static_cast(j_compression)) - .stats_level(static_cast(j_stats_freq)) - .build(); + chunked_parquet_writer_options::builder(sink) + .nullable_metadata(&metadata) + .compression(static_cast(j_compression)) + .stats_level(static_cast(j_stats_freq)) + .build(); std::shared_ptr state = write_parquet_chunked_begin(opts); cudf::jni::native_parquet_writer_handle *ret = new cudf::jni::native_parquet_writer_handle(state); @@ -1007,12 +983,13 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORC( source.reset(new cudf::io::source_info(filename.get())); } - cudf::io::orc_reader_options opts = cudf::io::orc_reader_options::builder(*source) - .columns(n_filter_col_names.as_cpp_vector()) - .use_index(false) - .use_np_dtypes(static_cast(usingNumPyTypes)) - .timestamp_type(cudf::data_type(static_cast(unit))) - .build(); + cudf::io::orc_reader_options opts = + cudf::io::orc_reader_options::builder(*source) + .columns(n_filter_col_names.as_cpp_vector()) + .use_index(false) + .use_np_dtypes(static_cast(usingNumPyTypes)) + .timestamp_type(cudf::data_type(static_cast(unit))) + .build(); cudf::io::table_with_metadata result = cudf::io::read_orc(opts); return cudf::jni::convert_table_for_return(env, result.tbl); } @@ -1048,12 +1025,11 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCBufferBegin( std::unique_ptr data_sink( new cudf::jni::jni_writer_data_sink(env, consumer)); sink_info sink{data_sink.get()}; - chunked_orc_writer_options opts = - chunked_orc_writer_options::builder(sink) - .metadata(&metadata) - .compression(static_cast(j_compression)) - .enable_statistics(true) - .build(); + chunked_orc_writer_options opts = chunked_orc_writer_options::builder(sink) + .metadata(&metadata) + .compression(static_cast(j_compression)) + .enable_statistics(true) + .build(); std::shared_ptr state = write_orc_chunked_begin(opts); cudf::jni::native_orc_writer_handle *ret = new cudf::jni::native_orc_writer_handle(state, data_sink); @@ -1090,12 +1066,11 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCFileBegin( } sink_info sink{output_path.get()}; - chunked_orc_writer_options opts = - chunked_orc_writer_options::builder(sink) - .metadata(&metadata) - .compression(static_cast(j_compression)) - .enable_statistics(true) - .build(); + chunked_orc_writer_options opts = chunked_orc_writer_options::builder(sink) + .metadata(&metadata) + .compression(static_cast(j_compression)) + .enable_statistics(true) + .build(); std::shared_ptr state = write_orc_chunked_begin(opts); cudf::jni::native_orc_writer_handle *ret = new cudf::jni::native_orc_writer_handle(state); return reinterpret_cast(ret); @@ -1138,10 +1113,9 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeORCEnd(JNIEnv *env, jclass CATCH_STD(env, ) } - -JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCBufferBegin( - JNIEnv *env, jclass, jobjectArray j_col_names, - jobject consumer) { +JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCBufferBegin(JNIEnv *env, jclass, 
+ jobjectArray j_col_names, + jobject consumer) { JNI_NULL_CHECK(env, j_col_names, "null columns", 0); JNI_NULL_CHECK(env, consumer, "null consumer", 0); try { @@ -1152,17 +1126,15 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCBufferBegin( new cudf::jni::jni_arrow_output_stream(env, consumer)); cudf::jni::native_arrow_ipc_writer_handle *ret = - new cudf::jni::native_arrow_ipc_writer_handle( - col_names.as_cpp_vector(), - data_sink); + new cudf::jni::native_arrow_ipc_writer_handle(col_names.as_cpp_vector(), data_sink); return reinterpret_cast(ret); } CATCH_STD(env, 0) } -JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCFileBegin( - JNIEnv *env, jclass, jobjectArray j_col_names, - jstring j_output_path) { +JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCFileBegin(JNIEnv *env, jclass, + jobjectArray j_col_names, + jstring j_output_path) { JNI_NULL_CHECK(env, j_col_names, "null columns", 0); JNI_NULL_CHECK(env, j_output_path, "null output path", 0); try { @@ -1171,9 +1143,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCFileBegin( cudf::jni::native_jstring output_path(env, j_output_path); cudf::jni::native_arrow_ipc_writer_handle *ret = - new cudf::jni::native_arrow_ipc_writer_handle( - col_names.as_cpp_vector(), - output_path.get()); + new cudf::jni::native_arrow_ipc_writer_handle(col_names.as_cpp_vector(), output_path.get()); return reinterpret_cast(ret); } CATCH_STD(env, 0) @@ -1191,15 +1161,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_convertCudfToArrowTable(JNIEnv try { cudf::jni::auto_set_device(env); - std::unique_ptr> result(new std::shared_ptr(nullptr)); + std::unique_ptr> result( + new std::shared_ptr(nullptr)); auto column_metadata = std::vector{}; column_metadata.reserve(state->column_names.size()); - std::transform( - std::begin(state->column_names), - std::end(state->column_names), - std::back_inserter(column_metadata), - [](auto const& column_name) { return cudf::column_metadata{column_name}; } - ); + std::transform(std::begin(state->column_names), std::end(state->column_names), + std::back_inserter(column_metadata), + [](auto const &column_name) { return cudf::column_metadata{column_name}; }); *result = cudf::to_arrow(*tview, column_metadata); if (!result->get()) { return 0; @@ -1243,7 +1211,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCEnd(JNIEnv *env, j } JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_readArrowIPCFileBegin(JNIEnv *env, jclass, - jstring j_input_path) { + jstring j_input_path) { JNI_NULL_CHECK(env, j_input_path, "null input path", 0); try { cudf::jni::auto_set_device(env); @@ -1257,7 +1225,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_readArrowIPCFileBegin(JNIEnv *e } JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_readArrowIPCBufferBegin(JNIEnv *env, jclass, - jobject provider) { + jobject provider) { JNI_NULL_CHECK(env, provider, "null provider", 0); try { cudf::jni::auto_set_device(env); @@ -1272,10 +1240,9 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_readArrowIPCBufferBegin(JNIEnv CATCH_STD(env, 0) } -JNIEXPORT jlong JNICALL -Java_ai_rapids_cudf_Table_readArrowIPCChunkToArrowTable(JNIEnv *env, jclass, - jlong j_state, - jint row_target) { +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readArrowIPCChunkToArrowTable(JNIEnv *env, jclass, + jlong j_state, + jint row_target) { JNI_NULL_CHECK(env, j_state, "null state", 0); cudf::jni::native_arrow_ipc_reader_handle *state = @@ -1285,7 +1252,8 @@ 
Java_ai_rapids_cudf_Table_readArrowIPCChunkToArrowTable(JNIEnv *env, jclass, cudf::jni::auto_set_device(env); // This is a little odd because we have to return a pointer // and arrow wants to deal with shared pointers for everything. - std::unique_ptr> result(new std::shared_ptr(nullptr)); + std::unique_ptr> result( + new std::shared_ptr(nullptr)); *result = state->next(row_target); if (!result->get()) { return 0; @@ -1307,8 +1275,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_closeArrowTable(JNIEnv *env, jc CATCH_STD(env, ) } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertArrowTableToCudf(JNIEnv *env, jclass, - jlong arrow_table_handle) { +JNIEXPORT jlongArray JNICALL +Java_ai_rapids_cudf_Table_convertArrowTableToCudf(JNIEnv *env, jclass, jlong arrow_table_handle) { JNI_NULL_CHECK(env, arrow_table_handle, "null arrow handle", 0); std::shared_ptr *handle = @@ -1323,7 +1291,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertArrowTableToCudf(J } JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_readArrowIPCEnd(JNIEnv *env, jclass, - jlong j_state) { + jlong j_state) { JNI_NULL_CHECK(env, j_state, "null state", ); cudf::jni::native_arrow_ipc_reader_handle *state = @@ -1336,12 +1304,9 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_readArrowIPCEnd(JNIEnv *env, jc CATCH_STD(env, ) } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftJoin(JNIEnv *env, jclass clazz, - jlong left_table, - jintArray left_col_join_indices, - jlong right_table, - jintArray right_col_join_indices, - jboolean compare_nulls_equal) { +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftJoin( + JNIEnv *env, jclass clazz, jlong left_table, jintArray left_col_join_indices, jlong right_table, + jintArray right_col_join_indices, jboolean compare_nulls_equal) { JNI_NULL_CHECK(env, left_table, "left_table is null", NULL); JNI_NULL_CHECK(env, left_col_join_indices, "left_col_join_indices is null", NULL); JNI_NULL_CHECK(env, right_table, "right_table is null", NULL); @@ -1367,19 +1332,17 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftJoin(JNIEnv *env, jcl std::unique_ptr result = cudf::left_join(*n_left_table, *n_right_table, left_join_cols, right_join_cols, dedupe, - static_cast(compare_nulls_equal)? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL); + static_cast(compare_nulls_equal) ? cudf::null_equality::EQUAL : + cudf::null_equality::UNEQUAL); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, NULL); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerJoin(JNIEnv *env, jclass clazz, - jlong left_table, - jintArray left_col_join_indices, - jlong right_table, - jintArray right_col_join_indices, - jboolean compare_nulls_equal) { +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerJoin( + JNIEnv *env, jclass clazz, jlong left_table, jintArray left_col_join_indices, jlong right_table, + jintArray right_col_join_indices, jboolean compare_nulls_equal) { JNI_NULL_CHECK(env, left_table, "left_table is null", NULL); JNI_NULL_CHECK(env, left_col_join_indices, "left_col_join_indices is null", NULL); JNI_NULL_CHECK(env, right_table, "right_table is null", NULL); @@ -1405,19 +1368,17 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerJoin(JNIEnv *env, jc std::unique_ptr result = cudf::inner_join(*n_left_table, *n_right_table, left_join_cols, right_join_cols, dedupe, - static_cast(compare_nulls_equal)? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL); + static_cast(compare_nulls_equal) ? 
cudf::null_equality::EQUAL : + cudf::null_equality::UNEQUAL); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, NULL); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullJoin(JNIEnv *env, jclass clazz, - jlong left_table, - jintArray left_col_join_indices, - jlong right_table, - jintArray right_col_join_indices, - jboolean compare_nulls_equal) { +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullJoin( + JNIEnv *env, jclass clazz, jlong left_table, jintArray left_col_join_indices, jlong right_table, + jintArray right_col_join_indices, jboolean compare_nulls_equal) { JNI_NULL_CHECK(env, left_table, "left_table is null", NULL); JNI_NULL_CHECK(env, left_col_join_indices, "left_col_join_indices is null", NULL); JNI_NULL_CHECK(env, right_table, "right_table is null", NULL); @@ -1443,7 +1404,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullJoin(JNIEnv *env, jcl std::unique_ptr result = cudf::full_join(*n_left_table, *n_right_table, left_join_cols, right_join_cols, dedupe, - static_cast(compare_nulls_equal)? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL); + static_cast(compare_nulls_equal) ? cudf::null_equality::EQUAL : + cudf::null_equality::UNEQUAL); return cudf::jni::convert_table_for_return(env, result); } @@ -1475,7 +1437,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftSemiJoin( std::unique_ptr result = cudf::left_semi_join( *n_left_table, *n_right_table, left_join_cols, right_join_cols, return_cols, - static_cast(compare_nulls_equal)? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL); + static_cast(compare_nulls_equal) ? cudf::null_equality::EQUAL : + cudf::null_equality::UNEQUAL); return cudf::jni::convert_table_for_return(env, result); } @@ -1507,7 +1470,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftAntiJoin( std::unique_ptr result = cudf::left_anti_join( *n_left_table, *n_right_table, left_join_cols, right_join_cols, return_cols, - static_cast(compare_nulls_equal)? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL); + static_cast(compare_nulls_equal) ? 
cudf::null_equality::EQUAL : + cudf::null_equality::UNEQUAL); return cudf::jni::convert_table_for_return(env, result); } @@ -1525,8 +1489,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_crossJoin(JNIEnv *env, jc cudf::table_view *n_left_table = reinterpret_cast(left_table); cudf::table_view *n_right_table = reinterpret_cast(right_table); - std::unique_ptr result = - cudf::cross_join(*n_left_table, *n_right_table); + std::unique_ptr result = cudf::cross_join(*n_left_table, *n_right_table); return cudf::jni::convert_table_for_return(env, result); } @@ -1702,10 +1665,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_filter(JNIEnv *env, jclas CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_gather(JNIEnv *env, jclass, - jlong j_input, - jlong j_map, - jboolean check_bounds) { +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_gather(JNIEnv *env, jclass, jlong j_input, + jlong j_map, jboolean check_bounds) { JNI_NULL_CHECK(env, j_input, "input table is null", 0); JNI_NULL_CHECK(env, j_map, "map column is null", 0); try { @@ -1718,8 +1679,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_gather(JNIEnv *env, jclas CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows( - JNIEnv *env, jclass clazz, jlong input_table) { +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env, jclass clazz, + jlong input_table) { JNI_NULL_CHECK(env, input_table, "input table is null", 0); try { @@ -1736,8 +1697,10 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows( CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows( - JNIEnv *env, jclass clazz, jlong input_column, jintArray types, jintArray scale) { +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *env, jclass clazz, + jlong input_column, + jintArray types, + jintArray scale) { JNI_NULL_CHECK(env, input_column, "input column is null", 0); JNI_NULL_CHECK(env, types, "types is null", 0); @@ -1851,10 +1814,8 @@ JNIEXPORT jobjectArray JNICALL Java_ai_rapids_cudf_Table_contiguousSplit(JNIEnv } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_rollingWindowAggregate( - JNIEnv *env, jclass clazz, jlong j_input_table, jintArray j_keys, - jlongArray j_default_output, - jintArray j_aggregate_column_indices, jlongArray j_agg_instances, - jintArray j_min_periods, + JNIEnv *env, jclass clazz, jlong j_input_table, jintArray j_keys, jlongArray j_default_output, + jintArray j_aggregate_column_indices, jlongArray j_agg_instances, jintArray j_min_periods, jintArray j_preceding, jintArray j_following, jboolean ignore_null_keys) { JNI_NULL_CHECK(env, j_input_table, "input table is null", NULL); @@ -1893,13 +1854,11 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_rollingWindowAggregate( int agg_column_index = values[i]; if (default_output[i] != nullptr) { result_columns.emplace_back(std::move(cudf::grouped_rolling_window( - groupby_keys, input_table->column(agg_column_index), *default_output[i], - preceding[i], following[i], - min_periods[i], agg_instances[i]->clone()))); + groupby_keys, input_table->column(agg_column_index), *default_output[i], preceding[i], + following[i], min_periods[i], agg_instances[i]->clone()))); } else { result_columns.emplace_back(std::move(cudf::grouped_rolling_window( - groupby_keys, input_table->column(agg_column_index), - preceding[i], following[i], + groupby_keys, input_table->column(agg_column_index), preceding[i], 
following[i], min_periods[i], agg_instances[i]->clone()))); } } diff --git a/java/src/main/native/src/map_lookup.cu b/java/src/main/native/src/map_lookup.cu index 03ffddecf1f..a3e25ce8905 100644 --- a/java/src/main/native/src/map_lookup.cu +++ b/java/src/main/native/src/map_lookup.cu @@ -27,25 +27,25 @@ #include #include +#include "rmm/cuda_stream_view.hpp" + namespace cudf { namespace { /** * @brief Device function that searches for the specified lookup_key - * in the list at index `row_index`, and writes out the index of the + * in the list at index `row_index`, and writes out the index of the * first match to the output. - * + * * This function is called once per row of the `input` column * If the lookup_key is not found, (-1) is returned for that list row. */ template -void __device__ search_each_list(size_type row_index, - column_device_view input, +void __device__ search_each_list(size_type row_index, column_device_view input, mutable_column_device_view output, - string_scalar_device_view lookup_key) -{ - if (has_nulls && input.is_null(row_index)) { // List row is null. - output.element(row_index) = -1; // Not found. + string_scalar_device_view lookup_key) { + if (has_nulls && input.is_null(row_index)) { // List row is null. + output.element(row_index) = -1; // Not found. return; } @@ -68,7 +68,7 @@ void __device__ search_each_list(size_type row_index, } } - output.element(row_index) = -1; // Not found. + output.element(row_index) = -1; // Not found. } /** @@ -76,17 +76,16 @@ void __device__ search_each_list(size_type row_index, * string in each list row of the `input` column. * * The kernel writes the index (into the `input` list-column's child) where the `lookup_key` - * is found, to the `output` column. If the `lookup_key` is not found, (-1) is written instead. + * is found, to the `output` column. If the `lookup_key` is not found, (-1) is written instead. * * The produces one output row per input, with no nulls. The output may then be used * with `cudf::gather()`, to find the values corresponding to the `lookup_key`. */ template -__launch_bounds__(block_size) __global__ void gpu_find_first(column_device_view input, - mutable_column_device_view output, - string_scalar_device_view lookup_key) -{ - size_type tid = blockIdx.x * block_size + threadIdx.x; +__launch_bounds__(block_size) __global__ + void gpu_find_first(column_device_view input, mutable_column_device_view output, + string_scalar_device_view lookup_key) { + size_type tid = blockIdx.x * block_size + threadIdx.x; size_type stride = block_size * gridDim.x; // Each CUDA thread processes one row of `input`. Each row is a list. @@ -106,37 +105,32 @@ __launch_bounds__(block_size) __global__ void gpu_find_first(column_device_view * for each row. 
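As a concrete illustration of the gather-map scheme described above (the data is made up for illustration and does not come from the patch): for a map column with two list rows [("a","1"), ("b","2")] and [("c","3")] and lookup_key "b", gpu_find_first writes the child indices [1, -1]. Gathering the values child (structs_column.child(1), whose flattened entries are "1", "2", "3") with that map under out_of_bounds_policy::IGNORE then yields ["2", null], matching the documented behaviour that rows where the key is not found come back null.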
*/ template -std::unique_ptr get_gather_map_for_map_values(column_view const& input, - string_scalar& lookup_key, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) -{ +std::unique_ptr +get_gather_map_for_map_values(column_view const &input, string_scalar &lookup_key, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { constexpr size_type block_size{256}; cudf::detail::grid_1d grid{input.size(), block_size}; auto input_device_view = cudf::column_device_view::create(input, stream); auto lookup_key_device_view{get_scalar_device_view(lookup_key)}; - auto gather_map = make_numeric_column( - data_type{cudf::type_to_id()}, input.size(), mask_state::ALL_VALID, stream, mr); + auto gather_map = make_numeric_column(data_type{cudf::type_to_id()}, input.size(), + mask_state::ALL_VALID, stream, mr); auto output_view = mutable_column_device_view::create(gather_map->mutable_view(), stream); - gpu_find_first<<>>( - *input_device_view, *output_view, lookup_key_device_view); + gpu_find_first<<>>( + *input_device_view, *output_view, lookup_key_device_view); CHECK_CUDA(stream); return gather_map; } -} // namespace +} // namespace namespace jni { -std::unique_ptr map_lookup(column_view const& map_column, - string_scalar lookup_key, - bool has_nulls, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) -{ +std::unique_ptr map_lookup(column_view const &map_column, string_scalar lookup_key, + bool has_nulls, rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { // Defensive checks. CUDF_EXPECTS(map_column.type().id() == type_id::LIST, "Expected LIST>."); @@ -155,23 +149,20 @@ std::unique_ptr map_lookup(column_view const& map_column, // Two-pass plan: construct gather map, and then gather() on structs_column.child(1). Plan A. // (Can do in one pass perhaps, but that's Plan B.) - auto gather_map = has_nulls? - get_gather_map_for_map_values(map_column, lookup_key, mr, stream) - : get_gather_map_for_map_values(map_column, lookup_key, mr, stream); + auto gather_map = has_nulls ? + get_gather_map_for_map_values(map_column, lookup_key, stream, mr) : + get_gather_map_for_map_values(map_column, lookup_key, stream, mr); // Gather map is now available. - auto values_column = structs_column.child(1); + auto values_column = structs_column.child(1); auto table_for_gather = table_view{std::vector{values_column}}; - auto gathered_table = cudf::detail::gather(table_for_gather, - gather_map->view(), - detail::out_of_bounds_policy::IGNORE, - detail::negative_index_policy::NOT_ALLOWED, - mr, - stream); + auto gathered_table = cudf::detail::gather( + table_for_gather, gather_map->view(), detail::out_of_bounds_policy::IGNORE, + detail::negative_index_policy::NOT_ALLOWED, stream, mr); return std::make_unique(std::move(gathered_table->get_column(0))); } -} // namespace jni; -} // namespace cudf; \ No newline at end of file +} // namespace jni +} // namespace cudf diff --git a/java/src/main/native/src/map_lookup.hpp b/java/src/main/native/src/map_lookup.hpp index c0380fe3306..6d54bfa371d 100644 --- a/java/src/main/native/src/map_lookup.hpp +++ b/java/src/main/native/src/map_lookup.hpp @@ -17,41 +17,40 @@ #pragma once #include +#include namespace cudf { namespace jni { - /** - * @brief Looks up a "map" column by specified key, and returns a column of string values. - * - * The map-column is represented as follows: - * - * list_view >. - * <---KEY---> <--VALUE--> - * - * The string_view struct members are the key and value, respectively. 
- * For each row in the input list column, the value corresponding to the first match - * of the specified lookup_key is returned. If the key is not found, a null is returned. - * - * @param map_column The input "map" column to be searched. Must be of - * type list_view>. - * @param lookup_key The search key, whose value is to be returned for each list row - * @param has_nulls Whether the input column might contain null list-rows, or null keys. - * @param mr The device memory resource to be used for allocations - * @param stream The CUDA stream - * @return A string_view column with the value from the first match in each list. - * A null row is returned for any row where the lookup_key is not found. - * @throw cudf::logic_error If the input column is not of type - * list_view> - */ - std::unique_ptr map_lookup( - column_view const& map_column, - string_scalar lookup_key, - bool has_nulls = true, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); +/** + * @brief Looks up a "map" column by specified key, and returns a column of string values. + * + * The map-column is represented as follows: + * + * list_view >. + * <---KEY---> <--VALUE--> + * + * The string_view struct members are the key and value, respectively. + * For each row in the input list column, the value corresponding to the first match + * of the specified lookup_key is returned. If the key is not found, a null is returned. + * + * @param map_column The input "map" column to be searched. Must be of + * type list_view>. + * @param lookup_key The search key, whose value is to be returned for each list row + * @param has_nulls Whether the input column might contain null list-rows, or null keys. + * @param stream The CUDA stream + * @param mr The device memory resource to be used for allocations + * @return A string_view column with the value from the first match in each list. + * A null row is returned for any row where the lookup_key is not found. + * @throw cudf::logic_error If the input column is not of type + * list_view> + */ +std::unique_ptr +map_lookup(column_view const &map_column, string_scalar lookup_key, bool has_nulls = true, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); -} // namespace jni; +} // namespace jni -} // namespace cudf; \ No newline at end of file +} // namespace cudf diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 4448bc14044..a10ba9a2700 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -17,14 +17,14 @@ #include #include -#include #include #include #include +#include #include #include #include - +#include #include #include "row_conversion.hpp" @@ -37,299 +37,271 @@ namespace java { * the data on the same stream as is used to copy it. 
*/ template -std::unique_ptr> copy_to_dev_async( - const std::vector & input, - cudaStream_t stream, - rmm::mr::device_memory_resource* mr) { - std::unique_ptr> ret(new rmm::device_uvector( - input.size(), stream, mr)); - CUDA_TRY(cudaMemcpyAsync(ret->data(), - input.data(), - sizeof(T) * input.size(), - cudaMemcpyHostToDevice, - stream)); - return ret; +std::unique_ptr> copy_to_dev_async(const std::vector &input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { + std::unique_ptr> ret(new rmm::device_uvector(input.size(), stream, mr)); + CUDA_TRY(cudaMemcpyAsync(ret->data(), input.data(), sizeof(T) * input.size(), + cudaMemcpyHostToDevice, stream.value())); + return ret; } -__global__ -void copy_to_fixed_width_columns( - const cudf::size_type num_rows, - const cudf::size_type num_columns, - const cudf::size_type row_size, - const cudf::size_type* input_offset_in_row, - const cudf::size_type* num_bytes, - int8_t ** output_data, - cudf::bitmask_type ** output_nm, - const int8_t * input_data) { - - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // For simplicity we will refer to this as a row_group - - // In practice we have found writing more than 4 columns of data per thread - // results in performance loss. As such we are using a 2 dimensional - // kernel in terms of threads, but not in terms of blocks. Columns are - // controlled by the y dimension (there is no y dimension in blocks). Rows - // are controlled by the x dimension (there are multiple blocks in the x - // dimension). - - cudf::size_type rows_per_group = blockDim.x; - cudf::size_type row_group_start = blockIdx.x; - cudf::size_type row_group_stride = gridDim.x; - cudf::size_type row_group_end = (num_rows + rows_per_group - 1)/rows_per_group + 1; - - extern __shared__ int8_t shared_data[]; - - // Because we are copying fixed width only data and we stride the rows - // this thread will always start copying from shared data in the same place - int8_t * row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t * row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - - for (cudf::size_type row_group_index = row_group_start; - row_group_index < row_group_end; - row_group_index += row_group_stride) { - // Step 1: Copy the data into shared memory - // We know row_size is always aligned with and a multiple of int64_t; - int64_t * long_shared = reinterpret_cast(shared_data); - const int64_t * long_input = reinterpret_cast(input_data); - - cudf::size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); - cudf::size_type shared_output_stride = blockDim.x * blockDim.y; - cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); - if (row_index_end > num_rows) { - row_index_end = num_rows; - } - cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - cudf::size_type shared_length = row_size * num_rows_in_group; +__global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, + const cudf::size_type num_columns, + const cudf::size_type row_size, + const cudf::size_type *input_offset_in_row, + const cudf::size_type *num_bytes, int8_t **output_data, + cudf::bitmask_type **output_nm, + const int8_t *input_data) { + + // We are going to copy the data in two passes. 
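A hedged usage sketch of the copy_to_dev_async helper above; the offsets vector and its contents are illustrative, and stream / mr are assumed to be an rmm::cuda_stream_view and a device memory resource already in scope:

std::vector<cudf::size_type> offsets{0, 16, 32};
// Returns a std::unique_ptr<rmm::device_uvector<cudf::size_type>> holding the device copy.
auto d_offsets = copy_to_dev_async(offsets, stream, mr);
// The copy is asynchronous: per the note above, consumers of d_offsets->data() must run on
// (or synchronize with) the same stream.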
+ // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // For simplicity we will refer to this as a row_group + + // In practice we have found writing more than 4 columns of data per thread + // results in performance loss. As such we are using a 2 dimensional + // kernel in terms of threads, but not in terms of blocks. Columns are + // controlled by the y dimension (there is no y dimension in blocks). Rows + // are controlled by the x dimension (there are multiple blocks in the x + // dimension). + + cudf::size_type rows_per_group = blockDim.x; + cudf::size_type row_group_start = blockIdx.x; + cudf::size_type row_group_stride = gridDim.x; + cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + + extern __shared__ int8_t shared_data[]; + + // Because we are copying fixed width only data and we stride the rows + // this thread will always start copying from shared data in the same place + int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t *row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + + for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; + row_group_index += row_group_stride) { + // Step 1: Copy the data into shared memory + // We know row_size is always aligned with and a multiple of int64_t; + int64_t *long_shared = reinterpret_cast(shared_data); + const int64_t *long_input = reinterpret_cast(input_data); + + cudf::size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); + cudf::size_type shared_output_stride = blockDim.x * blockDim.y; + cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); + if (row_index_end > num_rows) { + row_index_end = num_rows; + } + cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + cudf::size_type shared_length = row_size * num_rows_in_group; - cudf::size_type shared_output_end = shared_length/sizeof(int64_t); + cudf::size_type shared_output_end = shared_length / sizeof(int64_t); - cudf::size_type start_input_index = (row_size * row_group_index * rows_per_group)/sizeof(int64_t); + cudf::size_type start_input_index = + (row_size * row_group_index * rows_per_group) / sizeof(int64_t); - for (cudf::size_type shared_index = shared_output_index; - shared_index < shared_output_end; - shared_index += shared_output_stride) { - long_shared[shared_index] = long_input[start_input_index + shared_index]; - } - // Wait for all of the data to be in shared memory - __syncthreads(); - - // Step 2 copy the data back out - - // Within the row group there should be 1 thread for each row. This is a - // requirement for launching the kernel - cudf::size_type row_index = (row_group_index * rows_per_group) + threadIdx.x; - // But we might not use all of the threads if the number of rows does not go - // evenly into the thread count. We don't want those threads to exit yet - // because we may need them to copy data in for the next row group. 
- uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); - if (row_index < num_rows) { - cudf::size_type col_index_start = threadIdx.y; - cudf::size_type col_index_stride = blockDim.y; - for (cudf::size_type col_index = col_index_start; - col_index < num_columns; - col_index += col_index_stride) { - - cudf::size_type col_size = num_bytes[col_index]; - const int8_t * col_tmp = &(row_tmp[input_offset_in_row[col_index]]); - int8_t * col_output = output_data[col_index]; - switch(col_size) { - case 1: - { - col_output[row_index] = *col_tmp; - break; - } - case 2: - { - int16_t * short_col_output = reinterpret_cast(col_output); - short_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - case 4: - { - int32_t * int_col_output = reinterpret_cast(col_output); - int_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - case 8: - { - int64_t * long_col_output = reinterpret_cast(col_output); - long_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - default: - { - cudf::size_type output_offset = col_size * row_index; - // TODO this should just not be supported for fixed width columns, but just in case... - for (cudf::size_type b = 0; b < col_size; b++) { - col_output[b + output_offset] = col_tmp[b]; - } - break; - } - } - - cudf::bitmask_type * nm = output_nm[col_index]; - int8_t * valid_byte = &row_vld_tmp[col_index/8]; - cudf::size_type byte_bit_offset = col_index % 8; - int predicate = *valid_byte & (1 << byte_bit_offset); - uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { - nm[word_index(row_index)] = bitmask; - } - } // end column loop - } // end row copy - // wait for the row_group to be totally copied before starting on the next row group - __syncthreads(); + for (cudf::size_type shared_index = shared_output_index; shared_index < shared_output_end; + shared_index += shared_output_stride) { + long_shared[shared_index] = long_input[start_input_index + shared_index]; } -} + // Wait for all of the data to be in shared memory + __syncthreads(); + + // Step 2 copy the data back out + + // Within the row group there should be 1 thread for each row. This is a + // requirement for launching the kernel + cudf::size_type row_index = (row_group_index * rows_per_group) + threadIdx.x; + // But we might not use all of the threads if the number of rows does not go + // evenly into the thread count. We don't want those threads to exit yet + // because we may need them to copy data in for the next row group. 
+ uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); + if (row_index < num_rows) { + cudf::size_type col_index_start = threadIdx.y; + cudf::size_type col_index_stride = blockDim.y; + for (cudf::size_type col_index = col_index_start; col_index < num_columns; + col_index += col_index_stride) { + + cudf::size_type col_size = num_bytes[col_index]; + const int8_t *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); + int8_t *col_output = output_data[col_index]; + switch (col_size) { + case 1: { + col_output[row_index] = *col_tmp; + break; + } + case 2: { + int16_t *short_col_output = reinterpret_cast(col_output); + short_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + case 4: { + int32_t *int_col_output = reinterpret_cast(col_output); + int_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + case 8: { + int64_t *long_col_output = reinterpret_cast(col_output); + long_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + default: { + cudf::size_type output_offset = col_size * row_index; + // TODO this should just not be supported for fixed width columns, but just in case... + for (cudf::size_type b = 0; b < col_size; b++) { + col_output[b + output_offset] = col_tmp[b]; + } + break; + } + } + cudf::bitmask_type *nm = output_nm[col_index]; + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; + int predicate = *valid_byte & (1 << byte_bit_offset); + uint32_t bitmask = __ballot_sync(active_mask, predicate); + if (row_index % 32 == 0) { + nm[word_index(row_index)] = bitmask; + } + } // end column loop + } // end row copy + // wait for the row_group to be totally copied before starting on the next row group + __syncthreads(); + } +} -__global__ -void copy_from_fixed_width_columns( - const cudf::size_type start_row, - const cudf::size_type num_rows, - const cudf::size_type num_columns, - const cudf::size_type row_size, - const cudf::size_type* output_offset_in_row, - const cudf::size_type* num_bytes, - const int8_t ** input_data, - const cudf::bitmask_type ** input_nm, - int8_t * output_data) { - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // We do not support copying a subset of the columns in a row yet, so we don't - // currently support a row that is wider than shared memory. - // For simplicity we will refer to this as a row_group - - // In practice we have found reading more than 4 columns of data per thread - // results in performance loss. As such we are using a 2 dimensional - // kernel in terms of threads, but not in terms of blocks. Columns are - // controlled by the y dimension (there is no y dimension in blocks). Rows - // are controlled by the x dimension (there are multiple blocks in the x - // dimension). 
- - cudf::size_type rows_per_group = blockDim.x; - cudf::size_type row_group_start = blockIdx.x; - cudf::size_type row_group_stride = gridDim.x; - cudf::size_type row_group_end = (num_rows + rows_per_group - 1)/rows_per_group + 1; - - extern __shared__ int8_t shared_data[]; - - - // Because we are copying fixed width only data and we stride the rows - // this thread will always start copying to shared data in the same place - int8_t * row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t * row_vld_tmp = &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - - for (cudf::size_type row_group_index = row_group_start; - row_group_index < row_group_end; - row_group_index += row_group_stride) { - - // Within the row group there should be 1 thread for each row. This is a - // requirement for launching the kernel - cudf::size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; - // But we might not use all of the threads if the number of rows does not go - // evenly into the thread count. We don't want those threads to exit yet - // because we may need them to copy data back out. - if (row_index < (start_row + num_rows)) { - cudf::size_type col_index_start = threadIdx.y; - cudf::size_type col_index_stride = blockDim.y; - for (cudf::size_type col_index = col_index_start; - col_index < num_columns; - col_index += col_index_stride) { - - cudf::size_type col_size = num_bytes[col_index]; - int8_t * col_tmp = &(row_tmp[output_offset_in_row[col_index]]); - const int8_t * col_input = input_data[col_index]; - switch(col_size) { - case 1: - { - *col_tmp = col_input[row_index]; - break; - } - case 2: - { - const int16_t * short_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = short_col_input[row_index]; - break; - } - case 4: - { - const int32_t * int_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = int_col_input[row_index]; - break; - } - case 8: - { - const int64_t * long_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = long_col_input[row_index]; - break; - } - default: - { - cudf::size_type input_offset = col_size * row_index; - // TODO this should just not be supported for fixed width columns, but just in case... 
- for (cudf::size_type b = 0; b < col_size; b++) { - col_tmp[b] = col_input[b + input_offset]; - } - break; - } - } - // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned - // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t * valid_byte = &row_vld_tmp[col_index/8]; - cudf::size_type byte_bit_offset = col_index % 8; - uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; - int32_t * valid_int = reinterpret_cast(valid_byte - fixup_bytes); - cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); - // Now copy validity for the column - if (input_nm[col_index]) { - if (bit_is_set(input_nm[col_index], row_index)) { - atomicOr_block(valid_int, 1 << int_bit_offset); - } else { - atomicAnd_block(valid_int, ~(1 << int_bit_offset)); - } - } else { - // It is valid so just set the bit - atomicOr_block(valid_int, 1 << int_bit_offset); - } - } // end column loop - } // end row copy - // wait for the row_group to be totally copied into shared memory - __syncthreads(); - - // Step 2: Copy the data back out - // We know row_size is always aligned with and a multiple of int64_t; - int64_t * long_shared = reinterpret_cast(shared_data); - int64_t * long_output = reinterpret_cast(output_data); - - cudf::size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); - cudf::size_type shared_input_stride = blockDim.x * blockDim.y; - cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); - if (row_index_end > num_rows) { - row_index_end = num_rows; +__global__ void +copy_from_fixed_width_columns(const cudf::size_type start_row, const cudf::size_type num_rows, + const cudf::size_type num_columns, const cudf::size_type row_size, + const cudf::size_type *output_offset_in_row, + const cudf::size_type *num_bytes, const int8_t **input_data, + const cudf::bitmask_type **input_nm, int8_t *output_data) { + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // We do not support copying a subset of the columns in a row yet, so we don't + // currently support a row that is wider than shared memory. + // For simplicity we will refer to this as a row_group + + // In practice we have found reading more than 4 columns of data per thread + // results in performance loss. As such we are using a 2 dimensional + // kernel in terms of threads, but not in terms of blocks. Columns are + // controlled by the y dimension (there is no y dimension in blocks). Rows + // are controlled by the x dimension (there are multiple blocks in the x + // dimension). 
+ + cudf::size_type rows_per_group = blockDim.x; + cudf::size_type row_group_start = blockIdx.x; + cudf::size_type row_group_stride = gridDim.x; + cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + + extern __shared__ int8_t shared_data[]; + + // Because we are copying fixed width only data and we stride the rows + // this thread will always start copying to shared data in the same place + int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t *row_vld_tmp = + &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + + for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; + row_group_index += row_group_stride) { + + // Within the row group there should be 1 thread for each row. This is a + // requirement for launching the kernel + cudf::size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; + // But we might not use all of the threads if the number of rows does not go + // evenly into the thread count. We don't want those threads to exit yet + // because we may need them to copy data back out. + if (row_index < (start_row + num_rows)) { + cudf::size_type col_index_start = threadIdx.y; + cudf::size_type col_index_stride = blockDim.y; + for (cudf::size_type col_index = col_index_start; col_index < num_columns; + col_index += col_index_stride) { + + cudf::size_type col_size = num_bytes[col_index]; + int8_t *col_tmp = &(row_tmp[output_offset_in_row[col_index]]); + const int8_t *col_input = input_data[col_index]; + switch (col_size) { + case 1: { + *col_tmp = col_input[row_index]; + break; + } + case 2: { + const int16_t *short_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = short_col_input[row_index]; + break; + } + case 4: { + const int32_t *int_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = int_col_input[row_index]; + break; + } + case 8: { + const int64_t *long_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = long_col_input[row_index]; + break; + } + default: { + cudf::size_type input_offset = col_size * row_index; + // TODO this should just not be supported for fixed width columns, but just in case... 
+ for (cudf::size_type b = 0; b < col_size; b++) { + col_tmp[b] = col_input[b + input_offset]; + } + break; + } + } + // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned + // so we have to rewrite the addresses to make sure that it is 4 byte aligned + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; + uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; + int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); + cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); + // Now copy validity for the column + if (input_nm[col_index]) { + if (bit_is_set(input_nm[col_index], row_index)) { + atomicOr_block(valid_int, 1 << int_bit_offset); + } else { + atomicAnd_block(valid_int, ~(1 << int_bit_offset)); + } + } else { + // It is valid so just set the bit + atomicOr_block(valid_int, 1 << int_bit_offset); } - cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - cudf::size_type shared_length = row_size * num_rows_in_group; + } // end column loop + } // end row copy + // wait for the row_group to be totally copied into shared memory + __syncthreads(); + + // Step 2: Copy the data back out + // We know row_size is always aligned with and a multiple of int64_t; + int64_t *long_shared = reinterpret_cast(shared_data); + int64_t *long_output = reinterpret_cast(output_data); + + cudf::size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); + cudf::size_type shared_input_stride = blockDim.x * blockDim.y; + cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); + if (row_index_end > num_rows) { + row_index_end = num_rows; + } + cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + cudf::size_type shared_length = row_size * num_rows_in_group; - cudf::size_type shared_input_end = shared_length/sizeof(int64_t); + cudf::size_type shared_input_end = shared_length / sizeof(int64_t); - cudf::size_type start_output_index = (row_size * row_group_index * rows_per_group)/sizeof(int64_t); + cudf::size_type start_output_index = + (row_size * row_group_index * rows_per_group) / sizeof(int64_t); - for (cudf::size_type shared_index = shared_input_index; - shared_index < shared_input_end; - shared_index += shared_input_stride) { - long_output[start_output_index + shared_index] = long_shared[shared_index]; - } - __syncthreads(); - // Go for the next round + for (cudf::size_type shared_index = shared_input_index; shared_index < shared_input_end; + shared_index += shared_input_stride) { + long_output[start_output_index + shared_index] = long_shared[shared_index]; } + __syncthreads(); + // Go for the next round + } } /** @@ -341,60 +313,58 @@ void copy_from_fixed_width_columns( * @param [out] threads the size of the threads for the kernel * @return the size in bytes of shared memory needed for each block. */ -static int calc_fixed_width_kernel_dims( - const cudf::size_type num_columns, - const cudf::size_type num_rows, - const cudf::size_type size_per_row, - dim3 & blocks, - dim3 & threads) { - - // We have found speed degrades when a thread handles more than 4 columns. - // Each block is 2 dimensional. The y dimension indicates the columns. - // We limit this to 32 threads in the y dimension so we can still - // have at least 32 threads in the x dimension (1 warp) which should - // result in better coalescing of memory operations. 
We also - // want to guarantee that we are processing a multiple of 32 threads - // in the x dimension because we use atomic operations at the block - // level when writing validity data out to main memory, and that would - // need to change if we split a word of validity data between blocks. - int y_block_size = (num_columns + 3) / 4; - if (y_block_size > 32) { - y_block_size = 32; - } - int x_possible_block_size = 1024/y_block_size; - // 48KB is the default setting for shared memory per block according to the cuda tutorials - // If someone configures the GPU to only have 16 KB this might not work. - int max_shared_size = 48 * 1024; - int max_block_size = max_shared_size/size_per_row; - // If we don't have enough shared memory there is no point in having more threads - // per block that will just sit idle - max_block_size = max_block_size > x_possible_block_size ? x_possible_block_size : max_block_size; - // Make sure that the x dimension is a multiple of 32 this not only helps - // coalesce memory access it also lets us do a ballot sync for validity to write - // the data back out the warp level. If x is a multiple of 32 then each thread in the y - // dimension is associated with one or more warps, that should correspond to the validity - // words directly. - int block_size = (max_block_size / 32) * 32; - CUDF_EXPECTS(block_size != 0, "Row size is too large to fit in shared memory"); - - int num_blocks = (num_rows + block_size - 1) / block_size; - if (num_blocks < 1) { - num_blocks = 1; - } else if (num_blocks > 10240) { - // The maximum number of blocks supported in the x dimension is 2 ^ 31 - 1 - // but in practice haveing too many can cause some overhead that I don't totally - // understand. Playing around with this haveing as little as 600 blocks appears - // to be able to saturate memory on V100, so this is an order of magnitude higher - // to try and future proof this a bit. - num_blocks = 10240; - } - blocks.x = num_blocks; - blocks.y = 1; - blocks.z = 1; - threads.x = block_size; - threads.y = y_block_size; - threads.z = 1; - return size_per_row * block_size; +static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, + const cudf::size_type num_rows, + const cudf::size_type size_per_row, dim3 &blocks, + dim3 &threads) { + + // We have found speed degrades when a thread handles more than 4 columns. + // Each block is 2 dimensional. The y dimension indicates the columns. + // We limit this to 32 threads in the y dimension so we can still + // have at least 32 threads in the x dimension (1 warp) which should + // result in better coalescing of memory operations. We also + // want to guarantee that we are processing a multiple of 32 threads + // in the x dimension because we use atomic operations at the block + // level when writing validity data out to main memory, and that would + // need to change if we split a word of validity data between blocks. + int y_block_size = (num_columns + 3) / 4; + if (y_block_size > 32) { + y_block_size = 32; + } + int x_possible_block_size = 1024 / y_block_size; + // 48KB is the default setting for shared memory per block according to the cuda tutorials + // If someone configures the GPU to only have 16 KB this might not work. + int max_shared_size = 48 * 1024; + int max_block_size = max_shared_size / size_per_row; + // If we don't have enough shared memory there is no point in having more threads + // per block that will just sit idle + max_block_size = max_block_size > x_possible_block_size ? 
x_possible_block_size : max_block_size; + // Make sure that the x dimension is a multiple of 32 this not only helps + // coalesce memory access it also lets us do a ballot sync for validity to write + // the data back out the warp level. If x is a multiple of 32 then each thread in the y + // dimension is associated with one or more warps, that should correspond to the validity + // words directly. + int block_size = (max_block_size / 32) * 32; + CUDF_EXPECTS(block_size != 0, "Row size is too large to fit in shared memory"); + + int num_blocks = (num_rows + block_size - 1) / block_size; + if (num_blocks < 1) { + num_blocks = 1; + } else if (num_blocks > 10240) { + // The maximum number of blocks supported in the x dimension is 2 ^ 31 - 1 + // but in practice haveing too many can cause some overhead that I don't totally + // understand. Playing around with this haveing as little as 600 blocks appears + // to be able to saturate memory on V100, so this is an order of magnitude higher + // to try and future proof this a bit. + num_blocks = 10240; + } + blocks.x = num_blocks; + blocks.y = 1; + blocks.z = 1; + threads.x = block_size; + threads.y = y_block_size; + threads.z = 1; + return size_per_row * block_size; } /** @@ -404,68 +374,53 @@ static int calc_fixed_width_kernel_dims( * into this function are common between runs and should be calculated once. */ static std::unique_ptr fixed_width_convert_to_rows( - const cudf::size_type start_row, - const cudf::size_type num_rows, - const cudf::size_type num_columns, - const cudf::size_type size_per_row, - std::unique_ptr> & column_start, - std::unique_ptr> & column_size, - std::unique_ptr> & input_data, - std::unique_ptr> & input_nm, - const cudf::scalar & zero, - const cudf::scalar & scalar_size_per_row, - cudaStream_t stream, - rmm::mr::device_memory_resource* mr - ) { - int64_t total_allocation = size_per_row * num_rows; - // We made a mistake in the split somehow - CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); - - // Allocate and set the offsets row for the byte array - std::unique_ptr offsets = cudf::detail::sequence( - num_rows + 1, zero, scalar_size_per_row); - - std::unique_ptr data = - cudf::make_numeric_column( - cudf::data_type(cudf::type_id::INT8), - static_cast(total_allocation)); - - dim3 blocks; - dim3 threads; - int shared_size = calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - - copy_from_fixed_width_columns<<>>( - start_row, - num_rows, - num_columns, - size_per_row, - column_start->data(), - column_size->data(), - input_data->data(), - input_nm->data(), - data->mutable_view().data()); - - return cudf::make_lists_column(num_rows, - std::move(offsets), - std::move(data), - 0, - rmm::device_buffer{0, 0, mr}); + const cudf::size_type start_row, const cudf::size_type num_rows, + const cudf::size_type num_columns, const cudf::size_type size_per_row, + std::unique_ptr> &column_start, + std::unique_ptr> &column_size, + std::unique_ptr> &input_data, + std::unique_ptr> &input_nm, + const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { + int64_t total_allocation = size_per_row * num_rows; + // We made a mistake in the split somehow + CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); + + // Allocate and set the offsets row for the byte array + std::unique_ptr offsets = + cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, 
stream); + + std::unique_ptr data = cudf::make_numeric_column( + cudf::data_type(cudf::type_id::INT8), static_cast(total_allocation), + cudf::mask_state::UNALLOCATED, stream, mr); + + dim3 blocks; + dim3 threads; + int shared_size = + calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + + copy_from_fixed_width_columns<<>>( + start_row, num_rows, num_columns, size_per_row, column_start->data(), column_size->data(), + input_data->data(), input_nm->data(), data->mutable_view().data()); + + return cudf::make_lists_column(num_rows, std::move(offsets), std::move(data), 0, + rmm::device_buffer{0, 0, mr}, stream, mr); } -static cudf::data_type get_data_type(const cudf::column_view & v) { - return v.type(); +static cudf::data_type get_data_type(const cudf::column_view &v) { + return v.type(); } -static bool is_fixed_width(const cudf::data_type & t) { - return cudf::is_fixed_width(t); +static bool is_fixed_width(const cudf::data_type &t) { + return cudf::is_fixed_width(t); } static inline int32_t align_offset(int32_t offset, std::size_t alignment) { - return (offset + alignment - 1) & ~(alignment - 1); + return (offset + alignment - 1) & ~(alignment - 1); } -static inline bool are_all_fixed_width(std::vector const & schema) { - return std::all_of(schema.begin(), schema.end(), cudf::java::is_fixed_width); +static inline bool are_all_fixed_width(std::vector const &schema) { + return std::all_of(schema.begin(), schema.end(), cudf::java::is_fixed_width); } /** @@ -475,168 +430,149 @@ static inline bool are_all_fixed_width(std::vector const & sche * @param [out] column_size the size in bytes of the data for each columns in the row. * @return the size in bytes each row needs. */ -static inline int32_t compute_fixed_width_layout( - std::vector const & schema, - std::vector & column_start, - std::vector & column_size) { - // We guarantee that the start of each column is 64-bit aligned so anything can go - // there, but to make the code simple we will still do an alignment for it. - int32_t at_offset = 0; - for (auto col = schema.begin(); col < schema.end(); col++) { - cudf::size_type s = cudf::size_of(*col); - column_size.emplace_back(s); - std::size_t allocation_needed = s; - std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types - at_offset = align_offset(at_offset, alignment_needed); - column_start.emplace_back(at_offset); - at_offset += allocation_needed; - } - - // Now we need to add in space for validity - // Eventually we can think about nullable vs not nullable, but for now we will just always add it in - int32_t validity_bytes_needed = (schema.size() + 7)/8; - // validity comes at the end and is byte aligned so we can pack more in. - at_offset += validity_bytes_needed; - // Now we need to pad the end so all rows are 64 bit aligned - return align_offset(at_offset, 8); // 8 bytes (64 bits) +static inline int32_t compute_fixed_width_layout(std::vector const &schema, + std::vector &column_start, + std::vector &column_size) { + // We guarantee that the start of each column is 64-bit aligned so anything can go + // there, but to make the code simple we will still do an alignment for it. 
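For intuition, here is a small standalone walk-through of the same layout rules (a hypothetical INT8/INT32/INT64 schema, not part of the patch): each column start is aligned to its own size, one validity bit per column is packed into bytes at the end, and the row is padded to 8 bytes.

#include <cstdio>
#include <vector>

// Same rounding rule as align_offset above.
static int align_to(int offset, int alignment) {
  return (offset + alignment - 1) & ~(alignment - 1);
}

int main() {
  // Hypothetical fixed-width schema: INT8, INT32, INT64.
  std::vector<int> sizes{1, 4, 8};
  int offset = 0;
  for (int s : sizes) {
    offset = align_to(offset, s);  // each column starts on its own alignment
    std::printf("column of %d bytes starts at offset %d\n", s, offset);
    offset += s;
  }
  offset += (static_cast<int>(sizes.size()) + 7) / 8;  // one validity bit per column, byte packed
  std::printf("row size (padded to 8 bytes) = %d\n", align_to(offset, 8));
  return 0;
}

With this three-column schema the packed row comes out at 24 bytes: 16 bytes of data, 1 validity byte, and 7 bytes of tail padding so that consecutive rows stay 64-bit aligned.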
+ int32_t at_offset = 0; + for (auto col = schema.begin(); col < schema.end(); col++) { + cudf::size_type s = cudf::size_of(*col); + column_size.emplace_back(s); + std::size_t allocation_needed = s; + std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types + at_offset = align_offset(at_offset, alignment_needed); + column_start.emplace_back(at_offset); + at_offset += allocation_needed; + } + + // Now we need to add in space for validity + // Eventually we can think about nullable vs not nullable, but for now we will just always add it + // in + int32_t validity_bytes_needed = (schema.size() + 7) / 8; + // validity comes at the end and is byte aligned so we can pack more in. + at_offset += validity_bytes_needed; + // Now we need to pad the end so all rows are 64 bit aligned + return align_offset(at_offset, 8); // 8 bytes (64 bits) } -std::vector> convert_to_rows( - cudf::table_view const& tbl, - cudaStream_t stream, - rmm::mr::device_memory_resource* mr) { +std::vector> convert_to_rows(cudf::table_view const &tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { - const cudf::size_type num_columns = tbl.num_columns(); + const cudf::size_type num_columns = tbl.num_columns(); - std::vector schema; - schema.resize(num_columns); - std::transform(tbl.begin(), tbl.end(), schema.begin(), cudf::java::get_data_type); + std::vector schema; + schema.resize(num_columns); + std::transform(tbl.begin(), tbl.end(), schema.begin(), cudf::java::get_data_type); - if (are_all_fixed_width(schema)) { - std::vector column_start; - std::vector column_size; + if (are_all_fixed_width(schema)) { + std::vector column_start; + std::vector column_size; - int32_t size_per_row = compute_fixed_width_layout(schema, column_start, column_size); - auto dev_column_start = copy_to_dev_async(column_start, stream, mr); - auto dev_column_size = copy_to_dev_async(column_size, stream, mr); + int32_t size_per_row = compute_fixed_width_layout(schema, column_start, column_size); + auto dev_column_start = copy_to_dev_async(column_start, stream, mr); + auto dev_column_size = copy_to_dev_async(column_size, stream, mr); - int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; - // Make the number of rows per batch a multiple of 32 so we don't have to worry about - // splitting validity at a specific row offset. This might change in the future. - max_rows_per_batch = (max_rows_per_batch/32) * 32; + int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; + // Make the number of rows per batch a multiple of 32 so we don't have to worry about + // splitting validity at a specific row offset. This might change in the future. 
+ max_rows_per_batch = (max_rows_per_batch / 32) * 32; - cudf::size_type num_rows = tbl.num_rows(); + cudf::size_type num_rows = tbl.num_rows(); - // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; - for (cudf::size_type column_number = 0; column_number < num_columns; column_number++) { - cudf::column_view cv = tbl.column(column_number); - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - auto dev_input_data = copy_to_dev_async(input_data, stream, mr); - auto dev_input_nm = copy_to_dev_async(input_nm, stream, mr); - - using ScalarType = cudf::scalar_type_t; - auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32)); - zero->set_valid(true); - static_cast(zero.get())->set_value(0); - - auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32)); - step->set_valid(true); - static_cast(step.get())->set_value(static_cast(size_per_row)); - - std::vector> ret; - for (cudf::size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { - cudf::size_type row_count = num_rows - row_start; - row_count = row_count > max_rows_per_batch ? max_rows_per_batch : row_count; - ret.emplace_back(fixed_width_convert_to_rows( - row_start, - row_count, - num_columns, - size_per_row, - dev_column_start, - dev_column_size, - dev_input_data, - dev_input_nm, - *zero, - *step, - stream, - mr)); - } - - return ret; - } else { - CUDF_FAIL("Only fixed width types are currently supported"); + // Get the pointers to the input columnar data ready + std::vector input_data; + std::vector input_nm; + for (cudf::size_type column_number = 0; column_number < num_columns; column_number++) { + cudf::column_view cv = tbl.column(column_number); + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + auto dev_input_data = copy_to_dev_async(input_data, stream, mr); + auto dev_input_nm = copy_to_dev_async(input_nm, stream, mr); + + using ScalarType = cudf::scalar_type_t; + auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); + zero->set_valid(true, stream); + static_cast(zero.get())->set_value(0, stream); + + auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); + step->set_valid(true, stream); + static_cast(step.get()) + ->set_value(static_cast(size_per_row), stream); + + std::vector> ret; + for (cudf::size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { + cudf::size_type row_count = num_rows - row_start; + row_count = row_count > max_rows_per_batch ? 
max_rows_per_batch : row_count; + ret.emplace_back(fixed_width_convert_to_rows( + row_start, row_count, num_columns, size_per_row, dev_column_start, dev_column_size, + dev_input_data, dev_input_nm, *zero, *step, stream, mr)); } -} -std::unique_ptr convert_from_rows( - cudf::lists_column_view const& input, - std::vector const& schema, - cudaStream_t stream, - rmm::mr::device_memory_resource* mr) { - - // verify that the types are what we expect - cudf::column_view child = input.child(); - cudf::type_id list_type = child.type().id(); - CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, - "Only a list of bytes is supported as input"); - - cudf::size_type num_columns = schema.size(); - - if (are_all_fixed_width(schema)) { - std::vector column_start; - std::vector column_size; - - cudf::size_type num_rows = input.parent().size(); - int32_t size_per_row = compute_fixed_width_layout(schema, column_start, column_size); - - // Ideally we would check that the offsets are all the same, etc. but for now - // this is probably fine - CUDF_EXPECTS(size_per_row * num_rows == child.size(), - "The layout of the data appears to be off"); - auto dev_column_start = copy_to_dev_async(column_start, stream, mr); - auto dev_column_size = copy_to_dev_async(column_size, stream, mr); - - // Allocate the columns we are going to write into - std::vector> output_columns; - std::vector output_data; - std::vector output_nm; - for (cudf::size_type i = 0; i < num_columns; i++) { - auto column = cudf::make_fixed_width_column(schema[i], - num_rows, - cudf::mask_state::UNINITIALIZED, - stream, - mr); - auto mut = column->mutable_view(); - output_data.emplace_back(mut.data()); - output_nm.emplace_back(mut.null_mask()); - output_columns.emplace_back(std::move(column)); - } + return ret; + } else { + CUDF_FAIL("Only fixed width types are currently supported"); + } +} - auto dev_output_data = copy_to_dev_async(output_data, stream, mr); - auto dev_output_nm = copy_to_dev_async(output_nm, stream, mr); - - dim3 blocks; - dim3 threads; - int shared_size = calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - - copy_to_fixed_width_columns<<>>( - num_rows, - num_columns, - size_per_row, - dev_column_start->data(), - dev_column_size->data(), - dev_output_data->data(), - dev_output_nm->data(), - child.data()); - - return std::make_unique(std::move(output_columns)); - } else { - CUDF_FAIL("Only fixed width types are currently supported"); +std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { + + // verify that the types are what we expect + cudf::column_view child = input.child(); + cudf::type_id list_type = child.type().id(); + CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + "Only a list of bytes is supported as input"); + + cudf::size_type num_columns = schema.size(); + + if (are_all_fixed_width(schema)) { + std::vector column_start; + std::vector column_size; + + cudf::size_type num_rows = input.parent().size(); + int32_t size_per_row = compute_fixed_width_layout(schema, column_start, column_size); + + // Ideally we would check that the offsets are all the same, etc. 
but for now + // this is probably fine + CUDF_EXPECTS(size_per_row * num_rows == child.size(), + "The layout of the data appears to be off"); + auto dev_column_start = copy_to_dev_async(column_start, stream, mr); + auto dev_column_size = copy_to_dev_async(column_size, stream, mr); + + // Allocate the columns we are going to write into + std::vector> output_columns; + std::vector output_data; + std::vector output_nm; + for (cudf::size_type i = 0; i < num_columns; i++) { + auto column = cudf::make_fixed_width_column(schema[i], num_rows, + cudf::mask_state::UNINITIALIZED, stream, mr); + auto mut = column->mutable_view(); + output_data.emplace_back(mut.data()); + output_nm.emplace_back(mut.null_mask()); + output_columns.emplace_back(std::move(column)); } + + auto dev_output_data = copy_to_dev_async(output_data, stream, mr); + auto dev_output_nm = copy_to_dev_async(output_nm, stream, mr); + + dim3 blocks; + dim3 threads; + int shared_size = + calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + + copy_to_fixed_width_columns<<>>( + num_rows, num_columns, size_per_row, dev_column_start->data(), dev_column_size->data(), + dev_output_data->data(), dev_output_nm->data(), child.data()); + + return std::make_unique(std::move(output_columns)); + } else { + CUDF_FAIL("Only fixed width types are currently supported"); + } } } // namespace java diff --git a/java/src/main/native/src/row_conversion.hpp b/java/src/main/native/src/row_conversion.hpp index e26eadb35ea..17abde8df19 100644 --- a/java/src/main/native/src/row_conversion.hpp +++ b/java/src/main/native/src/row_conversion.hpp @@ -20,22 +20,21 @@ #include #include +#include namespace cudf { namespace java { +std::vector> +convert_to_rows(cudf::table_view const &tbl, + // TODO need something for validity + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); -std::vector> convert_to_rows( - cudf::table_view const& tbl, - // TODO need something for validity - cudaStream_t stream = 0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr +convert_from_rows(cudf::lists_column_view const &input, std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); -std::unique_ptr convert_from_rows( - cudf::lists_column_view const& input, - std::vector const& schema, - cudaStream_t stream = 0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -} // java -} // cudf +} // namespace java +} // namespace cudf From 4c93c6212c8172a65ca58b47a477397d188ebe8d Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Wed, 11 Nov 2020 12:23:55 +1100 Subject: [PATCH 42/51] Add missing CONDA_INCLUDE_DIRS from benchmarks cmake --- cpp/benchmarks/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 667498fa965..592f1377c87 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -58,6 +58,10 @@ include_directories("${CMAKE_BINARY_DIR}/include" "${RMM_INCLUDE}" "${CMAKE_CURRENT_SOURCE_DIR}") +if(CONDA_INCLUDE_DIRS) + include_directories("${CONDA_INCLUDE_DIRS}") +endif(CONDA_INCLUDE_DIRS) + ################################################################################################### # - library paths --------------------------------------------------------------------------------- 
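The row_conversion.hpp change above follows the pattern used throughout this series: public entry points default the stream to rmm::cuda_stream_default, detail-level functions take an explicit rmm::cuda_stream_view, and raw CUDA launch parameters receive stream.value(). A condensed sketch of that pattern (the namespace, function, and kernel names here are illustrative only, not from the patch):

#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

namespace example {  // illustrative namespace, not part of cudf

__global__ void fill_kernel(int *data, int n, int value) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] = value;
}

namespace detail {
// Detail layer: the stream is explicit and has no default.
void fill(int *data, int n, int value, rmm::cuda_stream_view stream) {
  int threads = 256;
  int blocks  = (n + threads - 1) / threads;
  // Raw launch parameters still need the underlying cudaStream_t.
  fill_kernel<<<blocks, threads, 0, stream.value()>>>(data, n, value);
}
}  // namespace detail

// Public layer: stream and mr get defaults and forward to detail.
void fill(int *data, int n, int value,
          rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
          rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) {
  (void)mr;  // unused in this tiny sketch
  detail::fill(data, n, value, stream);
}

}  // namespace example

Note that the stream parameter precedes the memory resource, matching the stream,mr ordering adopted in the later commits of this series.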
From 87548f5957cb2d42f3ffa8e12ddfb3188ec1ee75 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Wed, 11 Nov 2020 12:24:24 +1100 Subject: [PATCH 43/51] Add CUDF_CPP_BUILD_DIR to enable rapids-compose build. --- java/pom.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/java/pom.xml b/java/pom.xml index d14d4202e21..8894d9eae46 100755 --- a/java/pom.xml +++ b/java/pom.xml @@ -351,6 +351,7 @@ + Date: Wed, 11 Nov 2020 12:25:34 +1100 Subject: [PATCH 44/51] Changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index edd54204842..67a31c55fa7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -101,6 +101,7 @@ - PR #6708 Apply `na_rep` to column names in csv writer - PR #6721 Add missing serialization methods for ListColumn - PR #6722 Fix index=False bug in dask_cudf.read_parquet +- PR #6732 Fix cuDF benchmarks build with static Arrow lib and fix rapids-compose cuDF JNI build # cuDF 0.16.0 (21 Oct 2020) From 5b0592b4aaa632d555ee5c8a44bfc4c667b80235 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 16 Nov 2020 11:22:56 +1100 Subject: [PATCH 45/51] Fix includes, copyright and doc formatting. --- cpp/include/cudf/detail/gather.cuh | 2 +- cpp/include/cudf/detail/groupby.hpp | 5 +++-- cpp/include/cudf/detail/quantiles.hpp | 19 ++++++++----------- cpp/include/cudf/detail/reduction.cuh | 11 +++++++---- cpp/include/cudf/lists/detail/gather.cuh | 5 +++-- 5 files changed, 22 insertions(+), 20 deletions(-) diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 9a115772a0c..8e586b231bc 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -35,9 +35,9 @@ #include #include +#include #include -#include "rmm/cuda_stream_view.hpp" #include #include diff --git a/cpp/include/cudf/detail/groupby.hpp b/cpp/include/cudf/detail/groupby.hpp index c616a2c8d50..ce5fdb92bd1 100644 --- a/cpp/include/cudf/detail/groupby.hpp +++ b/cpp/include/cudf/detail/groupby.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,9 +17,10 @@ #include #include +#include + #include #include -#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace groupby { diff --git a/cpp/include/cudf/detail/quantiles.hpp b/cpp/include/cudf/detail/quantiles.hpp index e93886c4f11..5fb2ce4cbe6 100644 --- a/cpp/include/cudf/detail/quantiles.hpp +++ b/cpp/include/cudf/detail/quantiles.hpp @@ -22,11 +22,10 @@ namespace cudf { namespace detail { -/** @copydoc cudf::quantile(column_view const&, std::vector const&, interpolation, - column_view const&, bool, rmm::mr::device_memory_resource*) - * - * @param stream CUDA stream used for device memory operations and kernel launches. - */ +/** @copydoc cudf::quantile() + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ std::unique_ptr quantile( column_view const& input, std::vector const& q, @@ -36,12 +35,10 @@ std::unique_ptr quantile( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** @copydoc cudf::quantiles(table_view const&, std::vector const&, interpolation, - cudf::sorted, std::vector const&, std::vector const&, - rmm::mr::device_memory_resource*) - * - * @param stream CUDA stream used for device memory operations and kernel launches. 
- */ +/** @copydoc cudf::quantiles() + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ std::unique_ptr
quantiles( table_view const& input, std::vector const& q, diff --git a/cpp/include/cudf/detail/reduction.cuh b/cpp/include/cudf/detail/reduction.cuh index 063114adbc3..cc899f946c5 100644 --- a/cpp/include/cudf/detail/reduction.cuh +++ b/cpp/include/cudf/detail/reduction.cuh @@ -16,16 +16,19 @@ #pragma once +#include "reduction_operators.cuh" + #include + +#include +#include #include #include -#include +#include + #include #include -#include -#include "reduction_operators.cuh" -#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace reduction { diff --git a/cpp/include/cudf/lists/detail/gather.cuh b/cpp/include/cudf/lists/detail/gather.cuh index 0427e04647d..439bd7ab089 100644 --- a/cpp/include/cudf/lists/detail/gather.cuh +++ b/cpp/include/cudf/lists/detail/gather.cuh @@ -15,13 +15,14 @@ */ #pragma once -#include #include #include #include +#include #include -#include "rmm/cuda_stream_view.hpp" + +#include namespace cudf { namespace lists { From 9d88e344cb85eb96a8db94a298f145cbb488f2d5 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Tue, 17 Nov 2020 15:37:02 +1100 Subject: [PATCH 46/51] Update stream,mr order after recent merges --- .../cudf/structs/detail/concatenate.hpp | 2 +- cpp/src/filling/fill.cu | 2 +- cpp/src/round/round.cu | 2 +- cpp/src/structs/copying/concatenate.cu | 14 ++++++------ cpp/src/unary/cast_ops.cu | 22 +++++++++---------- 5 files changed, 21 insertions(+), 21 deletions(-) diff --git a/cpp/include/cudf/structs/detail/concatenate.hpp b/cpp/include/cudf/structs/detail/concatenate.hpp index ebaf8ec5b3c..ef3da82cfeb 100644 --- a/cpp/include/cudf/structs/detail/concatenate.hpp +++ b/cpp/include/cudf/structs/detail/concatenate.hpp @@ -49,7 +49,7 @@ namespace detail { */ std::unique_ptr concatenate( std::vector const& columns, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu index 390e590736f..8bf510dded6 100644 --- a/cpp/src/filling/fill.cu +++ b/cpp/src/filling/fill.cu @@ -64,7 +64,7 @@ struct in_place_fill_range_dispatch { template std::enable_if_t() && not cudf::is_fixed_point(), void> operator()( - cudf::size_type begin, cudf::size_type end, cudaStream_t stream = 0) + cudf::size_type begin, cudf::size_type end, rmm::cuda_stream_view stream) { in_place_fill(destination, begin, end, value, stream); } diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu index 0ec57013c30..8e8626db599 100644 --- a/cpp/src/round/round.cu +++ b/cpp/src/round/round.cu @@ -246,7 +246,7 @@ std::unique_ptr round_with(column_view const& input, auto const diff = input.type().scale() - (-decimal_places); auto const scalar = cudf::make_fixed_point_scalar(std::pow(10, diff), scale_type{-diff}); return cudf::detail::binary_operation( - input, *scalar, cudf::binary_operator::MUL, {}, mr, stream); + input, *scalar, cudf::binary_operator::MUL, {}, stream, mr); } auto const result_type = data_type{input.type().id(), scale_type{-decimal_places}}; diff --git a/cpp/src/structs/copying/concatenate.cu b/cpp/src/structs/copying/concatenate.cu index 6b917227302..47b63d9cf6f 100644 --- a/cpp/src/structs/copying/concatenate.cu +++ b/cpp/src/structs/copying/concatenate.cu @@ -14,7 +14,6 @@ * limitations under the License. 
*/ -#include #include #include #include @@ -23,9 +22,11 @@ #include #include #include - #include +#include + +#include #include namespace cudf { @@ -36,10 +37,9 @@ namespace detail { * @copydoc cudf::structs::detail::concatenate * */ -std::unique_ptr concatenate( - std::vector const& columns, - cudaStream_t stream = 0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr concatenate(std::vector const& columns, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // get ordered children auto ordered_children = extract_ordered_struct_children(columns); @@ -51,7 +51,7 @@ std::unique_ptr concatenate( ordered_children.end(), std::back_inserter(children), [mr, stream](std::vector const& cols) { - return cudf::detail::concatenate(cols, mr, stream); + return cudf::detail::concatenate(cols, stream, mr); }); size_type const total_length = children[0]->size(); diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index 108c0794af9..f30ec1e2b83 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -172,18 +172,18 @@ struct device_cast { template ()>* = nullptr> std::unique_ptr rescale(column_view input, numeric::scale_type scale, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using namespace numeric; if (input.type().scale() > scale) { auto const scalar = make_fixed_point_scalar(0, scale_type{scale}); - return detail::binary_operation(input, *scalar, binary_operator::ADD, {}, mr, stream); + return detail::binary_operation(input, *scalar, binary_operator::ADD, {}, stream, mr); } else { auto const diff = input.type().scale() - scale; auto const scalar = make_fixed_point_scalar(std::pow(10, -diff), scale_type{diff}); - return detail::binary_operation(input, *scalar, binary_operator::DIV, {}, mr, stream); + return detail::binary_operation(input, *scalar, binary_operator::DIV, {}, stream, mr); } }; @@ -286,12 +286,12 @@ struct dispatch_unary_cast_to { typename std::enable_if_t() && cudf::is_fixed_point() && std::is_same::value>* = nullptr> std::unique_ptr operator()(data_type type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.type() == type) return std::make_unique(input); // TODO add test for this - return detail::rescale(input, numeric::scale_type{type.scale()}, mr, stream); + return detail::rescale(input, numeric::scale_type{type.scale()}, stream, mr); } template < @@ -300,8 +300,8 @@ struct dispatch_unary_cast_to { typename std::enable_if_t() && cudf::is_fixed_point() && not std::is_same::value>* = nullptr> std::unique_ptr operator()(data_type type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using namespace numeric; @@ -318,14 +318,14 @@ struct dispatch_unary_cast_to { mutable_column_view output_mutable = *temporary; - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), input.begin(), input.end(), output_mutable.begin(), device_cast{}); // clearly there is a more efficient way to do this, can optimize in the future - return rescale(*temporary, numeric::scale_type{type.scale()}, mr, stream); + return rescale(*temporary, numeric::scale_type{type.scale()}, stream, mr); } template Date: Wed, 18 Nov 2020 13:19:32 +1100 Subject: [PATCH 47/51] Remove MR parameter when it 
can be defaulted. --- cpp/include/cudf/detail/null_mask.hpp | 31 +++++++++++++----------- cpp/src/copying/scatter.cu | 20 +++++++-------- cpp/src/dictionary/add_keys.cu | 4 +-- cpp/src/dictionary/detail/concatenate.cu | 3 +-- cpp/src/dictionary/set_keys.cu | 2 +- cpp/src/filling/fill.cu | 6 ++--- cpp/src/groupby/hash/groupby.cu | 3 +-- cpp/src/groupby/sort/sort_helper.cu | 3 +-- cpp/src/interop/to_arrow.cpp | 2 +- cpp/src/replace/clamp.cu | 18 ++++---------- cpp/src/replace/replace.cu | 3 +-- cpp/src/search/search.cu | 3 +-- cpp/src/transform/encode.cu | 3 +-- 13 files changed, 43 insertions(+), 58 deletions(-) diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp index 50a2424e86c..9a5e000f265 100644 --- a/cpp/include/cudf/detail/null_mask.hpp +++ b/cpp/include/cudf/detail/null_mask.hpp @@ -93,20 +93,22 @@ rmm::device_buffer copy_bitmask( * * @param stream CUDA stream used for device memory operations and kernel launches */ -rmm::device_buffer bitmask_and(std::vector const &masks, - std::vector const &begin_bits, - size_type mask_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr); +rmm::device_buffer bitmask_and( + std::vector const &masks, + std::vector const &begin_bits, + size_type mask_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::bitmask_and * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -rmm::device_buffer bitmask_and(table_view const &view, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr); +rmm::device_buffer bitmask_and( + table_view const &view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); /** * @brief Performs a bitwise AND of the specified bitmasks, @@ -120,12 +122,13 @@ rmm::device_buffer bitmask_and(table_view const &view, * @param mr Device memory resource used to allocate the returned device_buffer * @return rmm::device_buffer Output bitmask */ -void inplace_bitmask_and(bitmask_type *dest_mask, - std::vector const &masks, - std::vector const &begin_bits, - size_type mask_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr); +void inplace_bitmask_and( + bitmask_type *dest_mask, + std::vector const &masks, + std::vector const &begin_bits, + size_type mask_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 036962ab744..d8beb052f8f 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -177,16 +177,14 @@ struct column_scalar_scatterer_impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - auto dict_target = dictionary::detail::add_keys( - dictionary_column_view(target), - make_column_from_scalar(source.get(), 1, stream, rmm::mr::get_current_device_resource()) - ->view(), - mr, - stream.value()); + auto dict_target = + dictionary::detail::add_keys(dictionary_column_view(target), + make_column_from_scalar(source.get(), 1, stream)->view(), + mr, + stream.value()); auto dict_view = dictionary_column_view(dict_target->view()); - auto scalar_index = dictionary::detail::get_index( - dict_view, source.get(), stream, rmm::mr::get_current_device_resource()); - auto scalar_iter = thrust::make_permutation_iterator( + auto scalar_index = 
dictionary::detail::get_index(dict_view, source.get(), stream); + auto scalar_iter = thrust::make_permutation_iterator( indexalator_factory::make_input_iterator(*scalar_index), thrust::make_constant_iterator(0)); auto new_indices = std::make_unique(dict_view.get_indices_annotated(), stream, mr); auto target_iter = indexalator_factory::make_output_iterator(new_indices->mutable_view()); @@ -336,8 +334,8 @@ std::unique_ptr boolean_mask_scatter(column_view const& input, 0); // The scatter map is actually a table with only one column, which is scatter map. - auto scatter_map = detail::apply_boolean_mask( - table_view{{indices->view()}}, boolean_mask, stream, rmm::mr::get_current_device_resource()); + auto scatter_map = + detail::apply_boolean_mask(table_view{{indices->view()}}, boolean_mask, stream); auto output_table = detail::scatter(table_view{{input}}, scatter_map->get_column(0).view(), table_view{{target}}, diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index 6a9b294758d..79effe3fc97 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -57,8 +57,8 @@ std::unique_ptr add_keys( CUDF_EXPECTS(new_keys.type() == old_keys.type(), "Keys must be the same type"); // first, concatenate the keys together // [a,b,c,d,f] + [d,b,e] = [a,b,c,d,f,d,b,e] - auto combined_keys = cudf::detail::concatenate( - std::vector{old_keys, new_keys}, stream, rmm::mr::get_current_device_resource()); + auto combined_keys = + cudf::detail::concatenate(std::vector{old_keys, new_keys}, stream); // sort and remove any duplicates from the combined keys // drop_duplicates([a,b,c,d,f,d,b,e]) = [a,b,c,d,e,f] auto table_keys = cudf::detail::drop_duplicates(table_view{{combined_keys->view()}}, diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index 223e2d7c331..3d44085232e 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -204,8 +204,7 @@ std::unique_ptr concatenate(std::vector const& columns, CUDF_EXPECTS(keys.type() == keys_type, "key types of all dictionary columns must match"); return keys; }); - auto all_keys = - cudf::detail::concatenate(keys_views, stream, rmm::mr::get_current_device_resource()); + auto all_keys = cudf::detail::concatenate(keys_views, stream); // sort keys and remove duplicates; // this becomes the keys child for the output dictionary column diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index c934e495de3..69fdcd85b35 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -156,7 +156,7 @@ std::vector> match_dictionaries(std::vector keys(input.size()); std::transform(input.begin(), input.end(), keys.begin(), [](auto& col) { return col.keys(); }); - auto new_keys = cudf::detail::concatenate(keys, stream, rmm::mr::get_current_device_resource()); + auto new_keys = cudf::detail::concatenate(keys, stream); auto keys_view = new_keys->view(); std::vector> result(input.size()); std::transform(input.begin(), input.end(), result.begin(), [keys_view, mr, stream](auto& col) { diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu index 8bf510dded6..77482e13b6c 100644 --- a/cpp/src/filling/fill.cu +++ b/cpp/src/filling/fill.cu @@ -174,16 +174,14 @@ std::unique_ptr out_of_place_fill_range_dispatch::operator()view(), mr, stream.value()); cudf::column_view const target_indices = cudf::dictionary_column_view(target_matched->view()).get_indices_annotated(); // get the index of the key just added - 
auto index_of_value = cudf::dictionary::detail::get_index( - target_matched->view(), value, stream, rmm::mr::get_current_device_resource()); + auto index_of_value = cudf::dictionary::detail::get_index(target_matched->view(), value, stream); // now call fill using just the indices column and the new index auto new_indices = cudf::type_dispatcher(target_indices.type(), diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index e0c9d92fd30..0a56563cf87 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -280,8 +280,7 @@ void compute_single_pass_aggs(table_view const& keys, bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; if (skip_key_rows_with_nulls) { - auto row_bitmask{ - cudf::detail::bitmask_and(keys, stream, rmm::mr::get_current_device_resource())}; + auto row_bitmask{cudf::detail::bitmask_and(keys, stream)}; thrust::for_each_n( rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index 4b4c6a96688..595efc8198d 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -243,8 +243,7 @@ column_view sort_groupby_helper::keys_bitmask_column(rmm::cuda_stream_view strea { if (_keys_bitmask_column) return _keys_bitmask_column->view(); - auto row_bitmask = - cudf::detail::bitmask_and(_keys, stream, rmm::mr::get_current_device_resource()); + auto row_bitmask = cudf::detail::bitmask_and(_keys, stream); _keys_bitmask_column = make_numeric_column(data_type(type_id::INT8), _keys.num_rows(), diff --git a/cpp/src/interop/to_arrow.cpp b/cpp/src/interop/to_arrow.cpp index c36b2be77e8..874d9078444 100644 --- a/cpp/src/interop/to_arrow.cpp +++ b/cpp/src/interop/to_arrow.cpp @@ -137,7 +137,7 @@ std::shared_ptr dispatch_to_arrow::operator()(column_view in arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - auto bitmask = bools_to_mask(input, stream, rmm::mr::get_current_device_resource()); + auto bitmask = bools_to_mask(input, stream); auto result = arrow::AllocateBuffer(static_cast(bitmask.first->size()), ar_mr); CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer for data"); diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu index cdd8d78fdef..c69cf8c16f4 100644 --- a/cpp/src/replace/clamp.cu +++ b/cpp/src/replace/clamp.cu @@ -322,11 +322,7 @@ std::unique_ptr dispatch_clamp::operator()( auto add_scalar_key = [&](scalar const& key, scalar const& key_replace) { if (key.is_valid()) { result = dictionary::detail::add_keys( - matched_view, - make_column_from_scalar(key_replace, 1, stream, rmm::mr::get_current_device_resource()) - ->view(), - mr, - stream); + matched_view, make_column_from_scalar(key_replace, 1, stream)->view(), mr, stream); matched_view = dictionary_column_view(result->view()); } }; @@ -337,16 +333,12 @@ std::unique_ptr dispatch_clamp::operator()( auto matched_view = dictionary_column_view(matched_column->view()); // get the indexes for lo_replace and for hi_replace - auto lo_replace_index = dictionary::detail::get_index( - matched_view, lo_replace, stream, rmm::mr::get_current_device_resource()); - auto hi_replace_index = dictionary::detail::get_index( - matched_view, hi_replace, stream, rmm::mr::get_current_device_resource()); + auto lo_replace_index = dictionary::detail::get_index(matched_view, lo_replace, stream); + auto hi_replace_index = dictionary::detail::get_index(matched_view, hi_replace, stream); // get 
the closest indexes for lo and for hi - auto lo_index = dictionary::detail::get_insert_index( - matched_view, lo, stream, rmm::mr::get_current_device_resource()); - auto hi_index = dictionary::detail::get_insert_index( - matched_view, hi, stream, rmm::mr::get_current_device_resource()); + auto lo_index = dictionary::detail::get_insert_index(matched_view, lo, stream); + auto hi_index = dictionary::detail::get_insert_index(matched_view, hi, stream); // call clamp with the scalar indexes and the matched indices auto matched_indices = matched_view.get_indices_annotated(); diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 6ca894ac186..a6b129630a8 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -453,8 +453,7 @@ std::unique_ptr replace_kernel_forwarder::operator()view(), mr, stream.value()); }(); auto matched_view = cudf::dictionary_column_view(matched_input->view()); diff --git a/cpp/src/search/search.cu b/cpp/src/search/search.cu index e8d776d0d2a..0efd68ac974 100644 --- a/cpp/src/search/search.cu +++ b/cpp/src/search/search.cu @@ -197,8 +197,7 @@ bool contains_scalar_dispatch::operator()(column_view const& { auto dict_col = cudf::dictionary_column_view(col); // first, find the value in the dictionary's key set - auto index = cudf::dictionary::detail::get_index( - dict_col, value, stream, rmm::mr::get_current_device_resource()); + auto index = cudf::dictionary::detail::get_index(dict_col, value, stream); // if found, check the index is actually in the indices column return index->is_valid() ? cudf::type_dispatcher(dict_col.indices().type(), contains_scalar_dispatch{}, diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu index 1ecf8a7814a..5abe4e1aaf7 100644 --- a/cpp/src/transform/encode.cu +++ b/cpp/src/transform/encode.cu @@ -53,8 +53,7 @@ std::pair, std::unique_ptr> encode( // https://github.com/rapidsai/cudf/issues/6144 is resolved auto num_rows = keys_table->num_rows(); - auto mask = - cudf::detail::bitmask_and(keys_table->view(), stream, rmm::mr::get_current_device_resource()); + auto mask = cudf::detail::bitmask_and(keys_table->view(), stream); auto num_rows_with_nulls = cudf::count_unset_bits(reinterpret_cast(mask.data()), 0, num_rows); From 6a6274487b2e97ee5066443b9df2620a093942f7 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Wed, 18 Nov 2020 13:23:54 +1100 Subject: [PATCH 48/51] Header fix --- cpp/src/groupby/groupby.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 4c391852386..6a004393b83 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -29,11 +29,12 @@ #include #include +#include + #include #include #include -#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace groupby { From acbb2ebca732f74a7683feb8b7c16d48a1f2e495 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Wed, 18 Nov 2020 13:29:04 +1100 Subject: [PATCH 49/51] Header fixes --- cpp/include/cudf/strings/detail/copy_range.cuh | 3 ++- cpp/include/cudf/strings/detail/gather.cuh | 3 ++- cpp/include/cudf/table/table_device_view.cuh | 3 ++- cpp/src/copying/concatenate.cu | 3 ++- cpp/src/copying/copy_range.cu | 3 ++- cpp/src/dictionary/replace.cu | 3 ++- cpp/src/interop/dlpack.cpp | 3 ++- cpp/src/join/semi_join.cu | 7 ++++--- cpp/src/replace/nulls.cu | 3 ++- cpp/src/reshape/tile.cu | 7 ++++--- cpp/src/sort/sort.cu | 3 ++- cpp/src/stream_compaction/drop_nulls.cu | 5 +++-- cpp/src/transform/transform.cpp | 3 ++- cpp/src/unary/nan_ops.cu 
| 3 ++- cpp/src/unary/null_ops.cu | 5 +++-- cpp/src/unary/unary_ops.cuh | 7 ++++--- java/src/main/native/src/map_lookup.cu | 3 +-- 17 files changed, 41 insertions(+), 26 deletions(-) diff --git a/cpp/include/cudf/strings/detail/copy_range.cuh b/cpp/include/cudf/strings/detail/copy_range.cuh index fe0d1dcf2a7..563f66ad2c8 100644 --- a/cpp/include/cudf/strings/detail/copy_range.cuh +++ b/cpp/include/cudf/strings/detail/copy_range.cuh @@ -22,7 +22,8 @@ #include #include #include -#include "rmm/cuda_stream_view.hpp" + +#include #include #include diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 8ca70db74a6..e70dbd399c9 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -22,7 +22,8 @@ #include #include #include -#include "rmm/cuda_stream_view.hpp" + +#include namespace cudf { diff --git a/cpp/include/cudf/table/table_device_view.cuh b/cpp/include/cudf/table/table_device_view.cuh index 8a1938423f0..76d2e57597f 100644 --- a/cpp/include/cudf/table/table_device_view.cuh +++ b/cpp/include/cudf/table/table_device_view.cuh @@ -19,9 +19,10 @@ #include #include +#include + #include #include -#include "rmm/cuda_stream_view.hpp" /** * @file table_device_view.cuh diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index 90b76498860..91354250073 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -28,13 +28,14 @@ #include #include +#include + #include #include #include #include #include -#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { diff --git a/cpp/src/copying/copy_range.cu b/cpp/src/copying/copy_range.cu index 1df9fc78aa2..ff532059108 100644 --- a/cpp/src/copying/copy_range.cu +++ b/cpp/src/copying/copy_range.cu @@ -32,10 +32,11 @@ #include #include +#include + #include #include -#include "rmm/cuda_stream_view.hpp" namespace { template diff --git a/cpp/src/dictionary/replace.cu b/cpp/src/dictionary/replace.cu index 27a85c03898..60e7c496e06 100644 --- a/cpp/src/dictionary/replace.cu +++ b/cpp/src/dictionary/replace.cu @@ -23,7 +23,8 @@ #include #include #include -#include "rmm/cuda_stream_view.hpp" + +#include #include #include diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp index 1ae6119aefd..efc19791c07 100644 --- a/cpp/src/interop/dlpack.cpp +++ b/cpp/src/interop/dlpack.cpp @@ -18,10 +18,11 @@ #include #include +#include + #include #include -#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace { diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index 9d6dd55ec03..2b58c1a864a 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -14,6 +14,9 @@ * limitations under the License. 
*/ +#include +#include + #include #include #include @@ -22,9 +25,7 @@ #include #include -#include -#include -#include "rmm/cuda_stream_view.hpp" +#include namespace cudf { namespace detail { diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 6f860dfd60d..2c7542a2f5d 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -35,7 +35,8 @@ #include #include #include -#include "rmm/cuda_stream_view.hpp" + +#include #include diff --git a/cpp/src/reshape/tile.cu b/cpp/src/reshape/tile.cu index c912143f6d7..e1c665cf8dd 100644 --- a/cpp/src/reshape/tile.cu +++ b/cpp/src/reshape/tile.cu @@ -15,18 +15,19 @@ */ #include +#include #include #include #include #include #include -#include +#include #include #include -#include -#include "rmm/cuda_stream_view.hpp" + +#include namespace cudf { namespace { diff --git a/cpp/src/sort/sort.cu b/cpp/src/sort/sort.cu index 18d6839e2a2..028796d59cb 100644 --- a/cpp/src/sort/sort.cu +++ b/cpp/src/sort/sort.cu @@ -14,7 +14,6 @@ * limitations under the License. */ -#include "rmm/cuda_stream_view.hpp" #include "sort_impl.cuh" #include @@ -23,6 +22,8 @@ #include #include +#include + namespace cudf { namespace detail { std::unique_ptr sorted_order(table_view input, diff --git a/cpp/src/stream_compaction/drop_nulls.cu b/cpp/src/stream_compaction/drop_nulls.cu index 71aa8f6c63c..7eb8e1c9644 100644 --- a/cpp/src/stream_compaction/drop_nulls.cu +++ b/cpp/src/stream_compaction/drop_nulls.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,8 @@ #include #include #include -#include "rmm/cuda_stream_view.hpp" + +#include namespace { // Returns true if the mask is true for index i in at least keep_threshold diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp index 2372382d178..f4224f87957 100644 --- a/cpp/src/transform/transform.cpp +++ b/cpp/src/transform/transform.cpp @@ -26,12 +26,13 @@ #include #include #include "jit/code/code.h" -#include "rmm/cuda_stream_view.hpp" #include #include #include +#include + namespace cudf { namespace transformation { //! Jit functions diff --git a/cpp/src/unary/nan_ops.cu b/cpp/src/unary/nan_ops.cu index 9f8f0e53cb2..1840aebf8f0 100644 --- a/cpp/src/unary/nan_ops.cu +++ b/cpp/src/unary/nan_ops.cu @@ -21,7 +21,8 @@ #include #include #include -#include "rmm/cuda_stream_view.hpp" + +#include namespace cudf { namespace detail { diff --git a/cpp/src/unary/null_ops.cu b/cpp/src/unary/null_ops.cu index 699439da1c9..6a967b4ecd7 100644 --- a/cpp/src/unary/null_ops.cu +++ b/cpp/src/unary/null_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,7 +20,8 @@ #include #include #include -#include "rmm/cuda_stream_view.hpp" + +#include namespace cudf { std::unique_ptr is_null(cudf::column_view const& input, rmm::mr::device_memory_resource* mr) diff --git a/cpp/src/unary/unary_ops.cuh b/cpp/src/unary/unary_ops.cuh index a74a05437be..ab246bde540 100644 --- a/cpp/src/unary/unary_ops.cuh +++ b/cpp/src/unary/unary_ops.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,13 +17,14 @@ #ifndef UNARY_OPS_H #define UNARY_OPS_H -#include #include #include #include #include #include -#include "rmm/cuda_stream_view.hpp" + +#include +#include namespace cudf { namespace unary { diff --git a/java/src/main/native/src/map_lookup.cu b/java/src/main/native/src/map_lookup.cu index a3e25ce8905..95eea10e8e0 100644 --- a/java/src/main/native/src/map_lookup.cu +++ b/java/src/main/native/src/map_lookup.cu @@ -25,10 +25,9 @@ #include #include #include +#include #include -#include "rmm/cuda_stream_view.hpp" - namespace cudf { namespace { From 81cd6012014cf8fc511888b6706a01e163e791e2 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 19 Nov 2020 07:33:22 +1100 Subject: [PATCH 50/51] Add missing include. --- cpp/src/hash/helper_functions.cuh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/hash/helper_functions.cuh b/cpp/src/hash/helper_functions.cuh index 1b6411de612..57747142f58 100644 --- a/cpp/src/hash/helper_functions.cuh +++ b/cpp/src/hash/helper_functions.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019, NVIDIA CORPORATION. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,8 @@ #ifndef HELPER_FUNCTIONS_CUH #define HELPER_FUNCTIONS_CUH +#include + #include constexpr int64_t DEFAULT_HASH_TABLE_OCCUPANCY = 50; From 9fc08f349a53f28f64e4ee0f316b56f9cf3bb656 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 19 Nov 2020 17:42:12 +1100 Subject: [PATCH 51/51] cudaStream_t to cuda_stream_view in math_ops --- cpp/src/unary/math_ops.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu index d27dac3542c..f756f68e4a9 100644 --- a/cpp/src/unary/math_ops.cu +++ b/cpp/src/unary/math_ops.cu @@ -276,7 +276,7 @@ struct fixed_point_abs { template typename FixedPointFunctor> std::unique_ptr unary_op_with(column_view const& input, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { using Type = device_storage_type_t; @@ -294,7 +294,7 @@ std::unique_ptr unary_op_with(column_view const& input, auto out_view = result->mutable_view(); Type const n = std::pow(10, -input.type().scale()); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), input.begin(), input.end(), out_view.begin(), @@ -554,7 +554,7 @@ struct FixedPointOpDispatcher { std::enable_if_t(), std::unique_ptr> operator()( column_view const& input, cudf::unary_op op, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { switch (op) {