From 8cc23bdd4f894c85d5ee400712db994711244b3d Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Wed, 25 Nov 2020 04:34:25 +1100 Subject: [PATCH] Replace raw streams with rmm::cuda_stream_view (part 3) (#6744) Converting libcudf to use `rmm::cuda_stream_view` will require a LOT of changes, so I'm splitting it into multiple PRs to ease reviewing. This is the third PR in the series. This series of PRs will - Replace usage of `cudaStream_t` with `rmm::cuda_stream_view` - Replace usage of `0` or `nullptr` as a stream identifier with `rmm::cuda_stream_default` - Ensure all APIs always order the stream parameter before the memory resource parameter. #5119 Contributes to #6645 and #5119 Depends on #6646 and #6648 so this PR will look much bigger until they are merged. This third PR converts: - remaining dictionary functionality - cuio - lists - scalar - strings - groupby - join - contiguous_split - get_element - datetime_ops - extract - merge - partitioning - minmax reduction - scan - byte_cast - clamp - interleave_columns - is_sorted - groupby - rank - tests - concurrent map classes --- CHANGELOG.md | 1 + .../synchronization/synchronization.cpp | 13 +- .../synchronization/synchronization.hpp | 15 +- cpp/include/cudf/detail/copy.hpp | 2 +- cpp/include/cudf/detail/copy_range.cuh | 2 +- cpp/include/cudf/detail/scatter.cuh | 5 +- cpp/include/cudf/dictionary/detail/encode.hpp | 14 +- cpp/include/cudf/dictionary/detail/merge.hpp | 8 +- .../cudf/dictionary/detail/update_keys.hpp | 26 +-- .../cudf/dictionary/dictionary_factories.hpp | 14 +- cpp/include/cudf/groupby.hpp | 8 +- cpp/include/cudf/io/avro.hpp | 4 +- cpp/include/cudf/io/csv.hpp | 1 - cpp/include/cudf/io/data_sink.hpp | 10 +- cpp/include/cudf/io/datasource.hpp | 6 +- cpp/include/cudf/io/detail/avro.hpp | 5 +- cpp/include/cudf/io/detail/csv.hpp | 6 +- cpp/include/cudf/io/detail/json.hpp | 5 +- cpp/include/cudf/io/detail/orc.hpp | 7 +- cpp/include/cudf/io/detail/parquet.hpp | 18 +- cpp/include/cudf/io/json.hpp | 4 +- 
cpp/include/cudf/io/types.hpp | 1 + cpp/include/cudf/join.hpp | 16 +- cpp/include/cudf/lists/detail/copying.hpp | 4 +- cpp/include/cudf/lists/detail/gather.cuh | 2 +- cpp/include/cudf/lists/detail/scatter.cuh | 22 +- cpp/include/cudf/scalar/scalar_factories.hpp | 18 +- cpp/include/cudf/strings/copying.hpp | 4 +- cpp/include/cudf/strings/detail/combine.hpp | 6 +- .../cudf/strings/detail/converters.hpp | 22 +- .../cudf/strings/detail/copy_if_else.cuh | 11 +- .../cudf/strings/detail/copy_range.cuh | 10 +- cpp/include/cudf/strings/detail/gather.cuh | 8 +- cpp/include/cudf/strings/detail/merge.cuh | 16 +- .../cudf/strings/detail/modify_strings.cuh | 18 +- cpp/include/cudf/strings/detail/scatter.cuh | 6 +- .../detail/strings_column_factories.cuh | 58 +++--- cpp/include/cudf/strings/detail/utilities.cuh | 20 +- cpp/include/cudf/strings/detail/utilities.hpp | 35 ++-- .../cudf/strings/strings_column_view.hpp | 5 +- cpp/include/cudf/table/table_device_view.cuh | 2 +- cpp/include/cudf/types.hpp | 5 - cpp/include/cudf/utilities/error.hpp | 6 +- cpp/include/cudf/utilities/traits.hpp | 2 +- cpp/include/nvtext/detail/load_hash_file.hpp | 7 +- cpp/include/nvtext/detail/tokenize.hpp | 18 +- cpp/src/bitmask/null_mask.cu | 6 +- cpp/src/column/column.cu | 13 +- cpp/src/copying/contiguous_split.cu | 1 + cpp/src/copying/copy_range.cu | 5 +- cpp/src/copying/get_element.cu | 21 +- cpp/src/copying/scatter.cu | 4 +- cpp/src/datetime/datetime_ops.cu | 29 ++- cpp/src/dictionary/add_keys.cu | 15 +- cpp/src/dictionary/decode.cu | 11 +- cpp/src/dictionary/detail/merge.cu | 8 +- cpp/src/dictionary/dictionary_factories.cu | 21 +- cpp/src/dictionary/encode.cu | 16 +- cpp/src/dictionary/remove_keys.cu | 30 +-- cpp/src/dictionary/replace.cu | 26 +-- cpp/src/dictionary/set_keys.cu | 32 +-- cpp/src/filling/fill.cu | 2 +- cpp/src/filling/repeat.cu | 6 +- cpp/src/groupby/groupby.cu | 4 +- cpp/src/groupby/hash/groupby.cu | 10 +- cpp/src/groupby/sort/group_argmax.cu | 13 +- 
cpp/src/groupby/sort/group_argmin.cu | 13 +- cpp/src/groupby/sort/group_collect.cu | 6 +- cpp/src/groupby/sort/group_count.cu | 18 +- cpp/src/groupby/sort/group_max.cu | 12 +- cpp/src/groupby/sort/group_min.cu | 12 +- cpp/src/groupby/sort/group_nth_element.cu | 14 +- cpp/src/groupby/sort/group_nunique.cu | 22 +- cpp/src/groupby/sort/group_quantiles.cu | 20 +- cpp/src/groupby/sort/group_reductions.hpp | 49 ++--- .../sort/group_single_pass_reduction_util.cuh | 14 +- cpp/src/groupby/sort/group_std.cu | 25 +-- cpp/src/groupby/sort/group_sum.cu | 12 +- cpp/src/groupby/sort/groupby.cu | 52 ++--- cpp/src/hash/concurrent_unordered_map.cuh | 37 ++-- .../hash/concurrent_unordered_multimap.cuh | 39 ++-- cpp/src/hash/hash_allocator.cuh | 15 +- cpp/src/hash/hashing.cu | 8 +- cpp/src/hash/unordered_multiset.cuh | 17 +- cpp/src/interop/dlpack.cpp | 2 +- cpp/src/interop/from_arrow.cpp | 6 +- cpp/src/io/avro/avro_gpu.cu | 42 ++-- cpp/src/io/avro/avro_gpu.h | 10 +- cpp/src/io/avro/reader_impl.cu | 39 ++-- cpp/src/io/avro/reader_impl.hpp | 8 +- cpp/src/io/comp/debrotli.cu | 20 +- cpp/src/io/comp/gpuinflate.cu | 15 +- cpp/src/io/comp/gpuinflate.h | 26 +-- cpp/src/io/comp/snap.cu | 13 +- cpp/src/io/comp/uncomp.cpp | 11 +- cpp/src/io/comp/unsnap.cu | 12 +- cpp/src/io/csv/csv_gpu.cu | 22 +- cpp/src/io/csv/csv_gpu.h | 13 +- cpp/src/io/csv/durations.cu | 23 ++- cpp/src/io/csv/reader_impl.cu | 35 ++-- cpp/src/io/csv/reader_impl.hpp | 10 +- cpp/src/io/csv/writer_impl.cu | 60 +++--- cpp/src/io/csv/writer_impl.hpp | 12 +- cpp/src/io/json/json_gpu.cu | 15 +- cpp/src/io/json/json_gpu.h | 8 +- cpp/src/io/json/reader_impl.cu | 61 +++--- cpp/src/io/json/reader_impl.hpp | 27 +-- cpp/src/io/orc/chunked_state.hpp | 4 +- cpp/src/io/orc/dict_enc.cu | 20 +- cpp/src/io/orc/orc_gpu.h | 39 ++-- cpp/src/io/orc/reader_impl.cu | 50 +++-- cpp/src/io/orc/reader_impl.hpp | 10 +- cpp/src/io/orc/stats_enc.cu | 19 +- cpp/src/io/orc/stripe_data.cu | 32 +-- cpp/src/io/orc/stripe_enc.cu | 28 ++- 
cpp/src/io/orc/stripe_init.cu | 19 +- cpp/src/io/orc/writer_impl.cu | 109 ++++++---- cpp/src/io/orc/writer_impl.hpp | 21 +- cpp/src/io/parquet/chunked_state.hpp | 19 +- cpp/src/io/parquet/page_data.cu | 60 +++--- cpp/src/io/parquet/page_dict.cu | 15 +- cpp/src/io/parquet/page_enc.cu | 59 +++--- cpp/src/io/parquet/page_hdr.cu | 12 +- cpp/src/io/parquet/parquet_gpu.hpp | 41 ++-- cpp/src/io/parquet/reader_impl.cu | 47 +++-- cpp/src/io/parquet/reader_impl.hpp | 19 +- cpp/src/io/parquet/writer_impl.cu | 88 ++++---- cpp/src/io/parquet/writer_impl.hpp | 18 +- cpp/src/io/statistics/column_stats.cu | 23 ++- cpp/src/io/statistics/column_stats.h | 6 +- cpp/src/io/utilities/column_buffer.hpp | 18 +- cpp/src/io/utilities/data_sink.cpp | 6 +- cpp/src/io/utilities/hostdevice_vector.hpp | 36 ++-- cpp/src/join/cross_join.cu | 6 +- cpp/src/join/hash_join.cu | 81 ++++---- cpp/src/join/hash_join.cuh | 53 ++--- cpp/src/join/join.cu | 73 +++---- cpp/src/join/nested_loop_join.cuh | 46 +++-- cpp/src/join/semi_join.cu | 18 +- cpp/src/lists/copying/copying.cu | 7 +- cpp/src/lists/copying/gather.cu | 5 +- cpp/src/lists/extract.cu | 10 +- cpp/src/merge/merge.cu | 116 ++++++----- cpp/src/partitioning/partitioning.cu | 104 +++++----- cpp/src/partitioning/round_robin.cu | 31 +-- cpp/src/reductions/minmax.cu | 25 ++- cpp/src/reductions/scan.cu | 116 +++++------ cpp/src/replace/clamp.cu | 78 ++++---- cpp/src/replace/nulls.cu | 8 +- cpp/src/replace/replace.cu | 17 +- cpp/src/reshape/byte_cast.cu | 31 ++- cpp/src/reshape/interleave_columns.cu | 31 +-- cpp/src/rolling/rolling.cu | 188 +++++++++--------- cpp/src/scalar/scalar_factories.cpp | 18 +- cpp/src/search/search.cu | 11 +- cpp/src/sort/is_sorted.cu | 14 +- cpp/src/sort/rank.cu | 43 ++-- cpp/src/strings/attributes.cu | 40 ++-- cpp/src/strings/capitalize.cu | 12 +- cpp/src/strings/case.cu | 43 ++-- cpp/src/strings/char_types/char_types.cu | 84 ++++---- cpp/src/strings/combine.cu | 58 +++--- cpp/src/strings/contains.cu | 64 +++--- 
cpp/src/strings/convert/convert_booleans.cu | 36 ++-- cpp/src/strings/convert/convert_datetime.cu | 67 +++---- cpp/src/strings/convert/convert_durations.cu | 54 +++-- cpp/src/strings/convert/convert_floats.cu | 48 +++-- cpp/src/strings/convert/convert_hex.cu | 46 ++--- cpp/src/strings/convert/convert_integers.cu | 54 +++-- cpp/src/strings/convert/convert_ipv4.cu | 59 +++--- cpp/src/strings/convert/convert_urls.cu | 41 ++-- cpp/src/strings/copying/concatenate.cu | 6 +- cpp/src/strings/copying/copying.cu | 8 +- cpp/src/strings/extract.cu | 21 +- cpp/src/strings/filling/fill.cu | 10 +- cpp/src/strings/filter_chars.cu | 15 +- cpp/src/strings/find.cu | 82 ++++---- cpp/src/strings/find_multiple.cu | 10 +- cpp/src/strings/padding.cu | 43 ++-- cpp/src/strings/regex/regex.cuh | 8 +- cpp/src/strings/regex/regexec.cu | 12 +- cpp/src/strings/replace/backref_re.cu | 31 +-- cpp/src/strings/replace/backref_re.cuh | 10 +- cpp/src/strings/replace/backref_re_large.cu | 12 +- cpp/src/strings/replace/backref_re_medium.cu | 12 +- cpp/src/strings/replace/multi_re.cu | 32 +-- cpp/src/strings/replace/replace.cu | 31 ++- cpp/src/strings/replace/replace_re.cu | 33 +-- cpp/src/strings/split/partition.cu | 18 +- cpp/src/strings/split/split.cu | 66 +++--- cpp/src/strings/split/split_record.cu | 39 ++-- cpp/src/strings/strings_column_factories.cu | 69 ++++--- cpp/src/strings/strings_column_view.cu | 21 +- cpp/src/strings/strings_scalar_factories.cpp | 6 +- cpp/src/strings/strip.cu | 19 +- cpp/src/strings/substring.cu | 64 +++--- cpp/src/strings/translate.cu | 23 ++- cpp/src/strings/utilities.cu | 40 ++-- cpp/src/strings/utilities.cuh | 16 +- cpp/src/strings/utilities.hpp | 4 +- cpp/src/strings/wrap.cu | 15 +- cpp/src/text/detokenize.cu | 27 ++- cpp/src/text/edit_distance.cu | 36 ++-- cpp/src/text/generate_ngrams.cu | 39 ++-- cpp/src/text/ngrams_tokenize.cu | 42 ++-- cpp/src/text/normalize.cu | 44 ++-- cpp/src/text/replace.cu | 34 ++-- cpp/src/text/stemmer.cu | 60 +++--- 
cpp/src/text/subword/data_normalizer.cu | 25 ++- .../text/subword/detail/data_normalizer.hpp | 5 +- .../text/subword/detail/tokenizer_utils.cuh | 6 +- .../subword/detail/wordpiece_tokenizer.hpp | 8 +- cpp/src/text/subword/load_hash_file.cu | 42 ++-- cpp/src/text/subword/subword_tokenize.cu | 13 +- cpp/src/text/subword/wordpiece_tokenizer.cu | 87 ++++---- cpp/src/text/tokenize.cu | 64 +++--- cpp/src/unary/math_ops.cu | 18 +- cpp/src/unary/unary_ops.cuh | 2 +- cpp/tests/column/column_device_view_test.cu | 24 ++- cpp/tests/column/factories_test.cpp | 12 +- cpp/tests/copying/copy_tests.cu | 29 +-- cpp/tests/copying/gather_struct_tests.cu | 33 ++- cpp/tests/copying/shift_tests.cpp | 14 +- cpp/tests/datetime/datetime_ops_test.cpp | 15 +- cpp/tests/error/error_handling_test.cu | 27 ++- cpp/tests/groupby/group_std_test.cpp | 4 + cpp/tests/groupby/group_var_test.cpp | 4 + cpp/tests/hash_map/map_test.cu | 13 +- cpp/tests/hash_map/multimap_test.cu | 13 +- cpp/tests/io/parquet_test.cpp | 27 +-- cpp/tests/scalar/factories_test.cpp | 13 +- cpp/tests/table/table_view_tests.cu | 17 +- cpp/tests/wrappers/timestamps_test.cu | 17 +- java/src/main/native/src/TableJni.cpp | 8 +- java/src/main/native/src/map_lookup.cu | 2 +- 235 files changed, 3179 insertions(+), 2719 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 25bbb34fbe1..b052722ca65 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -71,6 +71,7 @@ - PR #6612 Update JNI to new RMM cuda_stream_view API - PR #6646 Replace `cudaStream_t` with `rmm::cuda_stream_view` (part 1) - PR #6648 Replace `cudaStream_t` with `rmm::cuda_stream_view` (part 2) +- PR #6744 Replace `cudaStream_t` with `rmm::cuda_stream_view` (part 3) - PR #6579 Update scatter APIs to use reference wrapper / const scalar - PR #6614 Add support for conversion to Pandas nullable dtypes and fix related issue in `cudf.to_json` - PR #6622 Update `to_pandas` api docs diff --git a/cpp/benchmarks/synchronization/synchronization.cpp 
b/cpp/benchmarks/synchronization/synchronization.cpp index a2de31e53d3..c5a88bd6410 100644 --- a/cpp/benchmarks/synchronization/synchronization.cpp +++ b/cpp/benchmarks/synchronization/synchronization.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,13 +15,15 @@ */ #include "synchronization.hpp" + #include +#include #include cuda_event_timer::cuda_event_timer(benchmark::State& state, bool flush_l2_cache, - cudaStream_t stream) + rmm::cuda_stream_view stream) : p_state(&state), stream(stream) { // flush all of L2$ @@ -35,18 +37,19 @@ cuda_event_timer::cuda_event_timer(benchmark::State& state, if (l2_cache_bytes > 0) { const int memset_value = 0; rmm::device_buffer l2_cache_buffer(l2_cache_bytes, stream); - CUDA_TRY(cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream)); + CUDA_TRY( + cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream.value())); } } CUDA_TRY(cudaEventCreate(&start)); CUDA_TRY(cudaEventCreate(&stop)); - CUDA_TRY(cudaEventRecord(start, stream)); + CUDA_TRY(cudaEventRecord(start, stream.value())); } cuda_event_timer::~cuda_event_timer() { - CUDA_TRY(cudaEventRecord(stop, stream)); + CUDA_TRY(cudaEventRecord(stop, stream.value())); CUDA_TRY(cudaEventSynchronize(stop)); float milliseconds = 0.0f; diff --git a/cpp/benchmarks/synchronization/synchronization.hpp b/cpp/benchmarks/synchronization/synchronization.hpp index 9e214907812..5e84e9fb9ae 100644 --- a/cpp/benchmarks/synchronization/synchronization.hpp +++ b/cpp/benchmarks/synchronization/synchronization.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,7 +33,7 @@ for (auto _ : state){ - cudaStream_t stream = 0; + rmm::cuda_stream_view stream{}; // default stream, could be another stream // Create (Construct) an object of this class. You HAVE to pass in the // benchmark::State object you are using. It measures the time from its @@ -44,7 +44,7 @@ cuda_event_timer raii(state, true, stream); // flush_l2_cache = true // Now perform the operations that is to be benchmarked - sample_kernel<<<1, 256, 0, stream>>>(); // Possibly launching a CUDA kernel + sample_kernel<<<1, 256, 0, stream.value()>>>(); // Possibly launching a CUDA kernel } } @@ -61,8 +61,11 @@ // Google Benchmark library #include + #include +#include + #include class cuda_event_timer { @@ -77,7 +80,9 @@ class cuda_event_timer { * every iteration. * @param[in] stream_ The CUDA stream we are measuring time on. **/ - cuda_event_timer(benchmark::State& state, bool flush_l2_cache, cudaStream_t stream_ = 0); + cuda_event_timer(benchmark::State& state, + bool flush_l2_cache, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); // The user must provide a benchmark::State object to set // the timer so we disable the default c'tor. 
@@ -91,7 +96,7 @@ class cuda_event_timer { private: cudaEvent_t start; cudaEvent_t stop; - cudaStream_t stream; + rmm::cuda_stream_view stream; benchmark::State* p_state; }; diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp index cfd637570fe..0af8dd6a500 100644 --- a/cpp/include/cudf/detail/copy.hpp +++ b/cpp/include/cudf/detail/copy.hpp @@ -184,7 +184,7 @@ std::unique_ptr sample( */ std::unique_ptr get_element(column_view const& input, size_type index, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh index afe67540c42..feb7255eec8 100644 --- a/cpp/include/cudf/detail/copy_range.cuh +++ b/cpp/include/cudf/detail/copy_range.cuh @@ -185,7 +185,7 @@ void copy_range(SourceValueIterator source_value_begin, nullptr); } - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); } /** diff --git a/cpp/include/cudf/detail/scatter.cuh b/cpp/include/cudf/detail/scatter.cuh index 1df17585c99..a45c4f86ba4 100644 --- a/cpp/include/cudf/detail/scatter.cuh +++ b/cpp/include/cudf/detail/scatter.cuh @@ -171,10 +171,9 @@ struct column_scatterer_impl { "scatter dictionary keys must be the same type"); // first combine keys so both dictionaries have the same set - auto target_matched = dictionary::detail::add_keys(target, source.keys(), mr, stream.value()); + auto target_matched = dictionary::detail::add_keys(target, source.keys(), stream, mr); auto const target_view = dictionary_column_view(target_matched->view()); - auto source_matched = dictionary::detail::set_keys( - source, target_view.keys(), rmm::mr::get_current_device_resource(), stream.value()); + auto source_matched = dictionary::detail::set_keys(source, target_view.keys(), stream); auto const source_view = dictionary_column_view(source_matched->view()); // now build the new indices by doing a scatter on just the matched 
indices diff --git a/cpp/include/cudf/dictionary/detail/encode.hpp b/cpp/include/cudf/dictionary/detail/encode.hpp index 8a1cff84119..933512efdde 100644 --- a/cpp/include/cudf/dictionary/detail/encode.hpp +++ b/cpp/include/cudf/dictionary/detail/encode.hpp @@ -19,6 +19,8 @@ #include #include +#include + namespace cudf { namespace dictionary { namespace detail { @@ -44,15 +46,15 @@ namespace detail { * * @param column The column to dictionary encode. * @param indices_type The integer type to use for the indices. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return Returns a dictionary column. */ std::unique_ptr encode( column_view const& column, data_type indices_type = data_type{type_id::UINT32}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Create a column by gathering the keys from the provided @@ -65,14 +67,14 @@ std::unique_ptr encode( * ``` * * @param dictionary_column Existing dictionary column. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column with type matching the dictionary_column's keys. 
*/ std::unique_ptr decode( dictionary_column_view const& dictionary_column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Return minimal integer type for the given number of elements. diff --git a/cpp/include/cudf/dictionary/detail/merge.hpp b/cpp/include/cudf/dictionary/detail/merge.hpp index 4b9cb634b74..521d36e229e 100644 --- a/cpp/include/cudf/dictionary/detail/merge.hpp +++ b/cpp/include/cudf/dictionary/detail/merge.hpp @@ -18,6 +18,8 @@ #include #include +#include + namespace cudf { namespace dictionary { namespace detail { @@ -33,15 +35,15 @@ namespace detail { * @param lcol First column. * @param rcol Second column. * @param row_order Indexes for each column. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary column. 
*/ std::unique_ptr merge(dictionary_column_view const& lcol, dictionary_column_view const& rcol, cudf::detail::index_vector const& row_order, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace dictionary diff --git a/cpp/include/cudf/dictionary/detail/update_keys.hpp b/cpp/include/cudf/dictionary/detail/update_keys.hpp index ec6a9af61cf..9d3cc9f90bc 100644 --- a/cpp/include/cudf/dictionary/detail/update_keys.hpp +++ b/cpp/include/cudf/dictionary/detail/update_keys.hpp @@ -19,6 +19,8 @@ #include #include +#include + namespace cudf { namespace dictionary { namespace detail { @@ -31,8 +33,8 @@ namespace detail { std::unique_ptr add_keys( dictionary_column_view const& dictionary_column, column_view const& new_keys, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::dictionary::remove_keys(dictionary_column_view const&,column_view @@ -43,8 +45,8 @@ std::unique_ptr add_keys( std::unique_ptr remove_keys( dictionary_column_view const& dictionary_column, column_view const& keys_to_remove, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::dictionary::remove_unused_keys(dictionary_column_view @@ -54,8 +56,8 @@ std::unique_ptr remove_keys( */ std::unique_ptr remove_unused_keys( dictionary_column_view const& dictionary_column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = 
rmm::mr::get_current_device_resource()); /** * @copydoc cudf::dictionary::set_keys(dictionary_column_view @@ -66,8 +68,8 @@ std::unique_ptr remove_unused_keys( std::unique_ptr set_keys( dictionary_column_view const& dictionary_column, column_view const& keys, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Create new dictionaries that have keys merged from the input dictionaries. @@ -82,8 +84,8 @@ std::unique_ptr set_keys( */ std::vector> match_dictionaries( std::vector input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Create new dictionaries that have keys merged from dictionary columns @@ -106,8 +108,8 @@ std::vector> match_dictionaries( */ std::pair>, std::vector> match_dictionaries( std::vector tables, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace dictionary diff --git a/cpp/include/cudf/dictionary/dictionary_factories.hpp b/cpp/include/cudf/dictionary/dictionary_factories.hpp index fe0b92e7df4..6e5f5fa6539 100644 --- a/cpp/include/cudf/dictionary/dictionary_factories.hpp +++ b/cpp/include/cudf/dictionary/dictionary_factories.hpp @@ -18,6 +18,8 @@ #include #include +#include + namespace cudf { /** * @addtogroup column_factories Factories @@ -54,15 +56,15 @@ namespace cudf { * * @param keys_column Column of unique, ordered values to use as the new dictionary column's keys. 
* @param indices_column Indices to use for the new dictionary column. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary column. */ std::unique_ptr make_dictionary_column( column_view const& keys_column, column_view const& indices_column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a dictionary column by taking ownership of the provided keys @@ -106,15 +108,15 @@ std::unique_ptr make_dictionary_column(std::unique_ptr keys_colu * * @param keys Column of unique, ordered values to use as the new dictionary column's keys. * @param indices Indices values and null-mask to use for the new dictionary column. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary column. 
*/ std::unique_ptr make_dictionary_column( std::unique_ptr keys_column, std::unique_ptr indices_column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp index 8f8aadccde5..fc809b03dfa 100644 --- a/cpp/include/cudf/groupby.hpp +++ b/cpp/include/cudf/groupby.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,8 @@ #include #include +#include + #include #include @@ -222,13 +224,13 @@ class groupby { */ std::pair, std::vector> dispatch_aggregation( std::vector const& requests, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); // Sort-based groupby std::pair, std::vector> sort_aggregate( std::vector const& requests, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); }; /** @} */ diff --git a/cpp/include/cudf/io/avro.hpp b/cpp/include/cudf/io/avro.hpp index 0311b9e92cb..18398ff4ceb 100644 --- a/cpp/include/cudf/io/avro.hpp +++ b/cpp/include/cudf/io/avro.hpp @@ -18,11 +18,11 @@ #include "types.hpp" -#include - #include #include +#include + #include #include #include diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index 2511a366ca8..4b7f3e22601 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -17,7 +17,6 @@ #pragma once #include - #include #include #include diff --git a/cpp/include/cudf/io/data_sink.hpp b/cpp/include/cudf/io/data_sink.hpp index 9f16ffa3105..6c830e31a56 100644 --- a/cpp/include/cudf/io/data_sink.hpp +++ 
b/cpp/include/cudf/io/data_sink.hpp @@ -16,13 +16,15 @@ #pragma once +#include +#include + +#include + #include #include #include -#include -#include - namespace cudf { //! IO interfaces namespace io { @@ -113,7 +115,7 @@ class data_sink { * * @return void **/ - virtual void device_write(void const* gpu_data, size_t size, cudaStream_t stream) + virtual void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) { CUDF_FAIL("data_sink classes that support device_write must override this function."); } diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index a99bac3f7f1..88f2bd187e2 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -16,6 +16,9 @@ #pragma once +#include +#include + #include #include #include @@ -23,9 +26,6 @@ #include -#include -#include - namespace cudf { //! IO interfaces namespace io { diff --git a/cpp/include/cudf/io/detail/avro.hpp b/cpp/include/cudf/io/detail/avro.hpp index c965bfbfb21..40090dbc438 100644 --- a/cpp/include/cudf/io/detail/avro.hpp +++ b/cpp/include/cudf/io/detail/avro.hpp @@ -23,6 +23,8 @@ #include +#include + namespace cudf { namespace io { namespace detail { @@ -71,7 +73,8 @@ class reader { * * @return The set of columns along with table metadata */ - table_with_metadata read(avro_reader_options const &options, cudaStream_t stream = 0); + table_with_metadata read(avro_reader_options const &options, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); }; } // namespace avro } // namespace detail diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp index 8a8d07a353c..7790c2ceee1 100644 --- a/cpp/include/cudf/io/detail/csv.hpp +++ b/cpp/include/cudf/io/detail/csv.hpp @@ -18,6 +18,8 @@ #include +#include + namespace cudf { namespace io { namespace detail { @@ -65,7 +67,7 @@ class reader { * * @return The set of columns along with table metadata */ - table_with_metadata read(cudaStream_t stream = 
0); + table_with_metadata read(rmm::cuda_stream_view stream = rmm::cuda_stream_default); }; class writer { @@ -104,7 +106,7 @@ class writer { */ void write(table_view const &table, const table_metadata *metadata = nullptr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); }; } // namespace csv } // namespace detail diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 62a209b57f4..2176381879a 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -23,6 +23,8 @@ #include +#include + // Forward declarations namespace arrow { namespace io { @@ -77,7 +79,8 @@ class reader { * @param[in] options Settings for controlling reading behavior * @return cudf::table object that contains the array of cudf::column. */ - table_with_metadata read(json_reader_options const &options, cudaStream_t stream = 0); + table_with_metadata read(json_reader_options const &options, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); }; } // namespace json diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp index 509aea4c6e1..15969ac6137 100644 --- a/cpp/include/cudf/io/detail/orc.hpp +++ b/cpp/include/cudf/io/detail/orc.hpp @@ -18,6 +18,8 @@ #include +#include + namespace cudf { namespace io { namespace detail { @@ -66,7 +68,8 @@ class reader { * * @return The set of columns along with table metadata */ - table_with_metadata read(orc_reader_options const& options, cudaStream_t stream = 0); + table_with_metadata read(orc_reader_options const& options, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); }; /** @@ -103,7 +106,7 @@ class writer { */ void write(table_view const& table, const table_metadata* metadata = nullptr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Begins the chunked/streamed write process. 
diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index 586ff497972..1769c72e1c8 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -22,6 +22,8 @@ #include +#include + namespace cudf { namespace io { namespace detail { @@ -70,7 +72,8 @@ class reader { * * @return The set of columns along with table metadata */ - table_with_metadata read(parquet_reader_options const& options, cudaStream_t stream = 0); + table_with_metadata read(parquet_reader_options const& options, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); }; /** @@ -108,12 +111,13 @@ class writer { * @param int96_timestamps If true, write timestamps as INT96 values * @param stream CUDA stream used for device memory operations and kernel launches. */ - std::unique_ptr> write(table_view const& table, - const table_metadata* metadata = nullptr, - bool return_filemetadata = false, - const std::string column_chunks_file_path = "", - bool int96_timestamps = false, - cudaStream_t stream = 0); + std::unique_ptr> write( + table_view const& table, + const table_metadata* metadata = nullptr, + bool return_filemetadata = false, + const std::string column_chunks_file_path = "", + bool int96_timestamps = false, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Begins the chunked/streamed write process. 
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index d4b627dd145..262d79b64c2 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -18,11 +18,11 @@ #include "types.hpp" -#include - #include #include +#include + #include #include diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index a50ab95195d..fe6eda101d8 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -22,6 +22,7 @@ #pragma once #include + #include #include #include diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index 981bc46d046..37847c41339 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -19,6 +19,8 @@ #include #include +#include + #include namespace cudf { @@ -396,7 +398,7 @@ class hash_join { */ hash_join(cudf::table_view const& build, std::vector const& build_on, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Controls where common columns will be output for a inner join. @@ -449,8 +451,8 @@ class hash_join { std::vector> const& columns_in_common, common_columns_output_side common_columns_output_side = common_columns_output_side::PROBE, null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) const; + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; /** * @brief Performs a left join by probing in the internal hash table. 
@@ -479,8 +481,8 @@ class hash_join { std::vector const& probe_on, std::vector> const& columns_in_common, null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) const; + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; /** * @brief Performs a full join by probing in the internal hash table. @@ -509,8 +511,8 @@ class hash_join { std::vector const& probe_on, std::vector> const& columns_in_common, null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) const; + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; private: struct hash_join_impl; diff --git a/cpp/include/cudf/lists/detail/copying.hpp b/cpp/include/cudf/lists/detail/copying.hpp index 6482518303b..cfa1980e665 100644 --- a/cpp/include/cudf/lists/detail/copying.hpp +++ b/cpp/include/cudf/lists/detail/copying.hpp @@ -17,6 +17,8 @@ #include +#include + namespace cudf { namespace lists { namespace detail { @@ -43,7 +45,7 @@ namespace detail { std::unique_ptr copy_slice(lists_column_view const& lists, size_type start, size_type end, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace lists diff --git a/cpp/include/cudf/lists/detail/gather.cuh b/cpp/include/cudf/lists/detail/gather.cuh index 439bd7ab089..b035ae62408 100644 --- a/cpp/include/cudf/lists/detail/gather.cuh +++ b/cpp/include/cudf/lists/detail/gather.cuh @@ -51,7 +51,7 @@ struct gather_data { * @copydoc cudf::make_gather_data(cudf::lists_column_view const& source_column, * MapItType gather_map, * size_type gather_map_size, - * cudaStream_t stream, + * rmm::cuda_stream_view stream, * 
rmm::mr::device_memory_resource* mr) * * @param prev_base_offsets The buffer backing the base offsets used in the gather map. We can diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh index 65bdfb349c8..1de4461f703 100644 --- a/cpp/include/cudf/lists/detail/scatter.cuh +++ b/cpp/include/cudf/lists/detail/scatter.cuh @@ -16,19 +16,23 @@ #pragma once -#include -#include -#include #include #include #include #include #include #include +#include +#include #include #include + #include +#include + +#include + namespace cudf { namespace lists { namespace detail { @@ -385,9 +389,9 @@ struct list_child_constructor { (unbound_list_row.label() == unbound_list_view::label_type::SOURCE ? source_lists : target_lists); auto const list_begin_offset = - bound_column.offsets().element(unbound_list_row.row_index()); + bound_column.offsets().template element(unbound_list_row.row_index()); auto const list_end_offset = - bound_column.offsets().element(unbound_list_row.row_index() + 1); + bound_column.offsets().template element(unbound_list_row.row_index() + 1); #ifndef NDEBUG printf( @@ -493,10 +497,10 @@ struct list_child_constructor { // string_views should now have been populated with source and target references. auto string_offsets = cudf::strings::detail::child_offsets_from_string_iterator( - string_views.begin(), string_views.size(), mr, stream.value()); + string_views.begin(), string_views.size(), stream, mr); auto string_chars = cudf::strings::detail::child_chars_from_string_vector( - string_views, string_offsets->view().template data(), 0, mr, stream.value()); + string_views, string_offsets->view().template data(), 0, stream, mr); auto child_null_mask = source_lists_column_view.child().nullable() || target_lists_column_view.child().nullable() ? 
construct_child_nullmask( @@ -587,7 +591,7 @@ struct list_child_constructor { child_list_views.begin(), [] __device__(auto const& row) { return row.size(); }); auto child_offsets = cudf::strings::detail::make_offsets_child_column( - begin, begin + child_list_views.size(), mr, stream.value()); + begin, begin + child_list_views.size(), stream, mr); auto child_column = cudf::type_dispatcher(source_lists_column_view.child().child(1).type(), @@ -695,7 +699,7 @@ std::unique_ptr scatter( auto list_size_begin = thrust::make_transform_iterator( target_vector.begin(), [] __device__(unbound_list_view l) { return l.size(); }); auto offsets_column = cudf::strings::detail::make_offsets_child_column( - list_size_begin, list_size_begin + target.size(), mr, stream.value()); + list_size_begin, list_size_begin + target.size(), stream, mr); auto child_column = cudf::type_dispatcher(child_column_type, list_child_constructor{}, diff --git a/cpp/include/cudf/scalar/scalar_factories.hpp b/cpp/include/cudf/scalar/scalar_factories.hpp index feade65f31a..5271bed14c8 100644 --- a/cpp/include/cudf/scalar/scalar_factories.hpp +++ b/cpp/include/cudf/scalar/scalar_factories.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,6 +17,8 @@ #include +#include + namespace cudf { /** * @addtogroup scalar_factories @@ -38,7 +40,7 @@ namespace cudf { */ std::unique_ptr make_numeric_scalar( data_type type, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -54,7 +56,7 @@ std::unique_ptr make_numeric_scalar( */ std::unique_ptr make_timestamp_scalar( data_type type, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -70,7 +72,7 @@ std::unique_ptr make_timestamp_scalar( */ std::unique_ptr make_duration_scalar( data_type type, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -86,7 +88,7 @@ std::unique_ptr make_duration_scalar( */ std::unique_ptr make_fixed_width_scalar( data_type type, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -102,7 +104,7 @@ std::unique_ptr make_fixed_width_scalar( */ std::unique_ptr make_string_scalar( std::string const& string, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -125,7 +127,7 @@ std::unique_ptr make_default_constructed_scalar(data_type type); template std::unique_ptr make_fixed_width_scalar( T value, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { return std::make_unique>(value, true, stream, mr); @@ -143,7 +145,7 @@ template std::unique_ptr make_fixed_point_scalar( typename T::rep value, numeric::scale_type scale, - cudaStream_t stream 
= 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { return std::make_unique>(value, scale, true, stream, mr); diff --git a/cpp/include/cudf/strings/copying.hpp b/cpp/include/cudf/strings/copying.hpp index 70aa89fdfea..b4455e2c3b4 100644 --- a/cpp/include/cudf/strings/copying.hpp +++ b/cpp/include/cudf/strings/copying.hpp @@ -18,6 +18,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -50,7 +52,7 @@ std::unique_ptr copy_slice( size_type start, size_type end = -1, size_type step = 1, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/strings/detail/combine.hpp b/cpp/include/cudf/strings/detail/combine.hpp index c45bc9558ed..ed783ca996c 100644 --- a/cpp/include/cudf/strings/detail/combine.hpp +++ b/cpp/include/cudf/strings/detail/combine.hpp @@ -20,6 +20,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -33,7 +35,7 @@ namespace detail { std::unique_ptr concatenate(table_view const& strings_columns, string_scalar const& separator, string_scalar const& narep, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** @@ -45,7 +47,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, std::unique_ptr join_strings(strings_column_view const& strings, string_scalar const& separator, string_scalar const& narep, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); } // namespace detail diff --git a/cpp/include/cudf/strings/detail/converters.hpp b/cpp/include/cudf/strings/detail/converters.hpp index 59348d85473..098dc1a38dc 100644 --- a/cpp/include/cudf/strings/detail/converters.hpp +++ b/cpp/include/cudf/strings/detail/converters.hpp @@ -19,6 +19,8 
@@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -30,7 +32,7 @@ namespace detail { */ std::unique_ptr to_integers(strings_column_view const& strings, data_type output_type, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** @@ -39,7 +41,7 @@ std::unique_ptr to_integers(strings_column_view const& strings, * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr from_integers(column_view const& integers, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** @@ -49,7 +51,7 @@ std::unique_ptr from_integers(column_view const& integers, */ std::unique_ptr to_floats(strings_column_view const& strings, data_type output_type, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** @@ -58,7 +60,7 @@ std::unique_ptr to_floats(strings_column_view const& strings, * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr from_floats(column_view const& floats, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** @@ -69,7 +71,7 @@ std::unique_ptr from_floats(column_view const& floats, */ std::unique_ptr to_booleans(strings_column_view const& strings, string_scalar const& true_string, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** @@ -81,7 +83,7 @@ std::unique_ptr to_booleans(strings_column_view const& strings, std::unique_ptr from_booleans(column_view const& booleans, string_scalar const& true_string, string_scalar const& false_string, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** @@ -93,7 +95,7 @@ std::unique_ptr from_booleans(column_view const& booleans, std::unique_ptr to_timestamps(strings_column_view const& strings, data_type timestamp_type, std::string const& format, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** @@ -104,7 +106,7 @@ std::unique_ptr to_timestamps(strings_column_view const& strings, */ std::unique_ptr from_timestamps(column_view const& timestamps, std::string const& format, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** @@ -116,7 +118,7 @@ std::unique_ptr from_timestamps(column_view const& timestamps, std::unique_ptr to_durations(strings_column_view const& strings, data_type duration_type, std::string const& format, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** @@ -127,7 +129,7 @@ std::unique_ptr to_durations(strings_column_view const& strings, */ std::unique_ptr from_durations(column_view const& durations, std::string const& format, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); } // namespace detail diff --git a/cpp/include/cudf/strings/detail/copy_if_else.cuh 
b/cpp/include/cudf/strings/detail/copy_if_else.cuh index 3433ab7d210..96961feee04 100644 --- a/cpp/include/cudf/strings/detail/copy_if_else.cuh +++ b/cpp/include/cudf/strings/detail/copy_if_else.cuh @@ -60,7 +60,7 @@ std::unique_ptr copy_if_else( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = std::distance(lhs_begin, lhs_end); - if (strings_count == 0) return make_empty_strings_column(mr, stream.value()); + if (strings_count == 0) return make_empty_strings_column(stream, mr); auto execpol = rmm::exec_policy(stream); // create null mask @@ -88,14 +88,13 @@ std::unique_ptr copy_if_else( auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), offsets_transformer); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream.value()); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().template data(); // build chars column - size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; - auto chars_column = - create_chars_child_column(strings_count, null_count, bytes, mr, stream.value()); - auto d_chars = chars_column->mutable_view().template data(); + size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; + auto chars_column = create_chars_child_column(strings_count, null_count, bytes, stream, mr); + auto d_chars = chars_column->mutable_view().template data(); // fill in chars thrust::for_each_n( execpol->on(stream.value()), diff --git a/cpp/include/cudf/strings/detail/copy_range.cuh b/cpp/include/cudf/strings/detail/copy_range.cuh index 563f66ad2c8..68a0c1d7733 100644 --- a/cpp/include/cudf/strings/detail/copy_range.cuh +++ b/cpp/include/cudf/strings/detail/copy_range.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -156,7 +156,7 @@ std::unique_ptr copy_range( source_value_begin, source_validity_begin, d_target, target_begin, target_end}); p_offsets_column = detail::make_offsets_child_column( - string_size_begin, string_size_begin + target.size(), mr, stream.value()); + string_size_begin, string_size_begin + target.size(), stream, mr); } else if (null_count > 0) { // check validities for source only auto string_size_begin = thrust::make_transform_iterator( thrust::make_counting_iterator(0), @@ -164,7 +164,7 @@ std::unique_ptr copy_range( source_value_begin, source_validity_begin, d_target, target_begin, target_end}); p_offsets_column = detail::make_offsets_child_column( - string_size_begin, string_size_begin + target.size(), mr, stream.value()); + string_size_begin, string_size_begin + target.size(), stream, mr); } else { // no need to check validities auto string_size_begin = thrust::make_transform_iterator( thrust::make_counting_iterator(0), @@ -172,7 +172,7 @@ std::unique_ptr copy_range( source_value_begin, source_validity_begin, d_target, target_begin, target_end}); p_offsets_column = detail::make_offsets_child_column( - string_size_begin, string_size_begin + target.size(), mr, stream.value()); + string_size_begin, string_size_begin + target.size(), stream, mr); } // create the chars column @@ -182,7 +182,7 @@ std::unique_ptr copy_range( auto chars_bytes = p_offsets[target.size()]; auto p_chars_column = strings::detail::create_chars_child_column( - target.size(), null_count, chars_bytes, mr, stream.value()); + target.size(), null_count, chars_bytes, stream, mr); // copy to the chars column diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index e70dbd399c9..8f457d9e48f 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -1,5 +1,5 @@ 
/* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -68,7 +68,7 @@ std::unique_ptr gather( { auto output_count = std::distance(begin, end); auto strings_count = strings.size(); - if (output_count == 0) return make_empty_strings_column(mr, stream.value()); + if (output_count == 0) return make_empty_strings_column(stream, mr); auto execpol = rmm::exec_policy(stream); auto strings_column = column_device_view::create(strings.parent(), stream); @@ -82,13 +82,13 @@ std::unique_ptr gather( }; auto offsets_transformer_itr = thrust::make_transform_iterator(begin, offsets_transformer); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + output_count, mr, stream.value()); + offsets_transformer_itr, offsets_transformer_itr + output_count, stream, mr); auto offsets_view = offsets_column->view(); auto d_offsets = offsets_view.template data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[output_count]; - auto chars_column = create_chars_child_column(output_count, 0, bytes, mr, stream.value()); + auto chars_column = create_chars_child_column(output_count, 0, bytes, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.template data(); // fill in chars diff --git a/cpp/include/cudf/strings/detail/merge.cuh b/cpp/include/cudf/strings/detail/merge.cuh index 6bdbce3c933..4a3cde89b30 100644 --- a/cpp/include/cudf/strings/detail/merge.cuh +++ b/cpp/include/cudf/strings/detail/merge.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,6 +24,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -46,12 +48,12 @@ std::unique_ptr merge(strings_column_view const& lhs, strings_column_view const& rhs, row_order_iterator begin, row_order_iterator end, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using cudf::detail::side; size_type strings_count = static_cast(std::distance(begin, end)); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); auto execpol = rmm::exec_policy(stream); auto lhs_column = column_device_view::create(lhs.parent(), stream); auto d_lhs = *lhs_column; @@ -75,16 +77,16 @@ std::unique_ptr merge(strings_column_view const& lhs, }; auto offsets_transformer_itr = thrust::make_transform_iterator(begin, offsets_transformer); auto offsets_column = detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().template data(); // create the chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, mr, stream); + strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); // merge the strings auto d_chars = chars_column->mutable_view().template data(); - thrust::for_each_n(execpol->on(stream), + thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [d_lhs, d_rhs, begin, d_offsets, d_chars] __device__(size_type idx) { diff --git a/cpp/include/cudf/strings/detail/modify_strings.cuh b/cpp/include/cudf/strings/detail/modify_strings.cuh index c90ca4575f8..b2fcb16dbd6 100644 --- 
a/cpp/include/cudf/strings/detail/modify_strings.cuh +++ b/cpp/include/cudf/strings/detail/modify_strings.cuh @@ -43,22 +43,22 @@ namespace detail { * * @param strings Number Column of strings to apply the modifications on; * it is not modified in place; rather a new column is returned instead - * @param mr Device memory resource used to allocate the returned column's device memory. - * (cannot be a default argument because of the variadic pack); * @param stream CUDA stream used for device memory operations and kernel launches. * (cannot be a default argument because of the variadic pack); + * @param mr Device memory resource used to allocate the returned column's device memory. + * (cannot be a default argument because of the variadic pack); * @param ...args Additional arguments to be forwarded to * the probe / execute constructors (can be empty); * @return modified strings column */ template std::unique_ptr modify_strings(strings_column_view const& strings, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr, - cudaStream_t stream, Types&&... 
args) { auto strings_count = strings.size(); - if (strings_count == 0) return detail::make_empty_strings_column(mr, stream); + if (strings_count == 0) return detail::make_empty_strings_column(stream, mr); auto execpol = rmm::exec_policy(stream); @@ -67,8 +67,7 @@ std::unique_ptr modify_strings(strings_column_view const& strings, size_type null_count = strings.null_count(); // copy null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // get the lookup tables used for case conversion device_probe_functor d_probe_fctr{d_column, std::forward(args)...}; @@ -77,7 +76,7 @@ std::unique_ptr modify_strings(strings_column_view const& strings, auto offsets_transformer_itr = thrust::make_transform_iterator(thrust::make_counting_iterator(0), d_probe_fctr); auto offsets_column = detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto offsets_view = offsets_column->view(); auto d_new_offsets = offsets_view.template data(); // not sure why this requires `.template` and the next @@ -86,19 +85,18 @@ std::unique_ptr modify_strings(strings_column_view const& strings, // build the chars column -- convert characters based on case_flag parameter size_type bytes = thrust::device_pointer_cast(d_new_offsets)[strings_count]; auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, mr, stream); + strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.data(); device_execute_functor d_execute_fctr{ d_column, d_new_offsets, d_chars, std::forward(args)...}; - thrust::for_each_n(execpol->on(stream), + 
thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, d_execute_fctr); - // return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), diff --git a/cpp/include/cudf/strings/detail/scatter.cuh b/cpp/include/cudf/strings/detail/scatter.cuh index 9e0497052a6..5d64cb9944b 100644 --- a/cpp/include/cudf/strings/detail/scatter.cuh +++ b/cpp/include/cudf/strings/detail/scatter.cuh @@ -61,7 +61,7 @@ std::unique_ptr scatter( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = target.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream.value()); + if (strings_count == 0) return make_empty_strings_column(stream, mr); // create null mask -- caller must update this rmm::device_buffer null_mask{0, stream, mr}; @@ -75,10 +75,10 @@ std::unique_ptr scatter( rmm::exec_policy(stream)->on(stream.value()), begin, end, scatter_map, target_vector.begin()); // build offsets column - auto offsets_column = child_offsets_from_string_vector(target_vector, mr, stream.value()); + auto offsets_column = child_offsets_from_string_vector(target_vector, stream, mr); // build chars column auto chars_column = child_chars_from_string_vector( - target_vector, offsets_column->view().data(), 0, mr, stream.value()); + target_vector, offsets_column->view().data(), 0, stream, mr); return make_strings_column(strings_count, std::move(offsets_column), diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index 6ec055d0aee..dcc15b00c28 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -14,45 +14,46 @@ * limitations under the License. 
*/ +#include + #include #include #include #include #include #include -#include #include +#include + #include #include -// clang-format off namespace cudf { namespace strings { namespace detail { // Create a strings-type column from vector of pointer/size pairs -template -std::unique_ptr make_strings_column( - IndexPairIterator begin, IndexPairIterator end, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream ) +template +std::unique_ptr make_strings_column(IndexPairIterator begin, + IndexPairIterator end, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - size_type strings_count = thrust::distance(begin,end); - if (strings_count == 0) return strings::detail::make_empty_strings_column(mr, stream); + size_type strings_count = thrust::distance(begin, end); + if (strings_count == 0) return strings::detail::make_empty_strings_column(stream, mr); using string_index_pair = thrust::pair; - auto execpol = rmm::exec_policy(stream); + auto execpol = rmm::exec_policy(stream); + // check total size is not too large for cudf column + auto size_checker = [] __device__(string_index_pair const& item) { + return (item.first != nullptr) ? item.second : 0; + }; size_t bytes = thrust::transform_reduce( - execpol->on(stream), begin, end, - [] __device__(string_index_pair const& item) { - return (item.first != nullptr) ? 
item.second : 0; - }, - 0, - thrust::plus()); + execpol->on(stream.value()), begin, end, size_checker, 0, thrust::plus()); CUDF_EXPECTS(bytes < std::numeric_limits::max(), "total size of strings is too large for cudf column"); @@ -64,30 +65,28 @@ std::unique_ptr make_strings_column( auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), offsets_transformer); auto offsets_column = strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().template data(); // create null mask - auto new_nulls = cudf::detail::valid_if( begin, end, - [] __device__(string_index_pair const item) { return item.first != nullptr; }, - stream, - mr); + auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; }; + auto new_nulls = cudf::detail::valid_if(begin, end, validator, stream, mr); auto null_count = new_nulls.second; rmm::device_buffer null_mask{0, stream, mr}; if (null_count > 0) null_mask = std::move(new_nulls.first); // build chars column auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, mr, stream); - auto d_chars = chars_column->mutable_view().template data(); - thrust::for_each_n(execpol->on(stream), + strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); + auto d_chars = chars_column->mutable_view().template data(); + auto copy_chars = [begin, d_offsets, d_chars] __device__(size_type idx) { + string_index_pair const item = begin[idx]; + if (item.first != nullptr) memcpy(d_chars + d_offsets[idx], item.first, item.second); + }; + thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, - [begin, d_offsets, d_chars] __device__(size_type idx) { - string_index_pair const item = begin[idx]; - if 
(item.first != nullptr) - memcpy(d_chars + d_offsets[idx], item.first, item.second); - }); + copy_chars); return make_strings_column(strings_count, std::move(offsets_column), @@ -101,4 +100,3 @@ std::unique_ptr make_strings_column( } // namespace detail } // namespace strings } // namespace cudf -// clang-format on diff --git a/cpp/include/cudf/strings/detail/utilities.cuh b/cpp/include/cudf/strings/detail/utilities.cuh index 11b5455df04..a5c466ecad2 100644 --- a/cpp/include/cudf/strings/detail/utilities.cuh +++ b/cpp/include/cudf/strings/detail/utilities.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,12 +15,14 @@ */ #pragma once -#include #include #include #include +#include + #include + #include #include @@ -43,8 +45,8 @@ template std::unique_ptr make_offsets_child_column( InputIterator begin, InputIterator end, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(begin < end, "Invalid iterator range"); auto count = thrust::distance(begin, end); @@ -57,8 +59,8 @@ std::unique_ptr make_offsets_child_column( // Rather than manually computing the final offset using values in device memory, // we use inclusive-scan on a shifted output (d_offsets+1) and then set the first // offset values to zero manually.
- thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream), begin, end, d_offsets + 1); - CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(int32_t), stream)); + thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream.value()), begin, end, d_offsets + 1); + CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(int32_t), stream.value())); return offsets_column; } @@ -76,12 +78,12 @@ template std::unique_ptr child_offsets_from_string_iterator( Iter strings_begin, cudf::size_type num_strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto transformer = [] __device__(string_view v) { return v.size_bytes(); }; auto begin = thrust::make_transform_iterator(strings_begin, transformer); - return make_offsets_child_column(begin, begin + num_strings, mr, stream); + return make_offsets_child_column(begin, begin + num_strings, stream, mr); } // This template is a thin wrapper around per-context singleton objects. diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp index 4e313a12121..c3b953b4211 100644 --- a/cpp/include/cudf/strings/detail/utilities.hpp +++ b/cpp/include/cudf/strings/detail/utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,11 +15,12 @@ */ #pragma once -#include -#include #include #include +#include +#include + namespace cudf { namespace strings { namespace detail { @@ -30,27 +31,27 @@ namespace detail { * @param strings_count Number of strings in the column. * @param null_count Number of null string entries in the column. * @param bytes Number of bytes for the chars column. 
- * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return The chars child column for a strings column. */ std::unique_ptr create_chars_child_column( size_type strings_count, size_type null_count, size_type bytes, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Create a strings column with no strings. * - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return Empty strings column */ std::unique_ptr make_empty_strings_column( - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Creates a string_view vector from a strings column. @@ -59,21 +60,21 @@ std::unique_ptr make_empty_strings_column( * @param stream CUDA stream used for device memory operations and kernel launches. * @return Device vector of string_views */ -rmm::device_vector create_string_vector_from_column(cudf::strings_column_view strings, - cudaStream_t stream = 0); +rmm::device_vector create_string_vector_from_column( + cudf::strings_column_view strings, rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Creates an offsets column from a string_view vector. 
* * @param strings Strings column - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return Child offsets column */ std::unique_ptr child_offsets_from_string_vector( const rmm::device_vector& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Creates a chars column from a string_view vector. @@ -81,16 +82,16 @@ std::unique_ptr child_offsets_from_string_vector( * @param strings Strings vector * @param d_offsets Offsets vector for placing strings into column's memory. * @param null_count Number of null strings. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
* @return Child chars column */ std::unique_ptr child_chars_from_string_vector( const rmm::device_vector& strings, const int32_t* d_offsets, cudf::size_type null_count, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace strings diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index b6ae22f6b6a..f30316eda10 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include +#include /** * @file @@ -114,7 +115,7 @@ void print(strings_column_view const& strings, */ std::pair, rmm::device_vector> create_offsets( strings_column_view const& strings, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace strings diff --git a/cpp/include/cudf/table/table_device_view.cuh b/cpp/include/cudf/table/table_device_view.cuh index 76d2e57597f..f34a265a50a 100644 --- a/cpp/include/cudf/table/table_device_view.cuh +++ b/cpp/include/cudf/table/table_device_view.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 3e3fe6ad719..dc7635928a7 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -37,11 +37,6 @@ * **/ -/** - * @brief Forward declaration of cudaStream_t - **/ -using cudaStream_t = struct CUstream_st*; - namespace bit_mask { using bit_mask_t = uint32_t; } diff --git a/cpp/include/cudf/utilities/error.hpp b/cpp/include/cudf/utilities/error.hpp index c244462a390..0cdf0e7fe7b 100644 --- a/cpp/include/cudf/utilities/error.hpp +++ b/cpp/include/cudf/utilities/error.hpp @@ -125,7 +125,11 @@ inline void throw_cuda_error(cudaError_t error, const char* file, unsigned int l * **/ #ifndef NDEBUG -#define CHECK_CUDA(stream) CUDA_TRY(cudaStreamSynchronize(stream)); +#define CHECK_CUDA(stream) \ + do { \ + CUDA_TRY(cudaStreamSynchronize(stream)); \ + CUDA_TRY(cudaPeekAtLastError()); \ + } while (0); #else #define CHECK_CUDA(stream) CUDA_TRY(cudaPeekAtLastError()); #endif diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index df9c679c15d..9c009ce5b60 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -18,13 +18,13 @@ #include #include +#include #include #include #include #include #include -#include "cudf/structs/struct_view.hpp" namespace cudf { diff --git a/cpp/include/nvtext/detail/load_hash_file.hpp b/cpp/include/nvtext/detail/load_hash_file.hpp index 8eced8ca056..a75ae3d6181 100644 --- a/cpp/include/nvtext/detail/load_hash_file.hpp +++ b/cpp/include/nvtext/detail/load_hash_file.hpp @@ -15,9 +15,12 @@ */ #pragma once -#include #include +#include + +#include + #include #include @@ -38,7 +41,7 @@ namespace detail { * @return vocabulary hash-table elements */ hashed_vocabulary load_vocabulary_file(std::string const& filename_hashed_vocabulary, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); } // namespace detail diff --git 
a/cpp/include/nvtext/detail/tokenize.hpp b/cpp/include/nvtext/detail/tokenize.hpp index 9dd06f17ce8..8b74c9cde94 100644 --- a/cpp/include/nvtext/detail/tokenize.hpp +++ b/cpp/include/nvtext/detail/tokenize.hpp @@ -19,6 +19,8 @@ #include #include +#include + namespace nvtext { namespace detail { /** @@ -35,8 +37,8 @@ namespace detail { std::unique_ptr tokenize( cudf::strings_column_view const& strings, cudf::string_scalar const& delimiter = cudf::string_scalar{""}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc nvtext::tokenize(strings_column_view const&,strings_column_view @@ -51,8 +53,8 @@ std::unique_ptr tokenize( std::unique_ptr tokenize( cudf::strings_column_view const& strings, cudf::strings_column_view const& delimiters, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc nvtext::count_tokens(strings_column_view const&, string_scalar @@ -68,8 +70,8 @@ std::unique_ptr tokenize( std::unique_ptr count_tokens( cudf::strings_column_view const& strings, cudf::string_scalar const& delimiter = cudf::string_scalar{""}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc nvtext::count_tokens(strings_column_view const&,strings_column_view @@ -84,8 +86,8 @@ std::unique_ptr count_tokens( std::unique_ptr count_tokens( cudf::strings_column_view const& strings, cudf::strings_column_view const& delimiters, - rmm::mr::device_memory_resource* mr = 
rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace nvtext diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 06f969a9d43..ae0395913cc 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -142,7 +142,7 @@ void set_null_mask(bitmask_type *bitmask, cudf::detail::grid_1d config(number_of_mask_words, 256); set_null_mask_kernel<<>>( static_cast(bitmask), begin_bit, end_bit, valid, number_of_mask_words); - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); } } @@ -604,7 +604,7 @@ std::vector segmented_count_set_bits(bitmask_type const *bitmask, last_word_indices, stream.value())); - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); // third, adjust counts in segment boundaries (if segments are not // word-aligned) @@ -619,7 +619,7 @@ std::vector segmented_count_set_bits(bitmask_type const *bitmask, stream.value()>>>( bitmask, num_ranges, d_first_indices.begin(), d_last_indices.begin(), d_null_counts.begin()); - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); std::vector ret(num_ranges); CUDA_TRY(cudaMemcpyAsync(ret.data(), diff --git a/cpp/src/column/column.cu b/cpp/src/column/column.cu index b64f88291b7..f72a65f2348 100644 --- a/cpp/src/column/column.cu +++ b/cpp/src/column/column.cu @@ -268,13 +268,12 @@ struct create_column_from_view { auto num_rows = children.empty() ? 
0 : children.front()->size(); - return make_structs_column( - num_rows, - std::move(children), - view.null_count(), - cudf::detail::copy_bitmask(view.null_mask(), begin, end, rmm::cuda_stream_view{stream}, mr), - stream.value(), - mr); + return make_structs_column(num_rows, + std::move(children), + view.null_count(), + cudf::detail::copy_bitmask(view.null_mask(), begin, end, stream, mr), + stream.value(), + mr); } }; } // anonymous namespace diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 0719af9756b..38466de12c5 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -608,6 +608,7 @@ BufInfo build_output_columns(InputIter begin, src.child_begin(), src.child_end(), current_info, std::back_inserter(children), base_ptr); return column_view{src.type(), size, data_ptr, bitmask_ptr, null_count, 0, std::move(children)}; }); + return current_info; } diff --git a/cpp/src/copying/copy_range.cu b/cpp/src/copying/copy_range.cu index ff532059108..d41c0a2fa74 100644 --- a/cpp/src/copying/copy_range.cu +++ b/cpp/src/copying/copy_range.cu @@ -187,10 +187,9 @@ std::unique_ptr out_of_place_copy_range_dispatch::operator()view()); - auto source_matched = cudf::dictionary::detail::set_keys( - dict_source, target_view.keys(), rmm::mr::get_current_device_resource(), stream.value()); + auto source_matched = cudf::dictionary::detail::set_keys(dict_source, target_view.keys(), stream); auto const source_view = cudf::dictionary_column_view(source_matched->view()); // build the new indices by calling in_place_copy_range on just the indices diff --git a/cpp/src/copying/get_element.cu b/cpp/src/copying/get_element.cu index a6da491f672..94e6be49e9d 100644 --- a/cpp/src/copying/get_element.cu +++ b/cpp/src/copying/get_element.cu @@ -16,11 +16,12 @@ #include #include +#include #include #include #include -#include +#include namespace cudf { namespace detail { @@ -32,7 +33,7 @@ struct get_element_functor { 
std::unique_ptr operator()( column_view const &input, size_type index, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { auto s = make_fixed_width_scalar(data_type(type_to_id()), stream, mr); @@ -56,7 +57,7 @@ struct get_element_functor { std::unique_ptr operator()( column_view const &input, size_type index, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { auto device_col = column_device_view::create(input, stream); @@ -81,7 +82,7 @@ struct get_element_functor { std::unique_ptr operator()( column_view const &input, size_type index, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { auto dict_view = dictionary_column_view(input); @@ -109,7 +110,7 @@ struct get_element_functor { std::unique_ptr operator()( column_view const &input, size_type index, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { CUDF_FAIL("get_element_functor not supported for list_view"); @@ -119,7 +120,7 @@ struct get_element_functor { std::unique_ptr operator()( column_view const &input, size_type index, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { CUDF_FAIL("get_element_functor not supported for decimal32"); @@ -129,7 +130,7 @@ struct get_element_functor { std::unique_ptr operator()( column_view const &input, size_type index, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { CUDF_FAIL("get_element_functor not supported for decimal64"); @@ -139,7 +140,7 @@ struct get_element_functor { std::unique_ptr operator()( column_view const &input, size_type index, - 
cudaStream_t stream = 0, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { CUDF_FAIL("get_element_functor not supported for struct_view"); @@ -150,7 +151,7 @@ struct get_element_functor { std::unique_ptr get_element(column_view const &input, size_type index, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { CUDF_EXPECTS(index >= 0 and index < input.size(), "Index out of bounds"); @@ -163,7 +164,7 @@ std::unique_ptr get_element(column_view const &input, size_type index, rmm::mr::device_memory_resource *mr) { - return detail::get_element(input, index, 0, mr); + return detail::get_element(input, index, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index d8beb052f8f..90ff5ff3025 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -180,8 +180,8 @@ struct column_scalar_scatterer_impl { auto dict_target = dictionary::detail::add_keys(dictionary_column_view(target), make_column_from_scalar(source.get(), 1, stream)->view(), - mr, - stream.value()); + stream, + mr); auto dict_view = dictionary_column_view(dict_target->view()); auto scalar_index = dictionary::detail::get_index(dict_view, source.get(), stream); auto scalar_iter = thrust::make_permutation_iterator( diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index ce2df92efc0..e4989c743ef 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -137,16 +137,16 @@ struct launch_functor { template typename std::enable_if_t::value, void> operator()( - cudaStream_t stream) const + rmm::cuda_stream_view stream) const { CUDF_FAIL("Cannot extract datetime component from non-timestamp column."); } template typename std::enable_if_t::value, void> operator()( - cudaStream_t stream) const + rmm::cuda_stream_view stream) const { - 
thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), input.begin(), input.end(), output.begin(), @@ -157,7 +157,7 @@ struct launch_functor { // Create an output column by applying the functor to every element from the input column template std::unique_ptr apply_datetime_op(column_view const& column, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(is_timestamp(column.type()), "Column type should be timestamp"); @@ -167,13 +167,12 @@ std::unique_ptr apply_datetime_op(column_view const& column, // Return an empty column if source column is empty if (size == 0) return make_empty_column(output_col_type); - auto output = - make_fixed_width_column(output_col_type, - size, - cudf::detail::copy_bitmask(column, rmm::cuda_stream_view{stream}, mr), - column.null_count(), - stream, - mr); + auto output = make_fixed_width_column(output_col_type, + size, + cudf::detail::copy_bitmask(column, stream, mr), + column.null_count(), + stream, + mr); auto launch = launch_functor::type>{ column, static_cast(*output)}; @@ -211,16 +210,16 @@ struct add_calendrical_months_functor { template typename std::enable_if_t::value, void> operator()( - cudaStream_t stream) const + rmm::cuda_stream_view stream) const { CUDF_FAIL("Cannot extract datetime component from non-timestamp column."); } template typename std::enable_if_t::value, void> operator()( - cudaStream_t stream) const + rmm::cuda_stream_view stream) const { - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), timestamp_column.begin(), timestamp_column.end(), months_column.begin(), @@ -253,7 +252,7 @@ struct add_calendrical_months_functor { std::unique_ptr add_calendrical_months(column_view const& timestamp_column, column_view const& months_column, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { 
CUDF_EXPECTS(is_timestamp(timestamp_column.type()), "Column type should be timestamp"); diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index 79effe3fc97..daf1bb76916 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -49,8 +49,8 @@ namespace detail { std::unique_ptr add_keys( dictionary_column_view const& dictionary_column, column_view const& new_keys, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(!new_keys.has_nulls(), "Keys must not have nulls"); auto old_keys = dictionary_column.keys(); // [a,b,c,d,f] @@ -116,11 +116,10 @@ std::unique_ptr add_keys( // create new dictionary column with keys_column and indices_column // null mask has not changed - return make_dictionary_column( - std::move(keys_column), - std::move(indices_column), - cudf::detail::copy_bitmask(dictionary_column.parent(), rmm::cuda_stream_view{stream}, mr), - dictionary_column.null_count()); + return make_dictionary_column(std::move(keys_column), + std::move(indices_column), + cudf::detail::copy_bitmask(dictionary_column.parent(), stream, mr), + dictionary_column.null_count()); } } // namespace detail @@ -130,7 +129,7 @@ std::unique_ptr add_keys(dictionary_column_view const& dictionary_column rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::add_keys(dictionary_column, keys, mr); + return detail::add_keys(dictionary_column, keys, rmm::cuda_stream_default, mr); } } // namespace dictionary diff --git a/cpp/src/dictionary/decode.cu b/cpp/src/dictionary/decode.cu index 913da30df16..3822edfc9ef 100644 --- a/cpp/src/dictionary/decode.cu +++ b/cpp/src/dictionary/decode.cu @@ -33,8 +33,8 @@ namespace detail { * @brief Decode a column from a dictionary. 
*/ std::unique_ptr decode(dictionary_column_view const& source, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (source.is_empty()) return make_empty_column(data_type{type_id::EMPTY}); @@ -55,9 +55,8 @@ std::unique_ptr decode(dictionary_column_view const& source, auto output_column = std::unique_ptr(std::move(table_column.front())); // apply any nulls to the output column - output_column->set_null_mask( - cudf::detail::copy_bitmask(source.parent(), rmm::cuda_stream_view{stream}, mr), - source.null_count()); + output_column->set_null_mask(cudf::detail::copy_bitmask(source.parent(), stream, mr), + source.null_count()); return output_column; } @@ -68,7 +67,7 @@ std::unique_ptr decode(dictionary_column_view const& source, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::decode(source, mr); + return detail::decode(source, rmm::cuda_stream_default, mr); } } // namespace dictionary diff --git a/cpp/src/dictionary/detail/merge.cu b/cpp/src/dictionary/detail/merge.cu index 6448d711db1..6a2b7f71ae3 100644 --- a/cpp/src/dictionary/detail/merge.cu +++ b/cpp/src/dictionary/detail/merge.cu @@ -22,6 +22,8 @@ #include #include +#include + namespace cudf { namespace dictionary { namespace detail { @@ -29,8 +31,8 @@ namespace detail { std::unique_ptr merge(dictionary_column_view const& lcol, dictionary_column_view const& rcol, cudf::detail::index_vector const& row_order, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto const lcol_iter = cudf::detail::indexalator_factory::make_input_iterator(lcol.indices()); auto const rcol_iter = cudf::detail::indexalator_factory::make_input_iterator(rcol.indices()); @@ -44,7 +46,7 @@ std::unique_ptr merge(dictionary_column_view const& lcol, cudf::detail::indexalator_factory::make_output_iterator(indices_column->mutable_view()); // merge the 
input indices columns into the output column - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), row_order.begin(), row_order.end(), output_iter, diff --git a/cpp/src/dictionary/dictionary_factories.cu b/cpp/src/dictionary/dictionary_factories.cu index 17a09e26f7b..73d1becf639 100644 --- a/cpp/src/dictionary/dictionary_factories.cu +++ b/cpp/src/dictionary/dictionary_factories.cu @@ -29,8 +29,8 @@ namespace { struct dispatch_create_indices { template ()>* = nullptr> std::unique_ptr operator()(column_view const& indices, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(std::is_unsigned(), "indices must be an unsigned type"); column_view indices_view{ @@ -39,8 +39,8 @@ struct dispatch_create_indices { } template ()>* = nullptr> std::unique_ptr operator()(column_view const&, - rmm::mr::device_memory_resource*, - cudaStream_t) + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) { CUDF_FAIL("indices must be an integer type."); } @@ -49,19 +49,18 @@ struct dispatch_create_indices { std::unique_ptr make_dictionary_column(column_view const& keys_column, column_view const& indices_column, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(!keys_column.has_nulls(), "keys column must not have nulls"); if (keys_column.is_empty()) return make_empty_column(data_type{type_id::DICTIONARY32}); auto keys_copy = std::make_unique(keys_column, stream, mr); auto indices_copy = - type_dispatcher(indices_column.type(), dispatch_create_indices{}, indices_column, mr, stream); + type_dispatcher(indices_column.type(), dispatch_create_indices{}, indices_column, stream, mr); rmm::device_buffer null_mask{0, stream, mr}; auto null_count = indices_column.null_count(); - if (null_count) - null_mask = 
detail::copy_bitmask(indices_column, rmm::cuda_stream_view{stream}, mr); + if (null_count) null_mask = detail::copy_bitmask(indices_column, stream, mr); std::vector> children; children.emplace_back(std::move(indices_copy)); @@ -117,8 +116,8 @@ struct make_unsigned_fn { std::unique_ptr make_dictionary_column(std::unique_ptr keys, std::unique_ptr indices, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(!keys->has_nulls(), "keys column must not have nulls"); diff --git a/cpp/src/dictionary/encode.cu b/cpp/src/dictionary/encode.cu index 129c9345d4b..501e034c5fe 100644 --- a/cpp/src/dictionary/encode.cu +++ b/cpp/src/dictionary/encode.cu @@ -26,6 +26,7 @@ #include #include #include + #include namespace cudf { @@ -38,8 +39,8 @@ namespace detail { */ std::unique_ptr encode(column_view const& input_column, data_type indices_type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(is_unsigned(indices_type), "indices must be type unsigned integer"); CUDF_EXPECTS(input_column.type().id() != type_id::DICTIONARY32, @@ -63,11 +64,10 @@ std::unique_ptr encode(column_view const& input_column, indices_column = cudf::detail::cast(indices_column->view(), indices_type, stream, mr); // create column with keys_column and indices_column - return make_dictionary_column( - std::move(keys_column), - std::move(indices_column), - cudf::detail::copy_bitmask(input_column, rmm::cuda_stream_view{stream}, mr), - input_column.null_count()); + return make_dictionary_column(std::move(keys_column), + std::move(indices_column), + cudf::detail::copy_bitmask(input_column, stream, mr), + input_column.null_count()); } /** @@ -89,7 +89,7 @@ std::unique_ptr encode(column_view const& input_column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::encode(input_column, indices_type, mr); + 
return detail::encode(input_column, indices_type, rmm::cuda_stream_default, mr); } } // namespace dictionary diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu index f0f86a3dd1a..b36b110b13f 100644 --- a/cpp/src/dictionary/remove_keys.cu +++ b/cpp/src/dictionary/remove_keys.cu @@ -26,6 +26,8 @@ #include #include +#include + #include #include #include @@ -53,8 +55,8 @@ template std::unique_ptr remove_keys_fn( dictionary_column_view const& dictionary_column, KeysKeeper keys_to_keep_fn, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto const keys_view = dictionary_column.keys(); auto execpol = rmm::exec_policy(stream); @@ -67,7 +69,7 @@ std::unique_ptr remove_keys_fn( auto map_itr = cudf::detail::indexalator_factory::make_output_iterator(map_indices->mutable_view()); // init to max to identify new nulls - thrust::fill(execpol->on(stream), + thrust::fill(execpol->on(stream.value()), map_itr, map_itr + keys_view.size(), max_size); // all valid indices are less than this value @@ -79,7 +81,7 @@ std::unique_ptr remove_keys_fn( auto positions = make_fixed_width_column( indices_type, keys_view.size(), cudf::mask_state::UNALLOCATED, stream); auto itr = cudf::detail::indexalator_factory::make_output_iterator(positions->mutable_view()); - thrust::sequence(execpol->on(stream), itr, itr + keys_view.size()); + thrust::sequence(execpol->on(stream.value()), itr, itr + keys_view.size()); return positions; }(); // copy the non-removed keys ( keys_to_keep_fn(idx)==true ) @@ -93,7 +95,7 @@ std::unique_ptr remove_keys_fn( cudf::detail::indexalator_factory::make_input_iterator(keys_positions->view()); // build indices mapper // Example scatter([0,1,2][0,2,4][max,max,max,max,max]) => [0,max,1,max,2] - thrust::scatter(execpol->on(stream), + 
thrust::scatter(execpol->on(stream.value()), positions_itr, positions_itr + filtered_view.size(), filtered_itr, @@ -145,8 +147,8 @@ std::unique_ptr remove_keys_fn( std::unique_ptr remove_keys( dictionary_column_view const& dictionary_column, column_view const& keys_to_remove, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(!keys_to_remove.has_nulls(), "keys_to_remove must not have nulls"); auto const keys_view = dictionary_column.keys(); @@ -157,13 +159,13 @@ std::unique_ptr remove_keys( auto d_matches = matches->view().data(); // call common utility method to keep the keys not matched to keys_to_remove auto key_matcher = [d_matches] __device__(size_type idx) { return !d_matches[idx]; }; - return remove_keys_fn(dictionary_column, key_matcher, mr, stream); + return remove_keys_fn(dictionary_column, key_matcher, stream, mr); } std::unique_ptr remove_unused_keys( dictionary_column_view const& dictionary_column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { // locate the keys to remove auto const keys_size = dictionary_column.keys_size(); @@ -174,7 +176,7 @@ std::unique_ptr remove_unused_keys( // build keys index to verify against indices values rmm::device_uvector keys_positions(keys_size, stream); thrust::sequence( - rmm::exec_policy(stream)->on(stream), keys_positions.begin(), keys_positions.end()); + rmm::exec_policy(stream)->on(stream.value()), keys_positions.begin(), keys_positions.end()); // wrap the indices for comparison in contains() column_view keys_positions_view(data_type{type_id::UINT32}, keys_size, keys_positions.data()); return 
cudf::detail::contains(keys_positions_view, indices_view, stream, mr); @@ -183,7 +185,7 @@ std::unique_ptr remove_unused_keys( // call common utility method to keep the keys that match auto key_matcher = [d_matches] __device__(size_type idx) { return d_matches[idx]; }; - return remove_keys_fn(dictionary_column, key_matcher, mr, stream); + return remove_keys_fn(dictionary_column, key_matcher, stream, mr); } } // namespace detail @@ -195,14 +197,14 @@ std::unique_ptr remove_keys(dictionary_column_view const& dictionary_col rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::remove_keys(dictionary_column, keys_to_remove, mr); + return detail::remove_keys(dictionary_column, keys_to_remove, rmm::cuda_stream_default, mr); } std::unique_ptr remove_unused_keys(dictionary_column_view const& dictionary_column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::remove_unused_keys(dictionary_column, mr); + return detail::remove_unused_keys(dictionary_column, rmm::cuda_stream_default, mr); } } // namespace dictionary diff --git a/cpp/src/dictionary/replace.cu b/cpp/src/dictionary/replace.cu index 60e7c496e06..6db30c9765d 100644 --- a/cpp/src/dictionary/replace.cu +++ b/cpp/src/dictionary/replace.cu @@ -127,7 +127,7 @@ std::unique_ptr replace_indices(column_view const& input, /** * @copydoc cudf::dictionary::detail::replace_nulls(cudf::column_view const&,cudf::column_view - * const&,rmm::mr::device_memory_resource*,cudaStream_t) + * const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) */ std::unique_ptr replace_nulls(dictionary_column_view const& input, dictionary_column_view const& replacement, @@ -140,7 +140,7 @@ std::unique_ptr replace_nulls(dictionary_column_view const& input, CUDF_EXPECTS(replacement.size() == input.size(), "column sizes must match"); // first combine the keys so both input dictionaries have the same set - auto matched = match_dictionaries({input, replacement}, mr, stream.value()); + auto matched = 
match_dictionaries({input, replacement}, stream, mr); // now build the new indices by doing replace-null using the updated input indices auto const input_indices = @@ -152,16 +152,13 @@ std::unique_ptr replace_nulls(dictionary_column_view const& input, : replace_indices( input_indices, make_nullable_index_iterator(repl_indices), stream, mr); - // auto keys_column = ; - return make_dictionary_column(std::move(matched.front()->release().children.back()), - std::move(new_indices), - mr, - stream.value()); + return make_dictionary_column( + std::move(matched.front()->release().children.back()), std::move(new_indices), stream, mr); } /** * @copydoc cudf::dictionary::detail::replace_nulls(cudf::column_view const&,cudf::scalar - * const&,rmm::mr::device_memory_resource*,cudaStream_t) + * const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) */ std::unique_ptr replace_nulls(dictionary_column_view const& input, scalar const& replacement, @@ -175,11 +172,10 @@ std::unique_ptr replace_nulls(dictionary_column_view const& input, CUDF_EXPECTS(input.keys().type() == replacement.type(), "keys must match scalar type"); // first add the replacment to the keys so only the indices need to be processed - auto const default_mr = rmm::mr::get_current_device_resource(); - auto input_matched = dictionary::detail::add_keys( - input, make_column_from_scalar(replacement, 1, stream, default_mr)->view(), mr, stream.value()); + auto input_matched = dictionary::detail::add_keys( + input, make_column_from_scalar(replacement, 1, stream)->view(), stream, mr); auto const input_view = dictionary_column_view(input_matched->view()); - auto const scalar_index = get_index(input_view, replacement, stream, default_mr); + auto const scalar_index = get_index(input_view, replacement, stream); // now build the new indices by doing replace-null on the updated indices auto const input_indices = input_view.get_indices_annotated(); @@ -187,10 +183,8 @@ std::unique_ptr replace_nulls(dictionary_column_view 
const& input, replace_indices(input_indices, make_scalar_iterator(*scalar_index), stream, mr); new_indices->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); - return make_dictionary_column(std::move(input_matched->release().children.back()), - std::move(new_indices), - mr, - stream.value()); + return make_dictionary_column( + std::move(input_matched->release().children.back()), std::move(new_indices), stream, mr); } } // namespace detail diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index 69fdcd85b35..6889a265c5a 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -28,6 +28,8 @@ #include #include +#include + #include #include #include @@ -49,8 +51,8 @@ struct dispatch_compute_indices { std::unique_ptr> operator()(dictionary_column_view const& input, column_view const& new_keys, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto dictionary_view = column_device_view::create(input.parent(), stream); auto d_dictionary = *dictionary_view; @@ -72,7 +74,7 @@ struct dispatch_compute_indices { mr); auto result_itr = cudf::detail::indexalator_factory::make_output_iterator(result->mutable_view()); - thrust::lower_bound(rmm::exec_policy(stream)->on(stream), + thrust::lower_bound(rmm::exec_policy(stream)->on(stream.value()), new_keys_view->begin(), new_keys_view->end(), dictionary_itr, @@ -88,8 +90,8 @@ struct dispatch_compute_indices { std::unique_ptr> operator()(dictionary_column_view const& input, column_view const& new_keys, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("list_view dictionary set_keys not supported yet"); } @@ -101,8 +103,8 @@ struct dispatch_compute_indices { std::unique_ptr set_keys( dictionary_column_view const& dictionary_column, column_view const& new_keys, - rmm::mr::device_memory_resource* mr = 
rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(!new_keys.has_nulls(), "keys parameter must not have nulls"); auto keys = dictionary_column.keys(); @@ -140,8 +142,8 @@ std::unique_ptr set_keys( dispatch_compute_indices{}, dictionary_column, keys_column->view(), - mr, - stream); + stream, + mr); // create column with keys_column and indices_column return make_dictionary_column(std::move(keys_column), @@ -151,8 +153,8 @@ std::unique_ptr set_keys( } std::vector> match_dictionaries(std::vector input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { std::vector keys(input.size()); std::transform(input.begin(), input.end(), keys.begin(), [](auto& col) { return col.keys(); }); @@ -160,13 +162,13 @@ std::vector> match_dictionaries(std::vectorview(); std::vector> result(input.size()); std::transform(input.begin(), input.end(), result.begin(), [keys_view, mr, stream](auto& col) { - return set_keys(col, keys_view, mr, stream); + return set_keys(col, keys_view, stream, mr); }); return result; } std::pair>, std::vector> match_dictionaries( - std::vector tables, rmm::mr::device_memory_resource* mr, cudaStream_t stream) + std::vector tables, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { // Make a copy of all the column views from each table_view std::vector> updated_columns; @@ -188,7 +190,7 @@ std::pair>, std::vector> match_d return dictionary_column_view(t.column(col_idx)); }); // now match the keys in these dictionary columns - auto dict_cols = dictionary::detail::match_dictionaries(dict_views, mr, stream); + auto dict_cols = dictionary::detail::match_dictionaries(dict_views, stream, mr); // replace the updated_columns vector entries for the set of columns at col_idx auto dict_col_idx = 0; for (auto& v : updated_columns) 
v[col_idx] = dict_cols[dict_col_idx++]->view(); @@ -218,7 +220,7 @@ std::unique_ptr set_keys(dictionary_column_view const& dictionary_column rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::set_keys(dictionary_column, keys, mr); + return detail::set_keys(dictionary_column, keys, rmm::cuda_stream_default, mr); } } // namespace dictionary diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu index 77482e13b6c..a9d8a61d88f 100644 --- a/cpp/src/filling/fill.cu +++ b/cpp/src/filling/fill.cu @@ -176,7 +176,7 @@ std::unique_ptr out_of_place_fill_range_dispatch::operator()view(), mr, stream.value()); + cudf::dictionary::detail::add_keys(target, scalar_column->view(), stream, mr); cudf::column_view const target_indices = cudf::dictionary_column_view(target_matched->view()).get_indices_annotated(); diff --git a/cpp/src/filling/repeat.cu b/cpp/src/filling/repeat.cu index 224f6dfe3a0..46b16ac0949 100644 --- a/cpp/src/filling/repeat.cu +++ b/cpp/src/filling/repeat.cu @@ -46,7 +46,8 @@ struct count_accessor { cudf::scalar const* p_scalar = nullptr; template - std::enable_if_t::value, cudf::size_type> operator()(cudaStream_t stream = 0) + std::enable_if_t::value, cudf::size_type> operator()( + rmm::cuda_stream_view stream) { using ScalarType = cudf::scalar_type_t; #if 1 @@ -63,7 +64,8 @@ struct count_accessor { } template - std::enable_if_t::value, cudf::size_type> operator()(cudaStream_t stream) + std::enable_if_t::value, cudf::size_type> operator()( + rmm::cuda_stream_view) { CUDF_FAIL("count value should be a integral type."); } diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 6a004393b83..3df6e0ece85 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -55,7 +55,7 @@ groupby::groupby(table_view const& keys, // Select hash vs. sort groupby implementation std::pair, std::vector> groupby::dispatch_aggregation( std::vector const& requests, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { // If sort groupby has been called once on this groupby object, then diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 0a56563cf87..3ef97d431cd 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -223,12 +223,12 @@ auto create_hash_map(table_device_view const& d_keys, row_equality_comparator rows_equal{d_keys, d_keys, null_keys_are_equal}; return map_type::create(compute_hash_table_size(d_keys.num_rows()), + stream, unused_key, unused_value, hasher, rows_equal, - allocator_type(), - stream.value()); + allocator_type()); } /** @@ -273,8 +273,8 @@ void compute_single_pass_aggs(table_view const& keys, cudf::detail::initialize_with_identity(table_view, aggs, stream); // prepare to launch kernel to do the actual aggregation - auto d_sparse_table = mutable_table_device_view::create(sparse_table); - auto d_values = table_device_view::create(flattened_values); + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + auto d_values = table_device_view::create(flattened_values, stream); rmm::device_vector d_aggs(aggs); bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; @@ -372,7 +372,7 @@ std::unique_ptr
groupby_null_templated(table_view const& keys, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto d_keys = table_device_view::create(keys); + auto d_keys = table_device_view::create(keys, stream); auto map = create_hash_map(*d_keys, include_null_keys, stream); // Cache of sparse results where the location of aggregate value in each diff --git a/cpp/src/groupby/sort/group_argmax.cu b/cpp/src/groupby/sort/group_argmax.cu index b49fbeb7387..a22e7619694 100644 --- a/cpp/src/groupby/sort/group_argmax.cu +++ b/cpp/src/groupby/sort/group_argmax.cu @@ -14,9 +14,12 @@ * limitations under the License. */ -#include #include +#include + +#include + #include namespace cudf { @@ -26,16 +29,16 @@ std::unique_ptr group_argmax(column_view const& values, size_type num_groups, rmm::device_vector const& group_labels, column_view const& key_sort_order, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto indices = type_dispatcher(values.type(), reduce_functor{}, values, num_groups, group_labels, - rmm::mr::get_current_device_resource(), - stream); + stream, + rmm::mr::get_current_device_resource()); // The functor returns the index of maximum in the sorted values. // We need the index of maximum in the original unsorted values. diff --git a/cpp/src/groupby/sort/group_argmin.cu b/cpp/src/groupby/sort/group_argmin.cu index 5ae11ba0506..6cdcd7cd94a 100644 --- a/cpp/src/groupby/sort/group_argmin.cu +++ b/cpp/src/groupby/sort/group_argmin.cu @@ -14,9 +14,12 @@ * limitations under the License. 
*/ -#include #include +#include + +#include + #include namespace cudf { @@ -26,16 +29,16 @@ std::unique_ptr group_argmin(column_view const& values, size_type num_groups, rmm::device_vector const& group_labels, column_view const& key_sort_order, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto indices = type_dispatcher(values.type(), reduce_functor{}, values, num_groups, group_labels, - rmm::mr::get_current_device_resource(), - stream); + stream, + rmm::mr::get_current_device_resource()); // The functor returns the index of minimum in the sorted values. // We need the index of minimum in the original unsorted values. diff --git a/cpp/src/groupby/sort/group_collect.cu b/cpp/src/groupby/sort/group_collect.cu index aeb9d472b7e..9c8ab92cc50 100644 --- a/cpp/src/groupby/sort/group_collect.cu +++ b/cpp/src/groupby/sort/group_collect.cu @@ -20,14 +20,16 @@ #include #include +#include + namespace cudf { namespace groupby { namespace detail { std::unique_ptr group_collect(column_view const &values, rmm::device_vector const &group_offsets, size_type num_groups, - rmm::mr::device_memory_resource *mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { rmm::device_buffer offsets_data( group_offsets.data().get(), group_offsets.size() * sizeof(cudf::size_type), stream, mr); diff --git a/cpp/src/groupby/sort/group_count.cu b/cpp/src/groupby/sort/group_count.cu index 504ffe09bc2..d63f691d2e1 100644 --- a/cpp/src/groupby/sort/group_count.cu +++ b/cpp/src/groupby/sort/group_count.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,6 +19,8 @@ #include #include +#include + #include #include @@ -28,8 +30,8 @@ namespace detail { std::unique_ptr group_count_valid(column_view const& values, rmm::device_vector const& group_labels, size_type num_groups, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(num_groups >= 0, "number of groups cannot be negative"); CUDF_EXPECTS(static_cast(values.size()) == group_labels.size(), @@ -49,14 +51,14 @@ std::unique_ptr group_count_valid(column_view const& values, thrust::make_transform_iterator(cudf::detail::make_validity_iterator(*values_view), [] __device__(auto b) { return static_cast(b); }); - thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), + thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream.value()), group_labels.begin(), group_labels.end(), bitmask_iterator, thrust::make_discard_iterator(), result->mutable_view().begin()); } else { - thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), + thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream.value()), group_labels.begin(), group_labels.end(), thrust::make_constant_iterator(1), @@ -69,8 +71,8 @@ std::unique_ptr group_count_valid(column_view const& values, std::unique_ptr group_count_all(rmm::device_vector const& group_offsets, size_type num_groups, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(num_groups >= 0, "number of groups cannot be negative"); @@ -79,7 +81,7 @@ std::unique_ptr group_count_all(rmm::device_vector const& gro if (num_groups == 0) { return result; } - thrust::adjacent_difference(rmm::exec_policy(stream)->on(stream), + thrust::adjacent_difference(rmm::exec_policy(stream)->on(stream.value()), group_offsets.begin() + 1, group_offsets.end(), result->mutable_view().begin()); diff --git a/cpp/src/groupby/sort/group_max.cu b/cpp/src/groupby/sort/group_max.cu index 
aec10eec520..06aa172d125 100644 --- a/cpp/src/groupby/sort/group_max.cu +++ b/cpp/src/groupby/sort/group_max.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,22 +16,24 @@ #include +#include + namespace cudf { namespace groupby { namespace detail { std::unique_ptr group_max(column_view const& values, size_type num_groups, rmm::device_vector const& group_labels, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return type_dispatcher(values.type(), reduce_functor{}, values, num_groups, group_labels, - mr, - stream); + stream, + mr); } } // namespace detail diff --git a/cpp/src/groupby/sort/group_min.cu b/cpp/src/groupby/sort/group_min.cu index 89405ccc2fe..72bc3e6ba3d 100644 --- a/cpp/src/groupby/sort/group_min.cu +++ b/cpp/src/groupby/sort/group_min.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,22 +16,24 @@ #include +#include + namespace cudf { namespace groupby { namespace detail { std::unique_ptr group_min(column_view const& values, size_type num_groups, rmm::device_vector const& group_labels, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return type_dispatcher(values.type(), reduce_functor{}, values, num_groups, group_labels, - mr, - stream); + stream, + mr); } } // namespace detail diff --git a/cpp/src/groupby/sort/group_nth_element.cu b/cpp/src/groupby/sort/group_nth_element.cu index bc9d0016207..e7e947b65fc 100644 --- a/cpp/src/groupby/sort/group_nth_element.cu +++ b/cpp/src/groupby/sort/group_nth_element.cu @@ -23,6 +23,8 @@ #include #include +#include + namespace cudf { namespace groupby { namespace detail { @@ -33,8 +35,8 @@ std::unique_ptr group_nth_element(column_view const &values, size_type num_groups, size_type n, null_policy null_handling, - rmm::mr::device_memory_resource *mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { CUDF_EXPECTS(static_cast(values.size()) == group_labels.size(), "Size of values column should be same as that of group labels"); @@ -47,7 +49,7 @@ std::unique_ptr group_nth_element(column_view const &values, if (null_handling == null_policy::INCLUDE || !values.has_nulls()) { // Returns index of nth value. thrust::transform_if( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), group_sizes.begin(), group_sizes.end(), group_offsets.begin(), @@ -67,7 +69,7 @@ std::unique_ptr group_nth_element(column_view const &values, [] __device__(auto b) { return static_cast(b); }); rmm::device_vector intra_group_index(values.size()); // intra group index for valids only. 
- thrust::exclusive_scan_by_key(rmm::exec_policy(stream)->on(stream), + thrust::exclusive_scan_by_key(rmm::exec_policy(stream)->on(stream.value()), group_labels.begin(), group_labels.end(), bitmask_iterator, @@ -76,7 +78,7 @@ std::unique_ptr group_nth_element(column_view const &values, rmm::device_vector group_count = [&] { if (n < 0) { rmm::device_vector group_count(num_groups); - thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), + thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream.value()), group_labels.begin(), group_labels.end(), bitmask_iterator, @@ -88,7 +90,7 @@ std::unique_ptr group_nth_element(column_view const &values, } }(); // gather the valid index == n - thrust::scatter_if(rmm::exec_policy(stream)->on(stream), + thrust::scatter_if(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(values.size()), group_labels.begin(), // map diff --git a/cpp/src/groupby/sort/group_nunique.cu b/cpp/src/groupby/sort/group_nunique.cu index 37455a31a91..a1daaedaf27 100644 --- a/cpp/src/groupby/sort/group_nunique.cu +++ b/cpp/src/groupby/sort/group_nunique.cu @@ -20,6 +20,8 @@ #include #include +#include + #include #include @@ -35,8 +37,8 @@ struct nunique_functor { size_type const num_groups, rmm::device_vector const& group_offsets, null_policy null_handling, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto result = make_numeric_column( data_type(type_to_id()), num_groups, mask_state::UNALLOCATED, stream, mr); @@ -61,7 +63,7 @@ struct nunique_functor { return static_cast(is_unique); }); - thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), + thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream.value()), group_labels.begin(), group_labels.end(), is_unique_iterator, @@ -79,7 +81,7 @@ struct nunique_functor { (not equal.operator()(i, i - 1)); // new unique value in sorted return 
static_cast(is_unique); }); - thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), + thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream.value()), group_labels.begin(), group_labels.end(), is_unique_iterator, @@ -96,8 +98,8 @@ struct nunique_functor { size_type const num_groups, rmm::device_vector const& group_offsets, null_policy null_handling, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("list_view group_nunique not supported yet"); } @@ -108,8 +110,8 @@ std::unique_ptr group_nunique(column_view const& values, size_type const num_groups, rmm::device_vector const& group_offsets, null_policy null_handling, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(num_groups >= 0, "number of groups cannot be negative"); CUDF_EXPECTS(static_cast(values.size()) == group_labels.size(), @@ -122,8 +124,8 @@ std::unique_ptr group_nunique(column_view const& values, num_groups, group_offsets, null_handling, - mr, - stream); + stream, + mr); } } // namespace detail diff --git a/cpp/src/groupby/sort/group_quantiles.cu b/cpp/src/groupby/sort/group_quantiles.cu index 7afeb7c39e4..a9a46b25c04 100644 --- a/cpp/src/groupby/sort/group_quantiles.cu +++ b/cpp/src/groupby/sort/group_quantiles.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,14 +16,16 @@ #include "group_reductions.hpp" +#include + #include #include #include #include #include -#include #include +#include #include @@ -40,8 +42,8 @@ struct quantiles_functor { size_type const num_groups, rmm::device_vector const& quantile, interpolation interpolation, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using ResultType = cudf::detail::target_type_t; @@ -60,7 +62,7 @@ struct quantiles_functor { auto result_view = mutable_column_device_view::create(result->mutable_view()); // For each group, calculate quantile - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), num_groups, [d_values = *values_view, @@ -125,8 +127,8 @@ std::unique_ptr group_quantiles(column_view const& values, size_type const num_groups, std::vector const& quantiles, interpolation interp, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { rmm::device_vector dv_quantiles(quantiles); @@ -138,8 +140,8 @@ std::unique_ptr group_quantiles(column_view const& values, num_groups, dv_quantiles, interp, - mr, - stream); + stream, + mr); } } // namespace detail diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp index f1952bc41f7..718ff6e0db9 100644 --- a/cpp/src/groupby/sort/group_reductions.hpp +++ b/cpp/src/groupby/sort/group_reductions.hpp @@ -20,6 +20,7 @@ #include #include +#include #include @@ -38,8 +39,8 @@ namespace detail { std::unique_ptr group_sum(column_view const& values, size_type num_groups, rmm::device_vector const& group_labels, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Internal API to calculate groupwise minimum value @@ -53,8 +54,8 
@@ std::unique_ptr group_sum(column_view const& values, std::unique_ptr group_min(column_view const& values, size_type num_groups, rmm::device_vector const& group_labels, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Internal API to calculate groupwise maximum value @@ -68,8 +69,8 @@ std::unique_ptr group_min(column_view const& values, std::unique_ptr group_max(column_view const& values, size_type num_groups, rmm::device_vector const& group_labels, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Internal API to calculate group-wise indices of maximum values. @@ -85,8 +86,8 @@ std::unique_ptr group_argmax(column_view const& values, size_type num_groups, rmm::device_vector const& group_labels, column_view const& key_sort_order, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Internal API to calculate group-wise indices of minimum values. 
@@ -102,8 +103,8 @@ std::unique_ptr group_argmin(column_view const& values, size_type num_groups, rmm::device_vector const& group_labels, column_view const& key_sort_order, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Internal API to calculate number of non-null values in each group of @@ -118,8 +119,8 @@ std::unique_ptr group_argmin(column_view const& values, std::unique_ptr group_count_valid(column_view const& values, rmm::device_vector const& group_labels, size_type num_groups, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Internal API to calculate number of values in each group of @p values @@ -131,8 +132,8 @@ std::unique_ptr group_count_valid(column_view const& values, */ std::unique_ptr group_count_all(rmm::device_vector const& group_offsets, size_type num_groups, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Internal API to calculate groupwise variance @@ -151,8 +152,8 @@ std::unique_ptr group_var(column_view const& values, column_view const& group_sizes, rmm::device_vector const& group_labels, size_type ddof, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Internal API to calculate groupwise quantiles @@ -171,8 +172,8 @@ std::unique_ptr group_quantiles(column_view const& values, size_type const num_groups, std::vector const& quantiles, interpolation interp, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Internal API to calculate number of unique values in each group of @@ -193,8 +194,8 @@ std::unique_ptr group_nunique(column_view const& 
values, size_type const num_groups, rmm::device_vector const& group_offsets, null_policy null_handling, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Internal API to calculate nth values in each group of @p values @@ -217,8 +218,8 @@ std::unique_ptr group_nth_element(column_view const& values, size_type num_groups, size_type n, null_policy null_handling, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Internal API to collect grouped values into a lists column * @@ -231,8 +232,8 @@ std::unique_ptr group_nth_element(column_view const& values, std::unique_ptr group_collect(column_view const& values, rmm::device_vector const& group_offsets, size_type num_groups, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace groupby diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh index cc21405925b..696acc886a2 100644 --- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh +++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -27,6 +27,8 @@ #include #include +#include + #include namespace cudf { @@ -52,8 +54,8 @@ struct reduce_functor { column_view const& values, size_type num_groups, rmm::device_vector const& group_labels, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using OpType = cudf::detail::corresponding_operator_t; using ResultType = cudf::detail::target_type_t; @@ -70,10 +72,10 @@ struct reduce_functor { auto result_table = mutable_table_view({*result}); cudf::detail::initialize_with_identity(result_table, {K}, stream); - auto resultview = mutable_column_device_view::create(result->mutable_view()); - auto valuesview = column_device_view::create(values); + auto resultview = mutable_column_device_view::create(result->mutable_view(), stream); + auto valuesview = column_device_view::create(values, stream); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), values.size(), [d_values = *valuesview, diff --git a/cpp/src/groupby/sort/group_std.cu b/cpp/src/groupby/sort/group_std.cu index eb504951ebb..143a66ab2bd 100644 --- a/cpp/src/groupby/sort/group_std.cu +++ b/cpp/src/groupby/sort/group_std.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,6 +23,7 @@ #include #include +#include #include #include @@ -63,17 +64,17 @@ struct var_functor { column_view const& group_sizes, rmm::device_vector const& group_labels, size_type ddof, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Running this in debug build causes a runtime error: // `reduce_by_key failed on 2nd step: invalid device function` #if !defined(__CUDACC_DEBUG__) using ResultType = cudf::detail::target_type_t; size_type const* d_group_labels = group_labels.data().get(); - auto values_view = column_device_view::create(values); - auto means_view = column_device_view::create(group_means); - auto group_size_view = column_device_view::create(group_sizes); + auto values_view = column_device_view::create(values, stream); + auto means_view = column_device_view::create(group_means, stream); + auto group_size_view = column_device_view::create(group_sizes, stream); std::unique_ptr result = make_numeric_column(data_type(type_to_id()), group_sizes.size(), @@ -89,7 +90,7 @@ struct var_functor { thrust::make_counting_iterator(0), var_transform{d_values, d_means, d_group_sizes, d_group_labels, ddof}); - thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), + thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream.value()), group_labels.begin(), group_labels.end(), values_it, @@ -97,10 +98,10 @@ struct var_functor { result->mutable_view().data()); // set nulls - auto result_view = mutable_column_device_view::create(*result); + auto result_view = mutable_column_device_view::create(*result, stream); thrust::for_each_n( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), group_sizes.size(), [d_result = *result_view, d_group_sizes = *group_size_view, ddof] __device__(size_type i) { @@ -132,11 +133,11 @@ std::unique_ptr group_var(column_view const& values, column_view const& group_sizes, 
rmm::device_vector const& group_labels, size_type ddof, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return type_dispatcher( - values.type(), var_functor{}, values, group_means, group_sizes, group_labels, ddof, mr, stream); + values.type(), var_functor{}, values, group_means, group_sizes, group_labels, ddof, stream, mr); } } // namespace detail diff --git a/cpp/src/groupby/sort/group_sum.cu b/cpp/src/groupby/sort/group_sum.cu index 25b68ae86f4..bf3aff91c99 100644 --- a/cpp/src/groupby/sort/group_sum.cu +++ b/cpp/src/groupby/sort/group_sum.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,22 +16,24 @@ #include +#include + namespace cudf { namespace groupby { namespace detail { std::unique_ptr group_sum(column_view const& values, size_type num_groups, rmm::device_vector const& group_labels, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return type_dispatcher(values.type(), reduce_functor{}, values, num_groups, group_labels, - mr, - stream); + stream, + mr); } } // namespace detail diff --git a/cpp/src/groupby/sort/groupby.cu b/cpp/src/groupby/sort/groupby.cu index 1b9fff02fba..84cc1af0d76 100644 --- a/cpp/src/groupby/sort/groupby.cu +++ b/cpp/src/groupby/sort/groupby.cu @@ -33,6 +33,8 @@ #include #include +#include + #include #include #include @@ -54,9 +56,9 @@ struct store_result_functor { column_view const& values, sort::sort_groupby_helper& helper, cudf::detail::result_cache& cache, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : col_idx(col_idx), values(values), helper(helper), cache(cache), stream(stream), mr(mr) + : col_idx(col_idx), 
helper(helper), cache(cache), values(values), stream(stream), mr(mr) { } @@ -105,7 +107,7 @@ struct store_result_functor { cudf::detail::result_cache& cache; ///< cache of results to store into column_view const& values; ///< Column of values to group and aggregate - cudaStream_t stream; ///< CUDA stream on which to execute kernels + rmm::cuda_stream_view stream; ///< CUDA stream on which to execute kernels rmm::mr::device_memory_resource* mr; ///< Memory resource to allocate space for results std::unique_ptr sorted_values; ///< Memoised grouped and sorted values @@ -122,8 +124,8 @@ void store_result_functor::operator()(aggregation cons agg, get_grouped_values().nullable() ? detail::group_count_valid( - get_grouped_values(), helper.group_labels(), helper.num_groups(), mr, stream) - : detail::group_count_all(helper.group_offsets(), helper.num_groups(), mr, stream)); + get_grouped_values(), helper.group_labels(), helper.num_groups(), stream, mr) + : detail::group_count_all(helper.group_offsets(), helper.num_groups(), stream, mr)); } template <> @@ -132,7 +134,7 @@ void store_result_functor::operator()(aggregation const& if (cache.has_result(col_idx, agg)) return; cache.add_result( - col_idx, agg, detail::group_count_all(helper.group_offsets(), helper.num_groups(), mr, stream)); + col_idx, agg, detail::group_count_all(helper.group_offsets(), helper.num_groups(), stream, mr)); } template <> @@ -143,7 +145,7 @@ void store_result_functor::operator()(aggregation const& agg) cache.add_result(col_idx, agg, detail::group_sum( - get_grouped_values(), helper.num_groups(), helper.group_labels(), mr, stream)); + get_grouped_values(), helper.num_groups(), helper.group_labels(), stream, mr)); }; template <> @@ -157,8 +159,8 @@ void store_result_functor::operator()(aggregation const& ag helper.num_groups(), helper.group_labels(), helper.key_sort_order(), - mr, - stream)); + stream, + mr)); }; template <> @@ -172,8 +174,8 @@ void store_result_functor::operator()(aggregation const& 
ag helper.num_groups(), helper.group_labels(), helper.key_sort_order(), - mr, - stream)); + stream, + mr)); }; template <> @@ -184,7 +186,7 @@ void store_result_functor::operator()(aggregation const& agg) auto result = [&]() { if (cudf::is_fixed_width(values.type())) { return detail::group_min( - get_grouped_values(), helper.num_groups(), helper.group_labels(), mr, stream); + get_grouped_values(), helper.num_groups(), helper.group_labels(), stream, mr); } else { auto argmin_agg = make_argmin_aggregation(); operator()(*argmin_agg); @@ -221,7 +223,7 @@ void store_result_functor::operator()(aggregation const& agg) auto result = [&]() { if (cudf::is_fixed_width(values.type())) { return detail::group_max( - get_grouped_values(), helper.num_groups(), helper.group_labels(), mr, stream); + get_grouped_values(), helper.num_groups(), helper.group_labels(), stream, mr); } else { auto argmax_agg = make_argmax_aggregation(); operator()(*argmax_agg); @@ -292,8 +294,8 @@ void store_result_functor::operator()(aggregation const& group_sizes, helper.group_labels(), var_agg._ddof, - mr, - stream); + stream, + mr); cache.add_result(col_idx, agg, std::move(result)); }; @@ -327,8 +329,8 @@ void store_result_functor::operator()(aggregation const& helper.num_groups(), quantile_agg._quantiles, quantile_agg._interpolation, - mr, - stream); + stream, + mr); cache.add_result(col_idx, agg, std::move(result)); }; @@ -347,8 +349,8 @@ void store_result_functor::operator()(aggregation const& ag helper.num_groups(), {0.5}, interpolation::LINEAR, - mr, - stream); + stream, + mr); cache.add_result(col_idx, agg, std::move(result)); }; @@ -364,8 +366,8 @@ void store_result_functor::operator()(aggregation const& a helper.num_groups(), helper.group_offsets(), nunique_agg._null_handling, - mr, - stream); + stream, + mr); cache.add_result(col_idx, agg, std::move(result)); }; @@ -394,8 +396,8 @@ void store_result_functor::operator()(aggregation cons helper.num_groups(), nth_element_agg._n, 
nth_element_agg._null_handling, - mr, - stream)); + stream, + mr)); } template <> @@ -404,7 +406,7 @@ void store_result_functor::operator()(aggregation const& a if (cache.has_result(col_idx, agg)) return; auto result = detail::group_collect( - get_grouped_values(), helper.group_offsets(), helper.num_groups(), mr, stream); + get_grouped_values(), helper.group_offsets(), helper.num_groups(), stream, mr); cache.add_result(col_idx, agg, std::move(result)); }; @@ -414,7 +416,7 @@ void store_result_functor::operator()(aggregation const& a // Sort-based groupby std::pair, std::vector> groupby::sort_aggregate( std::vector const& requests, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { // We're going to start by creating a cache of results so that aggs that diff --git a/cpp/src/hash/concurrent_unordered_map.cuh b/cpp/src/hash/concurrent_unordered_map.cuh index 950d8c2931b..f9e4fdc411b 100644 --- a/cpp/src/hash/concurrent_unordered_map.cuh +++ b/cpp/src/hash/concurrent_unordered_map.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,6 +26,8 @@ #include #include +#include + #include #include @@ -144,6 +146,7 @@ class concurrent_unordered_map { * responsibility to synchronize or use the same stream to access the map. * * @param capacity The maximum number of pairs the map may hold + * @param stream CUDA stream used for device memory operations and kernel launches. 
* @param unused_element The sentinel value to use for an empty value * @param unused_key The sentinel value to use for an empty key * @param hash_function The hash function to use for hashing keys @@ -151,15 +154,14 @@ class concurrent_unordered_map { * equal * @param allocator The allocator to use for allocation the hash table's * storage - * @param stream CUDA stream used for device memory operations and kernel launches. **/ static auto create(size_type capacity, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, const mapped_type unused_element = std::numeric_limits::max(), const key_type unused_key = std::numeric_limits::max(), const Hasher& hash_function = hasher(), const Equality& equal = key_equal(), - const allocator_type& allocator = allocator_type(), - cudaStream_t stream = 0) + const allocator_type& allocator = allocator_type()) { CUDF_FUNC_RANGE(); using Self = concurrent_unordered_map; @@ -416,7 +418,8 @@ class concurrent_unordered_map { } } - void assign_async(const concurrent_unordered_map& other, cudaStream_t stream = 0) + void assign_async(const concurrent_unordered_map& other, + rmm::cuda_stream_view stream = rmm::cuda_stream_default) { if (other.m_capacity <= m_capacity) { m_capacity = other.m_capacity; @@ -431,13 +434,13 @@ class concurrent_unordered_map { other.m_hashtbl_values, m_capacity * sizeof(value_type), cudaMemcpyDefault, - stream)); + stream.value())); } - void clear_async(cudaStream_t stream = 0) + void clear_async(rmm::cuda_stream_view stream = rmm::cuda_stream_default) { constexpr int block_size = 128; - init_hashtbl<<<((m_capacity - 1) / block_size) + 1, block_size, 0, stream>>>( + init_hashtbl<<<((m_capacity - 1) / block_size) + 1, block_size, 0, stream.value()>>>( m_hashtbl_values, m_capacity, m_unused_key, m_unused_element); } @@ -449,16 +452,16 @@ class concurrent_unordered_map { } } - void prefetch(const int dev_id, cudaStream_t stream = 0) + void prefetch(const int dev_id, rmm::cuda_stream_view stream = 
rmm::cuda_stream_default) { cudaPointerAttributes hashtbl_values_ptr_attributes; cudaError_t status = cudaPointerGetAttributes(&hashtbl_values_ptr_attributes, m_hashtbl_values); if (cudaSuccess == status && isPtrManaged(hashtbl_values_ptr_attributes)) { - CUDA_TRY( - cudaMemPrefetchAsync(m_hashtbl_values, m_capacity * sizeof(value_type), dev_id, stream)); + CUDA_TRY(cudaMemPrefetchAsync( + m_hashtbl_values, m_capacity * sizeof(value_type), dev_id, stream.value())); } - CUDA_TRY(cudaMemPrefetchAsync(this, sizeof(*this), dev_id, stream)); + CUDA_TRY(cudaMemPrefetchAsync(this, sizeof(*this), dev_id, stream.value())); } /** @@ -469,7 +472,7 @@ class concurrent_unordered_map { * * @param stream CUDA stream used for device memory operations and kernel launches. **/ - void destroy(cudaStream_t stream = 0) + void destroy(rmm::cuda_stream_view stream = rmm::cuda_stream_default) { m_allocator.deallocate(m_hashtbl_values, m_capacity, stream); delete this; @@ -510,7 +513,7 @@ class concurrent_unordered_map { const Hasher& hash_function, const Equality& equal, const allocator_type& allocator, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default) : m_hf(hash_function), m_equal(equal), m_allocator(allocator), @@ -528,12 +531,12 @@ class concurrent_unordered_map { if (cudaSuccess == status && isPtrManaged(hashtbl_values_ptr_attributes)) { int dev_id = 0; CUDA_TRY(cudaGetDevice(&dev_id)); - CUDA_TRY( - cudaMemPrefetchAsync(m_hashtbl_values, m_capacity * sizeof(value_type), dev_id, stream)); + CUDA_TRY(cudaMemPrefetchAsync( + m_hashtbl_values, m_capacity * sizeof(value_type), dev_id, stream.value())); } } - init_hashtbl<<<((m_capacity - 1) / block_size) + 1, block_size, 0, stream>>>( + init_hashtbl<<<((m_capacity - 1) / block_size) + 1, block_size, 0, stream.value()>>>( m_hashtbl_values, m_capacity, m_unused_key, m_unused_element); CUDA_TRY(cudaGetLastError()); } diff --git a/cpp/src/hash/concurrent_unordered_multimap.cuh 
b/cpp/src/hash/concurrent_unordered_multimap.cuh index 1807065bc86..8ba36e8696d 100644 --- a/cpp/src/hash/concurrent_unordered_multimap.cuh +++ b/cpp/src/hash/concurrent_unordered_multimap.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018, NVIDIA CORPORATION. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,6 +26,8 @@ #include #include +#include + #include #include @@ -91,20 +93,20 @@ class concurrent_unordered_multimap { * responsibility to synchronize or use the same stream to access the map. * * @param capacity The maximum number of pairs the map may hold. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param init Indicates if the map should be initialized with the unused * key/values * @param hash_function The hash function to use for hashing keys * @param equal The equality comparison function for comparing if two keys are * equal * @param allocator The allocator to use for allocation of the map's storage - * @param stream CUDA stream used for device memory operations and kernel launches. 
**/ static auto create(size_type capacity, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, const bool init = true, const Hasher& hash_function = hasher(), const Equality& equal = key_equal(), - const allocator_type& allocator = allocator_type(), - cudaStream_t stream = 0) + const allocator_type& allocator = allocator_type()) { CUDF_FUNC_RANGE(); using Self = concurrent_unordered_multimap>>( + init_hashtbl<<<((m_hashtbl_size - 1) / block_size) + 1, block_size, 0, stream.value()>>>( m_hashtbl_values, m_hashtbl_size, unused_key, unused_element); if (count_collisions) m_collisions = 0; } @@ -520,14 +523,14 @@ class concurrent_unordered_multimap { } } - void prefetch(const int dev_id, cudaStream_t stream = 0) + void prefetch(const int dev_id, rmm::cuda_stream_view stream = rmm::cuda_stream_default) { cudaPointerAttributes hashtbl_values_ptr_attributes; cudaError_t status = cudaPointerGetAttributes(&hashtbl_values_ptr_attributes, m_hashtbl_values); if (cudaSuccess == status && isPtrManaged(hashtbl_values_ptr_attributes)) { CUDA_TRY(cudaMemPrefetchAsync( - m_hashtbl_values, m_hashtbl_size * sizeof(value_type), dev_id, stream)); + m_hashtbl_values, m_hashtbl_size * sizeof(value_type), dev_id, stream.value())); } } @@ -561,11 +564,11 @@ class concurrent_unordered_multimap { * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
*/ explicit concurrent_unordered_multimap(size_type n, - const bool init = true, - const Hasher& hash_function = hasher(), - const Equality& equal = key_equal(), - const allocator_type& a = allocator_type(), - cudaStream_t stream = 0) + const bool init = true, + const Hasher& hash_function = hasher(), + const Equality& equal = key_equal(), + const allocator_type& a = allocator_type(), + rmm::cuda_stream_view stream = rmm::cuda_stream_default) : m_hf(hash_function), m_equal(equal), m_allocator(a), @@ -584,12 +587,12 @@ class concurrent_unordered_multimap { int dev_id = 0; CUDA_TRY(cudaGetDevice(&dev_id)); CUDA_TRY(cudaMemPrefetchAsync( - m_hashtbl_values, m_hashtbl_size * sizeof(value_type), dev_id, stream)); + m_hashtbl_values, m_hashtbl_size * sizeof(value_type), dev_id, stream.value())); } } if (init) { - init_hashtbl<<<((m_hashtbl_size - 1) / block_size) + 1, block_size, 0, stream>>>( + init_hashtbl<<<((m_hashtbl_size - 1) / block_size) + 1, block_size, 0, stream.value()>>>( m_hashtbl_values, m_hashtbl_size, unused_key, unused_element); CUDA_TRY(cudaGetLastError()); } diff --git a/cpp/src/hash/hash_allocator.cuh b/cpp/src/hash/hash_allocator.cuh index 7a0f3fd4005..0c4acccf33d 100644 --- a/cpp/src/hash/hash_allocator.cuh +++ b/cpp/src/hash/hash_allocator.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, NVIDIA CORPORATION. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,6 +19,7 @@ #include +#include #include #include #include @@ -35,12 +36,14 @@ struct managed_allocator { { } - T* allocate(std::size_t n, cudaStream_t stream = 0) const + T* allocate(std::size_t n, rmm::cuda_stream_view stream = rmm::cuda_stream_default) const { return static_cast(mr->allocate(n * sizeof(T), stream)); } - void deallocate(T* p, std::size_t n, cudaStream_t stream = 0) const + void deallocate(T* p, + std::size_t n, + rmm::cuda_stream_view stream = rmm::cuda_stream_default) const { mr->deallocate(p, n * sizeof(T), stream); } @@ -69,12 +72,14 @@ struct default_allocator { { } - T* allocate(std::size_t n, cudaStream_t stream = 0) const + T* allocate(std::size_t n, rmm::cuda_stream_view stream = rmm::cuda_stream_default) const { return static_cast(mr->allocate(n * sizeof(T), stream)); } - void deallocate(T* p, std::size_t n, cudaStream_t stream = 0) const + void deallocate(T* p, + std::size_t n, + rmm::cuda_stream_view stream = rmm::cuda_stream_default) const { mr->deallocate(p, n * sizeof(T), stream); } diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu index 8e91de9707f..e6f6ba2bbad 100644 --- a/cpp/src/hash/hashing.cu +++ b/cpp/src/hash/hashing.cu @@ -675,14 +675,14 @@ std::unique_ptr md5_hash(table_view const& input, "MD5 unsupported column type"); // Result column allocation and creation - auto begin = thrust::make_constant_iterator(32); - auto offsets_column = cudf::strings::detail::make_offsets_child_column( - begin, begin + input.num_rows(), mr, stream.value()); + auto begin = thrust::make_constant_iterator(32); + auto offsets_column = + cudf::strings::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr); auto offsets_view = offsets_column->view(); auto d_new_offsets = offsets_view.data(); auto chars_column = strings::detail::create_chars_child_column( - input.num_rows(), 0, input.num_rows() * 32, mr, stream.value()); + input.num_rows(), 0, input.num_rows() * 32, stream, mr); auto chars_view = 
chars_column->mutable_view(); auto d_chars = chars_view.data(); diff --git a/cpp/src/hash/unordered_multiset.cuh b/cpp/src/hash/unordered_multiset.cuh index 0e6173529f6..11d4bc414ca 100644 --- a/cpp/src/hash/unordered_multiset.cuh +++ b/cpp/src/hash/unordered_multiset.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,9 +16,12 @@ #pragma once +#include + #include #include -#include + +#include namespace cudf { namespace detail { @@ -68,7 +71,7 @@ class unordered_multiset { /** * @brief Factory to construct a new unordered_multiset **/ - static unordered_multiset create(column_view const &col, cudaStream_t stream) + static unordered_multiset create(column_view const &col, rmm::cuda_stream_view stream) { auto d_column = column_device_view::create(col, stream); auto d_col = *d_column; @@ -82,7 +85,7 @@ class unordered_multiset { size_type *d_hash_bins_end = hash_bins_end.data().get(); Element *d_hash_data = hash_data.data().get(); - thrust::for_each(rmm::exec_policy(stream)->on(stream), + thrust::for_each(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(col.size()), [d_hash_bins_start, d_col, hasher] __device__(size_t idx) { @@ -93,17 +96,17 @@ class unordered_multiset { } }); - thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream.value()), hash_bins_start.begin(), hash_bins_start.end(), hash_bins_end.begin()); - thrust::copy(rmm::exec_policy(stream)->on(stream), + thrust::copy(rmm::exec_policy(stream)->on(stream.value()), hash_bins_end.begin(), hash_bins_end.end(), hash_bins_start.begin()); - thrust::for_each(rmm::exec_policy(stream)->on(stream), + 
thrust::for_each(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(col.size()), [d_hash_bins_end, d_hash_data, d_col, hasher] __device__(size_t idx) { diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp index efc19791c07..5f4fcb1c108 100644 --- a/cpp/src/interop/dlpack.cpp +++ b/cpp/src/interop/dlpack.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/interop/from_arrow.cpp b/cpp/src/interop/from_arrow.cpp index 4f208d8985c..0bdb1f5eeb6 100644 --- a/cpp/src/interop/from_arrow.cpp +++ b/cpp/src/interop/from_arrow.cpp @@ -209,9 +209,7 @@ std::unique_ptr dispatch_to_cudf_column::operator()( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (array.length() == 0) { - return cudf::strings::detail::make_empty_strings_column(mr, stream.value()); - } + if (array.length() == 0) { return cudf::strings::detail::make_empty_strings_column(stream, mr); } auto str_array = static_cast(&array); auto offset_array = std::make_unique( str_array->value_offsets()->size() / sizeof(int32_t), str_array->value_offsets(), nullptr); @@ -294,7 +292,7 @@ std::unique_ptr dispatch_to_cudf_column::operator()( out_mask = detail::copy_bitmask(static_cast(out_mask.data()), array.offset(), array.offset() + array.length(), - rmm::cuda_stream_view{stream}, + stream, mr); } diff --git a/cpp/src/io/avro/avro_gpu.cu b/cpp/src/io/avro/avro_gpu.cu index f5c9c708821..93ec44e4cb2 100644 --- a/cpp/src/io/avro/avro_gpu.cu +++ b/cpp/src/io/avro/avro_gpu.cu @@ -17,6 +17,8 @@ #include +#include + using cudf::detail::device_span; namespace cudf { @@ -310,32 +312,32 @@ extern "C" __global__ void __launch_bounds__(NWARPS * 32, 2) * @param[in] first_row Crop all rows below first_row * @param[in] 
min_row_size Minimum size in bytes of a row * @param[in] stream CUDA stream to use, default 0 - */ -void __host__ DecodeAvroColumnData(block_desc_s *blocks, - schemadesc_s *schema, - device_span global_dictionary, - const uint8_t *avro_data, - uint32_t num_blocks, - uint32_t schema_len, - size_t max_rows, - size_t first_row, - uint32_t min_row_size, - cudaStream_t stream) + **/ +void DecodeAvroColumnData(block_desc_s *blocks, + schemadesc_s *schema, + device_span global_dictionary, + const uint8_t *avro_data, + uint32_t num_blocks, + uint32_t schema_len, + size_t max_rows, + size_t first_row, + uint32_t min_row_size, + rmm::cuda_stream_view stream) { // NWARPS warps per threadblock dim3 const dim_block(32, NWARPS); // 1 warp per datablock, NWARPS datablocks per threadblock dim3 const dim_grid((num_blocks + NWARPS - 1) / NWARPS, 1); - gpuDecodeAvroColumnData<<>>(blocks, - schema, - global_dictionary, - avro_data, - num_blocks, - schema_len, - min_row_size, - max_rows, - first_row); + gpuDecodeAvroColumnData<<>>(blocks, + schema, + global_dictionary, + avro_data, + num_blocks, + schema_len, + min_row_size, + max_rows, + first_row); } } // namespace gpu diff --git a/cpp/src/io/avro/avro_gpu.h b/cpp/src/io/avro/avro_gpu.h index 7f03482c54a..5aac6f99a80 100644 --- a/cpp/src/io/avro/avro_gpu.h +++ b/cpp/src/io/avro/avro_gpu.h @@ -19,6 +19,8 @@ #include +#include + namespace cudf { namespace io { namespace avro { @@ -61,10 +63,10 @@ void DecodeAvroColumnData(block_desc_s *blocks, const uint8_t *avro_data, uint32_t num_blocks, uint32_t schema_len, - size_t max_rows = ~0, - size_t first_row = 0, - uint32_t min_row_size = 0, - cudaStream_t stream = (cudaStream_t)0); + size_t max_rows = ~0, + size_t first_row = 0, + uint32_t min_row_size = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); } // namespace gpu } // namespace avro diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index 95a1710d3ae..68c746f2956 100644 --- 
a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -29,6 +29,7 @@ #include #include +#include #include #include @@ -139,7 +140,7 @@ class metadata : public file_metadata { }; rmm::device_buffer reader::impl::decompress_data(const rmm::device_buffer &comp_block_data, - cudaStream_t stream) + rmm::cuda_stream_view stream) { size_t uncompressed_data_size = 0; hostdevice_vector inflate_in(_metadata->block_list.size()); @@ -191,8 +192,9 @@ rmm::device_buffer reader::impl::decompress_data(const rmm::device_buffer &comp_ inflate_in.host_ptr(), inflate_in.memory_size(), cudaMemcpyHostToDevice, - stream)); - CUDA_TRY(cudaMemsetAsync(inflate_out.device_ptr(), 0, inflate_out.memory_size(), stream)); + stream.value())); + CUDA_TRY( + cudaMemsetAsync(inflate_out.device_ptr(), 0, inflate_out.memory_size(), stream.value())); if (_metadata->codec == "deflate") { CUDA_TRY(gpuinflate( inflate_in.device_ptr(), inflate_out.device_ptr(), inflate_in.size(), 0, stream)); @@ -206,8 +208,8 @@ rmm::device_buffer reader::impl::decompress_data(const rmm::device_buffer &comp_ inflate_out.device_ptr(), inflate_out.memory_size(), cudaMemcpyDeviceToHost, - stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.value())); + stream.synchronize(); // Check if larger output is required, as it's not known ahead of time if (_metadata->codec == "deflate" && !loop_cnt) { @@ -247,7 +249,7 @@ void reader::impl::decode_data(const rmm::device_buffer &block_data, size_t num_rows, std::vector> selection, std::vector &out_buffers, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // Build gpu schema hostdevice_vector schema_desc(_metadata->schema.size()); @@ -312,7 +314,7 @@ void reader::impl::decode_data(const rmm::device_buffer &block_data, schema_desc.host_ptr(), schema_desc.memory_size(), cudaMemcpyHostToDevice, - stream)); + stream.value())); gpu::DecodeAvroColumnData(static_cast(block_list.data()), schema_desc.device_ptr(), @@ -332,15 +334,16 @@ void 
reader::impl::decode_data(const rmm::device_buffer &block_data, valid_alias[i], out_buffers[i].null_mask_size(), cudaMemcpyHostToDevice, - stream)); + stream.value())); } } CUDA_TRY(cudaMemcpyAsync(schema_desc.host_ptr(), schema_desc.device_ptr(), schema_desc.memory_size(), cudaMemcpyDeviceToHost, - stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.value())); + stream.synchronize(); + for (size_t i = 0; i < out_buffers.size(); i++) { const auto col_idx = selection[i].first; const auto schema_null_idx = _metadata->columns[col_idx].schema_null_idx; @@ -351,13 +354,14 @@ void reader::impl::decode_data(const rmm::device_buffer &block_data, reader::impl::impl(std::unique_ptr source, avro_reader_options const &options, rmm::mr::device_memory_resource *mr) - : _source(std::move(source)), _mr(mr), _columns(options.get_columns()) + : _mr(mr), _source(std::move(source)), _columns(options.get_columns()) { // Open the source Avro dataset metadata _metadata = std::make_unique(_source.get()); } -table_with_metadata reader::impl::read(avro_reader_options const &options, cudaStream_t stream) +table_with_metadata reader::impl::read(avro_reader_options const &options, + rmm::cuda_stream_view stream) { auto skip_rows = options.get_skip_rows(); auto num_rows = options.get_num_rows(); @@ -430,17 +434,18 @@ table_with_metadata reader::impl::read(avro_reader_options const &options, cudaS dict_pos += len; } } + CUDA_TRY(cudaMemcpyAsync(d_global_dict.data(), h_global_dict.data(), h_global_dict.size() * sizeof(gpu::nvstrdesc_s), cudaMemcpyDefault, - stream)); + stream.value())); CUDA_TRY(cudaMemcpyAsync(d_global_dict_data.data(), h_global_dict_data.data(), h_global_dict_data.size() * sizeof(char), cudaMemcpyDefault, - stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.value())); + stream.synchronize(); } std::vector out_buffers; @@ -453,7 +458,7 @@ table_with_metadata reader::impl::read(avro_reader_options const &options, cudaS decode_data(block_data, dict, 
d_global_dict, num_rows, selected_columns, out_buffers, stream); for (size_t i = 0; i < column_types.size(); ++i) { - out_columns.emplace_back(make_column(out_buffers[i], stream, _mr)); + out_columns.emplace_back(make_column(out_buffers[i], nullptr, stream, _mr)); } } else { // Create empty columns @@ -496,7 +501,7 @@ reader::reader(std::vector> &&sources, reader::~reader() = default; // Forward to implementation -table_with_metadata reader::read(avro_reader_options const &options, cudaStream_t stream) +table_with_metadata reader::read(avro_reader_options const &options, rmm::cuda_stream_view stream) { return _impl->read(options, stream); } diff --git a/cpp/src/io/avro/reader_impl.hpp b/cpp/src/io/avro/reader_impl.hpp index cdebb3cf9dc..880c428b60d 100644 --- a/cpp/src/io/avro/reader_impl.hpp +++ b/cpp/src/io/avro/reader_impl.hpp @@ -31,6 +31,8 @@ #include #include +#include + #include #include #include @@ -70,7 +72,7 @@ class reader::impl { * * @return The set of columns along with metadata */ - table_with_metadata read(avro_reader_options const &options, cudaStream_t stream); + table_with_metadata read(avro_reader_options const &options, rmm::cuda_stream_view stream); private: /** @@ -82,7 +84,7 @@ class reader::impl { * @return Device buffer to decompressed block data */ rmm::device_buffer decompress_data(const rmm::device_buffer &comp_block_data, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Convert the avro row-based block data and outputs to columns @@ -99,7 +101,7 @@ class reader::impl { size_t num_rows, std::vector> columns, std::vector &out_buffers, - cudaStream_t stream); + rmm::cuda_stream_view stream); private: rmm::mr::device_memory_resource *_mr = nullptr; diff --git a/cpp/src/io/comp/debrotli.cu b/cpp/src/io/comp/debrotli.cu index 5fcf73c03d1..57bad5f3283 100644 --- a/cpp/src/io/comp/debrotli.cu +++ b/cpp/src/io/comp/debrotli.cu @@ -54,11 +54,15 @@ THE SOFTWARE. 
*/ -#include -#include #include "brotli_dict.h" #include "gpuinflate.h" +#include + +#include + +#include + namespace cudf { namespace io { #define HUFFTAB_LUT1_BITS 8 @@ -2025,7 +2029,7 @@ cudaError_t __host__ gpu_debrotli(gpu_inflate_input_s *inputs, void *scratch, size_t scratch_size, int count, - cudaStream_t stream) + rmm::cuda_stream_view stream) { uint32_t count32 = (count > 0) ? count : 0; uint32_t fb_heap_size; @@ -2037,15 +2041,15 @@ cudaError_t __host__ gpu_debrotli(gpu_inflate_input_s *inputs, scratch_size = min(scratch_size, (size_t)0xffffffffu); fb_heap_size = (uint32_t)((scratch_size - sizeof(brotli_dictionary_s)) & ~0xf); - CUDA_TRY(cudaMemsetAsync(scratch_u8, 0, 2 * sizeof(uint32_t), stream)); + CUDA_TRY(cudaMemsetAsync(scratch_u8, 0, 2 * sizeof(uint32_t), stream.value())); // NOTE: The 128KB dictionary copy can have a relatively large overhead since source isn't // page-locked CUDA_TRY(cudaMemcpyAsync(scratch_u8 + fb_heap_size, get_brotli_dictionary(), sizeof(brotli_dictionary_s), cudaMemcpyHostToDevice, - stream)); - gpu_debrotli_kernel<<>>( + stream.value())); + gpu_debrotli_kernel<<>>( inputs, outputs, scratch_u8, fb_heap_size, count32); #if DUMP_FB_HEAP uint32_t dump[2]; @@ -2053,8 +2057,8 @@ cudaError_t __host__ gpu_debrotli(gpu_inflate_input_s *inputs, printf("heap dump (%d bytes)\n", fb_heap_size); while (cur < fb_heap_size && !(cur & 3)) { CUDA_TRY(cudaMemcpyAsync( - &dump[0], scratch_u8 + cur, 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + &dump[0], scratch_u8 + cur, 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream.value())); + stream.synchronize(); printf("@%d: next = %d, size = %d\n", cur, dump[0], dump[1]); cur = (dump[0] > cur) ? 
dump[0] : 0xffffffffu; } diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu index 723b0850a6c..840b868ffb5 100644 --- a/cpp/src/io/comp/gpuinflate.cu +++ b/cpp/src/io/comp/gpuinflate.cu @@ -43,9 +43,12 @@ misrepresented as being the original software. Mark Adler madler@alumni.caltech.edu */ -#include #include "gpuinflate.h" +#include + +#include + namespace cudf { namespace io { #define NUMTHREADS 128 // Threads per block @@ -1199,17 +1202,19 @@ cudaError_t __host__ gpuinflate(gpu_inflate_input_s *inputs, gpu_inflate_status_s *outputs, int count, int parse_hdr, - cudaStream_t stream) + rmm::cuda_stream_view stream) { - if (count > 0) { inflate_kernel<<>>(inputs, outputs, parse_hdr); } + if (count > 0) { + inflate_kernel<<>>(inputs, outputs, parse_hdr); + } return cudaSuccess; } cudaError_t __host__ gpu_copy_uncompressed_blocks(gpu_inflate_input_s *inputs, int count, - cudaStream_t stream) + rmm::cuda_stream_view stream) { - if (count > 0) { copy_uncompressed_kernel<<>>(inputs); } + if (count > 0) { copy_uncompressed_kernel<<>>(inputs); } return cudaSuccess; } diff --git a/cpp/src/io/comp/gpuinflate.h b/cpp/src/io/comp/gpuinflate.h index 461256d3762..692752c4e33 100644 --- a/cpp/src/io/comp/gpuinflate.h +++ b/cpp/src/io/comp/gpuinflate.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, NVIDIA CORPORATION. + * Copyright (c) 2018-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,8 @@ #include +#include + namespace cudf { namespace io { /** @@ -53,9 +55,9 @@ struct gpu_inflate_status_s { **/ cudaError_t gpuinflate(gpu_inflate_input_s *inputs, gpu_inflate_status_s *outputs, - int count = 1, - int parse_hdr = 0, - cudaStream_t stream = (cudaStream_t)0); + int count = 1, + int parse_hdr = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Interface for copying uncompressed byte blocks @@ -65,8 +67,8 @@ cudaError_t gpuinflate(gpu_inflate_input_s *inputs, * @param[in] stream CUDA stream to use, default 0 **/ cudaError_t gpu_copy_uncompressed_blocks(gpu_inflate_input_s *inputs, - int count = 1, - cudaStream_t stream = (cudaStream_t)0); + int count = 1, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Interface for decompressing Snappy-compressed data @@ -81,8 +83,8 @@ cudaError_t gpu_copy_uncompressed_blocks(gpu_inflate_input_s *inputs, **/ cudaError_t gpu_unsnap(gpu_inflate_input_s *inputs, gpu_inflate_status_s *outputs, - int count = 1, - cudaStream_t stream = (cudaStream_t)0); + int count = 1, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Computes the size of temporary memory for Brotli decompression @@ -110,8 +112,8 @@ cudaError_t gpu_debrotli(gpu_inflate_input_s *inputs, gpu_inflate_status_s *outputs, void *scratch, size_t scratch_size, - int count = 1, - cudaStream_t stream = (cudaStream_t)0); + int count = 1, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Interface for compressing data with Snappy @@ -126,8 +128,8 @@ cudaError_t gpu_debrotli(gpu_inflate_input_s *inputs, **/ cudaError_t gpu_snap(gpu_inflate_input_s *inputs, gpu_inflate_status_s *outputs, - int count = 1, - cudaStream_t stream = (cudaStream_t)0); + int count = 1, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); } // namespace io } // namespace cudf diff --git a/cpp/src/io/comp/snap.cu b/cpp/src/io/comp/snap.cu index 01214b00933..a3ab6a49a88 100644 --- 
a/cpp/src/io/comp/snap.cu +++ b/cpp/src/io/comp/snap.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, NVIDIA CORPORATION. + * Copyright (c) 2018-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,9 +14,12 @@ * limitations under the License. */ -#include #include "gpuinflate.h" +#include + +#include + namespace cudf { namespace io { #define HASH_BITS 12 @@ -342,11 +345,13 @@ extern "C" __global__ void __launch_bounds__(128) cudaError_t __host__ gpu_snap(gpu_inflate_input_s *inputs, gpu_inflate_status_s *outputs, int count, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_block(128, 1); // 4 warps per stream, 1 stream per block dim3 dim_grid(count, 1); - if (count > 0) { snap_kernel<<>>(inputs, outputs, count); } + if (count > 0) { + snap_kernel<<>>(inputs, outputs, count); + } return cudaSuccess; } diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index 0d9407d7f65..e0824a8a0fb 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, NVIDIA CORPORATION. + * Copyright (c) 2018-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,15 +14,18 @@ * limitations under the License. */ -#include -#include // memset -#include // uncompress #include "io_uncomp.h" #include "unbz2.h" // bz2 uncompress #include #include +#include + +#include // memset + +#include // uncompress + using cudf::detail::host_span; namespace cudf { diff --git a/cpp/src/io/comp/unsnap.cu b/cpp/src/io/comp/unsnap.cu index 44229e1202e..0eeb4602463 100644 --- a/cpp/src/io/comp/unsnap.cu +++ b/cpp/src/io/comp/unsnap.cu @@ -14,10 +14,14 @@ * limitations under the License. 
*/ -#include -#include #include "gpuinflate.h" +#include + +#include + +#include + namespace cudf { namespace io { // Not supporting streams longer than this (not what snappy is intended for) @@ -695,13 +699,13 @@ __global__ void __launch_bounds__(block_size) cudaError_t __host__ gpu_unsnap(gpu_inflate_input_s *inputs, gpu_inflate_status_s *outputs, int count, - cudaStream_t stream) + rmm::cuda_stream_view stream) { uint32_t count32 = (count > 0) ? count : 0; dim3 dim_block(128, 1); // 4 warps per stream, 1 stream per block dim3 dim_grid(count32, 1); // TODO: Check max grid dimensions vs max expected count - unsnap_kernel<128><<>>(inputs, outputs); + unsnap_kernel<128><<>>(inputs, outputs); return cudaSuccess; } diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index dec35cb7feb..8b913e5918c 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -32,6 +32,8 @@ #include #include +#include + #include #include @@ -1009,13 +1011,13 @@ __global__ void __launch_bounds__(rowofs_block_dim) size_t __host__ count_blank_rows(const cudf::io::parse_options_view &opts, device_span const data, device_span const row_offsets, - cudaStream_t stream) + rmm::cuda_stream_view stream) { const auto newline = opts.skipblanklines ? opts.terminator : opts.comment; const auto comment = opts.comment != '\0' ? opts.comment : newline; const auto carriage = (opts.skipblanklines && opts.terminator == '\n') ? 
'\r' : comment; return thrust::count_if( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), row_offsets.begin(), row_offsets.end(), [data = data, newline, comment, carriage] __device__(const uint64_t pos) { @@ -1027,14 +1029,14 @@ size_t __host__ count_blank_rows(const cudf::io::parse_options_view &opts, void __host__ remove_blank_rows(cudf::io::parse_options_view const &options, device_span const data, rmm::device_vector &row_offsets, - cudaStream_t stream) + rmm::cuda_stream_view stream) { size_t d_size = data.size(); const auto newline = options.skipblanklines ? options.terminator : options.comment; const auto comment = options.comment != '\0' ? options.comment : newline; const auto carriage = (options.skipblanklines && options.terminator == '\n') ? '\r' : comment; auto new_end = thrust::remove_if( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), row_offsets.begin(), row_offsets.end(), [data = data, d_size, newline, comment, carriage] __device__(const uint64_t pos) { @@ -1050,7 +1052,7 @@ thrust::host_vector detect_column_types( device_span const column_flags, device_span const row_starts, size_t const num_active_columns, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // Calculate actual block count to use based on records count const int block_size = csvparse_block_dim; @@ -1058,7 +1060,7 @@ thrust::host_vector detect_column_types( auto d_stats = rmm::device_vector(num_active_columns); - data_type_detection<<>>( + data_type_detection<<>>( options, data, column_flags, row_starts, d_stats); return thrust::host_vector(d_stats); @@ -1071,14 +1073,14 @@ void __host__ decode_row_column_data(cudf::io::parse_options_view const &options device_span const dtypes, device_span const columns, device_span const valids, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // Calculate actual block count to use based on records count auto const block_size = csvparse_block_dim; auto const 
num_rows = row_offsets.size() - 1; auto const grid_size = (num_rows + block_size - 1) / block_size; - convert_csv_to_cudf<<>>( + convert_csv_to_cudf<<>>( options, data, column_flags, row_offsets, dtypes, columns, valids); } @@ -1093,11 +1095,11 @@ uint32_t __host__ gather_row_offsets(const parse_options_view &options, size_t byte_range_start, size_t byte_range_end, size_t skip_rows, - cudaStream_t stream) + rmm::cuda_stream_view stream) { uint32_t dim_grid = 1 + (chunk_size / rowofs_block_bytes); - gather_row_offsets_gpu<<>>( + gather_row_offsets_gpu<<>>( row_ctx, offsets_out, data, diff --git a/cpp/src/io/csv/csv_gpu.h b/cpp/src/io/csv/csv_gpu.h index 921b17a8520..91982d60896 100644 --- a/cpp/src/io/csv/csv_gpu.h +++ b/cpp/src/io/csv/csv_gpu.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,6 +22,7 @@ #include #include +#include using cudf::detail::device_span; @@ -162,7 +163,7 @@ uint32_t gather_row_offsets(cudf::io::parse_options_view const &options, size_t byte_range_start, size_t byte_range_end, size_t skip_rows, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream); /** * Count the number of blank rows in the given row offset array @@ -176,7 +177,7 @@ uint32_t gather_row_offsets(cudf::io::parse_options_view const &options, size_t count_blank_rows(cudf::io::parse_options_view const &options, device_span data, device_span row_offsets, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream); /** * Remove blank rows in the given row offset array @@ -190,7 +191,7 @@ size_t count_blank_rows(cudf::io::parse_options_view const &options, void remove_blank_rows(const cudf::io::parse_options_view &options, device_span data, rmm::device_vector &row_offsets, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream); /** * @brief Launches kernel for detecting possible dtype of each column of data @@ -209,7 +210,7 @@ thrust::host_vector detect_column_types( device_span column_flags, device_span row_offsets, size_t const num_active_columns, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream); /** * @brief Launches kernel for decoding row-column data @@ -230,7 +231,7 @@ void decode_row_column_data(cudf::io::parse_options_view const &options, device_span dtypes, device_span columns, device_span valids, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream); } // namespace gpu } // namespace csv diff --git a/cpp/src/io/csv/durations.cu b/cpp/src/io/csv/durations.cu index 15dfb5f5534..3cfb4d88ec6 100644 --- a/cpp/src/io/csv/durations.cu +++ b/cpp/src/io/csv/durations.cu @@ -21,6 +21,8 @@ #include #include +#include + #include #include @@ -168,21 +170,20 @@ struct duration_to_string_fn : public duration_to_string_size_fn { struct dispatch_from_durations_fn { template ()>* = nullptr> std::unique_ptr operator()(column_view const& 
durations, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { size_type strings_count = durations.size(); auto column = column_device_view::create(durations, stream); auto d_column = *column; // copy null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(durations, rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(durations, stream, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), duration_to_string_size_fn{d_column}); auto offsets_column = strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto offsets_view = offsets_column->view(); auto d_new_offsets = offsets_view.template data(); @@ -190,11 +191,11 @@ struct dispatch_from_durations_fn { auto const chars_bytes = cudf::detail::get_value(offsets_column->view(), strings_count, stream); auto chars_column = strings::detail::create_chars_child_column( - strings_count, durations.null_count(), chars_bytes, mr, stream); + strings_count, durations.null_count(), chars_bytes, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.template data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, duration_to_string_fn{d_column, d_new_offsets, d_chars}); @@ -212,8 +213,8 @@ struct dispatch_from_durations_fn { // non-duration types throw an exception template ()>* = nullptr> std::unique_ptr operator()(column_view const&, - rmm::mr::device_memory_resource*, - cudaStream_t) const + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) const { CUDF_FAIL("Values for from_durations 
function must be a duration type."); } @@ -222,13 +223,13 @@ struct dispatch_from_durations_fn { } // namespace std::unique_ptr pandas_format_durations(column_view const& durations, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { size_type strings_count = durations.size(); if (strings_count == 0) return make_empty_column(data_type{type_id::STRING}); - return type_dispatcher(durations.type(), dispatch_from_durations_fn{}, durations, mr, stream); + return type_dispatcher(durations.type(), dispatch_from_durations_fn{}, durations, stream, mr); } } // namespace csv diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 602c3e7f82d..9093a4030e8 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -31,6 +31,8 @@ #include #include +#include + #include #include #include @@ -178,7 +180,7 @@ std::vector setColumnNames(std::vector const &header, return col_names; } -table_with_metadata reader::impl::read(cudaStream_t stream) +table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) { auto range_offset = opts_.get_byte_range_offset(); auto range_size = opts_.get_byte_range_size(); @@ -353,7 +355,7 @@ table_with_metadata reader::impl::read(cudaStream_t stream) out_columns.emplace_back( cudf::strings::replace(col->view(), dblquotechar, quotechar, -1, mr_)); } else { - out_columns.emplace_back(make_column(out_buffers[i], stream, mr_)); + out_columns.emplace_back(make_column(out_buffers[i], nullptr, stream, mr_)); } } } else { @@ -386,7 +388,7 @@ void reader::impl::gather_row_offsets(host_span const data, size_t skip_rows, int64_t num_rows, bool load_whole_file, - cudaStream_t stream) + rmm::cuda_stream_view stream) { constexpr size_t max_chunk_bytes = 64 * 1024 * 1024; // 64MB size_t buffer_size = std::min(max_chunk_bytes, data.size()); @@ -428,8 +430,9 @@ void reader::impl::gather_row_offsets(host_span const data, row_ctx.device_ptr(), num_blocks * sizeof(uint64_t), 
cudaMemcpyDeviceToHost, - stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.value())); + stream.synchronize(); + // Sum up the rows in each character block, selecting the row count that // corresponds to the current input context. Also stores the now known input // context per character block that will be needed by the second pass. @@ -447,7 +450,7 @@ void reader::impl::gather_row_offsets(host_span const data, row_ctx.host_ptr(), num_blocks * sizeof(uint64_t), cudaMemcpyHostToDevice, - stream)); + stream.value())); // Pass 2: Output row offsets cudf::io::csv::gpu::gather_row_offsets(opts.view(), @@ -468,8 +471,9 @@ void reader::impl::gather_row_offsets(host_span const data, row_ctx.device_ptr(), num_blocks * sizeof(uint64_t), cudaMemcpyDeviceToHost, - stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.value())); + stream.synchronize(); + size_t rows_out_of_range = 0; for (uint32_t i = 0; i < num_blocks; i++) { rows_out_of_range += row_ctx[i]; } if (rows_out_of_range != 0) { @@ -514,8 +518,9 @@ void reader::impl::gather_row_offsets(host_span const data, row_offsets_.data().get() + header_row_index, 2 * sizeof(uint64_t), cudaMemcpyDeviceToHost, - stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.value())); + stream.synchronize(); + const auto header_start = buffer_pos + row_ctx[0]; const auto header_end = buffer_pos + row_ctx[1]; CUDF_EXPECTS(header_start <= header_end && header_end <= data.size(), @@ -529,7 +534,7 @@ void reader::impl::gather_row_offsets(host_span const data, if (num_rows >= 0) { row_offsets_.resize(std::min(row_offsets_.size(), num_rows + 1)); } } -std::vector reader::impl::gather_column_types(cudaStream_t stream) +std::vector reader::impl::gather_column_types(rmm::cuda_stream_view stream) { std::vector dtypes; @@ -542,7 +547,7 @@ std::vector reader::impl::gather_column_types(cudaStream_t stream) auto column_stats = cudf::io::csv::gpu::detect_column_types( opts.view(), data_, d_column_flags_, row_offsets_, 
num_active_cols_, stream); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); for (int col = 0; col < num_active_cols_; col++) { unsigned long long int_count_total = column_stats[col].big_int_count + @@ -648,7 +653,7 @@ std::vector reader::impl::gather_column_types(cudaStream_t stream) } std::vector reader::impl::decode_data(std::vector const &column_types, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // Alloc output; columns' data memory is still expected for empty dataframe std::vector out_buffers; @@ -687,7 +692,7 @@ std::vector reader::impl::decode_data(std::vector cons cudf::io::csv::gpu::decode_row_column_data( opts.view(), data_, d_column_flags_, row_offsets_, d_dtypes, d_data, d_valid, stream); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); for (int i = 0; i < num_active_cols_; ++i) { out_buffers[i].null_count() = UNKNOWN_NULL_COUNT; } @@ -790,7 +795,7 @@ reader::reader(std::vector> &&sources, reader::~reader() = default; // Forward to implementation -table_with_metadata reader::read(cudaStream_t stream) { return _impl->read(stream); } +table_with_metadata reader::read(rmm::cuda_stream_view stream) { return _impl->read(stream); } } // namespace csv } // namespace detail diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp index 29cc8bab6fe..67246165be0 100644 --- a/cpp/src/io/csv/reader_impl.hpp +++ b/cpp/src/io/csv/reader_impl.hpp @@ -28,6 +28,8 @@ #include #include +#include + #include #include #include @@ -86,7 +88,7 @@ class reader::impl { * * @return The set of columns along with metadata */ - table_with_metadata read(cudaStream_t stream); + table_with_metadata read(rmm::cuda_stream_view stream); private: /** @@ -110,7 +112,7 @@ class reader::impl { size_t skip_rows, int64_t num_rows, bool load_whole_file, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Find the start position of the first data row @@ -128,7 +130,7 @@ class reader::impl { * * @return 
`std::vector` List of column types */ - std::vector gather_column_types(cudaStream_t stream); + std::vector gather_column_types(rmm::cuda_stream_view stream); /** * @brief Converts the row-column data and outputs to column bufferrs. @@ -139,7 +141,7 @@ class reader::impl { * @return list of column buffers of decoded data, or ptr/size in the case of strings. */ std::vector decode_data(std::vector const &column_types, - cudaStream_t stream); + rmm::cuda_stream_view stream); private: rmm::mr::device_memory_resource *mr_ = nullptr; diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index d833acb46ef..e3ad19e0445 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -21,27 +21,23 @@ #include "writer_impl.hpp" +#include + #include #include - -#include - +#include +#include #include #include #include #include - -#include +#include #include +#include -#include - -#include -#include -#include -#include -#include -#include +#include +#include +#include #include #include @@ -50,11 +46,12 @@ #include #include -#include -#include - -#include -#include +#include +#include +#include +#include +#include +#include namespace cudf { namespace io { @@ -217,7 +214,7 @@ struct column_to_strings_fn { explicit column_to_strings_fn(csv_writer_options const& options, rmm::mr::device_memory_resource* mr = nullptr, - cudaStream_t stream = nullptr) + rmm::cuda_stream_view stream = nullptr) : options_(options), mr_(mr), stream_(stream) { } @@ -268,7 +265,7 @@ struct column_to_strings_fn { string_scalar delimiter{std::string{options_.get_inter_column_delimiter()}, true, stream_}; predicate_special_chars pred{delimiter.value(stream_)}; - return modify_strings(column_v, mr_, stream_, pred); + return modify_strings(column_v, stream_, mr_, pred); } // ints: @@ -354,7 +351,7 @@ struct column_to_strings_fn { private: csv_writer_options const& options_; rmm::mr::device_memory_resource* mr_; - cudaStream_t stream_; + rmm::cuda_stream_view stream_; }; } 
// unnamed namespace @@ -380,7 +377,7 @@ writer::impl::impl(std::unique_ptr sink, // void writer::impl::write_chunked_begin(table_view const& table, const table_metadata* metadata, - cudaStream_t stream) + rmm::cuda_stream_view stream) { if ((metadata != nullptr) && (options_.is_enabled_include_header())) { CUDF_EXPECTS(metadata->column_names.size() == static_cast(table.num_columns()), @@ -402,7 +399,7 @@ void writer::impl::write_chunked_begin(table_view const& table, void writer::impl::write_chunked(strings_column_view const& str_column_view, const table_metadata* metadata, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // algorithm outline: // @@ -442,9 +439,9 @@ void writer::impl::write_chunked(strings_column_view const& str_column_view, ptr_all_bytes, total_num_bytes * sizeof(char), cudaMemcpyDeviceToHost, - stream)); + stream.value())); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); // host algorithm call, where the underlying call // is also host_write taking a host buffer; @@ -459,7 +456,7 @@ void writer::impl::write_chunked(strings_column_view const& str_column_view, void writer::impl::write(table_view const& table, const table_metadata* metadata, - cudaStream_t stream) + rmm::cuda_stream_view stream) { CUDF_EXPECTS(table.num_columns() > 0, "Empty table."); @@ -495,15 +492,16 @@ void writer::impl::write(table_view const& table, splits.resize(n_chunks); rmm::device_vector d_splits(n_chunks, n_rows_per_chunk); - thrust::inclusive_scan(exec->on(stream), d_splits.begin(), d_splits.end(), d_splits.begin()); + thrust::inclusive_scan( + exec->on(stream.value()), d_splits.begin(), d_splits.end(), d_splits.begin()); CUDA_TRY(cudaMemcpyAsync(splits.data(), d_splits.data().get(), n_chunks * sizeof(size_type), cudaMemcpyDeviceToHost, - stream)); + stream.value())); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); // split table_view into chunks: // @@ -548,7 +546,9 @@ void writer::impl::write(table_view const& table, 
write_chunked_end(table, metadata, stream); } -void writer::write(table_view const& table, const table_metadata* metadata, cudaStream_t stream) +void writer::write(table_view const& table, + const table_metadata* metadata, + rmm::cuda_stream_view stream) { _impl->write(table, metadata, stream); } diff --git a/cpp/src/io/csv/writer_impl.hpp b/cpp/src/io/csv/writer_impl.hpp index 24ede6a8fe8..f3d2f999070 100644 --- a/cpp/src/io/csv/writer_impl.hpp +++ b/cpp/src/io/csv/writer_impl.hpp @@ -28,6 +28,8 @@ #include #include +#include + #include #include #include @@ -65,7 +67,7 @@ class writer::impl { **/ void write(table_view const& table, const table_metadata* metadata = nullptr, - cudaStream_t stream = nullptr); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Write the header of a CSV format. @@ -76,7 +78,7 @@ class writer::impl { **/ void write_chunked_begin(table_view const& table, const table_metadata* metadata = nullptr, - cudaStream_t stream = nullptr); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Write dataset to CSV format without header. @@ -87,7 +89,7 @@ class writer::impl { **/ void write_chunked(strings_column_view const& strings_column, const table_metadata* metadata = nullptr, - cudaStream_t stream = nullptr); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Write footer of CSV format (typically, empty). 
@@ -98,7 +100,7 @@ class writer::impl { **/ void write_chunked_end(table_view const& table, const table_metadata* metadata = nullptr, - cudaStream_t stream = nullptr) + rmm::cuda_stream_view stream = rmm::cuda_stream_default) { // purposely no-op (for now); } @@ -111,7 +113,7 @@ class writer::impl { std::unique_ptr pandas_format_durations( column_view const& durations, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace csv diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index 5f91b2cb2ce..a8e65216e5d 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -30,6 +30,7 @@ #include #include +#include #include #include @@ -795,7 +796,7 @@ void convert_json_to_columns(parse_options_view const &opts, device_span const output_columns, device_span const valid_fields, device_span num_valid_fields, - cudaStream_t stream) + rmm::cuda_stream_view stream) { int block_size; int min_grid_size; @@ -804,7 +805,7 @@ void convert_json_to_columns(parse_options_view const &opts, const int grid_size = (row_offsets.size() + block_size - 1) / block_size; - convert_data_to_columns_kernel<<>>( + convert_data_to_columns_kernel<<>>( opts, data, row_offsets, column_types, col_map, output_columns, valid_fields, num_valid_fields); CUDA_TRY(cudaGetLastError()); @@ -821,7 +822,7 @@ std::vector detect_data_types( bool do_set_null_count, int num_columns, col_map_type *col_map, - cudaStream_t stream) + rmm::cuda_stream_view stream) { int block_size; int min_grid_size; @@ -834,7 +835,7 @@ std::vector detect_data_types( if (do_set_null_count) { // Set the null count to the row count (all fields assumes to be null). 
thrust::for_each( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), d_column_infos.begin(), d_column_infos.end(), [num_records = row_offsets.size()] __device__(auto &info) { info.null_count = num_records; }); @@ -843,7 +844,7 @@ std::vector detect_data_types( // Calculate actual block count to use based on records count const int grid_size = (row_offsets.size() + block_size - 1) / block_size; - detect_data_types_kernel<<>>( + detect_data_types_kernel<<>>( options, data, row_offsets, col_map, num_columns, d_column_infos); CUDA_TRY(cudaGetLastError()); @@ -863,7 +864,7 @@ void collect_keys_info(parse_options_view const &options, device_span const row_offsets, unsigned long long int *keys_cnt, thrust::optional keys_info, - cudaStream_t stream) + rmm::cuda_stream_view stream) { int block_size; int min_grid_size; @@ -873,7 +874,7 @@ void collect_keys_info(parse_options_view const &options, // Calculate actual block count to use based on records count const int grid_size = (row_offsets.size() + block_size - 1) / block_size; - collect_keys_info_kernel<<>>( + collect_keys_info_kernel<<>>( options, data, row_offsets, keys_cnt, keys_info); CUDA_TRY(cudaGetLastError()); diff --git a/cpp/src/io/json/json_gpu.h b/cpp/src/io/json/json_gpu.h index de7dd21b7f3..cbab408d2f1 100644 --- a/cpp/src/io/json/json_gpu.h +++ b/cpp/src/io/json/json_gpu.h @@ -25,6 +25,8 @@ #include #include +#include + #include using cudf::detail::device_span; @@ -57,7 +59,7 @@ void convert_json_to_columns(parse_options_view const &options, device_span output_columns, device_span valid_fields, device_span num_valid_fields, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream); /** * @brief Process a buffer of data and determine information about the column types within. 
@@ -79,7 +81,7 @@ std::vector detect_data_types( bool do_set_null_count, int num_columns, col_map_type *col_map, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream); /** * @brief Collects information about JSON object keys in the file. @@ -96,7 +98,7 @@ void collect_keys_info(parse_options_view const &options, device_span row_offsets, unsigned long long int *keys_cnt, thrust::optional keys_info, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream); } // namespace gpu } // namespace json diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 3246f7e9ed0..121f0825228 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -35,6 +35,7 @@ #include #include +#include #include #include @@ -108,11 +109,12 @@ std::unique_ptr
aggregate_keys_info(std::unique_ptr
info) /** * @brief Initializes the (key hash -> column index) hash map. */ -col_map_ptr_type create_col_names_hash_map(column_view column_name_hashes, cudaStream_t stream) +col_map_ptr_type create_col_names_hash_map(column_view column_name_hashes, + rmm::cuda_stream_view stream) { - auto key_col_map{col_map_type::create(column_name_hashes.size())}; + auto key_col_map{col_map_type::create(column_name_hashes.size(), stream)}; auto const column_data = column_name_hashes.data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), column_name_hashes.size(), [map = *key_col_map, column_data] __device__(size_type idx) mutable { @@ -136,7 +138,7 @@ col_map_ptr_type create_col_names_hash_map(column_view column_name_hashes, cudaS std::unique_ptr
create_json_keys_info_table(const parse_options_view &options, device_span const data, device_span const row_offsets, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // Count keys rmm::device_scalar key_counter(0, stream); @@ -166,7 +168,7 @@ std::unique_ptr
create_json_keys_info_table(const parse_options_view &opt */ std::vector create_key_strings(char const *h_data, table_view sorted_info, - cudaStream_t stream) + rmm::cuda_stream_view stream) { auto const num_cols = sorted_info.num_rows(); std::vector h_offsets(num_cols); @@ -174,14 +176,14 @@ std::vector create_key_strings(char const *h_data, sorted_info.column(0).data(), sizeof(uint64_t) * num_cols, cudaMemcpyDefault, - stream); + stream.value()); std::vector h_lens(num_cols); cudaMemcpyAsync(h_lens.data(), sorted_info.column(1).data(), sizeof(uint16_t) * num_cols, cudaMemcpyDefault, - stream); + stream.value()); std::vector names(num_cols); std::transform(h_offsets.cbegin(), @@ -206,7 +208,7 @@ auto sort_keys_info_by_offset(std::unique_ptr
info) * @return Names of JSON object keys in the file */ std::pair, col_map_ptr_type> reader::impl::get_json_object_keys_hashes( - cudaStream_t stream) + rmm::cuda_stream_view stream) { auto info = create_json_keys_info_table( opts_.view(), @@ -259,7 +261,7 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size) * Sets the uncomp_data_ and uncomp_size_ data members * Loads the data into device memory if byte range parameters are not used */ -void reader::impl::decompress_input(cudaStream_t stream) +void reader::impl::decompress_input(rmm::cuda_stream_view stream) { const auto compression_type = infer_compression_type(options_.get_compression(), @@ -289,7 +291,7 @@ void reader::impl::decompress_input(cudaStream_t stream) * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -void reader::impl::set_record_starts(cudaStream_t stream) +void reader::impl::set_record_starts(rmm::cuda_stream_view stream) { std::vector chars_to_count{'\n'}; // Currently, ignoring lineterminations within quotes is handled by recording the records of both, @@ -310,7 +312,7 @@ void reader::impl::set_record_starts(cudaStream_t stream) // Manually adding an extra row to account for the first row in the file if (byte_range_offset_ == 0) { find_result_ptr++; - CUDA_TRY(cudaMemsetAsync(rec_starts_.data().get(), 0ull, sizeof(uint64_t), stream)); + CUDA_TRY(cudaMemsetAsync(rec_starts_.data().get(), 0ull, sizeof(uint64_t), stream.value())); } std::vector chars_to_find{'\n'}; @@ -325,7 +327,7 @@ void reader::impl::set_record_starts(cudaStream_t stream) // Previous call stores the record pinput_file.typeositions as encountered by all threads // Sort the record positions as subsequent processing may require filtering // certain rows or other processing on specific records - thrust::sort(rmm::exec_policy()->on(stream), rec_starts_.begin(), rec_starts_.end()); + thrust::sort(rmm::exec_policy()->on(stream.value()), rec_starts_.begin(), 
rec_starts_.end()); auto filtered_count = prefilter_count; if (allow_newlines_in_strings_) { @@ -343,7 +345,7 @@ void reader::impl::set_record_starts(cudaStream_t stream) } rec_starts_ = h_rec_starts; - thrust::sort(rmm::exec_policy()->on(stream), rec_starts_.begin(), rec_starts_.end()); + thrust::sort(rmm::exec_policy()->on(stream.value()), rec_starts_.begin(), rec_starts_.end()); } // Exclude the ending newline as it does not precede a record start @@ -360,7 +362,7 @@ void reader::impl::set_record_starts(cudaStream_t stream) * Also updates the array of record starts to match the device data offset. * */ -void reader::impl::upload_data_to_device(cudaStream_t stream) +void reader::impl::upload_data_to_device(rmm::cuda_stream_view stream) { size_t start_offset = 0; size_t end_offset = uncomp_size_; @@ -382,7 +384,7 @@ void reader::impl::upload_data_to_device(cudaStream_t stream) // Adjust row start positions to account for the data subcopy start_offset = h_rec_starts.front(); rec_starts_.resize(h_rec_starts.size()); - thrust::transform(rmm::exec_policy()->on(stream), + thrust::transform(rmm::exec_policy()->on(stream.value()), rec_starts_.begin(), rec_starts_.end(), thrust::make_constant_iterator(start_offset), @@ -405,7 +407,7 @@ void reader::impl::upload_data_to_device(cudaStream_t stream) * * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
*/ -void reader::impl::set_column_names(cudaStream_t stream) +void reader::impl::set_column_names(rmm::cuda_stream_view stream) { // If file only contains one row, use the file size for the row size uint64_t first_row_len = data_.size() / sizeof(char); @@ -415,12 +417,15 @@ void reader::impl::set_column_names(cudaStream_t stream) rec_starts_.data().get() + 1, sizeof(uint64_t), cudaMemcpyDeviceToHost, - stream)); + stream.value())); } std::vector first_row(first_row_len); - CUDA_TRY(cudaMemcpyAsync( - first_row.data(), data_.data(), first_row_len * sizeof(char), cudaMemcpyDeviceToHost, stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + CUDA_TRY(cudaMemcpyAsync(first_row.data(), + data_.data(), + first_row_len * sizeof(char), + cudaMemcpyDeviceToHost, + stream.value())); + stream.synchronize(); // Determine the row format between: // JSON array - [val1, val2, ...] and @@ -459,7 +464,7 @@ void reader::impl::set_column_names(cudaStream_t stream) * * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
*/ -void reader::impl::set_data_types(cudaStream_t stream) +void reader::impl::set_data_types(rmm::cuda_stream_view stream) { auto const dtype = options_.get_dtypes(); if (!dtype.empty()) { @@ -555,7 +560,7 @@ void reader::impl::set_data_types(cudaStream_t stream) * * @return table_with_metadata struct */ -table_with_metadata reader::impl::convert_data_to_table(cudaStream_t stream) +table_with_metadata reader::impl::convert_data_to_table(rmm::cuda_stream_view stream) { const auto num_columns = dtypes_.size(); const auto num_records = rec_starts_.size(); @@ -592,8 +597,7 @@ table_with_metadata reader::impl::convert_data_to_table(cudaStream_t stream) d_valid_counts, stream); - CUDA_TRY(cudaStreamSynchronize(stream)); - CUDA_TRY(cudaGetLastError()); + stream.synchronize(); // postprocess columns auto target = make_strings_column( @@ -605,7 +609,7 @@ table_with_metadata reader::impl::convert_data_to_table(cudaStream_t stream) for (size_t i = 0; i < num_columns; ++i) { out_buffers[i].null_count() = num_records - h_valid_counts[i]; - auto out_column = make_column(out_buffers[i], stream, mr_); + auto out_column = make_column(out_buffers[i], nullptr, stream, mr_); if (out_column->type().id() == type_id::STRING) { // Need to remove escape character in case of '\"' and '\\' out_columns.emplace_back(cudf::strings::detail::replace( @@ -624,7 +628,7 @@ reader::impl::impl(std::unique_ptr source, std::string filepath, json_reader_options const &options, rmm::mr::device_memory_resource *mr) - : source_(std::move(source)), filepath_(filepath), options_(options), mr_(mr) + : options_(options), mr_(mr), source_(std::move(source)), filepath_(filepath) { CUDF_EXPECTS(options_.is_enabled_lines(), "Only JSON Lines format is currently supported.\n"); @@ -644,7 +648,8 @@ reader::impl::impl(std::unique_ptr source, * * @return Table and its metadata */ -table_with_metadata reader::impl::read(json_reader_options const &options, cudaStream_t stream) +table_with_metadata 
reader::impl::read(json_reader_options const &options, + rmm::cuda_stream_view stream) { auto range_offset = options.get_byte_range_offset(); auto range_size = options.get_byte_range_size(); @@ -695,7 +700,7 @@ reader::reader(std::vector> &&sources, reader::~reader() = default; // Forward to implementation -table_with_metadata reader::read(json_reader_options const &options, cudaStream_t stream) +table_with_metadata reader::read(json_reader_options const &options, rmm::cuda_stream_view stream) { return table_with_metadata{_impl->read(options, stream)}; } diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 7bea0b2cf85..ffd3dc58fe7 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -24,16 +24,19 @@ #include "json.h" #include "json_gpu.h" -#include -#include -#include - #include +#include + #include #include #include +#include +#include + +#include + namespace cudf { namespace io { namespace detail { @@ -117,14 +120,14 @@ class reader::impl { * @return Array of keys and a map that maps their hash values to column indices */ std::pair, col_map_ptr_type> get_json_object_keys_hashes( - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Decompress the input data, if needed * * Sets the uncomp_data_ and uncomp_size_ data members */ - void decompress_input(cudaStream_t stream); + void decompress_input(rmm::cuda_stream_view stream); /** * @brief Finds all record starts in the file and stores them in rec_starts_ @@ -133,7 +136,7 @@ class reader::impl { * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ - void set_record_starts(cudaStream_t stream); + void set_record_starts(rmm::cuda_stream_view stream); /** * @brief Uploads the relevant segment of the input json data onto the GPU. 
@@ -142,7 +145,7 @@ class reader::impl { * Only rows that need to be parsed are copied, based on the byte range * Also updates the array of record starts to match the device data offset. */ - void upload_data_to_device(cudaStream_t stream); + void upload_data_to_device(rmm::cuda_stream_view stream); /** * @brief Parse the first row to set the column name @@ -151,7 +154,7 @@ class reader::impl { * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ - void set_column_names(cudaStream_t stream); + void set_column_names(rmm::cuda_stream_view stream); /** * @brief Set the data type array data member @@ -160,7 +163,7 @@ class reader::impl { * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ - void set_data_types(cudaStream_t stream); + void set_data_types(rmm::cuda_stream_view stream); /** * @brief Parse the input data and store results a table @@ -169,7 +172,7 @@ class reader::impl { * * @return Table and its metadata */ - table_with_metadata convert_data_to_table(cudaStream_t stream); + table_with_metadata convert_data_to_table(rmm::cuda_stream_view stream); public: /** @@ -188,7 +191,7 @@ class reader::impl { * * @return Table and its metadata */ - table_with_metadata read(json_reader_options const &options, cudaStream_t stream); + table_with_metadata read(json_reader_options const &options, rmm::cuda_stream_view stream); }; } // namespace json diff --git a/cpp/src/io/orc/chunked_state.hpp b/cpp/src/io/orc/chunked_state.hpp index c72a8485384..71bdb473f41 100644 --- a/cpp/src/io/orc/chunked_state.hpp +++ b/cpp/src/io/orc/chunked_state.hpp @@ -29,6 +29,8 @@ #include #include +#include + #include #include #include @@ -43,7 +45,7 @@ struct orc_chunked_state { /// The writer to be used std::unique_ptr wp; /// Cuda stream to be used - cudaStream_t stream; + rmm::cuda_stream_view stream; /// Overall file metadata. 
Filled in during the process and written during write_chunked_end() cudf::io::orc::FileFooter ff; cudf::io::orc::Metadata md; diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index 7f10097db67..4df1e43dcce 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -13,11 +13,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include + #include "orc_common.h" #include "orc_gpu.h" +#include + #include +#include #include #include @@ -436,11 +439,11 @@ __global__ void __launch_bounds__(block_size) void InitDictionaryIndices(DictionaryChunk *chunks, uint32_t num_columns, uint32_t num_rowgroups, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_block(512, 1); // 512 threads per chunk dim3 dim_grid(num_columns, num_rowgroups); - gpuInitDictionaryIndices<512><<>>(chunks, num_columns); + gpuInitDictionaryIndices<512><<>>(chunks, num_columns); } /** @@ -453,8 +456,6 @@ void InitDictionaryIndices(DictionaryChunk *chunks, * @param[in] num_rowgroups Number of row groups * @param[in] num_columns Number of columns * @param[in] stream CUDA stream to use, default 0 - * - * @return cudaSuccess if successful, a CUDA error code otherwise */ void BuildStripeDictionaries(StripeDictionary *stripes, StripeDictionary *stripes_host, @@ -462,11 +463,11 @@ void BuildStripeDictionaries(StripeDictionary *stripes, uint32_t num_stripes, uint32_t num_rowgroups, uint32_t num_columns, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_block(1024, 1); // 1024 threads per chunk dim3 dim_grid_build(num_columns, num_stripes); - gpuCompactChunkDictionaries<<>>( + gpuCompactChunkDictionaries<<>>( stripes, chunks, num_columns); for (uint32_t i = 0; i < num_stripes * num_columns; i++) { if (stripes_host[i].dict_data != nullptr) { @@ -474,7 +475,7 @@ void BuildStripeDictionaries(StripeDictionary *stripes, const nvstrdesc_s *str_data = 
static_cast(stripes_host[i].column_data_base); // NOTE: Requires the --expt-extended-lambda nvcc flag - thrust::sort(rmm::exec_policy(stream)->on(stream), + thrust::sort(rmm::exec_policy(stream)->on(stream.value()), p, p + stripes_host[i].num_strings, [str_data] __device__(const uint32_t &lhs, const uint32_t &rhs) { @@ -485,7 +486,8 @@ void BuildStripeDictionaries(StripeDictionary *stripes, }); } } - gpuBuildStripeDictionaries<1024><<>>(stripes, num_columns); + gpuBuildStripeDictionaries<1024> + <<>>(stripes, num_columns); } } // namespace gpu diff --git a/cpp/src/io/orc/orc_gpu.h b/cpp/src/io/orc/orc_gpu.h index 65de710f068..de35d4a66b9 100644 --- a/cpp/src/io/orc/orc_gpu.h +++ b/cpp/src/io/orc/orc_gpu.h @@ -16,10 +16,13 @@ #pragma once +#include "timezone.cuh" + #include +#include #include -#include "timezone.cuh" +#include namespace cudf { namespace io { @@ -192,8 +195,8 @@ struct StripeDictionary { void ParseCompressedStripeData(CompressedStreamInfo *strm_info, int32_t num_streams, uint32_t compression_block_size, - uint32_t log2maxcr = 24, - cudaStream_t stream = (cudaStream_t)0); + uint32_t log2maxcr = 24, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernel for re-assembling decompressed blocks into a single contiguous block @@ -204,7 +207,7 @@ void ParseCompressedStripeData(CompressedStreamInfo *strm_info, */ void PostDecompressionReassemble(CompressedStreamInfo *strm_info, int32_t num_streams, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernel for constructing rowgroup from index streams @@ -224,7 +227,7 @@ void ParseRowGroupIndex(RowGroup *row_groups, uint32_t num_stripes, uint32_t num_rowgroups, uint32_t rowidx_stride, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernel for decoding NULLs and building string dictionary index tables @@ -241,9 +244,9 @@ void 
DecodeNullsAndStringDictionaries(ColumnDesc *chunks, DictionaryEntry *global_dictionary, uint32_t num_columns, uint32_t num_stripes, - size_t max_rows = ~0, - size_t first_row = 0, - cudaStream_t stream = (cudaStream_t)0); + size_t max_rows = ~0, + size_t first_row = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernel for decoding column data @@ -271,7 +274,7 @@ void DecodeOrcColumnData(ColumnDesc *chunks, const RowGroup *row_groups = 0, uint32_t num_rowgroups = 0, uint32_t rowidx_stride = 0, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernel for encoding column data @@ -284,7 +287,7 @@ void DecodeOrcColumnData(ColumnDesc *chunks, void EncodeOrcColumnData(EncChunk *chunks, uint32_t num_columns, uint32_t num_rowgroups, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernel for encoding column dictionaries @@ -301,7 +304,7 @@ void EncodeStripeDictionaries(StripeDictionary *stripes, uint32_t num_string_columns, uint32_t num_columns, uint32_t num_stripes, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernel for compacting chunked column data prior to compression @@ -316,7 +319,7 @@ void CompactOrcDataStreams(StripeStream *strm_desc, EncChunk *chunks, uint32_t num_stripe_streams, uint32_t num_columns, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernel(s) for compressing data streams @@ -340,7 +343,7 @@ void CompressOrcDataStreams(uint8_t *compressed_data, uint32_t num_compressed_blocks, CompressionKind compression, uint32_t comp_blk_size, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernel for initializing dictionary chunks @@ -353,7 +356,7 @@ 
void CompressOrcDataStreams(uint8_t *compressed_data, void InitDictionaryIndices(DictionaryChunk *chunks, uint32_t num_columns, uint32_t num_rowgroups, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernel for building stripe dictionaries @@ -372,7 +375,7 @@ void BuildStripeDictionaries(StripeDictionary *stripes_dev, uint32_t num_stripes, uint32_t num_rowgroups, uint32_t num_columns, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernels to initialize statistics collection @@ -389,7 +392,7 @@ void orc_init_statistics_groups(statistics_group *groups, uint32_t num_columns, uint32_t num_rowgroups, uint32_t row_index_stride, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernels to return statistics buffer offsets and sizes @@ -402,7 +405,7 @@ void orc_init_statistics_groups(statistics_group *groups, void orc_init_statistics_buffersize(statistics_merge_group *groups, const statistics_chunk *chunks, uint32_t statistics_count, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernel to encode statistics in ORC protobuf format @@ -416,7 +419,7 @@ void orc_encode_statistics(uint8_t *blob_bfr, statistics_merge_group *groups, const statistics_chunk *chunks, uint32_t statistics_count, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); } // namespace gpu } // namespace orc diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index c57725b21fb..a9f03aef095 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -29,6 +29,7 @@ #include #include +#include #include #include @@ -391,7 +392,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data( size_t num_stripes, 
rmm::device_vector &row_groups, size_t row_index_stride, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // Parse the columns' compressed info hostdevice_vector compinfo(0, stream_info.size(), stream); @@ -404,7 +405,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data( compinfo.host_ptr(), compinfo.memory_size(), cudaMemcpyHostToDevice, - stream)); + stream.value())); gpu::ParseCompressedStripeData(compinfo.device_ptr(), compinfo.size(), decompressor->GetBlockSize(), @@ -414,8 +415,8 @@ rmm::device_buffer reader::impl::decompress_stripe_data( compinfo.device_ptr(), compinfo.memory_size(), cudaMemcpyDeviceToHost, - stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.value())); + stream.synchronize(); // Count the exact number of compressed blocks size_t num_compressed_blocks = 0; @@ -453,7 +454,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data( compinfo.host_ptr(), compinfo.memory_size(), cudaMemcpyHostToDevice, - stream)); + stream.value())); gpu::ParseCompressedStripeData(compinfo.device_ptr(), compinfo.size(), decompressor->GetBlockSize(), @@ -489,8 +490,8 @@ rmm::device_buffer reader::impl::decompress_stripe_data( compinfo.device_ptr(), compinfo.memory_size(), cudaMemcpyDeviceToHost, - stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.value())); + stream.synchronize(); const size_t num_columns = chunks.size() / num_stripes; @@ -511,7 +512,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data( chunks.host_ptr(), chunks.memory_size(), cudaMemcpyHostToDevice, - stream)); + stream.value())); gpu::ParseRowGroupIndex(row_groups.data().get(), compinfo.device_ptr(), chunks.device_ptr(), @@ -533,7 +534,7 @@ void reader::impl::decode_stream_data(hostdevice_vector &chunks const rmm::device_vector &row_groups, size_t row_index_stride, std::vector &out_buffers, - cudaStream_t stream) + rmm::cuda_stream_view stream) { const auto num_columns = out_buffers.size(); const auto num_stripes = chunks.size() / 
out_buffers.size(); @@ -550,8 +551,11 @@ void reader::impl::decode_stream_data(hostdevice_vector &chunks // Allocate global dictionary for deserializing rmm::device_vector global_dict(num_dicts); - CUDA_TRY(cudaMemcpyAsync( - chunks.device_ptr(), chunks.host_ptr(), chunks.memory_size(), cudaMemcpyHostToDevice, stream)); + CUDA_TRY(cudaMemcpyAsync(chunks.device_ptr(), + chunks.host_ptr(), + chunks.memory_size(), + cudaMemcpyHostToDevice, + stream.value())); gpu::DecodeNullsAndStringDictionaries(chunks.device_ptr(), global_dict.data().get(), num_columns, @@ -570,9 +574,12 @@ void reader::impl::decode_stream_data(hostdevice_vector &chunks row_groups.size() / num_columns, row_index_stride, stream); - CUDA_TRY(cudaMemcpyAsync( - chunks.host_ptr(), chunks.device_ptr(), chunks.memory_size(), cudaMemcpyDeviceToHost, stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + CUDA_TRY(cudaMemcpyAsync(chunks.host_ptr(), + chunks.device_ptr(), + chunks.memory_size(), + cudaMemcpyDeviceToHost, + stream.value())); + stream.synchronize(); for (size_t i = 0; i < num_stripes; ++i) { for (size_t j = 0; j < num_columns; ++j) { @@ -584,7 +591,7 @@ void reader::impl::decode_stream_data(hostdevice_vector &chunks reader::impl::impl(std::unique_ptr source, orc_reader_options const &options, rmm::mr::device_memory_resource *mr) - : _source(std::move(source)), _mr(mr) + : _mr(mr), _source(std::move(source)) { // Open and parse the source dataset metadata _metadata = std::make_unique(_source.get()); @@ -611,7 +618,7 @@ reader::impl::impl(std::unique_ptr source, table_with_metadata reader::impl::read(size_type skip_rows, size_type num_rows, const std::vector &stripes, - cudaStream_t stream) + rmm::cuda_stream_view stream) { std::vector> out_columns; table_metadata out_metadata; @@ -698,8 +705,9 @@ table_with_metadata reader::impl::read(size_type skip_rows, stream_count++; } const auto buffer = _source->host_read(offset, len); - CUDA_TRY(cudaMemcpyAsync(d_dst, buffer->data(), len, 
cudaMemcpyHostToDevice, stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + CUDA_TRY( + cudaMemcpyAsync(d_dst, buffer->data(), len, cudaMemcpyHostToDevice, stream.value())); + stream.synchronize(); } // Update chunks to reference streams pointers @@ -758,7 +766,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, chunks.host_ptr(), chunks.memory_size(), cudaMemcpyHostToDevice, - stream)); + stream.value())); gpu::ParseRowGroupIndex(row_groups.data().get(), nullptr, chunks.device_ptr(), @@ -799,7 +807,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, stream); for (size_t i = 0; i < column_types.size(); ++i) { - out_columns.emplace_back(make_column(out_buffers[i], stream, _mr)); + out_columns.emplace_back(make_column(out_buffers[i], nullptr, stream, _mr)); } } } @@ -839,7 +847,7 @@ reader::reader(std::vector> &&sources, reader::~reader() = default; // Forward to implementation -table_with_metadata reader::read(orc_reader_options const &options, cudaStream_t stream) +table_with_metadata reader::read(orc_reader_options const &options, rmm::cuda_stream_view stream) { return _impl->read( options.get_skip_rows(), options.get_num_rows(), options.get_stripes(), stream); diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 4fbea95664a..4684dbdcf96 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -26,6 +26,8 @@ #include #include +#include + #include #include #include @@ -41,7 +43,7 @@ using namespace cudf::io; // Forward declarations class metadata; namespace { -class orc_stream_info; +struct orc_stream_info; } /** @@ -73,7 +75,7 @@ class reader::impl { table_with_metadata read(size_type skip_rows, size_type num_rows, const std::vector &stripes, - cudaStream_t stream); + rmm::cuda_stream_view stream); private: /** @@ -97,7 +99,7 @@ class reader::impl { size_t num_stripes, rmm::device_vector &row_groups, size_t row_index_stride, - cudaStream_t stream); + rmm::cuda_stream_view 
stream); /** * @brief Converts the stripe column data and outputs to columns @@ -120,7 +122,7 @@ class reader::impl { const rmm::device_vector &row_groups, size_t row_index_stride, std::vector &out_buffers, - cudaStream_t stream); + rmm::cuda_stream_view stream); private: rmm::mr::device_memory_resource *_mr = nullptr; diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index e7aaba2a7cf..a987c171392 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -13,10 +13,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include + #include "orc_common.h" #include "orc_gpu.h" +#include + +#include + namespace cudf { namespace io { namespace orc { @@ -384,11 +388,11 @@ void orc_init_statistics_groups(statistics_group *groups, uint32_t num_columns, uint32_t num_rowgroups, uint32_t row_index_stride, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_grid((num_rowgroups + init_groups_per_block - 1) / init_groups_per_block, num_columns); dim3 dim_block(init_threads_per_group, init_groups_per_block); - gpu_init_statistics_groups<<>>( + gpu_init_statistics_groups<<>>( groups, cols, num_columns, num_rowgroups, row_index_stride); } @@ -403,10 +407,11 @@ void orc_init_statistics_groups(statistics_group *groups, void orc_init_statistics_buffersize(statistics_merge_group *groups, const statistics_chunk *chunks, uint32_t statistics_count, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_block(buffersize_reduction_dim, buffersize_reduction_dim); - gpu_init_statistics_buffersize<<<1, dim_block, 0, stream>>>(groups, chunks, statistics_count); + gpu_init_statistics_buffersize<<<1, dim_block, 0, stream.value()>>>( + groups, chunks, statistics_count); } /** @@ -421,12 +426,12 @@ void orc_encode_statistics(uint8_t *blob_bfr, statistics_merge_group *groups, const statistics_chunk *chunks, uint32_t statistics_count, - cudaStream_t stream) + 
rmm::cuda_stream_view stream) { unsigned int num_blocks = (statistics_count + encode_chunks_per_block - 1) / encode_chunks_per_block; dim3 dim_block(encode_threads_per_chunk, encode_chunks_per_block); - gpu_encode_statistics<<>>( + gpu_encode_statistics<<>>( blob_bfr, groups, chunks, statistics_count); } diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 47192172255..cc456978e7a 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -14,11 +14,15 @@ * limitations under the License. */ -#include -#include #include "orc_common.h" #include "orc_gpu.h" +#include + +#include + +#include + #define LOG2_BYTESTREAM_BFRSZ 13 // Must be able to handle 512x 8-byte values #define BYTESTREAM_BFRSZ (1 << LOG2_BYTESTREAM_BFRSZ) @@ -1778,11 +1782,11 @@ void __host__ DecodeNullsAndStringDictionaries(ColumnDesc *chunks, uint32_t num_stripes, size_t max_num_rows, size_t first_row, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_block(NTHREADS, 1); dim3 dim_grid(num_columns, num_stripes * 2); // 1024 threads per chunk - gpuDecodeNullsAndStringDictionaries<<>>( + gpuDecodeNullsAndStringDictionaries<<>>( chunks, global_dictionary, num_columns, num_stripes, max_num_rows, first_row); } @@ -1811,21 +1815,21 @@ void __host__ DecodeOrcColumnData(ColumnDesc *chunks, const RowGroup *row_groups, uint32_t num_rowgroups, uint32_t rowidx_stride, - cudaStream_t stream) + rmm::cuda_stream_view stream) { uint32_t num_chunks = num_columns * num_stripes; dim3 dim_block(NTHREADS, 1); // 1024 threads per chunk dim3 dim_grid((num_rowgroups > 0) ? num_columns : num_chunks, (num_rowgroups > 0) ? 
num_rowgroups : 1); - gpuDecodeOrcColumnData<<>>(chunks, - global_dictionary, - tz_table, - row_groups, - max_num_rows, - first_row, - num_columns, - num_rowgroups, - rowidx_stride); + gpuDecodeOrcColumnData<<>>(chunks, + global_dictionary, + tz_table, + row_groups, + max_num_rows, + first_row, + num_columns, + num_rowgroups, + rowidx_stride); } } // namespace gpu diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index ea88e3ea645..51f7fdaeed3 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -13,11 +13,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include -#include + #include "orc_common.h" #include "orc_gpu.h" +#include + +#include + +#include + // Apache ORC reader does not handle zero-length patch lists for RLEv2 mode2 // Workaround replaces zero-length patch lists by a dummy zero patch #define ZERO_PLL_WAR 1 @@ -1247,12 +1252,12 @@ __global__ void __launch_bounds__(1024) gpuCompactCompressedBlocks(StripeStream void EncodeOrcColumnData(EncChunk *chunks, uint32_t num_columns, uint32_t num_rowgroups, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_block(512, 1); // 512 threads per chunk dim3 dim_grid(num_columns, num_rowgroups); gpuEncodeOrcColumnData<512> - <<>>(chunks, num_columns, num_rowgroups); + <<>>(chunks, num_columns, num_rowgroups); } /** @@ -1270,12 +1275,12 @@ void EncodeStripeDictionaries(StripeDictionary *stripes, uint32_t num_string_columns, uint32_t num_columns, uint32_t num_stripes, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_block(512, 1); // 512 threads per dictionary dim3 dim_grid(num_string_columns * num_stripes, 2); gpuEncodeStringDictionaries<512> - <<>>(stripes, chunks, num_columns); + <<>>(stripes, chunks, num_columns); } /** @@ -1291,11 +1296,12 @@ void CompactOrcDataStreams(StripeStream *strm_desc, EncChunk *chunks, uint32_t num_stripe_streams, uint32_t num_columns, - 
cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_block(1024, 1); dim3 dim_grid(num_stripe_streams, 1); - gpuCompactOrcDataStreams<<>>(strm_desc, chunks, num_columns); + gpuCompactOrcDataStreams<<>>( + strm_desc, chunks, num_columns); } /** @@ -1321,15 +1327,15 @@ void CompressOrcDataStreams(uint8_t *compressed_data, uint32_t num_compressed_blocks, CompressionKind compression, uint32_t comp_blk_size, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_block_init(256, 1); dim3 dim_grid(num_stripe_streams, 1); - gpuInitCompressionBlocks<<>>( + gpuInitCompressionBlocks<<>>( strm_desc, chunks, comp_in, comp_out, compressed_data, comp_blk_size); if (compression == SNAPPY) { gpu_snap(comp_in, comp_out, num_compressed_blocks, stream); } dim3 dim_block_compact(1024, 1); - gpuCompactCompressedBlocks<<>>( + gpuCompactCompressedBlocks<<>>( strm_desc, comp_in, comp_out, compressed_data, comp_blk_size); } diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 77a1a122e4f..9ccd7a9cfc8 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -13,10 +13,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include + #include "orc_common.h" #include "orc_gpu.h" +#include + +#include + namespace cudf { namespace io { namespace orc { @@ -459,21 +463,22 @@ void __host__ ParseCompressedStripeData(CompressedStreamInfo *strm_info, int32_t num_streams, uint32_t compression_block_size, uint32_t log2maxcr, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_block(128, 1); dim3 dim_grid((num_streams + 3) >> 2, 1); // 1 stream per warp, 4 warps per block - gpuParseCompressedStripeData<<>>( + gpuParseCompressedStripeData<<>>( strm_info, num_streams, compression_block_size, log2maxcr); } void __host__ PostDecompressionReassemble(CompressedStreamInfo *strm_info, int32_t num_streams, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_block(128, 1); dim3 dim_grid((num_streams + 3) >> 2, 1); // 1 stream per warp, 4 warps per block - gpuPostDecompressionReassemble<<>>(strm_info, num_streams); + gpuPostDecompressionReassemble<<>>(strm_info, + num_streams); } /** @@ -494,11 +499,11 @@ void __host__ ParseRowGroupIndex(RowGroup *row_groups, uint32_t num_stripes, uint32_t num_rowgroups, uint32_t rowidx_stride, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_block(128, 1); dim3 dim_grid(num_columns, num_stripes); // 1 column chunk per block - gpuParseRowGroupIndex<<>>( + gpuParseRowGroupIndex<<>>( row_groups, strm_info, chunks, num_columns, num_stripes, num_rowgroups, rowidx_stride); } diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 9d73a05766a..ba3696fbefb 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -24,13 +24,14 @@ #include #include +#include +#include +#include + #include #include #include -#include -#include - namespace cudf { namespace io { namespace detail { @@ -141,7 +142,7 @@ class orc_column_view { size_t str_id, column_view const &col, const table_metadata *metadata, - cudaStream_t stream) + rmm::cuda_stream_view stream) : _id(id), _str_id(str_id), 
_string_type(col.type().id() == type_id::STRING), @@ -156,14 +157,16 @@ class orc_column_view { if (_string_type && _data_count > 0) { strings_column_view view{col}; _indexes = rmm::device_buffer(_data_count * sizeof(gpu::nvstrdesc_s), stream); - stringdata_to_nvstrdesc<<<((_data_count - 1) >> 8) + 1, 256, 0, stream>>>( + + stringdata_to_nvstrdesc<<<((_data_count - 1) >> 8) + 1, 256, 0, stream.value()>>>( static_cast(_indexes.data()), view.offsets().data() + view.offset(), view.chars().data(), _nulls, _data_count); _data = _indexes.data(); - CUDA_TRY(cudaStreamSynchronize(stream)); + + stream.synchronize(); } // Generating default name if name isn't present in metadata if (metadata && _id < metadata->column_names.size()) { @@ -254,7 +257,7 @@ void writer::impl::init_dictionaries(orc_column_view *columns, uint32_t *dict_data, uint32_t *dict_index, hostdevice_vector &dict, - cudaStream_t stream) + rmm::cuda_stream_view stream) { const size_t num_rowgroups = dict.size() / str_col_ids.size(); @@ -280,12 +283,18 @@ void writer::impl::init_dictionaries(orc_column_view *columns, } } - CUDA_TRY(cudaMemcpyAsync( - dict.device_ptr(), dict.host_ptr(), dict.memory_size(), cudaMemcpyHostToDevice, stream)); + CUDA_TRY(cudaMemcpyAsync(dict.device_ptr(), + dict.host_ptr(), + dict.memory_size(), + cudaMemcpyHostToDevice, + stream.value())); gpu::InitDictionaryIndices(dict.device_ptr(), str_col_ids.size(), num_rowgroups, stream); - CUDA_TRY(cudaMemcpyAsync( - dict.host_ptr(), dict.device_ptr(), dict.memory_size(), cudaMemcpyDeviceToHost, stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + CUDA_TRY(cudaMemcpyAsync(dict.host_ptr(), + dict.device_ptr(), + dict.memory_size(), + cudaMemcpyDeviceToHost, + stream.value())); + stream.synchronize(); } void writer::impl::build_dictionaries(orc_column_view *columns, @@ -295,7 +304,7 @@ void writer::impl::build_dictionaries(orc_column_view *columns, hostdevice_vector const &dict, uint32_t *dict_index, hostdevice_vector &stripe_dict, - 
cudaStream_t stream) + rmm::cuda_stream_view stream) { const auto num_rowgroups = dict.size() / str_col_ids.size(); @@ -337,7 +346,7 @@ void writer::impl::build_dictionaries(orc_column_view *columns, stripe_dict.host_ptr(), stripe_dict.memory_size(), cudaMemcpyHostToDevice, - stream)); + stream.value())); gpu::BuildStripeDictionaries(stripe_dict.device_ptr(), stripe_dict.host_ptr(), dict.device_ptr(), @@ -349,8 +358,8 @@ void writer::impl::build_dictionaries(orc_column_view *columns, stripe_dict.device_ptr(), stripe_dict.memory_size(), cudaMemcpyDeviceToHost, - stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.value())); + stream.synchronize(); } std::vector writer::impl::gather_streams(orc_column_view *columns, @@ -522,7 +531,7 @@ rmm::device_buffer writer::impl::encode_columns(orc_column_view *columns, std::vector const &streams, std::vector const &strm_ids, hostdevice_vector &chunks, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // Allocate combined buffer for RLE data and string data output std::vector strm_offsets(streams.size()); @@ -627,8 +636,11 @@ rmm::device_buffer writer::impl::encode_columns(orc_column_view *columns, } } - CUDA_TRY(cudaMemcpyAsync( - chunks.device_ptr(), chunks.host_ptr(), chunks.memory_size(), cudaMemcpyHostToDevice, stream)); + CUDA_TRY(cudaMemcpyAsync(chunks.device_ptr(), + chunks.host_ptr(), + chunks.memory_size(), + cudaMemcpyHostToDevice, + stream.value())); if (!str_col_ids.empty()) { auto d_stripe_dict = columns[str_col_ids[0]].device_stripe_dict(); gpu::EncodeStripeDictionaries(d_stripe_dict, @@ -639,7 +651,7 @@ rmm::device_buffer writer::impl::encode_columns(orc_column_view *columns, stream); } gpu::EncodeOrcColumnData(chunks.device_ptr(), num_columns, num_rowgroups, stream); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); return output; } @@ -652,7 +664,7 @@ std::vector writer::impl::gather_stripes( std::vector const &stripe_list, hostdevice_vector &chunks, hostdevice_vector 
&strm_desc, - cudaStream_t stream) + rmm::cuda_stream_view stream) { std::vector stripes(stripe_list.size()); size_t group = 0; @@ -687,17 +699,20 @@ std::vector writer::impl::gather_stripes( strm_desc.host_ptr(), strm_desc.memory_size(), cudaMemcpyHostToDevice, - stream)); + stream.value())); gpu::CompactOrcDataStreams( strm_desc.device_ptr(), chunks.device_ptr(), strm_desc.size(), num_columns, stream); CUDA_TRY(cudaMemcpyAsync(strm_desc.host_ptr(), strm_desc.device_ptr(), strm_desc.memory_size(), cudaMemcpyDeviceToHost, - stream)); - CUDA_TRY(cudaMemcpyAsync( - chunks.host_ptr(), chunks.device_ptr(), chunks.memory_size(), cudaMemcpyDeviceToHost, stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.value())); + CUDA_TRY(cudaMemcpyAsync(chunks.host_ptr(), + chunks.device_ptr(), + chunks.memory_size(), + cudaMemcpyDeviceToHost, + stream.value())); + stream.synchronize(); return stripes; } @@ -710,7 +725,7 @@ std::vector> writer::impl::gather_statistic_blobs( std::vector const &stripe_list, std::vector const &stripes, hostdevice_vector &chunks, - cudaStream_t stream) + rmm::cuda_stream_view stream) { size_t num_stat_blobs = (1 + stripe_list.size()) * num_columns; size_t num_chunks = chunks.size(); @@ -767,12 +782,12 @@ std::vector> writer::impl::gather_statistic_blobs( stat_desc.host_ptr(), stat_desc.memory_size(), cudaMemcpyHostToDevice, - stream)); + stream.value())); CUDA_TRY(cudaMemcpyAsync(stat_merge.device_ptr(), stat_merge.host_ptr(), stat_merge.memory_size(), cudaMemcpyHostToDevice, - stream)); + stream.value())); gpu::orc_init_statistics_groups(stat_groups.data().get(), stat_desc.device_ptr(), num_columns, @@ -798,8 +813,8 @@ std::vector> writer::impl::gather_statistic_blobs( stat_merge.device_ptr(), stat_merge.memory_size(), cudaMemcpyDeviceToHost, - stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.value())); + stream.synchronize(); hostdevice_vector blobs(stat_merge[num_stat_blobs - 1].start_chunk + stat_merge[num_stat_blobs - 
1].num_chunks); @@ -812,10 +827,13 @@ std::vector> writer::impl::gather_statistic_blobs( stat_merge.device_ptr(), stat_merge.memory_size(), cudaMemcpyDeviceToHost, - stream)); - CUDA_TRY(cudaMemcpyAsync( - blobs.host_ptr(), blobs.device_ptr(), blobs.memory_size(), cudaMemcpyDeviceToHost, stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.value())); + CUDA_TRY(cudaMemcpyAsync(blobs.host_ptr(), + blobs.device_ptr(), + blobs.memory_size(), + cudaMemcpyDeviceToHost, + stream.value())); + stream.synchronize(); for (size_t i = 0; i < num_stat_blobs; i++) { const uint8_t *stat_begin = blobs.host_ptr(stat_merge[i].start_chunk); @@ -919,15 +937,16 @@ void writer::impl::write_data_stream(gpu::StripeStream const &strm_desc, uint8_t *stream_out, StripeInformation &stripe, std::vector &streams, - cudaStream_t stream) + rmm::cuda_stream_view stream) { const auto length = strm_desc.stream_size; streams[chunk.strm_id[strm_desc.stream_type]].length = length; if (length != 0) { const auto *stream_in = (compression_kind_ == NONE) ? 
chunk.streams[strm_desc.stream_type] : (compressed_data + strm_desc.bfr_offset); - CUDA_TRY(cudaMemcpyAsync(stream_out, stream_in, length, cudaMemcpyDeviceToHost, stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + CUDA_TRY( + cudaMemcpyAsync(stream_out, stream_in, length, cudaMemcpyDeviceToHost, stream.value())); + stream.synchronize(); out_sink_->host_write(stream_out, length); } @@ -966,7 +985,7 @@ writer::impl::impl(std::unique_ptr sink, void writer::impl::write(table_view const &table, const table_metadata *metadata, - cudaStream_t stream) + rmm::cuda_stream_view stream) { orc_chunked_state state; state.user_metadata = metadata; @@ -1156,7 +1175,7 @@ void writer::impl::write_chunk(table_view const &table, orc_chunked_state &state strm_desc.host_ptr(), strm_desc.memory_size(), cudaMemcpyHostToDevice, - state.stream)); + state.stream.value())); gpu::CompressOrcDataStreams(static_cast(compressed_data.data()), strm_desc.device_ptr(), chunks.device_ptr(), @@ -1171,13 +1190,13 @@ void writer::impl::write_chunk(table_view const &table, orc_chunked_state &state strm_desc.device_ptr(), strm_desc.memory_size(), cudaMemcpyDeviceToHost, - state.stream)); + state.stream.value())); CUDA_TRY(cudaMemcpyAsync(comp_out.host_ptr(), comp_out.device_ptr(), comp_out.memory_size(), cudaMemcpyDeviceToHost, - state.stream)); - CUDA_TRY(cudaStreamSynchronize(state.stream)); + state.stream.value())); + state.stream.synchronize(); } ProtobufWriter pbw_(&buffer_); @@ -1362,7 +1381,9 @@ writer::writer(std::unique_ptr sink, writer::~writer() = default; // Forward to implementation -void writer::write(table_view const &table, const table_metadata *metadata, cudaStream_t stream) +void writer::write(table_view const &table, + const table_metadata *metadata, + rmm::cuda_stream_view stream) { _impl->write(table, metadata, stream); } diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index 6a96a8d4d7d..a7b1fef87ba 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ 
b/cpp/src/io/orc/writer_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #pragma once +#include "chunked_state.hpp" #include "orc.h" #include "orc_gpu.h" @@ -28,12 +29,12 @@ #include #include +#include + #include #include #include -#include "chunked_state.hpp" - namespace cudf { namespace io { namespace detail { @@ -76,7 +77,7 @@ class writer::impl { * @param metadata The metadata associated with the table * @param stream CUDA stream used for device memory operations and kernel launches. **/ - void write(table_view const& table, const table_metadata* metadata, cudaStream_t stream); + void write(table_view const& table, const table_metadata* metadata, rmm::cuda_stream_view stream); /** * @brief Begins the chunked/streamed write process. @@ -121,7 +122,7 @@ class writer::impl { uint32_t* dict_data, uint32_t* dict_index, hostdevice_vector& dict, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Builds up per-stripe dictionaries for string columns @@ -142,7 +143,7 @@ class writer::impl { hostdevice_vector const& dict, uint32_t* dict_index, hostdevice_vector& stripe_dict, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Returns stream information for each column @@ -187,7 +188,7 @@ class writer::impl { std::vector const& streams, std::vector const& strm_ids, hostdevice_vector& chunks, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Returns stripe information after compacting columns' individual data @@ -211,7 +212,7 @@ class writer::impl { std::vector const& stripe_list, hostdevice_vector& chunks, hostdevice_vector& strm_desc, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Returns per-stripe and per-file column statistics encoded @@ -236,7 +237,7 @@ class 
writer::impl { std::vector const& stripe_list, std::vector const& stripes, hostdevice_vector& chunks, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Write the specified column's row index stream @@ -285,7 +286,7 @@ class writer::impl { uint8_t* stream_out, StripeInformation& stripe, std::vector& streams, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Insert 3-byte uncompressed block headers in a byte vector diff --git a/cpp/src/io/parquet/chunked_state.hpp b/cpp/src/io/parquet/chunked_state.hpp index a6ea7f23385..5bbc5366f70 100644 --- a/cpp/src/io/parquet/chunked_state.hpp +++ b/cpp/src/io/parquet/chunked_state.hpp @@ -21,9 +21,12 @@ #pragma once -#include #include +#include + +#include + namespace cudf { namespace io { @@ -37,7 +40,7 @@ struct pq_chunked_state { /// The writer to be used std::unique_ptr wp; /// Cuda stream to be used - cudaStream_t stream; + rmm::cuda_stream_view stream; /// Overall file metadata. Filled in during the process and written during write_chunked_end() cudf::io::parquet::FileMetaData md; /// current write position for rowgroups/chunks @@ -56,13 +59,13 @@ struct pq_chunked_state { pq_chunked_state() = default; pq_chunked_state(table_metadata const* metadata, - SingleWriteMode mode = SingleWriteMode::NO, - bool write_int96_timestamps = false, - cudaStream_t str = 0) - : user_metadata{metadata}, + SingleWriteMode mode = SingleWriteMode::NO, + bool write_int96_timestamps = false, + rmm::cuda_stream_view stream = rmm::cuda_stream_default) + : stream{stream}, + user_metadata{metadata}, single_write_mode{mode == SingleWriteMode::YES}, - int96_timestamps(write_int96_timestamps), - stream{str} + int96_timestamps(write_int96_timestamps) { } }; diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 693c82cb4cd..3e4584a9731 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -14,17 +14,20 @@ * limitations under the License. 
*/ +#include +#include +#include + +#include +#include + #include +#include + #include #include #include #include -#include -#include -#include -#include - -#include #define LOG2_NTHREADS (5 + 2) #define NTHREADS (1 << LOG2_NTHREADS) @@ -1711,16 +1714,16 @@ struct chunk_row_output_iter { using reference = size_type &; using iterator_category = thrust::output_device_iterator_tag; - chunk_row_output_iter operator+ __host__ __device__(int i) + __host__ __device__ chunk_row_output_iter operator+(int i) { return chunk_row_output_iter{p + i}; } - void operator++ __host__ __device__() { p++; } + __host__ __device__ void operator++() { p++; } - reference operator[] __device__(int i) { return p[i].chunk_row; } - reference operator*__device__() { return p->chunk_row; } - void operator= __device__(value_type v) { p->chunk_row = v; } + __device__ reference operator[](int i) { return p[i].chunk_row; } + __device__ reference operator*() { return p->chunk_row; } + __device__ void operator=(value_type v) { p->chunk_row = v; } }; struct start_offset_output_iterator { @@ -1736,19 +1739,19 @@ struct start_offset_output_iterator { using reference = size_type &; using iterator_category = thrust::output_device_iterator_tag; - start_offset_output_iterator operator+ __host__ __device__(int i) + __host__ __device__ start_offset_output_iterator operator+(int i) { return start_offset_output_iterator{ pages, page_indices, cur_index + i, src_col_schema, nesting_depth}; } - void operator++ __host__ __device__() { cur_index++; } + __host__ __device__ void operator++() { cur_index++; } - reference operator[] __device__(int i) { return dereference(cur_index + i); } - reference operator*__device__() { return dereference(cur_index); } + __device__ reference operator[](int i) { return dereference(cur_index + i); } + __device__ reference operator*() { return dereference(cur_index); } private: - reference __device__ dereference(int index) + __device__ reference dereference(int index) { PageInfo const 
&p = pages[page_indices[index]]; if (p.src_col_schema != src_col_schema || p.flags & PAGEINFO_FLAGS_DICTIONARY) { return empty; } @@ -1765,7 +1768,7 @@ void PreprocessColumnData(hostdevice_vector &pages, std::vector &output_columns, size_t num_rows, size_t min_row, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { dim3 dim_block(NTHREADS, 1); @@ -1774,9 +1777,9 @@ void PreprocessColumnData(hostdevice_vector &pages, // computes: // PageNestingInfo::size for each level of nesting, for each page. // The output from this does not take row bounds (num_rows, min_row) into account - gpuComputePageSizes<<>>( + gpuComputePageSizes<<>>( pages.device_ptr(), chunks.device_ptr(), min_row, num_rows, chunks.size(), false); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); // computes: // PageInfo::chunk_row for all pages @@ -1784,7 +1787,7 @@ void PreprocessColumnData(hostdevice_vector &pages, pages.device_ptr(), [] __device__(PageInfo const &page) { return page.chunk_idx; }); auto page_input = thrust::make_transform_iterator( pages.device_ptr(), [] __device__(PageInfo const &page) { return page.num_rows; }); - thrust::exclusive_scan_by_key(rmm::exec_policy(stream)->on(stream), + thrust::exclusive_scan_by_key(rmm::exec_policy(stream)->on(stream.value()), key_input, key_input + pages.size(), page_input, @@ -1793,7 +1796,7 @@ void PreprocessColumnData(hostdevice_vector &pages, // computes: // PageNestingInfo::size for each level of nesting, for each page, taking row bounds into account. // PageInfo::skipped_values, which tells us where to start decoding in the input - gpuComputePageSizes<<>>( + gpuComputePageSizes<<>>( pages.device_ptr(), chunks.device_ptr(), min_row, num_rows, chunks.size(), true); // retrieve pages back (PageInfo::num_rows has been set. 
if we don't bring it @@ -1819,14 +1822,15 @@ void PreprocessColumnData(hostdevice_vector &pages, rmm::device_uvector page_keys(pages.size(), stream); rmm::device_uvector page_index(pages.size(), stream); { - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), pages.device_ptr(), pages.device_ptr() + pages.size(), page_keys.begin(), [] __device__(PageInfo const &page) { return page.src_col_schema; }); - thrust::sequence(rmm::exec_policy(stream)->on(stream), page_index.begin(), page_index.end()); - thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream), + thrust::sequence( + rmm::exec_policy(stream)->on(stream.value()), page_index.begin(), page_index.end()); + thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream.value()), page_keys.begin(), page_keys.end(), page_index.begin(), @@ -1860,7 +1864,7 @@ void PreprocessColumnData(hostdevice_vector &pages, // columns. so don't compute any given level more than once. 
if (out_buf.size == 0) { int size = thrust::reduce( - rmm::exec_policy(stream)->on(stream), size_input, size_input + pages.size()); + rmm::exec_policy(stream)->on(stream.value()), size_input, size_input + pages.size()); // if this is a list column add 1 for non-leaf levels for the terminating offset if (out_buf.type.id() == type_id::LIST && l_idx < max_depth) { size++; } @@ -1870,7 +1874,7 @@ void PreprocessColumnData(hostdevice_vector &pages, } // compute per-page start offset - thrust::exclusive_scan_by_key(rmm::exec_policy(stream)->on(stream), + thrust::exclusive_scan_by_key(rmm::exec_policy(stream)->on(stream.value()), page_keys.begin(), page_keys.end(), size_input, @@ -1890,12 +1894,12 @@ void __host__ DecodePageData(hostdevice_vector &pages, hostdevice_vector const &chunks, size_t num_rows, size_t min_row, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_block(NTHREADS, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page - gpuDecodePageData<<>>( + gpuDecodePageData<<>>( pages.device_ptr(), chunks.device_ptr(), min_row, num_rows, chunks.size()); } diff --git a/cpp/src/io/parquet/page_dict.cu b/cpp/src/io/parquet/page_dict.cu index b4e87d97857..fba2b3ccfd5 100644 --- a/cpp/src/io/parquet/page_dict.cu +++ b/cpp/src/io/parquet/page_dict.cu @@ -13,11 +13,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include -#include + #include #include +#include + +#include + +#include + namespace cudf { namespace io { namespace parquet { @@ -331,11 +336,11 @@ void BuildChunkDictionaries(EncColumnChunk *chunks, uint32_t *dev_scratch, size_t scratch_size, uint32_t num_chunks, - cudaStream_t stream) + rmm::cuda_stream_view stream) { if (num_chunks > 0 && scratch_size > 0) { // zero scratch size implies no dictionaries - CUDA_TRY(cudaMemsetAsync(dev_scratch, 0, scratch_size, stream)); - gpuBuildChunkDictionaries<1024><<>>(chunks, dev_scratch); + CUDA_TRY(cudaMemsetAsync(dev_scratch, 0, scratch_size, stream.value())); + gpuBuildChunkDictionaries<1024><<>>(chunks, dev_scratch); } } diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 3d87901f269..3542c25bfb2 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -13,13 +13,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include + #include #include #include #include +#include + +#include + #include #include @@ -1667,7 +1671,7 @@ __global__ void __launch_bounds__(1024) gpuGatherPages(EncColumnChunk *chunks, c * * Similarly we merge up all the way till level 0 offsets */ -dremel_data get_dremel_data(column_view h_col, cudaStream_t stream) +dremel_data get_dremel_data(column_view h_col, rmm::cuda_stream_view stream) { CUDF_EXPECTS(h_col.type().id() == type_id::LIST, "Can only get rep/def levels for LIST type column"); @@ -1679,12 +1683,12 @@ dremel_data get_dremel_data(column_view h_col, cudaStream_t stream) auto d_off = lcv.offsets().data(); auto empties_idx_end = - thrust::copy_if(rmm::exec_policy(stream)->on(stream), + thrust::copy_if(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(start), thrust::make_counting_iterator(end), empties_idx.begin(), [d_off] __device__(auto i) { return d_off[i] == d_off[i + 1]; }); - auto empties_end = 
thrust::gather(rmm::exec_policy(stream)->on(stream), + auto empties_end = thrust::gather(rmm::exec_policy(stream)->on(stream.value()), empties_idx.begin(), empties_idx_end, lcv.offsets().begin(), @@ -1794,7 +1798,7 @@ dremel_data get_dremel_data(column_view h_col, cudaStream_t stream) auto output_zip_it = thrust::make_zip_iterator(thrust::make_tuple(rep_level.begin(), def_level.begin())); - auto ends = thrust::merge_by_key(rmm::exec_policy(stream)->on(stream), + auto ends = thrust::merge_by_key(rmm::exec_policy(stream)->on(stream.value()), empties.begin(), empties.begin() + empties_size, thrust::make_counting_iterator(column_offsets[level + 1]), @@ -1812,14 +1816,14 @@ dremel_data get_dremel_data(column_view h_col, cudaStream_t stream) [off = lcv.offsets().data()] __device__( auto i) -> int { return off[i] == off[i + 1]; }); rmm::device_uvector scan_out(offset_size_at_level, stream); - thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream.value()), scan_it, scan_it + offset_size_at_level, scan_out.begin()); // Add scan output to existing offsets to get new offsets into merged rep level values new_offsets = rmm::device_uvector(offset_size_at_level, stream); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), offset_size_at_level, [off = lcv.offsets().data() + column_offsets[level], @@ -1830,7 +1834,7 @@ dremel_data get_dremel_data(column_view h_col, cudaStream_t stream) // Set rep level values at level starts to appropriate rep level auto scatter_it = thrust::make_constant_iterator(level); - thrust::scatter(rmm::exec_policy(stream)->on(stream), + thrust::scatter(rmm::exec_policy(stream)->on(stream.value()), scatter_it, scatter_it + new_offsets.size() - 1, new_offsets.begin(), @@ -1881,7 +1885,7 @@ dremel_data get_dremel_data(column_view h_col, cudaStream_t stream) auto output_zip_it = 
thrust::make_zip_iterator(thrust::make_tuple(rep_level.begin(), def_level.begin())); - auto ends = thrust::merge_by_key(rmm::exec_policy(stream)->on(stream), + auto ends = thrust::merge_by_key(rmm::exec_policy(stream)->on(stream.value()), transformed_empties, transformed_empties + empties_size, thrust::make_counting_iterator(0), @@ -1900,14 +1904,14 @@ dremel_data get_dremel_data(column_view h_col, cudaStream_t stream) [off = lcv.offsets().data()] __device__( auto i) -> int { return off[i] == off[i + 1]; }); rmm::device_uvector scan_out(offset_size_at_level, stream); - thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream.value()), scan_it, scan_it + offset_size_at_level, scan_out.begin()); // Add scan output to existing offsets to get new offsets into merged rep level values rmm::device_uvector temp_new_offsets(offset_size_at_level, stream); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), offset_size_at_level, [off = lcv.offsets().data() + column_offsets[level], @@ -1920,7 +1924,7 @@ dremel_data get_dremel_data(column_view h_col, cudaStream_t stream) // Set rep level values at level starts to appropriate rep level auto scatter_it = thrust::make_constant_iterator(level); - thrust::scatter(rmm::exec_policy(stream)->on(stream), + thrust::scatter(rmm::exec_policy(stream)->on(stream.value()), scatter_it, scatter_it + new_offsets.size() - 1, new_offsets.begin(), @@ -1931,7 +1935,7 @@ dremel_data get_dremel_data(column_view h_col, cudaStream_t stream) rep_level.resize(level_vals_size, stream); def_level.resize(level_vals_size, stream); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); size_type leaf_col_offset = column_offsets[column_offsets.size() - 1]; size_type leaf_data_size = column_ends[column_ends.size() - 1] - leaf_col_offset; @@ -1958,10 +1962,10 @@ void 
InitPageFragments(PageFragment *frag, int32_t num_columns, uint32_t fragment_size, uint32_t num_rows, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_grid(num_columns, num_fragments); // 1 threadblock per fragment - gpuInitPageFragments<512><<>>( + gpuInitPageFragments<512><<>>( frag, col_desc, num_fragments, num_columns, fragment_size, num_rows); } @@ -1982,10 +1986,10 @@ void InitFragmentStatistics(statistics_group *groups, int32_t num_fragments, int32_t num_columns, uint32_t fragment_size, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_grid(num_columns, (num_fragments + 3) >> 2); // 1 warp per fragment - gpuInitFragmentStats<<>>( + gpuInitFragmentStats<<>>( groups, fragments, col_desc, num_fragments, num_columns, fragment_size); } @@ -2008,10 +2012,10 @@ void InitEncoderPages(EncColumnChunk *chunks, int32_t num_columns, statistics_merge_group *page_grstats, statistics_merge_group *chunk_grstats, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_grid(num_columns, num_rowgroups); // 1 threadblock per rowgroup - gpuInitPages<<>>( + gpuInitPages<<>>( chunks, pages, col_desc, page_grstats, chunk_grstats, num_rowgroups, num_columns); } @@ -2032,11 +2036,12 @@ void EncodePages(EncPage *pages, uint32_t start_page, gpu_inflate_input_s *comp_in, gpu_inflate_status_s *comp_out, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // A page is part of one column. This is launching 1 block per page. 1 block will exclusively // deal with one datatype. 
- gpuEncodePages<<>>(pages, chunks, comp_in, comp_out, start_page); + gpuEncodePages<<>>( + pages, chunks, comp_in, comp_out, start_page); } /** @@ -2054,9 +2059,9 @@ void DecideCompression(EncColumnChunk *chunks, uint32_t num_chunks, uint32_t start_page, const gpu_inflate_status_s *comp_out, - cudaStream_t stream) + rmm::cuda_stream_view stream) { - gpuDecideCompression<<>>(chunks, pages, comp_out, start_page); + gpuDecideCompression<<>>(chunks, pages, comp_out, start_page); } /** @@ -2078,9 +2083,9 @@ void EncodePageHeaders(EncPage *pages, const gpu_inflate_status_s *comp_out, const statistics_chunk *page_stats, const statistics_chunk *chunk_stats, - cudaStream_t stream) + rmm::cuda_stream_view stream) { - gpuEncodePageHeaders<<>>( + gpuEncodePageHeaders<<>>( pages, chunks, comp_out, page_stats, chunk_stats, start_page); } @@ -2095,9 +2100,9 @@ void EncodePageHeaders(EncPage *pages, void GatherPages(EncColumnChunk *chunks, const EncPage *pages, uint32_t num_chunks, - cudaStream_t stream) + rmm::cuda_stream_view stream) { - gpuGatherPages<<>>(chunks, pages); + gpuGatherPages<<>>(chunks, pages); } } // namespace gpu diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index d150eb72bba..ef496e71d96 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -18,6 +18,8 @@ #include #include +#include + namespace cudf { namespace io { namespace parquet { @@ -468,20 +470,22 @@ extern "C" __global__ void __launch_bounds__(128) } } -void __host__ DecodePageHeaders(ColumnChunkDesc *chunks, int32_t num_chunks, cudaStream_t stream) +void __host__ DecodePageHeaders(ColumnChunkDesc *chunks, + int32_t num_chunks, + rmm::cuda_stream_view stream) { dim3 dim_block(128, 1); dim3 dim_grid((num_chunks + 3) >> 2, 1); // 1 chunk per warp, 4 warps per block - gpuDecodePageHeaders<<>>(chunks, num_chunks); + gpuDecodePageHeaders<<>>(chunks, num_chunks); } void __host__ BuildStringDictionaryIndex(ColumnChunkDesc *chunks, int32_t num_chunks, 
- cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_block(128, 1); dim3 dim_grid((num_chunks + 3) >> 2, 1); // 1 chunk per warp, 4 warps per block - gpuBuildStringDictionaryIndex<<>>(chunks, num_chunks); + gpuBuildStringDictionaryIndex<<>>(chunks, num_chunks); } } // namespace gpu diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 97420d5d7f1..9f657d58804 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -16,23 +16,24 @@ #pragma once -#include #include #include -#include #include #include #include #include #include - #include -#include +#include #include #include +#include + +#include + namespace cudf { namespace io { namespace parquet { @@ -171,8 +172,8 @@ struct ColumnChunkDesc { max_num_pages(0), page_info(nullptr), str_dict_index(nullptr), - valid_map_base({nullptr}), - column_data_base({nullptr}), + valid_map_base{nullptr}, + column_data_base{nullptr}, codec(codec_), converted_type(converted_type_), decimal_scale(decimal_scale_), @@ -327,9 +328,7 @@ struct EncColumnChunk { * @param[in] num_chunks Number of column chunks * @param[in] stream CUDA stream to use, default 0 */ -void DecodePageHeaders(ColumnChunkDesc *chunks, - int32_t num_chunks, - cudaStream_t stream = (cudaStream_t)0); +void DecodePageHeaders(ColumnChunkDesc *chunks, int32_t num_chunks, rmm::cuda_stream_view stream); /** * @brief Launches kernel for building the dictionary index for the column @@ -341,7 +340,7 @@ void DecodePageHeaders(ColumnChunkDesc *chunks, */ void BuildStringDictionaryIndex(ColumnChunkDesc *chunks, int32_t num_chunks, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream); /** * @brief Preprocess column information for nested schemas. 
@@ -368,7 +367,7 @@ void PreprocessColumnData(hostdevice_vector &pages, std::vector &output_columns, size_t num_rows, size_t min_row, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr); /** @@ -387,7 +386,7 @@ void DecodePageData(hostdevice_vector &pages, hostdevice_vector const &chunks, size_t num_rows, size_t min_row, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream); /** * @brief Dremel data that describes one nested type column @@ -420,7 +419,7 @@ struct dremel_data { * * @return A struct containing dremel data */ -dremel_data get_dremel_data(column_view h_col, cudaStream_t stream = (cudaStream_t)0); +dremel_data get_dremel_data(column_view h_col, rmm::cuda_stream_view stream); /** * @brief Launches kernel for initializing encoder page fragments @@ -439,7 +438,7 @@ void InitPageFragments(PageFragment *frag, int32_t num_columns, uint32_t fragment_size, uint32_t num_rows, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream); /** * @brief Launches kernel for initializing fragment statistics groups @@ -458,7 +457,7 @@ void InitFragmentStatistics(statistics_group *groups, int32_t num_fragments, int32_t num_columns, uint32_t fragment_size, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream); /** * @brief Launches kernel for initializing encoder data pages @@ -479,7 +478,7 @@ void InitEncoderPages(EncColumnChunk *chunks, int32_t num_columns, statistics_merge_group *page_grstats = nullptr, statistics_merge_group *chunk_grstats = nullptr, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernel for packing column data into parquet pages @@ -498,7 +497,7 @@ void EncodePages(EncPage *pages, uint32_t start_page = 0, gpu_inflate_input_s *comp_in = nullptr, gpu_inflate_status_s *comp_out = nullptr, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = 
rmm::cuda_stream_default); /** * @brief Launches kernel to make the compressed vs uncompressed chunk-level decision @@ -515,7 +514,7 @@ void DecideCompression(EncColumnChunk *chunks, uint32_t num_chunks, uint32_t start_page, const gpu_inflate_status_s *comp_out = nullptr, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernel to encode page headers @@ -536,7 +535,7 @@ void EncodePageHeaders(EncPage *pages, const gpu_inflate_status_s *comp_out = nullptr, const statistics_chunk *page_stats = nullptr, const statistics_chunk *chunk_stats = nullptr, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernel to gather pages to a single contiguous block per chunk @@ -550,7 +549,7 @@ void EncodePageHeaders(EncPage *pages, void GatherPages(EncColumnChunk *chunks, const EncPage *pages, uint32_t num_chunks, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream); /** * @brief Launches kernel for building chunk dictionaries @@ -565,7 +564,7 @@ void BuildChunkDictionaries(EncColumnChunk *chunks, uint32_t *dev_scratch, size_t scratch_size, uint32_t num_chunks, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream); } // namespace gpu } // namespace parquet diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index e615b4782ea..85c9a3c2919 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -758,7 +759,7 @@ void reader::impl::read_column_chunks( size_t end_chunk, const std::vector &column_chunk_offsets, std::vector const &chunk_source_map, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // Transfer chunk data, coalescing adjacent chunks for (size_t chunk = begin_chunk; chunk < end_chunk;) { @@ -797,7 +798,7 @@ void reader::impl::read_column_chunks( * @copydoc 
cudf::io::detail::parquet::count_page_headers */ size_t reader::impl::count_page_headers(hostdevice_vector &chunks, - cudaStream_t stream) + rmm::cuda_stream_view stream) { size_t total_pages = 0; @@ -817,7 +818,7 @@ size_t reader::impl::count_page_headers(hostdevice_vector */ void reader::impl::decode_page_headers(hostdevice_vector &chunks, hostdevice_vector &pages, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // IMPORTANT : if you change how pages are stored within a chunk (dist pages, then data pages), // please update preprocess_nested_columns to reflect this. @@ -838,7 +839,7 @@ void reader::impl::decode_page_headers(hostdevice_vector & rmm::device_buffer reader::impl::decompress_page_data( hostdevice_vector &chunks, hostdevice_vector &pages, - cudaStream_t stream) + rmm::cuda_stream_view stream) { auto for_each_codec_page = [&](parquet::Compression codec, const std::function &f) { for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { @@ -902,12 +903,12 @@ rmm::device_buffer reader::impl::decompress_page_data( inflate_in.host_ptr(start_pos), sizeof(decltype(inflate_in)::value_type) * (argc - start_pos), cudaMemcpyHostToDevice, - stream)); + stream.value())); CUDA_TRY(cudaMemcpyAsync(inflate_out.device_ptr(start_pos), inflate_out.host_ptr(start_pos), sizeof(decltype(inflate_out)::value_type) * (argc - start_pos), cudaMemcpyHostToDevice, - stream)); + stream.value())); switch (codec.first) { case parquet::GZIP: CUDA_TRY(gpuinflate(inflate_in.device_ptr(start_pos), @@ -936,15 +937,18 @@ rmm::device_buffer reader::impl::decompress_page_data( inflate_out.device_ptr(start_pos), sizeof(decltype(inflate_out)::value_type) * (argc - start_pos), cudaMemcpyDeviceToHost, - stream)); + stream.value())); } } - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); // Update the page information in device memory with the updated value of // page_data; it now points to the uncompressed data buffer - CUDA_TRY(cudaMemcpyAsync( - pages.device_ptr(), 
pages.host_ptr(), pages.memory_size(), cudaMemcpyHostToDevice, stream)); + CUDA_TRY(cudaMemcpyAsync(pages.device_ptr(), + pages.host_ptr(), + pages.memory_size(), + cudaMemcpyHostToDevice, + stream.value())); return decomp_pages; } @@ -955,7 +959,7 @@ rmm::device_buffer reader::impl::decompress_page_data( void reader::impl::allocate_nesting_info(hostdevice_vector const &chunks, hostdevice_vector &pages, hostdevice_vector &page_nesting_info, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // compute total # of page_nesting infos needed and allocate space. doing this in one // buffer to keep it to a single gpu allocation @@ -1075,7 +1079,7 @@ void reader::impl::preprocess_columns(hostdevice_vector &c size_t min_row, size_t total_rows, bool has_lists, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // TODO : we should be selectively preprocessing only columns that have // lists in them instead of doing them all if even one contains lists. @@ -1096,7 +1100,7 @@ void reader::impl::preprocess_columns(hostdevice_vector &c // preprocess per-nesting level sizes by page gpu::PreprocessColumnData( pages, chunks, _input_columns, _output_columns, total_rows, min_row, stream, _mr); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); } } @@ -1108,7 +1112,7 @@ void reader::impl::decode_page_data(hostdevice_vector &chu hostdevice_vector &page_nesting, size_t min_row, size_t total_rows, - cudaStream_t stream) + rmm::cuda_stream_view stream) { auto is_dict_chunk = [](const gpu::ColumnChunkDesc &chunk) { return (chunk.data_type & 0x7) == BYTE_ARRAY && chunk.num_dict_pages > 0; @@ -1218,7 +1222,7 @@ void reader::impl::decode_page_data(hostdevice_vector &chu gpu::DecodePageData(pages, chunks, total_rows, min_row, stream); pages.device_to_host(stream); page_nesting.device_to_host(stream); - cudaStreamSynchronize(stream); + stream.synchronize(); // for list columns, add the final offset to every offset buffer. 
// TODO : make this happen in more efficiently. Maybe use thrust::for_each @@ -1248,7 +1252,7 @@ void reader::impl::decode_page_data(hostdevice_vector &chu &offset, sizeof(offset), cudaMemcpyHostToDevice, - stream); + stream.value()); out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; } } @@ -1274,13 +1278,13 @@ void reader::impl::decode_page_data(hostdevice_vector &chu } } - cudaStreamSynchronize(stream); + stream.synchronize(); } reader::impl::impl(std::vector> &&sources, parquet_reader_options const &options, rmm::mr::device_memory_resource *mr) - : _sources(std::move(sources)), _mr(mr) + : _mr(mr), _sources(std::move(sources)) { // Open and parse the source dataset metadata _metadata = std::make_unique(_sources); @@ -1304,7 +1308,7 @@ reader::impl::impl(std::vector> &&sources, table_with_metadata reader::impl::read(size_type skip_rows, size_type num_rows, std::vector> const &row_group_list, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // Select only row groups required const auto selected_row_groups = @@ -1473,7 +1477,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, for (size_t i = 0; i < _output_columns.size(); ++i) { out_metadata.schema_info.push_back(column_name_info{""}); out_columns.emplace_back( - make_column(_output_columns[i], stream, _mr, &out_metadata.schema_info.back())); + make_column(_output_columns[i], &out_metadata.schema_info.back(), stream, _mr)); } } } @@ -1517,7 +1521,8 @@ reader::reader(std::vector> &&sources, reader::~reader() = default; // Forward to implementation -table_with_metadata reader::read(parquet_reader_options const &options, cudaStream_t stream) +table_with_metadata reader::read(parquet_reader_options const &options, + rmm::cuda_stream_view stream) { return _impl->read( options.get_skip_rows(), options.get_num_rows(), options.get_row_groups(), stream); diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index c192b65f0b0..f6df8f9e460 100644 --- 
a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -31,6 +31,8 @@ #include #include +#include + #include #include #include @@ -75,7 +77,7 @@ class reader::impl { table_with_metadata read(size_type skip_rows, size_type num_rows, std::vector> const &row_group_indices, - cudaStream_t stream); + rmm::cuda_stream_view stream); private: /** @@ -95,7 +97,7 @@ class reader::impl { size_t end_chunk, const std::vector &column_chunk_offsets, std::vector const &chunk_source_map, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Returns the number of total pages from the given column chunks @@ -105,7 +107,8 @@ class reader::impl { * * @return The total number of pages */ - size_t count_page_headers(hostdevice_vector &chunks, cudaStream_t stream); + size_t count_page_headers(hostdevice_vector &chunks, + rmm::cuda_stream_view stream); /** * @brief Returns the page information from the given column chunks. @@ -116,7 +119,7 @@ class reader::impl { */ void decode_page_headers(hostdevice_vector &chunks, hostdevice_vector &pages, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Decompresses the page data, at page granularity. @@ -129,7 +132,7 @@ class reader::impl { */ rmm::device_buffer decompress_page_data(hostdevice_vector &chunks, hostdevice_vector &pages, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Allocate nesting information storage for all pages and set pointers @@ -149,7 +152,7 @@ class reader::impl { void allocate_nesting_info(hostdevice_vector const &chunks, hostdevice_vector &pages, hostdevice_vector &page_nesting_info, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Preprocess column information for nested schemas. @@ -174,7 +177,7 @@ class reader::impl { size_t min_row, size_t total_rows, bool has_lists, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Converts the page data and outputs to columns. 
@@ -191,7 +194,7 @@ class reader::impl { hostdevice_vector &page_nesting, size_t min_row, size_t total_rows, - cudaStream_t stream); + rmm::cuda_stream_view stream); private: rmm::mr::device_memory_resource *_mr = nullptr; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index f86e6a5ee67..f6f9ecb431a 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -19,24 +19,26 @@ * @brief cuDF-IO parquet writer class implementation */ -#include #include "writer_impl.hpp" +#include + #include #include #include #include -#include -#include -#include -#include - #include +#include #include #include #include +#include +#include +#include +#include + namespace cudf { namespace io { namespace detail { @@ -123,7 +125,7 @@ class parquet_column_view { column_view const &col, const table_metadata *metadata, bool int96_timestamps, - cudaStream_t stream) + rmm::cuda_stream_view stream) : _col(col), _leaf_col(get_leaf_col(col)), _id(id), @@ -286,19 +288,21 @@ class parquet_column_view { } _offsets_array = offsets_array; - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); } if (_string_type && _data_count > 0) { strings_column_view view{_leaf_col}; _indexes = rmm::device_buffer(_data_count * sizeof(gpu::nvstrdesc_s), stream); - stringdata_to_nvstrdesc<<<((_data_count - 1) >> 8) + 1, 256, 0, stream>>>( + + stringdata_to_nvstrdesc<<<((_data_count - 1) >> 8) + 1, 256, 0, stream.value()>>>( reinterpret_cast(_indexes.data()), view.offsets().data() + leaf_col_offset, view.chars().data(), _nulls, _data_count); _data = _indexes.data(); - CUDA_TRY(cudaStreamSynchronize(stream)); + + stream.synchronize(); } // Generating default name if name isn't present in metadata @@ -427,13 +431,13 @@ void writer::impl::init_page_fragments(hostdevice_vector &fra uint32_t num_fragments, uint32_t num_rows, uint32_t fragment_size, - cudaStream_t stream) + rmm::cuda_stream_view stream) { 
CUDA_TRY(cudaMemcpyAsync(col_desc.device_ptr(), col_desc.host_ptr(), col_desc.memory_size(), cudaMemcpyHostToDevice, - stream)); + stream.value())); gpu::InitPageFragments(frag.device_ptr(), col_desc.device_ptr(), num_fragments, @@ -441,9 +445,12 @@ void writer::impl::init_page_fragments(hostdevice_vector &fra fragment_size, num_rows, stream); - CUDA_TRY(cudaMemcpyAsync( - frag.host_ptr(), frag.device_ptr(), frag.memory_size(), cudaMemcpyDeviceToHost, stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + CUDA_TRY(cudaMemcpyAsync(frag.host_ptr(), + frag.device_ptr(), + frag.memory_size(), + cudaMemcpyDeviceToHost, + stream.value())); + stream.synchronize(); } void writer::impl::gather_fragment_statistics(statistics_chunk *frag_stats_chunk, @@ -452,7 +459,7 @@ void writer::impl::gather_fragment_statistics(statistics_chunk *frag_stats_chunk uint32_t num_columns, uint32_t num_fragments, uint32_t fragment_size, - cudaStream_t stream) + rmm::cuda_stream_view stream) { rmm::device_vector frag_stats_group(num_fragments * num_columns); @@ -465,7 +472,7 @@ void writer::impl::gather_fragment_statistics(statistics_chunk *frag_stats_chunk stream); GatherColumnStatistics( frag_stats_chunk, frag_stats_group.data().get(), num_fragments * num_columns, stream); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); } void writer::impl::build_chunk_dictionaries(hostdevice_vector &chunks, @@ -473,12 +480,15 @@ void writer::impl::build_chunk_dictionaries(hostdevice_vector dict_scratch(dict_scratch_size / sizeof(uint32_t)); - CUDA_TRY(cudaMemcpyAsync( - chunks.device_ptr(), chunks.host_ptr(), chunks.memory_size(), cudaMemcpyHostToDevice, stream)); + CUDA_TRY(cudaMemcpyAsync(chunks.device_ptr(), + chunks.host_ptr(), + chunks.memory_size(), + cudaMemcpyHostToDevice, + stream.value())); gpu::BuildChunkDictionaries(chunks.device_ptr(), dict_scratch.data().get(), dict_scratch_size, @@ -492,9 +502,12 @@ void writer::impl::build_chunk_dictionaries(hostdevice_vector &chunks, @@ 
-506,11 +519,14 @@ void writer::impl::init_encoder_pages(hostdevice_vector &ch uint32_t num_columns, uint32_t num_pages, uint32_t num_stats_bfr, - cudaStream_t stream) + rmm::cuda_stream_view stream) { rmm::device_vector page_stats_mrg(num_stats_bfr); - CUDA_TRY(cudaMemcpyAsync( - chunks.device_ptr(), chunks.host_ptr(), chunks.memory_size(), cudaMemcpyHostToDevice, stream)); + CUDA_TRY(cudaMemcpyAsync(chunks.device_ptr(), + chunks.host_ptr(), + chunks.memory_size(), + cudaMemcpyHostToDevice, + stream.value())); InitEncoderPages(chunks.device_ptr(), pages, col_desc.device_ptr(), @@ -529,7 +545,7 @@ void writer::impl::init_encoder_pages(hostdevice_vector &ch stream); } } - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); } void writer::impl::encode_pages(hostdevice_vector &chunks, @@ -543,7 +559,7 @@ void writer::impl::encode_pages(hostdevice_vector &chunks, gpu_inflate_status_s *comp_out, const statistics_chunk *page_stats, const statistics_chunk *chunk_stats, - cudaStream_t stream) + rmm::cuda_stream_view stream) { gpu::EncodePages( pages, chunks.device_ptr(), pages_in_batch, first_page_in_batch, comp_in, comp_out, stream); @@ -577,8 +593,8 @@ void writer::impl::encode_pages(hostdevice_vector &chunks, chunks.device_ptr() + first_rowgroup * num_columns, rowgroups_in_batch * num_columns * sizeof(gpu::EncColumnChunk), cudaMemcpyDeviceToHost, - stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.value())); + stream.synchronize(); } writer::impl::impl(std::unique_ptr sink, @@ -598,7 +614,7 @@ std::unique_ptr> writer::impl::write( bool return_filemetadata, const std::string &column_chunks_file_path, bool int96_timestamps, - cudaStream_t stream) + rmm::cuda_stream_view stream) { pq_chunked_state state{metadata, SingleWriteMode::YES, int96_timestamps, stream}; @@ -1091,8 +1107,8 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) dev_bfr, ck->ck_stat_size, cudaMemcpyDeviceToHost, - state.stream)); - 
CUDA_TRY(cudaStreamSynchronize(state.stream)); + state.stream.value())); + state.stream.synchronize(); } } else { // copy the full data @@ -1100,8 +1116,8 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) dev_bfr, ck->ck_stat_size + ck->compressed_size, cudaMemcpyDeviceToHost, - state.stream)); - CUDA_TRY(cudaStreamSynchronize(state.stream)); + state.stream.value())); + state.stream.synchronize(); out_sink_->host_write(host_bfr.get() + ck->ck_stat_size, ck->compressed_size); if (ck->ck_stat_size != 0) { state.md.row_groups[global_r].columns[i].meta_data.statistics_blob.resize( @@ -1174,7 +1190,7 @@ std::unique_ptr> writer::write(table_view const &table, bool return_filemetadata, const std::string column_chunks_file_path, bool int96_timestamps, - cudaStream_t stream) + rmm::cuda_stream_view stream) { return _impl->write( table, metadata, return_filemetadata, column_chunks_file_path, int96_timestamps, stream); diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 51d4213d782..75130c1881d 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -21,6 +21,8 @@ #pragma once +#include "chunked_state.hpp" + #include #include @@ -33,12 +35,12 @@ #include #include +#include + #include #include #include -#include "chunked_state.hpp" - namespace cudf { namespace io { namespace detail { @@ -87,7 +89,7 @@ class writer::impl { bool return_filemetadata, const std::string& column_chunks_file_path, bool int96_timestamps, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Begins the chunked/streamed write process. 
@@ -136,7 +138,7 @@ class writer::impl { uint32_t num_fragments, uint32_t num_rows, uint32_t fragment_size, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Gather per-fragment statistics * @@ -154,7 +156,7 @@ class writer::impl { uint32_t num_columns, uint32_t num_fragments, uint32_t fragment_size, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Build per-chunk dictionaries and count data pages * @@ -170,7 +172,7 @@ class writer::impl { uint32_t num_rowgroups, uint32_t num_columns, uint32_t num_dictionaries, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Initialize encoder pages * @@ -192,7 +194,7 @@ class writer::impl { uint32_t num_columns, uint32_t num_pages, uint32_t num_stats_bfr, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Encode a batch pages * @@ -220,7 +222,7 @@ class writer::impl { gpu_inflate_status_s* comp_out, const statistics_chunk* page_stats, const statistics_chunk* chunk_stats, - cudaStream_t stream); + rmm::cuda_stream_view stream); private: // TODO : figure out if we want to keep this. It is currently unused. diff --git a/cpp/src/io/statistics/column_stats.cu b/cpp/src/io/statistics/column_stats.cu index fb74987f061..69fb714d9c8 100644 --- a/cpp/src/io/statistics/column_stats.cu +++ b/cpp/src/io/statistics/column_stats.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-20, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,11 +13,17 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include -#include -#include + #include "column_stats.h" +#include + +#include + +#include + +#include + namespace cudf { namespace io { /** @@ -754,9 +760,9 @@ __global__ void __launch_bounds__(block_size, 1) void GatherColumnStatistics(statistics_chunk *chunks, const statistics_group *groups, uint32_t num_chunks, - cudaStream_t stream) + rmm::cuda_stream_view stream) { - gpuGatherColumnStatistics<1024><<>>(chunks, groups); + gpuGatherColumnStatistics<1024><<>>(chunks, groups); } /** @@ -772,9 +778,10 @@ void MergeColumnStatistics(statistics_chunk *chunks_out, const statistics_chunk *chunks_in, const statistics_merge_group *groups, uint32_t num_chunks, - cudaStream_t stream) + rmm::cuda_stream_view stream) { - gpuMergeColumnStatistics<1024><<>>(chunks_out, chunks_in, groups); + gpuMergeColumnStatistics<1024> + <<>>(chunks_out, chunks_in, groups); } } // namespace io diff --git a/cpp/src/io/statistics/column_stats.h b/cpp/src/io/statistics/column_stats.h index 588d764e9af..bbecc85b8d8 100644 --- a/cpp/src/io/statistics/column_stats.h +++ b/cpp/src/io/statistics/column_stats.h @@ -16,6 +16,8 @@ #pragma once #include +#include + namespace cudf { namespace io { @@ -96,7 +98,7 @@ struct statistics_merge_group { void GatherColumnStatistics(statistics_chunk *chunks, const statistics_group *groups, uint32_t num_chunks, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream); /** * @brief Launches kernel to merge column statistics @@ -111,7 +113,7 @@ void MergeColumnStatistics(statistics_chunk *chunks_out, const statistics_chunk *chunks_in, const statistics_merge_group *groups, uint32_t num_chunks, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream); } // namespace io } // namespace cudf diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index 0290857119b..832817cf7d5 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -51,13 +51,13 @@ 
namespace detail { inline rmm::device_buffer create_data( data_type type, size_type size, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { std::size_t data_size = size_of(type) * size; rmm::device_buffer data(data_size, stream, mr); - CUDA_TRY(cudaMemsetAsync(data.data(), 0, data_size, stream)); + CUDA_TRY(cudaMemsetAsync(data.data(), 0, data_size, stream.value())); return data; } @@ -84,7 +84,7 @@ struct column_buffer { column_buffer(data_type _type, size_type _size, bool _is_nullable = true, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : type(_type), is_nullable(_is_nullable), _null_count(0) { @@ -102,7 +102,7 @@ struct column_buffer { // instantiate a column of known type with a specified size. Allows deferred creation for // preprocessing steps such as in the Parquet reader void create(size_type _size, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { size = _size; @@ -164,9 +164,9 @@ namespace { */ std::unique_ptr make_column( column_buffer& buffer, - cudaStream_t stream = 0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - column_name_info* schema_info = nullptr) + column_name_info* schema_info = nullptr, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { using str_pair = thrust::pair; @@ -194,7 +194,7 @@ std::unique_ptr make_column( // make child column CUDF_EXPECTS(buffer.children.size() > 0, "Encountered malformed column_buffer"); - auto child = make_column(buffer.children[0], stream, mr, child_info); + auto child = make_column(buffer.children[0], child_info, stream, mr); // make the final 
list column (note : size is the # of offsets, so our actual # of rows is 1 // less) @@ -219,7 +219,7 @@ std::unique_ptr make_column( schema_info->children.push_back(column_name_info{""}); child_info = &schema_info->children.back(); } - return make_column(col, stream, mr, child_info); + return make_column(col, child_info, stream, mr); }); return make_structs_column(buffer.size, diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 469683e1ad0..b4c2f491927 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -19,6 +19,8 @@ #include #include +#include + namespace cudf { namespace io { /** @@ -86,7 +88,7 @@ class void_sink : public data_sink { bool supports_device_write() const override { return true; } - void device_write(void const* gpu_data, size_t size, cudaStream_t stream) override + void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override { bytes_written_ += size; } @@ -109,7 +111,7 @@ class user_sink_wrapper : public data_sink { bool supports_device_write() const override { return user_sink->supports_device_write(); } - void device_write(void const* gpu_data, size_t size, cudaStream_t stream) override + void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override { CUDF_EXPECTS(user_sink->supports_device_write(), "device_write() being called on a data_sink that doesn't support it"); diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index 0bdd5ca8345..5a4ac8e1d7e 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -16,10 +16,11 @@ #pragma once -#include - #include +#include +#include + /** * @brief A helper class that wraps fixed-length device memory for the GPU, and * a mirror host pinned memory for the CPU. 
@@ -43,12 +44,15 @@ class hostdevice_vector { return *this; } - explicit hostdevice_vector(size_t max_size, cudaStream_t stream = 0) + explicit hostdevice_vector(size_t max_size, + rmm::cuda_stream_view stream = rmm::cuda_stream_default) : hostdevice_vector(max_size, max_size, stream) { } - explicit hostdevice_vector(size_t initial_size, size_t max_size, cudaStream_t stream = 0) + explicit hostdevice_vector(size_t initial_size, + size_t max_size, + rmm::cuda_stream_view stream = rmm::cuda_stream_default) : num_elements(initial_size), max_elements(max_size) { if (max_elements != 0) { @@ -87,16 +91,18 @@ class hostdevice_vector { return reinterpret_cast(d_data.data()) + offset; } - void host_to_device(cudaStream_t stream, bool synchronize = false) + void host_to_device(rmm::cuda_stream_view stream, bool synchronize = false) { - cudaMemcpyAsync(d_data.data(), h_data, memory_size(), cudaMemcpyHostToDevice, stream); - if (synchronize) { cudaStreamSynchronize(stream); } + CUDA_TRY(cudaMemcpyAsync( + d_data.data(), h_data, memory_size(), cudaMemcpyHostToDevice, stream.value())); + if (synchronize) { stream.synchronize(); } } - void device_to_host(cudaStream_t stream, bool synchronize = false) + void device_to_host(rmm::cuda_stream_view stream, bool synchronize = false) { - cudaMemcpyAsync(h_data, d_data.data(), memory_size(), cudaMemcpyDeviceToHost, stream); - if (synchronize) { cudaStreamSynchronize(stream); } + CUDA_TRY(cudaMemcpyAsync( + h_data, d_data.data(), memory_size(), cudaMemcpyDeviceToHost, stream.value())); + if (synchronize) { stream.synchronize(); } } private: @@ -113,9 +119,9 @@ class hostdevice_vector { v.h_data = nullptr; } - cudaStream_t stream = 0; - size_t max_elements = 0; - size_t num_elements = 0; - T *h_data = nullptr; - rmm::device_buffer d_data; + rmm::cuda_stream_view stream{}; + size_t max_elements{}; + size_t num_elements{}; + T *h_data{}; + rmm::device_buffer d_data{}; }; diff --git a/cpp/src/join/cross_join.cu b/cpp/src/join/cross_join.cu 
index 5a2dc32e27a..c3145f71efd 100644 --- a/cpp/src/join/cross_join.cu +++ b/cpp/src/join/cross_join.cu @@ -27,6 +27,8 @@ #include #include +#include + namespace cudf { namespace detail { /** @@ -37,7 +39,7 @@ namespace detail { std::unique_ptr cross_join( cudf::table_view const& left, cudf::table_view const& right, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(0 != left.num_columns(), "Left table is empty"); @@ -75,7 +77,7 @@ std::unique_ptr cross_join(cudf::table_view const& left, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::cross_join(left, right, 0, mr); + return detail::cross_join(left, right, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 91188539790..67b9d3436d8 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -13,11 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include + #include #include #include -#include +#include #include @@ -133,7 +135,7 @@ std::pair, rmm::device_vector> get_left_join_indices_complement(rmm::device_vector &right_indices, size_type left_table_row_count, size_type right_table_row_count, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // Get array of indices that do not appear in right_indices @@ -146,7 +148,7 @@ get_left_join_indices_complement(rmm::device_vector &right_indices, // right_indices will be JoinNoneValue, i.e. -1. This if path should // produce exactly the same result as the else path but will be faster. 
if (left_table_row_count == 0) { - thrust::sequence(rmm::exec_policy(stream)->on(stream), + thrust::sequence(rmm::exec_policy(stream)->on(stream.value()), right_indices_complement.begin(), right_indices_complement.end(), 0); @@ -158,7 +160,7 @@ get_left_join_indices_complement(rmm::device_vector &right_indices, // invalid_index_map[index_ptr[i]] = 0 for i = 0 to right_table_row_count // Thus specifying that those locations are valid - thrust::scatter_if(rmm::exec_policy(stream)->on(stream), + thrust::scatter_if(rmm::exec_policy(stream)->on(stream.value()), thrust::make_constant_iterator(0), thrust::make_constant_iterator(0) + right_indices.size(), right_indices.begin(), // Index locations @@ -169,7 +171,7 @@ get_left_join_indices_complement(rmm::device_vector &right_indices, size_type end_counter = static_cast(right_table_row_count); // Create list of indices that have been marked as invalid - size_type indices_count = thrust::copy_if(rmm::exec_policy(stream)->on(stream), + size_type indices_count = thrust::copy_if(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(begin_counter), thrust::make_counting_iterator(end_counter), invalid_index_map.begin(), @@ -200,7 +202,7 @@ get_left_join_indices_complement(rmm::device_vector &right_indices, * @return Built hash table. 
*/ std::unique_ptr> build_join_hash_table( - cudf::table_device_view build_table, cudaStream_t stream) + cudf::table_device_view build_table, rmm::cuda_stream_view stream) { CUDF_EXPECTS(0 != build_table.num_columns(), "Selected build dataset is empty"); CUDF_EXPECTS(0 != build_table.num_rows(), "Build side table has no rows"); @@ -209,17 +211,17 @@ std::unique_ptr> build_join_ size_t const hash_table_size = compute_hash_table_size(build_table_num_rows); auto hash_table = multimap_type::create(hash_table_size, + stream, true, multimap_type::hasher(), multimap_type::key_equal(), - multimap_type::allocator_type(), - stream); + multimap_type::allocator_type()); row_hash hash_build{build_table}; rmm::device_scalar failure(0, stream); constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; detail::grid_1d config(build_table_num_rows, block_size); - build_hash_table<<>>( + build_hash_table<<>>( *hash_table, hash_build, build_table_num_rows, failure.data()); // Check error code from the kernel if (failure.value(stream) == 1) { CUDF_FAIL("Hash Table insert failure."); } @@ -247,7 +249,7 @@ std::pair, rmm::device_vector> probe_jo cudf::table_device_view probe_table, multimap_type const &hash_table, null_equality compare_nulls, - cudaStream_t stream) + rmm::cuda_stream_view stream) { size_type estimated_size = estimate_join_output_size( build_table, probe_table, hash_table, compare_nulls, stream); @@ -278,17 +280,18 @@ std::pair, rmm::device_vector> probe_jo row_hash hash_probe{probe_table}; row_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL}; probe_hash_table - <<>>(hash_table, - build_table, - probe_table, - hash_probe, - equality, - left_indices.data().get(), - right_indices.data().get(), - write_index.data(), - estimated_size); - - CHECK_CUDA(stream); + <<>>( + hash_table, + build_table, + probe_table, + hash_probe, + equality, + left_indices.data().get(), + right_indices.data().get(), + write_index.data(), + estimated_size); + + 
CHECK_CUDA(stream.value()); join_size = write_index.value(stream); current_estimated_size = estimated_size; @@ -388,8 +391,8 @@ std::pair, std::unique_ptr
> construct_join_output_ VectorPair &joined_indices, std::vector> const &columns_in_common, cudf::hash_join::common_columns_output_side common_columns_output_side, - rmm::mr::device_memory_resource *mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { std::vector probe_common_col; probe_common_col.reserve(columns_in_common.size()); @@ -481,7 +484,7 @@ hash_join::hash_join_impl::~hash_join_impl() = default; hash_join::hash_join_impl::hash_join_impl(cudf::table_view const &build, std::vector const &build_on, - cudaStream_t stream) + rmm::cuda_stream_view stream) : _build(build), _build_selected(build.select(build_on)), _build_on(build_on), @@ -505,12 +508,12 @@ hash_join::hash_join_impl::inner_join( std::vector> const &columns_in_common, common_columns_output_side common_columns_output_side, null_equality compare_nulls, - rmm::mr::device_memory_resource *mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) const { CUDF_FUNC_RANGE(); return compute_hash_join( - probe, probe_on, columns_in_common, common_columns_output_side, compare_nulls, mr, stream); + probe, probe_on, columns_in_common, common_columns_output_side, compare_nulls, stream, mr); } std::unique_ptr hash_join::hash_join_impl::left_join( @@ -518,8 +521,8 @@ std::unique_ptr hash_join::hash_join_impl::left_join( std::vector const &probe_on, std::vector> const &columns_in_common, null_equality compare_nulls, - rmm::mr::device_memory_resource *mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) const { CUDF_FUNC_RANGE(); auto probe_build_pair = @@ -528,8 +531,8 @@ std::unique_ptr hash_join::hash_join_impl::left_join( columns_in_common, common_columns_output_side::PROBE, compare_nulls, - mr, - stream); + stream, + mr); return cudf::detail::combine_table_pair(std::move(probe_build_pair.first), std::move(probe_build_pair.second)); } @@ -539,8 +542,8 @@ 
std::unique_ptr hash_join::hash_join_impl::full_join( std::vector const &probe_on, std::vector> const &columns_in_common, null_equality compare_nulls, - rmm::mr::device_memory_resource *mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) const { CUDF_FUNC_RANGE(); auto probe_build_pair = @@ -549,8 +552,8 @@ std::unique_ptr hash_join::hash_join_impl::full_join( columns_in_common, common_columns_output_side::PROBE, compare_nulls, - mr, - stream); + stream, + mr); return cudf::detail::combine_table_pair(std::move(probe_build_pair.first), std::move(probe_build_pair.second)); } @@ -563,8 +566,8 @@ hash_join::hash_join_impl::compute_hash_join( std::vector> const &columns_in_common, common_columns_output_side common_columns_output_side, null_equality compare_nulls, - rmm::mr::device_memory_resource *mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) const { CUDF_EXPECTS(0 != probe.num_columns(), "Hash join probe table is empty"); CUDF_EXPECTS(probe.num_rows() < cudf::detail::MAX_JOIN_SIZE, @@ -600,7 +603,7 @@ hash_join::hash_join_impl::compute_hash_join( : JoinKind; auto joined_indices = probe_join_indices(probe_selected, compare_nulls, stream); return cudf::detail::construct_join_output_df( - probe, _build, joined_indices, columns_in_common, common_columns_output_side, mr, stream); + probe, _build, joined_indices, columns_in_common, common_columns_output_side, stream, mr); } template @@ -608,7 +611,7 @@ std::enable_if_t, rmm::device_vector>> hash_join::hash_join_impl::probe_join_indices(cudf::table_view const &probe, null_equality compare_nulls, - cudaStream_t stream) const + rmm::cuda_stream_view stream) const { // Trivial left join case - exit early if (!_hash_table && JoinKind == cudf::detail::join_kind::LEFT_JOIN) { diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index f49a563e857..36cb486d4c1 100644 --- a/cpp/src/join/hash_join.cuh +++ 
b/cpp/src/join/hash_join.cuh @@ -15,16 +15,19 @@ */ #pragma once +#include +#include + #include #include #include #include #include -#include -#include +#include #include + #include namespace cudf { @@ -58,7 +61,7 @@ size_type estimate_join_output_size(table_device_view build_table, table_device_view probe_table, multimap_type const& hash_table, null_equality compare_nulls, - cudaStream_t stream) + rmm::cuda_stream_view stream) { using estimate_size_type = int64_t; // use 64-bit size so we can detect overflow @@ -95,7 +98,7 @@ size_type estimate_join_output_size(table_device_view build_table, estimate_size_type h_size_estimate{0}; rmm::device_scalar size_estimate(0, stream); - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; int numBlocks{-1}; @@ -122,14 +125,14 @@ size_type estimate_join_output_size(table_device_view build_table, // Probe the hash table without actually building the output to simply // find what the size of the output will be. 
compute_join_output_size - <<>>(hash_table, - build_table, - probe_table, - hash_probe, - equality, - sample_probe_num_rows, - size_estimate.data()); - CHECK_CUDA(stream); + <<>>(hash_table, + build_table, + probe_table, + hash_probe, + equality, + sample_probe_num_rows, + size_estimate.data()); + CHECK_CUDA(stream.value()); // Only in case subset of probe table is chosen, // increase the estimated output size by a factor of the ratio between the @@ -177,13 +180,13 @@ size_type estimate_join_output_size(table_device_view build_table, * @return Join output indices vector pair */ inline std::pair, rmm::device_vector> -get_trivial_left_join_indices(table_view const& left, cudaStream_t stream) +get_trivial_left_join_indices(table_view const& left, rmm::cuda_stream_view stream) { rmm::device_vector left_indices(left.num_rows()); thrust::sequence( - rmm::exec_policy(stream)->on(stream), left_indices.begin(), left_indices.end(), 0); + rmm::exec_policy(stream)->on(stream.value()), left_indices.begin(), left_indices.end(), 0); rmm::device_vector right_indices(left.num_rows()); - thrust::fill(rmm::exec_policy(stream)->on(stream), + thrust::fill(rmm::exec_policy(stream)->on(stream.value()), right_indices.begin(), right_indices.end(), JoinNoneValue); @@ -226,7 +229,7 @@ struct hash_join::hash_join_impl { */ hash_join_impl(cudf::table_view const& build, std::vector const& build_on, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); std::pair, std::unique_ptr> inner_join( cudf::table_view const& probe, @@ -234,24 +237,24 @@ struct hash_join::hash_join_impl { std::vector> const& columns_in_common, common_columns_output_side common_columns_output_side, null_equality compare_nulls, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) const; + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; std::unique_ptr left_join( cudf::table_view const& probe, std::vector const& probe_on, std::vector> const& 
columns_in_common, null_equality compare_nulls, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) const; + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; std::unique_ptr full_join( cudf::table_view const& probe, std::vector const& probe_on, std::vector> const& columns_in_common, null_equality compare_nulls, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) const; + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; private: /** @@ -302,8 +305,8 @@ struct hash_join::hash_join_impl { std::vector> const& columns_in_common, common_columns_output_side common_columns_output_side, null_equality compare_nulls, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) const; + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; /** * @brief Probes the `_hash_table` built from `_build` for tuples in `probe_table`, @@ -325,7 +328,7 @@ struct hash_join::hash_join_impl { std::pair, rmm::device_vector>> probe_join_indices(cudf::table_view const& probe, null_equality compare_nulls, - cudaStream_t stream) const; + rmm::cuda_stream_view stream) const; }; } // namespace cudf diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 7c750395e61..af649fe5fb0 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -13,13 +13,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include +#include + #include #include #include #include -#include -#include +#include namespace cudf { namespace detail { @@ -31,15 +33,16 @@ std::unique_ptr
inner_join( std::vector const& right_on, std::vector> const& columns_in_common, null_equality compare_nulls, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Make sure any dictionary columns have matched key sets. // This will return any new dictionary columns created as well as updated table_views. auto matched = cudf::dictionary::detail::match_dictionaries( {left_input.select(left_on), right_input.select(right_on)}, - rmm::mr::get_current_device_resource(), // temporary objects returned - stream); + stream, + rmm::mr::get_current_device_resource()); // temporary objects returned + // now rebuild the table views with the updated ones auto const left = scatter_columns(matched.second.front(), left_on, left_input); auto const right = scatter_columns(matched.second.back(), right_on, right_input); @@ -58,19 +61,19 @@ std::unique_ptr
inner_join( actual_columns_in_common, cudf::hash_join::common_columns_output_side::BUILD, compare_nulls, - mr, - stream); + stream, + mr); return cudf::detail::combine_table_pair(std::move(probe_build_pair.second), std::move(probe_build_pair.first)); } else { - cudf::hash_join hj_obj(right, right_on); + cudf::hash_join hj_obj(right, right_on, stream); auto probe_build_pair = hj_obj.inner_join(left, left_on, columns_in_common, cudf::hash_join::common_columns_output_side::PROBE, compare_nulls, - mr, - stream); + stream, + mr); return cudf::detail::combine_table_pair(std::move(probe_build_pair.first), std::move(probe_build_pair.second)); } @@ -83,21 +86,21 @@ std::unique_ptr
left_join( std::vector const& right_on, std::vector> const& columns_in_common, null_equality compare_nulls, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Make sure any dictionary columns have matched key sets. // This will return any new dictionary columns created as well as updated table_views. auto matched = cudf::dictionary::detail::match_dictionaries( {left_input.select(left_on), right_input.select(right_on)}, // these should match - rmm::mr::get_current_device_resource(), // temporary objects returned - stream); + stream, + rmm::mr::get_current_device_resource()); // temporary objects returned // now rebuild the table views with the updated ones table_view const left = scatter_columns(matched.second.front(), left_on, left_input); table_view const right = scatter_columns(matched.second.back(), right_on, right_input); cudf::hash_join hj_obj(right, right_on, stream); - return hj_obj.left_join(left, left_on, columns_in_common, compare_nulls, mr, stream); + return hj_obj.left_join(left, left_on, columns_in_common, compare_nulls, stream, mr); } std::unique_ptr
full_join( @@ -107,21 +110,21 @@ std::unique_ptr
full_join( std::vector const& right_on, std::vector> const& columns_in_common, null_equality compare_nulls, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Make sure any dictionary columns have matched key sets. // This will return any new dictionary columns created as well as updated table_views. auto matched = cudf::dictionary::detail::match_dictionaries( {left_input.select(left_on), right_input.select(right_on)}, // these should match - rmm::mr::get_current_device_resource(), // temporary objects returned - stream); + stream, + rmm::mr::get_current_device_resource()); // temporary objects returned // now rebuild the table views with the updated ones table_view const left = scatter_columns(matched.second.front(), left_on, left_input); table_view const right = scatter_columns(matched.second.back(), right_on, right_input); cudf::hash_join hj_obj(right, right_on, stream); - return hj_obj.full_join(left, left_on, columns_in_common, compare_nulls, mr, stream); + return hj_obj.full_join(left, left_on, columns_in_common, compare_nulls, stream, mr); } } // namespace detail @@ -130,7 +133,7 @@ hash_join::~hash_join() = default; hash_join::hash_join(cudf::table_view const& build, std::vector const& build_on, - cudaStream_t stream) + rmm::cuda_stream_view stream) : impl{std::make_unique(build, build_on, stream)} { } @@ -141,11 +144,11 @@ std::pair, std::unique_ptr> hash_join: std::vector> const& columns_in_common, common_columns_output_side common_columns_output_side, null_equality compare_nulls, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { return impl->inner_join( - probe, probe_on, columns_in_common, common_columns_output_side, compare_nulls, mr, stream); + probe, probe_on, columns_in_common, common_columns_output_side, compare_nulls, stream, mr); } std::unique_ptr 
hash_join::left_join( @@ -153,10 +156,10 @@ std::unique_ptr hash_join::left_join( std::vector const& probe_on, std::vector> const& columns_in_common, null_equality compare_nulls, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { - return impl->left_join(probe, probe_on, columns_in_common, compare_nulls, mr, stream); + return impl->left_join(probe, probe_on, columns_in_common, compare_nulls, stream, mr); } std::unique_ptr hash_join::full_join( @@ -164,10 +167,10 @@ std::unique_ptr hash_join::full_join( std::vector const& probe_on, std::vector> const& columns_in_common, null_equality compare_nulls, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { - return impl->full_join(probe, probe_on, columns_in_common, compare_nulls, mr, stream); + return impl->full_join(probe, probe_on, columns_in_common, compare_nulls, stream, mr); } // external APIs @@ -183,7 +186,7 @@ std::unique_ptr
inner_join( { CUDF_FUNC_RANGE(); return detail::inner_join( - left, right, left_on, right_on, columns_in_common, compare_nulls, mr, 0); + left, right, left_on, right_on, columns_in_common, compare_nulls, rmm::cuda_stream_default, mr); } std::unique_ptr
left_join( @@ -196,7 +199,8 @@ std::unique_ptr
left_join( rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::left_join(left, right, left_on, right_on, columns_in_common, compare_nulls, mr, 0); + return detail::left_join( + left, right, left_on, right_on, columns_in_common, compare_nulls, rmm::cuda_stream_default, mr); } std::unique_ptr
full_join( @@ -209,7 +213,8 @@ std::unique_ptr
full_join( rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::full_join(left, right, left_on, right_on, columns_in_common, compare_nulls, mr, 0); + return detail::full_join( + left, right, left_on, right_on, columns_in_common, compare_nulls, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/join/nested_loop_join.cuh b/cpp/src/join/nested_loop_join.cuh index 470549265d1..03d684f91d4 100644 --- a/cpp/src/join/nested_loop_join.cuh +++ b/cpp/src/join/nested_loop_join.cuh @@ -15,18 +15,21 @@ */ #pragma once +#include "hash_join.cuh" +#include "join_common_utils.hpp" +#include "join_kernels.cuh" + #include #include #include #include #include #include -#include +#include -#include "cudf/types.hpp" -#include "hash_join.cuh" -#include "join_common_utils.hpp" -#include "join_kernels.cuh" +#include + +#include namespace cudf { namespace detail { @@ -48,7 +51,7 @@ size_type estimate_nested_loop_join_output_size(table_device_view left, table_device_view right, join_kind JoinKind, null_equality compare_nulls, - cudaStream_t stream) + rmm::cuda_stream_view stream) { const size_type left_num_rows{left.num_rows()}; const size_type right_num_rows{right.num_rows()}; @@ -72,7 +75,7 @@ size_type estimate_nested_loop_join_output_size(table_device_view left, size_type h_size_estimate{0}; rmm::device_scalar size_estimate(0, stream); - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; int numBlocks{-1}; @@ -91,9 +94,10 @@ size_type estimate_nested_loop_join_output_size(table_device_view left, row_equality equality{left, right, compare_nulls == null_equality::EQUAL}; // Determine number of output rows without actually building the output to simply // find what the size of the output will be. 
- compute_nested_loop_join_output_size<<>>( - left, right, JoinKind, equality, size_estimate.data()); - CHECK_CUDA(stream); + compute_nested_loop_join_output_size + <<>>( + left, right, JoinKind, equality, size_estimate.data()); + CHECK_CUDA(stream.value()); h_size_estimate = size_estimate.value(stream); @@ -120,7 +124,7 @@ get_base_nested_loop_join_indices(table_view const& left, bool flip_join_indices, join_kind JoinKind, null_equality compare_nulls, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // The `right` table is always used for the inner loop. We want to use the smaller table // for the inner loop. Thus, if `left` is smaller than `right`, swap `left/right`. @@ -167,16 +171,16 @@ get_base_nested_loop_join_indices(table_view const& left, const auto& join_output_r = flip_join_indices ? left_indices.data().get() : right_indices.data().get(); nested_loop_join - <<>>(*left_table, - *right_table, - JoinKind, - equality, - join_output_l, - join_output_r, - write_index.data(), - estimated_size); - - CHECK_CUDA(stream); + <<>>(*left_table, + *right_table, + JoinKind, + equality, + join_output_l, + join_output_r, + write_index.data(), + estimated_size); + + CHECK_CUDA(stream.value()); join_size = write_index.value(); current_estimated_size = estimated_size; diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index 2b58c1a864a..6df329243ed 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -75,8 +75,8 @@ std::unique_ptr left_semi_anti_join( std::vector const& right_on, std::vector const& return_columns, null_equality compare_nulls, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(0 != left.num_columns(), "Left table is empty"); CUDF_EXPECTS(0 != right.num_columns(), "Right table is empty"); @@ -100,8 +100,9 @@ std::unique_ptr 
left_semi_anti_join( // This will return any new dictionary columns created as well as updated table_views. auto matched = cudf::dictionary::detail::match_dictionaries( {left.select(left_on), right.select(right_on)}, - rmm::mr::get_current_device_resource(), // temporary objects returned - stream); + stream, + rmm::mr::get_current_device_resource()); // temporary objects returned + auto const left_selected = matched.second.front(); auto const right_selected = matched.second.back(); @@ -120,13 +121,14 @@ std::unique_ptr left_semi_anti_join( row_equality equality_probe{*left_rows_d, *right_rows_d, compare_nulls == null_equality::EQUAL}; auto hash_table_ptr = hash_table_type::create(hash_table_size, + stream, std::numeric_limits::max(), std::numeric_limits::max(), hash_build, equality_build); auto hash_table = *hash_table_ptr; - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), right_num_rows, [hash_table] __device__(size_type idx) mutable { @@ -145,7 +147,7 @@ std::unique_ptr left_semi_anti_join( // gather_map_end will be the end of valid data in gather_map auto gather_map_end = thrust::copy_if( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(left_num_rows), gather_map.begin(), @@ -171,7 +173,7 @@ std::unique_ptr left_semi_join(cudf::table_view const& left, { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( - left, right, left_on, right_on, return_columns, compare_nulls, mr, 0); + left, right, left_on, right_on, return_columns, compare_nulls, rmm::cuda_stream_default, mr); } std::unique_ptr left_anti_join(cudf::table_view const& left, @@ -184,7 +186,7 @@ std::unique_ptr left_anti_join(cudf::table_view const& left, { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( - left, right, left_on, right_on, return_columns, compare_nulls, mr, 0); + left, 
right, left_on, right_on, return_columns, compare_nulls, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/lists/copying/copying.cu b/cpp/src/lists/copying/copying.cu index ccf57a09d52..c65a0518431 100644 --- a/cpp/src/lists/copying/copying.cu +++ b/cpp/src/lists/copying/copying.cu @@ -31,7 +31,7 @@ namespace detail { std::unique_ptr copy_slice(lists_column_view const& lists, size_type start, size_type end, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (lists.is_empty()) { return cudf::empty_like(lists.parent()); } @@ -54,7 +54,7 @@ std::unique_ptr copy_slice(lists_column_view const& lists, // Compute the offsets column of the result: thrust::transform( - execpol->on(stream), + execpol->on(stream.value()), offsets_data + start, offsets_data + end + 1, // size of offsets column is 1 greater than slice length out_offsets.data(), @@ -73,8 +73,7 @@ std::unique_ptr copy_slice(lists_column_view const& lists, cudf::detail::slice(lists.child(), {start_offset, end_offset}, stream).front()); // Compute the null mask of the result: - auto null_mask = - cudf::detail::copy_bitmask(lists.null_mask(), start, end, rmm::cuda_stream_view{stream}, mr); + auto null_mask = cudf::detail::copy_bitmask(lists.null_mask(), start, end, stream, mr); return make_lists_column(lists_count, std::move(offsets), diff --git a/cpp/src/lists/copying/gather.cu b/cpp/src/lists/copying/gather.cu index 96c20fd93ad..a6d9e0baf40 100644 --- a/cpp/src/lists/copying/gather.cu +++ b/cpp/src/lists/copying/gather.cu @@ -13,13 +13,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include -#include + #include #include #include +#include + namespace cudf { namespace lists { namespace detail { diff --git a/cpp/src/lists/extract.cu b/cpp/src/lists/extract.cu index 5adb21a47f1..342bd006ea2 100644 --- a/cpp/src/lists/extract.cu +++ b/cpp/src/lists/extract.cu @@ -18,6 +18,8 @@ #include #include +#include + #include namespace cudf { @@ -57,7 +59,7 @@ struct map_index_fn { */ std::unique_ptr extract_list_element(lists_column_view lists_column, size_type index, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (lists_column.is_empty()) return empty_like(lists_column.parent()); @@ -80,13 +82,13 @@ std::unique_ptr extract_list_element(lists_column_view lists_column, // build the gather map using the offsets and the provided index auto const d_column = column_device_view::create(annotated_offsets, stream); if (index < 0) - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(gather_map->size()), d_gather_map, map_index_fn{*d_column, index, child_column.size()}); else - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(gather_map->size()), d_gather_map, @@ -114,7 +116,7 @@ std::unique_ptr extract_list_element(lists_column_view const& lists_colu size_type index, rmm::mr::device_memory_resource* mr) { - return detail::extract_list_element(lists_column, index, 0, mr); + return detail::extract_list_element(lists_column, index, rmm::cuda_stream_default, mr); } } // namespace lists diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index 265a20bcbb7..0afdac17a7d 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -14,6 +14,7 @@ * limitations under the License. 
*/ #include +#include #include #include #include @@ -24,6 +25,7 @@ #include #include +#include #include #include @@ -103,7 +105,7 @@ void materialize_bitmask(column_view const& left_col, column_view const& right_col, mutable_column_view& out_col, index_type const* merged_indices, - cudaStream_t stream) + rmm::cuda_stream_view stream) { constexpr size_type BLOCK_SIZE{256}; detail::grid_1d grid_config{out_col.size(), BLOCK_SIZE}; @@ -119,24 +121,24 @@ void materialize_bitmask(column_view const& left_col, if (left_col.has_nulls()) { if (right_col.has_nulls()) { materialize_merged_bitmask_kernel - <<>>( + <<>>( left_valid, right_valid, out_valid, out_col.size(), merged_indices); } else { materialize_merged_bitmask_kernel - <<>>( + <<>>( left_valid, right_valid, out_valid, out_col.size(), merged_indices); } } else { if (right_col.has_nulls()) { materialize_merged_bitmask_kernel - <<>>( + <<>>( left_valid, right_valid, out_valid, out_col.size(), merged_indices); } else { CUDF_FAIL("materialize_merged_bitmask_kernel() should never be called."); } } - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); } /** @@ -161,8 +163,8 @@ rmm::device_vector generate_merged_indices( table_view const& right_table, std::vector const& column_order, std::vector const& null_precedence, - bool nullable = true, - cudaStream_t stream = nullptr) + bool nullable = true, + rmm::cuda_stream_view stream = rmm::cuda_stream_default) { const size_type left_size = left_table.num_rows(); const size_type right_size = right_table.num_rows(); @@ -200,7 +202,7 @@ rmm::device_vector generate_merged_indices( *rhs_device_view, d_column_order.data().get(), d_null_precedence.data().get()); - thrust::merge(exec_pol->on(stream), + thrust::merge(exec_pol->on(stream.value()), left_begin_zip_iterator, left_end_zip_iterator, right_begin_zip_iterator, @@ -210,7 +212,7 @@ rmm::device_vector generate_merged_indices( } else { auto ineq_op = detail::row_lexicographic_tagged_comparator( *lhs_device_view, *rhs_device_view, 
d_column_order.data().get()); - thrust::merge(exec_pol->on(stream), + thrust::merge(exec_pol->on(stream.value()), left_begin_zip_iterator, left_end_zip_iterator, right_begin_zip_iterator, @@ -219,7 +221,7 @@ rmm::device_vector generate_merged_indices( ineq_op); } - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); return merged_indices; } @@ -231,24 +233,24 @@ rmm::device_vector generate_merged_indices( * (ordered according to indices of key_cols) and the 2 columns to merge. */ struct column_merger { - explicit column_merger( - index_vector const& row_order, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = nullptr) - : row_order_(row_order), mr_(mr), stream_(stream) - { - } + explicit column_merger(index_vector const& row_order) : row_order_(row_order) {} // column merger operator; // template // required: column type - std::unique_ptr operator()(column_view const& lcol, column_view const& rcol) const + std::unique_ptr operator()( + column_view const& lcol, + column_view const& rcol, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const { auto lsz = lcol.size(); auto merged_size = lsz + rcol.size(); - auto type = lcol.type(); - auto merged_col = lcol.has_nulls() ? cudf::allocate_like(lcol, merged_size) - : cudf::allocate_like(rcol, merged_size); + auto merged_col = cudf::detail::allocate_like(lcol.has_nulls() ? 
lcol : rcol, + merged_size, + cudf::mask_allocation_policy::RETAIN, + stream, + mr); //"gather" data from lcol, rcol according to row_order_ "map" //(directly calling gather() won't work because @@ -258,12 +260,13 @@ struct column_merger { // initialize null_mask to all valid: // - // Note: this initialization in conjunction with _conditionally_ - // calling materialize_bitmask() below covers the case - // materialize_merged_bitmask_kernel() - // which won't be called anymore (because of the _condition_ below) + // Note: this initialization in conjunction with + // _conditionally_ calling materialize_bitmask() below covers + // the case materialize_merged_bitmask_kernel() + // which won't be called anymore (because of the _condition_ + // below) // - cudf::detail::set_null_mask(merged_view.null_mask(), 0, merged_view.size(), true, stream_); + cudf::detail::set_null_mask(merged_view.null_mask(), 0, merged_view.size(), true, stream); // set the null count: // @@ -276,13 +279,13 @@ struct column_merger { auto const d_lcol = lcol.data(); auto const d_rcol = rcol.data(); - auto exe_pol = rmm::exec_policy(stream_); + auto exe_pol = rmm::exec_policy(stream); // capture lcol, rcol // and "gather" into merged_view.data()[indx_merged] // from lcol or rcol, depending on side; // - thrust::transform(exe_pol->on(stream_), + thrust::transform(exe_pol->on(stream.value()), row_order_.begin(), row_order_.end(), merged_view.begin(), @@ -299,7 +302,7 @@ struct column_merger { if (lcol.has_nulls() || rcol.has_nulls()) { // resolve null mask: // - materialize_bitmask(lcol, rcol, merged_view, row_order_.data().get(), stream_); + materialize_bitmask(lcol, rcol, merged_view, row_order_.data().get(), stream); } return merged_col; @@ -307,42 +310,43 @@ struct column_merger { private: index_vector const& row_order_; - rmm::mr::device_memory_resource* mr_; - cudaStream_t stream_; }; // specialization for strings template <> -std::unique_ptr column_merger::operator()(column_view const& lcol, - 
column_view const& rcol) const +std::unique_ptr column_merger::operator()( + column_view const& lcol, + column_view const& rcol, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { auto column = strings::detail::merge(strings_column_view(lcol), strings_column_view(rcol), row_order_.begin(), row_order_.end(), - mr_, - stream_); + stream, + mr); if (lcol.has_nulls() || rcol.has_nulls()) { auto merged_view = column->mutable_view(); - materialize_bitmask(lcol, rcol, merged_view, row_order_.data().get(), stream_); + materialize_bitmask(lcol, rcol, merged_view, row_order_.data().get(), stream); } return column; } // specialization for dictionary template <> -std::unique_ptr column_merger::operator()(column_view const& lcol, - column_view const& rcol) const +std::unique_ptr column_merger::operator()( + column_view const& lcol, + column_view const& rcol, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { - auto result = cudf::dictionary::detail::merge(cudf::dictionary_column_view(lcol), - cudf::dictionary_column_view(rcol), - row_order_, - mr_, - stream_); + auto result = cudf::dictionary::detail::merge( + cudf::dictionary_column_view(lcol), cudf::dictionary_column_view(rcol), row_order_, stream, mr); // set the validity mask if (lcol.has_nulls() || rcol.has_nulls()) { auto merged_view = result->mutable_view(); - materialize_bitmask(lcol, rcol, merged_view, row_order_.data().get(), stream_); + materialize_bitmask(lcol, rcol, merged_view, row_order_.data().get(), stream); } return result; } @@ -355,8 +359,8 @@ table_ptr_type merge(cudf::table_view const& left_table, std::vector const& key_cols, std::vector const& column_order, std::vector const& null_precedence, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // collect index columns for lhs, rhs, resp. 
// @@ -375,13 +379,14 @@ table_ptr_type merge(cudf::table_view const& left_table, std::vector> merged_cols; merged_cols.reserve(n_cols); - column_merger merger{merged_indices, mr, stream}; + column_merger merger{merged_indices}; transform(left_table.begin(), left_table.end(), right_table.begin(), std::back_inserter(merged_cols), [&](auto const& left_col, auto const& right_col) { - return cudf::type_dispatcher(left_col.type(), merger, left_col, right_col); + return cudf::type_dispatcher( + left_col.type(), merger, left_col, right_col, stream, mr); }); return std::make_unique(std::move(merged_cols)); @@ -417,8 +422,8 @@ table_ptr_type merge(std::vector const& tables_to_merge, std::vector const& key_cols, std::vector const& column_order, std::vector const& null_precedence, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (tables_to_merge.empty()) { return std::make_unique(); } @@ -444,7 +449,7 @@ table_ptr_type merge(std::vector const& tables_to_merge, // This utility will ensure all corresponding dictionary columns have matching keys. // It will return any new dictionary columns created as well as updated table_views. auto matched = cudf::dictionary::detail::match_dictionaries( - tables_to_merge, rmm::mr::get_current_device_resource(), stream); + tables_to_merge, stream, rmm::mr::get_current_device_resource()); auto merge_tables = matched.second; // A queue of (table view, table) pairs @@ -468,14 +473,14 @@ table_ptr_type merge(std::vector const& tables_to_merge, auto const right_table = top_and_pop(merge_queue); // Only use mr for the output table - auto const& new_tbl_rm = merge_queue.empty() ? mr : rmm::mr::get_current_device_resource(); + auto const& new_tbl_mr = merge_queue.empty() ? 
mr : rmm::mr::get_current_device_resource(); auto merged_table = merge(left_table.view, right_table.view, key_cols, column_order, null_precedence, - new_tbl_rm, - stream); + stream, + new_tbl_mr); auto const merged_table_view = merged_table->view(); merge_queue.emplace(merged_table_view, std::move(merged_table)); @@ -493,7 +498,8 @@ std::unique_ptr merge(std::vector const& tables_to_merg rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::merge(tables_to_merge, key_cols, column_order, null_precedence, mr); + return detail::merge( + tables_to_merge, key_cols, column_order, null_precedence, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index 3d0f35568f4..6f79cf3aa08 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -26,6 +26,8 @@ #include #include +#include + namespace cudf { namespace { // Launch configuration for optimized hash partition @@ -338,7 +340,7 @@ void copy_block_partitions_impl(InputIter const input, size_type const* block_partition_sizes, size_type const* scanned_block_partition_sizes, size_type grid_size, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // We need 3 chunks of shared memory: // 1. 
BLOCK_SIZE * ROWS_PER_THREAD elements of size_type for copying to output @@ -347,7 +349,7 @@ void copy_block_partitions_impl(InputIter const input, int const smem = OPTIMIZED_BLOCK_SIZE * OPTIMIZED_ROWS_PER_THREAD * sizeof(*output) + (num_partitions + 1) * sizeof(size_type) * 2; - copy_block_partitions<<>>( + copy_block_partitions<<>>( input, output, num_rows, @@ -365,7 +367,7 @@ rmm::device_vector compute_gather_map(size_type num_rows, size_type const* block_partition_sizes, size_type const* scanned_block_partition_sizes, size_type grid_size, - cudaStream_t stream) + rmm::cuda_stream_view stream) { auto sequence = thrust::make_counting_iterator(0); rmm::device_vector gather_map(num_rows); @@ -393,8 +395,8 @@ struct copy_block_partitions_dispatcher { size_type const* block_partition_sizes, size_type const* scanned_block_partition_sizes, size_type grid_size, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { rmm::device_buffer output(input.size() * sizeof(DataType), stream, mr); @@ -420,8 +422,8 @@ struct copy_block_partitions_dispatcher { size_type const* block_partition_sizes, size_type const* scanned_block_partition_sizes, size_type grid_size, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Use move_to_output_buffer to create an equivalent gather map auto gather_map = compute_gather_map(input.size(), @@ -451,8 +453,8 @@ std::pair, std::vector> hash_partition_table( table_view const& input, table_view const& table_to_hash, size_type num_partitions, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto const num_rows = table_to_hash.num_rows(); @@ -500,14 +502,14 @@ std::pair, std::vector> hash_partition_table( compute_row_partition_numbers<<>>(hasher, - num_rows, - num_partitions, - 
partitioner_type(num_partitions), - row_partition_numbers.data().get(), - row_partition_offset.data().get(), - block_partition_sizes.data().get(), - global_partition_sizes.data().get()); + stream.value()>>>(hasher, + num_rows, + num_partitions, + partitioner_type(num_partitions), + row_partition_numbers.data().get(), + row_partition_offset.data().get(), + block_partition_sizes.data().get(), + global_partition_sizes.data().get()); } else { // Determines how the mapping between hash value and partition number is // computed @@ -520,19 +522,19 @@ std::pair, std::vector> hash_partition_table( compute_row_partition_numbers<<>>(hasher, - num_rows, - num_partitions, - partitioner_type(num_partitions), - row_partition_numbers.data().get(), - row_partition_offset.data().get(), - block_partition_sizes.data().get(), - global_partition_sizes.data().get()); + stream.value()>>>(hasher, + num_rows, + num_partitions, + partitioner_type(num_partitions), + row_partition_numbers.data().get(), + row_partition_offset.data().get(), + block_partition_sizes.data().get(), + global_partition_sizes.data().get()); } // Compute exclusive scan of all blocks' partition sizes in-place to determine // the starting point for each blocks portion of each partition in the output - thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream.value()), block_partition_sizes.begin(), block_partition_sizes.end(), scanned_block_partition_sizes.data().get()); @@ -541,7 +543,7 @@ std::pair, std::vector> hash_partition_table( // location of each partition in final output. 
// TODO This can be done independently on a separate stream size_type* scanned_global_partition_sizes{global_partition_sizes.data().get()}; - thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream.value()), global_partition_sizes.begin(), global_partition_sizes.end(), scanned_global_partition_sizes); @@ -553,7 +555,7 @@ std::pair, std::vector> hash_partition_table( scanned_global_partition_sizes, num_partitions * sizeof(size_type), cudaMemcpyDeviceToHost, - stream)); + stream.value())); // When the number of partitions is less than a threshold, we can apply an // optimization using shared memory to copy values to the output buffer. @@ -579,8 +581,8 @@ std::pair, std::vector> hash_partition_table( block_partition_sizes_ptr, scanned_block_partition_sizes_ptr, grid_size, - mr, - stream); + stream, + mr); }); if (has_nulls(input)) { @@ -609,7 +611,7 @@ std::pair, std::vector> hash_partition_table( compute_row_output_locations<<>>( + stream.value()>>>( row_output_locations, num_rows, num_partitions, scanned_block_partition_sizes_ptr); // Use the resulting scatter map to materialize the output @@ -646,8 +648,8 @@ struct dispatch_map_type { operator()(table_view const& t, column_view const& partition_map, size_type num_partitions, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { // Build a histogram of the number of rows in each partition rmm::device_vector histogram(num_partitions + 1); @@ -663,7 +665,7 @@ struct dispatch_map_type { lower_level, upper_level, partition_map.size(), - stream); + stream.value()); rmm::device_buffer temp_storage(temp_storage_bytes, stream); @@ -675,12 +677,14 @@ struct dispatch_map_type { lower_level, upper_level, partition_map.size(), - stream); + stream.value()); // `histogram` was created with an extra entry at the end such that an // exclusive scan will put the total number of 
rows at the end - thrust::exclusive_scan( - rmm::exec_policy()->on(stream), histogram.begin(), histogram.end(), histogram.begin()); + thrust::exclusive_scan(rmm::exec_policy()->on(stream.value()), + histogram.begin(), + histogram.end(), + histogram.begin()); // Copy offsets to host std::vector partition_offsets(histogram.size()); @@ -692,7 +696,7 @@ struct dispatch_map_type { // For each `partition_map[i]`, atomically increment the corresponding // partition offset to determine `i`s location in the output - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), partition_map.begin(), partition_map.end(), scatter_map.begin(), @@ -713,8 +717,8 @@ struct dispatch_map_type { operator()(table_view const& t, column_view const& partition_map, size_type num_partitions, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FAIL("Unexpected, non-integral partition map."); } @@ -723,12 +727,13 @@ struct dispatch_map_type { namespace detail { namespace local { + std::pair, std::vector> hash_partition( table_view const& input, std::vector const& columns_to_hash, int num_partitions, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto table_to_hash = input.select(columns_to_hash); @@ -738,9 +743,9 @@ std::pair, std::vector> hash_partition( } if (has_nulls(table_to_hash)) { - return hash_partition_table(input, table_to_hash, num_partitions, mr, stream); + return hash_partition_table(input, table_to_hash, num_partitions, stream, mr); } else { - return hash_partition_table(input, table_to_hash, num_partitions, mr, stream); + return hash_partition_table(input, table_to_hash, num_partitions, stream, mr); } } } // namespace local @@ -749,8 +754,8 @@ std::pair, std::vector> partition( table_view const& t, column_view const& 
partition_map, size_type num_partitions, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(t.num_rows() == partition_map.size(), "Size mismatch between table and partition map."); @@ -761,7 +766,7 @@ std::pair, std::vector> partition( } return cudf::type_dispatcher( - partition_map.type(), dispatch_map_type{}, t, partition_map, num_partitions, mr, stream); + partition_map.type(), dispatch_map_type{}, t, partition_map, num_partitions, stream, mr); } } // namespace detail @@ -773,7 +778,8 @@ std::pair, std::vector> hash_partition( rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::local::hash_partition(input, columns_to_hash, num_partitions, mr); + return detail::local::hash_partition( + input, columns_to_hash, num_partitions, rmm::cuda_stream_default, mr); } // Partition based on an explicit partition map @@ -784,7 +790,7 @@ std::pair, std::vector> partition( rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::partition(t, partition_map, num_partitions, mr); + return detail::partition(t, partition_map, num_partitions, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/partitioning/round_robin.cu b/cpp/src/partitioning/round_robin.cu index aadcaa6d51f..6367293a9d3 100644 --- a/cpp/src/partitioning/round_robin.cu +++ b/cpp/src/partitioning/round_robin.cu @@ -14,7 +14,6 @@ * limitations under the License. 
*/ -#include #include #include #include @@ -27,6 +26,9 @@ #include #include +#include +#include + #include #include #include @@ -77,8 +79,8 @@ std::pair, std::vector> degenerate cudf::table_view const& input, cudf::size_type num_partitions, cudf::size_type start_partition, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto nrows = input.num_rows(); @@ -93,7 +95,7 @@ std::pair, std::vector> degenerate if (num_partitions == nrows) { VectorT partition_offsets(num_partitions, cudf::size_type{0}); auto exec = rmm::exec_policy(stream); - thrust::sequence(exec->on(stream), partition_offsets.begin(), partition_offsets.end()); + thrust::sequence(exec->on(stream.value()), partition_offsets.begin(), partition_offsets.end()); auto uniq_tbl = cudf::detail::gather(input, rotated_iter_begin, @@ -109,9 +111,9 @@ std::pair, std::vector> degenerate partition_offsets.data().get(), sizeof(cudf::size_type) * num_partitions, cudaMemcpyDeviceToHost, - stream)); + stream.value())); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); return ret_pair; } else { //( num_partitions > nrows ) @@ -122,7 +124,7 @@ std::pair, std::vector> degenerate //(this relies on a _stable_ copy_if()) // auto exec = rmm::exec_policy(stream); - thrust::copy_if(exec->on(stream), + thrust::copy_if(exec->on(stream.value()), rotated_iter_begin, rotated_iter_begin + num_partitions, d_row_indices.begin(), @@ -151,7 +153,7 @@ std::pair, std::vector> degenerate // offsets (part 2: compute partition offsets): // VectorT partition_offsets(num_partitions, cudf::size_type{0}); - thrust::exclusive_scan(exec->on(stream), + thrust::exclusive_scan(exec->on(stream.value()), nedges_iter_begin, nedges_iter_begin + num_partitions, partition_offsets.begin()); @@ -160,9 +162,9 @@ std::pair, std::vector> degenerate partition_offsets.data().get(), sizeof(cudf::size_type) * num_partitions, cudaMemcpyDeviceToHost, - stream)); + 
stream.value())); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); return ret_pair; } @@ -175,8 +177,8 @@ std::pair, std::vector> round_robin_part table_view const& input, cudf::size_type num_partitions, cudf::size_type start_partition = 0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto nrows = input.num_rows(); @@ -191,7 +193,7 @@ std::pair, std::vector> round_robin_part // handle degenerate case: // if (num_partitions >= nrows) { - return degenerate_partitions(input, num_partitions, start_partition, mr, stream); + return degenerate_partitions(input, num_partitions, start_partition, stream, mr); } auto np_max_size = nrows % num_partitions; //# partitions of max size @@ -288,7 +290,8 @@ std::pair, std::vector> round_robi rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_FUNC_RANGE(); - return cudf::detail::round_robin_partition(input, num_partitions, start_partition, mr); + return cudf::detail::round_robin_partition( + input, num_partitions, start_partition, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu index bfb592595c3..ab0e45f648a 100644 --- a/cpp/src/reductions/minmax.cu +++ b/cpp/src/reductions/minmax.cu @@ -24,8 +24,11 @@ #include #include +#include + #include #include + #include namespace cudf { @@ -183,7 +186,7 @@ struct minmax_functor { std::enable_if_t() and !std::is_same::value and !cudf::is_dictionary()> * = nullptr> std::pair, std::unique_ptr> operator()( - cudf::column_view const &col, rmm::mr::device_memory_resource *mr, rmm::cuda_stream_view stream) + cudf::column_view const &col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { // compute minimum and maximum values auto dev_result = reduce(col, 
stream); @@ -202,7 +205,7 @@ struct minmax_functor { */ template ::value> * = nullptr> std::pair, std::unique_ptr> operator()( - cudf::column_view const &col, rmm::mr::device_memory_resource *mr, rmm::cuda_stream_view stream) + cudf::column_view const &col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { // compute minimum and maximum values auto dev_result = reduce(col, stream); @@ -212,8 +215,8 @@ struct minmax_functor { CUDA_TRY(cudaMemcpyAsync( &host_result, dev_result.data(), sizeof(OutputType), cudaMemcpyDeviceToHost, stream.value())); // strings are copied to create the scalars here - return {std::make_unique(host_result.min_val, true, stream.value(), mr), - std::make_unique(host_result.max_val, true, stream.value(), mr)}; + return {std::make_unique(host_result.min_val, true, stream, mr), + std::make_unique(host_result.max_val, true, stream, mr)}; } /** @@ -221,7 +224,7 @@ struct minmax_functor { */ template ()> * = nullptr> std::pair, std::unique_ptr> operator()( - cudf::column_view const &col, rmm::mr::device_memory_resource *mr, rmm::cuda_stream_view stream) + cudf::column_view const &col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { // compute minimum and maximum values auto dev_result = reduce(col, stream); @@ -232,13 +235,13 @@ struct minmax_functor { &host_result, dev_result.data(), sizeof(OutputType), cudaMemcpyDeviceToHost, stream.value())); // get the keys for those indexes auto const keys = dictionary_column_view(col).keys(); - return {get_element(keys, static_cast(host_result.min_val), stream.value(), mr), - get_element(keys, static_cast(host_result.max_val), stream.value(), mr)}; + return {get_element(keys, static_cast(host_result.min_val), stream, mr), + get_element(keys, static_cast(host_result.max_val), stream, mr)}; } template ()> * = nullptr> std::pair, std::unique_ptr> operator()( - cudf::column_view const &, rmm::mr::device_memory_resource *, rmm::cuda_stream_view) + cudf::column_view const &, 
rmm::cuda_stream_view, rmm::mr::device_memory_resource *) { CUDF_FAIL("type not supported for minmax() operation"); } @@ -256,7 +259,7 @@ std::pair, std::unique_ptr> minmax( make_default_constructed_scalar(col.type())}; } - return type_dispatcher(col.type(), minmax_functor{}, col, mr, stream); + return type_dispatcher(col.type(), minmax_functor{}, col, stream, mr); } } // namespace detail @@ -264,9 +267,9 @@ std::pair, std::unique_ptr> minmax( * @copydoc cudf::minmax */ std::pair, std::unique_ptr> minmax( - const cudf::column_view &col, rmm::mr::device_memory_resource *mr) + const column_view &col, rmm::mr::device_memory_resource *mr) { - return cudf::detail::minmax(col, rmm::cuda_stream_default, mr); + return detail::minmax(col, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/reductions/scan.cu b/cpp/src/reductions/scan.cu index 6d90124db36..2c0b8b8d71d 100644 --- a/cpp/src/reductions/scan.cu +++ b/cpp/src/reductions/scan.cu @@ -57,23 +57,22 @@ struct ScanDispatcher { template ::value, T>* = nullptr> auto exclusive_scan(const column_view& input_view, null_policy null_handling, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { const size_type size = input_view.size(); auto output_column = detail::allocate_like(input_view, size, mask_allocation_policy::NEVER, stream, mr); if (null_handling == null_policy::EXCLUDE) { - output_column->set_null_mask( - detail::copy_bitmask(input_view, rmm::cuda_stream_view{stream}, mr), - input_view.null_count()); + output_column->set_null_mask(detail::copy_bitmask(input_view, stream, mr), + input_view.null_count()); } mutable_column_view output = output_column->mutable_view(); auto d_input = column_device_view::create(input_view, stream); if (input_view.has_nulls()) { auto input = make_null_replacement_iterator(*d_input, Op::template identity()); - thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream), + 
thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream.value()), input, input + size, output.data(), @@ -81,7 +80,7 @@ struct ScanDispatcher { Op{}); } else { auto input = d_input->begin(); - thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream.value()), input, input + size, output.data(), @@ -89,7 +88,7 @@ struct ScanDispatcher { Op{}); } - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); return output_column; } @@ -97,24 +96,25 @@ struct ScanDispatcher { template (), T>* = nullptr> std::unique_ptr exclusive_scan(const column_view& input_view, null_policy null_handling, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("String types supports only inclusive min/max for `cudf::scan`"); } rmm::device_buffer mask_inclusive_scan(const column_view& input_view, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { rmm::device_buffer mask = detail::create_null_mask(input_view.size(), mask_state::UNINITIALIZED, stream, mr); - auto d_input = column_device_view::create(input_view, stream); - auto v = detail::make_validity_iterator(*d_input); - auto first_null_position = - thrust::find_if_not( - rmm::exec_policy(stream)->on(stream), v, v + input_view.size(), thrust::identity{}) - - v; + auto d_input = column_device_view::create(input_view, stream); + auto v = detail::make_validity_iterator(*d_input); + auto first_null_position = thrust::find_if_not(rmm::exec_policy(stream)->on(stream.value()), + v, + v + input_view.size(), + thrust::identity{}) - + v; cudf::set_null_mask( static_cast(mask.data()), 0, first_null_position, true); cudf::set_null_mask( @@ -126,19 +126,18 @@ struct ScanDispatcher { template ::value, T>* = nullptr> auto inclusive_scan(const column_view& input_view, null_policy null_handling, - 
rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { const size_type size = input_view.size(); auto output_column = detail::allocate_like(input_view, size, mask_allocation_policy::NEVER, stream, mr); if (null_handling == null_policy::EXCLUDE) { - output_column->set_null_mask( - detail::copy_bitmask(input_view, rmm::cuda_stream_view{stream}, mr), - input_view.null_count()); + output_column->set_null_mask(detail::copy_bitmask(input_view, stream, mr), + input_view.null_count()); } else { if (input_view.nullable()) { - output_column->set_null_mask(mask_inclusive_scan(input_view, mr, stream), + output_column->set_null_mask(mask_inclusive_scan(input_view, stream, mr), cudf::UNKNOWN_NULL_COUNT); } } @@ -149,14 +148,14 @@ struct ScanDispatcher { if (input_view.has_nulls()) { auto input = make_null_replacement_iterator(*d_input, Op::template identity()); thrust::inclusive_scan( - rmm::exec_policy(stream)->on(stream), input, input + size, output.data(), Op{}); + rmm::exec_policy(stream)->on(stream.value()), input, input + size, output.data(), Op{}); } else { auto input = d_input->begin(); thrust::inclusive_scan( - rmm::exec_policy(stream)->on(stream), input, input + size, output.data(), Op{}); + rmm::exec_policy(stream)->on(stream.value()), input, input + size, output.data(), Op{}); } - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); return output_column; } @@ -164,8 +163,8 @@ struct ScanDispatcher { template (), T>* = nullptr> std::unique_ptr inclusive_scan(const column_view& input_view, null_policy null_handling, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { const size_type size = input_view.size(); rmm::device_vector result(size); @@ -174,23 +173,28 @@ struct ScanDispatcher { if (input_view.has_nulls()) { auto input = make_null_replacement_iterator(*d_input, Op::template identity()); - 
thrust::inclusive_scan( - rmm::exec_policy(stream)->on(stream), input, input + size, result.data().get(), Op{}); + thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream.value()), + input, + input + size, + result.data().get(), + Op{}); } else { auto input = d_input->begin(); - thrust::inclusive_scan( - rmm::exec_policy(stream)->on(stream), input, input + size, result.data().get(), Op{}); + thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream.value()), + input, + input + size, + result.data().get(), + Op{}); } - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); auto output_column = make_strings_column(result, Op::template identity(), stream, mr); if (null_handling == null_policy::EXCLUDE) { - output_column->set_null_mask( - detail::copy_bitmask(input_view, rmm::cuda_stream_view{stream}, mr), - input_view.null_count()); + output_column->set_null_mask(detail::copy_bitmask(input_view, stream, mr), + input_view.null_count()); } else { if (input_view.nullable()) { - output_column->set_null_mask(mask_inclusive_scan(input_view, mr, stream), + output_column->set_null_mask(mask_inclusive_scan(input_view, stream, mr), cudf::UNKNOWN_NULL_COUNT); } } @@ -203,8 +207,8 @@ struct ScanDispatcher { * * @param input input column view * @param inclusive inclusive or exclusive scan - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory * @return * * @tparam T type of input column @@ -213,14 +217,14 @@ struct ScanDispatcher { std::unique_ptr operator()(const column_view& input, scan_type inclusive, null_policy null_handling, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { std::unique_ptr output; if (inclusive == scan_type::INCLUSIVE) - output = inclusive_scan(input, null_handling, mr, stream); + output = inclusive_scan(input, null_handling, stream, mr); else - output = exclusive_scan(input, null_handling, mr, stream); + output = exclusive_scan(input, null_handling, stream, mr); if (null_handling == null_policy::EXCLUDE) { CUDF_EXPECTS(input.null_count() == output->null_count(), "Input / output column null count mismatch"); @@ -232,8 +236,8 @@ struct ScanDispatcher { std::unique_ptr operator()(const column_view& input, scan_type inclusive, null_policy null_handling, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("Non-arithmetic types not supported for `cudf::scan`"); } @@ -244,8 +248,8 @@ std::unique_ptr scan( std::unique_ptr const& agg, scan_type inclusive, null_policy null_handling, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(is_numeric(input.type()) || is_compound(input.type()), "Unexpected non-numeric or non-string type."); @@ -257,32 +261,32 @@ std::unique_ptr scan( input, inclusive, null_handling, - mr, - stream); + stream, + mr); case aggregation::MIN: return cudf::type_dispatcher(input.type(), ScanDispatcher(), input, inclusive, null_handling, - mr, - stream); + stream, + mr); case aggregation::MAX: return 
cudf::type_dispatcher(input.type(), ScanDispatcher(), input, inclusive, null_handling, - mr, - stream); + stream, + mr); case aggregation::PRODUCT: return cudf::type_dispatcher(input.type(), ScanDispatcher(), input, inclusive, null_handling, - mr, - stream); + stream, + mr); default: CUDF_FAIL("Unsupported aggregation operator for scan"); } } @@ -295,7 +299,7 @@ std::unique_ptr scan(const column_view& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::scan(input, agg, inclusive, null_handling, mr); + return detail::scan(input, agg, inclusive, null_handling, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu index 4f084aacb54..ab2bb4ea839 100644 --- a/cpp/src/replace/clamp.cu +++ b/cpp/src/replace/clamp.cu @@ -32,6 +32,8 @@ #include #include +#include + namespace cudf { namespace detail { namespace { @@ -40,8 +42,8 @@ std::pair, std::unique_ptr> form_offsets_and_cha cudf::column_device_view input, size_type null_count, Transformer offsets_transformer, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { std::unique_ptr offsets_column{}; auto strings_count = input.size(); @@ -52,19 +54,19 @@ std::pair, std::unique_ptr> form_offsets_and_cha auto offsets_transformer_itr = thrust::make_transform_iterator(input_begin, offsets_transformer); offsets_column = cudf::strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); } else { auto offsets_transformer_itr = thrust::make_transform_iterator(input.begin(), offsets_transformer); offsets_column = cudf::strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, 
mr); } auto d_offsets = offsets_column->view().template data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = - cudf::strings::detail::create_chars_child_column(strings_count, null_count, bytes, mr, stream); + cudf::strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); return std::make_pair(std::move(offsets_column), std::move(chars_column)); } @@ -75,8 +77,8 @@ std::unique_ptr clamp_string_column(strings_column_view const& inp ScalarIterator const& lo_replace_itr, ScalarIterator const& hi_itr, ScalarIterator const& hi_replace_itr, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto input_device_column = column_device_view::create(input.parent(), stream); auto d_input = *input_device_column; @@ -106,7 +108,7 @@ std::unique_ptr clamp_string_column(strings_column_view const& inp }; auto offset_and_char = - form_offsets_and_char_column(d_input, null_count, offsets_transformer, mr, stream); + form_offsets_and_char_column(d_input, null_count, offsets_transformer, stream, mr); auto offsets_column(std::move(offset_and_char.first)); auto chars_column(std::move(offset_and_char.second)); @@ -135,8 +137,10 @@ std::unique_ptr clamp_string_column(strings_column_view const& inp }; auto exec = rmm::exec_policy(stream); - thrust::for_each_n( - exec->on(stream), thrust::make_counting_iterator(0), input.size(), copy_transformer); + thrust::for_each_n(exec->on(stream.value()), + thrust::make_counting_iterator(0), + input.size(), + copy_transformer); return make_strings_column(input.size(), std::move(offsets_column), @@ -154,8 +158,8 @@ std::enable_if_t(), std::unique_ptr> clamp ScalarIterator const& lo_replace_itr, ScalarIterator const& hi_itr, ScalarIterator const& hi_replace_itr, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + 
rmm::mr::device_memory_resource* mr) { auto output = detail::allocate_like(input, input.size(), mask_allocation_policy::NEVER, stream, mr); @@ -185,7 +189,7 @@ std::enable_if_t(), std::unique_ptr> clamp if (input.has_nulls()) { auto input_pair_iterator = make_pair_iterator(*input_device_view); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), input_pair_iterator, input_pair_iterator + input.size(), scalar_zip_itr, @@ -193,7 +197,7 @@ std::enable_if_t(), std::unique_ptr> clamp trans); } else { auto input_pair_iterator = make_pair_iterator(*input_device_view); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), input_pair_iterator, input_pair_iterator + input.size(), scalar_zip_itr, @@ -211,10 +215,10 @@ std::enable_if_t::value, std::unique_ptr clamp( ScalarIterator const& lo_replace_itr, ScalarIterator const& hi_itr, ScalarIterator const& hi_replace_itr, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - return clamper(input, lo_itr, lo_replace_itr, hi_itr, hi_replace_itr, mr, stream); + return clamper(input, lo_itr, lo_replace_itr, hi_itr, hi_replace_itr, stream, mr); } struct dispatch_clamp { @@ -240,8 +244,8 @@ struct dispatch_clamp { scalar const& lo_replace, scalar const& hi, scalar const& hi_replace, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(lo.type() == input.type(), "mismatching types of scalar and input"); @@ -252,7 +256,7 @@ struct dispatch_clamp { auto lo_replace_itr = make_pair_iterator(lo_replace); auto hi_replace_itr = make_pair_iterator(hi_replace); - 
return clamp(input, lo_itr, lo_replace_itr, hi_itr, hi_replace_itr, mr, stream); + return clamp(input, lo_itr, lo_replace_itr, hi_itr, hi_replace_itr, stream, mr); } }; @@ -263,8 +267,8 @@ std::unique_ptr dispatch_clamp::operator()( scalar const& lo_replace, scalar const& hi, scalar const& hi_replace, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("clamp for list_view not supported"); } @@ -275,8 +279,8 @@ std::unique_ptr dispatch_clamp::operator()(column_view cons scalar const& lo_replace, scalar const& hi, scalar const& hi_replace, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("clamp for struct_view not supported"); } @@ -288,8 +292,8 @@ std::unique_ptr dispatch_clamp::operator()( scalar const& lo_replace, scalar const& hi, scalar const& hi_replace, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // add lo_replace and hi_replace to keys auto matched_column = [&] { @@ -298,7 +302,7 @@ std::unique_ptr dispatch_clamp::operator()( auto add_scalar_key = [&](scalar const& key, scalar const& key_replace) { if (key.is_valid()) { result = dictionary::detail::add_keys( - matched_view, make_column_from_scalar(key_replace, 1, stream)->view(), mr, stream); + matched_view, make_column_from_scalar(key_replace, 1, stream)->view(), stream, mr); matched_view = dictionary_column_view(result->view()); } }; @@ -325,8 +329,8 @@ std::unique_ptr dispatch_clamp::operator()( *lo_replace_index, *hi_index, *hi_replace_index, - mr, - stream); + stream, + mr); auto const indices_type = new_indices->type(); auto const output_size = new_indices->size(); @@ -364,8 +368,8 @@ std::unique_ptr clamp( scalar const& lo_replace, scalar const& hi, scalar const& hi_replace, - rmm::mr::device_memory_resource* mr = 
rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(lo.type() == hi.type(), "mismatching types of limit scalars"); CUDF_EXPECTS(lo_replace.type() == hi_replace.type(), "mismatching types of replace scalars"); @@ -384,7 +388,7 @@ std::unique_ptr clamp( } return cudf::type_dispatcher( - input.type(), dispatch_clamp{}, input, lo, lo_replace, hi, hi_replace, mr, stream); + input.type(), dispatch_clamp{}, input, lo, lo_replace, hi, hi_replace, stream, mr); } } // namespace detail @@ -398,7 +402,7 @@ std::unique_ptr clamp(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::clamp(input, lo, lo_replace, hi, hi_replace, mr); + return detail::clamp(input, lo, lo_replace, hi, hi_replace, rmm::cuda_stream_default, mr); } // clamp input at lo and hi @@ -408,6 +412,6 @@ std::unique_ptr clamp(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::clamp(input, lo, lo, hi, hi, mr); + return detail::clamp(input, lo, lo, hi, hi, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 2c7542a2f5d..39fa62c99b0 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -236,7 +236,7 @@ std::unique_ptr replace_nulls_column_kernel_forwarder::operator()< valid_count); std::unique_ptr offsets = cudf::strings::detail::make_offsets_child_column( - sizes_view.begin(), sizes_view.end(), mr, stream.value()); + sizes_view.begin(), sizes_view.end(), stream, mr); auto offsets_view = offsets->mutable_view(); int32_t size; @@ -244,9 +244,9 @@ std::unique_ptr replace_nulls_column_kernel_forwarder::operator()< &size, offsets_view.end() - 1, sizeof(int32_t), cudaMemcpyDefault, stream.value())); // Allocate chars array and output null mask - cudf::size_type null_count = input.size() - 
valid_counter.value(stream); - std::unique_ptr output_chars = cudf::strings::detail::create_chars_child_column( - input.size(), null_count, size, mr, stream.value()); + cudf::size_type null_count = input.size() - valid_counter.value(stream); + std::unique_ptr output_chars = + cudf::strings::detail::create_chars_child_column(input.size(), null_count, size, stream, mr); auto output_chars_view = output_chars->mutable_view(); diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index a6b129630a8..6abacc6095e 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -397,8 +397,8 @@ std::unique_ptr replace_kernel_forwarder::operator() replace_kernel_forwarder::operator() offsets = cudf::strings::detail::make_offsets_child_column( - sizes_view.begin(), sizes_view.end(), mr, stream.value()); + sizes_view.begin(), sizes_view.end(), stream, mr); auto offsets_view = offsets->mutable_view(); auto device_offsets = cudf::mutable_column_device_view::create(offsets_view); int32_t size; @@ -423,7 +423,7 @@ std::unique_ptr replace_kernel_forwarder::operator() output_chars = cudf::strings::detail::create_chars_child_column( - input_col.size(), null_count, size, mr, stream.value()); + input_col.size(), null_count, size, stream, mr); auto output_chars_view = output_chars->mutable_view(); auto device_chars = cudf::mutable_column_device_view::create(output_chars_view); @@ -454,13 +454,12 @@ std::unique_ptr replace_kernel_forwarder::operator()view(), mr, stream.value()); + return cudf::dictionary::detail::add_keys(input, new_keys->view(), stream, mr); }(); auto matched_view = cudf::dictionary_column_view(matched_input->view()); - auto matched_values = cudf::dictionary::detail::set_keys( - values, matched_view.keys(), rmm::mr::get_current_device_resource(), stream.value()); - auto matched_replacements = cudf::dictionary::detail::set_keys( - replacements, matched_view.keys(), rmm::mr::get_current_device_resource(), stream.value()); + auto matched_values = 
cudf::dictionary::detail::set_keys(values, matched_view.keys(), stream); + auto matched_replacements = + cudf::dictionary::detail::set_keys(replacements, matched_view.keys(), stream); auto indices_type = matched_view.indices().type(); auto new_indices = cudf::type_dispatcher( diff --git a/cpp/src/reshape/byte_cast.cu b/cpp/src/reshape/byte_cast.cu index 0f5c7595cd0..8683754422b 100644 --- a/cpp/src/reshape/byte_cast.cu +++ b/cpp/src/reshape/byte_cast.cu @@ -35,8 +35,8 @@ struct byte_list_conversion { std::enable_if_t::value and !is_floating_point(), std::unique_ptr> operator()(column_view const& input_column, flip_endianness configuration, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FAIL("Unsupported non-numeric and non-string column"); } @@ -45,8 +45,8 @@ struct byte_list_conversion { std::enable_if_t() or std::is_integral::value, std::unique_ptr> operator()(column_view const& input_column, flip_endianness configuration, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { size_type num_bytes = input_column.size() * sizeof(T); auto byte_column = make_numeric_column( @@ -57,22 +57,21 @@ struct byte_list_conversion { size_type mask = sizeof(T) - 1; if (configuration == flip_endianness::YES) { - thrust::for_each(rmm::exec_policy(stream)->on(stream), + thrust::for_each(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_bytes), [d_chars, d_data, mask] __device__(auto index) { d_chars[index] = d_data[index + mask - ((index & mask) << 1)]; }); } else { - thrust::copy_n(rmm::exec_policy(stream)->on(stream), d_data, num_bytes, d_chars); + thrust::copy_n(rmm::exec_policy(stream)->on(stream.value()), d_data, num_bytes, d_chars); } auto begin = thrust::make_constant_iterator(cudf::size_of(input_column.type())); 
auto offsets_column = cudf::strings::detail::make_offsets_child_column( - begin, begin + input_column.size(), mr, stream); + begin, begin + input_column.size(), stream, mr); - rmm::device_buffer null_mask = - detail::copy_bitmask(input_column, rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = detail::copy_bitmask(input_column, stream, mr); return make_lists_column(input_column.size(), std::move(offsets_column), @@ -88,8 +87,8 @@ template <> std::unique_ptr byte_list_conversion::operator()( column_view const& input_column, flip_endianness configuration, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { strings_column_view input_strings(input_column); auto strings_count = input_strings.size(); @@ -101,7 +100,7 @@ std::unique_ptr byte_list_conversion::operator()( std::move(contents.children[cudf::strings_column_view::offsets_column_index]), std::move(contents.children[cudf::strings_column_view::chars_column_index]), input_column.null_count(), - detail::copy_bitmask(input_column, rmm::cuda_stream_view{stream}, mr), + detail::copy_bitmask(input_column, stream, mr), stream, mr); } @@ -114,11 +113,11 @@ std::unique_ptr byte_list_conversion::operator()( */ std::unique_ptr byte_cast(column_view const& input_column, flip_endianness endian_configuration, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return type_dispatcher( - input_column.type(), byte_list_conversion{}, input_column, endian_configuration, mr, stream); + input_column.type(), byte_list_conversion{}, input_column, endian_configuration, stream, mr); } } // namespace detail @@ -131,7 +130,7 @@ std::unique_ptr byte_cast(column_view const& input_column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::byte_cast(input_column, endian_configuration, mr, cudaStreamDefault); + return 
detail::byte_cast(input_column, endian_configuration, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu index 9e6197afe0f..7173c96daed 100644 --- a/cpp/src/reshape/interleave_columns.cu +++ b/cpp/src/reshape/interleave_columns.cu @@ -14,13 +14,15 @@ * limitations under the License. */ +#include + #include #include #include #include #include -#include +#include namespace cudf { namespace detail { @@ -38,8 +40,8 @@ struct interleave_columns_functor { std::enable_if_t::value, std::unique_ptr> operator()(table_view const& strings_columns, bool create_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto num_columns = strings_columns.num_columns(); if (num_columns == 1) // Single strings column returns a copy @@ -47,7 +49,7 @@ struct interleave_columns_functor { auto strings_count = strings_columns.num_rows(); if (strings_count == 0) // All columns have 0 rows - return strings::detail::make_empty_strings_column(mr, stream); + return strings::detail::make_empty_strings_column(stream, mr); // Create device views from the strings columns. 
auto table = table_device_view::create(strings_columns, stream); @@ -83,17 +85,17 @@ struct interleave_columns_functor { auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), offsets_transformer); auto offsets_column = strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + num_strings, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + num_strings, stream, mr); auto d_results_offsets = offsets_column->view().template data(); // Create the chars column size_type bytes = thrust::device_pointer_cast(d_results_offsets)[num_strings]; auto chars_column = - strings::detail::create_chars_child_column(num_strings, null_count, bytes, mr, stream); + strings::detail::create_chars_child_column(num_strings, null_count, bytes, stream, mr); // Fill the chars column auto d_results_chars = chars_column->mutable_view().data(); thrust::for_each_n( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), num_strings, [num_columns, d_table, d_results_offsets, d_results_chars] __device__(size_type idx) { @@ -122,8 +124,8 @@ struct interleave_columns_functor { std::enable_if_t(), std::unique_ptr> operator()( table_view const& input, bool create_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto arch_column = input.column(0); auto output_size = input.num_columns() * input.num_rows(); @@ -142,7 +144,7 @@ struct interleave_columns_functor { }; if (not create_mask) { - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), index_begin, index_end, device_output->begin(), @@ -156,7 +158,7 @@ struct interleave_columns_functor { return input.column(idx % divisor).is_valid(idx / divisor); }; - thrust::transform_if(rmm::exec_policy(stream)->on(stream), + 
thrust::transform_if(rmm::exec_policy(stream)->on(stream.value()), index_begin, index_end, device_output->begin(), @@ -193,7 +195,12 @@ std::unique_ptr interleave_columns(table_view const& input, auto const output_needs_mask = std::any_of( std::cbegin(input), std::cend(input), [](auto const& col) { return col.nullable(); }); - return type_dispatcher(dtype, detail::interleave_columns_functor{}, input, output_needs_mask, mr); + return type_dispatcher(dtype, + detail::interleave_columns_functor{}, + input, + output_needs_mask, + rmm::cuda_stream_default, + mr); } } // namespace cudf diff --git a/cpp/src/rolling/rolling.cu b/cpp/src/rolling/rolling.cu index a31eabe3964..7d906102cc2 100644 --- a/cpp/src/rolling/rolling.cu +++ b/cpp/src/rolling/rolling.cu @@ -14,45 +14,47 @@ * limitations under the License. */ +#include +#include +#include + #include #include #include #include #include #include +#include #include #include #include #include #include +#include #include #include #include -#include -#include +#include +#include #include #include #include -#include - #include #include #include -#include +#include +#include #include -#include +#include #include #include #include #include #include -#include -#include -#include -#include + #include namespace cudf { @@ -499,7 +501,7 @@ struct rolling_window_launcher { FollowingWindowIterator following_window_begin, size_type min_periods, std::unique_ptr const& agg, - cudaStream_t stream) + rmm::cuda_stream_view stream) { constexpr cudf::size_type block_size = 256; cudf::detail::grid_1d grid(input.size(), block_size); @@ -512,28 +514,28 @@ struct rolling_window_launcher { if (input.has_nulls()) { gpu_rolling, agg_op, op, block_size, true> - <<>>(*input_device_view, - *default_outputs_device_view, - *output_device_view, - device_valid_count.data(), - preceding_window_begin, - following_window_begin, - min_periods); + <<>>(*input_device_view, + *default_outputs_device_view, + *output_device_view, + device_valid_count.data(), + 
preceding_window_begin, + following_window_begin, + min_periods); } else { gpu_rolling, agg_op, op, block_size, false> - <<>>(*input_device_view, - *default_outputs_device_view, - *output_device_view, - device_valid_count.data(), - preceding_window_begin, - following_window_begin, - min_periods); + <<>>(*input_device_view, + *default_outputs_device_view, + *output_device_view, + device_valid_count.data(), + preceding_window_begin, + following_window_begin, + min_periods); } size_type valid_count = device_valid_count.value(stream); // check the stream for debugging - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); return valid_count; } @@ -551,7 +553,7 @@ struct rolling_window_launcher { size_type min_periods, std::unique_ptr const& agg, agg_op const& device_agg_op, - cudaStream_t stream) + rmm::cuda_stream_view stream) { constexpr cudf::size_type block_size = 256; cudf::detail::grid_1d grid(input.size(), block_size); @@ -564,30 +566,30 @@ struct rolling_window_launcher { if (input.has_nulls()) { gpu_rolling, agg_op, op, block_size, true> - <<>>(*input_device_view, - *default_outputs_device_view, - *output_device_view, - device_valid_count.data(), - preceding_window_begin, - following_window_begin, - min_periods, - device_agg_op); + <<>>(*input_device_view, + *default_outputs_device_view, + *output_device_view, + device_valid_count.data(), + preceding_window_begin, + following_window_begin, + min_periods, + device_agg_op); } else { gpu_rolling, agg_op, op, block_size, false> - <<>>(*input_device_view, - *default_outputs_device_view, - *output_device_view, - device_valid_count.data(), - preceding_window_begin, - following_window_begin, - min_periods, - device_agg_op); + <<>>(*input_device_view, + *default_outputs_device_view, + *output_device_view, + device_valid_count.data(), + preceding_window_begin, + following_window_begin, + min_periods, + device_agg_op); } size_type valid_count = device_valid_count.value(stream); // check the stream for debugging - 
CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); return valid_count; } @@ -610,8 +612,8 @@ struct rolling_window_launcher { FollowingWindowIterator following_window_begin, size_type min_periods, std::unique_ptr const& agg, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.is_empty()) return empty_like(input); @@ -650,8 +652,8 @@ struct rolling_window_launcher { FollowingWindowIterator following_window_begin, size_type min_periods, std::unique_ptr const& agg, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.is_empty()) return empty_like(input); @@ -721,8 +723,8 @@ struct rolling_window_launcher { FollowingWindowIterator following_window_begin, size_type min_periods, std::unique_ptr const& agg, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("Aggregation operator and/or input type combination is invalid"); } @@ -742,8 +744,8 @@ struct rolling_window_launcher { size_type min_periods, std::unique_ptr const& agg, agg_op const& device_agg_op, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.is_empty()) return empty_like(input); @@ -793,8 +795,8 @@ struct rolling_window_launcher { size_type min_periods, std::unique_ptr const& agg, agg_op device_agg_op, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL( "Aggregation operator and/or input type combination is invalid: " @@ -812,8 +814,8 @@ struct rolling_window_launcher { FollowingWindowIterator following_window_begin, size_type min_periods, std::unique_ptr const& agg, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view 
stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(default_outputs.is_empty(), "Only LEAD/LAG window functions support default values."); @@ -828,8 +830,8 @@ struct rolling_window_launcher { following_window_begin, min_periods, agg, - mr, - stream); + stream, + mr); } // This variant is just to handle mean @@ -843,8 +845,8 @@ struct rolling_window_launcher { FollowingWindowIterator following_window_begin, size_type min_periods, std::unique_ptr const& agg, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return launch( input, @@ -853,8 +855,8 @@ struct rolling_window_launcher { following_window_begin, min_periods, agg, - mr, - stream); + stream, + mr); } template const& agg, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return launch(agg.get())->row_offset}, - mr, - stream); + stream, + mr); } }; @@ -895,8 +897,8 @@ struct dispatch_rolling { FollowingWindowIterator following_window_begin, size_type min_periods, std::unique_ptr const& agg, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return aggregation_dispatcher(agg->kind, rolling_window_launcher{}, @@ -906,8 +908,8 @@ struct dispatch_rolling { following_window_begin, min_periods, agg, - mr, - stream); + stream, + mr); } }; @@ -916,15 +918,14 @@ struct dispatch_rolling { // Applies a user-defined rolling window function to the values in a column. 
template std::unique_ptr rolling_window_udf(column_view const& input, - PrecedingWindowIterator preceding_window, std::string const& preceding_window_str, FollowingWindowIterator following_window, std::string const& following_window_str, size_type min_periods, std::unique_ptr const& agg, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { static_assert(warp_size == cudf::detail::size_in_bits(), "bitmask_type size does not match CUDA warp size"); @@ -999,7 +1000,7 @@ std::unique_ptr rolling_window_udf(column_view const& input, output->set_null_count(output->size() - device_valid_count.value(stream)); // check the stream for debugging - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); return output; } @@ -1021,8 +1022,8 @@ std::unique_ptr rolling_window(column_view const& input, FollowingWindowIterator following_window_begin, size_type min_periods, std::unique_ptr const& agg, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { static_assert(warp_size == cudf::detail::size_in_bits(), "bitmask_type size does not match CUDA warp size"); @@ -1037,8 +1038,8 @@ std::unique_ptr rolling_window(column_view const& input, following_window_begin, min_periods, agg, - mr, - stream); + stream, + mr); } } // namespace detail @@ -1080,8 +1081,8 @@ std::unique_ptr rolling_window(column_view const& input, "cudf::size_type", min_periods, agg, - mr, - 0); + rmm::cuda_stream_default, + mr); } else { auto preceding_window_begin = thrust::make_constant_iterator(preceding_window); auto following_window_begin = thrust::make_constant_iterator(following_window); @@ -1092,8 +1093,8 @@ std::unique_ptr rolling_window(column_view const& input, following_window_begin, min_periods, agg, - mr, - 0); + rmm::cuda_stream_default, + mr); } } @@ -1125,8 +1126,8 @@ std::unique_ptr rolling_window(column_view const& input, 
"cudf::size_type*", min_periods, agg, - mr, - 0); + rmm::cuda_stream_default, + mr); } else { return cudf::detail::rolling_window(input, empty_like(input)->view(), @@ -1134,8 +1135,8 @@ std::unique_ptr rolling_window(column_view const& input, following_window.begin(), min_periods, agg, - mr, - 0); + rmm::cuda_stream_default, + mr); } } @@ -1241,8 +1242,8 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys, "cudf::detail::following_window_wrapper", min_periods, aggr, - mr, - 0); + rmm::cuda_stream_default, + mr); } else { return cudf::detail::rolling_window( input, @@ -1253,8 +1254,8 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys, following_calculator), min_periods, aggr, - mr, - 0); + rmm::cuda_stream_default, + mr); } } @@ -1387,6 +1388,7 @@ std::unique_ptr time_range_window_ASC(column_view const& input, following_calculator), min_periods, aggr, + rmm::cuda_stream_default, mr); } @@ -1558,6 +1560,7 @@ std::unique_ptr time_range_window_ASC( following_calculator), min_periods, aggr, + rmm::cuda_stream_default, mr); } @@ -1642,6 +1645,7 @@ std::unique_ptr time_range_window_DESC(column_view const& input, following_calculator), min_periods, aggr, + rmm::cuda_stream_default, mr); } @@ -1747,8 +1751,8 @@ std::unique_ptr time_range_window_DESC( following_calculator), min_periods, aggr, - mr, - 0); + rmm::cuda_stream_default, + mr); } } diff --git a/cpp/src/scalar/scalar_factories.cpp b/cpp/src/scalar/scalar_factories.cpp index 7bad39af717..644b320dcd5 100644 --- a/cpp/src/scalar/scalar_factories.cpp +++ b/cpp/src/scalar/scalar_factories.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,13 +20,16 @@ #include #include +#include + namespace cudf { namespace { struct scalar_construction_helper { template , typename std::enable_if_t() and not is_fixed_point()>* = nullptr> - std::unique_ptr operator()(cudaStream_t stream, rmm::mr::device_memory_resource* mr) const + std::unique_ptr operator()(rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { using Type = device_storage_type_t; auto s = new ScalarType(Type{}, false, stream, mr); @@ -36,7 +39,8 @@ struct scalar_construction_helper { template , typename std::enable_if_t()>* = nullptr> - std::unique_ptr operator()(cudaStream_t stream, rmm::mr::device_memory_resource* mr) const + std::unique_ptr operator()(rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { using Type = device_storage_type_t; auto s = new ScalarType(Type{}, numeric::scale_type{0}, false, stream, mr); @@ -55,7 +59,7 @@ struct scalar_construction_helper { // Allocate storage for a single numeric element std::unique_ptr make_numeric_scalar(data_type type, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(is_numeric(type), "Invalid, non-numeric type."); @@ -65,7 +69,7 @@ std::unique_ptr make_numeric_scalar(data_type type, // Allocate storage for a single timestamp element std::unique_ptr make_timestamp_scalar(data_type type, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type."); @@ -75,7 +79,7 @@ std::unique_ptr make_timestamp_scalar(data_type type, // Allocate storage for a single duration element std::unique_ptr make_duration_scalar(data_type type, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type."); @@ -85,7 +89,7 @@ std::unique_ptr make_duration_scalar(data_type type, // Allocate storage for a single fixed width element 
std::unique_ptr make_fixed_width_scalar(data_type type, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(is_fixed_width(type), "Invalid, non-fixed-width type."); diff --git a/cpp/src/search/search.cu b/cpp/src/search/search.cu index 0efd68ac974..2b3e7e5a60a 100644 --- a/cpp/src/search/search.cu +++ b/cpp/src/search/search.cu @@ -99,8 +99,7 @@ std::unique_ptr search_ordered(table_view const& t, // This utility will ensure all corresponding dictionary columns have matching keys. // It will return any new dictionary columns created as well as updated table_views. - auto matched = dictionary::detail::match_dictionaries( - {t, values}, rmm::mr::get_current_device_resource(), stream.value()); + auto matched = dictionary::detail::match_dictionaries({t, values}, stream); auto d_t = table_device_view::create(matched.second.front(), stream); auto d_values = table_device_view::create(matched.second.back(), stream); auto count_it = thrust::make_counting_iterator(0); @@ -304,12 +303,10 @@ std::unique_ptr multi_contains_dispatch::operator()( dictionary_column_view const haystack(haystack_in); dictionary_column_view const needles(needles_in); // first combine keys so both dictionaries have the same set - auto haystack_matched = dictionary::detail::add_keys( - haystack, needles.keys(), rmm::mr::get_current_device_resource(), stream.value()); + auto haystack_matched = dictionary::detail::add_keys(haystack, needles.keys(), stream); auto const haystack_view = dictionary_column_view(haystack_matched->view()); - auto needles_matched = dictionary::detail::set_keys( - needles, haystack_view.keys(), rmm::mr::get_current_device_resource(), stream.value()); - auto const needles_view = dictionary_column_view(needles_matched->view()); + auto needles_matched = dictionary::detail::set_keys(needles, haystack_view.keys(), stream); + auto const needles_view = dictionary_column_view(needles_matched->view()); // now just use the indices for 
the contains column_view const haystack_indices = haystack_view.get_indices_annotated(); diff --git a/cpp/src/sort/is_sorted.cu b/cpp/src/sort/is_sorted.cu index b737a889e98..1cbbdd0cff6 100644 --- a/cpp/src/sort/is_sorted.cu +++ b/cpp/src/sort/is_sorted.cu @@ -22,16 +22,18 @@ #include #include +#include namespace cudf { namespace detail { + template auto is_sorted(cudf::table_view const& in, std::vector const& column_order, - std::vector const& null_precedence) + std::vector const& null_precedence, + rmm::cuda_stream_view stream) { - cudaStream_t stream = 0; - auto in_d = table_device_view::create(in); + auto in_d = table_device_view::create(in); rmm::device_vector d_column_order(column_order); rmm::device_vector const d_null_precedence = (has_nulls) ? rmm::device_vector{null_precedence} @@ -39,7 +41,7 @@ auto is_sorted(cudf::table_view const& in, auto ineq_op = row_lexicographic_comparator( *in_d, *in_d, d_column_order.data().get(), d_null_precedence.data().get()); - auto sorted = thrust::is_sorted(rmm::exec_policy(stream)->on(stream), + auto sorted = thrust::is_sorted(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(in.num_rows()), ineq_op); @@ -68,9 +70,9 @@ bool is_sorted(cudf::table_view const& in, } if (has_nulls(in)) { - return detail::is_sorted(in, column_order, null_precedence); + return detail::is_sorted(in, column_order, null_precedence, rmm::cuda_stream_default); } else { - return detail::is_sorted(in, column_order, null_precedence); + return detail::is_sorted(in, column_order, null_precedence, rmm::cuda_stream_default); } } diff --git a/cpp/src/sort/rank.cu b/cpp/src/sort/rank.cu index 50f8155313f..cb76701dd34 100644 --- a/cpp/src/sort/rank.cu +++ b/cpp/src/sort/rank.cu @@ -55,7 +55,7 @@ struct unique_comparator { // Assign rank from 1 to n unique values. Equal values get same rank value. 
rmm::device_vector sorted_dense_rank(column_view input_col, column_view sorted_order_view, - cudaStream_t stream) + rmm::cuda_stream_view stream) { auto device_table = table_device_view::create(table_view{{input_col}}, stream); auto const input_size = input_col.size(); @@ -68,7 +68,7 @@ rmm::device_vector sorted_dense_rank(column_view input_col, auto unique_it = thrust::make_transform_iterator(thrust::make_counting_iterator(0), conv); - thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream.value()), unique_it, unique_it + input_size, dense_rank_sorted.data().get()); @@ -78,7 +78,7 @@ rmm::device_vector sorted_dense_rank(column_view input_col, auto unique_it = thrust::make_transform_iterator(thrust::make_counting_iterator(0), conv); - thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream.value()), unique_it, unique_it + input_size, dense_rank_sorted.data().get()); @@ -110,13 +110,13 @@ void tie_break_ranks_transform(rmm::device_vector const &dense_rank_s outputIterator rank_iter, TieBreaker tie_breaker, Transformer transformer, - cudaStream_t stream) + rmm::cuda_stream_view stream) { auto const input_size = sorted_order_view.size(); rmm::device_vector tie_sorted(input_size, 0); // algorithm: reduce_by_key(dense_rank, 1, n, reduction_tie_breaker) // reduction_tie_breaker = min, max, min_count - thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), + thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream.value()), dense_rank_sorted.begin(), dense_rank_sorted.end(), tie_iter, @@ -129,7 +129,7 @@ void tie_break_ranks_transform(rmm::device_vector const &dense_rank_s [tied_rank = tie_sorted.begin(), transformer] __device__(auto dense_pos) { return transformer(tied_rank[dense_pos - 1]); }); - thrust::scatter(rmm::exec_policy(stream)->on(stream), + thrust::scatter(rmm::exec_policy(stream)->on(stream.value()), sorted_tied_rank, 
sorted_tied_rank + input_size, sorted_order_view.begin(), @@ -139,10 +139,10 @@ void tie_break_ranks_transform(rmm::device_vector const &dense_rank_s template void rank_first(column_view sorted_order_view, mutable_column_view rank_mutable_view, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // stable sort order ranking (no ties) - thrust::scatter(rmm::exec_policy(stream)->on(stream), + thrust::scatter(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(1), thrust::make_counting_iterator(rank_mutable_view.size() + 1), sorted_order_view.begin(), @@ -153,10 +153,10 @@ template void rank_dense(rmm::device_vector const &dense_rank_sorted, column_view sorted_order_view, mutable_column_view rank_mutable_view, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // All equal values have same rank and rank always increases by 1 between groups - thrust::scatter(rmm::exec_policy(stream)->on(stream), + thrust::scatter(rmm::exec_policy(stream)->on(stream.value()), dense_rank_sorted.begin(), dense_rank_sorted.end(), sorted_order_view.begin(), @@ -167,7 +167,7 @@ template void rank_min(rmm::device_vector const &group_keys, column_view sorted_order_view, mutable_column_view rank_mutable_view, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // min of first in the group // All equal values have min of ranks among them. @@ -185,7 +185,7 @@ template void rank_max(rmm::device_vector const &group_keys, column_view sorted_order_view, mutable_column_view rank_mutable_view, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // max of first in the group // All equal values have max of ranks among them. @@ -202,7 +202,7 @@ void rank_max(rmm::device_vector const &group_keys, void rank_average(rmm::device_vector const &group_keys, column_view sorted_order_view, mutable_column_view rank_mutable_view, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // k, k+1, .. 
k+n-1 // average = (n*k+ n*(n-1)/2)/n @@ -236,13 +236,13 @@ std::unique_ptr rank(column_view const &input, null_policy null_handling, null_order null_precedence, bool percentage, - rmm::mr::device_memory_resource *mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { data_type const output_type = (percentage or method == rank_method::AVERAGE) ? data_type(type_id::FLOAT64) : data_type(type_to_id()); - std::unique_ptr rank_column = [&null_handling, &output_type, &input, &mr, &stream] { + std::unique_ptr rank_column = [&null_handling, &output_type, &input, &stream, &mr] { // na_option=keep assign NA to NA values if (null_handling == null_policy::EXCLUDE) return make_numeric_column(output_type, @@ -320,7 +320,7 @@ std::unique_ptr rank(column_view const &input, (null_handling == null_policy::EXCLUDE) ? input.size() - input.null_count() : input.size(); auto drs = dense_rank_sorted.data().get(); bool const is_dense = (method == rank_method::DENSE); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), rank_iter, rank_iter + input.size(), rank_iter, @@ -340,6 +340,13 @@ std::unique_ptr rank(column_view const &input, bool percentage, rmm::mr::device_memory_resource *mr) { - return detail::rank(input, method, column_order, null_handling, null_precedence, percentage, mr); + return detail::rank(input, + method, + column_order, + null_handling, + null_precedence, + percentage, + rmm::cuda_stream_default, + mr); } } // namespace cudf diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu index e16291b6aa2..5b7459b396f 100644 --- a/cpp/src/strings/attributes.cu +++ b/cpp/src/strings/attributes.cu @@ -50,11 +50,10 @@ namespace { template std::unique_ptr counts_fn(strings_column_view const& strings, UnaryFunction& ufn, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + 
rmm::mr::device_memory_resource* mr) { auto strings_count = strings.size(); - auto execpol = rmm::exec_policy(stream); auto strings_column = cudf::column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // create output column @@ -62,13 +61,12 @@ std::unique_ptr counts_fn(strings_column_view const& strings, cudf::data_type{type_id::INT32}, strings_count, rmm::device_buffer(strings_count * sizeof(int32_t), stream, mr), - cudf::detail::copy_bitmask( - strings.parent(), rmm::cuda_stream_view{stream}, mr), // copy the null mask + cudf::detail::copy_bitmask(strings.parent(), stream, mr), // copy the null mask strings.null_count()); auto results_view = results->mutable_view(); auto d_lengths = results_view.data(); // fill in the lengths - thrust::transform(execpol->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_lengths, @@ -86,20 +84,20 @@ std::unique_ptr counts_fn(strings_column_view const& strings, std::unique_ptr count_characters( strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto ufn = [] __device__(const string_view& d_str) { return d_str.length(); }; - return counts_fn(strings, ufn, mr, stream); + return counts_fn(strings, ufn, stream, mr); } std::unique_ptr count_bytes( strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto ufn = [] __device__(const string_view& d_str) { return d_str.size_bytes(); }; - return counts_fn(strings, ufn, mr, stream); + return counts_fn(strings, ufn, stream, mr); } } // namespace 
detail @@ -134,8 +132,8 @@ namespace detail { // std::unique_ptr code_points( strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; @@ -144,7 +142,7 @@ std::unique_ptr code_points( rmm::device_vector offsets(strings.size() + 1); size_type* d_offsets = offsets.data().get(); thrust::transform_inclusive_scan( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings.size()), d_offsets + 1, @@ -154,7 +152,7 @@ std::unique_ptr code_points( return length; }, thrust::plus()); - CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(size_type), stream)); + CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(size_type), stream.value())); // the total size is the number of characters in the entire column size_type num_characters = offsets.back(); @@ -165,11 +163,11 @@ std::unique_ptr code_points( // fill column with character code-point values auto d_results = results_view.data(); // now set the ranges from each strings' character values - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings.size(), code_points_fn{d_column, d_offsets, d_results}); - // + results->set_null_count(0); return results; } @@ -182,21 +180,21 @@ std::unique_ptr count_characters(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_characters(strings, mr); + return detail::count_characters(strings, rmm::cuda_stream_default, mr); } std::unique_ptr count_bytes(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { 
CUDF_FUNC_RANGE(); - return detail::count_bytes(strings, mr); + return detail::count_bytes(strings, rmm::cuda_stream_default, mr); } std::unique_ptr code_points(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::code_points(strings, mr); + return detail::code_points(strings, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/capitalize.cu b/cpp/src/strings/capitalize.cu index f2482588cc8..7dd4962e8de 100644 --- a/cpp/src/strings/capitalize.cu +++ b/cpp/src/strings/capitalize.cu @@ -15,6 +15,9 @@ */ #include +#include +#include + #include #include #include @@ -25,8 +28,8 @@ #include #include #include -#include -#include + +#include namespace cudf { namespace strings { @@ -257,14 +260,15 @@ std::unique_ptr capitalize(strings_column_view const& strings, { CUDF_FUNC_RANGE(); return detail::modify_strings( - strings, mr, nullptr); + strings, rmm::cuda_stream_default, mr); } std::unique_ptr title(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::modify_strings(strings, mr, nullptr); + return detail::modify_strings( + strings, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index 48306ce4e11..453f1e7daf7 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -128,17 +128,17 @@ struct upper_lower_fn { * * @param strings Strings to convert. * @param case_flag The character type to convert (upper, lower, or both) - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column with characters converted. 
*/ std::unique_ptr convert_case(strings_column_view const& strings, character_flags_table_type case_flag, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto strings_count = strings.size(); - if (strings_count == 0) return detail::make_empty_strings_column(mr, stream); + if (strings_count == 0) return detail::make_empty_strings_column(stream, mr); auto execpol = rmm::exec_policy(stream); auto strings_column = column_device_view::create(strings.parent(), stream); @@ -146,8 +146,7 @@ std::unique_ptr convert_case(strings_column_view const& strings, size_type null_count = strings.null_count(); // copy null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // get the lookup tables used for case conversion auto d_flags = get_character_flags_table(); @@ -159,24 +158,24 @@ std::unique_ptr convert_case(strings_column_view const& strings, thrust::make_counting_iterator(0), upper_lower_fn{d_column, case_flag, d_flags, d_case_table, d_special_case_mapping}); auto offsets_column = detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto offsets_view = offsets_column->view(); auto d_new_offsets = offsets_view.data(); // build the chars column -- convert characters based on case_flag parameter size_type bytes = thrust::device_pointer_cast(d_new_offsets)[strings_count]; auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, mr, stream); + strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.data(); thrust::for_each_n( - execpol->on(stream), 
+ execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, upper_lower_fn{ d_column, case_flag, d_flags, d_case_table, d_special_case_mapping, d_new_offsets, d_chars}); - // + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -190,32 +189,32 @@ std::unique_ptr convert_case(strings_column_view const& strings, std::unique_ptr to_lower( strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { character_flags_table_type case_flag = IS_UPPER(0xFF); // convert only upper case characters - return convert_case(strings, case_flag, mr, stream); + return convert_case(strings, case_flag, stream, mr); } // std::unique_ptr to_upper( strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { character_flags_table_type case_flag = IS_LOWER(0xFF); // convert only lower case characters - return convert_case(strings, case_flag, mr, stream); + return convert_case(strings, case_flag, stream, mr); } // std::unique_ptr swapcase( strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { // convert only upper or lower case characters character_flags_table_type case_flag = IS_LOWER(0xFF) | IS_UPPER(0xFF); - return convert_case(strings, case_flag, mr, stream); + return convert_case(strings, case_flag, stream, mr); } } // namespace detail @@ -226,21 +225,21 @@ std::unique_ptr to_lower(strings_column_view const& strings, 
rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_lower(strings, mr); + return detail::to_lower(strings, rmm::cuda_stream_default, mr); } std::unique_ptr to_upper(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_upper(strings, mr); + return detail::to_upper(strings, rmm::cuda_stream_default, mr); } std::unique_ptr swapcase(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::swapcase(strings, mr); + return detail::swapcase(strings, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index 6e63e756c2e..da85c551adf 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -39,27 +39,26 @@ std::unique_ptr all_characters_of_type( strings_column_view const& strings, string_character_types types, string_character_types verify_types, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = strings.size(); auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; // create output column - auto results = make_numeric_column( - data_type{type_id::BOOL8}, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); auto d_results = results_view.data(); // get the static character types table auto d_flags = detail::get_character_flags_table(); // set 
the output values by checking the character types for each string - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_results, @@ -148,7 +147,7 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str string_character_types types_to_remove, string_scalar const& replacement, string_character_types types_to_keep, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(replacement.is_valid(), "Parameter replacement must be valid"); @@ -171,12 +170,11 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str d_replacement}; // copy null mask from input column - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // this utility calls filterer to build the offsets and chars columns auto children = cudf::strings::detail::make_strings_children( - filterer, strings_count, strings.null_count(), mr, stream); + filterer, strings_count, strings.null_count(), stream, mr); // return new strings column return make_strings_column(strings_count, @@ -190,21 +188,20 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str std::unique_ptr is_integer( strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; // create output column - auto results = make_numeric_column( - data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), 
rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings.size()), d_results, @@ -216,7 +213,7 @@ std::unique_ptr is_integer( return results; } -bool all_integer(strings_column_view const& strings, cudaStream_t stream = 0) +bool all_integer(strings_column_view const& strings, rmm::cuda_stream_view stream) { auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; @@ -225,7 +222,7 @@ bool all_integer(strings_column_view const& strings, cudaStream_t stream = 0) if (d_column.is_null(idx)) return false; return string::is_integer(d_column.element(idx)); }); - return thrust::all_of(rmm::exec_policy(stream)->on(stream), + return thrust::all_of(rmm::exec_policy(stream)->on(stream.value()), transformer_itr, transformer_itr + strings.size(), thrust::identity()); @@ -233,22 +230,21 @@ bool all_integer(strings_column_view const& strings, cudaStream_t stream = 0) std::unique_ptr is_float( strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; // create output column - auto results = make_numeric_column( - data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto 
results = make_numeric_column(data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); // check strings for valid float chars - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings.size()), d_results, @@ -260,7 +256,7 @@ std::unique_ptr is_float( return results; } -bool all_float(strings_column_view const& strings, cudaStream_t stream = 0) +bool all_float(strings_column_view const& strings, rmm::cuda_stream_view stream) { auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; @@ -269,7 +265,7 @@ bool all_float(strings_column_view const& strings, cudaStream_t stream = 0) if (d_column.is_null(idx)) return false; return string::is_float(d_column.element(idx)); }); - return thrust::all_of(rmm::exec_policy(stream)->on(stream), + return thrust::all_of(rmm::exec_policy(stream)->on(stream.value()), transformer_itr, transformer_itr + strings.size(), thrust::identity()); @@ -285,7 +281,7 @@ std::unique_ptr all_characters_of_type(strings_column_view const& string rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::all_characters_of_type(strings, types, verify_types, mr); + return detail::all_characters_of_type(strings, types, verify_types, rmm::cuda_stream_default, mr); } std::unique_ptr filter_characters_of_type(strings_column_view const& strings, @@ -296,33 +292,33 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str { CUDF_FUNC_RANGE(); return detail::filter_characters_of_type( - strings, types_to_remove, replacement, types_to_keep, 0, mr); + strings, types_to_remove, replacement, types_to_keep, rmm::cuda_stream_default, mr); } std::unique_ptr is_integer(strings_column_view const& strings, 
rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_integer(strings, mr); + return detail::is_integer(strings, rmm::cuda_stream_default, mr); } std::unique_ptr is_float(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_float(strings, mr); + return detail::is_float(strings, rmm::cuda_stream_default, mr); } bool all_integer(strings_column_view const& strings) { CUDF_FUNC_RANGE(); - return detail::all_integer(strings); + return detail::all_integer(strings, rmm::cuda_stream_default); } bool all_float(strings_column_view const& strings) { CUDF_FUNC_RANGE(); - return detail::all_float(strings); + return detail::all_float(strings, rmm::cuda_stream_default); } } // namespace strings diff --git a/cpp/src/strings/combine.cu b/cpp/src/strings/combine.cu index 57bd7abef2f..1ef8e691149 100644 --- a/cpp/src/strings/combine.cu +++ b/cpp/src/strings/combine.cu @@ -14,6 +14,8 @@ * limitations under the License. */ +#include + #include #include #include @@ -28,9 +30,8 @@ #include #include -#include - #include +#include #include #include @@ -41,12 +42,12 @@ namespace cudf { namespace strings { namespace detail { -// + std::unique_ptr concatenate(table_view const& strings_columns, string_scalar const& separator, string_scalar const& narep, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto num_columns = strings_columns.num_columns(); CUDF_EXPECTS(num_columns > 0, "At least one column must be specified"); @@ -59,7 +60,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, return std::make_unique(*(strings_columns.begin()), stream, mr); auto strings_count = strings_columns.num_rows(); if (strings_count == 0) // empty begets empty - return detail::make_empty_strings_column(mr, stream); + return detail::make_empty_strings_column(stream, mr); CUDF_EXPECTS(separator.is_valid(), "Parameter separator 
must be a valid string_scalar"); string_view d_separator(separator.data(), separator.size()); @@ -111,17 +112,17 @@ std::unique_ptr concatenate(table_view const& strings_columns, auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), offsets_transformer); auto offsets_column = detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_results_offsets = offsets_column->view().data(); // create the chars column size_type bytes = thrust::device_pointer_cast(d_results_offsets)[strings_count]; auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, mr, stream); + strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); // fill the chars column auto d_results_chars = chars_column->mutable_view().data(); thrust::for_each_n( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [d_table, num_columns, d_separator, d_narep, d_results_offsets, d_results_chars] __device__( @@ -154,15 +155,14 @@ std::unique_ptr concatenate(table_view const& strings_columns, mr); } -// std::unique_ptr join_strings(strings_column_view const& strings, string_scalar const& separator, string_scalar const& narep, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto strings_count = strings.size(); - if (strings_count == 0) return detail::make_empty_strings_column(mr, stream); + if (strings_count == 0) return detail::make_empty_strings_column(stream, mr); CUDF_EXPECTS(separator.is_valid(), "Parameter separator must be a valid string_scalar"); @@ -178,7 +178,7 @@ std::unique_ptr join_strings(strings_column_view const& strings, auto d_output_offsets = 
output_offsets.data().get(); // using inclusive-scan to compute last entry which is the total size thrust::transform_inclusive_scan( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_output_offsets + 1, @@ -193,7 +193,7 @@ std::unique_ptr join_strings(strings_column_view const& strings, return bytes; }, thrust::plus()); - CUDA_TRY(cudaMemsetAsync(d_output_offsets, 0, sizeof(size_type), stream)); + CUDA_TRY(cudaMemsetAsync(d_output_offsets, 0, sizeof(size_type), stream.value())); // total size is the last entry size_type bytes = output_offsets.back(); @@ -207,7 +207,7 @@ std::unique_ptr join_strings(strings_column_view const& strings, new_offsets, sizeof(new_offsets), cudaMemcpyHostToDevice, - stream)); + stream.value())); // build null mask // only one entry so it is either all valid or all null @@ -218,11 +218,11 @@ std::unique_ptr join_strings(strings_column_view const& strings, null_count = 1; } auto chars_column = - detail::create_chars_child_column(strings_count, null_count, bytes, mr, stream); + detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.data(); thrust::for_each_n( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [d_strings, d_separator, d_narep, d_output_offsets, d_chars] __device__(size_type idx) { @@ -248,13 +248,12 @@ std::unique_ptr join_strings(strings_column_view const& strings, mr); } -// std::unique_ptr concatenate(table_view const& strings_columns, strings_column_view const& separators, string_scalar const& separator_narep, string_scalar const& col_narep, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto num_columns = strings_columns.num_columns(); CUDF_EXPECTS(num_columns > 0, "At least one column must be 
specified"); @@ -268,7 +267,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, CUDF_EXPECTS(strings_count == separators.size(), "Separators column should be the same size as the strings columns"); if (strings_count == 0) // Empty begets empty - return detail::make_empty_strings_column(mr, stream); + return detail::make_empty_strings_column(stream, mr); // Invalid output column strings - null rows string_view const invalid_str{nullptr, 0}; @@ -287,7 +286,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, // Execute it on every element thrust::transform( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), out_col_strings.data().get(), @@ -373,17 +372,17 @@ std::unique_ptr concatenate(table_view const& strings_columns, auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), offsets_transformer); auto offsets_column = detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_results_offsets = offsets_column->view().data(); // Create the chars column size_type bytes = thrust::device_pointer_cast(d_results_offsets)[strings_count]; auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, mr, stream); + strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); // Fill the chars column auto d_results_chars = chars_column->mutable_view().data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [d_table, @@ -453,7 +452,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - 
return detail::concatenate(strings_columns, separator, narep, mr); + return detail::concatenate(strings_columns, separator, narep, rmm::cuda_stream_default, mr); } std::unique_ptr join_strings(strings_column_view const& strings, @@ -462,7 +461,7 @@ std::unique_ptr join_strings(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::join_strings(strings, separator, narep, mr); + return detail::join_strings(strings, separator, narep, rmm::cuda_stream_default, mr); } std::unique_ptr concatenate(table_view const& strings_columns, @@ -472,7 +471,8 @@ std::unique_ptr concatenate(table_view const& strings_columns, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate(strings_columns, separators, separator_narep, col_narep, mr); + return detail::concatenate( + strings_columns, separators, separator_narep, col_narep, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu index 96c87f554b5..246a5cad1ae 100644 --- a/cpp/src/strings/contains.cu +++ b/cpp/src/strings/contains.cu @@ -69,8 +69,8 @@ std::unique_ptr contains_util( strings_column_view const& strings, std::string const& pattern, bool beginning_only = false, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = strings.size(); auto strings_column = column_device_view::create(strings.parent(), stream); @@ -81,32 +81,31 @@ std::unique_ptr contains_util( auto d_prog = *prog; // create the output column - auto results = make_numeric_column( - data_type{type_id::BOOL8}, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = 
make_numeric_column(data_type{type_id::BOOL8}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); // fill the output column auto execpol = rmm::exec_policy(stream); int regex_insts = d_prog.insts_counts(); if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS)) - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_results, contains_fn{d_prog, d_column, beginning_only}); else if (regex_insts <= RX_MEDIUM_INSTS) - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_results, contains_fn{d_prog, d_column, beginning_only}); else - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_results, @@ -121,19 +120,19 @@ std::unique_ptr contains_util( std::unique_ptr contains_re( strings_column_view const& strings, std::string const& pattern, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - return contains_util(strings, pattern, false, mr, stream); + return contains_util(strings, pattern, false, stream, mr); } std::unique_ptr matches_re( strings_column_view const& strings, std::string const& pattern, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - return contains_util(strings, pattern, true, mr, stream); + return contains_util(strings, pattern, true, 
stream, mr); } } // namespace detail @@ -145,7 +144,7 @@ std::unique_ptr contains_re(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains_re(strings, pattern, mr); + return detail::contains_re(strings, pattern, rmm::cuda_stream_default, mr); } std::unique_ptr matches_re(strings_column_view const& strings, @@ -153,7 +152,7 @@ std::unique_ptr matches_re(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::matches_re(strings, pattern, mr); + return detail::matches_re(strings, pattern, rmm::cuda_stream_default, mr); } namespace detail { @@ -191,8 +190,8 @@ struct count_fn { std::unique_ptr count_re( strings_column_view const& strings, std::string const& pattern, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = strings.size(); auto strings_column = column_device_view::create(strings.parent(), stream); @@ -203,32 +202,31 @@ std::unique_ptr count_re( auto d_prog = *prog; // create the output column - auto results = make_numeric_column( - data_type{type_id::INT32}, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column(data_type{type_id::INT32}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); // fill the output column auto execpol = rmm::exec_policy(stream); int regex_insts = d_prog.insts_counts(); if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS)) - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), 
thrust::make_counting_iterator(strings_count), d_results, count_fn{d_prog, d_column}); else if (regex_insts <= RX_MEDIUM_INSTS) - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_results, count_fn{d_prog, d_column}); else - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_results, @@ -247,7 +245,7 @@ std::unique_ptr count_re(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_re(strings, pattern, mr); + return detail::count_re(strings, pattern, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_booleans.cu b/cpp/src/strings/convert/convert_booleans.cu index 1ba2151c0a7..e46d1dbe4b5 100644 --- a/cpp/src/strings/convert/convert_booleans.cu +++ b/cpp/src/strings/convert/convert_booleans.cu @@ -39,7 +39,7 @@ namespace detail { // Convert strings column to boolean column std::unique_ptr to_booleans(strings_column_view const& strings, string_scalar const& true_string, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); @@ -52,17 +52,16 @@ std::unique_ptr to_booleans(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // create output column copying the strings' null-mask - auto results = make_numeric_column( - data_type{type_id::BOOL8}, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + 
stream, + mr); auto results_view = results->mutable_view(); auto d_results = results_view.data(); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_results, @@ -84,7 +83,7 @@ std::unique_ptr to_booleans(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_booleans(strings, true_string, cudaStream_t{}, mr); + return detail::to_booleans(strings, true_string, rmm::cuda_stream_default, mr); } namespace detail { @@ -92,11 +91,11 @@ namespace detail { std::unique_ptr from_booleans(column_view const& booleans, string_scalar const& true_string, string_scalar const& false_string, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { size_type strings_count = booleans.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); CUDF_EXPECTS(booleans.type().id() == type_id::BOOL8, "Input column must be boolean type"); CUDF_EXPECTS(true_string.is_valid() && true_string.size() > 0, @@ -110,8 +109,7 @@ std::unique_ptr from_booleans(column_view const& booleans, auto d_column = *column; // copy null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(booleans, rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(booleans, stream, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator(thrust::make_counting_iterator(0), @@ -125,17 +123,17 @@ std::unique_ptr from_booleans(column_view const& booleans, return bytes; }); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto offsets_view = 
offsets_column->view(); auto d_offsets = offsets_view.data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = - create_chars_child_column(strings_count, booleans.null_count(), bytes, mr, stream); + create_chars_child_column(strings_count, booleans.null_count(), bytes, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [d_column, d_true, d_false, d_offsets, d_chars] __device__(size_type idx) { @@ -143,7 +141,7 @@ std::unique_ptr from_booleans(column_view const& booleans, string_view result = (d_column.element(idx) ? d_true : d_false); memcpy(d_chars + d_offsets[idx], result.data(), result.size_bytes()); }); - // + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -163,7 +161,7 @@ std::unique_ptr from_booleans(column_view const& booleans, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_booleans(booleans, true_string, false_string, cudaStream_t{}, mr); + return detail::from_booleans(booleans, true_string, false_string, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index f716b1500c6..688ebacb95c 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -123,7 +123,7 @@ struct format_compiler { {'p', 2}, {'j', 3}}; - format_compiler(const char* fmt, cudaStream_t stream) : format(fmt), d_items(0, stream) + format_compiler(const char* fmt, rmm::cuda_stream_view stream) : format(fmt), d_items(0, stream) { std::vector items; const char* str = format.c_str(); @@ -165,7 +165,7 @@ struct format_compiler { items.data(), items.size() * sizeof(items[0]), 
cudaMemcpyHostToDevice, - stream)); + stream.value())); } format_item const* format_items() { return d_items.data(); } @@ -376,14 +376,14 @@ struct dispatch_to_timestamps_fn { std::string const& format, timestamp_units units, mutable_column_view& results_view, - cudaStream_t stream) const + rmm::cuda_stream_view stream) const { format_compiler compiler(format.c_str(), stream); auto d_items = compiler.format_items(); auto d_results = results_view.data(); parse_datetime pfn{ d_strings, d_items, compiler.items_count(), units, compiler.subsecond_precision()}; - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(results_view.size()), d_results, @@ -394,7 +394,7 @@ struct dispatch_to_timestamps_fn { std::string const&, timestamp_units, mutable_column_view&, - cudaStream_t) const + rmm::cuda_stream_view) const { CUDF_FAIL("Only timestamps type are expected"); } @@ -406,7 +406,7 @@ struct dispatch_to_timestamps_fn { std::unique_ptr to_timestamps(strings_column_view const& strings, data_type timestamp_type, std::string const& format, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); @@ -418,13 +418,12 @@ std::unique_ptr to_timestamps(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; - auto results = make_timestamp_column( - timestamp_type, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_timestamp_column(timestamp_type, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); cudf::type_dispatcher( timestamp_type, dispatch_to_timestamps_fn(), 
d_column, format, units, results_view, stream); @@ -558,7 +557,7 @@ struct check_datetime_format { std::unique_ptr is_timestamp(strings_column_view const& strings, std::string const& format, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); @@ -569,18 +568,17 @@ std::unique_ptr is_timestamp(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; - auto results = make_numeric_column( - data_type{type_id::BOOL8}, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); format_compiler compiler(format.c_str(), stream); thrust::transform( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_results, @@ -600,7 +598,7 @@ std::unique_ptr to_timestamps(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_timestamps(strings, timestamp_type, format, cudaStream_t{}, mr); + return detail::to_timestamps(strings, timestamp_type, format, rmm::cuda_stream_default, mr); } std::unique_ptr is_timestamp(strings_column_view const& strings, @@ -608,7 +606,7 @@ std::unique_ptr is_timestamp(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_timestamp(strings, format, cudaStream_t{}, mr); + return detail::is_timestamp(strings, format, rmm::cuda_stream_default, mr); } namespace detail { @@ -849,10 +847,10 @@ struct dispatch_from_timestamps_fn { timestamp_units units, 
const int32_t* d_offsets, char* d_chars, - cudaStream_t stream) const + rmm::cuda_stream_view stream) const { datetime_formatter pfn{d_timestamps, d_format_items, items_count, units, d_offsets, d_chars}; - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), d_timestamps.size(), pfn); @@ -864,7 +862,7 @@ struct dispatch_from_timestamps_fn { timestamp_units, const int32_t*, char* d_chars, - cudaStream_t stream) const + rmm::cuda_stream_view stream) const { CUDF_FAIL("Only timestamps type are expected"); } @@ -875,11 +873,11 @@ struct dispatch_from_timestamps_fn { // std::unique_ptr from_timestamps(column_view const& timestamps, std::string const& format, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { size_type strings_count = timestamps.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); CUDF_EXPECTS(!format.empty(), "Format parameter must not be empty."); timestamp_units units = @@ -892,8 +890,7 @@ std::unique_ptr from_timestamps(column_view const& timestamps, auto d_column = *column; // copy null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(timestamps, rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(timestamps, stream, mr); // Each string will be the same number of bytes which can be determined // directly from the format string. auto d_str_bytes = compiler.template_bytes(); // size in bytes of each string @@ -904,14 +901,14 @@ std::unique_ptr from_timestamps(column_view const& timestamps, return (d_column.is_null(idx) ? 
0 : d_str_bytes); }); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto offsets_view = offsets_column->view(); auto d_new_offsets = offsets_view.template data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_new_offsets)[strings_count]; auto chars_column = - create_chars_child_column(strings_count, timestamps.null_count(), bytes, mr, stream); + create_chars_child_column(strings_count, timestamps.null_count(), bytes, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.template data(); // fill in chars column with timestamps @@ -925,7 +922,7 @@ std::unique_ptr from_timestamps(column_view const& timestamps, d_new_offsets, d_chars, stream); - // + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -944,7 +941,7 @@ std::unique_ptr from_timestamps(column_view const& timestamps, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_timestamps(timestamps, format, cudaStream_t{}, mr); + return detail::from_timestamps(timestamps, format, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index d2709e2ebe1..cdcef791f7b 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -80,7 +80,8 @@ struct alignas(4) format_item { struct format_compiler { std::string format; rmm::device_uvector d_items; - format_compiler(const char* format_, cudaStream_t stream) : format(format_), d_items(0, stream) + format_compiler(const char* format_, rmm::cuda_stream_view stream) + : format(format_), d_items(0, stream) { static std::map const specifier_lengths = { {'-', -1}, // '-' if negative @@ -150,7 +151,7 @@ struct format_compiler { 
items.data(), items.size() * sizeof(items[0]), cudaMemcpyHostToDevice, - stream)); + stream.value())); } format_item const* compiled_format_items() { return d_items.data(); } @@ -400,8 +401,8 @@ struct dispatch_from_durations_fn { template ()>* = nullptr> std::unique_ptr operator()(column_view const& durations, std::string const& format, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_EXPECTS(!format.empty(), "Format parameter must not be empty."); @@ -413,14 +414,13 @@ struct dispatch_from_durations_fn { auto d_column = *column; // copy null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(durations, rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(durations, stream, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), duration_to_string_size_fn{d_column, d_format_items, compiler.items_count()}); auto offsets_column = detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto offsets_view = offsets_column->view(); auto d_new_offsets = offsets_view.template data(); @@ -428,17 +428,16 @@ struct dispatch_from_durations_fn { auto const chars_bytes = cudf::detail::get_value(offsets_column->view(), strings_count, stream); auto chars_column = detail::create_chars_child_column( - strings_count, durations.null_count(), chars_bytes, mr, stream); + strings_count, durations.null_count(), chars_bytes, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.template data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, 
duration_to_string_fn{ d_column, d_format_items, compiler.items_count(), d_new_offsets, d_chars}); - // return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -452,8 +451,8 @@ struct dispatch_from_durations_fn { template ()>* = nullptr> std::unique_ptr operator()(column_view const&, std::string const& format, - rmm::mr::device_memory_resource*, - cudaStream_t) const + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) const { CUDF_FAIL("Values for from_durations function must be a duration type."); } @@ -678,13 +677,13 @@ struct dispatch_to_durations_fn { void operator()(column_device_view const& d_strings, std::string const& format, mutable_column_view& results_view, - cudaStream_t stream) const + rmm::cuda_stream_view stream) const { format_compiler compiler(format.c_str(), stream); auto d_items = compiler.compiled_format_items(); auto d_results = results_view.data(); parse_duration pfn{d_strings, d_items, compiler.items_count()}; - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(results_view.size()), d_results, @@ -694,7 +693,7 @@ struct dispatch_to_durations_fn { void operator()(column_device_view const&, std::string const&, mutable_column_view&, - cudaStream_t) const + rmm::cuda_stream_view) const { CUDF_FAIL("Only durations type are expected for to_durations function"); } @@ -704,20 +703,20 @@ struct dispatch_to_durations_fn { std::unique_ptr from_durations(column_view const& durations, std::string const& format, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { size_type strings_count = durations.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); return type_dispatcher( - durations.type(), dispatch_from_durations_fn{}, durations, 
format, mr, stream); + durations.type(), dispatch_from_durations_fn{}, durations, format, stream, mr); } std::unique_ptr to_durations(strings_column_view const& strings, data_type duration_type, std::string const& format, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); @@ -728,13 +727,12 @@ std::unique_ptr to_durations(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; - auto results = make_duration_column( - duration_type, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_duration_column(duration_type, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); cudf::type_dispatcher( duration_type, dispatch_to_durations_fn(), d_column, format, results_view, stream); @@ -749,7 +747,7 @@ std::unique_ptr from_durations(column_view const& durations, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_durations(durations, format, cudaStream_t{}, mr); + return detail::from_durations(durations, format, rmm::cuda_stream_default, mr); } std::unique_ptr to_durations(strings_column_view const& strings, @@ -758,7 +756,7 @@ std::unique_ptr to_durations(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_durations(strings, duration_type, format, cudaStream_t{}, mr); + return detail::to_durations(strings, duration_type, format, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index 8abf49c5dca..4f5edb660e5 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ 
b/cpp/src/strings/convert/convert_floats.cu @@ -148,10 +148,10 @@ struct dispatch_to_floats_fn { std::enable_if_t::value>* = nullptr> void operator()(column_device_view const& strings_column, mutable_column_view& output_column, - cudaStream_t stream) const + rmm::cuda_stream_view stream) const { auto d_results = output_column.data(); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_column.size()), d_results, @@ -159,7 +159,7 @@ struct dispatch_to_floats_fn { } // non-integral types throw an exception template ::value>* = nullptr> - void operator()(column_device_view const&, mutable_column_view&, cudaStream_t) const + void operator()(column_device_view const&, mutable_column_view&, rmm::cuda_stream_view) const { CUDF_FAIL("Output for to_floats must be a float type."); } @@ -170,7 +170,7 @@ struct dispatch_to_floats_fn { // This will convert a strings column into any float column type. 
std::unique_ptr to_floats(strings_column_view const& strings, data_type output_type, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); @@ -178,13 +178,12 @@ std::unique_ptr to_floats(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // create float output column copying the strings null-mask - auto results = make_numeric_column( - output_type, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column(output_type, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); // fill output column with floats type_dispatcher(output_type, dispatch_to_floats_fn{}, d_strings, results_view, stream); @@ -201,7 +200,7 @@ std::unique_ptr to_floats(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_floats(strings, output_type, cudaStream_t{}, mr); + return detail::to_floats(strings, output_type, rmm::cuda_stream_default, mr); } namespace detail { @@ -463,31 +462,30 @@ struct dispatch_from_floats_fn { template ::value>* = nullptr> std::unique_ptr operator()(column_view const& floats, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { size_type strings_count = floats.size(); auto column = column_device_view::create(floats, stream); auto d_column = *column; // copy the null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(floats, rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(floats, stream, mr); // build offsets column auto offsets_transformer_itr = 
thrust::make_transform_iterator( thrust::make_counting_iterator(0), float_to_string_size_fn{d_column}); auto offsets_column = detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto offsets_view = offsets_column->view(); auto d_offsets = offsets_view.template data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = - detail::create_chars_child_column(strings_count, floats.null_count(), bytes, mr, stream); + detail::create_chars_child_column(strings_count, floats.null_count(), bytes, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.template data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, float_to_string_fn{d_column, d_offsets, d_chars}); @@ -504,8 +502,8 @@ struct dispatch_from_floats_fn { // non-float types throw an exception template ::value>* = nullptr> std::unique_ptr operator()(column_view const&, - rmm::mr::device_memory_resource*, - cudaStream_t) const + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) const { CUDF_FAIL("Values for from_floats function must be a float type."); } @@ -515,13 +513,13 @@ struct dispatch_from_floats_fn { // This will convert all float column types into a strings column. 
std::unique_ptr from_floats(column_view const& floats, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { size_type strings_count = floats.size(); - if (strings_count == 0) return detail::make_empty_strings_column(mr, stream); + if (strings_count == 0) return detail::make_empty_strings_column(stream, mr); - return type_dispatcher(floats.type(), dispatch_from_floats_fn{}, floats, mr, stream); + return type_dispatcher(floats.type(), dispatch_from_floats_fn{}, floats, stream, mr); } } // namespace detail @@ -531,7 +529,7 @@ std::unique_ptr from_floats(column_view const& floats, std::unique_ptr from_floats(column_view const& floats, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_floats(floats, cudaStream_t{}, mr); + return detail::from_floats(floats, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_hex.cu b/cpp/src/strings/convert/convert_hex.cu index a8ea7cf3ab9..3bb422d17f3 100644 --- a/cpp/src/strings/convert/convert_hex.cu +++ b/cpp/src/strings/convert/convert_hex.cu @@ -93,10 +93,10 @@ struct dispatch_hex_to_integers_fn { template ::value>* = nullptr> void operator()(column_device_view const& strings_column, mutable_column_view& output_column, - cudaStream_t stream) const + rmm::cuda_stream_view stream) const { auto d_results = output_column.data(); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_column.size()), d_results, @@ -104,7 +104,7 @@ struct dispatch_hex_to_integers_fn { } // non-integral types throw an exception template ::value>* = nullptr> - void operator()(column_device_view const&, mutable_column_view&, cudaStream_t) const + void operator()(column_device_view const&, mutable_column_view&, rmm::cuda_stream_view) const { CUDF_FAIL("Output for hex_to_integers must be an integral 
type."); } @@ -113,7 +113,7 @@ struct dispatch_hex_to_integers_fn { template <> void dispatch_hex_to_integers_fn::operator()(column_device_view const&, mutable_column_view&, - cudaStream_t) const + rmm::cuda_stream_view) const { CUDF_FAIL("Output for hex_to_integers must not be a boolean type."); } @@ -124,21 +124,20 @@ void dispatch_hex_to_integers_fn::operator()(column_device_view const&, std::unique_ptr hex_to_integers( strings_column_view const& strings, data_type output_type, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { size_type strings_count = strings.size(); if (strings_count == 0) return make_empty_column(output_type); auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // create integer output column copying the strings null-mask - auto results = make_numeric_column( - output_type, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column(output_type, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); // fill output column with integers type_dispatcher(output_type, dispatch_hex_to_integers_fn{}, d_strings, results_view, stream); @@ -147,21 +146,20 @@ std::unique_ptr hex_to_integers( } std::unique_ptr is_hex(strings_column_view const& strings, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; // create output column - auto results = make_numeric_column( - data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), 
rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings.size()), d_results, @@ -193,14 +191,14 @@ std::unique_ptr hex_to_integers(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::hex_to_integers(strings, output_type, mr); + return detail::hex_to_integers(strings, output_type, rmm::cuda_stream_default, mr); } std::unique_ptr is_hex(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_hex(strings, 0, mr); + return detail::is_hex(strings, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 42bd70899a9..cfa64613c90 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -66,10 +66,10 @@ struct dispatch_to_integers_fn { template ::value>* = nullptr> void operator()(column_device_view const& strings_column, mutable_column_view& output_column, - cudaStream_t stream) const + rmm::cuda_stream_view stream) const { auto d_results = output_column.data(); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_column.size()), d_results, @@ -77,7 +77,7 @@ struct dispatch_to_integers_fn { } // non-integral types throw an exception template ::value>* = nullptr> - void operator()(column_device_view const&, 
mutable_column_view&, cudaStream_t) const + void operator()(column_device_view const&, mutable_column_view&, rmm::cuda_stream_view) const { CUDF_FAIL("Output for to_integers must be an integral type."); } @@ -86,7 +86,7 @@ struct dispatch_to_integers_fn { template <> void dispatch_to_integers_fn::operator()(column_device_view const&, mutable_column_view&, - cudaStream_t) const + rmm::cuda_stream_view) const { CUDF_FAIL("Output for to_integers must not be a boolean type."); } @@ -96,7 +96,7 @@ void dispatch_to_integers_fn::operator()(column_device_view const&, // This will convert a strings column into any integer column type. std::unique_ptr to_integers(strings_column_view const& strings, data_type output_type, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); @@ -104,13 +104,12 @@ std::unique_ptr to_integers(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // create integer output column copying the strings null-mask - auto results = make_numeric_column( - output_type, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column(output_type, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); // fill output column with integers type_dispatcher(output_type, dispatch_to_integers_fn{}, d_strings, results_view, stream); @@ -126,7 +125,7 @@ std::unique_ptr to_integers(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_integers(strings, output_type, cudaStream_t{}, mr); + return detail::to_integers(strings, output_type, rmm::cuda_stream_default, mr); } namespace detail { @@ -176,35 +175,34 @@ struct 
integer_to_string_fn { struct dispatch_from_integers_fn { template ::value>* = nullptr> std::unique_ptr operator()(column_view const& integers, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { size_type strings_count = integers.size(); auto column = column_device_view::create(integers, stream); auto d_column = *column; // copy the null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(integers, rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(integers, stream, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), integer_to_string_size_fn{d_column}); auto offsets_column = detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto offsets_view = offsets_column->view(); auto d_new_offsets = offsets_view.template data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_new_offsets)[strings_count]; auto chars_column = - detail::create_chars_child_column(strings_count, integers.null_count(), bytes, mr, stream); + detail::create_chars_child_column(strings_count, integers.null_count(), bytes, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.template data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, integer_to_string_fn{d_column, d_new_offsets, d_chars}); - // + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -217,8 +215,8 @@ struct dispatch_from_integers_fn { // non-integral types throw an exception template ::value>* = nullptr> std::unique_ptr 
operator()(column_view const&, - rmm::mr::device_memory_resource*, - cudaStream_t) const + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) const { CUDF_FAIL("Values for from_integers function must be an integral type."); } @@ -226,7 +224,7 @@ struct dispatch_from_integers_fn { template <> std::unique_ptr dispatch_from_integers_fn::operator()( - column_view const&, rmm::mr::device_memory_resource*, cudaStream_t) const + column_view const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) const { CUDF_FAIL("Input for from_integers must not be a boolean type."); } @@ -235,13 +233,13 @@ std::unique_ptr dispatch_from_integers_fn::operator()( // This will convert all integer column types into a strings column. std::unique_ptr from_integers(column_view const& integers, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { size_type strings_count = integers.size(); - if (strings_count == 0) return detail::make_empty_strings_column(mr, stream); + if (strings_count == 0) return detail::make_empty_strings_column(stream, mr); - return type_dispatcher(integers.type(), dispatch_from_integers_fn{}, integers, mr, stream); + return type_dispatcher(integers.type(), dispatch_from_integers_fn{}, integers, stream, mr); } } // namespace detail @@ -252,7 +250,7 @@ std::unique_ptr from_integers(column_view const& integers, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_integers(integers, cudaStream_t{}, mr); + return detail::from_integers(integers, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu index dcccad30f30..e0303270987 100644 --- a/cpp/src/strings/convert/convert_ipv4.cu +++ b/cpp/src/strings/convert/convert_ipv4.cu @@ -73,24 +73,23 @@ struct ipv4_to_integers_fn { // Convert strings column of IPv4 addresses to integers column std::unique_ptr ipv4_to_integers( strings_column_view const& 
strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { size_type strings_count = strings.size(); if (strings_count == 0) return make_numeric_column(data_type{type_id::INT64}, 0); auto strings_column = column_device_view::create(strings.parent(), stream); // create output column copying the strings' null-mask - auto results = make_numeric_column( - data_type{type_id::INT64}, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column(data_type{type_id::INT64}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); // fill output column with ipv4 integers - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_results, @@ -107,7 +106,7 @@ std::unique_ptr ipv4_to_integers(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ipv4_to_integers(strings, mr); + return detail::ipv4_to_integers(strings, rmm::cuda_stream_default, mr); } namespace detail { @@ -160,11 +159,11 @@ struct integers_to_ipv4_fn { // Convert integers into IPv4 addresses std::unique_ptr integers_to_ipv4( column_view const& integers, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { size_type strings_count = integers.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return 
make_empty_strings_column(stream, mr); CUDF_EXPECTS(integers.type().id() == type_id::INT64, "Input column must be type_id::INT64 type"); @@ -172,8 +171,7 @@ std::unique_ptr integers_to_ipv4( auto d_column = *column; // copy null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(integers, rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(integers, stream, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), [d_column] __device__(size_type idx) { @@ -188,19 +186,19 @@ std::unique_ptr integers_to_ipv4( return bytes; }); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = - create_chars_child_column(strings_count, integers.null_count(), bytes, mr, stream); + create_chars_child_column(strings_count, integers.null_count(), bytes, stream, mr); auto d_chars = chars_column->mutable_view().data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, integers_to_ipv4_fn{d_column, d_offsets, d_chars}); - // + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -211,21 +209,20 @@ std::unique_ptr integers_to_ipv4( } std::unique_ptr is_ipv4(strings_column_view const& strings, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; // create output column - auto results = make_numeric_column( - data_type{type_id::BOOL8}, 
- strings.size(), - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings.size()), d_results, @@ -264,14 +261,14 @@ std::unique_ptr integers_to_ipv4(column_view const& integers, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::integers_to_ipv4(integers, mr); + return detail::integers_to_ipv4(integers, rmm::cuda_stream_default, mr); } std::unique_ptr is_ipv4(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_ipv4(strings, 0, mr); + return detail::is_ipv4(strings, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index 9b5c142511f..7d57e748cf3 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -113,36 +113,36 @@ struct url_encoder_fn { // std::unique_ptr url_encode( strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // copy null mask - 
rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), url_encoder_fn{d_strings}); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().data(); // build chars column auto chars_column = create_chars_child_column(strings_count, strings.null_count(), thrust::device_pointer_cast(d_offsets)[strings_count], - mr, - stream); + stream, + mr); auto d_chars = chars_column->mutable_view().data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, url_encoder_fn{d_strings, d_offsets, d_chars}); + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -159,7 +159,7 @@ std::unique_ptr url_encode(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::url_encode(strings, mr); + return detail::url_encode(strings, rmm::cuda_stream_default, mr); } namespace detail { @@ -216,23 +216,22 @@ struct url_decoder_fn { // std::unique_ptr url_decode( strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); auto strings_column = 
column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // copy null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), url_decoder_fn{d_strings}); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().data(); // build chars column @@ -240,14 +239,14 @@ std::unique_ptr url_decode( create_chars_child_column(strings_count, strings.null_count(), thrust::device_pointer_cast(d_offsets)[strings_count], - mr, - stream); + stream, + mr); auto d_chars = chars_column->mutable_view().data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, url_decoder_fn{d_strings, d_offsets, d_chars}); - // + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -265,7 +264,7 @@ std::unique_ptr url_decode(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::url_decode(strings, mr); + return detail::url_decode(strings, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index be56c256bfa..ffa6eb9a076 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -67,8 +67,8 @@ auto create_strings_device_views(std::vector const& views, rmm::cuda_stream_view stream) { // Create device views for each input view - using CDViewPtr = - 
decltype(column_device_view::create(std::declval(), std::declval())); + using CDViewPtr = decltype( + column_device_view::create(std::declval(), std::declval())); auto device_view_owners = std::vector(views.size()); std::transform( views.cbegin(), views.cend(), device_view_owners.begin(), [stream](auto const& col) { @@ -228,7 +228,7 @@ std::unique_ptr concatenate(std::vector const& columns, auto const total_bytes = std::get<5>(device_views); auto const offsets_count = strings_count + 1; - if (strings_count == 0) { return make_empty_strings_column(mr, stream.value()); } + if (strings_count == 0) { return make_empty_strings_column(stream, mr); } CUDF_EXPECTS(offsets_count <= std::numeric_limits::max(), "total number of strings is too large for cudf column"); diff --git a/cpp/src/strings/copying/copying.cu b/cpp/src/strings/copying/copying.cu index 4c99b45f5ce..384d6780116 100644 --- a/cpp/src/strings/copying/copying.cu +++ b/cpp/src/strings/copying/copying.cu @@ -22,6 +22,8 @@ #include #include +#include + #include namespace cudf { @@ -32,11 +34,11 @@ std::unique_ptr copy_slice(strings_column_view const& strings, size_type start, size_type end, size_type step, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); if (step == 0) step = 1; CUDF_EXPECTS(step > 0, "Parameter step must be positive integer."); if (end < 0 || end > strings_count) end = strings_count; @@ -46,7 +48,7 @@ std::unique_ptr copy_slice(strings_column_view const& strings, auto execpol = rmm::exec_policy(stream); // build indices rmm::device_vector indices(strings_count); - thrust::sequence(execpol->on(stream), indices.begin(), indices.end(), start, step); + thrust::sequence(execpol->on(stream.value()), indices.begin(), indices.end(), start, step); // create a column_view as a 
wrapper of these indices column_view indices_view( data_type{type_id::INT32}, strings_count, indices.data().get(), nullptr, 0); diff --git a/cpp/src/strings/extract.cu b/cpp/src/strings/extract.cu index 7a8fe7bee29..2973a52d27e 100644 --- a/cpp/src/strings/extract.cu +++ b/cpp/src/strings/extract.cu @@ -14,6 +14,9 @@ * limitations under the License. */ +#include +#include + #include #include #include @@ -23,8 +26,8 @@ #include #include #include -#include -#include + +#include namespace cudf { namespace strings { @@ -70,8 +73,8 @@ struct extract_fn { std::unique_ptr
extract( strings_column_view const& strings, std::string const& pattern, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = strings.size(); auto strings_column = column_device_view::create(strings.parent(), stream); @@ -93,24 +96,24 @@ std::unique_ptr
extract( string_index_pair* d_indices = indices.data().get(); if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS)) - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_indices, extract_fn{d_prog, d_strings, column_index}); else if (regex_insts <= RX_MEDIUM_INSTS) - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_indices, extract_fn{d_prog, d_strings, column_index}); else - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_indices, extract_fn{d_prog, d_strings, column_index}); - // + results.emplace_back(make_strings_column(indices, stream, mr)); } return std::make_unique
(std::move(results)); @@ -125,7 +128,7 @@ std::unique_ptr
extract(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract(strings, pattern, mr); + return detail::extract(strings, pattern, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/filling/fill.cu b/cpp/src/strings/filling/fill.cu index 5ed3de2c888..d2e89d5b668 100644 --- a/cpp/src/strings/filling/fill.cu +++ b/cpp/src/strings/filling/fill.cu @@ -40,7 +40,7 @@ std::unique_ptr fill( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = strings.size(); - if (strings_count == 0) return detail::make_empty_strings_column(mr, stream.value()); + if (strings_count == 0) return detail::make_empty_strings_column(stream, mr); CUDF_EXPECTS((begin >= 0) && (end <= strings_count), "Parameters [begin,end) are outside the range of the provided strings column"); CUDF_EXPECTS(begin <= end, "Parameters [begin,end) have invalid range values"); @@ -74,13 +74,13 @@ std::unique_ptr fill( auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), offsets_transformer); auto offsets_column = detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream.value()); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().data(); // create the chars column - size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; - auto chars_column = strings::detail::create_chars_child_column( - strings_count, null_count, bytes, mr, stream.value()); + size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; + auto chars_column = + strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); // fill the chars column auto d_chars = chars_column->mutable_view().data(); thrust::for_each_n( diff --git a/cpp/src/strings/filter_chars.cu 
b/cpp/src/strings/filter_chars.cu index 975d84c7875..e75950f2984 100644 --- a/cpp/src/strings/filter_chars.cu +++ b/cpp/src/strings/filter_chars.cu @@ -105,11 +105,11 @@ std::unique_ptr filter_characters( std::vector> characters_to_filter, filter_type keep_characters, string_scalar const& replacement, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); CUDF_EXPECTS(replacement.is_valid(), "Parameter replacement must be valid"); cudf::string_view d_replacement(replacement.data(), replacement.size()); @@ -127,23 +127,22 @@ std::unique_ptr filter_characters( auto d_strings = *strings_column; // create null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // create offsets column filter_fn ffn{d_strings, keep_characters, table.begin(), table.end(), d_replacement}; auto offsets_transformer_itr = thrust::make_transform_iterator(thrust::make_counting_iterator(0), ffn); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); ffn.d_offsets = offsets_column->view().data(); // build chars column size_type bytes = cudf::detail::get_value(offsets_column->view(), strings_count, stream); auto chars_column = strings::detail::create_chars_child_column( - strings_count, strings.null_count(), bytes, mr, stream); + strings_count, strings.null_count(), bytes, stream, mr); ffn.d_chars = chars_column->mutable_view().data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), 
thrust::make_counting_iterator(0), strings_count, ffn); @@ -171,7 +170,7 @@ std::unique_ptr filter_characters( { CUDF_FUNC_RANGE(); return detail::filter_characters( - strings, characters_to_filter, keep_characters, replacement, 0, mr); + strings, characters_to_filter, keep_characters, replacement, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/find.cu b/cpp/src/strings/find.cu index d5a6356e3f1..67c2eff33b3 100644 --- a/cpp/src/strings/find.cu +++ b/cpp/src/strings/find.cu @@ -56,8 +56,8 @@ std::unique_ptr find_fn(strings_column_view const& strings, size_type start, size_type stop, FindFunction& pfn, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(target.is_valid(), "Parameter target must be valid."); CUDF_EXPECTS(start >= 0, "Parameter start must be positive integer or zero."); @@ -77,7 +77,7 @@ std::unique_ptr find_fn(strings_column_view const& strings, auto results_view = results->mutable_view(); auto d_results = results_view.data(); // set the position values by evaluating the passed function - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_results, @@ -99,8 +99,8 @@ std::unique_ptr find( string_scalar const& target, size_type start = 0, size_type stop = -1, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto pfn = [] __device__( string_view d_string, string_view d_target, size_type start, size_type stop) { @@ -111,7 +111,7 @@ std::unique_ptr find( return d_string.find(d_target, begin, end - begin); }; - return find_fn(strings, target, start, stop, pfn, mr, stream); + 
return find_fn(strings, target, start, stop, pfn, stream, mr); } std::unique_ptr rfind( @@ -119,8 +119,8 @@ std::unique_ptr rfind( string_scalar const& target, size_type start = 0, size_type stop = -1, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto pfn = [] __device__( string_view d_string, string_view d_target, size_type start, size_type stop) { @@ -131,7 +131,7 @@ std::unique_ptr rfind( return d_string.rfind(d_target, begin, end - begin); }; - return find_fn(strings, target, start, stop, pfn, mr, stream); + return find_fn(strings, target, start, stop, pfn, stream, mr); } } // namespace detail @@ -145,7 +145,7 @@ std::unique_ptr find(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::find(strings, target, start, stop, mr); + return detail::find(strings, target, start, stop, rmm::cuda_stream_default, mr); } std::unique_ptr rfind(strings_column_view const& strings, @@ -155,7 +155,7 @@ std::unique_ptr rfind(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rfind(strings, target, start, stop, mr); + return detail::rfind(strings, target, start, stop, rmm::cuda_stream_default, mr); } namespace detail { @@ -179,8 +179,8 @@ template std::unique_ptr contains_fn(strings_column_view const& strings, string_scalar const& target, BoolFunction pfn, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto strings_count = strings.size(); if (strings_count == 0) return make_empty_column(data_type{type_id::BOOL8}); @@ -208,7 +208,7 @@ std::unique_ptr contains_fn(strings_column_view const& strings, auto results_view = results->mutable_view(); auto d_results = results_view.data(); 
// set the bool values by evaluating the passed function - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_results, @@ -240,8 +240,8 @@ template std::unique_ptr contains_fn(strings_column_view const& strings, strings_column_view const& targets, BoolFunction pfn, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (strings.is_empty()) return make_empty_column(data_type{type_id::BOOL8}); @@ -263,7 +263,7 @@ std::unique_ptr contains_fn(strings_column_view const& strings, auto d_results = results_view.data(); // set the bool values by evaluating the passed function thrust::transform( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings.size()), d_results, @@ -286,56 +286,56 @@ std::unique_ptr contains_fn(strings_column_view const& strings, std::unique_ptr contains( strings_column_view const& strings, string_scalar const& target, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto pfn = [] __device__(string_view d_string, string_view d_target) { return d_string.find(d_target) >= 0; }; - return contains_fn(strings, target, pfn, mr, stream); + return contains_fn(strings, target, pfn, stream, mr); } std::unique_ptr contains( strings_column_view const& strings, strings_column_view const& targets, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto pfn = [] __device__(string_view 
d_string, string_view d_target) { return d_string.find(d_target) >= 0; }; - return contains_fn(strings, targets, pfn, mr, stream); + return contains_fn(strings, targets, pfn, stream, mr); } std::unique_ptr starts_with( strings_column_view const& strings, string_scalar const& target, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto pfn = [] __device__(string_view d_string, string_view d_target) { return d_string.find(d_target) == 0; }; - return contains_fn(strings, target, pfn, mr, stream); + return contains_fn(strings, target, pfn, stream, mr); } std::unique_ptr starts_with( strings_column_view const& strings, strings_column_view const& targets, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto pfn = [] __device__(string_view d_string, string_view d_target) { return d_string.find(d_target) == 0; }; - return contains_fn(strings, targets, pfn, mr, stream); + return contains_fn(strings, targets, pfn, stream, mr); } std::unique_ptr ends_with( strings_column_view const& strings, string_scalar const& target, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto pfn = [] __device__(string_view d_string, string_view d_target) { auto str_length = d_string.length(); @@ -344,14 +344,14 @@ std::unique_ptr ends_with( return d_string.find(d_target, str_length - tgt_length) >= 0; }; - return contains_fn(strings, target, pfn, mr, stream); + return contains_fn(strings, target, pfn, stream, mr); } std::unique_ptr ends_with( strings_column_view const& strings, 
strings_column_view const& targets, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto pfn = [] __device__(string_view d_string, string_view d_target) { auto str_length = d_string.length(); @@ -360,7 +360,7 @@ std::unique_ptr ends_with( return d_string.find(d_target, str_length - tgt_length) >= 0; }; - return contains_fn(strings, targets, pfn, mr, stream); + return contains_fn(strings, targets, pfn, stream, mr); } } // namespace detail @@ -372,7 +372,7 @@ std::unique_ptr contains(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(strings, target, mr); + return detail::contains(strings, target, rmm::cuda_stream_default, mr); } std::unique_ptr contains(strings_column_view const& strings, @@ -380,7 +380,7 @@ std::unique_ptr contains(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(strings, targets, mr); + return detail::contains(strings, targets, rmm::cuda_stream_default, mr); } std::unique_ptr starts_with(strings_column_view const& strings, @@ -388,7 +388,7 @@ std::unique_ptr starts_with(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::starts_with(strings, target, mr); + return detail::starts_with(strings, target, rmm::cuda_stream_default, mr); } std::unique_ptr starts_with(strings_column_view const& strings, @@ -396,7 +396,7 @@ std::unique_ptr starts_with(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::starts_with(strings, targets, mr); + return detail::starts_with(strings, targets, rmm::cuda_stream_default, mr); } std::unique_ptr ends_with(strings_column_view const& strings, @@ -404,7 +404,7 @@ std::unique_ptr 
ends_with(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ends_with(strings, target, mr); + return detail::ends_with(strings, target, rmm::cuda_stream_default, mr); } std::unique_ptr ends_with(strings_column_view const& strings, @@ -412,7 +412,7 @@ std::unique_ptr ends_with(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ends_with(strings, targets, mr); + return detail::ends_with(strings, targets, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/find_multiple.cu b/cpp/src/strings/find_multiple.cu index 45225b13196..6bcaf7ccea7 100644 --- a/cpp/src/strings/find_multiple.cu +++ b/cpp/src/strings/find_multiple.cu @@ -22,6 +22,8 @@ #include #include +#include + #include namespace cudf { @@ -30,8 +32,8 @@ namespace detail { std::unique_ptr find_multiple( strings_column_view const& strings, strings_column_view const& targets, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = strings.size(); if (strings_count == 0) return make_empty_column(data_type{type_id::INT32}); @@ -55,7 +57,7 @@ std::unique_ptr find_multiple( auto results_view = results->mutable_view(); auto d_results = results_view.data(); // fill output column with position values - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(total_count), d_results, @@ -78,7 +80,7 @@ std::unique_ptr find_multiple(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::find_multiple(strings, targets, mr); + return detail::find_multiple(strings, targets, rmm::cuda_stream_default, mr); } } 
// namespace strings diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index 05b5293e432..578787605ee 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -50,17 +50,16 @@ struct compute_pad_output_length_fn { } // namespace -// std::unique_ptr pad( strings_column_view const& strings, size_type width, pad_side side = pad_side::RIGHT, std::string const& fill_char = " ", - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); CUDF_EXPECTS(!fill_char.empty(), "fill_char parameter must not be empty"); char_utf8 d_fill_char = 0; size_type fill_char_size = to_char_utf8(fill_char.c_str(), d_fill_char); @@ -70,26 +69,25 @@ std::unique_ptr pad( auto d_strings = *strings_column; // create null_mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator(thrust::make_counting_iterator(0), compute_pad_output_length_fn{d_strings, width, fill_char_size}); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = strings::detail::create_chars_child_column( - strings_count, strings.null_count(), bytes, mr, stream); + 
strings_count, strings.null_count(), bytes, stream, mr); auto d_chars = chars_column->mutable_view().data(); if (side == pad_side::LEFT) { thrust::for_each_n( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [d_strings, width, d_fill_char, d_offsets, d_chars] __device__(size_type idx) { @@ -102,7 +100,7 @@ std::unique_ptr pad( }); } else if (side == pad_side::RIGHT) { thrust::for_each_n( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [d_strings, width, d_fill_char, d_offsets, d_chars] __device__(size_type idx) { @@ -115,7 +113,7 @@ std::unique_ptr pad( }); } else if (side == pad_side::BOTH) { thrust::for_each_n( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [d_strings, width, d_fill_char, d_offsets, d_chars] __device__(size_type idx) { @@ -131,7 +129,7 @@ std::unique_ptr pad( while (right_pad-- > 0) ptr += from_char_utf8(d_fill_char, ptr); }); } - // + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -149,34 +147,33 @@ std::unique_ptr pad( std::unique_ptr zfill( strings_column_view const& strings, size_type width, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // copy bitmask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // build 
offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), compute_pad_output_length_fn{d_strings, width, 1}); // fillchar is 1 byte auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = strings::detail::create_chars_child_column( - strings_count, strings.null_count(), bytes, mr, stream); + strings_count, strings.null_count(), bytes, stream, mr); auto d_chars = chars_column->mutable_view().data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [d_strings, width, d_offsets, d_chars] __device__(size_type idx) { @@ -199,7 +196,7 @@ std::unique_ptr zfill( } // namespace detail -// external APIs +// Public APIs std::unique_ptr pad(strings_column_view const& strings, size_type width, @@ -208,7 +205,7 @@ std::unique_ptr pad(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::pad(strings, width, side, fill_char, mr); + return detail::pad(strings, width, side, fill_char, rmm::cuda_stream_default, mr); } std::unique_ptr zfill(strings_column_view const& strings, @@ -216,7 +213,7 @@ std::unique_ptr zfill(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::zfill(strings, width, mr); + return detail::zfill(strings, width, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/regex/regex.cuh b/cpp/src/strings/regex/regex.cuh index 6e9e92844c5..6e03c183a8d 100644 --- a/cpp/src/strings/regex/regex.cuh +++ b/cpp/src/strings/regex/regex.cuh @@ 
-15,16 +15,20 @@ */ #pragma once -#include #include + +#include + #include #include namespace cudf { + class string_view; namespace strings { namespace detail { + struct reljunk; struct reinst; class reprog; @@ -76,7 +80,7 @@ class reprog_device { std::string const& pattern, const uint8_t* cp_flags, int32_t strings_count, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream); /** * @brief Called automatically by the unique_ptr returned from create(). */ diff --git a/cpp/src/strings/regex/regexec.cu b/cpp/src/strings/regex/regexec.cu index 46f7904410b..8089244803e 100644 --- a/cpp/src/strings/regex/regexec.cu +++ b/cpp/src/strings/regex/regexec.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,9 +15,12 @@ */ #include +#include + #include + +#include #include -#include namespace cudf { namespace strings { @@ -72,7 +75,7 @@ std::unique_ptr> reprog_devic std::string const& pattern, const uint8_t* codepoint_flags, size_type strings_count, - cudaStream_t stream) + rmm::cuda_stream_view stream) { std::vector pattern32 = string_to_char32_vector(pattern); // compile pattern into host object @@ -148,7 +151,8 @@ std::unique_ptr> reprog_devic } // copy flat prog to device memory - CUDA_TRY(cudaMemcpy(d_buffer->data(), h_buffer.data(), memsize, cudaMemcpyHostToDevice)); + CUDA_TRY(cudaMemcpyAsync( + d_buffer->data(), h_buffer.data(), memsize, cudaMemcpyHostToDevice, stream.value())); // auto deleter = [d_buffer, d_relists](reprog_device* t) { t->destroy(); diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index 3b3043d37d0..95f9ecbe2ef 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -14,21 +14,24 @@ * limitations under the License. 
*/ +#include "backref_re.cuh" + +#include +#include + #include #include #include +#include #include -#include #include #include #include #include -#include -#include -#include +#include -#include "backref_re.cuh" +#include namespace cudf { namespace strings { @@ -81,11 +84,11 @@ std::unique_ptr replace_with_backrefs( strings_column_view const& strings, std::string const& pattern, std::string const& repl, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); CUDF_EXPECTS(!repl.empty(), "Parameter repl must not be empty"); @@ -105,7 +108,7 @@ std::unique_ptr replace_with_backrefs( string_view d_repl_template{repl_scalar.data(), repl_scalar.size()}; // copy null mask - auto null_mask = copy_bitmask(strings.parent()); + auto null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); auto null_count = strings.null_count(); // create child columns @@ -118,14 +121,14 @@ std::unique_ptr replace_with_backrefs( d_strings, d_prog, d_repl_template, backrefs.begin(), backrefs.end()}, strings_count, null_count, - mr, - stream); + stream, + mr); } else if (regex_insts <= RX_MEDIUM_INSTS) children = replace_with_backrefs_medium( - d_strings, d_prog, d_repl_template, backrefs, null_count, mr, stream); + d_strings, d_prog, d_repl_template, backrefs, null_count, stream, mr); else children = replace_with_backrefs_large( - d_strings, d_prog, d_repl_template, backrefs, null_count, mr, stream); + d_strings, d_prog, d_repl_template, backrefs, null_count, stream, mr); return make_strings_column(strings_count, std::move(children.first), @@ -146,7 +149,7 @@ 
std::unique_ptr replace_with_backrefs(strings_column_view const& strings rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_with_backrefs(strings, pattern, repl, mr); + return detail::replace_with_backrefs(strings, pattern, repl, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/replace/backref_re.cuh b/cpp/src/strings/replace/backref_re.cuh index bf0644c65ee..d5bec759528 100644 --- a/cpp/src/strings/replace/backref_re.cuh +++ b/cpp/src/strings/replace/backref_re.cuh @@ -20,6 +20,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -118,16 +120,16 @@ children_pair replace_with_backrefs_medium(column_device_view const& d_strings, string_view const& d_repl_template, rmm::device_vector& backrefs, size_type null_count, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); children_pair replace_with_backrefs_large(column_device_view const& d_strings, reprog_device& d_prog, string_view const& d_repl_template, rmm::device_vector& backrefs, size_type null_count, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace strings diff --git a/cpp/src/strings/replace/backref_re_large.cu b/cpp/src/strings/replace/backref_re_large.cu index 5e7b2b3c2fc..0b078132623 100644 --- a/cpp/src/strings/replace/backref_re_large.cu +++ b/cpp/src/strings/replace/backref_re_large.cu @@ -14,9 +14,11 @@ * limitations under the License. 
*/ +#include "backref_re.cuh" + #include -#include "backref_re.cuh" +#include namespace cudf { namespace strings { @@ -28,16 +30,16 @@ children_pair replace_with_backrefs_large(column_device_view const& d_strings, string_view const& d_repl_template, rmm::device_vector& backrefs, size_type null_count, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return make_strings_children( backrefs_fn{ d_strings, d_prog, d_repl_template, backrefs.begin(), backrefs.end()}, d_strings.size(), null_count, - mr, - stream); + stream, + mr); } } // namespace detail diff --git a/cpp/src/strings/replace/backref_re_medium.cu b/cpp/src/strings/replace/backref_re_medium.cu index e75494e8c55..899e0cb2a3e 100644 --- a/cpp/src/strings/replace/backref_re_medium.cu +++ b/cpp/src/strings/replace/backref_re_medium.cu @@ -14,9 +14,11 @@ * limitations under the License. */ +#include "backref_re.cuh" + #include -#include "backref_re.cuh" +#include namespace cudf { namespace strings { @@ -28,16 +30,16 @@ children_pair replace_with_backrefs_medium(column_device_view const& d_strings, string_view const& d_repl_template, rmm::device_vector& backrefs, size_type null_count, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return make_strings_children( backrefs_fn{ d_strings, d_prog, d_repl_template, backrefs.begin(), backrefs.end()}, d_strings.size(), null_count, - mr, - stream); + stream, + mr); } } // namespace detail diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index d43dff4548c..81f1c694716 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -14,6 +14,10 @@ * limitations under the License. 
*/ +#include +#include +#include + #include #include #include @@ -23,9 +27,8 @@ #include #include #include -#include -#include -#include + +#include namespace cudf { namespace strings { @@ -127,16 +130,15 @@ struct replace_multi_regex_fn { } // namespace -// std::unique_ptr replace_re( strings_column_view const& strings, std::vector const& patterns, strings_column_view const& repls, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); if (patterns.empty()) // no patterns; just return a copy return std::make_unique(strings.parent()); @@ -177,25 +179,25 @@ std::unique_ptr replace_re( d_strings, d_progs, static_cast(progs.size()), d_found_ranges, d_repls}, strings_count, null_count, - mr, - stream); + stream, + mr); else if (regex_insts <= RX_MEDIUM_INSTS) children = make_strings_children( replace_multi_regex_fn{ d_strings, d_progs, static_cast(progs.size()), d_found_ranges, d_repls}, strings_count, null_count, - mr, - stream); + stream, + mr); else children = make_strings_children( replace_multi_regex_fn{ d_strings, d_progs, static_cast(progs.size()), d_found_ranges, d_repls}, strings_count, null_count, - mr, - stream); - // + stream, + mr); + return make_strings_column(strings_count, std::move(children.first), std::move(children.second), @@ -215,7 +217,7 @@ std::unique_ptr replace_re(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_re(strings, patterns, repls, mr); + return detail::replace_re(strings, patterns, repls, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/replace/replace.cu 
b/cpp/src/strings/replace/replace.cu index a1aca664e25..7a22d70d4d3 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -99,7 +99,7 @@ std::unique_ptr replace(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream.value()); + if (strings_count == 0) return make_empty_strings_column(stream, mr); CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid."); CUDF_EXPECTS(target.is_valid(), "Parameter target must be valid."); CUDF_EXPECTS(target.size() > 0, "Parameter target must not be empty string."); @@ -117,20 +117,20 @@ std::unique_ptr replace(strings_column_view const& strings, thrust::make_counting_iterator(0), replace_fn{d_strings, d_target, d_repl, maxrepl}); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream.value()); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = - create_chars_child_column(strings_count, strings.null_count(), bytes, mr, stream.value()); + create_chars_child_column(strings_count, strings.null_count(), bytes, stream, mr); auto d_chars = chars_column->mutable_view().data(); thrust::for_each_n( rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, replace_fn{d_strings, d_target, d_repl, maxrepl, d_offsets, d_chars}); - // + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -187,7 +187,7 @@ std::unique_ptr replace_slice(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream.value()); + if 
(strings_count == 0) return make_empty_strings_column(stream, mr); CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid."); if (stop > 0) CUDF_EXPECTS(start <= stop, "Parameter start must be less than or equal to stop."); @@ -197,21 +197,20 @@ std::unique_ptr replace_slice(strings_column_view const& strings, auto d_strings = *strings_column; // copy the null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), replace_slice_fn{d_strings, d_repl, start, stop}); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream.value()); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto offsets_view = offsets_column->view(); auto d_offsets = offsets_view.data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = - create_chars_child_column(strings_count, strings.null_count(), bytes, mr, stream.value()); + create_chars_child_column(strings_count, strings.null_count(), bytes, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.data(); thrust::for_each_n( @@ -291,7 +290,7 @@ std::unique_ptr replace(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { auto strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream.value()); + if (strings_count == 0) return make_empty_strings_column(stream, mr); CUDF_EXPECTS(((targets.size() > 0) && (targets.null_count() == 0)), "Parameters targets must not be empty and must not have nulls"); CUDF_EXPECTS(((repls.size() > 0) && (repls.null_count() == 0)), @@ -313,20 +312,20 @@ std::unique_ptr 
replace(strings_column_view const& strings, thrust::make_counting_iterator(0), replace_multi_fn{d_strings, d_targets, d_repls}); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream.value()); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = - create_chars_child_column(strings_count, strings.null_count(), bytes, mr, stream.value()); + create_chars_child_column(strings_count, strings.null_count(), bytes, stream, mr); auto d_chars = chars_column->mutable_view().data(); thrust::for_each_n( rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, replace_multi_fn{d_strings, d_targets, d_repls, d_offsets, d_chars}); - // + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -342,7 +341,7 @@ std::unique_ptr replace_nulls(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream.value()); + if (strings_count == 0) return make_empty_strings_column(stream, mr); CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid."); string_view d_repl(repl.data(), repl.size()); @@ -357,13 +356,13 @@ std::unique_ptr replace_nulls(strings_column_view const& strings, : d_strings.element(idx).size_bytes(); }); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream.value()); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = 
strings::detail::create_chars_child_column( - strings_count, strings.null_count(), bytes, mr, stream.value()); + strings_count, strings.null_count(), bytes, stream, mr); auto d_chars = chars_column->mutable_view().data(); thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu index 4f6ec56d213..4eff05ba7b7 100644 --- a/cpp/src/strings/replace/replace_re.cu +++ b/cpp/src/strings/replace/replace_re.cu @@ -14,18 +14,21 @@ * limitations under the License. */ +#include +#include +#include + #include #include #include +#include #include -#include #include #include #include #include -#include -#include -#include + +#include namespace cudf { namespace strings { @@ -105,11 +108,11 @@ std::unique_ptr replace_re( std::string const& pattern, string_scalar const& repl = string_scalar(""), size_type maxrepl = -1, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid"); string_view d_repl(repl.data(), repl.size()); @@ -122,7 +125,7 @@ std::unique_ptr replace_re( auto regex_insts = d_prog.insts_counts(); // copy null mask - auto null_mask = copy_bitmask(strings.parent()); + auto null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); auto null_count = strings.null_count(); // create child columns @@ -134,22 +137,22 @@ std::unique_ptr replace_re( make_strings_children(replace_regex_fn{d_strings, d_prog, d_repl, maxrepl}, strings_count, null_count, - mr, - stream); + stream, + mr); else if (regex_insts <= 
RX_MEDIUM_INSTS) children = make_strings_children(replace_regex_fn{d_strings, d_prog, d_repl, maxrepl}, strings_count, null_count, - mr, - stream); + stream, + mr); else children = make_strings_children(replace_regex_fn{d_strings, d_prog, d_repl, maxrepl}, strings_count, null_count, - mr, - stream); + stream, + mr); return make_strings_column(strings_count, std::move(children.first), @@ -171,7 +174,7 @@ std::unique_ptr replace_re(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_re(strings, pattern, repl, maxrepl, mr); + return detail::replace_re(strings, pattern, repl, maxrepl, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/split/partition.cu b/cpp/src/strings/split/partition.cu index 6102de7335e..14c0e754abd 100644 --- a/cpp/src/strings/split/partition.cu +++ b/cpp/src/strings/split/partition.cu @@ -24,6 +24,8 @@ #include #include +#include + #include namespace cudf { @@ -176,8 +178,8 @@ struct rpartition_fn : public partition_fn { std::unique_ptr
partition( strings_column_view const& strings, string_scalar const& delimiter = string_scalar(""), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(delimiter.is_valid(), "Parameter delimiter must be valid"); auto strings_count = strings.size(); @@ -189,7 +191,7 @@ std::unique_ptr
partition( partition_fn partitioner( *strings_column, d_delimiter, left_indices, delim_indices, right_indices); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, partitioner); @@ -203,8 +205,8 @@ std::unique_ptr
partition( std::unique_ptr
rpartition( strings_column_view const& strings, string_scalar const& delimiter = string_scalar(""), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(delimiter.is_valid(), "Parameter delimiter must be valid"); auto strings_count = strings.size(); @@ -215,7 +217,7 @@ std::unique_ptr
rpartition( right_indices(strings_count); rpartition_fn partitioner( *strings_column, d_delimiter, left_indices, delim_indices, right_indices); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, partitioner); @@ -236,7 +238,7 @@ std::unique_ptr
partition(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::partition(strings, delimiter, mr); + return detail::partition(strings, delimiter, rmm::cuda_stream_default, mr); } std::unique_ptr
rpartition(strings_column_view const& strings, @@ -244,7 +246,7 @@ std::unique_ptr
rpartition(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rpartition(strings, delimiter, mr); + return detail::rpartition(strings, delimiter, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu index fb0efa1131c..61d7adf8674 100644 --- a/cpp/src/strings/split/split.cu +++ b/cpp/src/strings/split/split.cu @@ -14,6 +14,8 @@ * limitations under the License. */ +#include + #include #include #include @@ -26,7 +28,7 @@ #include #include -#include +#include #include // upper_bound() #include // copy_if() @@ -422,13 +424,13 @@ struct rsplit_tokenizer_fn : base_split_tokenizer { template std::unique_ptr
split_fn(strings_column_view const& strings_column, Tokenizer tokenizer, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { std::vector> results; auto strings_count = strings_column.size(); if (strings_count == 0) { - results.push_back(make_empty_strings_column(mr, stream)); + results.push_back(make_empty_strings_column(stream, mr)); return std::make_unique
(std::move(results)); } @@ -440,7 +442,7 @@ std::unique_ptr
split_fn(strings_column_view const& strings_column, // count the number of delimiters in the entire column size_type delimiter_count = - thrust::count_if(execpol->on(stream), + thrust::count_if(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(chars_bytes), [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) { @@ -450,7 +452,7 @@ std::unique_ptr
split_fn(strings_column_view const& strings_column, // create vector of every delimiter position in the chars column rmm::device_vector delimiter_positions(delimiter_count); auto d_positions = delimiter_positions.data().get(); - auto copy_end = thrust::copy_if(execpol->on(stream), + auto copy_end = thrust::copy_if(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(chars_bytes), delimiter_positions.begin(), @@ -461,7 +463,7 @@ std::unique_ptr
split_fn(strings_column_view const& strings_column, // create vector of string indices for each delimiter rmm::device_vector string_indices(delimiter_count); // these will be strings that auto d_string_indices = string_indices.data().get(); // only contain delimiters - thrust::upper_bound(execpol->on(stream), + thrust::upper_bound(execpol->on(stream.value()), d_offsets, d_offsets + strings_count, delimiter_positions.begin(), @@ -472,7 +474,7 @@ std::unique_ptr
split_fn(strings_column_view const& strings_column, rmm::device_vector token_counts(strings_count); auto d_token_counts = token_counts.data().get(); // first, initialize token counts for strings without delimiters in them - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_token_counts, @@ -482,7 +484,7 @@ std::unique_ptr
split_fn(strings_column_view const& strings_column, }); // now compute the number of tokens in each string thrust::for_each_n( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), delimiter_count, [tokenizer, d_positions, delimiter_count, d_string_indices, d_token_counts] __device__( @@ -492,7 +494,7 @@ std::unique_ptr
split_fn(strings_column_view const& strings_column, // the columns_count is the maximum number of tokens for any string size_type columns_count = - *thrust::max_element(execpol->on(stream), token_counts.begin(), token_counts.end()); + *thrust::max_element(execpol->on(stream.value()), token_counts.begin(), token_counts.end()); // boundary case: if no columns, return one null column (custrings issue #119) if (columns_count == 0) { results.push_back(std::make_unique( @@ -508,7 +510,7 @@ std::unique_ptr
split_fn(strings_column_view const& strings_column, string_index_pair* d_tokens = tokens.data().get(); // initialize the token positions // -- accounts for nulls, empty, and strings with no delimiter in them - thrust::for_each_n(execpol->on(stream), + thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [tokenizer, columns_count, d_tokens] __device__(size_type idx) { @@ -516,7 +518,7 @@ std::unique_ptr
split_fn(strings_column_view const& strings_column, }); // get the positions for every token using the delimiter positions - thrust::for_each_n(execpol->on(stream), + thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), delimiter_count, [tokenizer, @@ -541,7 +543,7 @@ std::unique_ptr
split_fn(strings_column_view const& strings_column, for (size_type col = 0; col < columns_count; ++col) { auto column_tokens = d_tokens + (col * strings_count); results.emplace_back( - make_strings_column(column_tokens, column_tokens + strings_count, mr, stream)); + make_strings_column(column_tokens, column_tokens + strings_count, stream, mr)); } return std::make_unique
(std::move(results)); } @@ -742,8 +744,8 @@ struct whitespace_rsplit_tokenizer_fn : base_whitespace_split_tokenizer { template std::unique_ptr
whitespace_split_fn(size_type strings_count, Tokenizer tokenizer, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto execpol = rmm::exec_policy(stream); @@ -753,14 +755,14 @@ std::unique_ptr
whitespace_split_fn(size_type strings_count, auto d_token_counts = token_counts.data().get(); if (strings_count > 0) { thrust::transform( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_token_counts, [tokenizer] __device__(size_type idx) { return tokenizer.count_tokens(idx); }); // column count is the maximum number of tokens for any string columns_count = - *thrust::max_element(execpol->on(stream), token_counts.begin(), token_counts.end()); + *thrust::max_element(execpol->on(stream.value()), token_counts.begin(), token_counts.end()); } std::vector> results; @@ -777,12 +779,12 @@ std::unique_ptr
whitespace_split_fn(size_type strings_count, // get the positions for every token rmm::device_vector tokens(columns_count * strings_count); string_index_pair* d_tokens = tokens.data().get(); - thrust::fill(execpol->on(stream), + thrust::fill(execpol->on(stream.value()), d_tokens, d_tokens + (columns_count * strings_count), string_index_pair{nullptr, 0}); thrust::for_each_n( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [tokenizer, columns_count, d_token_counts, d_tokens] __device__(size_type idx) { @@ -795,7 +797,7 @@ std::unique_ptr
whitespace_split_fn(size_type strings_count, for (size_type col = 0; col < columns_count; ++col) { auto column_tokens = d_tokens + (col * strings_count); results.emplace_back( - make_strings_column(column_tokens, column_tokens + strings_count, mr, stream)); + make_strings_column(column_tokens, column_tokens + strings_count, stream, mr)); } return std::make_unique
(std::move(results)); } @@ -806,8 +808,8 @@ std::unique_ptr
split( strings_column_view const& strings_column, string_scalar const& delimiter = string_scalar(""), size_type maxsplit = -1, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(delimiter.is_valid(), "Parameter delimiter must be valid"); @@ -818,21 +820,21 @@ std::unique_ptr
split( if (delimiter.size() == 0) { return whitespace_split_fn(strings_column.size(), whitespace_split_tokenizer_fn{*strings_device_view, max_tokens}, - mr, - stream); + stream, + mr); } string_view d_delimiter(delimiter.data(), delimiter.size()); return split_fn( - strings_column, split_tokenizer_fn{*strings_device_view, d_delimiter, max_tokens}, mr, stream); + strings_column, split_tokenizer_fn{*strings_device_view, d_delimiter, max_tokens}, stream, mr); } std::unique_ptr
rsplit( strings_column_view const& strings_column, string_scalar const& delimiter = string_scalar(""), size_type maxsplit = -1, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(delimiter.is_valid(), "Parameter delimiter must be valid"); @@ -843,13 +845,13 @@ std::unique_ptr
rsplit( if (delimiter.size() == 0) { return whitespace_split_fn(strings_column.size(), whitespace_rsplit_tokenizer_fn{*strings_device_view, max_tokens}, - mr, - stream); + stream, + mr); } string_view d_delimiter(delimiter.data(), delimiter.size()); return split_fn( - strings_column, rsplit_tokenizer_fn{*strings_device_view, d_delimiter, max_tokens}, mr, stream); + strings_column, rsplit_tokenizer_fn{*strings_device_view, d_delimiter, max_tokens}, stream, mr); } } // namespace detail @@ -862,7 +864,7 @@ std::unique_ptr
split(strings_column_view const& strings_column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::split(strings_column, delimiter, maxsplit, mr); + return detail::split(strings_column, delimiter, maxsplit, rmm::cuda_stream_default, mr); } std::unique_ptr
rsplit(strings_column_view const& strings_column, @@ -871,7 +873,7 @@ std::unique_ptr
rsplit(strings_column_view const& strings_column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rsplit(strings_column, delimiter, maxsplit, mr); + return detail::rsplit(strings_column, delimiter, maxsplit, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu index 6a88809ea92..8cd5ed1fd1f 100644 --- a/cpp/src/strings/split/split_record.cu +++ b/cpp/src/strings/split/split_record.cu @@ -14,6 +14,8 @@ * limitations under the License. */ +#include + #include #include #include @@ -23,7 +25,8 @@ #include #include #include -#include + +#include #include #include @@ -221,21 +224,23 @@ template std::unique_ptr split_record_fn(strings_column_view const& strings, TokenCounter counter, TokenReader reader, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // create offsets column by counting the number of tokens per string auto strings_count = strings.size(); auto offsets = make_numeric_column( data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); auto d_offsets = offsets->mutable_view().data(); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_offsets, counter); - thrust::exclusive_scan( - rmm::exec_policy(stream)->on(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); + thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream.value()), + d_offsets, + d_offsets + strings_count + 1, + d_offsets); // last entry is the total number of tokens to be generated auto total_tokens = cudf::detail::get_value(offsets->view(), strings_count, stream); @@ -243,12 +248,12 @@ std::unique_ptr split_record_fn(strings_column_view const& strings, rmm::device_vector tokens(total_tokens); 
reader.d_token_offsets = d_offsets; reader.d_tokens = tokens.data().get(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, reader); // convert the index-pairs into one big strings column - auto strings_output = make_strings_column(tokens.begin(), tokens.end(), mr, stream); + auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr); // create a lists column using the offsets and the strings columns return make_lists_column(strings_count, std::move(offsets), @@ -262,8 +267,8 @@ std::unique_ptr split_record( strings_column_view const& strings, string_scalar const& delimiter = string_scalar(""), size_type maxsplit = -1, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(delimiter.is_valid(), "Parameter delimiter must be valid"); @@ -275,15 +280,15 @@ std::unique_ptr split_record( return split_record_fn(strings, whitespace_token_counter_fn{*d_strings_column_ptr, max_tokens}, whitespace_token_reader_fn{*d_strings_column_ptr, max_tokens}, - mr, - stream); + stream, + mr); } else { string_view d_delimiter(delimiter.data(), delimiter.size()); return split_record_fn(strings, token_counter_fn{*d_strings_column_ptr, d_delimiter, max_tokens}, token_reader_fn{*d_strings_column_ptr, d_delimiter}, - mr, - stream); + stream, + mr); } } @@ -297,7 +302,8 @@ std::unique_ptr split_record(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::split_record(strings, delimiter, maxsplit, mr, 0); + return detail::split_record( + strings, delimiter, maxsplit, rmm::cuda_stream_default, mr); } std::unique_ptr rsplit_record(strings_column_view const& strings, @@ -306,7 +312,8 @@ 
std::unique_ptr rsplit_record(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::split_record(strings, delimiter, maxsplit, mr, 0); + return detail::split_record( + strings, delimiter, maxsplit, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu index 60da9b682ec..2e387d91d2b 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ b/cpp/src/strings/strings_column_factories.cu @@ -28,32 +28,32 @@ #include #include -// clang-format off namespace cudf { // Create a strings-type column from vector of pointer/size pairs std::unique_ptr make_strings_column( const rmm::device_vector>& strings, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) { + rmm::mr::device_memory_resource* mr) +{ CUDF_FUNC_RANGE(); size_type strings_count = strings.size(); - if (strings_count == 0) return strings::detail::make_empty_strings_column(mr, stream.value()); + if (strings_count == 0) return strings::detail::make_empty_strings_column(stream, mr); auto execpol = rmm::exec_policy(stream); auto d_strings = strings.data().get(); // check total size is not too large for cudf column - size_t bytes = thrust::transform_reduce( - execpol->on(stream.value()), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - [d_strings] __device__(size_t idx) { - auto item = d_strings[idx]; - return (item.first != nullptr) ? item.second : 0; - }, - 0, - thrust::plus()); + auto size_checker = [d_strings] __device__(size_t idx) { + auto item = d_strings[idx]; + return (item.first != nullptr) ? 
item.second : 0; + }; + size_t bytes = thrust::transform_reduce(execpol->on(stream.value()), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + size_checker, + 0, + thrust::plus()); CUDF_EXPECTS(bytes < std::numeric_limits::max(), "total size of strings is too large for cudf column"); @@ -65,7 +65,7 @@ std::unique_ptr make_strings_column( auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), offsets_transformer); auto offsets_column = strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream.value()); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto offsets_view = offsets_column->view(); auto d_offsets = offsets_view.data(); @@ -82,7 +82,7 @@ std::unique_ptr make_strings_column( // build chars column auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, mr, stream.value()); + strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.data(); thrust::for_each_n(execpol->on(stream.value()), @@ -107,7 +107,8 @@ std::unique_ptr make_strings_column( struct string_view_to_pair { string_view null_placeholder; string_view_to_pair(string_view n) : null_placeholder(n) {} - __device__ thrust::pair operator()(const string_view& i) { + __device__ thrust::pair operator()(const string_view& i) + { return (i.data() == null_placeholder.data()) ? 
thrust::pair{nullptr, 0} : thrust::pair{i.data(), i.size_bytes()}; @@ -118,7 +119,8 @@ struct string_view_to_pair { std::unique_ptr make_strings_column(const rmm::device_vector& string_views, const string_view null_placeholder, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) { + rmm::mr::device_memory_resource* mr) +{ auto it_pair = thrust::make_transform_iterator(string_views.begin(), string_view_to_pair{null_placeholder}); const rmm::device_vector> dev_strings( @@ -132,10 +134,11 @@ std::unique_ptr make_strings_column(const rmm::device_vector& stri const rmm::device_vector& valid_mask, size_type null_count, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) { + rmm::mr::device_memory_resource* mr) +{ CUDF_FUNC_RANGE(); size_type num_strings = offsets.size() - 1; - if (num_strings == 0) return strings::detail::make_empty_strings_column(mr, stream.value()); + if (num_strings == 0) return strings::detail::make_empty_strings_column(stream, mr); CUDF_EXPECTS(null_count < num_strings, "null strings column not yet supported"); if (null_count > 0) { @@ -147,8 +150,8 @@ std::unique_ptr make_strings_column(const rmm::device_vector& stri CUDF_EXPECTS(bytes >= 0, "invalid offsets vector"); // build offsets column -- this is the number of strings + 1 - auto offsets_column = - make_numeric_column(data_type{type_id::INT32}, num_strings + 1, mask_state::UNALLOCATED, stream, mr); + auto offsets_column = make_numeric_column( + data_type{type_id::INT32}, num_strings + 1, mask_state::UNALLOCATED, stream, mr); auto offsets_view = offsets_column->mutable_view(); CUDA_TRY(cudaMemcpyAsync(offsets_view.data(), offsets.data().get(), @@ -159,17 +162,20 @@ std::unique_ptr make_strings_column(const rmm::device_vector& stri rmm::device_buffer null_mask{ valid_mask.data().get(), valid_mask.size() * - sizeof( - bitmask_type)}; // Or this works too: sizeof(typename std::remove_reference_t::value_type) - // Following give the incorrect value of 8 instead of 
4 because of smart references: - // sizeof(valid_mask[0]), sizeof(decltype(valid_mask.front())) + sizeof(bitmask_type)}; // Or this works too: sizeof(typename + // std::remove_reference_t::value_type) + // Following give the incorrect value of 8 instead of 4 because of smart references: + // sizeof(valid_mask[0]), sizeof(decltype(valid_mask.front())) // build chars column auto chars_column = - strings::detail::create_chars_child_column(num_strings, null_count, bytes, mr, stream.value()); + strings::detail::create_chars_child_column(num_strings, null_count, bytes, stream, mr); auto chars_view = chars_column->mutable_view(); - CUDA_TRY(cudaMemcpyAsync( - chars_view.data(), strings.data().get(), bytes, cudaMemcpyDeviceToDevice, stream.value())); + CUDA_TRY(cudaMemcpyAsync(chars_view.data(), + strings.data().get(), + bytes, + cudaMemcpyDeviceToDevice, + stream.value())); return make_strings_column(num_strings, std::move(offsets_column), @@ -186,7 +192,8 @@ std::unique_ptr make_strings_column(const std::vector& strings, const std::vector& null_mask, size_type null_count, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) { + rmm::mr::device_memory_resource* mr) +{ rmm::device_vector d_strings{strings}; rmm::device_vector d_offsets{offsets}; rmm::device_vector d_null_mask{null_mask}; @@ -201,7 +208,8 @@ std::unique_ptr make_strings_column(size_type num_strings, size_type null_count, rmm::device_buffer&& null_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) { + rmm::mr::device_memory_resource* mr) +{ if (null_count > 0) CUDF_EXPECTS(null_mask.size() > 0, "Column with nulls must be nullable."); CUDF_EXPECTS(num_strings == offsets_column->size() - 1, "Invalid offsets column size for strings column."); @@ -220,4 +228,3 @@ std::unique_ptr make_strings_column(size_type num_strings, } } // namespace cudf -// clang-format on TODO fix diff --git a/cpp/src/strings/strings_column_view.cu b/cpp/src/strings/strings_column_view.cu index 
679ac6a2bb5..106f133229b 100644 --- a/cpp/src/strings/strings_column_view.cu +++ b/cpp/src/strings/strings_column_view.cu @@ -20,9 +20,12 @@ #include #include +#include + #include #include #include + #include namespace cudf { @@ -130,30 +133,36 @@ void print(strings_column_view const& strings, // std::pair, rmm::device_vector> create_offsets( - strings_column_view const& strings, cudaStream_t stream, rmm::mr::device_memory_resource* mr) + strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); size_type count = strings.size(); const int32_t* d_offsets = strings.offsets().data(); d_offsets += strings.offset(); // nvbug-2808421 : do not combine with the previous line int32_t first = 0; - CUDA_TRY(cudaMemcpyAsync(&first, d_offsets, sizeof(int32_t), cudaMemcpyDeviceToHost, stream)); + CUDA_TRY( + cudaMemcpyAsync(&first, d_offsets, sizeof(int32_t), cudaMemcpyDeviceToHost, stream.value())); rmm::device_vector offsets(count + 1); // normalize the offset values for the column offset thrust::transform( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), d_offsets, d_offsets + count + 1, offsets.begin(), [first] __device__(int32_t offset) { return static_cast(offset - first); }); // copy the chars column data int32_t bytes = 0; // last offset entry is the size in bytes - CUDA_TRY( - cudaMemcpyAsync(&bytes, d_offsets + count, sizeof(int32_t), cudaMemcpyDeviceToHost, stream)); + CUDA_TRY(cudaMemcpyAsync( + &bytes, d_offsets + count, sizeof(int32_t), cudaMemcpyDeviceToHost, stream.value())); + stream.synchronize(); + bytes -= first; const char* d_chars = strings.chars().data() + first; rmm::device_vector chars(bytes); - CUDA_TRY(cudaMemcpyAsync(chars.data().get(), d_chars, bytes, cudaMemcpyDeviceToHost, stream)); + CUDA_TRY( + cudaMemcpyAsync(chars.data().get(), d_chars, bytes, cudaMemcpyDeviceToHost, stream.value())); // return offsets and chars return 
std::make_pair(std::move(chars), std::move(offsets)); } diff --git a/cpp/src/strings/strings_scalar_factories.cpp b/cpp/src/strings/strings_scalar_factories.cpp index d3256e4ccb8..9c7f905cb0b 100644 --- a/cpp/src/strings/strings_scalar_factories.cpp +++ b/cpp/src/strings/strings_scalar_factories.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,10 +16,12 @@ #include +#include + namespace cudf { // Create a strings-type column from array of pointer/size pairs std::unique_ptr make_string_scalar(std::string const& string, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto s = new string_scalar(string, true, stream, mr); diff --git a/cpp/src/strings/strip.cu b/cpp/src/strings/strip.cu index ea5a2d8ef69..907999bf50d 100644 --- a/cpp/src/strings/strip.cu +++ b/cpp/src/strings/strip.cu @@ -106,11 +106,11 @@ std::unique_ptr strip( strings_column_view const& strings, strip_type stype = strip_type::BOTH, string_scalar const& to_strip = string_scalar(""), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = strings.size(); - if (strings_count == 0) return detail::make_empty_strings_column(mr, stream); + if (strings_count == 0) return detail::make_empty_strings_column(stream, mr); CUDF_EXPECTS(to_strip.is_valid(), "Parameter to_strip must be valid"); string_view d_to_strip(to_strip.data(), to_strip.size()); @@ -121,27 +121,26 @@ std::unique_ptr strip( size_type null_count = strings.null_count(); // copy null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), 
rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // build offsets column -- calculate the size of each output string auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), strip_fn{d_column, stype, d_to_strip}); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto offsets_view = offsets_column->view(); auto d_offsets = offsets_view.data(); // build the chars column -- convert characters based on case_flag parameter size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; - auto chars_column = create_chars_child_column(strings_count, null_count, bytes, mr, stream); + auto chars_column = create_chars_child_column(strings_count, null_count, bytes, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.data(); - thrust::for_each_n(execpol->on(stream), + thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, strip_fn{d_column, stype, d_to_strip, d_offsets, d_chars}); - // + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -161,7 +160,7 @@ std::unique_ptr strip(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::strip(strings, stype, to_strip, mr); + return detail::strip(strings, stype, to_strip, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/substring.cu b/cpp/src/strings/substring.cu index 7cc039b5141..2bb5723dc9b 100644 --- a/cpp/src/strings/substring.cu +++ b/cpp/src/strings/substring.cu @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include + #include #include #include @@ -28,7 +30,7 @@ #include #include -#include +#include namespace cudf { namespace strings { @@ -98,11 +100,11 @@ std::unique_ptr slice_strings( numeric_scalar const& start = numeric_scalar(0, false), numeric_scalar const& stop = numeric_scalar(0, false), numeric_scalar const& step = numeric_scalar(1), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); if (step.is_valid()) CUDF_EXPECTS(step.value(stream) != 0, "Step parameter must not be 0"); @@ -113,22 +115,21 @@ std::unique_ptr slice_strings( auto d_step = get_scalar_device_view(const_cast&>(step)); // copy the null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), substring_fn{d_column, d_start, d_stop, d_step}); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_new_offsets = offsets_column->view().data(); // build chars column auto bytes = cudf::detail::get_value(offsets_column->view(), strings_count, stream); auto chars_column = strings::detail::create_chars_child_column( - strings_count, strings.null_count(), bytes, mr, stream); + strings_count, strings.null_count(), bytes, stream, mr); auto d_chars = chars_column->mutable_view().data(); - 
thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, substring_fn{d_column, d_start, d_stop, d_step, d_new_offsets, d_chars}); @@ -153,7 +154,7 @@ std::unique_ptr slice_strings(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::slice_strings(strings, start, stop, step, mr); + return detail::slice_strings(strings, start, stop, step, rmm::cuda_stream_default, mr); } namespace detail { @@ -205,8 +206,8 @@ std::unique_ptr compute_substrings_from_fn(column_device_view const& d_c size_type null_count, cudf::detail::input_indexalator starts, cudf::detail::input_indexalator stops, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto strings_count = d_column.size(); @@ -220,16 +221,16 @@ std::unique_ptr compute_substrings_from_fn(column_device_view const& d_c auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), substring_from_fn{d_column, starts, stops}); auto offsets_column = cudf::strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_new_offsets = offsets_column->view().data(); // Build chars column auto bytes = cudf::detail::get_value(offsets_column->view(), strings_count, stream); auto chars_column = - cudf::strings::detail::create_chars_child_column(strings_count, null_count, bytes, mr, stream); + cudf::strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.template data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), 
thrust::make_counting_iterator(0), strings_count, substring_from_fn{d_column, starts, stops, d_new_offsets, d_chars}); @@ -255,13 +256,13 @@ void compute_substring_indices(column_device_view const& d_column, size_type delimiter_count, size_type* start_char_pos, size_type* end_char_pos, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto strings_count = d_column.size(); thrust::for_each_n( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [delim_itr, delimiter_count, start_char_pos, end_char_pos, d_column] __device__(size_type idx) { @@ -313,11 +314,11 @@ std::unique_ptr slice_strings( strings_column_view const& strings, column_view const& starts_column, column_view const& stops_column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); CUDF_EXPECTS(starts_column.size() == strings_count, "Parameter starts must have the same number of rows as strings."); CUDF_EXPECTS(stops_column.size() == strings_count, @@ -334,15 +335,15 @@ std::unique_ptr slice_strings( auto starts_iter = cudf::detail::indexalator_factory::make_input_iterator(starts_column); auto stops_iter = cudf::detail::indexalator_factory::make_input_iterator(stops_column); return compute_substrings_from_fn( - *strings_column, strings.null_count(), starts_iter, stops_iter, mr, stream); + *strings_column, strings.null_count(), starts_iter, stops_iter, stream, mr); } template std::unique_ptr slice_strings(strings_column_view const& strings, DelimiterItrT const delimiter_itr, size_type 
count, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto strings_count = strings.size(); // If there aren't any rows, return an empty strings column @@ -368,7 +369,7 @@ std::unique_ptr slice_strings(strings_column_view const& strings, if (count != 0) { // Compute the substring indices first compute_substring_indices( - d_column, delimiter_itr, count, start_char_pos, end_char_pos, mr, stream); + d_column, delimiter_itr, count, start_char_pos, end_char_pos, stream, mr); } // Extract the substrings using the indices next @@ -377,7 +378,7 @@ std::unique_ptr slice_strings(strings_column_view const& strings, auto stops_iter = cudf::detail::indexalator_factory::make_input_iterator(stop_chars_pos_vec->view()); return compute_substrings_from_fn( - d_column, strings.null_count(), starts_iter, stops_iter, mr, stream); + d_column, strings.null_count(), starts_iter, stops_iter, stream, mr); } } // namespace detail @@ -390,7 +391,7 @@ std::unique_ptr slice_strings(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::slice_strings(strings, starts_column, stops_column, mr); + return detail::slice_strings(strings, starts_column, stops_column, rmm::cuda_stream_default, mr); } std::unique_ptr slice_strings(strings_column_view const& strings, @@ -399,8 +400,11 @@ std::unique_ptr slice_strings(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::slice_strings( - strings, cudf::detail::make_pair_iterator(delimiter), count, mr, nullptr); + return detail::slice_strings(strings, + cudf::detail::make_pair_iterator(delimiter), + count, + rmm::cuda_stream_default, + mr); } std::unique_ptr slice_strings(strings_column_view const& strings, @@ -419,11 +423,13 @@ std::unique_ptr slice_strings(strings_column_view const& strings, strings, cudf::detail::make_pair_iterator(delimiters_dev_view), 
count, + rmm::cuda_stream_default, mr) : detail::slice_strings( strings, cudf::detail::make_pair_iterator(delimiters_dev_view), count, + rmm::cuda_stream_default, mr); } diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index 4cc5d2bcba8..f643a60722a 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -14,6 +14,8 @@ * limitations under the License. */ +#include + #include #include #include @@ -24,7 +26,7 @@ #include #include -#include +#include #include @@ -76,11 +78,11 @@ struct translate_fn { std::unique_ptr translate( strings_column_view const& strings, std::vector> const& chars_table, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); size_type table_size = static_cast(chars_table.size()); // convert input table @@ -95,26 +97,25 @@ std::unique_ptr translate( auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // create null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // create offsets column auto offsets_transformer_itr = thrust::make_transform_iterator(thrust::make_counting_iterator(0), translate_fn{d_strings, table.begin(), table.end()}); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().data(); // build chars column size_type bytes = 
thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = strings::detail::create_chars_child_column( - strings_count, strings.null_count(), bytes, mr, stream); + strings_count, strings.null_count(), bytes, stream, mr); auto d_chars = chars_column->mutable_view().data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, translate_fn{d_strings, table.begin(), table.end(), d_offsets, d_chars}); - // + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -133,7 +134,7 @@ std::unique_ptr translate(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::translate(strings, chars_table); + return detail::translate(strings, chars_table, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index ce15fb80960..0737cd1e003 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,16 +16,20 @@ #include #include +#include +#include + #include #include #include #include -#include -#include #include +#include + #include #include + #include namespace cudf { @@ -33,14 +37,14 @@ namespace strings { namespace detail { // Used to build a temporary string_view object from a single host string. 
std::unique_ptr> string_from_host( - const char* str, cudaStream_t stream) + const char* str, rmm::cuda_stream_view stream) { if (!str) return nullptr; auto length = std::strlen(str); auto* d_str = new rmm::device_buffer(length, stream); - CUDA_TRY(cudaMemcpyAsync(d_str->data(), str, length, cudaMemcpyHostToDevice, stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + CUDA_TRY(cudaMemcpyAsync(d_str->data(), str, length, cudaMemcpyHostToDevice, stream.value())); + stream.synchronize(); auto deleter = [d_str](string_view* sv) { delete d_str; }; return std::unique_ptr{ @@ -49,7 +53,7 @@ std::unique_ptr> string_from_host // build a vector of string_view objects from a strings column rmm::device_vector create_string_vector_from_column(cudf::strings_column_view strings, - cudaStream_t stream) + rmm::cuda_stream_view stream) { auto execpol = rmm::exec_policy(stream); auto strings_column = column_device_view::create(strings.parent(), stream); @@ -58,7 +62,7 @@ rmm::device_vector create_string_vector_from_column(cudf::strings_c auto count = strings.size(); rmm::device_vector strings_vector(count); string_view* d_strings = strings_vector.data().get(); - thrust::for_each_n(execpol->on(stream), + thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), count, [d_column, d_strings] __device__(size_type idx) { @@ -73,10 +77,10 @@ rmm::device_vector create_string_vector_from_column(cudf::strings_c // build a strings offsets column from a vector of string_views std::unique_ptr child_offsets_from_string_vector( const rmm::device_vector& strings, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return child_offsets_from_string_iterator(strings.begin(), strings.size(), mr, stream); + return child_offsets_from_string_iterator(strings.begin(), strings.size(), stream, mr); } // build a strings chars column from an vector of string_views @@ -84,8 +88,8 @@ std::unique_ptr 
child_chars_from_string_vector( const rmm::device_vector& strings, const int32_t* d_offsets, cudf::size_type null_count, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { size_type count = strings.size(); auto d_strings = strings.data().get(); @@ -97,7 +101,7 @@ std::unique_ptr child_chars_from_string_vector( make_numeric_column(data_type{type_id::INT8}, bytes, mask_state::UNALLOCATED, stream, mr); // get it's view auto d_chars = chars_column->mutable_view().data(); - thrust::for_each_n(execpol->on(stream), + thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), count, [d_strings, d_offsets, d_chars] __device__(size_type idx) { @@ -112,8 +116,8 @@ std::unique_ptr child_chars_from_string_vector( std::unique_ptr create_chars_child_column(cudf::size_type strings_count, cudf::size_type null_count, cudf::size_type total_bytes, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(null_count <= strings_count, "Invalid null count"); return make_numeric_column( @@ -121,8 +125,8 @@ std::unique_ptr create_chars_child_column(cudf::size_type strings_count, } // -std::unique_ptr make_empty_strings_column(rmm::mr::device_memory_resource* mr, - cudaStream_t stream) +std::unique_ptr make_empty_strings_column(rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return std::make_unique(data_type{type_id::STRING}, 0, diff --git a/cpp/src/strings/utilities.cuh b/cpp/src/strings/utilities.cuh index 06a8aab4dc4..c90cc3aeee1 100644 --- a/cpp/src/strings/utilities.cuh +++ b/cpp/src/strings/utilities.cuh @@ -19,6 +19,8 @@ #include #include +#include + #include namespace cudf { @@ -74,8 +76,8 @@ auto make_strings_children( SizeAndExecuteFunction size_and_exec_fn, size_type strings_count, size_type null_count, - rmm::mr::device_memory_resource* mr = 
rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto offsets_column = make_numeric_column( data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); @@ -86,7 +88,7 @@ auto make_strings_children( // This is called twice -- once for offsets and once for chars. // Reducing the number of places size_and_exec_fn is inlined speeds up compile time. auto for_each_fn = [strings_count, stream](SizeAndExecuteFunction& size_and_exec_fn) { - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, size_and_exec_fn); @@ -94,12 +96,14 @@ auto make_strings_children( // Compute the offsets values for_each_fn(size_and_exec_fn); - thrust::exclusive_scan( - rmm::exec_policy(stream)->on(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); + thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream.value()), + d_offsets, + d_offsets + strings_count + 1, + d_offsets); // Now build the chars column std::unique_ptr chars_column = create_chars_child_column( - strings_count, null_count, thrust::device_pointer_cast(d_offsets)[strings_count], mr, stream); + strings_count, null_count, thrust::device_pointer_cast(d_offsets)[strings_count], stream, mr); size_and_exec_fn.d_chars = chars_column->mutable_view().template data(); for_each_fn(size_and_exec_fn); diff --git a/cpp/src/strings/utilities.hpp b/cpp/src/strings/utilities.hpp index b61f9581078..3377d8bab35 100644 --- a/cpp/src/strings/utilities.hpp +++ b/cpp/src/strings/utilities.hpp @@ -15,11 +15,13 @@ */ #pragma once +#include + namespace cudf { namespace strings { namespace detail { // Type for the character flags table. 
-using character_flags_table_type = uint8_t; +using character_flags_table_type = std::uint8_t; /** * @brief Returns pointer to device memory that contains the static diff --git a/cpp/src/strings/wrap.cu b/cpp/src/strings/wrap.cu index c61fd0797a4..e42a8b51f9f 100644 --- a/cpp/src/strings/wrap.cu +++ b/cpp/src/strings/wrap.cu @@ -28,6 +28,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -90,13 +92,13 @@ template std::unique_ptr wrap( strings_column_view const& strings, size_type width, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(width > 0, "Positive wrap width required"); auto strings_count = strings.size(); - if (strings_count == 0) return detail::make_empty_strings_column(mr, stream); + if (strings_count == 0) return detail::make_empty_strings_column(stream, mr); auto execpol = rmm::exec_policy(stream); @@ -105,8 +107,7 @@ std::unique_ptr wrap( size_type null_count = strings.null_count(); // copy null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // build offsets column auto offsets_column = std::make_unique(strings.offsets(), stream, mr); // makes a copy @@ -117,7 +118,7 @@ std::unique_ptr wrap( device_execute_functor d_execute_fctr{d_column, d_new_offsets, d_chars, width}; - thrust::for_each_n(execpol->on(stream), + thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, d_execute_fctr); @@ -138,7 +139,7 @@ std::unique_ptr wrap(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::wrap(strings, width, mr); + return detail::wrap(strings, width, rmm::cuda_stream_default, 
mr); } } // namespace strings diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu index 5ac1b5162be..464306fee94 100644 --- a/cpp/src/text/detokenize.cu +++ b/cpp/src/text/detokenize.cu @@ -14,6 +14,10 @@ * limitations under the License. */ +#include + +#include + #include #include #include @@ -27,12 +31,12 @@ #include #include #include -#include -#include + +#include +#include #include #include -#include namespace nvtext { namespace detail { @@ -98,17 +102,18 @@ struct token_row_offsets_fn { cudf::size_type const tokens_counts; template ()>* = nullptr> - std::unique_ptr> operator()(cudaStream_t stream) const + std::unique_ptr> operator()( + rmm::cuda_stream_view stream) const { index_changed_fn pfn{row_indices.data(), sorted_indices.template data()}; auto const output_count = - thrust::count_if(rmm::exec_policy(stream)->on(stream), + thrust::count_if(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(tokens_counts), pfn); auto tokens_offsets = std::make_unique>(output_count + 1, stream); - thrust::copy_if(rmm::exec_policy(stream)->on(stream), + thrust::copy_if(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(tokens_counts), tokens_offsets->begin(), @@ -134,7 +139,7 @@ struct token_row_offsets_fn { std::unique_ptr detokenize(cudf::strings_column_view const& strings, cudf::column_view const& row_indices, cudf::string_scalar const& separator, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(separator.is_valid(), "Parameter separator must be valid"); @@ -164,17 +169,17 @@ std::unique_ptr detokenize(cudf::strings_column_view const& string thrust::make_counting_iterator(0), detokenizer_fn{*strings_column, d_row_map, tokens_offsets->data(), d_separator}); auto offsets_column = cudf::strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr 
+ output_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + output_count, stream, mr); auto d_offsets = offsets_column->view().data(); // build the chars column - append each source token to the appropriate output row cudf::size_type const total_bytes = cudf::detail::get_value(offsets_column->view(), output_count, stream); auto chars_column = - cudf::strings::detail::create_chars_child_column(output_count, 0, total_bytes, mr, stream); + cudf::strings::detail::create_chars_child_column(output_count, 0, total_bytes, stream, mr); auto d_chars = chars_column->mutable_view().data(); thrust::for_each_n( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), output_count, detokenizer_fn{ @@ -199,7 +204,7 @@ std::unique_ptr detokenize(cudf::strings_column_view const& string rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::detokenize(strings, row_indices, separator, 0, mr); + return detail::detokenize(strings, row_indices, separator, rmm::cuda_stream_default, mr); } } // namespace nvtext diff --git a/cpp/src/text/edit_distance.cu b/cpp/src/text/edit_distance.cu index 64508467087..6977def28ef 100644 --- a/cpp/src/text/edit_distance.cu +++ b/cpp/src/text/edit_distance.cu @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include + #include #include #include @@ -22,11 +24,11 @@ #include #include -#include +#include +#include #include #include -#include namespace nvtext { namespace detail { @@ -141,7 +143,7 @@ struct edit_distance_matrix_levenshtein_algorithm { */ std::unique_ptr edit_distance(cudf::strings_column_view const& strings, cudf::strings_column_view const& targets, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { cudf::size_type strings_count = strings.size(); @@ -165,7 +167,7 @@ std::unique_ptr edit_distance(cudf::strings_column_view const& str mr); auto d_results = results->mutable_view().data(); auto execpol = rmm::exec_policy(stream); - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_results, @@ -181,9 +183,10 @@ std::unique_ptr edit_distance(cudf::strings_column_view const& str // get the total size of the temporary compute buffer size_t compute_size = - thrust::reduce(execpol->on(stream), d_results, d_results + strings_count, size_t{0}); + thrust::reduce(execpol->on(stream.value()), d_results, d_results + strings_count, size_t{0}); // convert sizes to offsets in-place - thrust::exclusive_scan(execpol->on(stream), d_results, d_results + strings_count, d_results); + thrust::exclusive_scan( + execpol->on(stream.value()), d_results, d_results + strings_count, d_results); // create the temporary compute buffer rmm::device_uvector compute_buffer(compute_size, stream); auto d_buffer = compute_buffer.data(); @@ -192,7 +195,7 @@ std::unique_ptr edit_distance(cudf::strings_column_view const& str // - on input, d_results is the offset to the working section of d_buffer for each row // - on output, d_results is the calculated edit distance for that row thrust::for_each_n( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, 
edit_distance_levenshtein_algorithm{d_strings, d_targets, d_buffer, d_results}); @@ -203,7 +206,7 @@ std::unique_ptr edit_distance(cudf::strings_column_view const& str * @copydoc nvtext::edit_distance_matrix */ std::unique_ptr edit_distance_matrix(cudf::strings_column_view const& strings, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { cudf::size_type strings_count = strings.size(); @@ -224,9 +227,9 @@ std::unique_ptr edit_distance_matrix(cudf::strings_column_view con cudf::size_type n_upper = (strings_count * (strings_count - 1)) / 2; rmm::device_uvector offsets(n_upper, stream); auto d_offsets = offsets.data(); - CUDA_TRY(cudaMemsetAsync(d_offsets, 0, n_upper * sizeof(cudf::size_type), stream)); + CUDA_TRY(cudaMemsetAsync(d_offsets, 0, n_upper * sizeof(cudf::size_type), stream.value())); thrust::for_each_n( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count * strings_count, [d_strings, d_offsets, strings_count] __device__(cudf::size_type idx) { @@ -244,9 +247,10 @@ std::unique_ptr edit_distance_matrix(cudf::strings_column_view con // get the total size for the compute buffer size_t compute_size = - thrust::reduce(execpol->on(stream), offsets.begin(), offsets.end(), size_t{0}); + thrust::reduce(execpol->on(stream.value()), offsets.begin(), offsets.end(), size_t{0}); // convert sizes to offsets in-place - thrust::exclusive_scan(execpol->on(stream), offsets.begin(), offsets.end(), offsets.begin()); + thrust::exclusive_scan( + execpol->on(stream.value()), offsets.begin(), offsets.end(), offsets.begin()); // create the compute buffer rmm::device_uvector compute_buffer(compute_size, stream); auto d_buffer = compute_buffer.data(); @@ -260,7 +264,7 @@ std::unique_ptr edit_distance_matrix(cudf::strings_column_view con mr); auto d_results = results->mutable_view().data(); thrust::for_each_n( - execpol->on(stream), + execpol->on(stream.value()), 
thrust::make_counting_iterator(0), strings_count * strings_count, edit_distance_matrix_levenshtein_algorithm{d_strings, d_buffer, d_offsets, d_results}); @@ -273,7 +277,7 @@ std::unique_ptr edit_distance_matrix(cudf::strings_column_view con stream, mr); thrust::transform_exclusive_scan( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count + 1), offsets_column->mutable_view().data(), @@ -301,7 +305,7 @@ std::unique_ptr edit_distance(cudf::strings_column_view const& str rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::edit_distance(strings, targets, 0, mr); + return detail::edit_distance(strings, targets, rmm::cuda_stream_default, mr); } /** @@ -311,7 +315,7 @@ std::unique_ptr edit_distance_matrix(cudf::strings_column_view con rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::edit_distance_matrix(strings, 0, mr); + return detail::edit_distance_matrix(strings, rmm::cuda_stream_default, mr); } } // namespace nvtext diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index 792b94aaee6..815b2720f3a 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -14,6 +14,10 @@ * limitations under the License. 
*/ +#include + +#include + #include #include #include @@ -25,8 +29,8 @@ #include #include #include -#include -#include + +#include #include @@ -78,8 +82,8 @@ std::unique_ptr generate_ngrams( cudf::strings_column_view const& strings, cudf::size_type ngrams = 2, cudf::string_scalar const& separator = cudf::string_scalar{"_"}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(separator.is_valid(), "Parameter separator must be valid"); cudf::string_view const d_separator(separator.data(), separator.size()); @@ -89,7 +93,6 @@ std::unique_ptr generate_ngrams( if (strings_count == 0) // if no strings, return an empty column return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); - auto execpol = rmm::exec_policy(stream); auto strings_column = cudf::column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; @@ -131,16 +134,17 @@ std::unique_ptr generate_ngrams( thrust::make_transform_iterator(thrust::make_counting_iterator(0), ngram_generator_fn{d_strings, ngrams, d_separator}); auto offsets_column = cudf::strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + ngrams_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + ngrams_count, stream, mr); auto d_offsets = offsets_column->view().data(); // build the chars column // generate the ngrams from the input strings and copy them into the chars data buffer cudf::size_type const total_bytes = thrust::device_pointer_cast(d_offsets)[ngrams_count]; auto chars_column = - cudf::strings::detail::create_chars_child_column(ngrams_count, 0, total_bytes, mr, stream); + cudf::strings::detail::create_chars_child_column(ngrams_count, 0, total_bytes, stream, mr); char* const d_chars = chars_column->mutable_view().data(); - 
thrust::for_each_n(execpol->on(stream), + auto execpol = rmm::exec_policy(stream); + thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), ngrams_count, ngram_generator_fn{d_strings, ngrams, d_separator, d_offsets, d_chars}); @@ -164,7 +168,7 @@ std::unique_ptr generate_ngrams(cudf::strings_column_view const& s rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::generate_ngrams(strings, ngrams, separator, mr); + return detail::generate_ngrams(strings, ngrams, separator, rmm::cuda_stream_default, mr); } namespace detail { @@ -203,7 +207,7 @@ struct character_ngram_generator_fn { std::unique_ptr generate_character_ngrams(cudf::strings_column_view const& strings, cudf::size_type ngrams, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(ngrams > 1, "Parameter ngrams should be an integer value of 2 or greater"); @@ -219,7 +223,7 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie // create a vector of ngram offsets for each string rmm::device_vector ngram_offsets(strings_count + 1); thrust::transform_exclusive_scan( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count + 1), ngram_offsets.begin(), @@ -238,7 +242,7 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie d_ngram_offsets + strings_count, sizeof(cudf::size_type), cudaMemcpyDeviceToHost, - stream)); + stream.value())); CUDF_EXPECTS(total_ngrams > 0, "Insufficient number of characters in each string to generate ngrams"); @@ -251,21 +255,22 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie auto d_offsets = offsets_column->mutable_view().data(); // compute the size of each ngram -- output goes in d_offsets character_ngram_generator_fn generator{d_strings, ngrams, d_ngram_offsets, d_offsets}; - thrust::for_each_n(execpol->on(stream), + 
thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, generator); // convert sizes into offsets in-place - thrust::exclusive_scan(execpol->on(stream), d_offsets, d_offsets + total_ngrams + 1, d_offsets); + thrust::exclusive_scan( + execpol->on(stream.value()), d_offsets, d_offsets + total_ngrams + 1, d_offsets); // build the chars column auto const chars_bytes = cudf::detail::get_value(offsets_column->view(), total_ngrams, stream); auto chars_column = - cudf::strings::detail::create_chars_child_column(total_ngrams, 0, chars_bytes, mr, stream); + cudf::strings::detail::create_chars_child_column(total_ngrams, 0, chars_bytes, stream, mr); generator.d_chars = chars_column->mutable_view().data(); // output chars - thrust::for_each_n(execpol->on(stream), + thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, generator); @@ -286,7 +291,7 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::generate_character_ngrams(strings, ngrams, 0, mr); + return detail::generate_character_ngrams(strings, ngrams, rmm::cuda_stream_default, mr); } } // namespace nvtext diff --git a/cpp/src/text/ngrams_tokenize.cu b/cpp/src/text/ngrams_tokenize.cu index 1ec356bbf33..fab49af99a9 100644 --- a/cpp/src/text/ngrams_tokenize.cu +++ b/cpp/src/text/ngrams_tokenize.cu @@ -14,6 +14,13 @@ * limitations under the License. 
*/ +#include +#include + +#include + +#include + #include #include #include @@ -22,10 +29,8 @@ #include #include #include -#include -#include -#include -#include + +#include #include #include @@ -130,8 +135,8 @@ std::unique_ptr ngrams_tokenize( cudf::size_type ngrams = 2, cudf::string_scalar const& delimiter = cudf::string_scalar(""), cudf::string_scalar const& separator = cudf::string_scalar{"_"}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(delimiter.is_valid(), "Parameter delimiter must be valid"); cudf::string_view d_delimiter(delimiter.data(), delimiter.size()); @@ -140,7 +145,7 @@ std::unique_ptr ngrams_tokenize( CUDF_EXPECTS(ngrams >= 1, "Parameter ngrams should be an integer value of 1 or greater"); if (ngrams == 1) // this is just a straight tokenize - return tokenize(strings, delimiter, mr, stream); + return tokenize(strings, delimiter, stream, mr); auto strings_count = strings.size(); if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); @@ -155,13 +160,13 @@ std::unique_ptr ngrams_tokenize( // Ex. token-counts = [3,2]; token-offsets = [0,3,5] rmm::device_vector token_offsets(strings_count + 1); auto d_token_offsets = token_offsets.data().get(); - thrust::transform_inclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::transform_inclusive_scan(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_token_offsets + 1, strings_tokenizer{d_strings, d_delimiter}, thrust::plus()); - CUDA_TRY(cudaMemsetAsync(d_token_offsets, 0, sizeof(int32_t), stream)); + CUDA_TRY(cudaMemsetAsync(d_token_offsets, 0, sizeof(int32_t), stream.value())); auto total_tokens = token_offsets[strings_count]; // Ex. 
5 tokens // get the token positions (in bytes) per string @@ -169,7 +174,7 @@ std::unique_ptr ngrams_tokenize( rmm::device_vector token_positions(total_tokens); auto d_token_positions = token_positions.data().get(); thrust::for_each_n( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, string_tokens_positions_fn{d_strings, d_delimiter, d_token_offsets, d_token_positions}); @@ -179,7 +184,7 @@ std::unique_ptr ngrams_tokenize( rmm::device_vector ngram_offsets(strings_count + 1); auto d_ngram_offsets = ngram_offsets.data().get(); thrust::transform_inclusive_scan( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_ngram_offsets + 1, @@ -188,7 +193,7 @@ std::unique_ptr ngrams_tokenize( return (token_count >= ngrams) ? token_count - ngrams + 1 : 0; }, thrust::plus()); - CUDA_TRY(cudaMemsetAsync(d_ngram_offsets, 0, sizeof(int32_t), stream)); + CUDA_TRY(cudaMemsetAsync(d_ngram_offsets, 0, sizeof(int32_t), stream.value())); auto total_ngrams = ngram_offsets[strings_count]; // Compute the total size of the ngrams for each string (not for each ngram) @@ -202,13 +207,13 @@ std::unique_ptr ngrams_tokenize( rmm::device_vector chars_offsets(strings_count + 1); // output memory offsets auto d_chars_offsets = chars_offsets.data().get(); // per input string thrust::transform_inclusive_scan( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_chars_offsets + 1, ngram_builder_fn{d_strings, d_separator, ngrams, d_token_offsets, d_token_positions}, thrust::plus()); - CUDA_TRY(cudaMemsetAsync(d_chars_offsets, 0, sizeof(int32_t), stream)); + CUDA_TRY(cudaMemsetAsync(d_chars_offsets, 0, sizeof(int32_t), stream.value())); auto output_chars_size = chars_offsets[strings_count]; // Ex. 
14 output bytes total rmm::device_vector ngram_sizes(total_ngrams); // size in bytes of each @@ -216,12 +221,12 @@ std::unique_ptr ngrams_tokenize( // build chars column auto chars_column = cudf::strings::detail::create_chars_child_column( - strings_count, 0, output_chars_size, mr, stream); + strings_count, 0, output_chars_size, stream, mr); auto d_chars = chars_column->mutable_view().data(); // Generate the ngrams into the chars column data buffer. // The ngram_builder_fn functor also fills the d_ngram_sizes vector with the // size of each ngram. - thrust::for_each_n(execpol->on(stream), + thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, ngram_builder_fn{d_strings, @@ -235,7 +240,7 @@ std::unique_ptr ngrams_tokenize( d_ngram_sizes}); // build the offsets column -- converting the ngram sizes into offsets auto offsets_column = cudf::strings::detail::make_offsets_child_column( - ngram_sizes.begin(), ngram_sizes.end(), mr, stream); + ngram_sizes.begin(), ngram_sizes.end(), stream, mr); chars_column->set_null_count(0); offsets_column->set_null_count(0); // create the output strings column @@ -259,7 +264,8 @@ std::unique_ptr ngrams_tokenize(cudf::strings_column_view const& s rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ngrams_tokenize(strings, ngrams, delimiter, separator, mr); + return detail::ngrams_tokenize( + strings, ngrams, delimiter, separator, rmm::cuda_stream_default, mr); } } // namespace nvtext diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index ac67b08eba0..0f3e3ec6b6b 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -154,11 +154,11 @@ struct codepoint_to_utf8_fn { } // namespace -// details API +// detail API std::unique_ptr normalize_spaces( cudf::strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + 
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { cudf::size_type strings_count = strings.size(); if (strings_count == 0) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); @@ -167,30 +167,29 @@ std::unique_ptr normalize_spaces( auto strings_column = cudf::column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // copy bitmask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // create offsets by calculating size of each string for output auto offsets_transformer_itr = thrust::make_transform_iterator(thrust::make_counting_iterator(0), normalize_spaces_fn{d_strings}); // this does size-only calc auto offsets_column = cudf::strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().data(); // build the chars column cudf::size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = cudf::strings::detail::create_chars_child_column( - strings_count, strings.null_count(), bytes, mr, stream); + strings_count, strings.null_count(), bytes, stream, mr); auto d_chars = chars_column->mutable_view().data(); // copy tokens to the chars buffer - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, normalize_spaces_fn{d_strings, d_offsets, d_chars}); chars_column->set_null_count(0); // reset null count for child column - // + return cudf::make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -205,7 +204,7 @@ std::unique_ptr normalize_spaces( */ std::unique_ptr 
normalize_characters(cudf::strings_column_view const& strings, bool do_lower_case, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto const strings_count = strings.size(); @@ -236,32 +235,31 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con thrust::make_transform_iterator(thrust::make_counting_iterator(0), codepoint_to_utf8_fn{*strings_column, cp_chars, cp_offsets}); auto offsets_column = cudf::strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().data(); // create the output chars column cudf::size_type output_bytes = cudf::detail::get_value(offsets_column->view(), strings_count, stream); auto chars_column = cudf::strings::detail::create_chars_child_column( - strings_count, strings.null_count(), output_bytes, mr, stream); + strings_count, strings.null_count(), output_bytes, stream, mr); auto d_chars = chars_column->mutable_view().data(); // build the chars output data: convert the 4-byte code-point values into UTF-8 chars thrust::for_each_n( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, codepoint_to_utf8_fn{*strings_column, cp_chars, cp_offsets, d_offsets, d_chars}); chars_column->set_null_count(0); // reset null count for child column - return cudf::make_strings_column( - strings_count, - std::move(offsets_column), - std::move(chars_column), - strings.null_count(), - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - stream, - mr); + return cudf::make_strings_column(strings_count, + std::move(offsets_column), + std::move(chars_column), + strings.null_count(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + stream, + mr); } } // namespace detail @@ -272,7 +270,7 @@ 
std::unique_ptr normalize_spaces(cudf::strings_column_view const& rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::normalize_spaces(strings, mr); + return detail::normalize_spaces(strings, rmm::cuda_stream_default, mr); } /** @@ -283,7 +281,7 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::normalize_characters(strings, do_lower_case, 0, mr); + return detail::normalize_characters(strings, do_lower_case, rmm::cuda_stream_default, mr); } } // namespace nvtext diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu index 8da94e69da9..e1a03c3462b 100644 --- a/cpp/src/text/replace.cu +++ b/cpp/src/text/replace.cu @@ -14,6 +14,13 @@ * limitations under the License. */ +#include + +#include + +#include +#include + #include #include #include @@ -24,12 +31,7 @@ #include #include -#include -#include - -#include - -#include +#include namespace nvtext { namespace detail { @@ -194,7 +196,7 @@ std::unique_ptr replace_tokens(cudf::strings_column_view const& st cudf::strings_column_view const& targets, cudf::strings_column_view const& replacements, cudf::string_scalar const& delimiter, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(!targets.has_nulls(), "Parameter targets must not have nulls"); @@ -218,12 +220,11 @@ std::unique_ptr replace_tokens(cudf::strings_column_view const& st *replacements_column}; // copy null mask from input column - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // this utility calls replacer to build the offsets and chars columns auto children = cudf::strings::detail::make_strings_children( - replacer, strings_count, strings.null_count(), mr, stream); + replacer, strings_count, strings.null_count(), stream, mr); 
// return new strings column return cudf::make_strings_column(strings_count, @@ -239,7 +240,7 @@ std::unique_ptr filter_tokens(cudf::strings_column_view const& str cudf::size_type min_token_length, cudf::string_scalar const& replacement, cudf::string_scalar const& delimiter, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(replacement.is_valid(), "Parameter replacement must be valid"); @@ -254,12 +255,11 @@ std::unique_ptr filter_tokens(cudf::strings_column_view const& str remove_small_tokens_fn filterer{*strings_column, d_delimiter, min_token_length, d_replacement}; // copy null mask from input column - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // this utility calls filterer to build the offsets and chars columns auto children = cudf::strings::detail::make_strings_children( - filterer, strings_count, strings.null_count(), mr, stream); + filterer, strings_count, strings.null_count(), stream, mr); // return new strings column return cudf::make_strings_column(strings_count, @@ -282,7 +282,8 @@ std::unique_ptr replace_tokens(cudf::strings_column_view const& st rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_tokens(strings, targets, replacements, delimiter, 0, mr); + return detail::replace_tokens( + strings, targets, replacements, delimiter, rmm::cuda_stream_default, mr); } std::unique_ptr filter_tokens(cudf::strings_column_view const& strings, @@ -292,7 +293,8 @@ std::unique_ptr filter_tokens(cudf::strings_column_view const& str rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::filter_tokens(strings, min_token_length, replacement, delimiter, 0, mr); + return detail::filter_tokens( + strings, min_token_length, replacement, delimiter, rmm::cuda_stream_default, mr); } } // namespace nvtext diff 
--git a/cpp/src/text/stemmer.cu b/cpp/src/text/stemmer.cu index 8810ea759e7..eace646934d 100644 --- a/cpp/src/text/stemmer.cu +++ b/cpp/src/text/stemmer.cu @@ -14,6 +14,10 @@ * limitations under the License. */ +#include + +#include + #include #include #include @@ -24,9 +28,7 @@ #include #include -#include - -#include +#include #include #include @@ -93,22 +95,22 @@ template std::unique_ptr is_letter(cudf::strings_column_view const& strings, letter_type ltype, PositionIterator position_itr, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::BOOL8}); // create empty output column - auto results = cudf::make_fixed_width_column( - cudf::data_type{cudf::type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = + cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); // set values into output column auto strings_column = cudf::column_device_view::create(strings.parent(), stream); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings.size()), results->mutable_view().data(), @@ -126,7 +128,7 @@ struct dispatch_is_letter_fn { std::unique_ptr operator()(cudf::strings_column_view const& strings, letter_type ltype, cudf::column_view const& indices, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { CUDF_EXPECTS(strings.size() == indices.size(), @@ -135,6 +137,7 @@ struct dispatch_is_letter_fn { // resolve and pass an iterator for the indices column to the detail function return is_letter(strings, ltype, 
indices.begin(), stream, mr); } + template ()>* = nullptr> std::unique_ptr operator()(Args&&... args) const { @@ -201,22 +204,22 @@ struct porter_stemmer_measure_fn { } // namespace std::unique_ptr porter_stemmer_measure(cudf::strings_column_view const& strings, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32}); // create empty output column - auto results = cudf::make_fixed_width_column( - cudf::data_type{cudf::type_id::INT32}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = + cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); // compute measures into output column auto strings_column = cudf::column_device_view::create(strings.parent(), stream); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings.size()), results->mutable_view().data(), @@ -227,7 +230,7 @@ std::unique_ptr porter_stemmer_measure(cudf::strings_column_view c std::unique_ptr is_letter(cudf::strings_column_view const& strings, letter_type ltype, cudf::column_view const& indices, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { return cudf::type_dispatcher( @@ -244,8 +247,11 @@ std::unique_ptr is_letter(cudf::strings_column_view const& strings rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_letter( - strings, ltype, thrust::make_constant_iterator(character_index), 0, mr); + return detail::is_letter(strings, + ltype, + thrust::make_constant_iterator(character_index), + rmm::cuda_stream_default, + mr); } 
std::unique_ptr is_letter(cudf::strings_column_view const& strings, @@ -254,7 +260,7 @@ std::unique_ptr is_letter(cudf::strings_column_view const& strings rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_letter(strings, ltype, indices, 0, mr); + return detail::is_letter(strings, ltype, indices, rmm::cuda_stream_default, mr); } /** @@ -264,7 +270,7 @@ std::unique_ptr porter_stemmer_measure(cudf::strings_column_view c rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::porter_stemmer_measure(strings, 0, mr); + return detail::porter_stemmer_measure(strings, rmm::cuda_stream_default, mr); } } // namespace nvtext diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index 12f0b0d9813..0b1baab3758 100644 --- a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -14,11 +14,14 @@ * limitations under the License. */ +#include +#include + #include #include #include -#include -#include + +#include #include #include @@ -258,7 +261,7 @@ __global__ void kernel_data_normalizer(unsigned char const* strings, } // namespace -data_normalizer::data_normalizer(cudaStream_t stream, bool do_lower_case) +data_normalizer::data_normalizer(rmm::cuda_stream_view stream, bool do_lower_case) : do_lower_case(do_lower_case) { d_cp_metadata = detail::get_codepoint_metadata(stream); @@ -268,7 +271,7 @@ data_normalizer::data_normalizer(cudaStream_t stream, bool do_lower_case) uvector_pair data_normalizer::normalize(char const* d_strings, uint32_t const* d_offsets, uint32_t num_strings, - cudaStream_t stream) + rmm::cuda_stream_view stream) { if (num_strings == 0) return std::make_pair(std::make_unique>(0, stream), @@ -278,7 +281,7 @@ uvector_pair data_normalizer::normalize(char const* d_strings, // copy offsets to working memory size_t const num_offsets = num_strings + 1; auto d_strings_offsets = std::make_unique>(num_offsets, stream); - 
thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_offsets), d_strings_offsets->begin(), @@ -298,7 +301,7 @@ uvector_pair data_normalizer::normalize(char const* d_strings, auto d_code_points = std::make_unique>(max_new_char_total, stream); rmm::device_uvector d_chars_per_thread(threads_on_device, stream); - kernel_data_normalizer<<>>( + kernel_data_normalizer<<>>( reinterpret_cast(d_strings), bytes_count, d_cp_metadata, @@ -308,19 +311,21 @@ uvector_pair data_normalizer::normalize(char const* d_strings, d_chars_per_thread.data()); // Remove the 'empty' code points from the vector - thrust::remove( - execpol->on(stream), d_code_points->begin(), d_code_points->end(), uint32_t{1 << FILTER_BIT}); + thrust::remove(execpol->on(stream.value()), + d_code_points->begin(), + d_code_points->end(), + uint32_t{1 << FILTER_BIT}); // We also need to prefix sum the number of characters up to an including // the current character in order to get the new strings lengths. - thrust::inclusive_scan(execpol->on(stream), + thrust::inclusive_scan(execpol->on(stream.value()), d_chars_per_thread.begin(), d_chars_per_thread.end(), d_chars_per_thread.begin()); // This will reset the offsets to the new generated code point values thrust::for_each_n( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(1), num_strings, update_strings_lengths_fn{d_chars_per_thread.data(), d_strings_offsets->data()}); diff --git a/cpp/src/text/subword/detail/data_normalizer.hpp b/cpp/src/text/subword/detail/data_normalizer.hpp index 9148bde5317..1a9eb5ba997 100644 --- a/cpp/src/text/subword/detail/data_normalizer.hpp +++ b/cpp/src/text/subword/detail/data_normalizer.hpp @@ -18,6 +18,7 @@ #include +#include #include using uvector_pair = std::pair>, @@ -54,7 +55,7 @@ class data_normalizer { * input stream to lower case and strip accents from those characters. 
* If false, accented and uppercase characters are not transformed. */ - data_normalizer(cudaStream_t stream, bool do_lower_case = true); + data_normalizer(rmm::cuda_stream_view stream, bool do_lower_case = true); /** * @brief Normalize a vector of strings. @@ -83,7 +84,7 @@ class data_normalizer { uvector_pair normalize(char const* d_strings, uint32_t const* d_offsets, uint32_t num_strings, - cudaStream_t stream); + rmm::cuda_stream_view stream); private: bool const do_lower_case; diff --git a/cpp/src/text/subword/detail/tokenizer_utils.cuh b/cpp/src/text/subword/detail/tokenizer_utils.cuh index 0bb92c01a9d..48ee0fc2b51 100644 --- a/cpp/src/text/subword/detail/tokenizer_utils.cuh +++ b/cpp/src/text/subword/detail/tokenizer_utils.cuh @@ -18,6 +18,8 @@ #include +#include + #include namespace nvtext { @@ -60,7 +62,7 @@ struct update_strings_lengths_fn { * * @param stream CUDA stream used for device memory operations and kernel launches. */ -codepoint_metadata_type const* get_codepoint_metadata(cudaStream_t stream); +codepoint_metadata_type const* get_codepoint_metadata(rmm::cuda_stream_view stream); /** * @brief Retrieve the aux code point metadata table. @@ -70,7 +72,7 @@ codepoint_metadata_type const* get_codepoint_metadata(cudaStream_t stream); * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -aux_codepoint_data_type const* get_aux_codepoint_data(cudaStream_t stream); +aux_codepoint_data_type const* get_aux_codepoint_data(rmm::cuda_stream_view stream); } // namespace detail } // namespace nvtext diff --git a/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp b/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp index 2813c018145..e61437b7703 100644 --- a/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp +++ b/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp @@ -18,6 +18,8 @@ #include +#include + namespace nvtext { struct hashed_vocabulary; @@ -70,7 +72,7 @@ class wordpiece_tokenizer { uint32_t stride, bool do_truncate, bool do_lower_case, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream, uint32_t max_word_length = 200); /** @@ -88,7 +90,7 @@ class wordpiece_tokenizer { uvector_pair tokenize(char const* d_strings, uint32_t const* d_offsets, uint32_t num_strings, - cudaStream_t stream); + rmm::cuda_stream_view stream); private: /** @@ -100,7 +102,7 @@ class wordpiece_tokenizer { * per string. * @param stream CUDA stream used for device memory operations and kernel launches. */ - void tokenize(uvector_pair& cps_and_offsets, cudaStream_t stream); + void tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stream_view stream); hashed_vocabulary const& vocab_table; data_normalizer normalizer; // removes punctuation, accents, etc diff --git a/cpp/src/text/subword/load_hash_file.cu b/cpp/src/text/subword/load_hash_file.cu index d0f797fca62..b905fdebb1a 100644 --- a/cpp/src/text/subword/load_hash_file.cu +++ b/cpp/src/text/subword/load_hash_file.cu @@ -14,14 +14,18 @@ * limitations under the License. 
*/ +#include +#include +#include + +#include + #include #include #include #include -#include -#include -#include -#include + +#include #include #include @@ -38,7 +42,7 @@ namespace detail { * Build the code point metadata table in device memory * using the vector pieces from codepoint_metadata.ah */ -const codepoint_metadata_type* get_codepoint_metadata(cudaStream_t stream) +const codepoint_metadata_type* get_codepoint_metadata(rmm::cuda_stream_view stream) { static cudf::strings::detail::thread_safe_per_context_cache g_codepoint_metadata; @@ -46,7 +50,7 @@ const codepoint_metadata_type* get_codepoint_metadata(cudaStream_t stream) codepoint_metadata_type* table = static_cast(rmm::mr::get_current_device_resource()->allocate( codepoint_metadata_size * sizeof(codepoint_metadata_type), stream)); - thrust::fill(rmm::exec_policy(stream)->on(stream), + thrust::fill(rmm::exec_policy(stream)->on(stream.value()), table + cp_section1_end, table + codepoint_metadata_size, codepoint_metadata_default_value); @@ -54,13 +58,13 @@ const codepoint_metadata_type* get_codepoint_metadata(cudaStream_t stream) codepoint_metadata, cp_section1_end * sizeof(codepoint_metadata[0]), // 1st section cudaMemcpyHostToDevice, - stream)); + stream.value())); CUDA_TRY(cudaMemcpyAsync( table + cp_section2_begin, cp_metadata_917505_917999, (cp_section2_end - cp_section2_begin + 1) * sizeof(codepoint_metadata[0]), // 2nd section cudaMemcpyHostToDevice, - stream)); + stream.value())); return table; }); } @@ -71,7 +75,7 @@ const codepoint_metadata_type* get_codepoint_metadata(cudaStream_t stream) * Build the aux code point data table in device memory * using the vector pieces from codepoint_metadata.ah */ -const aux_codepoint_data_type* get_aux_codepoint_data(cudaStream_t stream) +const aux_codepoint_data_type* get_aux_codepoint_data(rmm::cuda_stream_view stream) { static cudf::strings::detail::thread_safe_per_context_cache g_aux_codepoint_data; @@ -79,7 +83,7 @@ const aux_codepoint_data_type* 
get_aux_codepoint_data(cudaStream_t stream) aux_codepoint_data_type* table = static_cast(rmm::mr::get_current_device_resource()->allocate( aux_codepoint_data_size * sizeof(aux_codepoint_data_type), stream)); - thrust::fill(rmm::exec_policy(stream)->on(stream), + thrust::fill(rmm::exec_policy(stream)->on(stream.value()), table + aux_section1_end, table + aux_codepoint_data_size, aux_codepoint_default_value); @@ -87,25 +91,25 @@ const aux_codepoint_data_type* get_aux_codepoint_data(cudaStream_t stream) aux_codepoint_data, aux_section1_end * sizeof(aux_codepoint_data[0]), // 1st section cudaMemcpyHostToDevice, - stream)); + stream.value())); CUDA_TRY(cudaMemcpyAsync( table + aux_section2_begin, aux_cp_data_44032_55203, (aux_section2_end - aux_section2_begin + 1) * sizeof(aux_codepoint_data[0]), // 2nd section cudaMemcpyHostToDevice, - stream)); + stream.value())); CUDA_TRY(cudaMemcpyAsync( table + aux_section3_begin, aux_cp_data_70475_71099, (aux_section3_end - aux_section3_begin + 1) * sizeof(aux_codepoint_data[0]), // 3rd section cudaMemcpyHostToDevice, - stream)); + stream.value())); CUDA_TRY(cudaMemcpyAsync( table + aux_section4_begin, aux_cp_data_119134_119232, (aux_section4_end - aux_section4_begin + 1) * sizeof(aux_codepoint_data[0]), // 4th section cudaMemcpyHostToDevice, - stream)); + stream.value())); return table; }); } @@ -134,7 +138,7 @@ const aux_codepoint_data_type* get_aux_codepoint_data(cudaStream_t stream) * @return object containing hash table elements for the wordpiece tokenizer */ hashed_vocabulary load_vocabulary_file(std::string const& filename_hashed_vocabulary, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { hashed_vocabulary result; @@ -194,7 +198,7 @@ hashed_vocabulary load_vocabulary_file(std::string const& filename_hashed_vocabu table.data(), table.size() * sizeof(uint64_t), cudaMemcpyHostToDevice, - stream)); + stream.value())); result.bin_coefficients = 
cudf::make_numeric_column(cudf::data_type{cudf::type_id::UINT64}, bin_coefficients.size(), @@ -205,7 +209,7 @@ hashed_vocabulary load_vocabulary_file(std::string const& filename_hashed_vocabu bin_coefficients.data(), bin_coefficients.size() * sizeof(uint64_t), cudaMemcpyHostToDevice, - stream)); + stream.value())); result.bin_offsets = cudf::make_numeric_column(cudf::data_type{cudf::type_id::UINT16}, bin_offsets.size(), @@ -216,7 +220,7 @@ hashed_vocabulary load_vocabulary_file(std::string const& filename_hashed_vocabu bin_offsets.data(), bin_offsets.size() * sizeof(uint16_t), cudaMemcpyHostToDevice, - stream)); + stream.value())); // this just initializes some constant tables into device memory // to help speed up the runtime @@ -232,7 +236,7 @@ hashed_vocabulary load_vocabulary_file(std::string const& filename_hashed_vocabu rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::load_vocabulary_file(filename_hashed_vocabulary, 0, mr); + return detail::load_vocabulary_file(filename_hashed_vocabulary, rmm::cuda_stream_default, mr); } } // namespace nvtext diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu index f25d8144bbf..e305a3e7296 100644 --- a/cpp/src/text/subword/subword_tokenize.cu +++ b/cpp/src/text/subword/subword_tokenize.cu @@ -23,6 +23,8 @@ #include #include +#include + #include #include @@ -127,7 +129,7 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, bool do_lower_case, bool do_truncate, uint32_t max_rows_tensor, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(stride <= max_sequence_length, @@ -164,7 +166,7 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, auto d_offsets_per_tensor = offsets_per_tensor.data(); auto const execpol = rmm::exec_policy(stream); thrust::transform_exclusive_scan( - execpol->on(stream), + execpol->on(stream.value()), 
thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count + 1), offsets_per_tensor.begin(), @@ -184,7 +186,7 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, rmm::device_uvector row2row_within_tensor(nrows_tensor_token_ids, stream); auto d_row2row_within_tensor = row2row_within_tensor.data(); thrust::for_each_n( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [d_offsets_per_tensor, d_row2tensor, d_row2row_within_tensor] __device__(auto idx) { @@ -218,7 +220,10 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, constexpr int block_size = 256; cudf::detail::grid_1d const grid{ static_cast(nrows_tensor_token_ids * max_sequence_length), block_size}; - kernel_compute_tensor_metadata<<>>( + kernel_compute_tensor_metadata<<>>( device_token_ids, device_offsets, d_row2tensor, diff --git a/cpp/src/text/subword/wordpiece_tokenizer.cu b/cpp/src/text/subword/wordpiece_tokenizer.cu index b51e5c82688..3e65add8a7d 100644 --- a/cpp/src/text/subword/wordpiece_tokenizer.cu +++ b/cpp/src/text/subword/wordpiece_tokenizer.cu @@ -14,13 +14,17 @@ * limitations under the License. 
*/ -#include -#include -#include #include #include #include +#include +#include + +#include + +#include + #include #include #include @@ -270,21 +274,21 @@ wordpiece_tokenizer::wordpiece_tokenizer(hashed_vocabulary const& vocab_table, uint32_t stride, bool do_truncate, bool do_lower_case, - cudaStream_t stream, + rmm::cuda_stream_view stream, uint32_t max_word_length) : vocab_table(vocab_table), + normalizer(stream, do_lower_case), max_sequence_length{max_sequence_length}, - max_word_length{max_word_length}, stride(stride), do_truncate(do_truncate), - normalizer(stream, do_lower_case) + max_word_length{max_word_length} { } uvector_pair wordpiece_tokenizer::tokenize(char const* d_strings, uint32_t const* d_offsets, uint32_t num_strings, - cudaStream_t stream) + rmm::cuda_stream_view stream) { auto cps_and_offsets = normalizer.normalize(d_strings, d_offsets, num_strings, stream); tokenize(cps_and_offsets, stream); @@ -299,7 +303,7 @@ struct tranform_fn { // just converting uint8 value to uint32 __device__ uint32_t operator()(uint8_t count) { return count; } }; -void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, cudaStream_t stream) +void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stream_view stream) { uint32_t* device_code_points = cps_and_offsets.first->data(); size_t const num_code_points = cps_and_offsets.first->size(); @@ -321,32 +325,32 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, cudaStream_t s detail::init_data_and_mark_word_start_and_ends<<>>(device_code_points, - device_start_word_indices, - device_end_word_indices, - num_code_points, - device_token_ids.data(), - device_tokens_per_word.data()); - CHECK_CUDA(stream); + stream.value()>>>(device_code_points, + device_start_word_indices, + device_end_word_indices, + num_code_points, + device_token_ids.data(), + device_tokens_per_word.data()); + CHECK_CUDA(stream.value()); cudf::detail::grid_1d const grid_mark{static_cast(num_strings + 1), 
THREADS_PER_BLOCK}; detail::mark_string_start_and_ends<<>>(device_code_points, - device_strings_offsets, - device_start_word_indices, - device_end_word_indices, - num_strings); - CHECK_CUDA(stream); + stream.value()>>>(device_code_points, + device_strings_offsets, + device_start_word_indices, + device_end_word_indices, + num_strings); + CHECK_CUDA(stream.value()); // Now start_word_indices has the word starts scattered throughout the array. We need to select // all values not equal to the max uint32_t and place them at the start of the array. We leverage // the fact that the start_word_indices and the end_word indices are contiguous to only launch one // device select kernel. auto const execpol = rmm::exec_policy(stream); - auto itr_end = thrust::remove(execpol->on(stream), + auto itr_end = thrust::remove(execpol->on(stream.value()), device_word_indices.begin(), device_word_indices.end(), std::numeric_limits::max()); @@ -359,27 +363,28 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, cudaStream_t s device_end_word_indices = device_start_word_indices + num_words; cudf::detail::grid_1d const grid{static_cast(num_words), THREADS_PER_BLOCK}; - detail::kernel_wordpiece_tokenizer<<>>( - device_code_points, - vocab_table.table->view().data(), - vocab_table.bin_coefficients->view().data(), - vocab_table.bin_offsets->view().data(), - vocab_table.unknown_token_id, - vocab_table.outer_hash_a, - vocab_table.outer_hash_b, - vocab_table.num_bins, - device_start_word_indices, - device_end_word_indices, - max_word_length, - num_words, - device_token_ids.data(), - device_tokens_per_word.data()); - CHECK_CUDA(stream); + detail:: + kernel_wordpiece_tokenizer<<>>( + device_code_points, + vocab_table.table->view().data(), + vocab_table.bin_coefficients->view().data(), + vocab_table.bin_offsets->view().data(), + vocab_table.unknown_token_id, + vocab_table.outer_hash_a, + vocab_table.outer_hash_b, + vocab_table.num_bins, + device_start_word_indices, + 
device_end_word_indices, + max_word_length, + num_words, + device_token_ids.data(), + device_tokens_per_word.data()); + CHECK_CUDA(stream.value()); // Repurpose the input array for the token ids. In the worst case, each code point ends up being a // token so this will always have enough memory to store the contiguous tokens. uint32_t* contiguous_token_ids = device_code_points; - thrust::copy_if(execpol->on(stream), + thrust::copy_if(execpol->on(stream.value()), device_token_ids.begin(), device_token_ids.end(), contiguous_token_ids, @@ -387,7 +392,7 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, cudaStream_t s // Repurpose start word indices since it is the same size and type as the required output. uint32_t* token_id_counts = device_start_word_indices; - thrust::transform_inclusive_scan(execpol->on(stream), + thrust::transform_inclusive_scan(execpol->on(stream.value()), device_tokens_per_word.data(), device_tokens_per_word.data() + num_code_points, token_id_counts, @@ -395,7 +400,7 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, cudaStream_t s thrust::plus()); // Update the device_strings_offsets using the token_id_counts - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(1), num_strings, update_strings_lengths_fn{token_id_counts, device_strings_offsets}); diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu index ea1afa69d2b..e16bf3cf153 100644 --- a/cpp/src/text/tokenize.cu +++ b/cpp/src/text/tokenize.cu @@ -26,6 +26,8 @@ #include #include +#include + #include #include @@ -36,8 +38,8 @@ namespace { template std::unique_ptr token_count_fn(cudf::size_type strings_count, TokenCounter tokenizer, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // create output column auto token_counts = 
cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, @@ -47,7 +49,7 @@ std::unique_ptr token_count_fn(cudf::size_type strings_count, mr); auto d_token_counts = token_counts->mutable_view().data(); // add the counts to the column - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_token_counts, @@ -59,28 +61,28 @@ std::unique_ptr token_count_fn(cudf::size_type strings_count, template std::unique_ptr tokenize_fn(cudf::size_type strings_count, Tokenizer tokenizer, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto execpol = rmm::exec_policy(stream); // get the number of tokens in each string auto const token_counts = - token_count_fn(strings_count, tokenizer, rmm::mr::get_current_device_resource(), stream); + token_count_fn(strings_count, tokenizer, stream, rmm::mr::get_current_device_resource()); auto d_token_counts = token_counts->view(); // create token-index offsets from the counts rmm::device_vector token_offsets(strings_count + 1); - thrust::inclusive_scan(execpol->on(stream), + thrust::inclusive_scan(execpol->on(stream.value()), d_token_counts.template begin(), d_token_counts.template end(), token_offsets.begin() + 1); - CUDA_TRY(cudaMemsetAsync(token_offsets.data().get(), 0, sizeof(int32_t), stream)); + CUDA_TRY(cudaMemsetAsync(token_offsets.data().get(), 0, sizeof(int32_t), stream.value())); auto const total_tokens = token_offsets.back(); // build a list of pointers to each token rmm::device_vector tokens(total_tokens); // now go get the tokens tokenizer.d_offsets = token_offsets.data().get(); tokenizer.d_tokens = tokens.data().get(); - thrust::for_each_n(execpol->on(stream), + thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, tokenizer); @@ -95,33 +97,33 @@ 
std::unique_ptr tokenize_fn(cudf::size_type strings_count, // zero or more character tokenizer std::unique_ptr tokenize(cudf::strings_column_view const& strings, cudf::string_scalar const& delimiter, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(delimiter.is_valid(), "Parameter delimiter must be valid"); cudf::string_view d_delimiter(delimiter.data(), delimiter.size()); auto strings_column = cudf::column_device_view::create(strings.parent(), stream); - return tokenize_fn(strings.size(), strings_tokenizer{*strings_column, d_delimiter}, mr, stream); + return tokenize_fn(strings.size(), strings_tokenizer{*strings_column, d_delimiter}, stream, mr); } // zero or more character token counter std::unique_ptr count_tokens(cudf::strings_column_view const& strings, cudf::string_scalar const& delimiter, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(delimiter.is_valid(), "Parameter delimiter must be valid"); cudf::string_view d_delimiter(delimiter.data(), delimiter.size()); auto strings_column = cudf::column_device_view::create(strings.parent(), stream); return token_count_fn( - strings.size(), strings_tokenizer{*strings_column, d_delimiter}, mr, stream); + strings.size(), strings_tokenizer{*strings_column, d_delimiter}, stream, mr); } // one or more string delimiter tokenizer std::unique_ptr tokenize(cudf::strings_column_view const& strings, cudf::strings_column_view const& delimiters, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(delimiters.size() > 0, "Parameter delimiters must not be empty"); CUDF_EXPECTS(!delimiters.has_nulls(), "Parameter delimiters must not have nulls"); @@ -132,15 +134,15 @@ std::unique_ptr tokenize(cudf::strings_column_view const& strings, 
multi_delimiter_strings_tokenizer{*strings_column, delimiters_column->begin(), delimiters_column->end()}, - mr, - stream); + stream, + mr); } // one or more string delimiter token counter std::unique_ptr count_tokens(cudf::strings_column_view const& strings, cudf::strings_column_view const& delimiters, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(delimiters.size() > 0, "Parameter delimiters must not be empty"); CUDF_EXPECTS(!delimiters.has_nulls(), "Parameter delimiters must not have nulls"); @@ -151,13 +153,13 @@ std::unique_ptr count_tokens(cudf::strings_column_view const& stri multi_delimiter_strings_tokenizer{*strings_column, delimiters_column->begin(), delimiters_column->end()}, - mr, - stream); + stream, + mr); } // tokenize on every character std::unique_ptr character_tokenize(cudf::strings_column_view const& strings_column, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto strings_count = strings_column.size(); @@ -179,7 +181,7 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const auto execpol = rmm::exec_policy(stream); auto strings_view = cudf::column_device_view::create(strings_column.parent(), stream); cudf::size_type num_characters = thrust::count_if( - execpol->on(stream), d_chars, d_chars + chars_bytes, [] __device__(uint8_t byte) { + execpol->on(stream.value()), d_chars, d_chars + chars_bytes, [] __device__(uint8_t byte) { return cudf::strings::detail::is_begin_utf8_char(byte); }); @@ -198,7 +200,7 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const mr); auto d_new_offsets = offsets_column->mutable_view().begin(); thrust::copy_if( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(chars_bytes + 1), d_new_offsets, @@ -230,7 +232,7 @@ std::unique_ptr tokenize(cudf::strings_column_view const& strings, 
rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::tokenize(strings, delimiter, mr); + return detail::tokenize(strings, delimiter, rmm::cuda_stream_default, mr); } std::unique_ptr tokenize(cudf::strings_column_view const& strings, @@ -238,7 +240,7 @@ std::unique_ptr tokenize(cudf::strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::tokenize(strings, delimiters, mr); + return detail::tokenize(strings, delimiters, rmm::cuda_stream_default, mr); } std::unique_ptr count_tokens(cudf::strings_column_view const& strings, @@ -246,7 +248,7 @@ std::unique_ptr count_tokens(cudf::strings_column_view const& stri rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_tokens(strings, delimiter, mr); + return detail::count_tokens(strings, delimiter, rmm::cuda_stream_default, mr); } std::unique_ptr count_tokens(cudf::strings_column_view const& strings, @@ -254,14 +256,14 @@ std::unique_ptr count_tokens(cudf::strings_column_view const& stri rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_tokens(strings, delimiters, mr); + return detail::count_tokens(strings, delimiters, rmm::cuda_stream_default, mr); } std::unique_ptr character_tokenize(cudf::strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::character_tokenize(strings, 0, mr); + return detail::character_tokenize(strings, rmm::cuda_stream_default, mr); } } // namespace nvtext diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu index 0903350f802..987d6272737 100644 --- a/cpp/src/unary/math_ops.cu +++ b/cpp/src/unary/math_ops.cu @@ -347,10 +347,7 @@ std::unique_ptr transform_fn(cudf::dictionary_column_view const& i stream, default_mr); return cudf::dictionary::detail::encode( - output->view(), - dictionary::detail::get_indices_type_for_size(output->size()), - mr, - stream.value()); + output->view(), 
dictionary::detail::get_indices_type_for_size(output->size()), stream, mr); } template @@ -360,13 +357,12 @@ struct MathOpDispatcher { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return transform_fn( - input.begin(), - input.end(), - cudf::detail::copy_bitmask(input, rmm::cuda_stream_view{stream}, mr), - input.null_count(), - stream, - mr); + return transform_fn(input.begin(), + input.end(), + cudf::detail::copy_bitmask(input, stream, mr), + input.null_count(), + stream, + mr); } struct dictionary_dispatch { diff --git a/cpp/src/unary/unary_ops.cuh b/cpp/src/unary/unary_ops.cuh index f5f445fff6c..7cfc48d4385 100644 --- a/cpp/src/unary/unary_ops.cuh +++ b/cpp/src/unary/unary_ops.cuh @@ -71,7 +71,7 @@ struct launcher { output_view.begin(), F{}); - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); return output; } diff --git a/cpp/tests/column/column_device_view_test.cu b/cpp/tests/column/column_device_view_test.cu index c26a3046017..69d428be814 100644 --- a/cpp/tests/column/column_device_view_test.cu +++ b/cpp/tests/column/column_device_view_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,31 +14,33 @@ * limitations under the License. 
*/ +#include +#include +#include +#include + #include #include #include #include #include -#include -#include -#include -#include +#include struct ColumnDeviceViewTest : public cudf::test::BaseFixture { }; TEST_F(ColumnDeviceViewTest, Sample) { - using T = int32_t; - cudaStream_t stream = 0; + using T = int32_t; + rmm::cuda_stream_view stream{}; cudf::test::fixed_width_column_wrapper input({1, 2, 3, 4, 5, 6}); auto output = cudf::allocate_like(input); auto input_device_view = cudf::column_device_view::create(input, stream); auto output_device_view = cudf::mutable_column_device_view::create(output->mutable_view(), stream); auto exec = rmm::exec_policy(stream); - EXPECT_NO_THROW(thrust::copy(exec->on(stream), + EXPECT_NO_THROW(thrust::copy(exec->on(stream.value()), input_device_view->begin(), input_device_view->end(), output_device_view->begin())); @@ -48,15 +50,15 @@ TEST_F(ColumnDeviceViewTest, Sample) TEST_F(ColumnDeviceViewTest, MismatchingType) { - using T = int32_t; - cudaStream_t stream = 0; + using T = int32_t; + rmm::cuda_stream_view stream{}; cudf::test::fixed_width_column_wrapper input({1, 2, 3, 4, 5, 6}); auto output = cudf::allocate_like(input); auto input_device_view = cudf::column_device_view::create(input, stream); auto output_device_view = cudf::mutable_column_device_view::create(output->mutable_view(), stream); auto exec = rmm::exec_policy(stream); - EXPECT_THROW(thrust::copy(exec->on(stream), + EXPECT_THROW(thrust::copy(exec->on(stream.value()), input_device_view->begin(), input_device_view->end(), output_device_view->begin()), diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp index afd4d6668dd..d30929b90c6 100644 --- a/cpp/tests/column/factories_test.cpp +++ b/cpp/tests/column/factories_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,21 +14,23 @@ * limitations under the License. */ +#include +#include + #include #include #include #include #include -#include -#include + +#include class ColumnFactoryTest : public cudf::test::BaseFixture { cudf::size_type _size{1000}; - cudaStream_t _stream{0}; public: cudf::size_type size() { return _size; } - cudaStream_t stream() { return _stream; } + rmm::cuda_stream_view stream() { return rmm::cuda_stream_default; } }; template diff --git a/cpp/tests/copying/copy_tests.cu b/cpp/tests/copying/copy_tests.cu index 50d0e82222e..9e0251d944d 100644 --- a/cpp/tests/copying/copy_tests.cu +++ b/cpp/tests/copying/copy_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,20 +14,21 @@ * limitations under the License. 
*/ -#include -#include -#include #include -#include -#include - #include #include +#include +#include #include #include +#include +#include +#include #include +#include + template struct CopyTest : public cudf::test::BaseFixture { }; @@ -69,8 +70,8 @@ struct copy_if_else_tiny_grid_functor { std::unique_ptr operator()(cudf::column_view const& lhs, cudf::column_view const& rhs, Filter filter, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // output std::unique_ptr out = @@ -85,7 +86,7 @@ struct copy_if_else_tiny_grid_functor { // call the kernel with an artificially small grid cudf::detail::copy_if_else_kernel<32, T, decltype(lhs_iter), decltype(rhs_iter), Filter, false> - <<<1, 32, 0, stream>>>(lhs_iter, rhs_iter, filter, *out_dv, nullptr); + <<<1, 32, 0, stream.value()>>>(lhs_iter, rhs_iter, filter, *out_dv, nullptr); return out; } @@ -94,8 +95,8 @@ struct copy_if_else_tiny_grid_functor { std::unique_ptr operator()(cudf::column_view const& lhs, cudf::column_view const& rhs, Filter filter, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("Unexpected test execution"); } @@ -115,8 +116,8 @@ std::unique_ptr tiny_grid_launch(cudf::column_view const& lhs, lhs, rhs, filter, - rmm::mr::get_current_device_resource(), - (cudaStream_t)0); + rmm::cuda_stream_default, + rmm::mr::get_current_device_resource()); } TYPED_TEST(CopyTest, CopyIfElseTestTinyGrid) diff --git a/cpp/tests/copying/gather_struct_tests.cu b/cpp/tests/copying/gather_struct_tests.cu index c9923fb6457..3df44409062 100644 --- a/cpp/tests/copying/gather_struct_tests.cu +++ b/cpp/tests/copying/gather_struct_tests.cu @@ -14,33 +14,28 @@ * limitations under the License. 
*/ -#include -#include -#include +#include +#include +#include +#include +#include -#include -#include -#include -#include -#include #include +#include #include +#include #include #include +#include #include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include + #include -#include + +#include + +#include using vector_of_columns = std::vector>; using cudf::size_type; @@ -384,7 +379,7 @@ TYPED_TEST(TypedStructGatherTest, TestGatherStructOfStructsWithValidity) // Testing gather() on struct> // Factory to construct numeric column with configurable null-mask. - auto const numeric_column_exemplar = [](std::function pred) { + auto const numeric_column_exemplar = [](nvstd::function pred) { return fixed_width_column_wrapper{ {5, 10, 15, 20, 25, 30, 35, 45, 50, 55, 60, 65, 70, 75}, make_counting_transform_iterator(0, [=](auto i) { return pred(i); })}; diff --git a/cpp/tests/copying/shift_tests.cpp b/cpp/tests/copying/shift_tests.cpp index ea9459675ff..f642ad5bd90 100644 --- a/cpp/tests/copying/shift_tests.cpp +++ b/cpp/tests/copying/shift_tests.cpp @@ -14,14 +14,18 @@ * limitations under the License. 
*/ -#include -#include -#include #include #include #include #include #include + +#include +#include +#include + +#include + #include #include #include @@ -32,7 +36,7 @@ using TestTypes = cudf::test::Types; template > std::unique_ptr make_scalar( - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto s = new ScalarType(cudf::test::make_type_param_scalar(0), false, stream, mr); @@ -42,7 +46,7 @@ std::unique_ptr make_scalar( template > std::unique_ptr make_scalar( T value, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto s = new ScalarType(value, true, stream, mr); diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index 87fe3145226..c63cab91be7 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,18 +14,18 @@ * limitations under the License. 
*/ -#include -#include -#include -#include -#include - #include #include #include #include #include +#include +#include +#include +#include +#include + template struct NonTimestampTest : public cudf::test::BaseFixture { cudf::data_type type() { return cudf::data_type{cudf::type_to_id()}; } @@ -143,7 +143,6 @@ TEST_F(BasicDatetimeOpsTest, TestExtractingDatetimeComponents) template struct TypedDatetimeOpsTest : public cudf::test::BaseFixture { - cudaStream_t stream() { return cudaStream_t(0); } cudf::size_type size() { return cudf::size_type(10); } cudf::data_type type() { return cudf::data_type{cudf::type_to_id()}; } }; diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu index 16af9ea93bc..debf540ea8e 100644 --- a/cpp/tests/error/error_handling_test.cu +++ b/cpp/tests/error/error_handling_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, NVIDIA CORPORATION. + * Copyright (c) 2018-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,10 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include #include +#include + +#include + #include TEST(ExpectsTest, FalseCondition) @@ -58,30 +61,26 @@ void __global__ test_kernel(int* data) { data[threadIdx.x] = threadIdx.x; } // calls. 
TEST(StreamCheck, FailedKernel) { - cudaStream_t stream; - CUDA_TRY(cudaStreamCreate(&stream)); + rmm::cuda_stream stream; int a; - test_kernel<<<0, 0, 0, stream>>>(&a); + test_kernel<<<0, 0, 0, stream.value()>>>(&a); #ifdef NDEBUG - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); #endif - EXPECT_THROW(CHECK_CUDA(stream), cudf::cuda_error); - CUDA_TRY(cudaStreamDestroy(stream)); + EXPECT_THROW(CHECK_CUDA(stream.value()), cudf::cuda_error); } TEST(StreamCheck, CatchFailedKernel) { - cudaStream_t stream; - CUDA_TRY(cudaStreamCreate(&stream)); + rmm::cuda_stream stream; int a; - test_kernel<<<0, 0, 0, stream>>>(&a); + test_kernel<<<0, 0, 0, stream.value()>>>(&a); #ifndef NDEBUG - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); #endif - CUDA_EXPECT_THROW_MESSAGE(CHECK_CUDA(stream), + CUDA_EXPECT_THROW_MESSAGE(CHECK_CUDA(stream.value()), "cudaErrorInvalidConfiguration " "invalid configuration argument"); - CUDA_TRY(cudaStreamDestroy(stream)); } __global__ void assert_false_kernel() { release_assert(false && "this kernel should die"); } diff --git a/cpp/tests/groupby/group_std_test.cpp b/cpp/tests/groupby/group_std_test.cpp index fdc69251428..e60aba08385 100644 --- a/cpp/tests/groupby/group_std_test.cpp +++ b/cpp/tests/groupby/group_std_test.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#ifdef NDEBUG // currently groupby std tests are not supported. See groupstd.cu + #include #include @@ -144,3 +146,5 @@ TYPED_TEST(groupby_std_test, ddof_non_default) } // namespace test } // namespace cudf + +#endif // NDEBUG diff --git a/cpp/tests/groupby/group_var_test.cpp b/cpp/tests/groupby/group_var_test.cpp index 2e49709a11f..6c1ea616212 100644 --- a/cpp/tests/groupby/group_var_test.cpp +++ b/cpp/tests/groupby/group_var_test.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#ifdef NDEBUG // currently groupby variance tests are not supported. 
See groupstd.cu + #include #include @@ -144,3 +146,5 @@ TYPED_TEST(groupby_var_test, ddof_non_default) } // namespace test } // namespace cudf + +#endif // NDEBUG diff --git a/cpp/tests/hash_map/map_test.cu b/cpp/tests/hash_map/map_test.cu index 5e69b4fd15d..b1fd3fa0bb4 100644 --- a/cpp/tests/hash_map/map_test.cu +++ b/cpp/tests/hash_map/map_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-19, NVIDIA CORPORATION. + * Copyright (c) 2018-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,14 +14,19 @@ * limitations under the License. */ +#include + #include #include -#include -#include #include +#include + #include #include + +#include + #include #include #include @@ -51,7 +56,7 @@ struct InsertTest : public cudf::test::BaseFixture { std::min(static_cast(size), std::numeric_limits::max()); pairs.resize(input_size); map = std::move(map_type::create(compute_hash_table_size(size))); - CUDA_TRY(cudaStreamSynchronize(0)); + rmm::cuda_stream_default.synchronize(); } const cudf::size_type size{10000}; diff --git a/cpp/tests/hash_map/multimap_test.cu b/cpp/tests/hash_map/multimap_test.cu index 3ed3e14ab09..1f1d20eb8ae 100644 --- a/cpp/tests/hash_map/multimap_test.cu +++ b/cpp/tests/hash_map/multimap_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, NVIDIA CORPORATION. + * Copyright (c) 2018-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,20 +14,21 @@ * limitations under the License. 
*/ +#include + #include -#include +#include -#include +#include #include +#include #include #include #include -#include - // This is necessary to do a parametrized typed-test over multiple template // arguments template @@ -61,7 +62,7 @@ class MultimapTest : public cudf::test::BaseFixture { MultimapTest(const size_type hash_table_size = 100) : the_map(multimap_type::create(hash_table_size)), size(hash_table_size) { - CUDA_TRY(cudaStreamSynchronize(0)); + rmm::cuda_stream_default.synchronize(); } ~MultimapTest() {} diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 7c9b0929e35..c3ecafe990a 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -14,13 +14,6 @@ * limitations under the License. */ -#include -#include -#include -#include -#include -#include - #include #include #include @@ -29,6 +22,14 @@ #include #include #include +#include +#include +#include +#include +#include +#include + +#include #include #include @@ -706,12 +707,12 @@ class custom_test_data_sink : public cudf::io::data_sink { bool supports_device_write() const override { return true; } - void device_write(void const* gpu_data, size_t size, cudaStream_t stream) + void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override { char* ptr = nullptr; CUDA_TRY(cudaMallocHost(&ptr, size)); - CUDA_TRY(cudaMemcpyAsync(ptr, gpu_data, size, cudaMemcpyDeviceToHost, stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + CUDA_TRY(cudaMemcpyAsync(ptr, gpu_data, size, cudaMemcpyDeviceToHost, stream.value())); + stream.synchronize(); outfile_.write(ptr, size); CUDA_TRY(cudaFreeHost(ptr)); } @@ -1135,12 +1136,12 @@ class custom_test_memmap_sink : public cudf::io::data_sink { bool supports_device_write() const override { return supports_device_writes; } - void device_write(void const* gpu_data, size_t size, cudaStream_t stream) + void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) { char* ptr = 
nullptr; CUDA_TRY(cudaMallocHost(&ptr, size)); - CUDA_TRY(cudaMemcpyAsync(ptr, gpu_data, size, cudaMemcpyDeviceToHost, stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + CUDA_TRY(cudaMemcpyAsync(ptr, gpu_data, size, cudaMemcpyDeviceToHost, stream.value())); + stream.synchronize(); mm_writer->host_write(ptr, size); CUDA_TRY(cudaFreeHost(ptr)); } diff --git a/cpp/tests/scalar/factories_test.cpp b/cpp/tests/scalar/factories_test.cpp index 6d850e15e9f..c91fb6b3b5e 100644 --- a/cpp/tests/scalar/factories_test.cpp +++ b/cpp/tests/scalar/factories_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,17 +14,18 @@ * limitations under the License. */ +#include +#include + #include #include #include -#include -#include -class ScalarFactoryTest : public cudf::test::BaseFixture { - cudaStream_t _stream{0}; +#include +class ScalarFactoryTest : public cudf::test::BaseFixture { public: - cudaStream_t stream() { return _stream; } + rmm::cuda_stream_view stream() { return rmm::cuda_stream_default; } }; template diff --git a/cpp/tests/table/table_view_tests.cu b/cpp/tests/table/table_view_tests.cu index 506b0de4a36..528517d2be5 100644 --- a/cpp/tests/table/table_view_tests.cu +++ b/cpp/tests/table/table_view_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,15 +14,18 @@ * limitations under the License. 
*/ -#include -#include -#include -#include #include #include #include #include +#include +#include +#include +#include + +#include + #include // Compares two tables row by row, if table1 row is less than table2, then corresponding row value @@ -34,7 +37,7 @@ void row_comparison(cudf::table_view input1, cudf::mutable_column_view output, std::vector const& column_order) { - cudaStream_t stream = 0; + rmm::cuda_stream_view stream{}; auto device_table_1 = cudf::table_device_view::create(input1, stream); auto device_table_2 = cudf::table_device_view::create(input2, stream); @@ -43,7 +46,7 @@ void row_comparison(cudf::table_view input1, auto comparator = cudf::row_lexicographic_comparator( *device_table_1, *device_table_2, d_column_order.data().get()); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(input1.num_rows()), thrust::make_counting_iterator(0), diff --git a/cpp/tests/wrappers/timestamps_test.cu b/cpp/tests/wrappers/timestamps_test.cu index dc51b039b11..b9cbcd7c8a5 100644 --- a/cpp/tests/wrappers/timestamps_test.cu +++ b/cpp/tests/wrappers/timestamps_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,12 @@ * limitations under the License. 
*/ +#include +#include +#include +#include +#include + #include #include #include @@ -21,15 +27,12 @@ #include #include #include -#include -#include -#include -#include -#include + +#include template struct ChronoColumnTest : public cudf::test::BaseFixture { - cudaStream_t stream() { return cudaStream_t(0); } + rmm::cuda_stream_view stream() { return rmm::cuda_stream_default; } cudf::size_type size() { return cudf::size_type(100); } cudf::data_type type() { return cudf::data_type{cudf::type_to_id()}; } }; diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 8ceee00598a..597672ec50a 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -109,7 +109,7 @@ class jni_writer_data_sink final : public cudf::io::data_sink { bool supports_device_write() const override { return true; } - void device_write(void const *gpu_data, size_t size, cudaStream_t stream) override { + void device_write(void const *gpu_data, size_t size, rmm::cuda_stream_view stream) override { JNIEnv *env = cudf::jni::get_jni_env(jvm); size_t left_to_copy = size; const char *copy_from = static_cast(gpu_data); @@ -117,7 +117,7 @@ class jni_writer_data_sink final : public cudf::io::data_sink { long buffer_amount_available = current_buffer_len - current_buffer_written; if (buffer_amount_available <= 0) { // should never be < 0, but just to be safe - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); rotate_buffer(env); buffer_amount_available = current_buffer_len - current_buffer_written; } @@ -126,14 +126,14 @@ class jni_writer_data_sink final : public cudf::io::data_sink { char *copy_to = current_buffer_data + current_buffer_written; CUDA_TRY(cudaMemcpyAsync(copy_to, copy_from, amount_to_copy, cudaMemcpyDeviceToHost, - stream)); + stream.value())); copy_from = copy_from + amount_to_copy; current_buffer_written += amount_to_copy; total_written += amount_to_copy; left_to_copy -= amount_to_copy; } - 
CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); } void flush() override { diff --git a/java/src/main/native/src/map_lookup.cu b/java/src/main/native/src/map_lookup.cu index 95eea10e8e0..e1fc56ed834 100644 --- a/java/src/main/native/src/map_lookup.cu +++ b/java/src/main/native/src/map_lookup.cu @@ -119,7 +119,7 @@ get_gather_map_for_map_values(column_view const &input, string_scalar &lookup_ke gpu_find_first<<>>( *input_device_view, *output_view, lookup_key_device_view); - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); return gather_map; }