From 2ced21489d7b4043b39dad1d60adae9b3f77fd57 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 7 Nov 2022 17:59:16 -0600 Subject: [PATCH] Use nosync policy in gather and scatter implementations. (#12038) This PR uses `rmm::exec_policy_nosync` in libcudf's gather and scatter functions. These changes are motivated by performance improvements seen previously in #11577. # Checklist - [x] I am familiar with the [Contributing Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md). - [x] New or existing tests cover these changes. - [x] The documentation is up to date with these changes. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - David Wendt (https://github.com/davidwendt) - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/12038 --- cpp/include/cudf/detail/gather.cuh | 4 ++-- cpp/include/cudf/detail/scatter.cuh | 12 ++++++------ cpp/include/cudf/lists/detail/gather.cuh | 4 ++-- cpp/include/cudf/lists/detail/scatter.cuh | 6 +++--- cpp/include/cudf/strings/detail/gather.cuh | 10 ++++++---- cpp/include/cudf/strings/detail/scatter.cuh | 3 ++- cpp/src/copying/scatter.cu | 11 +++++++---- cpp/src/lists/copying/scatter_helper.cu | 6 +++--- cpp/tests/copying/detail_gather_tests.cu | 2 +- 9 files changed, 32 insertions(+), 26 deletions(-) diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 2acdc007afa..57d834e6277 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -128,7 +128,7 @@ void gather_helper(InputItr source_itr, { using map_type = typename std::iterator_traits::value_type; if (nullify_out_of_bounds) { - thrust::gather_if(rmm::exec_policy(stream), + thrust::gather_if(rmm::exec_policy_nosync(stream), gather_map_begin, gather_map_end, gather_map_begin, @@ -137,7 +137,7 @@ void gather_helper(InputItr source_itr, bounds_checker{0, source_size}); } else { thrust::gather( - rmm::exec_policy(stream), gather_map_begin, gather_map_end, source_itr, target_itr); + rmm::exec_policy_nosync(stream), gather_map_begin, gather_map_end, source_itr, target_itr); } } diff --git a/cpp/include/cudf/detail/scatter.cuh b/cpp/include/cudf/detail/scatter.cuh index ad5a2134afe..c8b17e22df2 100644 --- a/cpp/include/cudf/detail/scatter.cuh +++ b/cpp/include/cudf/detail/scatter.cuh @@ -79,14 +79,14 @@ auto scatter_to_gather(MapIterator scatter_map_begin, // We'll use the `numeric_limits::lowest()` value for this since it should always be outside the // valid range. auto gather_map = rmm::device_uvector(gather_rows, stream); - thrust::uninitialized_fill(rmm::exec_policy(stream), + thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), gather_map.begin(), gather_map.end(), std::numeric_limits::lowest()); // Convert scatter map to a gather map thrust::scatter( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(std::distance(scatter_map_begin, scatter_map_end)), scatter_map_begin, @@ -114,13 +114,13 @@ auto scatter_to_gather_complement(MapIterator scatter_map_begin, rmm::cuda_stream_view stream) { auto gather_map = rmm::device_uvector(gather_rows, stream); - thrust::sequence(rmm::exec_policy(stream), gather_map.begin(), gather_map.end(), 0); + thrust::sequence(rmm::exec_policy_nosync(stream), gather_map.begin(), gather_map.end(), 0); auto const out_of_bounds_begin = thrust::make_constant_iterator(std::numeric_limits::lowest()); auto const out_of_bounds_end = out_of_bounds_begin + thrust::distance(scatter_map_begin, scatter_map_end); - thrust::scatter(rmm::exec_policy(stream), + thrust::scatter(rmm::exec_policy_nosync(stream), out_of_bounds_begin, out_of_bounds_end, scatter_map_begin, @@ -152,7 +152,7 @@ struct column_scatterer_impl(), source.begin() + cudf::distance(scatter_map_begin, scatter_map_end), scatter_map_begin, @@ -226,7 +226,7 @@ struct column_scatterer_impl { auto source_itr = indexalator_factory::make_input_iterator(source_view.indices()); auto new_indices = std::make_unique(target_view.get_indices_annotated(), stream, mr); auto target_itr = indexalator_factory::make_output_iterator(new_indices->mutable_view()); - thrust::scatter(rmm::exec_policy(stream), + thrust::scatter(rmm::exec_policy_nosync(stream), source_itr, source_itr + std::distance(scatter_map_begin, scatter_map_end), scatter_map_begin, diff --git a/cpp/include/cudf/lists/detail/gather.cuh b/cpp/include/cudf/lists/detail/gather.cuh index 7db908c5b52..48c0ed8f6e9 100644 --- a/cpp/include/cudf/lists/detail/gather.cuh +++ b/cpp/include/cudf/lists/detail/gather.cuh @@ -89,7 +89,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, // generate the compacted outgoing offsets. auto count_iter = thrust::make_counting_iterator(0); thrust::transform_exclusive_scan( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), count_iter, count_iter + offset_count, dst_offsets_v.begin(), @@ -125,7 +125,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, // generate the base offsets rmm::device_uvector base_offsets = rmm::device_uvector(output_count, stream); thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), gather_map, gather_map + output_count, base_offsets.data(), diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh index 5d89a9be29c..f4106fb5cdf 100644 --- a/cpp/include/cudf/lists/detail/scatter.cuh +++ b/cpp/include/cudf/lists/detail/scatter.cuh @@ -58,7 +58,7 @@ rmm::device_uvector list_vector_from_column( auto vector = rmm::device_uvector(n_rows, stream, mr); - thrust::transform(rmm::exec_policy(stream), + thrust::transform(rmm::exec_policy_nosync(stream), index_begin, index_end, vector.begin(), @@ -104,7 +104,7 @@ std::unique_ptr scatter_impl( auto const child_column_type = lists_column_view(target).child().type(); // Scatter. - thrust::scatter(rmm::exec_policy(stream), + thrust::scatter(rmm::exec_policy_nosync(stream), source_vector.begin(), source_vector.end(), scatter_map_begin, @@ -239,7 +239,7 @@ std::unique_ptr scatter( : cudf::detail::create_null_mask(1, mask_state::ALL_NULL, stream, mr); auto offset_column = make_numeric_column( data_type{type_to_id()}, 2, mask_state::UNALLOCATED, stream, mr); - thrust::sequence(rmm::exec_policy(stream), + thrust::sequence(rmm::exec_policy_nosync(stream), offset_column->mutable_view().begin(), offset_column->mutable_view().end(), 0, diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 4820e6e77c7..28b98eac3b5 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -305,7 +305,7 @@ std::unique_ptr gather(strings_column_view const& strings, auto const d_in_offsets = (strings_count > 0) ? strings.offsets_begin() : nullptr; auto const d_strings = column_device_view::create(strings.parent(), stream); thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), begin, end, d_out_offsets, @@ -317,7 +317,7 @@ std::unique_ptr gather(strings_column_view const& strings, // check total size is not too large size_t const total_bytes = thrust::transform_reduce( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), d_out_offsets, d_out_offsets + output_count, [] __device__(auto size) { return static_cast(size); }, @@ -327,8 +327,10 @@ std::unique_ptr gather(strings_column_view const& strings, "total size of output strings is too large for a cudf column"); // In-place convert output sizes into offsets - thrust::exclusive_scan( - rmm::exec_policy(stream), d_out_offsets, d_out_offsets + output_count + 1, d_out_offsets); + thrust::exclusive_scan(rmm::exec_policy_nosync(stream), + d_out_offsets, + d_out_offsets + output_count + 1, + d_out_offsets); // build chars column cudf::device_span const d_out_offsets_span(d_out_offsets, output_count + 1); diff --git a/cpp/include/cudf/strings/detail/scatter.cuh b/cpp/include/cudf/strings/detail/scatter.cuh index 7d6a07b4b10..55dd5bda260 100644 --- a/cpp/include/cudf/strings/detail/scatter.cuh +++ b/cpp/include/cudf/strings/detail/scatter.cuh @@ -76,7 +76,8 @@ std::unique_ptr scatter(SourceIterator begin, begin, [] __device__(string_view const sv) { return sv.empty() ? string_view{} : sv; }); // do the scatter - thrust::scatter(rmm::exec_policy(stream), itr, itr + size, scatter_map, target_vector.begin()); + thrust::scatter( + rmm::exec_policy_nosync(stream), itr, itr + size, scatter_map, target_vector.begin()); // build the output column auto sv_span = cudf::device_span(target_vector); diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 4ebe465b945..6083a698560 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -119,7 +119,7 @@ struct column_scalar_scatterer_impl { auto scalar_iter = thrust::make_permutation_iterator(scalar_impl->data(), thrust::make_constant_iterator(0)); - thrust::scatter(rmm::exec_policy(stream), + thrust::scatter(rmm::exec_policy_nosync(stream), scalar_iter, scalar_iter + scatter_rows, scatter_iter, @@ -191,8 +191,11 @@ struct column_scalar_scatterer_impl { auto new_indices = std::make_unique(dict_view.get_indices_annotated(), stream, mr); auto target_iter = indexalator_factory::make_output_iterator(new_indices->mutable_view()); - thrust::scatter( - rmm::exec_policy(stream), scalar_iter, scalar_iter + scatter_rows, scatter_iter, target_iter); + thrust::scatter(rmm::exec_policy_nosync(stream), + scalar_iter, + scalar_iter + scatter_rows, + scatter_iter, + target_iter); // build the dictionary indices column from the result auto const indices_type = new_indices->type(); @@ -383,7 +386,7 @@ std::unique_ptr boolean_mask_scatter(column_view const& input, data_type{type_id::INT32}, target.size(), mask_state::UNALLOCATED, stream); auto mutable_indices = indices->mutable_view(); - thrust::sequence(rmm::exec_policy(stream), + thrust::sequence(rmm::exec_policy_nosync(stream), mutable_indices.begin(), mutable_indices.end(), 0); diff --git a/cpp/src/lists/copying/scatter_helper.cu b/cpp/src/lists/copying/scatter_helper.cu index cbb3aec76c5..ca7ca2f6590 100644 --- a/cpp/src/lists/copying/scatter_helper.cu +++ b/cpp/src/lists/copying/scatter_helper.cu @@ -185,7 +185,7 @@ struct list_child_constructor { mr); thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(child_column->size()), child_column->mutable_view().begin(), @@ -237,7 +237,7 @@ struct list_child_constructor { auto const null_string_view = string_view{nullptr, 0}; // placeholder for factory function thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(string_views.size()), string_views.begin(), @@ -304,7 +304,7 @@ struct list_child_constructor { // For instance, if a parent list_device_view has 3 elements, it should have 3 corresponding // child list_device_view instances. thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(child_list_views.size()), child_list_views.begin(), diff --git a/cpp/tests/copying/detail_gather_tests.cu b/cpp/tests/copying/detail_gather_tests.cu index a8abaa33ac3..bf2937ae8ab 100644 --- a/cpp/tests/copying/detail_gather_tests.cu +++ b/cpp/tests/copying/detail_gather_tests.cu @@ -48,7 +48,7 @@ TYPED_TEST(GatherTest, GatherDetailDeviceVectorTest) constexpr cudf::size_type source_size{1000}; rmm::device_uvector gather_map(source_size, cudf::get_default_stream()); thrust::sequence( - rmm::exec_policy(cudf::get_default_stream()), gather_map.begin(), gather_map.end()); + rmm::exec_policy_nosync(cudf::get_default_stream()), gather_map.begin(), gather_map.end()); auto data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); cudf::test::fixed_width_column_wrapper source_column(data, data + source_size);