From bd63a2b54347bb282dfa91bfdcc6f4983bd5ce02 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 7 Mar 2023 14:46:50 -0500 Subject: [PATCH 01/15] Disable inline of row operators for nested column types --- .../cudf/table/experimental/row_operators.cuh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index 2a207d2a5c4..94de4530757 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -381,7 +381,7 @@ class device_row_comparator { template )> - __device__ cuda::std::pair operator()( + __attribute__((noinline)) __device__ cuda::std::pair operator()( size_type const lhs_element_index, size_type const rhs_element_index) const noexcept { column_device_view lcol = _lhs; @@ -415,8 +415,8 @@ class device_row_comparator { template )> - __device__ cuda::std::pair operator()(size_type lhs_element_index, - size_type rhs_element_index) + __attribute__((noinline)) __device__ cuda::std::pair operator()( + size_type lhs_element_index, size_type rhs_element_index) { // only order top-NULLs according to null_order auto const is_l_row_null = _lhs.is_null(lhs_element_index); @@ -1296,8 +1296,8 @@ class device_row_comparator { } template ())> - __device__ bool operator()(size_type const lhs_element_index, - size_type const rhs_element_index) const noexcept + __attribute__((noinline)) __device__ bool operator()( + size_type const lhs_element_index, size_type const rhs_element_index) const noexcept { column_device_view lcol = lhs.slice(lhs_element_index, 1); column_device_view rcol = rhs.slice(rhs_element_index, 1); @@ -1749,8 +1749,8 @@ class device_row_hasher { } template ())> - __device__ hash_value_type operator()(column_device_view const& col, - size_type row_index) const noexcept + __attribute__((noinline)) __device__ hash_value_type + operator()(column_device_view const& col, size_type row_index) const noexcept { auto hash = hash_value_type{0}; column_device_view curr_col = col.slice(row_index, 1); From 1446e08eab34e0cfb8d1a9de8d8618bc00b260b9 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 7 Mar 2023 18:54:31 -0500 Subject: [PATCH 02/15] change attribute(noinline) to noinline --- .../cudf/table/experimental/row_operators.cuh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index 94de4530757..40802f71e33 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -381,7 +381,7 @@ class device_row_comparator { template )> - __attribute__((noinline)) __device__ cuda::std::pair operator()( + __noinline__ __device__ cuda::std::pair operator()( size_type const lhs_element_index, size_type const rhs_element_index) const noexcept { column_device_view lcol = _lhs; @@ -415,7 +415,7 @@ class device_row_comparator { template )> - __attribute__((noinline)) __device__ cuda::std::pair operator()( + __noinline__ __device__ cuda::std::pair operator()( size_type lhs_element_index, size_type rhs_element_index) { // only order top-NULLs according to null_order @@ -1296,8 +1296,8 @@ class device_row_comparator { } template ())> - __attribute__((noinline)) __device__ bool operator()( - size_type const lhs_element_index, size_type const rhs_element_index) const noexcept + __noinline__ __device__ bool operator()(size_type const lhs_element_index, + size_type const rhs_element_index) const noexcept { column_device_view lcol = lhs.slice(lhs_element_index, 1); column_device_view rcol = rhs.slice(rhs_element_index, 1); @@ -1749,8 +1749,8 @@ class device_row_hasher { } template ())> - __attribute__((noinline)) __device__ hash_value_type - operator()(column_device_view const& col, size_type row_index) const noexcept + __noinline__ __device__ hash_value_type operator()(column_device_view const& col, + size_type row_index) const noexcept { auto hash = hash_value_type{0}; column_device_view curr_col = col.slice(row_index, 1); From 2453e7dc91974953023bcbf3fb04ae1a5cf29c54 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 9 Mar 2023 09:08:06 -0500 Subject: [PATCH 03/15] use transform for row-ops intermediate result --- cpp/src/search/contains_scalar.cu | 26 ++++++++++++++++---------- cpp/src/sort/is_sorted.cu | 19 ++++++++++++++----- 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/cpp/src/search/contains_scalar.cu b/cpp/src/search/contains_scalar.cu index 093a1f8f1ed..dd473df0218 100644 --- a/cpp/src/search/contains_scalar.cu +++ b/cpp/src/search/contains_scalar.cu @@ -30,6 +30,7 @@ #include #include +#include namespace cudf { namespace detail { @@ -108,16 +109,21 @@ struct contains_scalar_dispatch { auto const haystack_cdv_ptr = column_device_view::create(haystack, stream); auto const d_comp = comparator.equal_to(nullate::DYNAMIC{has_nulls}); - return thrust::count_if( - rmm::exec_policy(stream), - begin, - end, - [d_comp, check_nulls, d_haystack = *haystack_cdv_ptr] __device__(auto const idx) { - if (check_nulls && d_haystack.is_null_nocheck(static_cast(idx))) { - return false; - } - return d_comp(idx, rhs_index_type{0}); // compare haystack[idx] == needle[0]. - }) > 0; + + auto d_results = rmm::device_uvector(haystack.size(), stream); + thrust::transform( + rmm::exec_policy(stream), + begin, + end, + d_results.begin(), + [d_comp, check_nulls, d_haystack = *haystack_cdv_ptr] __device__(auto const idx) { + if (check_nulls && d_haystack.is_null_nocheck(static_cast(idx))) { + return false; + } + return d_comp(idx, rhs_index_type{0}); // compare haystack[idx] == needle[0]. + }); + + return thrust::count(rmm::exec_policy(stream), d_results.begin(), d_results.end(), true) > 0; } }; diff --git a/cpp/src/sort/is_sorted.cu b/cpp/src/sort/is_sorted.cu index 356c58b1c22..e0655bb089c 100644 --- a/cpp/src/sort/is_sorted.cu +++ b/cpp/src/sort/is_sorted.cu @@ -28,13 +28,15 @@ #include #include +#include #include #include +#include namespace cudf { namespace detail { -auto is_sorted(cudf::table_view const& in, +bool is_sorted(cudf::table_view const& in, std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream) @@ -45,10 +47,17 @@ auto is_sorted(cudf::table_view const& in, if (cudf::detail::has_nested_columns(in)) { auto const device_comparator = comparator.less(has_nested_nulls(in)); - return thrust::is_sorted(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(in.num_rows()), - device_comparator); + auto d_results = rmm::device_uvector(in.num_rows(), stream); + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(in.num_rows()), + d_results.begin(), + [device_comparator] __device__(auto idx) -> bool { + if (idx == 0) { return true; } + return device_comparator(idx - 1, idx); + }); + + return thrust::count(rmm::exec_policy(stream), d_results.begin(), d_results.end(), false) > 0; } else { auto const device_comparator = comparator.less(has_nested_nulls(in)); From 47f1332825b166c66977a71df3e3e56509d2f97a Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 9 Mar 2023 13:16:15 -0500 Subject: [PATCH 04/15] fix bool logic from count() return --- cpp/src/sort/is_sorted.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/sort/is_sorted.cu b/cpp/src/sort/is_sorted.cu index e0655bb089c..133b60e7baa 100644 --- a/cpp/src/sort/is_sorted.cu +++ b/cpp/src/sort/is_sorted.cu @@ -57,7 +57,7 @@ bool is_sorted(cudf::table_view const& in, return device_comparator(idx - 1, idx); }); - return thrust::count(rmm::exec_policy(stream), d_results.begin(), d_results.end(), false) > 0; + return thrust::count(rmm::exec_policy(stream), d_results.begin(), d_results.end(), false) == 0; } else { auto const device_comparator = comparator.less(has_nested_nulls(in)); From 8cdf813f25f36c2cba5fee4e0a3ec10a162aaa2e Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 9 Mar 2023 18:18:15 -0500 Subject: [PATCH 05/15] add transform/count to unique_count --- cpp/src/sort/is_sorted.cu | 3 +-- cpp/src/stream_compaction/unique_count.cu | 16 +++++++++++----- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/cpp/src/sort/is_sorted.cu b/cpp/src/sort/is_sorted.cu index 11c6ed6074a..c84c2fa3458 100644 --- a/cpp/src/sort/is_sorted.cu +++ b/cpp/src/sort/is_sorted.cu @@ -52,8 +52,7 @@ bool is_sorted(cudf::table_view const& in, thrust::make_counting_iterator(in.num_rows()), d_results.begin(), [device_comparator] __device__(auto idx) -> bool { - if (idx == 0) { return true; } - return device_comparator(idx - 1, idx); + return (idx == 0) || device_comparator(idx - 1, idx); }); return thrust::count(rmm::exec_policy(stream), d_results.begin(), d_results.end(), false) == 0; diff --git a/cpp/src/stream_compaction/unique_count.cu b/cpp/src/stream_compaction/unique_count.cu index c7c10438d7a..9420aefe7c8 100644 --- a/cpp/src/stream_compaction/unique_count.cu +++ b/cpp/src/stream_compaction/unique_count.cu @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -75,11 +76,16 @@ cudf::size_type unique_count(table_view const& keys, if (cudf::detail::has_nested_columns(keys)) { auto const comp = row_comp.equal_to(nullate::DYNAMIC{has_nested_nulls(keys)}, nulls_equal); - return thrust::count_if( - rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(keys.num_rows()), - [comp] __device__(cudf::size_type i) { return (i == 0 or not comp(i, i - 1)); }); + + auto d_results = rmm::device_uvector(keys.num_rows(), stream); + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(keys.num_rows()), + d_results.begin(), + [comp] __device__(auto i) { return (i == 0 or not comp(i, i - 1)); }); + + return static_cast( + thrust::count(rmm::exec_policy(stream), d_results.begin(), d_results.end(), true)); } else { auto const comp = row_comp.equal_to(nullate::DYNAMIC{has_nested_nulls(keys)}, nulls_equal); From f8fe0350494c5888d1e6a99b7f07c5b21e39e661 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 13 Mar 2023 19:33:02 -0400 Subject: [PATCH 06/15] undo no-inline declaration --- .../cudf/table/experimental/row_operators.cuh | 14 ++-- cpp/src/stream_compaction/unique.cu | 70 +++++++++++-------- 2 files changed, 46 insertions(+), 38 deletions(-) diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index e8119e05804..58f20adb923 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -381,7 +381,7 @@ class device_row_comparator { template )> - __noinline__ __device__ cuda::std::pair operator()( + __device__ cuda::std::pair operator()( size_type const lhs_element_index, size_type const rhs_element_index) const noexcept { column_device_view lcol = _lhs; @@ -415,8 +415,8 @@ class device_row_comparator { template )> - __noinline__ __device__ cuda::std::pair operator()( - size_type lhs_element_index, size_type rhs_element_index) + __device__ cuda::std::pair operator()(size_type lhs_element_index, + size_type rhs_element_index) { // only order top-NULLs according to null_order auto const is_l_row_null = _lhs.is_null(lhs_element_index); @@ -1294,8 +1294,8 @@ class device_row_comparator { } template ())> - __noinline__ __device__ bool operator()(size_type const lhs_element_index, - size_type const rhs_element_index) const noexcept + __device__ bool operator()(size_type const lhs_element_index, + size_type const rhs_element_index) const noexcept { column_device_view lcol = lhs.slice(lhs_element_index, 1); column_device_view rcol = rhs.slice(rhs_element_index, 1); @@ -1747,8 +1747,8 @@ class device_row_hasher { } template ())> - __noinline__ __device__ hash_value_type operator()(column_device_view const& col, - size_type row_index) const noexcept + __device__ hash_value_type operator()(column_device_view const& col, + size_type row_index) const noexcept { auto hash = hash_value_type{0}; column_device_view curr_col = col.slice(row_index, 1); diff --git a/cpp/src/stream_compaction/unique.cu b/cpp/src/stream_compaction/unique.cu index 511a7b7ae1c..e76cea70f19 100644 --- a/cpp/src/stream_compaction/unique.cu +++ b/cpp/src/stream_compaction/unique.cu @@ -67,38 +67,46 @@ std::unique_ptr unique(table_view const& input, auto comp = cudf::experimental::row::equality::self_comparator(keys_view, stream); - auto const comparator_helper = [&](auto const row_equal) { - // get indices of unique rows - auto result_end = unique_copy(thrust::counting_iterator(0), - thrust::counting_iterator(num_rows), - mutable_view->begin(), - row_equal, - keep, - stream); + size_type const unique_size = [&] { + if (cudf::detail::has_nested_columns(keys_view)) { + auto row_equal = + comp.equal_to(nullate::DYNAMIC{has_nested_nulls(keys_view)}, nulls_equal); + auto d_results = rmm::device_uvector(num_rows, stream); + auto itr = thrust::make_counting_iterator(0); + thrust::transform( + rmm::exec_policy(stream), + itr, + itr + num_rows, + d_results.begin(), + unique_copy_fn{itr, keep, row_equal, num_rows - 1}); + auto result_end = thrust::copy_if(rmm::exec_policy(stream), + itr, + itr + num_rows, + d_results.begin(), + mutable_view->begin(), + thrust::identity{}); + return static_cast(thrust::distance(mutable_view->begin(), result_end)); + } else { + auto row_equal = + comp.equal_to(nullate::DYNAMIC{has_nested_nulls(keys_view)}, nulls_equal); + auto result_end = unique_copy(thrust::counting_iterator(0), + thrust::counting_iterator(num_rows), + mutable_view->begin(), + row_equal, + keep, + stream); + return static_cast(thrust::distance(mutable_view->begin(), result_end)); + } + }(); + auto indices_view = cudf::detail::slice(column_view(*unique_indices), 0, unique_size); - auto indices_view = - cudf::detail::slice(column_view(*unique_indices), - 0, - thrust::distance(mutable_view->begin(), result_end)); - - // gather unique rows and return - return detail::gather(input, - indices_view, - out_of_bounds_policy::DONT_CHECK, - detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - }; - - if (cudf::detail::has_nested_columns(keys_view)) { - auto row_equal = - comp.equal_to(nullate::DYNAMIC{has_nested_nulls(keys_view)}, nulls_equal); - return comparator_helper(row_equal); - } else { - auto row_equal = - comp.equal_to(nullate::DYNAMIC{has_nested_nulls(keys_view)}, nulls_equal); - return comparator_helper(row_equal); - } + // gather unique rows and return + return detail::gather(input, + indices_view, + out_of_bounds_policy::DONT_CHECK, + detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); } } // namespace detail From 3ccd0a41eee324aea264616b0e7ede316185ced0 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 16 Mar 2023 08:42:23 -0400 Subject: [PATCH 07/15] rework group-nunique to use intermediate buffer --- cpp/src/groupby/sort/group_nunique.cu | 36 ++++++++++++++++----------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/cpp/src/groupby/sort/group_nunique.cu b/cpp/src/groupby/sort/group_nunique.cu index cf81253483e..1a5f1691d5b 100644 --- a/cpp/src/groupby/sort/group_nunique.cu +++ b/cpp/src/groupby/sort/group_nunique.cu @@ -94,21 +94,20 @@ std::unique_ptr group_nunique(column_view const& values, auto const d_values_view = column_device_view::create(values, stream); + auto d_result = rmm::device_uvector(group_labels.size(), stream); + auto const comparator_helper = [&](auto const d_equal) { - auto const is_unique_iterator = - thrust::make_transform_iterator(thrust::counting_iterator(0), - is_unique_iterator_fn{nullate::DYNAMIC{values.has_nulls()}, - *d_values_view, - d_equal, - null_handling, - group_offsets.data(), - group_labels.data()}); - thrust::reduce_by_key(rmm::exec_policy(stream), - group_labels.begin(), - group_labels.end(), - is_unique_iterator, - thrust::make_discard_iterator(), - result->mutable_view().begin()); + auto fn = is_unique_iterator_fn{nullate::DYNAMIC{values.has_nulls()}, + *d_values_view, + d_equal, + null_handling, + group_offsets.data(), + group_labels.data()}; + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(values.size()), + d_result.begin(), + fn); }; if (cudf::detail::has_nested_columns(values_view)) { @@ -121,6 +120,15 @@ std::unique_ptr group_nunique(column_view const& values, comparator_helper(d_equal); } + // calling this with a vector instead of a transform iterator is 10x faster to compile; + // it also helps that we are only calling it once for both conditions + thrust::reduce_by_key(rmm::exec_policy(stream), + group_labels.begin(), + group_labels.end(), + d_result.begin(), + thrust::make_discard_iterator(), + result->mutable_view().begin()); + return result; } From 321c4d43c1d2156b283b29088a608dc4aa2b6aad Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 16 Mar 2023 12:11:20 -0400 Subject: [PATCH 08/15] use temp buffer for reduce-by-key calls --- cpp/src/groupby/sort/group_m2.cu | 15 +++++++++------ cpp/src/groupby/sort/group_std.cu | 15 +++++++++------ cpp/src/groupby/sort/sort_helper.cu | 25 +++++++++++++++++-------- 3 files changed, 35 insertions(+), 20 deletions(-) diff --git a/cpp/src/groupby/sort/group_m2.cu b/cpp/src/groupby/sort/group_m2.cu index edc8b089120..f017f671c3c 100644 --- a/cpp/src/groupby/sort/group_m2.cu +++ b/cpp/src/groupby/sort/group_m2.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,10 +25,12 @@ #include #include +#include #include #include #include +#include namespace cudf { namespace groupby { @@ -62,15 +64,16 @@ void compute_m2_fn(column_device_view const& values, ResultType* d_result, rmm::cuda_stream_view stream) { - auto const var_iter = cudf::detail::make_counting_transform_iterator( - size_type{0}, - m2_transform{ - values, values_iter, d_means, group_labels.data()}); + auto m2_fn = m2_transform{ + values, values_iter, d_means, group_labels.data()}; + auto const itr = thrust::make_counting_iterator(0); + auto m2_vals = rmm::device_uvector(values.size(), stream); + thrust::transform(rmm::exec_policy(stream), itr, itr + values.size(), m2_vals.begin(), m2_fn); thrust::reduce_by_key(rmm::exec_policy(stream), group_labels.begin(), group_labels.end(), - var_iter, + m2_vals.begin(), // var_iter thrust::make_discard_iterator(), d_result); } diff --git a/cpp/src/groupby/sort/group_std.cu b/cpp/src/groupby/sort/group_std.cu index a3efc1f172a..6b2f88a2c81 100644 --- a/cpp/src/groupby/sort/group_std.cu +++ b/cpp/src/groupby/sort/group_std.cu @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -33,6 +34,7 @@ #include #include #include +#include namespace cudf { namespace groupby { @@ -48,7 +50,7 @@ struct var_transform { size_type const* d_group_labels; size_type ddof; - __device__ ResultType operator()(size_type i) + __device__ ResultType operator()(size_type i) const { if (d_values.is_null(i)) return 0.0; @@ -75,15 +77,16 @@ void reduce_by_key_fn(column_device_view const& values, ResultType* d_result, rmm::cuda_stream_view stream) { - auto var_iter = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - var_transform{ - values, values_iter, d_means, d_group_sizes, group_labels.data(), ddof}); + auto var_fn = var_transform{ + values, values_iter, d_means, d_group_sizes, group_labels.data(), ddof}; + auto const itr = thrust::make_counting_iterator(0); + auto vars = rmm::device_uvector(values.size(), stream); + thrust::transform(rmm::exec_policy(stream), itr, itr + values.size(), vars.begin(), var_fn); thrust::reduce_by_key(rmm::exec_policy(stream), group_labels.begin(), group_labels.end(), - var_iter, + vars.begin(), thrust::make_discard_iterator(), d_result); } diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index ebafcd75e6d..c756146d3d7 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -15,6 +15,7 @@ */ #include "common_utils.cuh" +#include #include #include @@ -144,7 +145,8 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_offsets( { if (_group_offsets) return *_group_offsets; - _group_offsets = std::make_unique(num_keys(stream) + 1, stream); + auto const size = num_keys(stream); + _group_offsets = std::make_unique(size + 1, stream); auto const comparator = cudf::experimental::row::equality::self_comparator{_keys, stream}; @@ -154,23 +156,30 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_offsets( if (cudf::detail::has_nested_columns(_keys)) { auto const d_key_equal = comparator.equal_to( cudf::nullate::DYNAMIC{cudf::has_nested_nulls(_keys)}, null_equality::EQUAL); - result_end = thrust::unique_copy(rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(num_keys(stream)), - _group_offsets->begin(), - permuted_row_equality_comparator(d_key_equal, sorted_order)); + auto result = rmm::device_uvector(size, stream); + auto const itr = thrust::make_counting_iterator(0); + auto const row_eq = permuted_row_equality_comparator(d_key_equal, sorted_order); + auto const ufn = cudf::detail::unique_copy_fn{ + itr, duplicate_keep_option::KEEP_FIRST, row_eq, size - 1}; + thrust::transform(rmm::exec_policy(stream), itr, itr + size, result.begin(), ufn); + result_end = thrust::copy_if(rmm::exec_policy(stream), + itr, + itr + size, + result.begin(), + _group_offsets->begin(), + thrust::identity{}); } else { auto const d_key_equal = comparator.equal_to( cudf::nullate::DYNAMIC{cudf::has_nested_nulls(_keys)}, null_equality::EQUAL); result_end = thrust::unique_copy(rmm::exec_policy(stream), thrust::counting_iterator(0), - thrust::counting_iterator(num_keys(stream)), + thrust::counting_iterator(size), _group_offsets->begin(), permuted_row_equality_comparator(d_key_equal, sorted_order)); } size_type num_groups = thrust::distance(_group_offsets->begin(), result_end); - _group_offsets->set_element(num_groups, num_keys(stream), stream); + _group_offsets->set_element(num_groups, size, stream); _group_offsets->resize(num_groups + 1, stream); return *_group_offsets; From 0370e246d269ff1ed45034c69d2596cea90551cd Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 17 Mar 2023 16:15:40 -0400 Subject: [PATCH 09/15] cleanup comments --- cpp/src/groupby/sort/group_m2.cu | 6 ++++-- cpp/src/groupby/sort/group_std.cu | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/cpp/src/groupby/sort/group_m2.cu b/cpp/src/groupby/sort/group_m2.cu index f017f671c3c..5a3e8b8bb7b 100644 --- a/cpp/src/groupby/sort/group_m2.cu +++ b/cpp/src/groupby/sort/group_m2.cu @@ -67,13 +67,15 @@ void compute_m2_fn(column_device_view const& values, auto m2_fn = m2_transform{ values, values_iter, d_means, group_labels.data()}; auto const itr = thrust::make_counting_iterator(0); - auto m2_vals = rmm::device_uvector(values.size(), stream); + // using a temporary buffer for the transform instead of a transform-iterator + // improves compile-time significantly + auto m2_vals = rmm::device_uvector(values.size(), stream); thrust::transform(rmm::exec_policy(stream), itr, itr + values.size(), m2_vals.begin(), m2_fn); thrust::reduce_by_key(rmm::exec_policy(stream), group_labels.begin(), group_labels.end(), - m2_vals.begin(), // var_iter + m2_vals.begin(), thrust::make_discard_iterator(), d_result); } diff --git a/cpp/src/groupby/sort/group_std.cu b/cpp/src/groupby/sort/group_std.cu index 6b2f88a2c81..2e5b0102c7f 100644 --- a/cpp/src/groupby/sort/group_std.cu +++ b/cpp/src/groupby/sort/group_std.cu @@ -80,7 +80,9 @@ void reduce_by_key_fn(column_device_view const& values, auto var_fn = var_transform{ values, values_iter, d_means, d_group_sizes, group_labels.data(), ddof}; auto const itr = thrust::make_counting_iterator(0); - auto vars = rmm::device_uvector(values.size(), stream); + // using a temporary buffer for the transform instead of a transform-iterator + // improves compile-time significantly + auto vars = rmm::device_uvector(values.size(), stream); thrust::transform(rmm::exec_policy(stream), itr, itr + values.size(), vars.begin(), var_fn); thrust::reduce_by_key(rmm::exec_policy(stream), From 6784431ad378d59aa5574f8ca63bc48a9be3fbb8 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 21 Mar 2023 19:26:46 -0400 Subject: [PATCH 10/15] prefer using counting-iterator over factory call --- cpp/src/groupby/sort/group_m2.cu | 2 +- cpp/src/groupby/sort/sort_helper.cu | 1 + cpp/src/sort/is_sorted.cu | 8 ++++---- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cpp/src/groupby/sort/group_m2.cu b/cpp/src/groupby/sort/group_m2.cu index 5a3e8b8bb7b..8e410c9b9c3 100644 --- a/cpp/src/groupby/sort/group_m2.cu +++ b/cpp/src/groupby/sort/group_m2.cu @@ -66,7 +66,7 @@ void compute_m2_fn(column_device_view const& values, { auto m2_fn = m2_transform{ values, values_iter, d_means, group_labels.data()}; - auto const itr = thrust::make_counting_iterator(0); + auto const itr = thrust::counting_iterator(0); // using a temporary buffer for the transform instead of a transform-iterator // improves compile-time significantly auto m2_vals = rmm::device_uvector(values.size(), stream); diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index c756146d3d7..0a3abad42e3 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -15,6 +15,7 @@ */ #include "common_utils.cuh" + #include #include diff --git a/cpp/src/sort/is_sorted.cu b/cpp/src/sort/is_sorted.cu index c84c2fa3458..1acf7886dc8 100644 --- a/cpp/src/sort/is_sorted.cu +++ b/cpp/src/sort/is_sorted.cu @@ -48,8 +48,8 @@ bool is_sorted(cudf::table_view const& in, auto d_results = rmm::device_uvector(in.num_rows(), stream); thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(in.num_rows()), + thrust::counting_iterator(0), + thrust::counting_iterator(in.num_rows()), d_results.begin(), [device_comparator] __device__(auto idx) -> bool { return (idx == 0) || device_comparator(idx - 1, idx); @@ -60,8 +60,8 @@ bool is_sorted(cudf::table_view const& in, auto const device_comparator = comparator.less(has_nested_nulls(in)); return thrust::is_sorted(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(in.num_rows()), + thrust::counting_iterator(0), + thrust::counting_iterator(in.num_rows()), device_comparator); } } From 69da72a84a20606de99bf9f32b122f43154b3f99 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 22 Mar 2023 14:19:54 -0400 Subject: [PATCH 11/15] add benchmarks for unique_count --- cpp/benchmarks/CMakeLists.txt | 2 +- .../stream_compaction/unique_count.cpp | 53 +++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 cpp/benchmarks/stream_compaction/unique_count.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index cc0b642a337..ada4a5ccba0 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -149,7 +149,7 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp # ################################################################################################## # * stream_compaction benchmark ------------------------------------------------------------------- ConfigureNVBench( - STREAM_COMPACTION_NVBENCH stream_compaction/distinct.cpp stream_compaction/unique.cpp + STREAM_COMPACTION_NVBENCH stream_compaction/distinct.cpp stream_compaction/unique.cpp stream_compaction/unique_count.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/stream_compaction/unique_count.cpp b/cpp/benchmarks/stream_compaction/unique_count.cpp new file mode 100644 index 00000000000..f8319e0385c --- /dev/null +++ b/cpp/benchmarks/stream_compaction/unique_count.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include + +#include + +template +void nvbench_unique_count(nvbench::state& state, nvbench::type_list) +{ + auto const num_rows = static_cast(state.get_int64("NumRows")); + auto const nulls = state.get_float64("NullProbability"); + + data_profile profile = data_profile_builder().cardinality(0).null_probability(nulls).distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows / 100); + + auto source_column = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + auto sorted_table = cudf::sort(cudf::table_view({source_column->view()})); + + auto input = sorted_table->view(); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::unique_count(input, cudf::null_equality::EQUAL); + }); +} + +using data_type = nvbench::type_list; + +NVBENCH_BENCH_TYPES(nvbench_unique_count, NVBENCH_TYPE_AXES(data_type)) + .set_name("unique_count") + .set_type_axes_names({"Type"}) + .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}) + .add_float64_axis("NullProbability", {0.0, 0.1}); From 2a4f7debfe149a754b9befa678a723bad0285c07 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 22 Mar 2023 14:20:56 -0400 Subject: [PATCH 12/15] add comments for new code patterns --- cpp/src/groupby/sort/group_m2.cu | 5 +++-- cpp/src/groupby/sort/group_std.cu | 5 +++-- cpp/src/groupby/sort/sort_helper.cu | 3 +++ cpp/src/search/contains_scalar.cu | 3 +++ cpp/src/sort/is_sorted.cu | 3 +++ cpp/src/stream_compaction/unique.cu | 5 +++++ cpp/src/stream_compaction/unique_count.cu | 6 +++++- 7 files changed, 25 insertions(+), 5 deletions(-) diff --git a/cpp/src/groupby/sort/group_m2.cu b/cpp/src/groupby/sort/group_m2.cu index 8e410c9b9c3..70b05100fb0 100644 --- a/cpp/src/groupby/sort/group_m2.cu +++ b/cpp/src/groupby/sort/group_m2.cu @@ -67,8 +67,9 @@ void compute_m2_fn(column_device_view const& values, auto m2_fn = m2_transform{ values, values_iter, d_means, group_labels.data()}; auto const itr = thrust::counting_iterator(0); - // using a temporary buffer for the transform instead of a transform-iterator - // improves compile-time significantly + // Using a temporary buffer for intermediate transform results instead of + // using the transform-iterator directly in thrust::reduce_by_key + // improves compile-time significantly. auto m2_vals = rmm::device_uvector(values.size(), stream); thrust::transform(rmm::exec_policy(stream), itr, itr + values.size(), m2_vals.begin(), m2_fn); diff --git a/cpp/src/groupby/sort/group_std.cu b/cpp/src/groupby/sort/group_std.cu index 2e5b0102c7f..8cd2d8baf4e 100644 --- a/cpp/src/groupby/sort/group_std.cu +++ b/cpp/src/groupby/sort/group_std.cu @@ -80,8 +80,9 @@ void reduce_by_key_fn(column_device_view const& values, auto var_fn = var_transform{ values, values_iter, d_means, d_group_sizes, group_labels.data(), ddof}; auto const itr = thrust::make_counting_iterator(0); - // using a temporary buffer for the transform instead of a transform-iterator - // improves compile-time significantly + // Using a temporary buffer for intermediate transform results instead of + // using the transform-iterator directly in thrust::reduce_by_key + // improves compile-time significantly. auto vars = rmm::device_uvector(values.size(), stream); thrust::transform(rmm::exec_policy(stream), itr, itr + values.size(), vars.begin(), var_fn); diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index 0a3abad42e3..c4e30f85bc2 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -157,6 +157,9 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_offsets( if (cudf::detail::has_nested_columns(_keys)) { auto const d_key_equal = comparator.equal_to( cudf::nullate::DYNAMIC{cudf::has_nested_nulls(_keys)}, null_equality::EQUAL); + // Using a temporary buffer for intermediate transform results from the iterator containing + // the comparator speeds up compile-time significantly without much degradation in + // runtime performance over using the comparator directly in thrust::unique_copy. auto result = rmm::device_uvector(size, stream); auto const itr = thrust::make_counting_iterator(0); auto const row_eq = permuted_row_equality_comparator(d_key_equal, sorted_order); diff --git a/cpp/src/search/contains_scalar.cu b/cpp/src/search/contains_scalar.cu index dd473df0218..7c16a1b12ef 100644 --- a/cpp/src/search/contains_scalar.cu +++ b/cpp/src/search/contains_scalar.cu @@ -110,6 +110,9 @@ struct contains_scalar_dispatch { auto const d_comp = comparator.equal_to(nullate::DYNAMIC{has_nulls}); + // Using a temporary buffer for intermediate transform results from the lambda containing + // the comparator speeds up compile-time significantly without much degradation in + // runtime performance over using the comparator in a transform iterator with thrust::count_if. auto d_results = rmm::device_uvector(haystack.size(), stream); thrust::transform( rmm::exec_policy(stream), diff --git a/cpp/src/sort/is_sorted.cu b/cpp/src/sort/is_sorted.cu index 1acf7886dc8..25c594e9e74 100644 --- a/cpp/src/sort/is_sorted.cu +++ b/cpp/src/sort/is_sorted.cu @@ -46,6 +46,9 @@ bool is_sorted(cudf::table_view const& in, if (cudf::detail::has_nested_columns(in)) { auto const device_comparator = comparator.less(has_nested_nulls(in)); + // Using a temporary buffer for intermediate transform results from the lambda containing + // the comparator speeds up compile-time significantly over using the comparator directly + // in thrust::is_sorted. auto d_results = rmm::device_uvector(in.num_rows(), stream); thrust::transform(rmm::exec_policy(stream), thrust::counting_iterator(0), diff --git a/cpp/src/stream_compaction/unique.cu b/cpp/src/stream_compaction/unique.cu index e76cea70f19..2d81c00e9d9 100644 --- a/cpp/src/stream_compaction/unique.cu +++ b/cpp/src/stream_compaction/unique.cu @@ -69,6 +69,9 @@ std::unique_ptr
unique(table_view const& input, size_type const unique_size = [&] { if (cudf::detail::has_nested_columns(keys_view)) { + // Using a temporary buffer for intermediate transform results from the functor containing + // the comparator speeds up compile-time significantly without much degradation in + // runtime performance over using the comparator directly in thrust::unique_copy. auto row_equal = comp.equal_to(nullate::DYNAMIC{has_nested_nulls(keys_view)}, nulls_equal); auto d_results = rmm::device_uvector(num_rows, stream); @@ -87,6 +90,8 @@ std::unique_ptr
unique(table_view const& input, thrust::identity{}); return static_cast(thrust::distance(mutable_view->begin(), result_end)); } else { + // Using thrust::unique_copy with the comparator directly will compile more slowly but + // improves runtime by up to 2x over the transform/copy_if approach above. auto row_equal = comp.equal_to(nullate::DYNAMIC{has_nested_nulls(keys_view)}, nulls_equal); auto result_end = unique_copy(thrust::counting_iterator(0), diff --git a/cpp/src/stream_compaction/unique_count.cu b/cpp/src/stream_compaction/unique_count.cu index 0344923d9c0..ac9924311c2 100644 --- a/cpp/src/stream_compaction/unique_count.cu +++ b/cpp/src/stream_compaction/unique_count.cu @@ -77,7 +77,9 @@ cudf::size_type unique_count(table_view const& keys, if (cudf::detail::has_nested_columns(keys)) { auto const comp = row_comp.equal_to(nullate::DYNAMIC{has_nested_nulls(keys)}, nulls_equal); - + // Using a temporary buffer for intermediate transform results from the lambda containing + // the comparator speeds up compile-time significantly without much degradation in + // runtime performance over using the comparator directly in thrust::count_if. auto d_results = rmm::device_uvector(keys.num_rows(), stream); thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -90,6 +92,8 @@ cudf::size_type unique_count(table_view const& keys, } else { auto const comp = row_comp.equal_to(nullate::DYNAMIC{has_nested_nulls(keys)}, nulls_equal); + // Using thrust::copy_if with the comparator directly will compile more slowly but + // improves runtime by up to 2x over the transform/count approach above. return thrust::count_if( rmm::exec_policy(stream), thrust::counting_iterator(0), From c5870e9c1e19c07e17044b6ba635ee3cb112376b Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 22 Mar 2023 14:49:45 -0400 Subject: [PATCH 13/15] fix style violation --- cpp/benchmarks/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index ada4a5ccba0..e01d7745e94 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -149,7 +149,8 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp # ################################################################################################## # * stream_compaction benchmark ------------------------------------------------------------------- ConfigureNVBench( - STREAM_COMPACTION_NVBENCH stream_compaction/distinct.cpp stream_compaction/unique.cpp stream_compaction/unique_count.cpp + STREAM_COMPACTION_NVBENCH stream_compaction/distinct.cpp stream_compaction/unique.cpp + stream_compaction/unique_count.cpp ) # ################################################################################################## From c142598f86d2ac81bdc965f3a8f31f6175e418d6 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 23 Mar 2023 11:10:32 -0400 Subject: [PATCH 14/15] test cmake change --- cpp/tests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index bd4077aff4e..48766c175cc 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -584,7 +584,7 @@ ConfigureTest( # variable in that setup. set_tests_properties( STREAM_IDENTIFICATION_TEST - PROPERTIES ENVIRONMENT LD_PRELOAD=$ + PROPERTIES ENVIRONMENT "LD_PRELOAD=$" ) # ################################################################################################## From 00f51307559e6f5e5ab867cef1060353c0cf6a67 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 23 Mar 2023 13:00:08 -0400 Subject: [PATCH 15/15] revert temp cmake change --- cpp/tests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 48766c175cc..bd4077aff4e 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -584,7 +584,7 @@ ConfigureTest( # variable in that setup. set_tests_properties( STREAM_IDENTIFICATION_TEST - PROPERTIES ENVIRONMENT "LD_PRELOAD=$" + PROPERTIES ENVIRONMENT LD_PRELOAD=$ ) # ##################################################################################################