From e84b6f82b8a93406ba8ea0ae0a61c3da680f5da0 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 6 Dec 2024 13:08:28 -0800 Subject: [PATCH 1/4] Fix typos and renaming --- cpp/src/stream_compaction/distinct.cu | 4 ++-- cpp/src/stream_compaction/distinct_helpers.cu | 12 ++++++------ cpp/src/stream_compaction/distinct_helpers.hpp | 10 +++++----- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu index 7d11b02d3e1..9ab8ed5938a 100644 --- a/cpp/src/stream_compaction/distinct.cu +++ b/cpp/src/stream_compaction/distinct.cu @@ -95,8 +95,8 @@ rmm::device_uvector distinct_indices(table_view const& input, auto const row_equal = cudf::experimental::row::equality::self_comparator(preprocessed_input); auto const helper_func = [&](auto const& d_equal) { - using RowHasher = std::decay_t; - auto set = hash_set_type{ + using RowEqual = std::decay_t; + auto set = distinct_set_t{ num_rows, 0.5, // desired load factor cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, diff --git a/cpp/src/stream_compaction/distinct_helpers.cu b/cpp/src/stream_compaction/distinct_helpers.cu index c3a004b7f28..aadb438b019 100644 --- a/cpp/src/stream_compaction/distinct_helpers.cu +++ b/cpp/src/stream_compaction/distinct_helpers.cu @@ -21,8 +21,8 @@ namespace cudf::detail { -template -rmm::device_uvector reduce_by_row(hash_set_type& set, +template +rmm::device_uvector reduce_by_row(distinct_set_t& set, size_type num_rows, duplicate_keep_option keep, rmm::cuda_stream_view stream, @@ -100,7 +100,7 @@ rmm::device_uvector reduce_by_row(hash_set_type& set, } template rmm::device_uvector reduce_by_row( - hash_set_type>& set, @@ -110,7 +110,7 @@ template rmm::device_uvector reduce_by_row( rmm::device_async_resource_ref mr); template rmm::device_uvector reduce_by_row( - hash_set_type>& set, @@ -120,7 +120,7 @@ template rmm::device_uvector reduce_by_row( rmm::device_async_resource_ref mr); template rmm::device_uvector reduce_by_row( - hash_set_type>& set, @@ -130,7 +130,7 @@ template rmm::device_uvector reduce_by_row( rmm::device_async_resource_ref mr); template rmm::device_uvector reduce_by_row( - hash_set_type>& set, diff --git a/cpp/src/stream_compaction/distinct_helpers.hpp b/cpp/src/stream_compaction/distinct_helpers.hpp index f15807c2434..339052c3b56 100644 --- a/cpp/src/stream_compaction/distinct_helpers.hpp +++ b/cpp/src/stream_compaction/distinct_helpers.hpp @@ -47,12 +47,12 @@ auto constexpr reduction_init_value(duplicate_keep_option keep) } } -template -using hash_set_type = +template +using distinct_set_t = cuco::static_set, cuda::thread_scope_device, - RowHasher, + RowEqual, cuco::linear_probing<1, cudf::experimental::row::hash::device_row_hasher< cudf::hashing::detail::default_hash, @@ -87,8 +87,8 @@ using hash_set_type = * @param mr Device memory resource used to allocate the returned vector * @return A device_uvector containing the output indices */ -template -rmm::device_uvector reduce_by_row(hash_set_type& set, +template +rmm::device_uvector reduce_by_row(distinct_set_t& set, size_type num_rows, duplicate_keep_option keep, rmm::cuda_stream_view stream, From f181258ff65aa4281a76913a20b1d3f471a3d737 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 6 Dec 2024 13:28:32 -0800 Subject: [PATCH 2/4] Add null_probability benchmark axis for distinct bench --- cpp/benchmarks/stream_compaction/distinct.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/benchmarks/stream_compaction/distinct.cpp b/cpp/benchmarks/stream_compaction/distinct.cpp index d7deebca89a..d6c1ece3fcc 100644 --- a/cpp/benchmarks/stream_compaction/distinct.cpp +++ b/cpp/benchmarks/stream_compaction/distinct.cpp @@ -34,15 +34,16 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list) cudf::size_type const num_rows = state.get_int64("NumRows"); auto const keep = get_keep(state.get_string("keep")); cudf::size_type const cardinality = state.get_int64("cardinality"); - - if (cardinality > num_rows) { + auto const null_probability = state.get_float64("null_probability"); +  if (cardinality > num_rows) + { state.skip("cardinality > num_rows"); return; } data_profile profile = data_profile_builder() .cardinality(cardinality) - .null_probability(0.01) + .null_probability(null_probability) .distribution(cudf::type_to_id(), distribution_id::UNIFORM, static_cast(0), @@ -65,6 +66,7 @@ using data_type = nvbench::type_list; NVBENCH_BENCH_TYPES(nvbench_distinct, NVBENCH_TYPE_AXES(data_type)) .set_name("distinct") .set_type_axes_names({"Type"}) + .add_float64_axis("null_probability", {0.01}) .add_string_axis("keep", {"any", "first", "last", "none"}) .add_int64_axis("cardinality", {100, 100'000, 10'000'000, 1'000'000'000}) .add_int64_axis("NumRows", {100, 100'000, 10'000'000, 1'000'000'000}); From 877f297cb77ad232303cae2472bb8fa1deca8471 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 6 Dec 2024 13:29:39 -0800 Subject: [PATCH 3/4] Add null_probability benchmark axis for distinct bench --- cpp/benchmarks/stream_compaction/distinct.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/stream_compaction/distinct.cpp b/cpp/benchmarks/stream_compaction/distinct.cpp index d6c1ece3fcc..75d04bb4e8e 100644 --- a/cpp/benchmarks/stream_compaction/distinct.cpp +++ b/cpp/benchmarks/stream_compaction/distinct.cpp @@ -35,8 +35,8 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list) auto const keep = get_keep(state.get_string("keep")); cudf::size_type const cardinality = state.get_int64("cardinality"); auto const null_probability = state.get_float64("null_probability"); -  if (cardinality > num_rows) - { + + if (cardinality > num_rows) { state.skip("cardinality > num_rows"); return; } From a9bc9a26a26ddc5867ae76e8d09c4589b54bae54 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 9 Dec 2024 09:30:47 -0800 Subject: [PATCH 4/4] Add missing doc --- cpp/src/stream_compaction/distinct_helpers.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/stream_compaction/distinct_helpers.hpp b/cpp/src/stream_compaction/distinct_helpers.hpp index 339052c3b56..4ca1cab937a 100644 --- a/cpp/src/stream_compaction/distinct_helpers.hpp +++ b/cpp/src/stream_compaction/distinct_helpers.hpp @@ -79,6 +79,8 @@ using distinct_set_t = * the `reduction_init_value()` function. Then, the reduction result for each row group is written * into the output array at the index of an unspecified row in the group. * + * @tparam RowEqual The type of row equality comparator + * * @param set The auxiliary set to perform reduction * @param set_size The number of elements in set * @param num_rows The number of all input rows