From 1bbe440ee7ddbc021f945e4156220f9bc270a443 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 8 Aug 2024 12:25:29 -0500 Subject: [PATCH] Add keep option to distinct nvbench (#16497) This PR adopts some work from @srinivasyadav18 with additional modifications. This is meant to complement #16484. Authors: - Bradley Dice (https://github.com/bdice) - Srinivas Yadav (https://github.com/srinivasyadav18) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Srinivas Yadav (https://github.com/srinivasyadav18) URL: https://github.com/rapidsai/cudf/pull/16497 --- cpp/benchmarks/CMakeLists.txt | 1 + cpp/benchmarks/stream_compaction/distinct.cpp | 45 ++++++++++++------- .../stream_compaction/stable_distinct.cpp | 45 ++++++++++++------- .../stream_compaction_common.cpp | 35 +++++++++++++++ .../stream_compaction_common.hpp | 19 ++++++++ 5 files changed, 113 insertions(+), 32 deletions(-) create mode 100644 cpp/benchmarks/stream_compaction/stream_compaction_common.cpp create mode 100644 cpp/benchmarks/stream_compaction/stream_compaction_common.hpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 7be456ddfba..483b7b0a539 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -162,6 +162,7 @@ ConfigureNVBench( stream_compaction/distinct.cpp stream_compaction/distinct_count.cpp stream_compaction/stable_distinct.cpp + stream_compaction/stream_compaction_common.cpp stream_compaction/unique.cpp stream_compaction/unique_count.cpp ) diff --git a/cpp/benchmarks/stream_compaction/distinct.cpp b/cpp/benchmarks/stream_compaction/distinct.cpp index c04b6516903..d7deebca89a 100644 --- a/cpp/benchmarks/stream_compaction/distinct.cpp +++ b/cpp/benchmarks/stream_compaction/distinct.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -23,15 +24,29 @@ #include +#include + NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms"); template void nvbench_distinct(nvbench::state& state, nvbench::type_list) { - cudf::size_type const num_rows = state.get_int64("NumRows"); + cudf::size_type const num_rows = state.get_int64("NumRows"); + auto const keep = get_keep(state.get_string("keep")); + cudf::size_type const cardinality = state.get_int64("cardinality"); + + if (cardinality > num_rows) { + state.skip("cardinality > num_rows"); + return; + } - data_profile profile = data_profile_builder().cardinality(0).null_probability(0.01).distribution( - cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); + data_profile profile = data_profile_builder() + .cardinality(cardinality) + .null_probability(0.01) + .distribution(cudf::type_to_id(), + distribution_id::UNIFORM, + static_cast(0), + std::numeric_limits::max()); auto source_column = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); @@ -40,20 +55,19 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = cudf::distinct(input_table, - {0}, - cudf::duplicate_keep_option::KEEP_ANY, - cudf::null_equality::EQUAL, - cudf::nan_equality::ALL_EQUAL); + auto result = cudf::distinct( + input_table, {0}, keep, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL); }); } -using data_type = nvbench::type_list; +using data_type = nvbench::type_list; NVBENCH_BENCH_TYPES(nvbench_distinct, NVBENCH_TYPE_AXES(data_type)) .set_name("distinct") .set_type_axes_names({"Type"}) - .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}); + .add_string_axis("keep", {"any", "first", "last", "none"}) + .add_int64_axis("cardinality", {100, 100'000, 10'000'000, 1'000'000'000}) + .add_int64_axis("NumRows", {100, 100'000, 10'000'000, 1'000'000'000}); template void nvbench_distinct_list(nvbench::state& state, nvbench::type_list) @@ -61,6 +75,7 @@ void nvbench_distinct_list(nvbench::state& state, nvbench::type_list) auto const size = state.get_int64("ColumnSize"); auto const dtype = cudf::type_to_id(); double const null_probability = state.get_float64("null_probability"); + auto const keep = get_keep(state.get_string("keep")); auto builder = data_profile_builder().null_probability(null_probability); if (dtype == cudf::type_id::LIST) { @@ -80,11 +95,8 @@ void nvbench_distinct_list(nvbench::state& state, nvbench::type_list) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = cudf::distinct(*table, - {0}, - cudf::duplicate_keep_option::KEEP_ANY, - cudf::null_equality::EQUAL, - cudf::nan_equality::ALL_EQUAL); + auto result = + cudf::distinct(*table, {0}, keep, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL); }); } @@ -92,5 +104,6 @@ NVBENCH_BENCH_TYPES(nvbench_distinct_list, NVBENCH_TYPE_AXES(nvbench::type_list)) .set_name("distinct_list") .set_type_axes_names({"Type"}) + .add_string_axis("keep", {"any", "first", "last", "none"}) .add_float64_axis("null_probability", {0.0, 0.1}) .add_int64_axis("ColumnSize", {100'000'000}); diff --git a/cpp/benchmarks/stream_compaction/stable_distinct.cpp b/cpp/benchmarks/stream_compaction/stable_distinct.cpp index bcee3048013..0a8836c0583 100644 --- a/cpp/benchmarks/stream_compaction/stable_distinct.cpp +++ b/cpp/benchmarks/stream_compaction/stable_distinct.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -23,15 +24,29 @@ #include +#include + NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms"); template void nvbench_stable_distinct(nvbench::state& state, nvbench::type_list) { - cudf::size_type const num_rows = state.get_int64("NumRows"); + cudf::size_type const num_rows = state.get_int64("NumRows"); + auto const keep = get_keep(state.get_string("keep")); + cudf::size_type const cardinality = state.get_int64("cardinality"); + + if (cardinality > num_rows) { + state.skip("cardinality > num_rows"); + return; + } - data_profile profile = data_profile_builder().cardinality(0).null_probability(0.01).distribution( - cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); + data_profile profile = data_profile_builder() + .cardinality(cardinality) + .null_probability(0.01) + .distribution(cudf::type_to_id(), + distribution_id::UNIFORM, + static_cast(0), + std::numeric_limits::max()); auto source_column = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); @@ -40,20 +55,19 @@ void nvbench_stable_distinct(nvbench::state& state, nvbench::type_list) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = cudf::stable_distinct(input_table, - {0}, - cudf::duplicate_keep_option::KEEP_ANY, - cudf::null_equality::EQUAL, - cudf::nan_equality::ALL_EQUAL); + auto result = cudf::stable_distinct( + input_table, {0}, keep, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL); }); } -using data_type = nvbench::type_list; +using data_type = nvbench::type_list; NVBENCH_BENCH_TYPES(nvbench_stable_distinct, NVBENCH_TYPE_AXES(data_type)) .set_name("stable_distinct") .set_type_axes_names({"Type"}) - .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}); + .add_string_axis("keep", {"any", "first", "last", "none"}) + .add_int64_axis("cardinality", {100, 100'000, 10'000'000, 1'000'000'000}) + .add_int64_axis("NumRows", {100, 100'000, 10'000'000, 1'000'000'000}); template void nvbench_stable_distinct_list(nvbench::state& state, nvbench::type_list) @@ -61,6 +75,7 @@ void nvbench_stable_distinct_list(nvbench::state& state, nvbench::type_list(); double const null_probability = state.get_float64("null_probability"); + auto const keep = get_keep(state.get_string("keep")); auto builder = data_profile_builder().null_probability(null_probability); if (dtype == cudf::type_id::LIST) { @@ -80,11 +95,8 @@ void nvbench_stable_distinct_list(nvbench::state& state, nvbench::type_list)) .set_name("stable_distinct_list") .set_type_axes_names({"Type"}) + .add_string_axis("keep", {"any", "first", "last", "none"}) .add_float64_axis("null_probability", {0.0, 0.1}) .add_int64_axis("ColumnSize", {100'000'000}); diff --git a/cpp/benchmarks/stream_compaction/stream_compaction_common.cpp b/cpp/benchmarks/stream_compaction/stream_compaction_common.cpp new file mode 100644 index 00000000000..8cbb2956777 --- /dev/null +++ b/cpp/benchmarks/stream_compaction/stream_compaction_common.cpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +cudf::duplicate_keep_option get_keep(std::string const& keep_str) +{ + if (keep_str == "any") { + return cudf::duplicate_keep_option::KEEP_ANY; + } else if (keep_str == "first") { + return cudf::duplicate_keep_option::KEEP_FIRST; + } else if (keep_str == "last") { + return cudf::duplicate_keep_option::KEEP_LAST; + } else if (keep_str == "none") { + return cudf::duplicate_keep_option::KEEP_NONE; + } else { + CUDF_FAIL("Unsupported keep option."); + } +} diff --git a/cpp/benchmarks/stream_compaction/stream_compaction_common.hpp b/cpp/benchmarks/stream_compaction/stream_compaction_common.hpp new file mode 100644 index 00000000000..d1ef2b10f41 --- /dev/null +++ b/cpp/benchmarks/stream_compaction/stream_compaction_common.hpp @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +cudf::duplicate_keep_option get_keep(std::string const& keep_str);