From a00466fe30ce6618e8f790ea1337875558867512 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 3 Dec 2021 21:34:29 +0530 Subject: [PATCH 1/3] add templated benchmark with fixture, use in binaryop benchmark --- .../binaryop/compiled_binaryop_benchmark.cpp | 42 ++++++----- .../fixture/templated_benchmark_fixture.hpp | 73 +++++++++++++++++++ 2 files changed, 97 insertions(+), 18 deletions(-) create mode 100644 cpp/benchmarks/fixture/templated_benchmark_fixture.hpp diff --git a/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp b/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp index bc0818ace4b..745d4e354e7 100644 --- a/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp +++ b/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -23,12 +24,11 @@ #include -template class COMPILED_BINARYOP : public cudf::benchmark { }; -template -void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) +template +void BM_compiled_binaryop(benchmark::State& state) { const cudf::size_type column_size{(cudf::size_type)state.range(0)}; @@ -50,20 +50,19 @@ void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) } // TODO tparam boolean for null. 
-#define BINARYOP_BENCHMARK_DEFINE(TypeLhs, TypeRhs, binop, TypeOut) \ - BENCHMARK_TEMPLATE_DEFINE_F( \ - COMPILED_BINARYOP, binop, TypeLhs, TypeRhs, TypeOut, cudf::binary_operator::binop) \ - (::benchmark::State & st) \ - { \ - BM_compiled_binaryop(st, cudf::binary_operator::binop); \ - } \ - BENCHMARK_REGISTER_F(COMPILED_BINARYOP, binop) \ - ->Unit(benchmark::kMicrosecond) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ - ->Arg(10000000) /* 10M */ \ +#define BINARYOP_BENCHMARK_DEFINE(TypeLhs, TypeRhs, binop, TypeOut) \ + TEMPLATED_BENCHMARK_F(COMPILED_BINARYOP, \ + BM_compiled_binaryop, \ + TypeLhs, \ + TypeRhs, \ + TypeOut, \ + cudf::binary_operator::binop) \ + ->Unit(benchmark::kMicrosecond) \ + ->UseManualTime() \ + ->Arg(10000) /* 10k */ \ + ->Arg(100000) /* 100k */ \ + ->Arg(1000000) /* 1M */ \ + ->Arg(10000000) /* 10M */ \ ->Arg(100000000); /* 100M */ using namespace cudf; @@ -71,12 +70,18 @@ using namespace numeric; // clang-format off BINARYOP_BENCHMARK_DEFINE(float, int64_t, ADD, int32_t); +BINARYOP_BENCHMARK_DEFINE(float, float, ADD, float); +BINARYOP_BENCHMARK_DEFINE(timestamp_s, duration_s, ADD, timestamp_s); BINARYOP_BENCHMARK_DEFINE(duration_s, duration_D, SUB, duration_ms); +BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, SUB, int64_t); BINARYOP_BENCHMARK_DEFINE(float, float, MUL, int64_t); +BINARYOP_BENCHMARK_DEFINE(duration_s, int64_t, MUL, duration_s); BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, DIV, int64_t); +BINARYOP_BENCHMARK_DEFINE(duration_ms, int32_t, DIV, duration_ms); BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, TRUE_DIV, int64_t); BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, FLOOR_DIV, int64_t); BINARYOP_BENCHMARK_DEFINE(double, double, MOD, double); +BINARYOP_BENCHMARK_DEFINE(duration_ms, int64_t, MOD, duration_ms); BINARYOP_BENCHMARK_DEFINE(int32_t, int64_t, PMOD, double); BINARYOP_BENCHMARK_DEFINE(int32_t, uint8_t, PYMOD, int64_t); BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, POW, 
double); @@ -90,10 +95,11 @@ BINARYOP_BENCHMARK_DEFINE(int16_t, int32_t, BITWISE_OR, int6 BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, BITWISE_XOR, int32_t); BINARYOP_BENCHMARK_DEFINE(double, int8_t, LOGICAL_AND, bool); BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, LOGICAL_OR, bool); +BINARYOP_BENCHMARK_DEFINE(int32_t, int64_t, EQUAL, bool); BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, EQUAL, bool); BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NOT_EQUAL, bool); BINARYOP_BENCHMARK_DEFINE(timestamp_s, timestamp_s, LESS, bool); BINARYOP_BENCHMARK_DEFINE(timestamp_ms, timestamp_s, GREATER, bool); BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, NULL_EQUALS, bool); BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NULL_MAX, decimal32); -BINARYOP_BENCHMARK_DEFINE(timestamp_D, timestamp_s, NULL_MIN, timestamp_s); +BINARYOP_BENCHMARK_DEFINE(timestamp_D, timestamp_s, NULL_MIN, timestamp_s); diff --git a/cpp/benchmarks/fixture/templated_benchmark_fixture.hpp b/cpp/benchmarks/fixture/templated_benchmark_fixture.hpp new file mode 100644 index 00000000000..68e5ff841b0 --- /dev/null +++ b/cpp/benchmarks/fixture/templated_benchmark_fixture.hpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cudf { +/** + * @brief Templated Google Benchmark with fixture + * + * Extends Google benchmarks to support templated Benchmarks with non-templated fixture class. 
+ * + * The SetUp and TearDown methods are called before each templated benchmark function is run. + * These methods are called automatically by Google Benchmark. + * + * Example: + * + * @code + * template + * void my_benchmark(::benchmark::State& state) { + * std::vector v1(state.range(0)); + * std::vector v2(state.range(0)); + * for (auto _ : state) { + * // benchmark stuff + * } + * } + * + * TEMPLATED_BENCHMARK_F(cudf::benchmark, my_benchmark, int, double)->Range(128, 512); + * @endcode + */ +template +class FunctionTemplateBenchmark : public Fixture { + public: + FunctionTemplateBenchmark(const char* name, ::benchmark::internal::Function* func) + : Fixture(), func_(func) + { + this->SetName(name); + } + + virtual void Run(::benchmark::State& st) + { + this->SetUp(st); + this->BenchmarkCase(st); + this->TearDown(st); + } + + private: + ::benchmark::internal::Function* func_; + + protected: + virtual void BenchmarkCase(::benchmark::State& st) { func_(st); } +}; + +#define TEMPLATED_BENCHMARK_F(BaseClass, n, ...) 
\ + BENCHMARK_PRIVATE_DECLARE(n) = \ + (::benchmark::internal::RegisterBenchmarkInternal(new FunctionTemplateBenchmark( \ + #BaseClass "/" #n "<" #__VA_ARGS__ ">", n<__VA_ARGS__>))) + +} // namespace cudf From de4ef105dcfa7c61b427dc9fc8b33202678cac6a Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 7 Dec 2021 13:28:18 +0530 Subject: [PATCH 2/3] add cudf namespace --- cpp/benchmarks/fixture/templated_benchmark_fixture.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/benchmarks/fixture/templated_benchmark_fixture.hpp b/cpp/benchmarks/fixture/templated_benchmark_fixture.hpp index 68e5ff841b0..7d86ed1b95c 100644 --- a/cpp/benchmarks/fixture/templated_benchmark_fixture.hpp +++ b/cpp/benchmarks/fixture/templated_benchmark_fixture.hpp @@ -65,9 +65,9 @@ class FunctionTemplateBenchmark : public Fixture { virtual void BenchmarkCase(::benchmark::State& st) { func_(st); } }; -#define TEMPLATED_BENCHMARK_F(BaseClass, n, ...) \ - BENCHMARK_PRIVATE_DECLARE(n) = \ - (::benchmark::internal::RegisterBenchmarkInternal(new FunctionTemplateBenchmark( \ - #BaseClass "/" #n "<" #__VA_ARGS__ ">", n<__VA_ARGS__>))) +#define TEMPLATED_BENCHMARK_F(BaseClass, n, ...) 
\ + BENCHMARK_PRIVATE_DECLARE(n) = (::benchmark::internal::RegisterBenchmarkInternal( \ + new cudf::FunctionTemplateBenchmark(#BaseClass "/" #n "<" #__VA_ARGS__ ">", \ + n<__VA_ARGS__>))) } // namespace cudf From 791097ca8251643b48e5273e942ec08808a57c7b Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 7 Dec 2021 13:29:28 +0530 Subject: [PATCH 3/3] add more TEMPLATED_BENCHMARK_F --- cpp/benchmarks/ast/transform_benchmark.cpp | 62 +++++-------------- .../column/concatenate_benchmark.cpp | 51 +++++++-------- .../string/url_decode_benchmark.cpp | 20 +++--- 3 files changed, 48 insertions(+), 85 deletions(-) diff --git a/cpp/benchmarks/ast/transform_benchmark.cpp b/cpp/benchmarks/ast/transform_benchmark.cpp index fd0a0f7d2c8..c17c288a6d3 100644 --- a/cpp/benchmarks/ast/transform_benchmark.cpp +++ b/cpp/benchmarks/ast/transform_benchmark.cpp @@ -25,6 +25,7 @@ #include #include +#include #include #include @@ -40,7 +41,6 @@ enum class TreeType { // child column reference }; -template class AST : public cudf::benchmark { }; @@ -127,9 +127,22 @@ static void BM_ast_transform(benchmark::State& state) (tree_levels + 1) * sizeof(key_type)); } -#define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(AST, name, key_type, tree_type, reuse_columns, nullable) \ - (::benchmark::State & st) { BM_ast_transform(st); } +static void CustomRanges(benchmark::internal::Benchmark* b) +{ + auto row_counts = std::vector{100'000, 1'000'000, 10'000'000, 100'000'000}; + auto operation_counts = std::vector{1, 5, 10}; + for (auto const& row_count : row_counts) { + for (auto const& operation_count : operation_counts) { + b->Args({row_count, operation_count}); + } + } +} + +#define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \ + TEMPLATED_BENCHMARK_F(AST, BM_ast_transform, key_type, tree_type, reuse_columns, nullable) \ + ->Apply(CustomRanges) \ + ->Unit(benchmark::kMillisecond) \ + 
->UseManualTime(); AST_TRANSFORM_BENCHMARK_DEFINE( ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false); @@ -144,44 +157,3 @@ AST_TRANSFORM_BENCHMARK_DEFINE( ast_int32_imbalanced_reuse_nulls, int32_t, TreeType::IMBALANCED_LEFT, true, true); AST_TRANSFORM_BENCHMARK_DEFINE( ast_double_imbalanced_unique_nulls, double, TreeType::IMBALANCED_LEFT, false, true); - -static void CustomRanges(benchmark::internal::Benchmark* b) -{ - auto row_counts = std::vector{100'000, 1'000'000, 10'000'000, 100'000'000}; - auto operation_counts = std::vector{1, 5, 10}; - for (auto const& row_count : row_counts) { - for (auto const& operation_count : operation_counts) { - b->Args({row_count, operation_count}); - } - } -} - -BENCHMARK_REGISTER_F(AST, ast_int32_imbalanced_unique) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(AST, ast_int32_imbalanced_reuse) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(AST, ast_double_imbalanced_unique) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(AST, ast_int32_imbalanced_unique_nulls) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(AST, ast_int32_imbalanced_reuse_nulls) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(AST, ast_double_imbalanced_unique_nulls) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); diff --git a/cpp/benchmarks/column/concatenate_benchmark.cpp b/cpp/benchmarks/column/concatenate_benchmark.cpp index 3634b2f08a2..abca4b4e0f5 100644 --- a/cpp/benchmarks/column/concatenate_benchmark.cpp +++ b/cpp/benchmarks/column/concatenate_benchmark.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -26,7 +27,6 @@ #include #include -template class Concatenate : public cudf::benchmark { }; @@ -69,17 
+69,15 @@ static void BM_concatenate(benchmark::State& state) state.SetBytesProcessed(state.iterations() * num_cols * num_rows * sizeof(T)); } -#define CONCAT_BENCHMARK_DEFINE(name, type, nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(Concatenate, name, type, nullable) \ - (::benchmark::State & state) { BM_concatenate(state); } \ - BENCHMARK_REGISTER_F(Concatenate, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 18}, {2, 1024}}) \ - ->Unit(benchmark::kMillisecond) \ +#define CONCAT_BENCHMARK_DEFINE(type, nullable) \ + TEMPLATED_BENCHMARK_F(Concatenate, BM_concatenate, type, nullable) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 18}, {2, 1024}}) \ + ->Unit(benchmark::kMillisecond) \ ->UseManualTime(); -CONCAT_BENCHMARK_DEFINE(concat_columns_int64_non_null, int64_t, false) -CONCAT_BENCHMARK_DEFINE(concat_columns_int64_nullable, int64_t, true) +CONCAT_BENCHMARK_DEFINE(int64_t, false) +CONCAT_BENCHMARK_DEFINE(int64_t, true) template static void BM_concatenate_tables(benchmark::State& state) @@ -131,19 +129,16 @@ static void BM_concatenate_tables(benchmark::State& state) state.SetBytesProcessed(state.iterations() * num_cols * num_rows * num_tables * sizeof(T)); } -#define CONCAT_TABLES_BENCHMARK_DEFINE(name, type, nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(Concatenate, name, type, nullable) \ - (::benchmark::State & state) { BM_concatenate_tables(state); } \ - BENCHMARK_REGISTER_F(Concatenate, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 8, 1 << 12}, {2, 32}, {2, 128}}) \ - ->Unit(benchmark::kMillisecond) \ +#define CONCAT_TABLES_BENCHMARK_DEFINE(type, nullable) \ + TEMPLATED_BENCHMARK_F(Concatenate, BM_concatenate_tables, type, nullable) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 8, 1 << 12}, {2, 32}, {2, 128}}) \ + ->Unit(benchmark::kMillisecond) \ ->UseManualTime(); -CONCAT_TABLES_BENCHMARK_DEFINE(concat_tables_int64_non_null, int64_t, false) -CONCAT_TABLES_BENCHMARK_DEFINE(concat_tables_int64_nullable, int64_t, true) 
+CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, false) +CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, true) -template class ConcatenateStrings : public cudf::benchmark { }; @@ -192,14 +187,12 @@ static void BM_concatenate_strings(benchmark::State& state) (sizeof(int32_t) + num_chars)); // offset + chars } -#define CONCAT_STRINGS_BENCHMARK_DEFINE(name, nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(ConcatenateStrings, name, nullable) \ - (::benchmark::State & state) { BM_concatenate_strings(state); } \ - BENCHMARK_REGISTER_F(ConcatenateStrings, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 8, 1 << 14}, {8, 128}, {2, 256}}) \ - ->Unit(benchmark::kMillisecond) \ +#define CONCAT_STRINGS_BENCHMARK_DEFINE(nullable) \ + TEMPLATED_BENCHMARK_F(ConcatenateStrings, BM_concatenate_strings, nullable) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 8, 1 << 14}, {8, 128}, {2, 256}}) \ + ->Unit(benchmark::kMillisecond) \ ->UseManualTime(); -CONCAT_STRINGS_BENCHMARK_DEFINE(concat_string_columns_non_null, false) -CONCAT_STRINGS_BENCHMARK_DEFINE(concat_string_columns_nullable, true) +CONCAT_STRINGS_BENCHMARK_DEFINE(false) +CONCAT_STRINGS_BENCHMARK_DEFINE(true) diff --git a/cpp/benchmarks/string/url_decode_benchmark.cpp b/cpp/benchmarks/string/url_decode_benchmark.cpp index 9cfaaf27286..4dc77cffa1a 100644 --- a/cpp/benchmarks/string/url_decode_benchmark.cpp +++ b/cpp/benchmarks/string/url_decode_benchmark.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -66,7 +67,6 @@ cudf::test::strings_column_wrapper generate_column(cudf::size_type num_rows, return cudf::test::strings_column_wrapper(strings.begin(), strings.end()); } -template class UrlDecode : public cudf::benchmark { }; @@ -88,15 +88,13 @@ void BM_url_decode(benchmark::State& state) (chars_per_row + sizeof(cudf::size_type))); } -#define URLD_BENCHMARK_DEFINE(name, esc_seq_pct) \ - BENCHMARK_TEMPLATE_DEFINE_F(UrlDecode, name, esc_seq_pct) \ - (::benchmark::State & state) { BM_url_decode(state); } \ - 
BENCHMARK_REGISTER_F(UrlDecode, name) \ - ->Args({100000000, 10}) \ - ->Args({10000000, 100}) \ - ->Args({1000000, 1000}) \ - ->Unit(benchmark::kMillisecond) \ +#define URLD_BENCHMARK_DEFINE(esc_seq_pct) \ + TEMPLATED_BENCHMARK_F(UrlDecode, BM_url_decode, esc_seq_pct) \ + ->Args({100000000, 10}) \ + ->Args({10000000, 100}) \ + ->Args({1000000, 1000}) \ + ->Unit(benchmark::kMillisecond) \ ->UseManualTime(); -URLD_BENCHMARK_DEFINE(url_decode_10pct, 10) -URLD_BENCHMARK_DEFINE(url_decode_50pct, 50) +URLD_BENCHMARK_DEFINE(10) +URLD_BENCHMARK_DEFINE(50)