diff --git a/cpp/benchmarks/ast/transform_benchmark.cpp b/cpp/benchmarks/ast/transform_benchmark.cpp index fd0a0f7d2c8..c17c288a6d3 100644 --- a/cpp/benchmarks/ast/transform_benchmark.cpp +++ b/cpp/benchmarks/ast/transform_benchmark.cpp @@ -25,6 +25,7 @@ #include #include +#include #include #include @@ -40,7 +41,6 @@ enum class TreeType { // child column reference }; -template class AST : public cudf::benchmark { }; @@ -127,9 +127,22 @@ static void BM_ast_transform(benchmark::State& state) (tree_levels + 1) * sizeof(key_type)); } -#define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(AST, name, key_type, tree_type, reuse_columns, nullable) \ - (::benchmark::State & st) { BM_ast_transform(st); } +static void CustomRanges(benchmark::internal::Benchmark* b) +{ + auto row_counts = std::vector{100'000, 1'000'000, 10'000'000, 100'000'000}; + auto operation_counts = std::vector{1, 5, 10}; + for (auto const& row_count : row_counts) { + for (auto const& operation_count : operation_counts) { + b->Args({row_count, operation_count}); + } + } +} + +#define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \ + TEMPLATED_BENCHMARK_F(AST, BM_ast_transform, key_type, tree_type, reuse_columns, nullable) \ + ->Apply(CustomRanges) \ + ->Unit(benchmark::kMillisecond) \ + ->UseManualTime(); AST_TRANSFORM_BENCHMARK_DEFINE( ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false); @@ -144,44 +157,3 @@ AST_TRANSFORM_BENCHMARK_DEFINE( ast_int32_imbalanced_reuse_nulls, int32_t, TreeType::IMBALANCED_LEFT, true, true); AST_TRANSFORM_BENCHMARK_DEFINE( ast_double_imbalanced_unique_nulls, double, TreeType::IMBALANCED_LEFT, false, true); - -static void CustomRanges(benchmark::internal::Benchmark* b) -{ - auto row_counts = std::vector{100'000, 1'000'000, 10'000'000, 100'000'000}; - auto operation_counts = std::vector{1, 5, 10}; - for (auto const& row_count : row_counts) { - 
for (auto const& operation_count : operation_counts) { - b->Args({row_count, operation_count}); - } - } -} - -BENCHMARK_REGISTER_F(AST, ast_int32_imbalanced_unique) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(AST, ast_int32_imbalanced_reuse) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(AST, ast_double_imbalanced_unique) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(AST, ast_int32_imbalanced_unique_nulls) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(AST, ast_int32_imbalanced_reuse_nulls) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(AST, ast_double_imbalanced_unique_nulls) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); diff --git a/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp b/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp index 8d04f8bdcb2..745d4e354e7 100644 --- a/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp +++ b/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -23,12 +24,11 @@ #include -template class COMPILED_BINARYOP : public cudf::benchmark { }; -template -void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) +template +void BM_compiled_binaryop(benchmark::State& state) { const cudf::size_type column_size{(cudf::size_type)state.range(0)}; @@ -50,56 +50,56 @@ void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) } // TODO tparam boolean for null. 
-#define BINARYOP_BENCHMARK_DEFINE(name, TypeLhs, TypeRhs, binop, TypeOut) \ - BENCHMARK_TEMPLATE_DEFINE_F( \ - COMPILED_BINARYOP, name, TypeLhs, TypeRhs, TypeOut, cudf::binary_operator::binop) \ - (::benchmark::State & st) \ - { \ - BM_compiled_binaryop(st, cudf::binary_operator::binop); \ - } \ - BENCHMARK_REGISTER_F(COMPILED_BINARYOP, name) \ - ->Unit(benchmark::kMicrosecond) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ - ->Arg(10000000) /* 10M */ \ +#define BINARYOP_BENCHMARK_DEFINE(TypeLhs, TypeRhs, binop, TypeOut) \ + TEMPLATED_BENCHMARK_F(COMPILED_BINARYOP, \ + BM_compiled_binaryop, \ + TypeLhs, \ + TypeRhs, \ + TypeOut, \ + cudf::binary_operator::binop) \ + ->Unit(benchmark::kMicrosecond) \ + ->UseManualTime() \ + ->Arg(10000) /* 10k */ \ + ->Arg(100000) /* 100k */ \ + ->Arg(1000000) /* 1M */ \ + ->Arg(10000000) /* 10M */ \ ->Arg(100000000); /* 100M */ using namespace cudf; using namespace numeric; // clang-format off -BINARYOP_BENCHMARK_DEFINE(ADD_1, float, float, ADD, float); -BINARYOP_BENCHMARK_DEFINE(ADD_2, timestamp_s, duration_s, ADD, timestamp_s); -BINARYOP_BENCHMARK_DEFINE(SUB_1, duration_s, duration_D, SUB, duration_ms); -BINARYOP_BENCHMARK_DEFINE(SUB_2, int64_t, int64_t, SUB, int64_t); -BINARYOP_BENCHMARK_DEFINE(MUL_1, float, float, MUL, int64_t); -BINARYOP_BENCHMARK_DEFINE(MUL_2, duration_s, int64_t, MUL, duration_s); -BINARYOP_BENCHMARK_DEFINE(DIV_1, int64_t, int64_t, DIV, int64_t); -BINARYOP_BENCHMARK_DEFINE(DIV_2, duration_ms, int32_t, DIV, duration_ms); -BINARYOP_BENCHMARK_DEFINE(TRUE_DIV, int64_t, int64_t, TRUE_DIV, int64_t); -BINARYOP_BENCHMARK_DEFINE(FLOOR_DIV, int64_t, int64_t, FLOOR_DIV, int64_t); -BINARYOP_BENCHMARK_DEFINE(MOD_1, double, double, MOD, double); -BINARYOP_BENCHMARK_DEFINE(MOD_2, duration_ms, int64_t, MOD, duration_ms); -BINARYOP_BENCHMARK_DEFINE(PMOD, int32_t, int64_t, PMOD, double); -BINARYOP_BENCHMARK_DEFINE(PYMOD, int32_t, uint8_t, PYMOD, int64_t); 
-BINARYOP_BENCHMARK_DEFINE(POW, int64_t, int64_t, POW, double); -BINARYOP_BENCHMARK_DEFINE(LOG_BASE, float, double, LOG_BASE, double); -BINARYOP_BENCHMARK_DEFINE(ATAN2, float, double, ATAN2, double); -BINARYOP_BENCHMARK_DEFINE(SHIFT_LEFT, int, int, SHIFT_LEFT, int); -BINARYOP_BENCHMARK_DEFINE(SHIFT_RIGHT, int16_t, int64_t, SHIFT_RIGHT, int); -BINARYOP_BENCHMARK_DEFINE(USHIFT_RIGHT, int64_t, int32_t, SHIFT_RIGHT_UNSIGNED, int64_t); -BINARYOP_BENCHMARK_DEFINE(BITWISE_AND, int64_t, int32_t, BITWISE_AND, int16_t); -BINARYOP_BENCHMARK_DEFINE(BITWISE_OR, int16_t, int32_t, BITWISE_OR, int64_t); -BINARYOP_BENCHMARK_DEFINE(BITWISE_XOR, int16_t, int64_t, BITWISE_XOR, int32_t); -BINARYOP_BENCHMARK_DEFINE(LOGICAL_AND, double, int8_t, LOGICAL_AND, bool); -BINARYOP_BENCHMARK_DEFINE(LOGICAL_OR, int16_t, int64_t, LOGICAL_OR, bool); -BINARYOP_BENCHMARK_DEFINE(EQUAL_1, int32_t, int64_t, EQUAL, bool); -BINARYOP_BENCHMARK_DEFINE(EQUAL_2, duration_ms, duration_ns, EQUAL, bool); -BINARYOP_BENCHMARK_DEFINE(NOT_EQUAL, decimal32, decimal32, NOT_EQUAL, bool); -BINARYOP_BENCHMARK_DEFINE(LESS, timestamp_s, timestamp_s, LESS, bool); -BINARYOP_BENCHMARK_DEFINE(GREATER, timestamp_ms, timestamp_s, GREATER, bool); -BINARYOP_BENCHMARK_DEFINE(NULL_EQUALS, duration_ms, duration_ns, NULL_EQUALS, bool); -BINARYOP_BENCHMARK_DEFINE(NULL_MAX, decimal32, decimal32, NULL_MAX, decimal32); -BINARYOP_BENCHMARK_DEFINE(NULL_MIN, timestamp_D, timestamp_s, NULL_MIN, timestamp_s); +BINARYOP_BENCHMARK_DEFINE(float, int64_t, ADD, int32_t); +BINARYOP_BENCHMARK_DEFINE(float, float, ADD, float); +BINARYOP_BENCHMARK_DEFINE(timestamp_s, duration_s, ADD, timestamp_s); +BINARYOP_BENCHMARK_DEFINE(duration_s, duration_D, SUB, duration_ms); +BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, SUB, int64_t); +BINARYOP_BENCHMARK_DEFINE(float, float, MUL, int64_t); +BINARYOP_BENCHMARK_DEFINE(duration_s, int64_t, MUL, duration_s); +BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, DIV, int64_t); +BINARYOP_BENCHMARK_DEFINE(duration_ms, int32_t, 
DIV, duration_ms); +BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, TRUE_DIV, int64_t); +BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, FLOOR_DIV, int64_t); +BINARYOP_BENCHMARK_DEFINE(double, double, MOD, double); +BINARYOP_BENCHMARK_DEFINE(duration_ms, int64_t, MOD, duration_ms); +BINARYOP_BENCHMARK_DEFINE(int32_t, int64_t, PMOD, double); +BINARYOP_BENCHMARK_DEFINE(int32_t, uint8_t, PYMOD, int64_t); +BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, POW, double); +BINARYOP_BENCHMARK_DEFINE(float, double, LOG_BASE, double); +BINARYOP_BENCHMARK_DEFINE(float, double, ATAN2, double); +BINARYOP_BENCHMARK_DEFINE(int, int, SHIFT_LEFT, int); +BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, SHIFT_RIGHT, int); +BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, SHIFT_RIGHT_UNSIGNED, int64_t); +BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, BITWISE_AND, int16_t); +BINARYOP_BENCHMARK_DEFINE(int16_t, int32_t, BITWISE_OR, int64_t); +BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, BITWISE_XOR, int32_t); +BINARYOP_BENCHMARK_DEFINE(double, int8_t, LOGICAL_AND, bool); +BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, LOGICAL_OR, bool); +BINARYOP_BENCHMARK_DEFINE(int32_t, int64_t, EQUAL, bool); +BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, EQUAL, bool); +BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NOT_EQUAL, bool); +BINARYOP_BENCHMARK_DEFINE(timestamp_s, timestamp_s, LESS, bool); +BINARYOP_BENCHMARK_DEFINE(timestamp_ms, timestamp_s, GREATER, bool); +BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, NULL_EQUALS, bool); +BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NULL_MAX, decimal32); +BINARYOP_BENCHMARK_DEFINE(timestamp_D, timestamp_s, NULL_MIN, timestamp_s); diff --git a/cpp/benchmarks/column/concatenate_benchmark.cpp b/cpp/benchmarks/column/concatenate_benchmark.cpp index 3634b2f08a2..abca4b4e0f5 100644 --- a/cpp/benchmarks/column/concatenate_benchmark.cpp +++ b/cpp/benchmarks/column/concatenate_benchmark.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -26,7 +27,6 @@ #include 
#include -template class Concatenate : public cudf::benchmark { }; @@ -69,17 +69,15 @@ static void BM_concatenate(benchmark::State& state) state.SetBytesProcessed(state.iterations() * num_cols * num_rows * sizeof(T)); } -#define CONCAT_BENCHMARK_DEFINE(name, type, nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(Concatenate, name, type, nullable) \ - (::benchmark::State & state) { BM_concatenate(state); } \ - BENCHMARK_REGISTER_F(Concatenate, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 18}, {2, 1024}}) \ - ->Unit(benchmark::kMillisecond) \ +#define CONCAT_BENCHMARK_DEFINE(type, nullable) \ + TEMPLATED_BENCHMARK_F(Concatenate, BM_concatenate, type, nullable) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 18}, {2, 1024}}) \ + ->Unit(benchmark::kMillisecond) \ ->UseManualTime(); -CONCAT_BENCHMARK_DEFINE(concat_columns_int64_non_null, int64_t, false) -CONCAT_BENCHMARK_DEFINE(concat_columns_int64_nullable, int64_t, true) +CONCAT_BENCHMARK_DEFINE(int64_t, false) +CONCAT_BENCHMARK_DEFINE(int64_t, true) template static void BM_concatenate_tables(benchmark::State& state) @@ -131,19 +129,16 @@ static void BM_concatenate_tables(benchmark::State& state) state.SetBytesProcessed(state.iterations() * num_cols * num_rows * num_tables * sizeof(T)); } -#define CONCAT_TABLES_BENCHMARK_DEFINE(name, type, nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(Concatenate, name, type, nullable) \ - (::benchmark::State & state) { BM_concatenate_tables(state); } \ - BENCHMARK_REGISTER_F(Concatenate, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 8, 1 << 12}, {2, 32}, {2, 128}}) \ - ->Unit(benchmark::kMillisecond) \ +#define CONCAT_TABLES_BENCHMARK_DEFINE(type, nullable) \ + TEMPLATED_BENCHMARK_F(Concatenate, BM_concatenate_tables, type, nullable) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 8, 1 << 12}, {2, 32}, {2, 128}}) \ + ->Unit(benchmark::kMillisecond) \ ->UseManualTime(); -CONCAT_TABLES_BENCHMARK_DEFINE(concat_tables_int64_non_null, int64_t, false) 
-CONCAT_TABLES_BENCHMARK_DEFINE(concat_tables_int64_nullable, int64_t, true) +CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, false) +CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, true) -template class ConcatenateStrings : public cudf::benchmark { }; @@ -192,14 +187,12 @@ static void BM_concatenate_strings(benchmark::State& state) (sizeof(int32_t) + num_chars)); // offset + chars } -#define CONCAT_STRINGS_BENCHMARK_DEFINE(name, nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(ConcatenateStrings, name, nullable) \ - (::benchmark::State & state) { BM_concatenate_strings(state); } \ - BENCHMARK_REGISTER_F(ConcatenateStrings, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 8, 1 << 14}, {8, 128}, {2, 256}}) \ - ->Unit(benchmark::kMillisecond) \ +#define CONCAT_STRINGS_BENCHMARK_DEFINE(nullable) \ + TEMPLATED_BENCHMARK_F(ConcatenateStrings, BM_concatenate_strings, nullable) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 8, 1 << 14}, {8, 128}, {2, 256}}) \ + ->Unit(benchmark::kMillisecond) \ ->UseManualTime(); -CONCAT_STRINGS_BENCHMARK_DEFINE(concat_string_columns_non_null, false) -CONCAT_STRINGS_BENCHMARK_DEFINE(concat_string_columns_nullable, true) +CONCAT_STRINGS_BENCHMARK_DEFINE(false) +CONCAT_STRINGS_BENCHMARK_DEFINE(true) diff --git a/cpp/benchmarks/fixture/templated_benchmark_fixture.hpp b/cpp/benchmarks/fixture/templated_benchmark_fixture.hpp new file mode 100644 index 00000000000..7d86ed1b95c --- /dev/null +++ b/cpp/benchmarks/fixture/templated_benchmark_fixture.hpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cudf { +/** + * @brief Templated Google Benchmark with fixture + * + * Extends Google benchmarks to support templated benchmarks with a non-templated fixture class. + * + * The SetUp method is called before, and the TearDown method after, each run of a templated benchmark function. + * These methods are called automatically by Google Benchmark. + * + * Example: + * + * @code + * template <class T, class U> + * void my_benchmark(::benchmark::State& state) { + * std::vector<T> v1(state.range(0)); + * std::vector<U> v2(state.range(0)); + * for (auto _ : state) { + * // benchmark stuff + * } + * } + * + * TEMPLATED_BENCHMARK_F(cudf::benchmark, my_benchmark, int, double)->Range(128, 512); + * @endcode + */ +template +class FunctionTemplateBenchmark : public Fixture { + public: + FunctionTemplateBenchmark(const char* name, ::benchmark::internal::Function* func) + : Fixture(), func_(func) + { + this->SetName(name); + } + + virtual void Run(::benchmark::State& st) + { + this->SetUp(st); + this->BenchmarkCase(st); + this->TearDown(st); + } + + private: + ::benchmark::internal::Function* func_; + + protected: + virtual void BenchmarkCase(::benchmark::State& st) { func_(st); } +}; + +#define TEMPLATED_BENCHMARK_F(BaseClass, n, ...) 
\ + BENCHMARK_PRIVATE_DECLARE(n) = (::benchmark::internal::RegisterBenchmarkInternal( \ + new cudf::FunctionTemplateBenchmark(#BaseClass "/" #n "<" #__VA_ARGS__ ">", \ + n<__VA_ARGS__>))) + +} // namespace cudf diff --git a/cpp/benchmarks/string/url_decode_benchmark.cpp b/cpp/benchmarks/string/url_decode_benchmark.cpp index 9cfaaf27286..4dc77cffa1a 100644 --- a/cpp/benchmarks/string/url_decode_benchmark.cpp +++ b/cpp/benchmarks/string/url_decode_benchmark.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -66,7 +67,6 @@ cudf::test::strings_column_wrapper generate_column(cudf::size_type num_rows, return cudf::test::strings_column_wrapper(strings.begin(), strings.end()); } -template class UrlDecode : public cudf::benchmark { }; @@ -88,15 +88,13 @@ void BM_url_decode(benchmark::State& state) (chars_per_row + sizeof(cudf::size_type))); } -#define URLD_BENCHMARK_DEFINE(name, esc_seq_pct) \ - BENCHMARK_TEMPLATE_DEFINE_F(UrlDecode, name, esc_seq_pct) \ - (::benchmark::State & state) { BM_url_decode(state); } \ - BENCHMARK_REGISTER_F(UrlDecode, name) \ - ->Args({100000000, 10}) \ - ->Args({10000000, 100}) \ - ->Args({1000000, 1000}) \ - ->Unit(benchmark::kMillisecond) \ +#define URLD_BENCHMARK_DEFINE(esc_seq_pct) \ + TEMPLATED_BENCHMARK_F(UrlDecode, BM_url_decode, esc_seq_pct) \ + ->Args({100000000, 10}) \ + ->Args({10000000, 100}) \ + ->Args({1000000, 1000}) \ + ->Unit(benchmark::kMillisecond) \ ->UseManualTime(); -URLD_BENCHMARK_DEFINE(url_decode_10pct, 10) -URLD_BENCHMARK_DEFINE(url_decode_50pct, 50) +URLD_BENCHMARK_DEFINE(10) +URLD_BENCHMARK_DEFINE(50)