Skip to content

Commit

Permalink
add templated benchmark with fixture (#9838)
Browse files Browse the repository at this point in the history
BENCHMARK_TEMPLATE_F is for a non-templated benchmark with a templated fixture.
BENCHMARK_F is for a non-templated benchmark with a non-templated fixture.
Google Benchmark does not support a templated benchmark function with a non-templated fixture.

Often, BENCHMARK_TEMPLATE_F is used as a proxy for a templated benchmark, even though a templated fixture is not really required in that case. It also has the limitation of requiring a different name for each template instantiation.
So, this PR extends Google Benchmark to support templated benchmarks with a non-templated fixture.

- [x] Use TEMPLATED_BENCHMARK_F in compiled binary op.
- [x] Use in other relevant benchmarks as well.

Usage:
`TEMPLATED_BENCHMARK_F(FixtureClass,    TemplateFunction, ...);`
`...`  are template arguments

Example:
```
class FixtureClass : public cudf::benchmark {
};

template<typename T, typename U>
void TemplateFunction(benchmark::State& state) {
     for (auto _ : state) {
       // benchmark stuff
     }
}
TEMPLATED_BENCHMARK_F(FixtureClass, TemplateFunction, int, double)->Range(128, 512);
```

Authors:
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - Devavret Makkar (https://github.com/devavret)
  - Mark Harris (https://github.com/harrism)

URL: #9838
  • Loading branch information
karthikeyann authored Dec 8, 2021
1 parent 4579d23 commit e6b0661
Show file tree
Hide file tree
Showing 5 changed files with 171 additions and 135 deletions.
62 changes: 17 additions & 45 deletions cpp/benchmarks/ast/transform_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

#include <benchmark/benchmark.h>
#include <fixture/benchmark_fixture.hpp>
#include <fixture/templated_benchmark_fixture.hpp>
#include <synchronization/synchronization.hpp>

#include <thrust/iterator/counting_iterator.h>
Expand All @@ -40,7 +41,6 @@ enum class TreeType {
// child column reference
};

template <typename key_type, TreeType tree_type, bool reuse_columns, bool Nullable>
class AST : public cudf::benchmark {
};

Expand Down Expand Up @@ -127,9 +127,22 @@ static void BM_ast_transform(benchmark::State& state)
(tree_levels + 1) * sizeof(key_type));
}

#define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \
BENCHMARK_TEMPLATE_DEFINE_F(AST, name, key_type, tree_type, reuse_columns, nullable) \
(::benchmark::State & st) { BM_ast_transform<key_type, tree_type, reuse_columns, nullable>(st); }
// Adds one two-value argument set to `b` for every (row count, operation count) combination.
// Row counts vary in the outer loop, so all operation counts are emitted for a row count before
// moving on to the next one.
static void CustomRanges(benchmark::internal::Benchmark* b)
{
  std::vector<cudf::size_type> const num_rows_options{
    100'000, 1'000'000, 10'000'000, 100'000'000};
  std::vector<cudf::size_type> const num_ops_options{1, 5, 10};

  for (auto const& num_rows : num_rows_options) {
    for (auto const& num_ops : num_ops_options) {
      b->Args({num_rows, num_ops});
    }
  }
}

// Registers BM_ast_transform<key_type, tree_type, reuse_columns, nullable> with the AST fixture
// through TEMPLATED_BENCHMARK_F. Arguments come from CustomRanges, and runs use manual timing
// reported in milliseconds.
// `name` is accepted but not referenced anywhere in the expansion.
// The expansion already ends in ';', so a ';' after an invocation is an extra empty declaration.
#define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \
TEMPLATED_BENCHMARK_F(AST, BM_ast_transform, key_type, tree_type, reuse_columns, nullable) \
->Apply(CustomRanges) \
->Unit(benchmark::kMillisecond) \
->UseManualTime();

AST_TRANSFORM_BENCHMARK_DEFINE(
ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false);
Expand All @@ -144,44 +157,3 @@ AST_TRANSFORM_BENCHMARK_DEFINE(
ast_int32_imbalanced_reuse_nulls, int32_t, TreeType::IMBALANCED_LEFT, true, true);
AST_TRANSFORM_BENCHMARK_DEFINE(
ast_double_imbalanced_unique_nulls, double, TreeType::IMBALANCED_LEFT, false, true);

static void CustomRanges(benchmark::internal::Benchmark* b)
{
auto row_counts = std::vector<cudf::size_type>{100'000, 1'000'000, 10'000'000, 100'000'000};
auto operation_counts = std::vector<cudf::size_type>{1, 5, 10};
for (auto const& row_count : row_counts) {
for (auto const& operation_count : operation_counts) {
b->Args({row_count, operation_count});
}
}
}

BENCHMARK_REGISTER_F(AST, ast_int32_imbalanced_unique)
->Apply(CustomRanges)
->Unit(benchmark::kMillisecond)
->UseManualTime();

BENCHMARK_REGISTER_F(AST, ast_int32_imbalanced_reuse)
->Apply(CustomRanges)
->Unit(benchmark::kMillisecond)
->UseManualTime();

BENCHMARK_REGISTER_F(AST, ast_double_imbalanced_unique)
->Apply(CustomRanges)
->Unit(benchmark::kMillisecond)
->UseManualTime();

BENCHMARK_REGISTER_F(AST, ast_int32_imbalanced_unique_nulls)
->Apply(CustomRanges)
->Unit(benchmark::kMillisecond)
->UseManualTime();

BENCHMARK_REGISTER_F(AST, ast_int32_imbalanced_reuse_nulls)
->Apply(CustomRanges)
->Unit(benchmark::kMillisecond)
->UseManualTime();

BENCHMARK_REGISTER_F(AST, ast_double_imbalanced_unique_nulls)
->Apply(CustomRanges)
->Unit(benchmark::kMillisecond)
->UseManualTime();
100 changes: 50 additions & 50 deletions cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/

#include <fixture/benchmark_fixture.hpp>
#include <fixture/templated_benchmark_fixture.hpp>
#include <synchronization/synchronization.hpp>

#include <cudf_test/column_wrapper.hpp>
Expand All @@ -23,12 +24,11 @@

#include <thrust/iterator/counting_iterator.h>

template <typename TypeLhs, typename TypeRhs, typename TypeOut, cudf::binary_operator>
class COMPILED_BINARYOP : public cudf::benchmark {
};

template <typename TypeLhs, typename TypeRhs, typename TypeOut>
void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop)
template <typename TypeLhs, typename TypeRhs, typename TypeOut, cudf::binary_operator binop>
void BM_compiled_binaryop(benchmark::State& state)
{
const cudf::size_type column_size{(cudf::size_type)state.range(0)};

Expand All @@ -50,56 +50,56 @@ void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop)
}

// TODO tparam boolean for null.
#define BINARYOP_BENCHMARK_DEFINE(name, TypeLhs, TypeRhs, binop, TypeOut) \
BENCHMARK_TEMPLATE_DEFINE_F( \
COMPILED_BINARYOP, name, TypeLhs, TypeRhs, TypeOut, cudf::binary_operator::binop) \
(::benchmark::State & st) \
{ \
BM_compiled_binaryop<TypeLhs, TypeRhs, TypeOut>(st, cudf::binary_operator::binop); \
} \
BENCHMARK_REGISTER_F(COMPILED_BINARYOP, name) \
->Unit(benchmark::kMicrosecond) \
->UseManualTime() \
->Arg(10000) /* 10k */ \
->Arg(100000) /* 100k */ \
->Arg(1000000) /* 1M */ \
->Arg(10000000) /* 10M */ \
// Registers BM_compiled_binaryop<TypeLhs, TypeRhs, TypeOut, cudf::binary_operator::binop> with
// the COMPILED_BINARYOP fixture through TEMPLATED_BENCHMARK_F. Runs use manual timing reported
// in microseconds, once for each of the single-argument values listed below (10k to 100M).
// `binop` is the bare enumerator name; the macro prefixes it with cudf::binary_operator::.
// The expansion already ends in ';', so a ';' after an invocation is an extra empty declaration.
#define BINARYOP_BENCHMARK_DEFINE(TypeLhs, TypeRhs, binop, TypeOut) \
TEMPLATED_BENCHMARK_F(COMPILED_BINARYOP, \
BM_compiled_binaryop, \
TypeLhs, \
TypeRhs, \
TypeOut, \
cudf::binary_operator::binop) \
->Unit(benchmark::kMicrosecond) \
->UseManualTime() \
->Arg(10000) /* 10k */ \
->Arg(100000) /* 100k */ \
->Arg(1000000) /* 1M */ \
->Arg(10000000) /* 10M */ \
->Arg(100000000); /* 100M */

using namespace cudf;
using namespace numeric;

// clang-format off
BINARYOP_BENCHMARK_DEFINE(ADD_1, float, float, ADD, float);
BINARYOP_BENCHMARK_DEFINE(ADD_2, timestamp_s, duration_s, ADD, timestamp_s);
BINARYOP_BENCHMARK_DEFINE(SUB_1, duration_s, duration_D, SUB, duration_ms);
BINARYOP_BENCHMARK_DEFINE(SUB_2, int64_t, int64_t, SUB, int64_t);
BINARYOP_BENCHMARK_DEFINE(MUL_1, float, float, MUL, int64_t);
BINARYOP_BENCHMARK_DEFINE(MUL_2, duration_s, int64_t, MUL, duration_s);
BINARYOP_BENCHMARK_DEFINE(DIV_1, int64_t, int64_t, DIV, int64_t);
BINARYOP_BENCHMARK_DEFINE(DIV_2, duration_ms, int32_t, DIV, duration_ms);
BINARYOP_BENCHMARK_DEFINE(TRUE_DIV, int64_t, int64_t, TRUE_DIV, int64_t);
BINARYOP_BENCHMARK_DEFINE(FLOOR_DIV, int64_t, int64_t, FLOOR_DIV, int64_t);
BINARYOP_BENCHMARK_DEFINE(MOD_1, double, double, MOD, double);
BINARYOP_BENCHMARK_DEFINE(MOD_2, duration_ms, int64_t, MOD, duration_ms);
BINARYOP_BENCHMARK_DEFINE(PMOD, int32_t, int64_t, PMOD, double);
BINARYOP_BENCHMARK_DEFINE(PYMOD, int32_t, uint8_t, PYMOD, int64_t);
BINARYOP_BENCHMARK_DEFINE(POW, int64_t, int64_t, POW, double);
BINARYOP_BENCHMARK_DEFINE(LOG_BASE, float, double, LOG_BASE, double);
BINARYOP_BENCHMARK_DEFINE(ATAN2, float, double, ATAN2, double);
BINARYOP_BENCHMARK_DEFINE(SHIFT_LEFT, int, int, SHIFT_LEFT, int);
BINARYOP_BENCHMARK_DEFINE(SHIFT_RIGHT, int16_t, int64_t, SHIFT_RIGHT, int);
BINARYOP_BENCHMARK_DEFINE(USHIFT_RIGHT, int64_t, int32_t, SHIFT_RIGHT_UNSIGNED, int64_t);
BINARYOP_BENCHMARK_DEFINE(BITWISE_AND, int64_t, int32_t, BITWISE_AND, int16_t);
BINARYOP_BENCHMARK_DEFINE(BITWISE_OR, int16_t, int32_t, BITWISE_OR, int64_t);
BINARYOP_BENCHMARK_DEFINE(BITWISE_XOR, int16_t, int64_t, BITWISE_XOR, int32_t);
BINARYOP_BENCHMARK_DEFINE(LOGICAL_AND, double, int8_t, LOGICAL_AND, bool);
BINARYOP_BENCHMARK_DEFINE(LOGICAL_OR, int16_t, int64_t, LOGICAL_OR, bool);
BINARYOP_BENCHMARK_DEFINE(EQUAL_1, int32_t, int64_t, EQUAL, bool);
BINARYOP_BENCHMARK_DEFINE(EQUAL_2, duration_ms, duration_ns, EQUAL, bool);
BINARYOP_BENCHMARK_DEFINE(NOT_EQUAL, decimal32, decimal32, NOT_EQUAL, bool);
BINARYOP_BENCHMARK_DEFINE(LESS, timestamp_s, timestamp_s, LESS, bool);
BINARYOP_BENCHMARK_DEFINE(GREATER, timestamp_ms, timestamp_s, GREATER, bool);
BINARYOP_BENCHMARK_DEFINE(NULL_EQUALS, duration_ms, duration_ns, NULL_EQUALS, bool);
BINARYOP_BENCHMARK_DEFINE(NULL_MAX, decimal32, decimal32, NULL_MAX, decimal32);
BINARYOP_BENCHMARK_DEFINE(NULL_MIN, timestamp_D, timestamp_s, NULL_MIN, timestamp_s);
BINARYOP_BENCHMARK_DEFINE(float, int64_t, ADD, int32_t);
BINARYOP_BENCHMARK_DEFINE(float, float, ADD, float);
BINARYOP_BENCHMARK_DEFINE(timestamp_s, duration_s, ADD, timestamp_s);
BINARYOP_BENCHMARK_DEFINE(duration_s, duration_D, SUB, duration_ms);
BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, SUB, int64_t);
BINARYOP_BENCHMARK_DEFINE(float, float, MUL, int64_t);
BINARYOP_BENCHMARK_DEFINE(duration_s, int64_t, MUL, duration_s);
BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, DIV, int64_t);
BINARYOP_BENCHMARK_DEFINE(duration_ms, int32_t, DIV, duration_ms);
BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, TRUE_DIV, int64_t);
BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, FLOOR_DIV, int64_t);
BINARYOP_BENCHMARK_DEFINE(double, double, MOD, double);
BINARYOP_BENCHMARK_DEFINE(duration_ms, int64_t, MOD, duration_ms);
BINARYOP_BENCHMARK_DEFINE(int32_t, int64_t, PMOD, double);
BINARYOP_BENCHMARK_DEFINE(int32_t, uint8_t, PYMOD, int64_t);
BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, POW, double);
BINARYOP_BENCHMARK_DEFINE(float, double, LOG_BASE, double);
BINARYOP_BENCHMARK_DEFINE(float, double, ATAN2, double);
BINARYOP_BENCHMARK_DEFINE(int, int, SHIFT_LEFT, int);
BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, SHIFT_RIGHT, int);
BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, SHIFT_RIGHT_UNSIGNED, int64_t);
BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, BITWISE_AND, int16_t);
BINARYOP_BENCHMARK_DEFINE(int16_t, int32_t, BITWISE_OR, int64_t);
BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, BITWISE_XOR, int32_t);
BINARYOP_BENCHMARK_DEFINE(double, int8_t, LOGICAL_AND, bool);
BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, LOGICAL_OR, bool);
BINARYOP_BENCHMARK_DEFINE(int32_t, int64_t, EQUAL, bool);
BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, EQUAL, bool);
BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NOT_EQUAL, bool);
BINARYOP_BENCHMARK_DEFINE(timestamp_s, timestamp_s, LESS, bool);
BINARYOP_BENCHMARK_DEFINE(timestamp_ms, timestamp_s, GREATER, bool);
BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, NULL_EQUALS, bool);
BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NULL_MAX, decimal32);
BINARYOP_BENCHMARK_DEFINE(timestamp_D, timestamp_s, NULL_MIN, timestamp_s);
51 changes: 22 additions & 29 deletions cpp/benchmarks/column/concatenate_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@
#include <cudf_test/column_wrapper.hpp>

#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/fixture/templated_benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <thrust/iterator/constant_iterator.h>

#include <algorithm>
#include <vector>

template <typename T, bool Nullable>
class Concatenate : public cudf::benchmark {
};

Expand Down Expand Up @@ -69,17 +69,15 @@ static void BM_concatenate(benchmark::State& state)
state.SetBytesProcessed(state.iterations() * num_cols * num_rows * sizeof(T));
}

#define CONCAT_BENCHMARK_DEFINE(name, type, nullable) \
BENCHMARK_TEMPLATE_DEFINE_F(Concatenate, name, type, nullable) \
(::benchmark::State & state) { BM_concatenate<type, nullable>(state); } \
BENCHMARK_REGISTER_F(Concatenate, name) \
->RangeMultiplier(8) \
->Ranges({{1 << 6, 1 << 18}, {2, 1024}}) \
->Unit(benchmark::kMillisecond) \
// Registers BM_concatenate<type, nullable> with the Concatenate fixture through
// TEMPLATED_BENCHMARK_F. Runs use manual timing reported in milliseconds, over two argument
// ranges ([1 << 6, 1 << 18] and [2, 1024]) with a range multiplier of 8.
// NOTE(review): how each range maps to rows/columns is decided inside BM_concatenate; confirm
// there before changing the bounds.
// The expansion already ends in ';'.
#define CONCAT_BENCHMARK_DEFINE(type, nullable) \
TEMPLATED_BENCHMARK_F(Concatenate, BM_concatenate, type, nullable) \
->RangeMultiplier(8) \
->Ranges({{1 << 6, 1 << 18}, {2, 1024}}) \
->Unit(benchmark::kMillisecond) \
->UseManualTime();

CONCAT_BENCHMARK_DEFINE(concat_columns_int64_non_null, int64_t, false)
CONCAT_BENCHMARK_DEFINE(concat_columns_int64_nullable, int64_t, true)
CONCAT_BENCHMARK_DEFINE(int64_t, false)
CONCAT_BENCHMARK_DEFINE(int64_t, true)

template <typename T, bool Nullable>
static void BM_concatenate_tables(benchmark::State& state)
Expand Down Expand Up @@ -131,19 +129,16 @@ static void BM_concatenate_tables(benchmark::State& state)
state.SetBytesProcessed(state.iterations() * num_cols * num_rows * num_tables * sizeof(T));
}

#define CONCAT_TABLES_BENCHMARK_DEFINE(name, type, nullable) \
BENCHMARK_TEMPLATE_DEFINE_F(Concatenate, name, type, nullable) \
(::benchmark::State & state) { BM_concatenate_tables<type, nullable>(state); } \
BENCHMARK_REGISTER_F(Concatenate, name) \
->RangeMultiplier(8) \
->Ranges({{1 << 8, 1 << 12}, {2, 32}, {2, 128}}) \
->Unit(benchmark::kMillisecond) \
// Registers BM_concatenate_tables<type, nullable> with the Concatenate fixture through
// TEMPLATED_BENCHMARK_F. Runs use manual timing reported in milliseconds, over three argument
// ranges ([1 << 8, 1 << 12], [2, 32] and [2, 128]) with a range multiplier of 8.
// NOTE(review): how each range maps to rows/columns/tables is decided inside
// BM_concatenate_tables; confirm there before changing the bounds.
// The expansion already ends in ';'.
#define CONCAT_TABLES_BENCHMARK_DEFINE(type, nullable) \
TEMPLATED_BENCHMARK_F(Concatenate, BM_concatenate_tables, type, nullable) \
->RangeMultiplier(8) \
->Ranges({{1 << 8, 1 << 12}, {2, 32}, {2, 128}}) \
->Unit(benchmark::kMillisecond) \
->UseManualTime();

CONCAT_TABLES_BENCHMARK_DEFINE(concat_tables_int64_non_null, int64_t, false)
CONCAT_TABLES_BENCHMARK_DEFINE(concat_tables_int64_nullable, int64_t, true)
CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, false)
CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, true)

template <bool Nullable>
class ConcatenateStrings : public cudf::benchmark {
};

Expand Down Expand Up @@ -192,14 +187,12 @@ static void BM_concatenate_strings(benchmark::State& state)
(sizeof(int32_t) + num_chars)); // offset + chars
}

#define CONCAT_STRINGS_BENCHMARK_DEFINE(name, nullable) \
BENCHMARK_TEMPLATE_DEFINE_F(ConcatenateStrings, name, nullable) \
(::benchmark::State & state) { BM_concatenate_strings<nullable>(state); } \
BENCHMARK_REGISTER_F(ConcatenateStrings, name) \
->RangeMultiplier(8) \
->Ranges({{1 << 8, 1 << 14}, {8, 128}, {2, 256}}) \
->Unit(benchmark::kMillisecond) \
// Registers BM_concatenate_strings<nullable> with the ConcatenateStrings fixture through
// TEMPLATED_BENCHMARK_F. Runs use manual timing reported in milliseconds, over three argument
// ranges ([1 << 8, 1 << 14], [8, 128] and [2, 256]) with a range multiplier of 8.
// NOTE(review): the meaning of each range is decided inside BM_concatenate_strings; confirm
// there before changing the bounds.
// The expansion already ends in ';'.
#define CONCAT_STRINGS_BENCHMARK_DEFINE(nullable) \
TEMPLATED_BENCHMARK_F(ConcatenateStrings, BM_concatenate_strings, nullable) \
->RangeMultiplier(8) \
->Ranges({{1 << 8, 1 << 14}, {8, 128}, {2, 256}}) \
->Unit(benchmark::kMillisecond) \
->UseManualTime();

CONCAT_STRINGS_BENCHMARK_DEFINE(concat_string_columns_non_null, false)
CONCAT_STRINGS_BENCHMARK_DEFINE(concat_string_columns_nullable, true)
CONCAT_STRINGS_BENCHMARK_DEFINE(false)
CONCAT_STRINGS_BENCHMARK_DEFINE(true)
73 changes: 73 additions & 0 deletions cpp/benchmarks/fixture/templated_benchmark_fixture.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <benchmark/benchmark.h>

namespace cudf {
/**
 * @brief Templated Google Benchmark with fixture
 *
 * Extends Google Benchmark to support templated benchmark functions with a non-templated
 * fixture class.
 *
 * The fixture's SetUp and TearDown methods are called before and after each run of the wrapped
 * benchmark function. Run() makes these calls, so the benchmark function does not invoke them
 * itself.
 *
 * Example:
 *
 * @code
 * template <class T, class U>
 * void my_benchmark(::benchmark::State& state) {
 *   std::vector<T> v1(state.range(0));
 *   std::vector<U> v2(state.range(0));
 *   for (auto _ : state) {
 *     // benchmark stuff
 *   }
 * }
 *
 * TEMPLATED_BENCHMARK_F(cudf::benchmark, my_benchmark, int, double)->Range(128, 512);
 * @endcode
 *
 * @tparam Fixture Fixture class to derive from. It must be default constructible and provide
 * `SetName`, `SetUp`, `TearDown`, and virtual `Run` and `BenchmarkCase` members taking a
 * `::benchmark::State&` (the interface of `::benchmark::Fixture`).
 */
template <class Fixture>
class FunctionTemplateBenchmark : public Fixture {
 public:
  /**
   * @brief Wraps a benchmark function so that it runs inside `Fixture`.
   *
   * @param name Name under which the benchmark is reported; forwarded to `SetName`.
   * @param func Benchmark function invoked by BenchmarkCase(); stored as a plain function
   * pointer.
   */
  FunctionTemplateBenchmark(const char* name, ::benchmark::internal::Function* func)
    : Fixture(), func_(func)
  {
    this->SetName(name);
  }

  /**
   * @brief Runs one benchmark case: fixture set-up, the wrapped function, fixture tear-down.
   *
   * `override` makes the build fail if the fixture's `Run` signature ever stops matching,
   * instead of silently introducing an unrelated virtual function.
   *
   * @param st Benchmark state passed through to SetUp, the wrapped function and TearDown.
   */
  void Run(::benchmark::State& st) override
  {
    this->SetUp(st);
    this->BenchmarkCase(st);
    this->TearDown(st);
  }

 protected:
  /**
   * @brief Forwards the benchmark state to the wrapped function.
   *
   * @param st Benchmark state handed to the wrapped function.
   */
  void BenchmarkCase(::benchmark::State& st) override { func_(st); }

 private:
  ::benchmark::internal::Function* func_;  ///< Wrapped benchmark function
};

/**
 * @brief Registers the function-template instantiation `n<__VA_ARGS__>` as a benchmark that
 * runs inside the fixture `BaseClass`.
 *
 * The benchmark is reported under the name `BaseClass/n<args>`, where `args` is the template
 * argument list exactly as spelled at the call site. The expansion is an expression yielding
 * the value returned by `RegisterBenchmarkInternal`, so configuration calls can be chained
 * directly: `TEMPLATED_BENCHMARK_F(Fix, fn, int)->Range(128, 512);`
 *
 * The wrapper object is created with `new` and handed to `RegisterBenchmarkInternal`.
 * NOTE(review): presumably the benchmark registry takes ownership of it; confirm against the
 * Google Benchmark version in use.
 *
 * `::cudf::` is fully qualified so that the macro still resolves to this header's wrapper when
 * it is expanded inside a scope that declares its own nested `cudf`.
 *
 * @param BaseClass Non-templated fixture class the benchmark runs in.
 * @param n Name of the benchmark function template.
 * @param ... Template arguments used to instantiate `n`.
 */
#define TEMPLATED_BENCHMARK_F(BaseClass, n, ...)                                             \
  BENCHMARK_PRIVATE_DECLARE(n) = (::benchmark::internal::RegisterBenchmarkInternal(          \
    new ::cudf::FunctionTemplateBenchmark<BaseClass>(#BaseClass "/" #n "<" #__VA_ARGS__ ">", \
                                                     n<__VA_ARGS__>)))

} // namespace cudf
Loading

0 comments on commit e6b0661

Please sign in to comment.