Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add templated benchmark with fixture #9838

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 17 additions & 45 deletions cpp/benchmarks/ast/transform_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

#include <benchmark/benchmark.h>
#include <fixture/benchmark_fixture.hpp>
#include <fixture/templated_benchmark_fixture.hpp>
#include <synchronization/synchronization.hpp>

#include <thrust/iterator/counting_iterator.h>
Expand All @@ -40,7 +41,6 @@ enum class TreeType {
// child column reference
};

template <typename key_type, TreeType tree_type, bool reuse_columns, bool Nullable>
class AST : public cudf::benchmark {
};

Expand Down Expand Up @@ -127,9 +127,22 @@ static void BM_ast_transform(benchmark::State& state)
(tree_levels + 1) * sizeof(key_type));
}

#define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \
BENCHMARK_TEMPLATE_DEFINE_F(AST, name, key_type, tree_type, reuse_columns, nullable) \
(::benchmark::State & st) { BM_ast_transform<key_type, tree_type, reuse_columns, nullable>(st); }
// Adds one benchmark argument pair {row count, operation count} for every
// combination of the row counts and operation counts listed below.
static void CustomRanges(benchmark::internal::Benchmark* b)
{
  std::vector<cudf::size_type> const num_rows_options{
    100'000, 1'000'000, 10'000'000, 100'000'000};
  std::vector<cudf::size_type> const num_ops_options{1, 5, 10};

  // Row count is the outer loop, so arguments are registered grouped by row count.
  for (cudf::size_type const num_rows : num_rows_options) {
    for (cudf::size_type const num_ops : num_ops_options) {
      b->Args({num_rows, num_ops});
    }
  }
}

// Registers BM_ast_transform<key_type, tree_type, reuse_columns, nullable> through
// TEMPLATED_BENCHMARK_F with AST as the fixture class, then applies CustomRanges for the
// argument sets, sets the time unit to milliseconds and calls UseManualTime().
// NOTE(review): `name` is accepted but does not appear anywhere in the expansion below.
// NOTE(review): the expansion already ends with ';', so call sites that add their own ';'
// produce an extra empty declaration.
#define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \
TEMPLATED_BENCHMARK_F(AST, BM_ast_transform, key_type, tree_type, reuse_columns, nullable) \
->Apply(CustomRanges) \
->Unit(benchmark::kMillisecond) \
->UseManualTime();

AST_TRANSFORM_BENCHMARK_DEFINE(
ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false);
Expand All @@ -144,44 +157,3 @@ AST_TRANSFORM_BENCHMARK_DEFINE(
ast_int32_imbalanced_reuse_nulls, int32_t, TreeType::IMBALANCED_LEFT, true, true);
AST_TRANSFORM_BENCHMARK_DEFINE(
ast_double_imbalanced_unique_nulls, double, TreeType::IMBALANCED_LEFT, false, true);

// Adds one benchmark argument pair {row count, operation count} for every
// combination of the row counts and operation counts listed below.
static void CustomRanges(benchmark::internal::Benchmark* b)
{
  std::vector<cudf::size_type> const num_rows_options{
    100'000, 1'000'000, 10'000'000, 100'000'000};
  std::vector<cudf::size_type> const num_ops_options{1, 5, 10};

  // Row count is the outer loop, so arguments are registered grouped by row count.
  for (cudf::size_type const num_rows : num_rows_options) {
    for (cudf::size_type const num_ops : num_ops_options) {
      b->Args({num_rows, num_ops});
    }
  }
}

BENCHMARK_REGISTER_F(AST, ast_int32_imbalanced_unique)
->Apply(CustomRanges)
->Unit(benchmark::kMillisecond)
->UseManualTime();

BENCHMARK_REGISTER_F(AST, ast_int32_imbalanced_reuse)
->Apply(CustomRanges)
->Unit(benchmark::kMillisecond)
->UseManualTime();

BENCHMARK_REGISTER_F(AST, ast_double_imbalanced_unique)
->Apply(CustomRanges)
->Unit(benchmark::kMillisecond)
->UseManualTime();

BENCHMARK_REGISTER_F(AST, ast_int32_imbalanced_unique_nulls)
->Apply(CustomRanges)
->Unit(benchmark::kMillisecond)
->UseManualTime();

BENCHMARK_REGISTER_F(AST, ast_int32_imbalanced_reuse_nulls)
->Apply(CustomRanges)
->Unit(benchmark::kMillisecond)
->UseManualTime();

BENCHMARK_REGISTER_F(AST, ast_double_imbalanced_unique_nulls)
->Apply(CustomRanges)
->Unit(benchmark::kMillisecond)
->UseManualTime();
100 changes: 50 additions & 50 deletions cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/

#include <fixture/benchmark_fixture.hpp>
#include <fixture/templated_benchmark_fixture.hpp>
#include <synchronization/synchronization.hpp>

#include <cudf_test/column_wrapper.hpp>
Expand All @@ -23,12 +24,11 @@

#include <thrust/iterator/counting_iterator.h>

template <typename TypeLhs, typename TypeRhs, typename TypeOut, cudf::binary_operator>
class COMPILED_BINARYOP : public cudf::benchmark {
};

template <typename TypeLhs, typename TypeRhs, typename TypeOut>
void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop)
template <typename TypeLhs, typename TypeRhs, typename TypeOut, cudf::binary_operator binop>
void BM_compiled_binaryop(benchmark::State& state)
{
const cudf::size_type column_size{(cudf::size_type)state.range(0)};

Expand All @@ -50,56 +50,56 @@ void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop)
}

// TODO: add a boolean template parameter for nulls.
#define BINARYOP_BENCHMARK_DEFINE(name, TypeLhs, TypeRhs, binop, TypeOut) \
BENCHMARK_TEMPLATE_DEFINE_F( \
COMPILED_BINARYOP, name, TypeLhs, TypeRhs, TypeOut, cudf::binary_operator::binop) \
(::benchmark::State & st) \
{ \
BM_compiled_binaryop<TypeLhs, TypeRhs, TypeOut>(st, cudf::binary_operator::binop); \
} \
BENCHMARK_REGISTER_F(COMPILED_BINARYOP, name) \
->Unit(benchmark::kMicrosecond) \
->UseManualTime() \
->Arg(10000) /* 10k */ \
->Arg(100000) /* 100k */ \
->Arg(1000000) /* 1M */ \
->Arg(10000000) /* 10M */ \
// Registers BM_compiled_binaryop<TypeLhs, TypeRhs, TypeOut, cudf::binary_operator::binop>
// through TEMPLATED_BENCHMARK_F with COMPILED_BINARYOP as the fixture class.
// `binop` is the bare enumerator name; the macro prefixes it with cudf::binary_operator::.
// The benchmark reports in microseconds, calls UseManualTime(), and is run once for each
// single-argument value listed below (10k through 100M).
#define BINARYOP_BENCHMARK_DEFINE(TypeLhs, TypeRhs, binop, TypeOut) \
TEMPLATED_BENCHMARK_F(COMPILED_BINARYOP, \
BM_compiled_binaryop, \
TypeLhs, \
TypeRhs, \
TypeOut, \
cudf::binary_operator::binop) \
->Unit(benchmark::kMicrosecond) \
->UseManualTime() \
->Arg(10000) /* 10k */ \
->Arg(100000) /* 100k */ \
->Arg(1000000) /* 1M */ \
->Arg(10000000) /* 10M */ \
->Arg(100000000); /* 100M */

using namespace cudf;
using namespace numeric;

// clang-format off
BINARYOP_BENCHMARK_DEFINE(ADD_1, float, float, ADD, float);
BINARYOP_BENCHMARK_DEFINE(ADD_2, timestamp_s, duration_s, ADD, timestamp_s);
BINARYOP_BENCHMARK_DEFINE(SUB_1, duration_s, duration_D, SUB, duration_ms);
BINARYOP_BENCHMARK_DEFINE(SUB_2, int64_t, int64_t, SUB, int64_t);
BINARYOP_BENCHMARK_DEFINE(MUL_1, float, float, MUL, int64_t);
BINARYOP_BENCHMARK_DEFINE(MUL_2, duration_s, int64_t, MUL, duration_s);
BINARYOP_BENCHMARK_DEFINE(DIV_1, int64_t, int64_t, DIV, int64_t);
BINARYOP_BENCHMARK_DEFINE(DIV_2, duration_ms, int32_t, DIV, duration_ms);
BINARYOP_BENCHMARK_DEFINE(TRUE_DIV, int64_t, int64_t, TRUE_DIV, int64_t);
BINARYOP_BENCHMARK_DEFINE(FLOOR_DIV, int64_t, int64_t, FLOOR_DIV, int64_t);
BINARYOP_BENCHMARK_DEFINE(MOD_1, double, double, MOD, double);
BINARYOP_BENCHMARK_DEFINE(MOD_2, duration_ms, int64_t, MOD, duration_ms);
BINARYOP_BENCHMARK_DEFINE(PMOD, int32_t, int64_t, PMOD, double);
BINARYOP_BENCHMARK_DEFINE(PYMOD, int32_t, uint8_t, PYMOD, int64_t);
BINARYOP_BENCHMARK_DEFINE(POW, int64_t, int64_t, POW, double);
BINARYOP_BENCHMARK_DEFINE(LOG_BASE, float, double, LOG_BASE, double);
BINARYOP_BENCHMARK_DEFINE(ATAN2, float, double, ATAN2, double);
BINARYOP_BENCHMARK_DEFINE(SHIFT_LEFT, int, int, SHIFT_LEFT, int);
BINARYOP_BENCHMARK_DEFINE(SHIFT_RIGHT, int16_t, int64_t, SHIFT_RIGHT, int);
BINARYOP_BENCHMARK_DEFINE(USHIFT_RIGHT, int64_t, int32_t, SHIFT_RIGHT_UNSIGNED, int64_t);
BINARYOP_BENCHMARK_DEFINE(BITWISE_AND, int64_t, int32_t, BITWISE_AND, int16_t);
BINARYOP_BENCHMARK_DEFINE(BITWISE_OR, int16_t, int32_t, BITWISE_OR, int64_t);
BINARYOP_BENCHMARK_DEFINE(BITWISE_XOR, int16_t, int64_t, BITWISE_XOR, int32_t);
BINARYOP_BENCHMARK_DEFINE(LOGICAL_AND, double, int8_t, LOGICAL_AND, bool);
BINARYOP_BENCHMARK_DEFINE(LOGICAL_OR, int16_t, int64_t, LOGICAL_OR, bool);
BINARYOP_BENCHMARK_DEFINE(EQUAL_1, int32_t, int64_t, EQUAL, bool);
BINARYOP_BENCHMARK_DEFINE(EQUAL_2, duration_ms, duration_ns, EQUAL, bool);
BINARYOP_BENCHMARK_DEFINE(NOT_EQUAL, decimal32, decimal32, NOT_EQUAL, bool);
BINARYOP_BENCHMARK_DEFINE(LESS, timestamp_s, timestamp_s, LESS, bool);
BINARYOP_BENCHMARK_DEFINE(GREATER, timestamp_ms, timestamp_s, GREATER, bool);
BINARYOP_BENCHMARK_DEFINE(NULL_EQUALS, duration_ms, duration_ns, NULL_EQUALS, bool);
BINARYOP_BENCHMARK_DEFINE(NULL_MAX, decimal32, decimal32, NULL_MAX, decimal32);
BINARYOP_BENCHMARK_DEFINE(NULL_MIN, timestamp_D, timestamp_s, NULL_MIN, timestamp_s);
BINARYOP_BENCHMARK_DEFINE(float, int64_t, ADD, int32_t);
BINARYOP_BENCHMARK_DEFINE(float, float, ADD, float);
BINARYOP_BENCHMARK_DEFINE(timestamp_s, duration_s, ADD, timestamp_s);
BINARYOP_BENCHMARK_DEFINE(duration_s, duration_D, SUB, duration_ms);
BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, SUB, int64_t);
BINARYOP_BENCHMARK_DEFINE(float, float, MUL, int64_t);
BINARYOP_BENCHMARK_DEFINE(duration_s, int64_t, MUL, duration_s);
BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, DIV, int64_t);
BINARYOP_BENCHMARK_DEFINE(duration_ms, int32_t, DIV, duration_ms);
BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, TRUE_DIV, int64_t);
BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, FLOOR_DIV, int64_t);
BINARYOP_BENCHMARK_DEFINE(double, double, MOD, double);
BINARYOP_BENCHMARK_DEFINE(duration_ms, int64_t, MOD, duration_ms);
BINARYOP_BENCHMARK_DEFINE(int32_t, int64_t, PMOD, double);
BINARYOP_BENCHMARK_DEFINE(int32_t, uint8_t, PYMOD, int64_t);
BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, POW, double);
BINARYOP_BENCHMARK_DEFINE(float, double, LOG_BASE, double);
BINARYOP_BENCHMARK_DEFINE(float, double, ATAN2, double);
BINARYOP_BENCHMARK_DEFINE(int, int, SHIFT_LEFT, int);
BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, SHIFT_RIGHT, int);
BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, SHIFT_RIGHT_UNSIGNED, int64_t);
BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, BITWISE_AND, int16_t);
BINARYOP_BENCHMARK_DEFINE(int16_t, int32_t, BITWISE_OR, int64_t);
BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, BITWISE_XOR, int32_t);
BINARYOP_BENCHMARK_DEFINE(double, int8_t, LOGICAL_AND, bool);
BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, LOGICAL_OR, bool);
BINARYOP_BENCHMARK_DEFINE(int32_t, int64_t, EQUAL, bool);
BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, EQUAL, bool);
BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NOT_EQUAL, bool);
BINARYOP_BENCHMARK_DEFINE(timestamp_s, timestamp_s, LESS, bool);
BINARYOP_BENCHMARK_DEFINE(timestamp_ms, timestamp_s, GREATER, bool);
BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, NULL_EQUALS, bool);
BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NULL_MAX, decimal32);
BINARYOP_BENCHMARK_DEFINE(timestamp_D, timestamp_s, NULL_MIN, timestamp_s);
51 changes: 22 additions & 29 deletions cpp/benchmarks/column/concatenate_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@
#include <cudf_test/column_wrapper.hpp>

#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/fixture/templated_benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <thrust/iterator/constant_iterator.h>

#include <algorithm>
#include <vector>

template <typename T, bool Nullable>
class Concatenate : public cudf::benchmark {
};

Expand Down Expand Up @@ -69,17 +69,15 @@ static void BM_concatenate(benchmark::State& state)
state.SetBytesProcessed(state.iterations() * num_cols * num_rows * sizeof(T));
}

#define CONCAT_BENCHMARK_DEFINE(name, type, nullable) \
BENCHMARK_TEMPLATE_DEFINE_F(Concatenate, name, type, nullable) \
(::benchmark::State & state) { BM_concatenate<type, nullable>(state); } \
BENCHMARK_REGISTER_F(Concatenate, name) \
->RangeMultiplier(8) \
->Ranges({{1 << 6, 1 << 18}, {2, 1024}}) \
->Unit(benchmark::kMillisecond) \
// Registers BM_concatenate<type, nullable> through TEMPLATED_BENCHMARK_F with Concatenate
// as the fixture class. Two argument ranges are swept with a range multiplier of 8; the
// benchmark reports in milliseconds and calls UseManualTime().
// The expansion already ends with ';', so call sites do not need one.
#define CONCAT_BENCHMARK_DEFINE(type, nullable) \
TEMPLATED_BENCHMARK_F(Concatenate, BM_concatenate, type, nullable) \
->RangeMultiplier(8) \
->Ranges({{1 << 6, 1 << 18}, {2, 1024}}) \
->Unit(benchmark::kMillisecond) \
->UseManualTime();

CONCAT_BENCHMARK_DEFINE(concat_columns_int64_non_null, int64_t, false)
CONCAT_BENCHMARK_DEFINE(concat_columns_int64_nullable, int64_t, true)
CONCAT_BENCHMARK_DEFINE(int64_t, false)
CONCAT_BENCHMARK_DEFINE(int64_t, true)

template <typename T, bool Nullable>
static void BM_concatenate_tables(benchmark::State& state)
Expand Down Expand Up @@ -131,19 +129,16 @@ static void BM_concatenate_tables(benchmark::State& state)
state.SetBytesProcessed(state.iterations() * num_cols * num_rows * num_tables * sizeof(T));
}

#define CONCAT_TABLES_BENCHMARK_DEFINE(name, type, nullable) \
BENCHMARK_TEMPLATE_DEFINE_F(Concatenate, name, type, nullable) \
(::benchmark::State & state) { BM_concatenate_tables<type, nullable>(state); } \
BENCHMARK_REGISTER_F(Concatenate, name) \
->RangeMultiplier(8) \
->Ranges({{1 << 8, 1 << 12}, {2, 32}, {2, 128}}) \
->Unit(benchmark::kMillisecond) \
// Registers BM_concatenate_tables<type, nullable> through TEMPLATED_BENCHMARK_F with
// Concatenate as the fixture class. Three argument ranges are swept with a range multiplier
// of 8; the benchmark reports in milliseconds and calls UseManualTime().
// The expansion already ends with ';', so call sites do not need one.
#define CONCAT_TABLES_BENCHMARK_DEFINE(type, nullable) \
TEMPLATED_BENCHMARK_F(Concatenate, BM_concatenate_tables, type, nullable) \
->RangeMultiplier(8) \
->Ranges({{1 << 8, 1 << 12}, {2, 32}, {2, 128}}) \
->Unit(benchmark::kMillisecond) \
->UseManualTime();

CONCAT_TABLES_BENCHMARK_DEFINE(concat_tables_int64_non_null, int64_t, false)
CONCAT_TABLES_BENCHMARK_DEFINE(concat_tables_int64_nullable, int64_t, true)
CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, false)
CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, true)

template <bool Nullable>
class ConcatenateStrings : public cudf::benchmark {
};

Expand Down Expand Up @@ -192,14 +187,12 @@ static void BM_concatenate_strings(benchmark::State& state)
(sizeof(int32_t) + num_chars)); // offset + chars
}

#define CONCAT_STRINGS_BENCHMARK_DEFINE(name, nullable) \
BENCHMARK_TEMPLATE_DEFINE_F(ConcatenateStrings, name, nullable) \
(::benchmark::State & state) { BM_concatenate_strings<nullable>(state); } \
BENCHMARK_REGISTER_F(ConcatenateStrings, name) \
->RangeMultiplier(8) \
->Ranges({{1 << 8, 1 << 14}, {8, 128}, {2, 256}}) \
->Unit(benchmark::kMillisecond) \
// Registers BM_concatenate_strings<nullable> through TEMPLATED_BENCHMARK_F with
// ConcatenateStrings as the fixture class. Three argument ranges are swept with a range
// multiplier of 8; the benchmark reports in milliseconds and calls UseManualTime().
// The expansion already ends with ';', so call sites do not need one.
#define CONCAT_STRINGS_BENCHMARK_DEFINE(nullable) \
TEMPLATED_BENCHMARK_F(ConcatenateStrings, BM_concatenate_strings, nullable) \
->RangeMultiplier(8) \
->Ranges({{1 << 8, 1 << 14}, {8, 128}, {2, 256}}) \
->Unit(benchmark::kMillisecond) \
->UseManualTime();

CONCAT_STRINGS_BENCHMARK_DEFINE(concat_string_columns_non_null, false)
CONCAT_STRINGS_BENCHMARK_DEFINE(concat_string_columns_nullable, true)
CONCAT_STRINGS_BENCHMARK_DEFINE(false)
CONCAT_STRINGS_BENCHMARK_DEFINE(true)
73 changes: 73 additions & 0 deletions cpp/benchmarks/fixture/templated_benchmark_fixture.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <benchmark/benchmark.h>

namespace cudf {
/**
 * @brief Templated Google Benchmark with fixture
 *
 * Extends Google Benchmark to support templated benchmark functions combined with a
 * non-templated fixture class.
 *
 * The fixture's SetUp method is called before, and its TearDown method after, each run of
 * the templated benchmark function. These methods are called automatically by Google
 * Benchmark.
 *
 * Example:
 *
 * @code
 * template <class T, class U>
 * void my_benchmark(::benchmark::State& state) {
 *   std::vector<T> v1(state.range(0));
 *   std::vector<U> v2(state.range(0));
 *   for (auto _ : state) {
 *     // benchmark stuff
 *   }
 * }
 *
 * TEMPLATED_BENCHMARK_F(cudf::benchmark, my_benchmark, int, double)->Range(128, 512);
 * @endcode
 *
 * @tparam Fixture Fixture class to derive from. It must provide SetName, SetUp, TearDown,
 * and virtual Run and BenchmarkCase members taking a `::benchmark::State&`.
 */
template <class Fixture>
class FunctionTemplateBenchmark : public Fixture {
 public:
  /**
   * @brief Constructs the benchmark and sets its reported name.
   *
   * @param name Name passed to the fixture's SetName
   * @param func Benchmark function invoked by BenchmarkCase
   */
  FunctionTemplateBenchmark(const char* name, ::benchmark::internal::Function* func)
    : Fixture(), func_(func)
  {
    this->SetName(name);
  }

  /**
   * @brief Runs the wrapped function between the fixture's SetUp and TearDown.
   *
   * Declared `override` so that a signature change in the base fixture becomes a compile
   * error instead of silently leaving this function uncalled.
   *
   * @param st Benchmark state forwarded to SetUp, BenchmarkCase and TearDown
   */
  void Run(::benchmark::State& st) override
  {
    this->SetUp(st);
    this->BenchmarkCase(st);
    this->TearDown(st);
  }

 private:
  ::benchmark::internal::Function* func_;  ///< Benchmark function to invoke; not owned

 protected:
  /**
   * @brief Forwards the benchmark state to the stored function.
   *
   * @param st Benchmark state passed to the wrapped function
   */
  void BenchmarkCase(::benchmark::State& st) override { func_(st); }
};

// Creates a cudf::FunctionTemplateBenchmark<BaseClass> that wraps the function template
// instantiation n<__VA_ARGS__> and passes it to RegisterBenchmarkInternal.
// The benchmark name is the string "BaseClass/n<args>", built by stringifying the macro
// arguments.
// The expansion has no trailing ';', so callers can chain configuration calls on the result
// (e.g. ->Range(...), ->Unit(...)) and must end the statement themselves.
// NOTE(review): the object is allocated with `new` and never deleted here; presumably the
// benchmark library takes ownership on registration. TODO confirm.
// NOTE(review): macros ignore namespaces, so defining this inside `namespace cudf` does not
// scope it.
#define TEMPLATED_BENCHMARK_F(BaseClass, n, ...) \
BENCHMARK_PRIVATE_DECLARE(n) = (::benchmark::internal::RegisterBenchmarkInternal( \
new cudf::FunctionTemplateBenchmark<BaseClass>(#BaseClass "/" #n "<" #__VA_ARGS__ ">", \
n<__VA_ARGS__>)))

} // namespace cudf
Loading