Skip to content

Commit

Permalink
generate benchmark input in device (#10109)
Browse files Browse the repository at this point in the history
To speed up benchmark input generation, move all data generation to the device.
To address #5773 (comment)
This PR moves the random input generation to device.

The rest of the original work in this PR was split into multiple PRs and merged:
#10277
#10278
#10279
#10280
#10281
#10300

With all of these changes, a single iteration of all benchmarks runs in under 1000 seconds (down from 3067s to 964s).
Running more iterations would show an even greater benefit, because the benchmark is restarted several times during a run, and each restart calls the benchmark input generation code again.

closes #9857

Authors:
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Vukasin Milovanovic (https://github.com/vuule)
  - David Wendt (https://github.com/davidwendt)

URL: #10109
  • Loading branch information
karthikeyann authored Mar 22, 2022
1 parent 4300ba4 commit 76c772e
Show file tree
Hide file tree
Showing 34 changed files with 918 additions and 1,052 deletions.
6 changes: 3 additions & 3 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

find_package(Threads REQUIRED)

add_library(cudf_datagen STATIC common/generate_input.cpp common/generate_nullmask.cu)
add_library(cudf_datagen STATIC common/generate_input.cu)
target_compile_features(cudf_datagen PUBLIC cxx_std_17 cuda_std_17)

target_compile_options(
Expand Down Expand Up @@ -136,7 +136,7 @@ ConfigureBench(COPY_IF_ELSE_BENCH copying/copy_if_else.cpp)

# ##################################################################################################
# * transpose benchmark ---------------------------------------------------------------------------
ConfigureBench(TRANSPOSE_BENCH transpose/transpose.cu)
ConfigureBench(TRANSPOSE_BENCH transpose/transpose.cpp)

# ##################################################################################################
# * apply_boolean_mask benchmark ------------------------------------------------------------------
Expand All @@ -145,7 +145,7 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp
# ##################################################################################################
# * stream_compaction benchmark -------------------------------------------------------------------
ConfigureNVBench(
STREAM_COMPACTION_BENCH stream_compaction/distinct.cpp stream_compaction/unique.cpp
STREAM_COMPACTION_NVBENCH stream_compaction/distinct.cpp stream_compaction/unique.cpp
)

# ##################################################################################################
Expand Down
3 changes: 2 additions & 1 deletion cpp/benchmarks/ast/transform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <algorithm>
#include <list>
#include <memory>
#include <optional>
#include <vector>

enum class TreeType {
Expand All @@ -48,7 +49,7 @@ static void BM_ast_transform(benchmark::State& state)
auto const source_table =
create_sequence_table(cycle_dtypes({cudf::type_to_id<key_type>()}, n_cols),
row_count{table_size},
Nullable ? 0.5 : -1.0);
Nullable ? std::optional<double>{0.5} : std::nullopt);
auto table = source_table->view();

// Create column references
Expand Down
126 changes: 48 additions & 78 deletions cpp/benchmarks/column/concatenate.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -13,15 +13,16 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cudf/concatenate.hpp>
#include <cudf/table/table.hpp>

#include <cudf_test/column_wrapper.hpp>

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/fixture/templated_benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf_test/column_wrapper.hpp>

#include <cudf/concatenate.hpp>
#include <cudf/table/table.hpp>

#include <thrust/iterator/constant_iterator.h>

#include <algorithm>
Expand All @@ -33,31 +34,14 @@ class Concatenate : public cudf::benchmark {
template <typename T, bool Nullable>
static void BM_concatenate(benchmark::State& state)
{
using column_wrapper = cudf::test::fixed_width_column_wrapper<T>;

auto const num_rows = state.range(0);
auto const num_cols = state.range(1);

// Create owning columns
std::vector<column_wrapper> columns;
columns.reserve(num_cols);
std::generate_n(std::back_inserter(columns), num_cols, [num_rows]() {
auto iter = thrust::make_counting_iterator(0);
if (Nullable) {
auto valid_iter = thrust::make_transform_iterator(iter, [](auto i) { return i % 3 == 0; });
return column_wrapper(iter, iter + num_rows, valid_iter);
} else {
return column_wrapper(iter, iter + num_rows);
}
});
cudf::size_type const num_rows = state.range(0);
cudf::size_type const num_cols = state.range(1);

// Generate column views
std::vector<cudf::column_view> column_views;
column_views.reserve(columns.size());
std::transform(
columns.begin(), columns.end(), std::back_inserter(column_views), [](auto const& col) {
return static_cast<cudf::column_view>(col);
});
auto input = create_sequence_table(cycle_dtypes({cudf::type_to_id<T>()}, num_cols),
row_count{num_rows},
Nullable ? std::optional<double>{2.0 / 3.0} : std::nullopt);
auto input_columns = input->view();
std::vector<cudf::column_view> column_views(input_columns.begin(), input_columns.end());

CHECK_CUDA(0);

Expand All @@ -69,11 +53,13 @@ static void BM_concatenate(benchmark::State& state)
state.SetBytesProcessed(state.iterations() * num_cols * num_rows * sizeof(T));
}

#define CONCAT_BENCHMARK_DEFINE(type, nullable) \
TEMPLATED_BENCHMARK_F(Concatenate, BM_concatenate, type, nullable) \
->RangeMultiplier(8) \
->Ranges({{1 << 6, 1 << 18}, {2, 1024}}) \
->Unit(benchmark::kMillisecond) \
#define CONCAT_BENCHMARK_DEFINE(type, nullable) \
BENCHMARK_DEFINE_F(Concatenate, BM_concatenate##_##nullable_##nullable) \
(::benchmark::State & st) { BM_concatenate<type, nullable>(st); } \
BENCHMARK_REGISTER_F(Concatenate, BM_concatenate##_##nullable_##nullable) \
->RangeMultiplier(8) \
->Ranges({{1 << 6, 1 << 18}, {2, 1024}}) \
->Unit(benchmark::kMillisecond) \
->UseManualTime();

CONCAT_BENCHMARK_DEFINE(int64_t, false)
Expand All @@ -82,42 +68,22 @@ CONCAT_BENCHMARK_DEFINE(int64_t, true)
template <typename T, bool Nullable>
static void BM_concatenate_tables(benchmark::State& state)
{
using column_wrapper = cudf::test::fixed_width_column_wrapper<T>;

auto const num_rows = state.range(0);
auto const num_cols = state.range(1);
auto const num_tables = state.range(2);

// Create owning columns
std::vector<column_wrapper> columns;
columns.reserve(num_cols);
std::generate_n(std::back_inserter(columns), num_cols * num_tables, [num_rows]() {
auto iter = thrust::make_counting_iterator(0);
if (Nullable) {
auto valid_iter = thrust::make_transform_iterator(iter, [](auto i) { return i % 3 == 0; });
return column_wrapper(iter, iter + num_rows, valid_iter);
} else {
return column_wrapper(iter, iter + num_rows);
}
cudf::size_type const num_rows = state.range(0);
cudf::size_type const num_cols = state.range(1);
cudf::size_type const num_tables = state.range(2);

std::vector<std::unique_ptr<cudf::table>> tables(num_tables);
std::generate_n(tables.begin(), num_tables, [&]() {
return create_sequence_table(cycle_dtypes({cudf::type_to_id<T>()}, num_cols),
row_count{num_rows},
Nullable ? std::optional<double>{2.0 / 3.0} : std::nullopt);
});

// Generate column views
std::vector<std::vector<cudf::column_view>> column_views(num_tables);
for (int i = 0; i < num_tables; ++i) {
column_views[i].reserve(num_cols);
auto it = columns.begin() + (i * num_cols);
std::transform(it, it + num_cols, std::back_inserter(column_views[i]), [](auto const& col) {
return static_cast<cudf::column_view>(col);
});
}

// Generate table views
std::vector<cudf::table_view> table_views;
table_views.reserve(num_tables);
std::transform(column_views.begin(),
column_views.end(),
std::back_inserter(table_views),
[](auto const& col_vec) { return cudf::table_view(col_vec); });
std::vector<cudf::table_view> table_views(num_tables);
std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) mutable {
return table->view();
});

CHECK_CUDA(0);

Expand All @@ -129,11 +95,13 @@ static void BM_concatenate_tables(benchmark::State& state)
state.SetBytesProcessed(state.iterations() * num_cols * num_rows * num_tables * sizeof(T));
}

#define CONCAT_TABLES_BENCHMARK_DEFINE(type, nullable) \
TEMPLATED_BENCHMARK_F(Concatenate, BM_concatenate_tables, type, nullable) \
->RangeMultiplier(8) \
->Ranges({{1 << 8, 1 << 12}, {2, 32}, {2, 128}}) \
->Unit(benchmark::kMillisecond) \
#define CONCAT_TABLES_BENCHMARK_DEFINE(type, nullable) \
BENCHMARK_DEFINE_F(Concatenate, BM_concatenate_tables##_##nullable_##nullable) \
(::benchmark::State & st) { BM_concatenate_tables<type, nullable>(st); } \
BENCHMARK_REGISTER_F(Concatenate, BM_concatenate_tables##_##nullable_##nullable) \
->RangeMultiplier(8) \
->Ranges({{1 << 8, 1 << 12}, {2, 32}, {2, 128}}) \
->Unit(benchmark::kMillisecond) \
->UseManualTime();

CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, false)
Expand Down Expand Up @@ -187,11 +155,13 @@ static void BM_concatenate_strings(benchmark::State& state)
(sizeof(int32_t) + num_chars)); // offset + chars
}

#define CONCAT_STRINGS_BENCHMARK_DEFINE(nullable) \
TEMPLATED_BENCHMARK_F(ConcatenateStrings, BM_concatenate_strings, nullable) \
->RangeMultiplier(8) \
->Ranges({{1 << 8, 1 << 14}, {8, 128}, {2, 256}}) \
->Unit(benchmark::kMillisecond) \
#define CONCAT_STRINGS_BENCHMARK_DEFINE(nullable) \
BENCHMARK_DEFINE_F(Concatenate, BM_concatenate_strings##_##nullable_##nullable) \
(::benchmark::State & st) { BM_concatenate_strings<nullable>(st); } \
BENCHMARK_REGISTER_F(Concatenate, BM_concatenate_strings##_##nullable_##nullable) \
->RangeMultiplier(8) \
->Ranges({{1 << 8, 1 << 14}, {8, 128}, {2, 256}}) \
->Unit(benchmark::kMillisecond) \
->UseManualTime();

CONCAT_STRINGS_BENCHMARK_DEFINE(false)
Expand Down
Loading

0 comments on commit 76c772e

Please sign in to comment.