Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add decimal types to cuIO benchmarks #9776

Merged
merged 37 commits into from
Dec 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
daa70f8
decimal rand gen
vuule Nov 24, 2021
f1bfee1
expand benchmarks
vuule Nov 24, 2021
d1b09bc
read as dec128 when precision > 18 instead of throwing
vuule Nov 24, 2021
3065bac
update tests to match the new behavior
vuule Nov 24, 2021
c37deee
Merge branch 'bug-dec128-from-precision' of https://github.com/vuule/…
vuule Nov 24, 2021
2ce8412
fix error in make_normal_dist
vuule Nov 25, 2021
6624b44
generate random scale
vuule Nov 25, 2021
0f664f9
Merge branch 'branch-22.02' of https://github.com/rapidsai/cudf into …
vuule Nov 29, 2021
c5966a3
Merge branch 'branch-22.02' of https://github.com/rapidsai/cudf into …
vuule Dec 2, 2021
1fb2164
Merge branch 'branch-22.02' of https://github.com/rapidsai/cudf into …
vuule Dec 3, 2021
761ed54
decode as decimal32
vuule Dec 4, 2021
6e11f75
read as dec32 when possible; update tests; disable dec128 from python
vuule Dec 4, 2021
41ac21c
Merge branch 'branch-22.02' of https://github.com/rapidsai/cudf into …
vuule Dec 4, 2021
7df6e76
set scale for decimal32
vuule Dec 4, 2021
fe03e08
adjust Java test
vuule Dec 4, 2021
b98b4a8
java fix try 2
vuule Dec 4, 2021
1869255
Remove deprecated methods from Java Table class
jlowe Dec 7, 2021
509a153
Merge branch 'branch-22.02' of https://github.com/rapidsai/cudf into …
vuule Dec 7, 2021
fd1c6da
Merge commit 'refs/pull/9853/head' of https://github.com/rapidsai/cud…
vuule Dec 7, 2021
d56b9b9
Merge branch 'bug-dec128-from-precision' of https://github.com/vuule/…
vuule Dec 7, 2021
b6c9d7b
revert Java test changes
vuule Dec 7, 2021
58af480
clean up
vuule Dec 7, 2021
92c689e
Merge branch 'branch-22.02' of https://github.com/rapidsai/cudf into …
vuule Dec 7, 2021
a21bff4
clean up initialization
vuule Dec 7, 2021
deaf954
Merge branch 'branch-22.02' of https://github.com/rapidsai/cudf into …
vuule Dec 7, 2021
8d0bc6d
Merge branch 'bug-dec128-from-precision' of https://github.com/vuule/…
vuule Dec 7, 2021
349d4e2
stylin'
vuule Dec 7, 2021
714284f
Merge branch 'bug-dec128-from-precision' of https://github.com/vuule/…
vuule Dec 7, 2021
1147ca3
Merge branch 'branch-22.02' of https://github.com/rapidsai/cudf into …
vuule Dec 8, 2021
85fea70
cmake style fix
vuule Dec 9, 2021
d2eceaf
Merge branch 'branch-22.02' of https://github.com/rapidsai/cudf into …
vuule Dec 10, 2021
dc70dd3
digits10
vuule Dec 10, 2021
7416c07
T::rep alias
vuule Dec 10, 2021
ca4390b
Merge branch 'branch-22.02' of https://github.com/rapidsai/cudf into …
vuule Dec 10, 2021
c9841c5
Merge branch 'branch-22.02' of https://github.com/rapidsai/cudf into …
vuule Dec 14, 2021
355040c
refactor make_normal_dist to make the range of output values clearer;…
vuule Dec 15, 2021
349e023
rename
vuule Dec 15, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 23 additions & 2 deletions cpp/benchmarks/common/generate_benchmark_input.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,29 @@ struct random_value_fn<T, typename std::enable_if_t<cudf::is_chrono<T>()>> {
*/
// Specialization of random_value_fn selected when cudf::is_fixed_point<T>() holds.
// NOTE(review): this view shows the diff's pre-change and post-change lines together
// without +/- markers, so the struct appears to declare its constructor and
// operator() twice.
template <typename T>
struct random_value_fn<T, typename std::enable_if_t<cudf::is_fixed_point<T>()>> {
// NOTE(review): the next two lines appear to be the pre-change stub (the change summary
// reports 2 deletions): the parameters are ignored and every call fails.
random_value_fn(distribution_params<T> const&) {}
T operator()(std::mt19937& engine) { CUDF_FAIL("Not implemented"); }
// Representation type exposed by T; bounds and raw values are handled in this type.
using rep = typename T::rep;
// Bounds used to clamp every generated raw value.
rep const lower_bound;
rep const upper_bound;
// Generator of raw rep values, built from the requested distribution id and bounds.
distribution_fn<rep> dist;
// Scale applied to every value produced by this functor; empty until the first call.
std::optional<numeric::scale_type> scale;

// Copies the bounds from desc and builds the value generator from desc.id and the same bounds.
random_value_fn(distribution_params<rep> const& desc)
: lower_bound{desc.lower_bound},
upper_bound{desc.upper_bound},
dist{make_distribution<rep>(desc.id, desc.lower_bound, desc.upper_bound)}
{
}

// Returns one random fixed-point value built from a clamped raw value and the shared scale.
T operator()(std::mt19937& engine)
{
// The scale is drawn only once, on the first call, and reused by all later calls.
if (not scale.has_value()) {
// digits10 of rep limits the magnitude of the scale in both directions.
int const max_scale = std::numeric_limits<rep>::digits10;
auto scale_dist = make_distribution<int>(distribution_id::NORMAL, -max_scale, max_scale);
// Clamp the drawn scale to [-max_scale, max_scale] before storing it.
scale = numeric::scale_type{std::max(std::min(scale_dist(engine), max_scale), -max_scale)};
}
// Clamp the generated random value to the specified range
return T{std::max(std::min(dist(engine), upper_bound), lower_bound), *scale};
}
};

/**
Expand Down
13 changes: 11 additions & 2 deletions cpp/benchmarks/common/generate_benchmark_input.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,7 @@ class data_profile {
distribution_params<cudf::string_view> string_dist_desc{{distribution_id::NORMAL, 0, 32}};
distribution_params<cudf::list_view> list_dist_desc{
cudf::type_id::INT32, {distribution_id::GEOMETRIC, 0, 100}, 2};
std::map<cudf::type_id, distribution_params<__uint128_t>> decimal_params;

double bool_probability = 0.5;
double null_frequency = 0.01;
Expand Down Expand Up @@ -284,9 +285,17 @@ class data_profile {
}

// Overload selected for fixed-point types: returns the distribution parameters to use
// for T, expressed in T::rep.
// NOTE(review): this view shows the diff's pre-change and post-change lines together
// without +/- markers; the first signature line and the CUDF_FAIL line below appear to
// be the pre-change version (the change summary for this file reports 2 deletions).
template <typename T, typename std::enable_if_t<cudf::is_fixed_point<T>()>* = nullptr>
distribution_params<T> get_distribution_params() const
distribution_params<typename T::rep> get_distribution_params() const
{
CUDF_FAIL("Not implemented");
using rep = typename T::rep;
// Look for parameters stored under this type's id.
auto it = decimal_params.find(cudf::type_to_id<T>());
if (it == decimal_params.end()) {
// Nothing stored for this type: use the default distribution id and default range of rep.
auto const range = default_range<rep>();
return distribution_params<rep>{default_distribution_id<rep>(), range.first, range.second};
} else {
// decimal_params stores its bounds as __uint128_t; convert them to this type's rep.
auto& desc = it->second;
return {desc.id, static_cast<rep>(desc.lower_bound), static_cast<rep>(desc.upper_bound)};
}
}

auto get_bool_probability() const { return bool_probability; }
Expand Down
27 changes: 16 additions & 11 deletions cpp/benchmarks/common/random_distribution_factory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,19 +21,24 @@
#include <memory>
#include <random>

/**
* @brief Creates a binomial distribution (the integral stand-in for a normal
* distribution) that produces values between zero and upper_bound.
*
* The distribution uses a success probability of 0.5 and yields values of the
* unsigned counterpart of T.
*
* NOTE(review): this view shows the diff's pre-change and post-change lines together
* without +/- markers; the (range_start, range_end) signature and the range_size
* lines appear to be the pre-change version.
*/
template <typename T, typename std::enable_if_t<std::is_integral<T>::value, T>* = nullptr>
auto make_normal_dist(T range_start, T range_end)
auto make_normal_dist(T upper_bound)
{
using uT = typename std::make_unsigned<T>::type;
uT const range_size = range_end - range_start;
return std::binomial_distribution<uT>(range_size, 0.5);
using uT = typename std::make_unsigned<T>::type;
// upper_bound is the number of trials, so it is also the largest value produced.
return std::binomial_distribution<uT>(upper_bound, 0.5);
}

/**
* @brief Creates a normal distribution centered between zero and upper_bound.
*
* The mean is upper_bound / 2 and the standard deviation is upper_bound / 6, so the
* interval [0, upper_bound] spans three standard deviations on each side of the mean.
* Samples are not clamped here and can fall outside that interval.
*
* NOTE(review): this view shows the diff's pre-change and post-change lines together
* without +/- markers; the (range_start, range_end) signature and the mean/stddev
* lines computed from range_start and range_end appear to be the pre-change version.
*/
template <typename T, std::enable_if_t<cudf::is_floating_point<T>()>* = nullptr>
auto make_normal_dist(T range_start, T range_end)
auto make_normal_dist(T upper_bound)
{
T const mean = range_start / 2 + range_end / 2;
T const stddev = range_end / 6 - range_start / 6;
T const mean = upper_bound / 2;
T const stddev = upper_bound / 6;
return std::normal_distribution<T>(mean, stddev);
}

Expand Down Expand Up @@ -82,8 +87,8 @@ distribution_fn<T> make_distribution(distribution_id did, T lower_bound, T upper
{
switch (did) {
case distribution_id::NORMAL:
return [lower_bound, dist = make_normal_dist(lower_bound, upper_bound)](
std::mt19937& engine) mutable -> T { return dist(engine) - lower_bound; };
return [lower_bound, dist = make_normal_dist(upper_bound - lower_bound)](
std::mt19937& engine) mutable -> T { return dist(engine) + lower_bound; };
nvdbaranec marked this conversation as resolved.
Show resolved Hide resolved
case distribution_id::UNIFORM:
return [dist = make_uniform_dist(lower_bound, upper_bound)](
std::mt19937& engine) mutable -> T { return dist(engine); };
Expand All @@ -104,8 +109,8 @@ distribution_fn<T> make_distribution(distribution_id dist_id, T lower_bound, T u
{
switch (dist_id) {
case distribution_id::NORMAL:
return [dist = make_normal_dist(lower_bound, upper_bound)](
std::mt19937& engine) mutable -> T { return dist(engine); };
return [lower_bound, dist = make_normal_dist(upper_bound - lower_bound)](
std::mt19937& engine) mutable -> T { return dist(engine) + lower_bound; };
case distribution_id::UNIFORM:
return [dist = make_uniform_dist(lower_bound, upper_bound)](
std::mt19937& engine) mutable -> T { return dist(engine); };
Expand Down
2 changes: 2 additions & 0 deletions cpp/benchmarks/io/csv/csv_reader_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ void BM_csv_read_varying_options(benchmark::State& state)
auto const data_types =
dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL),
int32_t(type_group_id::FLOATING_POINT),
int32_t(type_group_id::FIXED_POINT),
int32_t(type_group_id::TIMESTAMP),
int32_t(cudf::type_id::STRING)}),
col_sel);
Expand Down Expand Up @@ -143,6 +144,7 @@ void BM_csv_read_varying_options(benchmark::State& state)

RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, integral, type_group_id::INTEGRAL);
RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, floats, type_group_id::FLOATING_POINT);
RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, decimal, type_group_id::FIXED_POINT);
RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, timestamps, type_group_id::TIMESTAMP);
RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, string, cudf::type_id::STRING);

Expand Down
2 changes: 2 additions & 0 deletions cpp/benchmarks/io/csv/csv_writer_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ void BM_csv_write_varying_options(benchmark::State& state)

auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL),
int32_t(type_group_id::FLOATING_POINT),
int32_t(type_group_id::FIXED_POINT),
int32_t(type_group_id::TIMESTAMP),
int32_t(cudf::type_id::STRING)});

Expand Down Expand Up @@ -96,6 +97,7 @@ void BM_csv_write_varying_options(benchmark::State& state)

WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, integral, type_group_id::INTEGRAL);
WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, floats, type_group_id::FLOATING_POINT);
WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, decimal, type_group_id::FIXED_POINT);
WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, timestamps, type_group_id::TIMESTAMP);
WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, string, cudf::type_id::STRING);

Expand Down
5 changes: 4 additions & 1 deletion cpp/benchmarks/io/orc/orc_reader_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,10 @@ void BM_orc_read_varying_options(benchmark::State& state)
auto const data_types =
dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED),
int32_t(type_group_id::FLOATING_POINT),
int32_t(type_group_id::FIXED_POINT),
int32_t(type_group_id::TIMESTAMP),
int32_t(cudf::type_id::STRING)}),
int32_t(cudf::type_id::STRING),
int32_t(cudf::type_id::LIST)}),
col_sel);
auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size});
auto const view = tbl->view();
Expand Down Expand Up @@ -158,6 +160,7 @@ void BM_orc_read_varying_options(benchmark::State& state)

RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, integral, type_group_id::INTEGRAL_SIGNED);
RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, floats, type_group_id::FLOATING_POINT);
RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, decimal, type_group_id::FIXED_POINT);
RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, timestamps, type_group_id::TIMESTAMP);
RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, string, cudf::type_id::STRING);
RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, list, cudf::type_id::LIST);
Expand Down
5 changes: 4 additions & 1 deletion cpp/benchmarks/io/orc/orc_writer_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,10 @@ void BM_orc_write_varying_options(benchmark::State& state)

auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED),
int32_t(type_group_id::FLOATING_POINT),
int32_t(type_group_id::FIXED_POINT),
int32_t(type_group_id::TIMESTAMP),
int32_t(cudf::type_id::STRING)});
int32_t(cudf::type_id::STRING),
int32_t(cudf::type_id::LIST)});

auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size});
auto const view = tbl->view();
Expand Down Expand Up @@ -101,6 +103,7 @@ void BM_orc_write_varying_options(benchmark::State& state)

WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, integral, type_group_id::INTEGRAL_SIGNED);
WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, floats, type_group_id::FLOATING_POINT);
WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, decimal, type_group_id::FIXED_POINT);
WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, timestamps, type_group_id::TIMESTAMP);
WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, string, cudf::type_id::STRING);
WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, list, cudf::type_id::LIST);
Expand Down
5 changes: 4 additions & 1 deletion cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,10 @@ void BM_parq_read_varying_options(benchmark::State& state)
auto const data_types =
dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL),
int32_t(type_group_id::FLOATING_POINT),
int32_t(type_group_id::FIXED_POINT),
int32_t(type_group_id::TIMESTAMP),
int32_t(cudf::type_id::STRING)}),
int32_t(cudf::type_id::STRING),
int32_t(cudf::type_id::LIST)}),
col_sel);
auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size});
auto const view = tbl->view();
Expand Down Expand Up @@ -160,6 +162,7 @@ void BM_parq_read_varying_options(benchmark::State& state)

RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, integral, type_group_id::INTEGRAL);
RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, floats, type_group_id::FLOATING_POINT);
RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, decimal, type_group_id::FIXED_POINT);
RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, timestamps, type_group_id::TIMESTAMP);
RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, string, cudf::type_id::STRING);
RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, list, cudf::type_id::LIST);
Expand Down
7 changes: 5 additions & 2 deletions cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,10 @@ void BM_parq_write_varying_options(benchmark::State& state)

auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED),
int32_t(type_group_id::FLOATING_POINT),
int32_t(type_group_id::FIXED_POINT),
int32_t(type_group_id::TIMESTAMP),
int32_t(cudf::type_id::STRING)});
int32_t(cudf::type_id::STRING),
int32_t(cudf::type_id::LIST)});

auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size});
auto const view = tbl->view();
Expand All @@ -85,7 +87,7 @@ void BM_parq_write_varying_options(benchmark::State& state)
cudf_io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
.compression(compression)
.stats_level(enable_stats)
.column_chunks_file_path(file_path);
.column_chunks_file_paths({file_path});
cudf_io::write_parquet(options);
}

Expand All @@ -103,6 +105,7 @@ void BM_parq_write_varying_options(benchmark::State& state)

WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, integral, type_group_id::INTEGRAL);
WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, floats, type_group_id::FLOATING_POINT);
WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, decimal, type_group_id::FIXED_POINT);
WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, timestamps, type_group_id::TIMESTAMP);
WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, string, cudf::type_id::STRING);
WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, list, cudf::type_id::LIST);
Expand Down