diff --git a/cpp/benchmarks/common/generate_benchmark_input.cpp b/cpp/benchmarks/common/generate_benchmark_input.cpp index 0ec2590bdb5..995cea13c27 100644 --- a/cpp/benchmarks/common/generate_benchmark_input.cpp +++ b/cpp/benchmarks/common/generate_benchmark_input.cpp @@ -161,8 +161,29 @@ struct random_value_fn()>> { */ template struct random_value_fn()>> { - random_value_fn(distribution_params const&) {} - T operator()(std::mt19937& engine) { CUDF_FAIL("Not implemented"); } + using rep = typename T::rep; + rep const lower_bound; + rep const upper_bound; + distribution_fn dist; + std::optional scale; + + random_value_fn(distribution_params const& desc) + : lower_bound{desc.lower_bound}, + upper_bound{desc.upper_bound}, + dist{make_distribution(desc.id, desc.lower_bound, desc.upper_bound)} + { + } + + T operator()(std::mt19937& engine) + { + if (not scale.has_value()) { + int const max_scale = std::numeric_limits::digits10; + auto scale_dist = make_distribution(distribution_id::NORMAL, -max_scale, max_scale); + scale = numeric::scale_type{std::max(std::min(scale_dist(engine), max_scale), -max_scale)}; + } + // Clamp the generated random value to the specified range + return T{std::max(std::min(dist(engine), upper_bound), lower_bound), *scale}; + } }; /** diff --git a/cpp/benchmarks/common/generate_benchmark_input.hpp b/cpp/benchmarks/common/generate_benchmark_input.hpp index 6ea57c0a7ad..3dbc6561839 100644 --- a/cpp/benchmarks/common/generate_benchmark_input.hpp +++ b/cpp/benchmarks/common/generate_benchmark_input.hpp @@ -216,6 +216,7 @@ class data_profile { distribution_params string_dist_desc{{distribution_id::NORMAL, 0, 32}}; distribution_params list_dist_desc{ cudf::type_id::INT32, {distribution_id::GEOMETRIC, 0, 100}, 2}; + std::map> decimal_params; double bool_probability = 0.5; double null_frequency = 0.01; @@ -284,9 +285,17 @@ class data_profile { } template ()>* = nullptr> - distribution_params get_distribution_params() const + distribution_params get_distribution_params() const { - CUDF_FAIL("Not implemented"); + using rep = typename T::rep; + auto it = decimal_params.find(cudf::type_to_id()); + if (it == decimal_params.end()) { + auto const range = default_range(); + return distribution_params{default_distribution_id(), range.first, range.second}; + } else { + auto& desc = it->second; + return {desc.id, static_cast(desc.lower_bound), static_cast(desc.upper_bound)}; + } } auto get_bool_probability() const { return bool_probability; } diff --git a/cpp/benchmarks/common/random_distribution_factory.hpp b/cpp/benchmarks/common/random_distribution_factory.hpp index c21fb645573..65dc8b4dd4d 100644 --- a/cpp/benchmarks/common/random_distribution_factory.hpp +++ b/cpp/benchmarks/common/random_distribution_factory.hpp @@ -21,19 +21,24 @@ #include #include +/** + * @brief Generates a normal(binomial) distribution between zero and upper_bound. + */ template ::value, T>* = nullptr> -auto make_normal_dist(T range_start, T range_end) +auto make_normal_dist(T upper_bound) { - using uT = typename std::make_unsigned::type; - uT const range_size = range_end - range_start; - return std::binomial_distribution(range_size, 0.5); + using uT = typename std::make_unsigned::type; + return std::binomial_distribution(upper_bound, 0.5); } +/** + * @brief Generates a normal distribution between zero and upper_bound. + */ template ()>* = nullptr> -auto make_normal_dist(T range_start, T range_end) +auto make_normal_dist(T upper_bound) { - T const mean = range_start / 2 + range_end / 2; - T const stddev = range_end / 6 - range_start / 6; + T const mean = upper_bound / 2; + T const stddev = upper_bound / 6; return std::normal_distribution(mean, stddev); } @@ -82,8 +87,8 @@ distribution_fn make_distribution(distribution_id did, T lower_bound, T upper { switch (did) { case distribution_id::NORMAL: - return [lower_bound, dist = make_normal_dist(lower_bound, upper_bound)]( - std::mt19937& engine) mutable -> T { return dist(engine) - lower_bound; }; + return [lower_bound, dist = make_normal_dist(upper_bound - lower_bound)]( + std::mt19937& engine) mutable -> T { return dist(engine) + lower_bound; }; case distribution_id::UNIFORM: return [dist = make_uniform_dist(lower_bound, upper_bound)]( std::mt19937& engine) mutable -> T { return dist(engine); }; @@ -104,8 +109,8 @@ distribution_fn make_distribution(distribution_id dist_id, T lower_bound, T u { switch (dist_id) { case distribution_id::NORMAL: - return [dist = make_normal_dist(lower_bound, upper_bound)]( - std::mt19937& engine) mutable -> T { return dist(engine); }; + return [lower_bound, dist = make_normal_dist(upper_bound - lower_bound)]( + std::mt19937& engine) mutable -> T { return dist(engine) + lower_bound; }; case distribution_id::UNIFORM: return [dist = make_uniform_dist(lower_bound, upper_bound)]( std::mt19937& engine) mutable -> T { return dist(engine); }; diff --git a/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp b/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp index 3f5549a3148..77bf4b03a14 100644 --- a/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp +++ b/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp @@ -70,6 +70,7 @@ void BM_csv_read_varying_options(benchmark::State& state) auto const data_types = dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL), int32_t(type_group_id::FLOATING_POINT), + int32_t(type_group_id::FIXED_POINT), int32_t(type_group_id::TIMESTAMP), int32_t(cudf::type_id::STRING)}), col_sel); @@ -143,6 +144,7 @@ void BM_csv_read_varying_options(benchmark::State& state) RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, integral, type_group_id::INTEGRAL); RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, floats, type_group_id::FLOATING_POINT); +RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, decimal, type_group_id::FIXED_POINT); RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, timestamps, type_group_id::TIMESTAMP); RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, string, cudf::type_id::STRING); diff --git a/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp b/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp index fdd7c63eece..9baab6b2571 100644 --- a/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp +++ b/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp @@ -63,6 +63,7 @@ void BM_csv_write_varying_options(benchmark::State& state) auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL), int32_t(type_group_id::FLOATING_POINT), + int32_t(type_group_id::FIXED_POINT), int32_t(type_group_id::TIMESTAMP), int32_t(cudf::type_id::STRING)}); @@ -96,6 +97,7 @@ void BM_csv_write_varying_options(benchmark::State& state) WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, integral, type_group_id::INTEGRAL); WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, floats, type_group_id::FLOATING_POINT); +WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, decimal, type_group_id::FIXED_POINT); WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, timestamps, type_group_id::TIMESTAMP); WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, string, cudf::type_id::STRING); diff --git a/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp b/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp index f0624e40149..6ab8d8d09c0 100644 --- a/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp @@ -91,8 +91,10 @@ void BM_orc_read_varying_options(benchmark::State& state) auto const data_types = dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED), int32_t(type_group_id::FLOATING_POINT), + int32_t(type_group_id::FIXED_POINT), int32_t(type_group_id::TIMESTAMP), - int32_t(cudf::type_id::STRING)}), + int32_t(cudf::type_id::STRING), + int32_t(cudf::type_id::LIST)}), col_sel); auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); auto const view = tbl->view(); @@ -158,6 +160,7 @@ void BM_orc_read_varying_options(benchmark::State& state) RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, integral, type_group_id::INTEGRAL_SIGNED); RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, floats, type_group_id::FLOATING_POINT); +RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, decimal, type_group_id::FIXED_POINT); RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, timestamps, type_group_id::TIMESTAMP); RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, string, cudf::type_id::STRING); RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, list, cudf::type_id::LIST); diff --git a/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp b/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp index bfa7d4fc6d9..933b3d02e08 100644 --- a/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp +++ b/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp @@ -70,8 +70,10 @@ void BM_orc_write_varying_options(benchmark::State& state) auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED), int32_t(type_group_id::FLOATING_POINT), + int32_t(type_group_id::FIXED_POINT), int32_t(type_group_id::TIMESTAMP), - int32_t(cudf::type_id::STRING)}); + int32_t(cudf::type_id::STRING), + int32_t(cudf::type_id::LIST)}); auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); auto const view = tbl->view(); @@ -101,6 +103,7 @@ void BM_orc_write_varying_options(benchmark::State& state) WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, integral, type_group_id::INTEGRAL_SIGNED); WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, floats, type_group_id::FLOATING_POINT); +WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, decimal, type_group_id::FIXED_POINT); WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, timestamps, type_group_id::TIMESTAMP); WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, string, cudf::type_id::STRING); WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, list, cudf::type_id::LIST); diff --git a/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp index 045aa0e043b..a68ce2bd1a1 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp @@ -92,8 +92,10 @@ void BM_parq_read_varying_options(benchmark::State& state) auto const data_types = dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL), int32_t(type_group_id::FLOATING_POINT), + int32_t(type_group_id::FIXED_POINT), int32_t(type_group_id::TIMESTAMP), - int32_t(cudf::type_id::STRING)}), + int32_t(cudf::type_id::STRING), + int32_t(cudf::type_id::LIST)}), col_sel); auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); auto const view = tbl->view(); @@ -160,6 +162,7 @@ void BM_parq_read_varying_options(benchmark::State& state) RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, integral, type_group_id::INTEGRAL); RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, floats, type_group_id::FLOATING_POINT); +RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, decimal, type_group_id::FIXED_POINT); RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, timestamps, type_group_id::TIMESTAMP); RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, string, cudf::type_id::STRING); RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, list, cudf::type_id::LIST); diff --git a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp index b4c11179c35..1af7e206692 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp @@ -71,8 +71,10 @@ void BM_parq_write_varying_options(benchmark::State& state) auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED), int32_t(type_group_id::FLOATING_POINT), + int32_t(type_group_id::FIXED_POINT), int32_t(type_group_id::TIMESTAMP), - int32_t(cudf::type_id::STRING)}); + int32_t(cudf::type_id::STRING), + int32_t(cudf::type_id::LIST)}); auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); auto const view = tbl->view(); @@ -85,7 +87,7 @@ void BM_parq_write_varying_options(benchmark::State& state) cudf_io::parquet_writer_options::builder(source_sink.make_sink_info(), view) .compression(compression) .stats_level(enable_stats) - .column_chunks_file_path(file_path); + .column_chunks_file_paths({file_path}); cudf_io::write_parquet(options); } @@ -103,6 +105,7 @@ void BM_parq_write_varying_options(benchmark::State& state) WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, integral, type_group_id::INTEGRAL); WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, floats, type_group_id::FLOATING_POINT); +WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, decimal, type_group_id::FIXED_POINT); WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, timestamps, type_group_id::TIMESTAMP); WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, string, cudf::type_id::STRING); WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, list, cudf::type_id::LIST);