From daa70f8160003ede0338ceab6d666ec224ffc918 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 24 Nov 2021 11:04:24 -0800 Subject: [PATCH 01/21] decimal rand gen --- .../common/generate_benchmark_input.cpp | 19 +++++++++++++++++-- .../common/generate_benchmark_input.hpp | 15 +++++++++++++-- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/cpp/benchmarks/common/generate_benchmark_input.cpp b/cpp/benchmarks/common/generate_benchmark_input.cpp index 0ec2590bdb5..e66fff18d6b 100644 --- a/cpp/benchmarks/common/generate_benchmark_input.cpp +++ b/cpp/benchmarks/common/generate_benchmark_input.cpp @@ -161,8 +161,23 @@ struct random_value_fn()>> { */ template struct random_value_fn()>> { - random_value_fn(distribution_params const&) {} - T operator()(std::mt19937& engine) { CUDF_FAIL("Not implemented"); } + typename T::rep const lower_bound; + typename T::rep const upper_bound; + distribution_fn dist; + + random_value_fn(distribution_params const& desc) + : lower_bound{desc.lower_bound}, + upper_bound{desc.upper_bound}, + dist{make_distribution(desc.id, desc.lower_bound, desc.upper_bound)} + { + // TODO generate random scale, since it's one random_value_fn object per column + } + + T operator()(std::mt19937& engine) + { + // Clamp the generated random value to the specified range + return T{std::max(std::min(dist(engine), upper_bound), lower_bound), numeric::scale_type{0}}; + } }; /** diff --git a/cpp/benchmarks/common/generate_benchmark_input.hpp b/cpp/benchmarks/common/generate_benchmark_input.hpp index 6ea57c0a7ad..3bd371438d7 100644 --- a/cpp/benchmarks/common/generate_benchmark_input.hpp +++ b/cpp/benchmarks/common/generate_benchmark_input.hpp @@ -216,6 +216,7 @@ class data_profile { distribution_params string_dist_desc{{distribution_id::NORMAL, 0, 32}}; distribution_params list_dist_desc{ cudf::type_id::INT32, {distribution_id::GEOMETRIC, 0, 100}, 2}; + std::map> decimal_params; double bool_probability = 0.5; double null_frequency = 0.01; @@ -284,9 
+285,19 @@ class data_profile { } template ()>* = nullptr> - distribution_params get_distribution_params() const + distribution_params get_distribution_params() const { - CUDF_FAIL("Not implemented"); + auto it = decimal_params.find(cudf::type_to_id()); + if (it == decimal_params.end()) { + auto const range = default_range(); + return distribution_params{ + default_distribution_id(), range.first, range.second}; + } else { + auto& desc = it->second; + return {desc.id, + static_cast(desc.lower_bound), + static_cast(desc.upper_bound)}; + } } auto get_bool_probability() const { return bool_probability; } From f1bfee12bdf0cd8c019149b0115fd12c5ed7d6fb Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 24 Nov 2021 11:05:26 -0800 Subject: [PATCH 02/21] expand benchmarks --- cpp/benchmarks/io/csv/csv_reader_benchmark.cpp | 2 ++ cpp/benchmarks/io/csv/csv_writer_benchmark.cpp | 2 ++ cpp/benchmarks/io/orc/orc_reader_benchmark.cpp | 5 ++++- cpp/benchmarks/io/orc/orc_writer_benchmark.cpp | 5 ++++- cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp | 5 ++++- cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp | 5 ++++- 6 files changed, 20 insertions(+), 4 deletions(-) diff --git a/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp b/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp index 3f5549a3148..77bf4b03a14 100644 --- a/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp +++ b/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp @@ -70,6 +70,7 @@ void BM_csv_read_varying_options(benchmark::State& state) auto const data_types = dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL), int32_t(type_group_id::FLOATING_POINT), + int32_t(type_group_id::FIXED_POINT), int32_t(type_group_id::TIMESTAMP), int32_t(cudf::type_id::STRING)}), col_sel); @@ -143,6 +144,7 @@ void BM_csv_read_varying_options(benchmark::State& state) RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, integral, type_group_id::INTEGRAL); RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, 
floats, type_group_id::FLOATING_POINT); +RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, decimal, type_group_id::FIXED_POINT); RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, timestamps, type_group_id::TIMESTAMP); RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, string, cudf::type_id::STRING); diff --git a/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp b/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp index fdd7c63eece..9baab6b2571 100644 --- a/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp +++ b/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp @@ -63,6 +63,7 @@ void BM_csv_write_varying_options(benchmark::State& state) auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL), int32_t(type_group_id::FLOATING_POINT), + int32_t(type_group_id::FIXED_POINT), int32_t(type_group_id::TIMESTAMP), int32_t(cudf::type_id::STRING)}); @@ -96,6 +97,7 @@ void BM_csv_write_varying_options(benchmark::State& state) WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, integral, type_group_id::INTEGRAL); WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, floats, type_group_id::FLOATING_POINT); +WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, decimal, type_group_id::FIXED_POINT); WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, timestamps, type_group_id::TIMESTAMP); WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, string, cudf::type_id::STRING); diff --git a/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp b/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp index f0624e40149..6ab8d8d09c0 100644 --- a/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp @@ -91,8 +91,10 @@ void BM_orc_read_varying_options(benchmark::State& state) auto const data_types = dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED), int32_t(type_group_id::FLOATING_POINT), + int32_t(type_group_id::FIXED_POINT), int32_t(type_group_id::TIMESTAMP), - 
int32_t(cudf::type_id::STRING)}), + int32_t(cudf::type_id::STRING), + int32_t(cudf::type_id::LIST)}), col_sel); auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); auto const view = tbl->view(); @@ -158,6 +160,7 @@ void BM_orc_read_varying_options(benchmark::State& state) RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, integral, type_group_id::INTEGRAL_SIGNED); RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, floats, type_group_id::FLOATING_POINT); +RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, decimal, type_group_id::FIXED_POINT); RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, timestamps, type_group_id::TIMESTAMP); RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, string, cudf::type_id::STRING); RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, list, cudf::type_id::LIST); diff --git a/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp b/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp index bfa7d4fc6d9..933b3d02e08 100644 --- a/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp +++ b/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp @@ -70,8 +70,10 @@ void BM_orc_write_varying_options(benchmark::State& state) auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED), int32_t(type_group_id::FLOATING_POINT), + int32_t(type_group_id::FIXED_POINT), int32_t(type_group_id::TIMESTAMP), - int32_t(cudf::type_id::STRING)}); + int32_t(cudf::type_id::STRING), + int32_t(cudf::type_id::LIST)}); auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); auto const view = tbl->view(); @@ -101,6 +103,7 @@ void BM_orc_write_varying_options(benchmark::State& state) WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, integral, type_group_id::INTEGRAL_SIGNED); WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, floats, type_group_id::FLOATING_POINT); +WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, decimal, 
type_group_id::FIXED_POINT); WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, timestamps, type_group_id::TIMESTAMP); WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, string, cudf::type_id::STRING); WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, list, cudf::type_id::LIST); diff --git a/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp index 045aa0e043b..a68ce2bd1a1 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp @@ -92,8 +92,10 @@ void BM_parq_read_varying_options(benchmark::State& state) auto const data_types = dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL), int32_t(type_group_id::FLOATING_POINT), + int32_t(type_group_id::FIXED_POINT), int32_t(type_group_id::TIMESTAMP), - int32_t(cudf::type_id::STRING)}), + int32_t(cudf::type_id::STRING), + int32_t(cudf::type_id::LIST)}), col_sel); auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); auto const view = tbl->view(); @@ -160,6 +162,7 @@ void BM_parq_read_varying_options(benchmark::State& state) RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, integral, type_group_id::INTEGRAL); RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, floats, type_group_id::FLOATING_POINT); +RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, decimal, type_group_id::FIXED_POINT); RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, timestamps, type_group_id::TIMESTAMP); RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, string, cudf::type_id::STRING); RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, list, cudf::type_id::LIST); diff --git a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp index b4c11179c35..71a47578cca 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp +++ 
b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp @@ -71,8 +71,10 @@ void BM_parq_write_varying_options(benchmark::State& state) auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED), int32_t(type_group_id::FLOATING_POINT), + int32_t(type_group_id::FIXED_POINT), int32_t(type_group_id::TIMESTAMP), - int32_t(cudf::type_id::STRING)}); + int32_t(cudf::type_id::STRING), + int32_t(cudf::type_id::LIST)}); auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); auto const view = tbl->view(); @@ -103,6 +105,7 @@ void BM_parq_write_varying_options(benchmark::State& state) WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, integral, type_group_id::INTEGRAL); WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, floats, type_group_id::FLOATING_POINT); +WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, decimal, type_group_id::FIXED_POINT); WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, timestamps, type_group_id::TIMESTAMP); WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, string, cudf::type_id::STRING); WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, list, cudf::type_id::LIST); From d1b09bccfdd7554b7474fa6e0288b75d3d92427e Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 24 Nov 2021 11:08:05 -0800 Subject: [PATCH 03/21] read as dec128 when precision > 18 instead of throwing --- cpp/src/io/orc/reader_impl.cu | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 798cdca178a..5cb3d5cc3f5 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -230,12 +230,12 @@ size_t gather_stream_info(const size_t stripe_index, /** * @brief Determines cuDF type of an ORC Decimal column. 
*/ -auto decimal_column_type(const std::vector& float64_columns, - const std::vector& decimal128_columns, - cudf::io::orc::metadata& metadata, +auto decimal_column_type(std::vector const& float64_columns, + std::vector const& decimal128_columns, + cudf::io::orc::detail::aggregate_orc_metadata const& metadata, int column_index) { - auto const& column_path = metadata.column_path(column_index); + auto const& column_path = metadata.column_path(0, column_index); auto is_column_in = [&](const std::vector& cols) { return std::find(cols.cbegin(), cols.cend(), column_path) != cols.end(); }; @@ -247,7 +247,8 @@ auto decimal_column_type(const std::vector& float64_columns, if (user_selected_float64) return type_id::FLOAT64; if (user_selected_decimal128) return type_id::DECIMAL128; - return type_id::DECIMAL64; + return metadata.get_col_type(column_index).precision <= 18 ? type_id::DECIMAL64 + : type_id::DECIMAL128; } } // namespace @@ -743,8 +744,7 @@ std::unique_ptr reader::impl::create_empty_column(const size_type orc_co _metadata.get_schema(orc_col_id), _use_np_dtypes, _timestamp_type.id(), - decimal_column_type( - _decimal_cols_as_float, decimal128_columns, _metadata.per_file_metadata[0], orc_col_id)); + decimal_column_type(_decimal_cols_as_float, decimal128_columns, _metadata, orc_col_id)); int32_t scale = 0; std::vector> child_columns; std::unique_ptr out_col = nullptr; @@ -952,13 +952,8 @@ table_with_metadata reader::impl::read(size_type skip_rows, _metadata.get_col_type(col.id), _use_np_dtypes, _timestamp_type.id(), - decimal_column_type( - _decimal_cols_as_float, decimal128_columns, _metadata.per_file_metadata[0], col.id)); + decimal_column_type(_decimal_cols_as_float, decimal128_columns, _metadata, col.id)); CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type"); - CUDF_EXPECTS( - (col_type != type_id::DECIMAL64) or (_metadata.get_col_type(col.id).precision <= 18), - "Precision of column " + std::string{_metadata.column_name(0, col.id)} + - " is over 18, use 128-bit 
Decimal."); if (col_type == type_id::DECIMAL64 or col_type == type_id::DECIMAL128) { // sign of the scale is changed since cuDF follows c++ libraries like CNL // which uses negative scaling, but liborc and other libraries From 3065bacc60fb005c7001e507b16d72c9ca11df67 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 24 Nov 2021 14:53:20 -0800 Subject: [PATCH 04/21] update tests to match the new behavior --- cpp/tests/io/orc_test.cpp | 11 +++++------ python/cudf/cudf/tests/test_orc.py | 2 +- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index da44c91eec3..ee37310aff6 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -387,9 +387,7 @@ TEST_F(OrcWriterTest, MultiColumn) cudf_io::write_orc(out_opts); cudf_io::orc_reader_options in_opts = - cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}) - .use_index(false) - .decimal128_columns({"decimal_pos_scale", "decimal_neg_scale"}); + cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false); auto result = cudf_io::read_orc(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); @@ -1438,7 +1436,7 @@ TEST_F(OrcReaderTest, DecimalOptions) cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}) .decimal128_columns({"dec", "fake_name"}) .decimal_cols_as_float({"decc", "fake_name"}); - // Should not throw + // Should not throw, even with "fake name" in both options EXPECT_NO_THROW(cudf_io::read_orc(valid_opts)); cudf_io::orc_reader_options invalid_opts = @@ -1493,10 +1491,11 @@ TEST_F(OrcWriterTest, DecimalOptionsNested) cudf_io::orc_reader_options in_opts = cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}) .use_index(false) - .decimal128_columns({"lists.1.dec128"}); + .decimal128_columns({"lists.1.dec64"}); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + 
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result.tbl->view().column(0).child(1).child(0), + result.tbl->view().column(0).child(1).child(1)); } CUDF_TEST_PROGRAM_MAIN() diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 6b02874146e..1bc243e28c0 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -531,7 +531,7 @@ def test_orc_decimal_precision_fail(datadir): # Max precision supported is 18 (Decimal64Dtype limit) # and the data has the precision 19. This test should be removed # once Decimal128Dtype is introduced. - with pytest.raises(RuntimeError): + with pytest.raises(KeyError): cudf.read_orc(file_path) # Shouldn't cause failure if decimal column is not chosen to be read. From 2ce84122809728ce9faa99de49291c63c0db262c Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 24 Nov 2021 16:00:32 -0800 Subject: [PATCH 05/21] fix error in make_normal_dist --- cpp/benchmarks/common/random_distribution_factory.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/benchmarks/common/random_distribution_factory.hpp b/cpp/benchmarks/common/random_distribution_factory.hpp index c21fb645573..dd85495e65f 100644 --- a/cpp/benchmarks/common/random_distribution_factory.hpp +++ b/cpp/benchmarks/common/random_distribution_factory.hpp @@ -83,7 +83,7 @@ distribution_fn make_distribution(distribution_id did, T lower_bound, T upper switch (did) { case distribution_id::NORMAL: return [lower_bound, dist = make_normal_dist(lower_bound, upper_bound)]( - std::mt19937& engine) mutable -> T { return dist(engine) - lower_bound; }; + std::mt19937& engine) mutable -> T { return dist(engine) + lower_bound; }; case distribution_id::UNIFORM: return [dist = make_uniform_dist(lower_bound, upper_bound)]( std::mt19937& engine) mutable -> T { return dist(engine); }; From 6624b44db38ed5070cee9e26c0f987d7cf4efb04 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 24 Nov 2021 16:00:45 -0800 Subject: [PATCH 06/21] generate 
random scale --- cpp/benchmarks/common/generate_benchmark_input.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/common/generate_benchmark_input.cpp b/cpp/benchmarks/common/generate_benchmark_input.cpp index e66fff18d6b..9115f2f4a89 100644 --- a/cpp/benchmarks/common/generate_benchmark_input.cpp +++ b/cpp/benchmarks/common/generate_benchmark_input.cpp @@ -164,19 +164,24 @@ struct random_value_fn()>> typename T::rep const lower_bound; typename T::rep const upper_bound; distribution_fn dist; + std::optional scale; random_value_fn(distribution_params const& desc) : lower_bound{desc.lower_bound}, upper_bound{desc.upper_bound}, dist{make_distribution(desc.id, desc.lower_bound, desc.upper_bound)} { - // TODO generate random scale, since it's one random_value_fn object per column } T operator()(std::mt19937& engine) { + if (not scale.has_value()) { + int const max_scale = 9 * sizeof(typename T::rep) / 4; + auto scale_dist = make_distribution(distribution_id::NORMAL, -max_scale, max_scale); + scale = numeric::scale_type{std::max(std::min(scale_dist(engine), max_scale), -max_scale)}; + } // Clamp the generated random value to the specified range - return T{std::max(std::min(dist(engine), upper_bound), lower_bound), numeric::scale_type{0}}; + return T{std::max(std::min(dist(engine), upper_bound), lower_bound), *scale}; } }; From 761ed544e6dc2a532a6eaa96e42e4bfed955e47a Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 3 Dec 2021 16:23:05 -0800 Subject: [PATCH 07/21] decode as decimal32 --- cpp/src/io/orc/stripe_data.cu | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 44f106c4f5c..8f8bb87d9e4 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1066,12 +1066,12 @@ static __device__ int Decode_Decimals(orc_bytestream_s* bs, return (v / kPow5i[scale]) >> scale; } }(); - if (dtype_id == type_id::DECIMAL64) { 
+ if (dtype_id == type_id::DECIMAL32) { + vals.i32[t] = scaled_value; + } else if (dtype_id == type_id::DECIMAL64) { vals.i64[t] = scaled_value; } else { - { - vals.i128[t] = scaled_value; - } + vals.i128[t] = scaled_value; } } } @@ -1708,8 +1708,10 @@ __global__ void __launch_bounds__(block_size) case DOUBLE: case LONG: static_cast(data_out)[row] = s->vals.u64[t + vals_skipped]; break; case DECIMAL: - if (s->chunk.dtype_id == type_id::FLOAT64 or - s->chunk.dtype_id == type_id::DECIMAL64) { + if (s->chunk.dtype_id == type_id::DECIMAL32) { + static_cast(data_out)[row] = s->vals.u32[t + vals_skipped]; + } else if (s->chunk.dtype_id == type_id::FLOAT64 or + s->chunk.dtype_id == type_id::DECIMAL64) { static_cast(data_out)[row] = s->vals.u64[t + vals_skipped]; } else { // decimal128 From 6e11f75db9b939f61b0a230eecaf7db2b516882f Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 3 Dec 2021 16:45:23 -0800 Subject: [PATCH 08/21] read as dec32 when possible; update tests; disable dec128 from python --- cpp/include/cudf/io/orc.hpp | 24 ++++++++++++++++++++++++ cpp/src/io/orc/reader_impl.cu | 23 ++++++++++++++++++----- cpp/src/io/orc/reader_impl.hpp | 1 + cpp/tests/io/orc_test.cpp | 11 +++-------- python/cudf/cudf/_lib/cpp/io/orc.pxd | 6 ++---- python/cudf/cudf/_lib/orc.pyx | 1 + python/cudf/cudf/tests/test_orc.py | 2 +- 7 files changed, 50 insertions(+), 18 deletions(-) diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 3bc2e6c9ef2..d748f5bec09 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -72,6 +72,7 @@ class orc_reader_options { // Columns that should be read as Decimal128 std::vector _decimal128_columns; + bool _enable_decimal128 = true; friend orc_reader_options_builder; @@ -151,6 +152,11 @@ class orc_reader_options { */ std::vector const& get_decimal128_columns() const { return _decimal128_columns; } + /** + * @brief Whether the use of the decimal128 type is enabled.
+ */ + bool is_enabled_decimal128() const { return _enable_decimal128; } + // Setters /** @@ -225,6 +231,13 @@ class orc_reader_options { _decimal_cols_as_float = std::move(val); } + /** + * @brief Enable/Disable the use of decimal128 type + * + * @param enable Boolean value to enable/disable. + */ + void enable_decimal128(bool enable) { _enable_decimal128 = enable; } + /** * @brief Set columns that should be read as 128-bit Decimal * @@ -362,6 +375,17 @@ class orc_reader_options_builder { return *this; } + /** + * @brief Enable/Disable use of decimal128 type + * + * @param enable Boolean value to enable/disable. + */ + orc_reader_options_builder& decimal128(bool enable) + { + options.enable_decimal128(enable); + return *this; + } + /** * @brief move orc_reader_options member once it's built. */ diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 5cb3d5cc3f5..c500bb2319d 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -232,23 +232,33 @@ size_t gather_stream_info(const size_t stripe_index, */ auto decimal_column_type(std::vector const& float64_columns, std::vector const& decimal128_columns, + bool is_decimal128_enabled, cudf::io::orc::detail::aggregate_orc_metadata const& metadata, int column_index) { + if (metadata.get_col_type(column_index).kind != DECIMAL) return type_id::EMPTY; + auto const& column_path = metadata.column_path(0, column_index); auto is_column_in = [&](const std::vector& cols) { return std::find(cols.cbegin(), cols.cend(), column_path) != cols.end(); }; auto const user_selected_float64 = is_column_in(float64_columns); - auto const user_selected_decimal128 = is_column_in(decimal128_columns); + auto const user_selected_decimal128 = is_decimal128_enabled and is_column_in(decimal128_columns); CUDF_EXPECTS(not user_selected_float64 or not user_selected_decimal128, "Both decimal128 and float64 types selected for column " + column_path); if (user_selected_float64) return type_id::FLOAT64; if
(user_selected_decimal128) return type_id::DECIMAL128; - return metadata.get_col_type(column_index).precision <= 18 ? type_id::DECIMAL64 - : type_id::DECIMAL128; + + auto const precision = metadata.get_col_type(column_index) + .precision.value_or(cuda::std::numeric_limits::digits10); + if (precision <= cuda::std::numeric_limits::digits10) return type_id::DECIMAL32; + if (precision <= cuda::std::numeric_limits::digits10) return type_id::DECIMAL64; + CUDF_EXPECTS(is_decimal128_enabled, + "Decimal precision too high for decimal64, use `decimal_cols_as_float` or enable " + "decimal128 use"); + return type_id::DECIMAL128; } } // namespace @@ -744,7 +754,8 @@ std::unique_ptr reader::impl::create_empty_column(const size_type orc_co _metadata.get_schema(orc_col_id), _use_np_dtypes, _timestamp_type.id(), - decimal_column_type(_decimal_cols_as_float, decimal128_columns, _metadata, orc_col_id)); + decimal_column_type( + _decimal_cols_as_float, decimal128_columns, is_decimal128_enabled, _metadata, orc_col_id)); int32_t scale = 0; std::vector> child_columns; std::unique_ptr out_col = nullptr; @@ -889,6 +900,7 @@ reader::impl::impl(std::vector>&& sources, // Control decimals conversion _decimal_cols_as_float = options.get_decimal_cols_as_float(); decimal128_columns = options.get_decimal128_columns(); + is_decimal128_enabled = options.is_enabled_decimal128(); } timezone_table reader::impl::compute_timezone_table( @@ -952,7 +964,8 @@ table_with_metadata reader::impl::read(size_type skip_rows, _metadata.get_col_type(col.id), _use_np_dtypes, _timestamp_type.id(), - decimal_column_type(_decimal_cols_as_float, decimal128_columns, _metadata, col.id)); + decimal_column_type( + _decimal_cols_as_float, decimal128_columns, is_decimal128_enabled, _metadata, col.id)); CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type"); if (col_type == type_id::DECIMAL64 or col_type == type_id::DECIMAL128) { // sign of the scale is changed since cuDF follows c++ libraries like CNL diff --git 
a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 64e7cbc74e5..d9c786d480a 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -223,6 +223,7 @@ class reader::impl { bool _use_np_dtypes = true; std::vector _decimal_cols_as_float; std::vector decimal128_columns; + bool is_decimal128_enabled; data_type _timestamp_type{type_id::EMPTY}; reader_column_meta _col_meta; }; diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index ee37310aff6..cfcc266434c 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1176,9 +1176,9 @@ TEST_F(OrcWriterTest, Decimal32) auto data = cudf::detail::make_counting_transform_iterator(0, [&vals](auto i) { return numeric::decimal32{vals[i], numeric::scale_type{2}}; }); - auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 13 == 0; }); + auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 13; }); column_wrapper col{data, data + num_rows, mask}; - cudf::table_view expected({static_cast(col)}); + cudf::table_view expected({col}); auto filepath = temp_env->get_temp_filepath("Decimal32.orc"); cudf_io::orc_writer_options out_opts = @@ -1190,12 +1190,7 @@ TEST_F(OrcWriterTest, Decimal32) cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}); auto result = cudf_io::read_orc(in_opts); - auto data64 = cudf::detail::make_counting_transform_iterator(0, [&vals](auto i) { - return numeric::decimal64{vals[i], numeric::scale_type{2}}; - }); - column_wrapper col64{data64, data64 + num_rows, mask}; - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(col64, result.tbl->view().column(0)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(col, result.tbl->view().column(0)); } TEST_F(OrcStatisticsTest, Overflow) diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd index 4b5ec913fb6..2fc71f64df1 100644 --- a/python/cudf/cudf/_lib/cpp/io/orc.pxd +++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd @@ -36,7 
+36,7 @@ cdef extern from "cudf/io/orc.hpp" \ void enable_use_np_dtypes(bool val) except+ void set_timestamp_type(data_type type) except+ void set_decimal_cols_as_float(vector[string] val) except+ - void set_decimal128_columns(vector[string] val) except+ + void enable_decimal128(bool val) except+ @staticmethod orc_reader_options_builder builder( @@ -58,9 +58,7 @@ cdef extern from "cudf/io/orc.hpp" \ orc_reader_options_builder& decimal_cols_as_float( vector[string] val ) except+ - orc_reader_options_builder& decimal128_columns( - vector[string] val - ) except+ + orc_reader_options_builder& decimal128(bool val) except+ orc_reader_options build() except+ diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index 1281aa172b4..9a4bd8652da 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -248,6 +248,7 @@ cdef orc_reader_options make_orc_reader_options( .timestamp_type(data_type(timestamp_type)) .use_index(use_index) .decimal_cols_as_float(c_decimal_cols_as_float) + .decimal128(False) .build() ) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 37445eefc7b..dc176992434 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -531,7 +531,7 @@ def test_orc_decimal_precision_fail(datadir): # Max precision supported is 18 (Decimal64Dtype limit) # and the data has the precision 19. This test should be removed # once Decimal128Dtype is introduced. - with pytest.raises(KeyError): + with pytest.raises(RuntimeError): cudf.read_orc(file_path) # Shouldn't cause failure if decimal column is not chosen to be read. 
From 7df6e7628eb8a879a376ab88250f8b467b6ef5ec Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 3 Dec 2021 19:43:32 -0800 Subject: [PATCH 09/21] set scale for decimal32 --- cpp/src/io/orc/reader_impl.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index c500bb2319d..21c52f9295b 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -806,7 +806,7 @@ std::unique_ptr reader::impl::create_empty_column(const size_type orc_co break; case orc::DECIMAL: - if (type == type_id::DECIMAL64 or type == type_id::DECIMAL128) { + if (type == type_id::DECIMAL32 or type == type_id::DECIMAL64 or type == type_id::DECIMAL128) { scale = -static_cast(_metadata.get_types()[orc_col_id].scale.value_or(0)); } out_col = make_empty_column(data_type(type, scale)); @@ -967,7 +967,8 @@ table_with_metadata reader::impl::read(size_type skip_rows, decimal_column_type( _decimal_cols_as_float, decimal128_columns, is_decimal128_enabled, _metadata, col.id)); CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type"); - if (col_type == type_id::DECIMAL64 or col_type == type_id::DECIMAL128) { + if (col_type == type_id::DECIMAL32 or col_type == type_id::DECIMAL64 or + col_type == type_id::DECIMAL128) { // sign of the scale is changed since cuDF follows c++ libraries like CNL // which uses negative scaling, but liborc and other libraries // follow positive scaling. 
From fe03e0810ee66cd1e1416f2a77e9a3449e8c0a4d Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 3 Dec 2021 22:02:33 -0800 Subject: [PATCH 10/21] adjust Java test --- java/src/test/java/ai/rapids/cudf/TableTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index a5779bf9dbb..d49ce5baa10 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -6883,7 +6883,7 @@ void testParquetWriteToBufferChunkedInt96() { ParquetWriterOptions options = ParquetWriterOptions.builder() .withNonNullableColumns("_c0", "_c1", "_c2", "_c3", "_c4", "_c5", "_c6") .withDecimalColumn("_c7", 5) - .withDecimalColumn("_c8", 5) + .withDecimalColumn("_c8", 15) .build(); try (TableWriter writer = Table.writeParquetChunked(options, consumer)) { From b98b4a8d38fa4345dc4a7a37e9b56b2ba0c71166 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 3 Dec 2021 23:26:30 -0800 Subject: [PATCH 11/21] java fix try 2 --- java/src/test/java/ai/rapids/cudf/TableTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index d49ce5baa10..1b5f35c43bd 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -6882,8 +6882,8 @@ void testParquetWriteToBufferChunkedInt96() { MyBufferConsumer consumer = new MyBufferConsumer()) { ParquetWriterOptions options = ParquetWriterOptions.builder() .withNonNullableColumns("_c0", "_c1", "_c2", "_c3", "_c4", "_c5", "_c6") - .withDecimalColumn("_c7", 5) - .withDecimalColumn("_c8", 15) + .withDecimalColumn("_c7", 15) + .withDecimalColumn("_c8", 5) .build(); try (TableWriter writer = Table.writeParquetChunked(options, consumer)) { From 18692554f13e847f02d91446c033f0c443e9ad35 Mon Sep 17 00:00:00 2001 From: Jason 
Lowe Date: Tue, 7 Dec 2021 11:05:40 -0600 Subject: [PATCH 12/21] Remove deprecated methods from Java Table class --- java/src/main/java/ai/rapids/cudf/Table.java | 89 +------------------ .../test/java/ai/rapids/cudf/TableTest.java | 37 ++++---- 2 files changed, 19 insertions(+), 107 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index e32d466e853..a34d4afdc56 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -1091,20 +1091,6 @@ public static void writeColumnViewsToParquet(ParquetWriterOptions options, } } - /** - * Writes this table to a Parquet file on the host - * - * @param options parameters for the writer - * @param outputFile file to write the table to - * @deprecated please use writeParquetChunked instead - */ - @Deprecated - public void writeParquet(ParquetWriterOptions options, File outputFile) { - try (TableWriter writer = writeParquetChunked(options, outputFile)) { - writer.write(this); - } - } - private static class ORCTableWriter implements TableWriter { private long handle; HostBufferConsumer consumer; @@ -1179,33 +1165,6 @@ public static TableWriter writeORCChunked(ORCWriterOptions options, HostBufferCo return new ORCTableWriter(options, consumer); } - /** - * Writes this table to a file on the host. - * @param outputFile - File to write the table to - * @deprecated please use writeORCChunked instead - */ - @Deprecated - public void writeORC(File outputFile) { - // Need to specify the number of columns but leave all column names undefined - String[] names = new String[getNumberOfColumns()]; - Arrays.fill(names, ""); - ORCWriterOptions opts = ORCWriterOptions.builder().withColumns(true, names).build(); - writeORC(opts, outputFile); - } - - /** - * Writes this table to a file on the host. 
- * @param outputFile - File to write the table to - * @deprecated please use writeORCChunked instead - */ - @Deprecated - public void writeORC(ORCWriterOptions options, File outputFile) { - assert options.getTopLevelChildren() == getNumberOfColumns() : "must specify names for all columns"; - try (TableWriter writer = Table.writeORCChunked(options, outputFile)) { - writer.write(this); - } - } - private static class ArrowIPCTableWriter implements TableWriter { private final ArrowIPCWriterOptions.DoneOnGpu callback; private long handle; @@ -2082,26 +2041,6 @@ public Table gather(ColumnView gatherMap) { return gather(gatherMap, OutOfBoundsPolicy.NULLIFY); } - /** - * Gathers the rows of this table according to `gatherMap` such that row "i" - * in the resulting table's columns will contain row "gatherMap[i]" from this table. - * The number of rows in the result table will be equal to the number of elements in - * `gatherMap`. - * - * A negative value `i` in the `gatherMap` is interpreted as `i+n`, where - * `n` is the number of rows in this table. - * - * @deprecated Use {@link #gather(ColumnView, OutOfBoundsPolicy)} - * @param gatherMap the map of indexes. Must be non-nullable and integral type. - * @param checkBounds if true bounds checking is performed on the value. Be very careful - * when setting this to false. - * @return the resulting Table. - */ - @Deprecated - public Table gather(ColumnView gatherMap, boolean checkBounds) { - return new Table(gather(nativeHandle, gatherMap.getNativeView(), checkBounds)); - } - /** * Gathers the rows of this table according to `gatherMap` such that row "i" * in the resulting table's columns will contain row "gatherMap[i]" from this table. @@ -2256,7 +2195,7 @@ public GatherMap[] conditionalLeftJoinGatherMaps(Table rightTable, * the left and right tables, respectively, to produce the result of the left join. * It is the responsibility of the caller to close the resulting gather map instances. 
* This interface allows passing an output row count that was previously computed from - * {@link #conditionalLeftJoinRowCount(Table, CompiledExpression, boolean)}. + * {@link #conditionalLeftJoinRowCount(Table, CompiledExpression)}. * WARNING: Passing a row count that is smaller than the actual row count will result * in undefined behavior. * @param rightTable the right side table of the join in the join @@ -2396,7 +2335,7 @@ public GatherMap[] conditionalInnerJoinGatherMaps(Table rightTable, * the left and right tables, respectively, to produce the result of the inner join. * It is the responsibility of the caller to close the resulting gather map instances. * This interface allows passing an output row count that was previously computed from - * {@link #conditionalInnerJoinRowCount(Table, CompiledExpression, boolean)}. + * {@link #conditionalInnerJoinRowCount(Table, CompiledExpression)}. * WARNING: Passing a row count that is smaller than the actual row count will result * in undefined behavior. * @param rightTable the right side table of the join in the join @@ -2588,7 +2527,7 @@ public GatherMap conditionalLeftSemiJoinGatherMap(Table rightTable, * to produce the result of the left semi join. * It is the responsibility of the caller to close the resulting gather map instance. * This interface allows passing an output row count that was previously computed from - * {@link #conditionalLeftSemiJoinRowCount(Table, CompiledExpression, boolean)}. + * {@link #conditionalLeftSemiJoinRowCount(Table, CompiledExpression)}. * WARNING: Passing a row count that is smaller than the actual row count will result * in undefined behavior. * @param rightTable the right side table of the join @@ -2667,7 +2606,7 @@ public GatherMap conditionalLeftAntiJoinGatherMap(Table rightTable, * to produce the result of the left anti join. * It is the responsibility of the caller to close the resulting gather map instance. 
* This interface allows passing an output row count that was previously computed from - * {@link #conditionalLeftAntiJoinRowCount(Table, CompiledExpression, boolean)}. + * {@link #conditionalLeftAntiJoinRowCount(Table, CompiledExpression)}. * WARNING: Passing a row count that is smaller than the actual row count will result * in undefined behavior. * @param rightTable the right side table of the join @@ -3449,14 +3388,6 @@ public ContiguousTable[] contiguousSplitGroups() { groupByOptions.getKeysDescending(), groupByOptions.getKeysNullSmallest()); } - - /** - * @deprecated use aggregateWindowsOverRanges - */ - @Deprecated - public Table aggregateWindowsOverTimeRanges(AggregationOverWindow... windowAggregates) { - return aggregateWindowsOverRanges(windowAggregates); - } } public static final class TableOperation { @@ -3651,18 +3582,6 @@ public PartitionedTable hashPartition(HashType type, int numberOfPartitions) { partitionOffsets.length, partitionOffsets)), partitionOffsets); } - - /** - * Hash partition a table into the specified number of partitions. 
- * @deprecated Use {@link #hashPartition(int)} - * @param numberOfPartitions - number of partitions to use - * @return - {@link PartitionedTable} - Table that exposes a limited functionality of the - * {@link Table} class - */ - @Deprecated - public PartitionedTable partition(int numberOfPartitions) { - return hashPartition(numberOfPartitions); - } } ///////////////////////////////////////////////////////////////////////////// diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index a5779bf9dbb..21e3b3784fc 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -7177,19 +7177,6 @@ void testORCWriteMapChunked() throws IOException { } } - @Test - void testORCWriteToFile() throws IOException { - File tempFile = File.createTempFile("test", ".orc"); - try (Table table0 = getExpectedFileTable(WriteUtils.getNonNestedColumns(false))) { - table0.writeORC(tempFile.getAbsoluteFile()); - try (Table table1 = Table.readORC(tempFile.getAbsoluteFile())) { - assertTablesAreEqual(table0, table1); - } - } finally { - tempFile.delete(); - } - } - @Test void testORCWriteToFileWithColNames() throws IOException { File tempFile = File.createTempFile("test", ".orc"); @@ -7198,7 +7185,9 @@ void testORCWriteToFileWithColNames() throws IOException { ORCWriterOptions.Builder optBuilder = ORCWriterOptions.builder(); WriteUtils.buildWriterOptions(optBuilder, colNames); ORCWriterOptions options = optBuilder.build(); - table0.writeORC(options, tempFile.getAbsoluteFile()); + try (TableWriter writer = Table.writeORCChunked(options, tempFile.getAbsoluteFile())) { + writer.write(table0); + } ORCOptions opts = ORCOptions.builder().includeColumn(colNames).build(); try (Table table1 = Table.readORC(opts, tempFile.getAbsoluteFile())) { assertTablesAreEqual(table0, table1); @@ -7217,7 +7206,9 @@ void testORCReadAndWriteForDecimal128() throws IOException { ORCWriterOptions.Builder 
optBuilder = ORCWriterOptions.builder(); WriteUtils.buildWriterOptions(optBuilder, colNames); ORCWriterOptions options = optBuilder.build(); - table0.writeORC(options, tempFile.getAbsoluteFile()); + try (TableWriter writer = Table.writeORCChunked(options, tempFile.getAbsoluteFile())) { + writer.write(table0); + } ORCOptions opts = ORCOptions.builder() .includeColumn(colNames) .decimal128Column(Columns.DECIMAL128.name, @@ -7236,13 +7227,15 @@ void testORCReadAndWriteForDecimal128() throws IOException { void testORCWriteToFileUncompressed() throws IOException { File tempFileUncompressed = File.createTempFile("test-uncompressed", ".orc"); try (Table table0 = getExpectedFileTable(WriteUtils.getNonNestedColumns(false))) { - String[] colNames = new String[table0.getNumberOfColumns()]; - Arrays.fill(colNames, ""); - ORCWriterOptions opts = ORCWriterOptions.builder() - .withColumns(true, colNames) - .withCompressionType(CompressionType.NONE) - .build(); - table0.writeORC(opts, tempFileUncompressed.getAbsoluteFile()); + String[] colNames = WriteUtils.getNonNestedColumns(false); + ORCWriterOptions.Builder optsBuilder = ORCWriterOptions.builder(); + WriteUtils.buildWriterOptions(optsBuilder, colNames); + optsBuilder.withCompressionType(CompressionType.NONE); + ORCWriterOptions opts = optsBuilder.build(); + try (TableWriter writer = + Table.writeORCChunked(opts,tempFileUncompressed.getAbsoluteFile())) { + writer.write(table0); + } try (Table table2 = Table.readORC(tempFileUncompressed.getAbsoluteFile())) { assertTablesAreEqual(table0, table2); } From b6c9d7bb27f0cb5d62eb4a6603776076e6804ee7 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 7 Dec 2021 10:22:13 -0800 Subject: [PATCH 13/21] revert Java test changes --- java/src/test/java/ai/rapids/cudf/TableTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 2bdcaea9fba..21e3b3784fc 100644 --- 
a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -6882,7 +6882,7 @@ void testParquetWriteToBufferChunkedInt96() { MyBufferConsumer consumer = new MyBufferConsumer()) { ParquetWriterOptions options = ParquetWriterOptions.builder() .withNonNullableColumns("_c0", "_c1", "_c2", "_c3", "_c4", "_c5", "_c6") - .withDecimalColumn("_c7", 15) + .withDecimalColumn("_c7", 5) .withDecimalColumn("_c8", 5) .build(); From 58af48090f8a8da79c84d76fdd9207404d8bbb77 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 7 Dec 2021 12:36:18 -0800 Subject: [PATCH 14/21] clean up --- cpp/include/cudf/io/orc.hpp | 6 +++--- cpp/tests/io/orc_test.cpp | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index d748f5bec09..16588185f3d 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -236,7 +236,7 @@ class orc_reader_options { * * @param use Boolean value to enable/disable. */ - void enable_decimal128(bool enable) { _enable_decimal128 = enable; } + void enable_decimal128(bool use) { _enable_decimal128 = use; } /** * @brief Set columns that should be read as 128-bit Decimal @@ -380,9 +380,9 @@ class orc_reader_options_builder { * * @param use Boolean value to enable/disable. 
*/ - orc_reader_options_builder& decimal128(bool enable) + orc_reader_options_builder& decimal128(bool use) { - options.enable_decimal128(enable); + options.enable_decimal128(use); return *this; } diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index cfcc266434c..574ce8573e9 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1489,6 +1489,7 @@ TEST_F(OrcWriterTest, DecimalOptionsNested) .decimal128_columns({"lists.1.dec64"}); auto result = cudf_io::read_orc(in_opts); + // Both columns should be read as decimal128 CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result.tbl->view().column(0).child(1).child(0), result.tbl->view().column(0).child(1).child(1)); } From a21bff4d1721896b2c17d174835aea4feda8b3f3 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 7 Dec 2021 15:29:30 -0800 Subject: [PATCH 15/21] clean up initialization --- cpp/src/io/orc/reader_impl.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index d9c786d480a..32495a651df 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -219,13 +219,13 @@ class reader::impl { cudf::io::orc::detail::aggregate_orc_metadata _metadata; cudf::io::orc::detail::column_hierarchy selected_columns; - bool _use_index = true; - bool _use_np_dtypes = true; + bool _use_index{true}; + bool _use_np_dtypes{true}; std::vector _decimal_cols_as_float; std::vector decimal128_columns; - bool is_decimal128_enabled; + bool is_decimal128_enabled{true}; data_type _timestamp_type{type_id::EMPTY}; - reader_column_meta _col_meta; + reader_column_meta _col_meta{}; }; } // namespace orc From 349d4e2dff9bd82f330c2b3707ed7c93f5bd1b50 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 7 Dec 2021 15:47:30 -0800 Subject: [PATCH 16/21] stylin' --- cpp/src/io/orc/reader_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp 
index 32495a651df..e8aa298012b 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -223,7 +223,7 @@ class reader::impl { bool _use_np_dtypes{true}; std::vector _decimal_cols_as_float; std::vector decimal128_columns; - bool is_decimal128_enabled{true}; + bool is_decimal128_enabled{true}; data_type _timestamp_type{type_id::EMPTY}; reader_column_meta _col_meta{}; }; From 85fea70b146c756ae47913c47c6923beda80f4aa Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 8 Dec 2021 16:23:17 -0800 Subject: [PATCH 17/21] cmake style fix --- cpp/cmake/thirdparty/get_dlpack.cmake | 3 ++- cpp/cmake/thirdparty/get_jitify.cmake | 3 ++- cpp/cmake/thirdparty/get_libcudacxx.cmake | 5 +++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/cpp/cmake/thirdparty/get_dlpack.cmake b/cpp/cmake/thirdparty/get_dlpack.cmake index aeffd64f371..252d50c7af8 100644 --- a/cpp/cmake/thirdparty/get_dlpack.cmake +++ b/cpp/cmake/thirdparty/get_dlpack.cmake @@ -21,7 +21,8 @@ function(find_and_configure_dlpack VERSION) dlpack ${VERSION} GIT_REPOSITORY https://github.com/dmlc/dlpack.git GIT_TAG v${VERSION} - GIT_SHALLOW TRUE DOWNLOAD_ONLY TRUE + GIT_SHALLOW TRUE + DOWNLOAD_ONLY TRUE OPTIONS "BUILD_MOCK OFF" ) diff --git a/cpp/cmake/thirdparty/get_jitify.cmake b/cpp/cmake/thirdparty/get_jitify.cmake index 7c4526107a3..51bd41ea079 100644 --- a/cpp/cmake/thirdparty/get_jitify.cmake +++ b/cpp/cmake/thirdparty/get_jitify.cmake @@ -20,7 +20,8 @@ function(find_and_configure_jitify) jitify 2.0.0 GIT_REPOSITORY https://github.com/rapidsai/jitify.git GIT_TAG cudf_0.19 - GIT_SHALLOW TRUE DOWNLOAD_ONLY TRUE + GIT_SHALLOW TRUE + DOWNLOAD_ONLY TRUE ) set(JITIFY_INCLUDE_DIR "${jitify_SOURCE_DIR}" diff --git a/cpp/cmake/thirdparty/get_libcudacxx.cmake b/cpp/cmake/thirdparty/get_libcudacxx.cmake index 290c4f61e41..0917adcd764 100644 --- a/cpp/cmake/thirdparty/get_libcudacxx.cmake +++ b/cpp/cmake/thirdparty/get_libcudacxx.cmake @@ -17,8 +17,9 @@ 
function(find_and_configure_libcudacxx) include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) rapids_cpm_libcudacxx( - BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports PATCH_COMMAND patch - --reject-file=- -p1 -N < ${CUDF_SOURCE_DIR}/cmake/libcudacxx.patch || true + BUILD_EXPORT_SET cudf-exports + INSTALL_EXPORT_SET cudf-exports PATCH_COMMAND patch --reject-file=- -p1 -N < + ${CUDF_SOURCE_DIR}/cmake/libcudacxx.patch || true ) set(LIBCUDACXX_INCLUDE_DIR From dc70dd31ad79003709a784f11e6828c3a23e5a6d Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 10 Dec 2021 14:58:12 -0800 Subject: [PATCH 18/21] digits10 --- cpp/benchmarks/common/generate_benchmark_input.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/benchmarks/common/generate_benchmark_input.cpp b/cpp/benchmarks/common/generate_benchmark_input.cpp index 9115f2f4a89..995cea13c27 100644 --- a/cpp/benchmarks/common/generate_benchmark_input.cpp +++ b/cpp/benchmarks/common/generate_benchmark_input.cpp @@ -161,22 +161,23 @@ struct random_value_fn()>> { */ template struct random_value_fn()>> { - typename T::rep const lower_bound; - typename T::rep const upper_bound; - distribution_fn dist; + using rep = typename T::rep; + rep const lower_bound; + rep const upper_bound; + distribution_fn dist; std::optional scale; - random_value_fn(distribution_params const& desc) + random_value_fn(distribution_params const& desc) : lower_bound{desc.lower_bound}, upper_bound{desc.upper_bound}, - dist{make_distribution(desc.id, desc.lower_bound, desc.upper_bound)} + dist{make_distribution(desc.id, desc.lower_bound, desc.upper_bound)} { } T operator()(std::mt19937& engine) { if (not scale.has_value()) { - int const max_scale = 9 * sizeof(typename T::rep) / 4; + int const max_scale = std::numeric_limits::digits10; auto scale_dist = make_distribution(distribution_id::NORMAL, -max_scale, max_scale); scale = numeric::scale_type{std::max(std::min(scale_dist(engine), max_scale), -max_scale)}; } From 
7416c0714b82eea648663acf9dc13d8f3828c3ed Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 10 Dec 2021 14:58:27 -0800 Subject: [PATCH 19/21] T::rep alias --- cpp/benchmarks/common/generate_benchmark_input.hpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/cpp/benchmarks/common/generate_benchmark_input.hpp b/cpp/benchmarks/common/generate_benchmark_input.hpp index 3bd371438d7..3dbc6561839 100644 --- a/cpp/benchmarks/common/generate_benchmark_input.hpp +++ b/cpp/benchmarks/common/generate_benchmark_input.hpp @@ -287,16 +287,14 @@ class data_profile { template ()>* = nullptr> distribution_params get_distribution_params() const { - auto it = decimal_params.find(cudf::type_to_id()); + using rep = typename T::rep; + auto it = decimal_params.find(cudf::type_to_id()); if (it == decimal_params.end()) { - auto const range = default_range(); - return distribution_params{ - default_distribution_id(), range.first, range.second}; + auto const range = default_range(); + return distribution_params{default_distribution_id(), range.first, range.second}; } else { auto& desc = it->second; - return {desc.id, - static_cast(desc.lower_bound), - static_cast(desc.upper_bound)}; + return {desc.id, static_cast(desc.lower_bound), static_cast(desc.upper_bound)}; } } From 355040c88fafdcf0dff6101c60c24fa09af134d9 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 14 Dec 2021 16:13:48 -0800 Subject: [PATCH 20/21] refactor make_normal_dist to make the range of output values clearer; update API use --- .../common/random_distribution_factory.hpp | 25 +++++++++++-------- .../io/parquet/parquet_writer_benchmark.cpp | 2 +- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/cpp/benchmarks/common/random_distribution_factory.hpp b/cpp/benchmarks/common/random_distribution_factory.hpp index dd85495e65f..8416654f03e 100644 --- a/cpp/benchmarks/common/random_distribution_factory.hpp +++ b/cpp/benchmarks/common/random_distribution_factory.hpp @@ -21,19 +21,24 @@ 
#include #include +/** + * @brief Generates a normal(binomial) distribution between zero and max_val. + */ template ::value, T>* = nullptr> -auto make_normal_dist(T range_start, T range_end) +auto make_normal_dist(T max_val) { - using uT = typename std::make_unsigned::type; - uT const range_size = range_end - range_start; - return std::binomial_distribution(range_size, 0.5); + using uT = typename std::make_unsigned::type; + return std::binomial_distribution(max_val, 0.5); } +/** + * @brief Generates a normal distribution between zero and max_val. + */ template ()>* = nullptr> -auto make_normal_dist(T range_start, T range_end) +auto make_normal_dist(T max_val) { - T const mean = range_start / 2 + range_end / 2; - T const stddev = range_end / 6 - range_start / 6; + T const mean = max_val / 2; + T const stddev = max_val / 6; return std::normal_distribution(mean, stddev); } @@ -82,7 +87,7 @@ distribution_fn make_distribution(distribution_id did, T lower_bound, T upper { switch (did) { case distribution_id::NORMAL: - return [lower_bound, dist = make_normal_dist(lower_bound, upper_bound)]( + return [lower_bound, dist = make_normal_dist(upper_bound - lower_bound)]( std::mt19937& engine) mutable -> T { return dist(engine) + lower_bound; }; case distribution_id::UNIFORM: return [dist = make_uniform_dist(lower_bound, upper_bound)]( @@ -104,8 +109,8 @@ distribution_fn make_distribution(distribution_id dist_id, T lower_bound, T u { switch (dist_id) { case distribution_id::NORMAL: - return [dist = make_normal_dist(lower_bound, upper_bound)]( - std::mt19937& engine) mutable -> T { return dist(engine); }; + return [lower_bound, dist = make_normal_dist(upper_bound - lower_bound)]( + std::mt19937& engine) mutable -> T { return dist(engine) + lower_bound; }; case distribution_id::UNIFORM: return [dist = make_uniform_dist(lower_bound, upper_bound)]( std::mt19937& engine) mutable -> T { return dist(engine); }; diff --git a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp 
b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp index 71a47578cca..1af7e206692 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp @@ -87,7 +87,7 @@ void BM_parq_write_varying_options(benchmark::State& state) cudf_io::parquet_writer_options::builder(source_sink.make_sink_info(), view) .compression(compression) .stats_level(enable_stats) - .column_chunks_file_path(file_path); + .column_chunks_file_paths({file_path}); cudf_io::write_parquet(options); } From 349e023a2f4178fda45966e2a1d6ed6924477b05 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 14 Dec 2021 16:15:55 -0800 Subject: [PATCH 21/21] rename --- .../common/random_distribution_factory.hpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/benchmarks/common/random_distribution_factory.hpp b/cpp/benchmarks/common/random_distribution_factory.hpp index 8416654f03e..65dc8b4dd4d 100644 --- a/cpp/benchmarks/common/random_distribution_factory.hpp +++ b/cpp/benchmarks/common/random_distribution_factory.hpp @@ -22,23 +22,23 @@ #include /** - * @brief Generates a normal(binomial) distribution between zero and max_val. + * @brief Generates a normal(binomial) distribution between zero and upper_bound. */ template ::value, T>* = nullptr> -auto make_normal_dist(T max_val) +auto make_normal_dist(T upper_bound) { using uT = typename std::make_unsigned::type; - return std::binomial_distribution(max_val, 0.5); + return std::binomial_distribution(upper_bound, 0.5); } /** - * @brief Generates a normal distribution between zero and max_val. + * @brief Generates a normal distribution between zero and upper_bound. */ template ()>* = nullptr> -auto make_normal_dist(T max_val) +auto make_normal_dist(T upper_bound) { - T const mean = max_val / 2; - T const stddev = max_val / 6; + T const mean = upper_bound / 2; + T const stddev = upper_bound / 6; return std::normal_distribution(mean, stddev); }