From 2e95fb19eb2876110617af5b564287c748915087 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 7 Dec 2021 22:28:50 -0800 Subject: [PATCH] Pick smallest decimal type with required precision in ORC reader (#9775) Depends on #9853 Current behavior is to throw when an ORC column has precision that is too high for decimal64. This PR changes the behavior to instead read the column as decimal128, when precision is too high for 64 bits. This reduces the need for the use of `decimal128_columns` option. Also modified the decimal type inference to use decimal32 when the precision is sufficiently low, reducing memory use in such case. Adds a temporary option to disable decimal128 use. This option is used in Python to get a readable error message in this case, while allowing decimal128 use by other callers. Authors: - Vukasin Milovanovic (https://github.com/vuule) - Jason Lowe (https://github.com/jlowe) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Mark Harris (https://github.com/harrism) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/9775 --- cpp/include/cudf/io/orc.hpp | 24 ++++++++++++++++++ cpp/src/io/orc/reader_impl.cu | 37 +++++++++++++++++----------- cpp/src/io/orc/reader_impl.hpp | 7 +++--- cpp/src/io/orc/stripe_data.cu | 14 ++++++----- cpp/tests/io/orc_test.cpp | 23 +++++++---------- python/cudf/cudf/_lib/cpp/io/orc.pxd | 6 ++--- python/cudf/cudf/_lib/orc.pyx | 1 + 7 files changed, 71 insertions(+), 41 deletions(-) diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 3bc2e6c9ef2..16588185f3d 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -72,6 +72,7 @@ class orc_reader_options { // Columns that should be read as Decimal128 std::vector _decimal128_columns; + bool _enable_decimal128 = true; friend orc_reader_options_builder; @@ -151,6 +152,11 @@ class orc_reader_options { */ std::vector const& get_decimal128_columns() const { return _decimal128_columns; } + /** + * @brief Whether to use row index to speed-up reading. + */ + bool is_enabled_decimal128() const { return _enable_decimal128; } + // Setters /** @@ -225,6 +231,13 @@ class orc_reader_options { _decimal_cols_as_float = std::move(val); } + /** + * @brief Enable/Disable the use of decimal128 type + * + * @param use Boolean value to enable/disable. + */ + void enable_decimal128(bool use) { _enable_decimal128 = use; } + /** * @brief Set columns that should be read as 128-bit Decimal * @@ -362,6 +375,17 @@ class orc_reader_options_builder { return *this; } + /** + * @brief Enable/Disable use of decimal128 type + * + * @param use Boolean value to enable/disable. + */ + orc_reader_options_builder& decimal128(bool use) + { + options.enable_decimal128(use); + return *this; + } + /** * @brief move orc_reader_options member once it's built. */ diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 798cdca178a..21c52f9295b 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -230,24 +230,35 @@ size_t gather_stream_info(const size_t stripe_index, /** * @brief Determines cuDF type of an ORC Decimal column. */ -auto decimal_column_type(const std::vector& float64_columns, - const std::vector& decimal128_columns, - cudf::io::orc::metadata& metadata, +auto decimal_column_type(std::vector const& float64_columns, + std::vector const& decimal128_columns, + bool is_decimal128_enabled, + cudf::io::orc::detail::aggregate_orc_metadata const& metadata, int column_index) { - auto const& column_path = metadata.column_path(column_index); + if (metadata.get_col_type(column_index).kind != DECIMAL) return type_id::EMPTY; + + auto const& column_path = metadata.column_path(0, column_index); auto is_column_in = [&](const std::vector& cols) { return std::find(cols.cbegin(), cols.cend(), column_path) != cols.end(); }; auto const user_selected_float64 = is_column_in(float64_columns); - auto const user_selected_decimal128 = is_column_in(decimal128_columns); + auto const user_selected_decimal128 = is_decimal128_enabled and is_column_in(decimal128_columns); CUDF_EXPECTS(not user_selected_float64 or not user_selected_decimal128, "Both decimal128 and float64 types selected for column " + column_path); if (user_selected_float64) return type_id::FLOAT64; if (user_selected_decimal128) return type_id::DECIMAL128; - return type_id::DECIMAL64; + + auto const precision = metadata.get_col_type(column_index) + .precision.value_or(cuda::std::numeric_limits::digits10); + if (precision <= cuda::std::numeric_limits::digits10) return type_id::DECIMAL32; + if (precision <= cuda::std::numeric_limits::digits10) return type_id::DECIMAL64; + CUDF_EXPECTS(is_decimal128_enabled, + "Decimal precision too high for decimal64, use `decimal_cols_as_float` or enable " + "decimal128 use"); + return type_id::DECIMAL128; } } // namespace @@ -744,7 +755,7 @@ std::unique_ptr reader::impl::create_empty_column(const size_type orc_co _use_np_dtypes, _timestamp_type.id(), decimal_column_type( - _decimal_cols_as_float, decimal128_columns, _metadata.per_file_metadata[0], orc_col_id)); + _decimal_cols_as_float, decimal128_columns, is_decimal128_enabled, _metadata, orc_col_id)); int32_t scale = 0; std::vector> child_columns; std::unique_ptr out_col = nullptr; @@ -795,7 +806,7 @@ std::unique_ptr reader::impl::create_empty_column(const size_type orc_co break; case orc::DECIMAL: - if (type == type_id::DECIMAL64 or type == type_id::DECIMAL128) { + if (type == type_id::DECIMAL32 or type == type_id::DECIMAL64 or type == type_id::DECIMAL128) { scale = -static_cast(_metadata.get_types()[orc_col_id].scale.value_or(0)); } out_col = make_empty_column(data_type(type, scale)); @@ -889,6 +900,7 @@ reader::impl::impl(std::vector>&& sources, // Control decimals conversion _decimal_cols_as_float = options.get_decimal_cols_as_float(); decimal128_columns = options.get_decimal128_columns(); + is_decimal128_enabled = options.is_enabled_decimal128(); } timezone_table reader::impl::compute_timezone_table( @@ -953,13 +965,10 @@ table_with_metadata reader::impl::read(size_type skip_rows, _use_np_dtypes, _timestamp_type.id(), decimal_column_type( - _decimal_cols_as_float, decimal128_columns, _metadata.per_file_metadata[0], col.id)); + _decimal_cols_as_float, decimal128_columns, is_decimal128_enabled, _metadata, col.id)); CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type"); - CUDF_EXPECTS( - (col_type != type_id::DECIMAL64) or (_metadata.get_col_type(col.id).precision <= 18), - "Precision of column " + std::string{_metadata.column_name(0, col.id)} + - " is over 18, use 128-bit Decimal."); - if (col_type == type_id::DECIMAL64 or col_type == type_id::DECIMAL128) { + if (col_type == type_id::DECIMAL32 or col_type == type_id::DECIMAL64 or + col_type == type_id::DECIMAL128) { // sign of the scale is changed since cuDF follows c++ libraries like CNL // which uses negative scaling, but liborc and other libraries // follow positive scaling. diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 64e7cbc74e5..e8aa298012b 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -219,12 +219,13 @@ class reader::impl { cudf::io::orc::detail::aggregate_orc_metadata _metadata; cudf::io::orc::detail::column_hierarchy selected_columns; - bool _use_index = true; - bool _use_np_dtypes = true; + bool _use_index{true}; + bool _use_np_dtypes{true}; std::vector _decimal_cols_as_float; std::vector decimal128_columns; + bool is_decimal128_enabled{true}; data_type _timestamp_type{type_id::EMPTY}; - reader_column_meta _col_meta; + reader_column_meta _col_meta{}; }; } // namespace orc diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 44f106c4f5c..8f8bb87d9e4 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1066,12 +1066,12 @@ static __device__ int Decode_Decimals(orc_bytestream_s* bs, return (v / kPow5i[scale]) >> scale; } }(); - if (dtype_id == type_id::DECIMAL64) { + if (dtype_id == type_id::DECIMAL32) { + vals.i32[t] = scaled_value; + } else if (dtype_id == type_id::DECIMAL64) { vals.i64[t] = scaled_value; } else { - { - vals.i128[t] = scaled_value; - } + vals.i128[t] = scaled_value; } } } @@ -1708,8 +1708,10 @@ __global__ void __launch_bounds__(block_size) case DOUBLE: case LONG: static_cast(data_out)[row] = s->vals.u64[t + vals_skipped]; break; case DECIMAL: - if (s->chunk.dtype_id == type_id::FLOAT64 or - s->chunk.dtype_id == type_id::DECIMAL64) { + if (s->chunk.dtype_id == type_id::DECIMAL32) { + static_cast(data_out)[row] = s->vals.u32[t + vals_skipped]; + } else if (s->chunk.dtype_id == type_id::FLOAT64 or + s->chunk.dtype_id == type_id::DECIMAL64) { static_cast(data_out)[row] = s->vals.u64[t + vals_skipped]; } else { // decimal128 diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index da44c91eec3..574ce8573e9 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -387,9 +387,7 @@ TEST_F(OrcWriterTest, MultiColumn) cudf_io::write_orc(out_opts); cudf_io::orc_reader_options in_opts = - cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}) - .use_index(false) - .decimal128_columns({"decimal_pos_scale", "decimal_neg_scale"}); + cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false); auto result = cudf_io::read_orc(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); @@ -1178,9 +1176,9 @@ TEST_F(OrcWriterTest, Decimal32) auto data = cudf::detail::make_counting_transform_iterator(0, [&vals](auto i) { return numeric::decimal32{vals[i], numeric::scale_type{2}}; }); - auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 13 == 0; }); + auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 13; }); column_wrapper col{data, data + num_rows, mask}; - cudf::table_view expected({static_cast(col)}); + cudf::table_view expected({col}); auto filepath = temp_env->get_temp_filepath("Decimal32.orc"); cudf_io::orc_writer_options out_opts = @@ -1192,12 +1190,7 @@ TEST_F(OrcWriterTest, Decimal32) cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}); auto result = cudf_io::read_orc(in_opts); - auto data64 = cudf::detail::make_counting_transform_iterator(0, [&vals](auto i) { - return numeric::decimal64{vals[i], numeric::scale_type{2}}; - }); - column_wrapper col64{data64, data64 + num_rows, mask}; - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(col64, result.tbl->view().column(0)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(col, result.tbl->view().column(0)); } TEST_F(OrcStatisticsTest, Overflow) @@ -1438,7 +1431,7 @@ TEST_F(OrcReaderTest, DecimalOptions) cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}) .decimal128_columns({"dec", "fake_name"}) .decimal_cols_as_float({"decc", "fake_name"}); - // Should not throw + // Should not throw, even with "fake name" in both options EXPECT_NO_THROW(cudf_io::read_orc(valid_opts)); cudf_io::orc_reader_options invalid_opts = @@ -1493,10 +1486,12 @@ TEST_F(OrcWriterTest, DecimalOptionsNested) cudf_io::orc_reader_options in_opts = cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}) .use_index(false) - .decimal128_columns({"lists.1.dec128"}); + .decimal128_columns({"lists.1.dec64"}); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + // Both columns should be read as decimal128 + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result.tbl->view().column(0).child(1).child(0), + result.tbl->view().column(0).child(1).child(1)); } CUDF_TEST_PROGRAM_MAIN() diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd index 4b5ec913fb6..2fc71f64df1 100644 --- a/python/cudf/cudf/_lib/cpp/io/orc.pxd +++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd @@ -36,7 +36,7 @@ cdef extern from "cudf/io/orc.hpp" \ void enable_use_np_dtypes(bool val) except+ void set_timestamp_type(data_type type) except+ void set_decimal_cols_as_float(vector[string] val) except+ - void set_decimal128_columns(vector[string] val) except+ + void enable_decimal128(bool val) except+ @staticmethod orc_reader_options_builder builder( @@ -58,9 +58,7 @@ cdef extern from "cudf/io/orc.hpp" \ orc_reader_options_builder& decimal_cols_as_float( vector[string] val ) except+ - orc_reader_options_builder& decimal128_columns( - vector[string] val - ) except+ + orc_reader_options_builder& decimal128(bool val) except+ orc_reader_options build() except+ diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index 1281aa172b4..9a4bd8652da 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -248,6 +248,7 @@ cdef orc_reader_options make_orc_reader_options( .timestamp_type(data_type(timestamp_type)) .use_index(use_index) .decimal_cols_as_float(c_decimal_cols_as_float) + .decimal128(False) .build() )