diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 3bc2e6c9ef2..16588185f3d 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -72,6 +72,7 @@ class orc_reader_options { // Columns that should be read as Decimal128 std::vector _decimal128_columns; + bool _enable_decimal128 = true; friend orc_reader_options_builder; @@ -151,6 +152,11 @@ class orc_reader_options { */ std::vector const& get_decimal128_columns() const { return _decimal128_columns; } + /** + * @brief Whether to use row index to speed-up reading. + */ + bool is_enabled_decimal128() const { return _enable_decimal128; } + // Setters /** @@ -225,6 +231,13 @@ class orc_reader_options { _decimal_cols_as_float = std::move(val); } + /** + * @brief Enable/Disable the use of decimal128 type + * + * @param use Boolean value to enable/disable. + */ + void enable_decimal128(bool use) { _enable_decimal128 = use; } + /** * @brief Set columns that should be read as 128-bit Decimal * @@ -362,6 +375,17 @@ class orc_reader_options_builder { return *this; } + /** + * @brief Enable/Disable use of decimal128 type + * + * @param use Boolean value to enable/disable. + */ + orc_reader_options_builder& decimal128(bool use) + { + options.enable_decimal128(use); + return *this; + } + /** * @brief move orc_reader_options member once it's built. */ diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 798cdca178a..21c52f9295b 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -230,24 +230,35 @@ size_t gather_stream_info(const size_t stripe_index, /** * @brief Determines cuDF type of an ORC Decimal column. */ -auto decimal_column_type(const std::vector& float64_columns, - const std::vector& decimal128_columns, - cudf::io::orc::metadata& metadata, +auto decimal_column_type(std::vector const& float64_columns, + std::vector const& decimal128_columns, + bool is_decimal128_enabled, + cudf::io::orc::detail::aggregate_orc_metadata const& metadata, int column_index) { - auto const& column_path = metadata.column_path(column_index); + if (metadata.get_col_type(column_index).kind != DECIMAL) return type_id::EMPTY; + + auto const& column_path = metadata.column_path(0, column_index); auto is_column_in = [&](const std::vector& cols) { return std::find(cols.cbegin(), cols.cend(), column_path) != cols.end(); }; auto const user_selected_float64 = is_column_in(float64_columns); - auto const user_selected_decimal128 = is_column_in(decimal128_columns); + auto const user_selected_decimal128 = is_decimal128_enabled and is_column_in(decimal128_columns); CUDF_EXPECTS(not user_selected_float64 or not user_selected_decimal128, "Both decimal128 and float64 types selected for column " + column_path); if (user_selected_float64) return type_id::FLOAT64; if (user_selected_decimal128) return type_id::DECIMAL128; - return type_id::DECIMAL64; + + auto const precision = metadata.get_col_type(column_index) + .precision.value_or(cuda::std::numeric_limits::digits10); + if (precision <= cuda::std::numeric_limits::digits10) return type_id::DECIMAL32; + if (precision <= cuda::std::numeric_limits::digits10) return type_id::DECIMAL64; + CUDF_EXPECTS(is_decimal128_enabled, + "Decimal precision too high for decimal64, use `decimal_cols_as_float` or enable " + "decimal128 use"); + return type_id::DECIMAL128; } } // namespace @@ -744,7 +755,7 @@ std::unique_ptr reader::impl::create_empty_column(const size_type orc_co _use_np_dtypes, _timestamp_type.id(), decimal_column_type( - _decimal_cols_as_float, decimal128_columns, _metadata.per_file_metadata[0], orc_col_id)); + _decimal_cols_as_float, decimal128_columns, is_decimal128_enabled, _metadata, orc_col_id)); int32_t scale = 0; std::vector> child_columns; std::unique_ptr out_col = nullptr; @@ -795,7 +806,7 @@ std::unique_ptr reader::impl::create_empty_column(const size_type orc_co break; case orc::DECIMAL: - if (type == type_id::DECIMAL64 or type == type_id::DECIMAL128) { + if (type == type_id::DECIMAL32 or type == type_id::DECIMAL64 or type == type_id::DECIMAL128) { scale = -static_cast(_metadata.get_types()[orc_col_id].scale.value_or(0)); } out_col = make_empty_column(data_type(type, scale)); @@ -889,6 +900,7 @@ reader::impl::impl(std::vector>&& sources, // Control decimals conversion _decimal_cols_as_float = options.get_decimal_cols_as_float(); decimal128_columns = options.get_decimal128_columns(); + is_decimal128_enabled = options.is_enabled_decimal128(); } timezone_table reader::impl::compute_timezone_table( @@ -953,13 +965,10 @@ table_with_metadata reader::impl::read(size_type skip_rows, _use_np_dtypes, _timestamp_type.id(), decimal_column_type( - _decimal_cols_as_float, decimal128_columns, _metadata.per_file_metadata[0], col.id)); + _decimal_cols_as_float, decimal128_columns, is_decimal128_enabled, _metadata, col.id)); CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type"); - CUDF_EXPECTS( - (col_type != type_id::DECIMAL64) or (_metadata.get_col_type(col.id).precision <= 18), - "Precision of column " + std::string{_metadata.column_name(0, col.id)} + - " is over 18, use 128-bit Decimal."); - if (col_type == type_id::DECIMAL64 or col_type == type_id::DECIMAL128) { + if (col_type == type_id::DECIMAL32 or col_type == type_id::DECIMAL64 or + col_type == type_id::DECIMAL128) { // sign of the scale is changed since cuDF follows c++ libraries like CNL // which uses negative scaling, but liborc and other libraries // follow positive scaling. diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 64e7cbc74e5..e8aa298012b 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -219,12 +219,13 @@ class reader::impl { cudf::io::orc::detail::aggregate_orc_metadata _metadata; cudf::io::orc::detail::column_hierarchy selected_columns; - bool _use_index = true; - bool _use_np_dtypes = true; + bool _use_index{true}; + bool _use_np_dtypes{true}; std::vector _decimal_cols_as_float; std::vector decimal128_columns; + bool is_decimal128_enabled{true}; data_type _timestamp_type{type_id::EMPTY}; - reader_column_meta _col_meta; + reader_column_meta _col_meta{}; }; } // namespace orc diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 44f106c4f5c..8f8bb87d9e4 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1066,12 +1066,12 @@ static __device__ int Decode_Decimals(orc_bytestream_s* bs, return (v / kPow5i[scale]) >> scale; } }(); - if (dtype_id == type_id::DECIMAL64) { + if (dtype_id == type_id::DECIMAL32) { + vals.i32[t] = scaled_value; + } else if (dtype_id == type_id::DECIMAL64) { vals.i64[t] = scaled_value; } else { - { - vals.i128[t] = scaled_value; - } + vals.i128[t] = scaled_value; } } } @@ -1708,8 +1708,10 @@ __global__ void __launch_bounds__(block_size) case DOUBLE: case LONG: static_cast(data_out)[row] = s->vals.u64[t + vals_skipped]; break; case DECIMAL: - if (s->chunk.dtype_id == type_id::FLOAT64 or - s->chunk.dtype_id == type_id::DECIMAL64) { + if (s->chunk.dtype_id == type_id::DECIMAL32) { + static_cast(data_out)[row] = s->vals.u32[t + vals_skipped]; + } else if (s->chunk.dtype_id == type_id::FLOAT64 or + s->chunk.dtype_id == type_id::DECIMAL64) { static_cast(data_out)[row] = s->vals.u64[t + vals_skipped]; } else { // decimal128 diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index da44c91eec3..574ce8573e9 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -387,9 +387,7 @@ TEST_F(OrcWriterTest, MultiColumn) cudf_io::write_orc(out_opts); cudf_io::orc_reader_options in_opts = - cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}) - .use_index(false) - .decimal128_columns({"decimal_pos_scale", "decimal_neg_scale"}); + cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false); auto result = cudf_io::read_orc(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); @@ -1178,9 +1176,9 @@ TEST_F(OrcWriterTest, Decimal32) auto data = cudf::detail::make_counting_transform_iterator(0, [&vals](auto i) { return numeric::decimal32{vals[i], numeric::scale_type{2}}; }); - auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 13 == 0; }); + auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 13; }); column_wrapper col{data, data + num_rows, mask}; - cudf::table_view expected({static_cast(col)}); + cudf::table_view expected({col}); auto filepath = temp_env->get_temp_filepath("Decimal32.orc"); cudf_io::orc_writer_options out_opts = @@ -1192,12 +1190,7 @@ TEST_F(OrcWriterTest, Decimal32) cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}); auto result = cudf_io::read_orc(in_opts); - auto data64 = cudf::detail::make_counting_transform_iterator(0, [&vals](auto i) { - return numeric::decimal64{vals[i], numeric::scale_type{2}}; - }); - column_wrapper col64{data64, data64 + num_rows, mask}; - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(col64, result.tbl->view().column(0)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(col, result.tbl->view().column(0)); } TEST_F(OrcStatisticsTest, Overflow) @@ -1438,7 +1431,7 @@ TEST_F(OrcReaderTest, DecimalOptions) cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}) .decimal128_columns({"dec", "fake_name"}) .decimal_cols_as_float({"decc", "fake_name"}); - // Should not throw + // Should not throw, even with "fake name" in both options EXPECT_NO_THROW(cudf_io::read_orc(valid_opts)); cudf_io::orc_reader_options invalid_opts = @@ -1493,10 +1486,12 @@ TEST_F(OrcWriterTest, DecimalOptionsNested) cudf_io::orc_reader_options in_opts = cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}) .use_index(false) - .decimal128_columns({"lists.1.dec128"}); + .decimal128_columns({"lists.1.dec64"}); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + // Both columns should be read as decimal128 + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result.tbl->view().column(0).child(1).child(0), + result.tbl->view().column(0).child(1).child(1)); } CUDF_TEST_PROGRAM_MAIN() diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd index 4b5ec913fb6..2fc71f64df1 100644 --- a/python/cudf/cudf/_lib/cpp/io/orc.pxd +++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd @@ -36,7 +36,7 @@ cdef extern from "cudf/io/orc.hpp" \ void enable_use_np_dtypes(bool val) except+ void set_timestamp_type(data_type type) except+ void set_decimal_cols_as_float(vector[string] val) except+ - void set_decimal128_columns(vector[string] val) except+ + void enable_decimal128(bool val) except+ @staticmethod orc_reader_options_builder builder( @@ -58,9 +58,7 @@ cdef extern from "cudf/io/orc.hpp" \ orc_reader_options_builder& decimal_cols_as_float( vector[string] val ) except+ - orc_reader_options_builder& decimal128_columns( - vector[string] val - ) except+ + orc_reader_options_builder& decimal128(bool val) except+ orc_reader_options build() except+ diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index 1281aa172b4..9a4bd8652da 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -248,6 +248,7 @@ cdef orc_reader_options make_orc_reader_options( .timestamp_type(data_type(timestamp_type)) .use_index(use_index) .decimal_cols_as_float(c_decimal_cols_as_float) + .decimal128(False) .build() )