diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index c8f9a73a1d6..d71f7ce005f 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -520,12 +520,18 @@ class fixed_point { */ explicit operator std::string() const { - int const n = std::pow(10, -_scale); - int const f = _value % n; - auto const num_zeros = std::max(0, (-_scale - static_cast(std::to_string(f).size()))); - auto const zeros = num_zeros <= 0 ? std::string("") : std::string(num_zeros, '0'); - return std::to_string(_value / n) + std::string(".") + zeros + - std::to_string(std::abs(_value) % n); + if (_scale < 0) { + int const n = std::pow(10, -_scale); + int const f = _value % n; + auto const num_zeros = + std::max(0, (-_scale - static_cast(std::to_string(f).size()))); + auto const zeros = std::string(num_zeros, '0'); + return std::to_string(_value / n) + std::string(".") + zeros + + std::to_string(std::abs(_value) % n); + } else { + auto const zeros = std::string(_scale, '0'); + return std::to_string(_value) + zeros; + } } }; // namespace numeric diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index 1769c72e1c8..163d8c9d735 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -113,11 +113,11 @@ class writer { */ std::unique_ptr> write( table_view const& table, - const table_metadata* metadata = nullptr, - bool return_filemetadata = false, - const std::string column_chunks_file_path = "", - bool int96_timestamps = false, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); + const table_metadata* metadata = nullptr, + bool return_filemetadata = false, + const std::string column_chunks_file_path = "", + std::vector const& decimal_precision = {}, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Begins the chunked/streamed write process. diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index e6eebec42d8..177c2da8a9d 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -401,6 +401,9 @@ class parquet_writer_options { bool _write_timestamps_as_int96 = false; // Column chunks file path to be set in the raw output metadata std::string _column_chunks_file_path; + /// vector of precision values for decimal writing. Exactly one entry + /// per decimal column. Optional unless decimals are being written. + std::vector _decimal_precision; /** * @brief Constructor from sink and table. @@ -480,6 +483,11 @@ class parquet_writer_options { */ std::string get_column_chunks_file_path() const { return _column_chunks_file_path; } + /** + * @brief Returns a constant reference to the decimal precision vector. + */ + std::vector const& get_decimal_precision() const { return _decimal_precision; } + /** * @brief Sets metadata. * @@ -525,6 +533,11 @@ class parquet_writer_options { { _column_chunks_file_path.assign(file_path); } + + /** + * @brief Sets the decimal precision vector data. + */ + void set_decimal_precision(std::vector dp) { _decimal_precision = std::move(dp); } }; class parquet_writer_options_builder { @@ -687,6 +700,8 @@ class chunked_parquet_writer_options { const table_metadata_with_nullability* _nullable_metadata = nullptr; // Parquet writes can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS. bool _write_timestamps_as_int96 = false; + // Optional decimal precision data - must be present if writing decimals + std::vector _decimal_precision = {}; /** * @brief Constructor from sink. @@ -728,6 +743,11 @@ class chunked_parquet_writer_options { return _nullable_metadata; } + /** + * @brief Returns decimal precision pointer. + */ + std::vector const& get_decimal_precision() const { return _decimal_precision; } + /** * @brief Returns `true` if timestamps will be written as INT96 */ @@ -743,6 +763,14 @@ class chunked_parquet_writer_options { _nullable_metadata = metadata; } + /** + * @brief Sets decimal precision data. + * + * @param v Vector of precision data flattened with exactly one entry per + * decimal column. + */ + void set_decimal_precision_data(std::vector const& v) { _decimal_precision = v; } + /** * @brief Sets the level of statistics in parquet_writer_options. * diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index 83aab53cc43..3dd14984d03 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -511,9 +511,9 @@ class fixed_point_column_wrapper : public detail::column_wrapper { * * Example: * @code{.cpp} - * // Creates a non-nullable column of INT32 elements with 5 elements: {0, 2, 4, 6, 8} + * // Creates a non-nullable column of DECIMAL32 elements with 5 elements: {0, 2, 4, 6, 8} * auto elements = make_counting_transform_iterator(0, [](auto i) { return i * 2;}); - * auto w = fixed_width_column_wrapper(elements, elements + 5, scale_type{0}); + * auto w = fixed_point_column_wrapper(elements, elements + 5, scale_type{0}); * @endcode * * @tparam FixedPointRepIterator Iterator for fixed_point::rep diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 38335f79a5c..4c50eb0362c 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -363,7 +363,7 @@ std::unique_ptr> write_parquet(parquet_writer_options const options.get_metadata(), options.is_enabled_return_filemetadata(), options.get_column_chunks_file_path(), - options.is_enabled_int96_timestamps()); + options.get_decimal_precision()); } /** @@ -399,8 +399,9 @@ std::shared_ptr write_parquet_chunked_begin( state->user_metadata_with_nullability = *op.get_nullable_metadata(); state->user_metadata = &state->user_metadata_with_nullability; } - state->int96_timestamps = op.is_enabled_int96_timestamps(); - state->stream = 0; + state->int96_timestamps = op.is_enabled_int96_timestamps(); + state->_decimal_precision = op.get_decimal_precision(); + state->stream = 0; state->wp->write_chunked_begin(*state); return state; } diff --git a/cpp/src/io/parquet/chunked_state.hpp b/cpp/src/io/parquet/chunked_state.hpp index 5bbc5366f70..d6758efe417 100644 --- a/cpp/src/io/parquet/chunked_state.hpp +++ b/cpp/src/io/parquet/chunked_state.hpp @@ -53,19 +53,24 @@ struct pq_chunked_state { /// only used in the write_chunked() case. copied from the (optionally) user supplied /// argument to write_parquet_chunked_begin() bool single_write_mode; - /// timestamps should be written as int96 types + /// timestamps should be written as int96 types bool int96_timestamps; + /// vector of precision values for decimal writing. Exactly one entry + /// per decimal column. + std::vector _decimal_precision; pq_chunked_state() = default; pq_chunked_state(table_metadata const* metadata, - SingleWriteMode mode = SingleWriteMode::NO, - bool write_int96_timestamps = false, - rmm::cuda_stream_view stream = rmm::cuda_stream_default) + SingleWriteMode mode = SingleWriteMode::NO, + bool write_int96_timestamps = false, + std::vector const& decimal_precision = {}, + rmm::cuda_stream_view stream = rmm::cuda_stream_default) : stream{stream}, user_metadata{metadata}, single_write_mode{mode == SingleWriteMode::YES}, - int96_timestamps(write_int96_timestamps) + int96_timestamps(write_int96_timestamps), + _decimal_precision(decimal_precision) { } }; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 29c75aee847..a71a3fd9c05 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -164,6 +164,8 @@ class parquet_column_view { std::vector const &nullability, const table_metadata *metadata, bool int96_timestamps, + std::vector const &decimal_precision, + uint &decimal_precision_idx, rmm::cuda_stream_view stream) : _col(col), _leaf_col(get_leaf_col(col)), @@ -295,6 +297,28 @@ class parquet_column_view { _converted_type = ConvertedType::UTF8; _stats_dtype = statistics_dtype::dtype_string; break; + case cudf::type_id::DECIMAL32: + _physical_type = Type::INT32; + _converted_type = ConvertedType::DECIMAL; + _stats_dtype = statistics_dtype::dtype_int32; + _decimal_scale = -_leaf_col.type().scale(); // parquet and cudf disagree about scale signs + CUDF_EXPECTS(decimal_precision.size() > decimal_precision_idx, + "Not enough decimal precision values passed for data!"); + CUDF_EXPECTS(decimal_precision[decimal_precision_idx] > _decimal_scale, + "Precision must be greater than scale!"); + _decimal_precision = decimal_precision[decimal_precision_idx++]; + break; + case cudf::type_id::DECIMAL64: + _physical_type = Type::INT64; + _converted_type = ConvertedType::DECIMAL; + _stats_dtype = statistics_dtype::dtype_decimal64; + _decimal_scale = -_leaf_col.type().scale(); // parquet and cudf disagree about scale signs + CUDF_EXPECTS(decimal_precision.size() > decimal_precision_idx, + "Not enough decimal precision values passed for data!"); + CUDF_EXPECTS(decimal_precision[decimal_precision_idx] > _decimal_scale, + "Precision must be greater than scale!"); + _decimal_precision = decimal_precision[decimal_precision_idx++]; + break; default: _physical_type = UNDEFINED_TYPE; _stats_dtype = dtype_none; @@ -381,6 +405,8 @@ class parquet_column_view { uint32_t const *nulls() const noexcept { return _nulls; } size_type offset() const noexcept { return _offset; } bool level_nullable(size_t level) const { return _nullability[level]; } + int32_t decimal_scale() const noexcept { return _decimal_scale; } + uint8_t decimal_precision() const noexcept { return _decimal_precision; } // List related data column_view cudf_col() const noexcept { return _col; } @@ -466,6 +492,10 @@ class parquet_column_view { // String-related members rmm::device_buffer _indexes; + + // Decimal-related members + int32_t _decimal_scale = 0; + uint8_t _decimal_precision = 0; }; void writer::impl::init_page_fragments(hostdevice_vector &frag, @@ -656,10 +686,11 @@ std::unique_ptr> writer::impl::write( const table_metadata *metadata, bool return_filemetadata, const std::string &column_chunks_file_path, - bool int96_timestamps, + std::vector const &decimal_precisions, rmm::cuda_stream_view stream) { - pq_chunked_state state{metadata, SingleWriteMode::YES, int96_timestamps, stream}; + pq_chunked_state state{ + metadata, SingleWriteMode::YES, int96_timestamps, decimal_precisions, stream}; write_chunked_begin(state); write_chunk(table, state); @@ -697,6 +728,8 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) ? std::vector>{} : get_per_column_nullability(table, state.user_metadata_with_nullability.column_nullable); + uint decimal_precision_idx = 0; + for (auto it = table.begin(); it < table.end(); ++it) { const auto col = *it; const auto current_id = parquet_columns.size(); @@ -714,9 +747,14 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) this_column_nullability, state.user_metadata, state.int96_timestamps, + state._decimal_precision, + decimal_precision_idx, state.stream); } + CUDF_EXPECTS(decimal_precision_idx == state._decimal_precision.size(), + "Too many decimal precision values!"); + // first call. setup metadata. num_rows will get incremented as write_chunk is // called multiple times. // Calculate the sum of depths of all list columns @@ -767,7 +805,9 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) list_schema[nesting_depth * 2].type = physical_type; list_schema[nesting_depth * 2].converted_type = physical_type == parquet::Type::INT96 ? ConvertedType::UNKNOWN : col.converted_type(); - list_schema[nesting_depth * 2].num_children = 0; + list_schema[nesting_depth * 2].num_children = 0; + list_schema[nesting_depth * 2].decimal_precision = col.decimal_precision(); + list_schema[nesting_depth * 2].decimal_scale = col.decimal_scale(); std::vector path_in_schema; std::transform( @@ -790,8 +830,10 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) ? OPTIONAL : REQUIRED; - col_schema.name = col.name(); - col_schema.num_children = 0; // Leaf node + col_schema.name = col.name(); + col_schema.num_children = 0; // Leaf node + col_schema.decimal_precision = col.decimal_precision(); + col_schema.decimal_scale = col.decimal_scale(); this_table_schema.push_back(std::move(col_schema)); } @@ -1225,11 +1267,11 @@ std::unique_ptr> writer::write(table_view const &table, const table_metadata *metadata, bool return_filemetadata, const std::string column_chunks_file_path, - bool int96_timestamps, + std::vector const &decimal_precisions, rmm::cuda_stream_view stream) { return _impl->write( - table, metadata, return_filemetadata, column_chunks_file_path, int96_timestamps, stream); + table, metadata, return_filemetadata, column_chunks_file_path, decimal_precisions, stream); } // Forward to implementation diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 75130c1881d..ae6446efb95 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -88,7 +88,7 @@ class writer::impl { const table_metadata* metadata, bool return_filemetadata, const std::string& column_chunks_file_path, - bool int96_timestamps, + std::vector const& decimal_precisions, rmm::cuda_stream_view stream); /** diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index ada2eadaa31..89cbb82c726 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -286,6 +287,14 @@ TEST_F(ParquetWriterTest, MultiColumn) auto col3_data = random_values(num_rows); auto col4_data = random_values(num_rows); auto col5_data = random_values(num_rows); + auto col6_vals = random_values(num_rows); + auto col7_vals = random_values(num_rows); + auto col6_data = cudf::test::make_counting_transform_iterator(0, [col6_vals](auto i) { + return numeric::decimal32{col6_vals[i], numeric::scale_type{5}}; + }); + auto col7_data = cudf::test::make_counting_transform_iterator(0, [col7_vals](auto i) { + return numeric::decimal64{col7_vals[i], numeric::scale_type{-5}}; + }); auto validity = cudf::test::make_counting_transform_iterator(0, [](auto i) { return true; }); // column_wrapper col0{ @@ -295,6 +304,8 @@ TEST_F(ParquetWriterTest, MultiColumn) column_wrapper col3{col3_data.begin(), col3_data.end(), validity}; column_wrapper col4{col4_data.begin(), col4_data.end(), validity}; column_wrapper col5{col5_data.begin(), col5_data.end(), validity}; + column_wrapper col6{col6_data, col6_data + num_rows, validity}; + column_wrapper col7{col7_data, col7_data + num_rows, validity}; cudf_io::table_metadata expected_metadata; // expected_metadata.column_names.emplace_back("bools"); @@ -303,6 +314,8 @@ TEST_F(ParquetWriterTest, MultiColumn) expected_metadata.column_names.emplace_back("int32s"); expected_metadata.column_names.emplace_back("floats"); expected_metadata.column_names.emplace_back("doubles"); + expected_metadata.column_names.emplace_back("decimal32s"); + expected_metadata.column_names.emplace_back("decimal64s"); std::vector> cols; // cols.push_back(col0.release()); @@ -311,13 +324,17 @@ TEST_F(ParquetWriterTest, MultiColumn) cols.push_back(col3.release()); cols.push_back(col4.release()); cols.push_back(col5.release()); + cols.push_back(col6.release()); + cols.push_back(col7.release()); auto expected = std::make_unique(std::move(cols)); - EXPECT_EQ(5, expected->num_columns()); + EXPECT_EQ(7, expected->num_columns()); auto filepath = temp_env->get_temp_filepath("MultiColumn.parquet"); cudf_io::parquet_writer_options out_opts = cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()) .metadata(&expected_metadata); + std::vector precisions = {10, 20}; + out_opts.set_decimal_precision(precisions); cudf_io::write_parquet(out_opts); cudf_io::parquet_reader_options in_opts = @@ -338,6 +355,14 @@ TEST_F(ParquetWriterTest, MultiColumnWithNulls) auto col3_data = random_values(num_rows); auto col4_data = random_values(num_rows); auto col5_data = random_values(num_rows); + auto col6_vals = random_values(num_rows); + auto col7_vals = random_values(num_rows); + auto col6_data = cudf::test::make_counting_transform_iterator(0, [col6_vals](auto i) { + return numeric::decimal32{col6_vals[i], numeric::scale_type{-2}}; + }); + auto col7_data = cudf::test::make_counting_transform_iterator(0, [col7_vals](auto i) { + return numeric::decimal64{col7_vals[i], numeric::scale_type{-8}}; + }); // auto col0_mask = cudf::test::make_counting_transform_iterator( // 0, [](auto i) { return (i % 2); }); auto col1_mask = cudf::test::make_counting_transform_iterator(0, [](auto i) { return (i < 10); }); @@ -347,6 +372,9 @@ TEST_F(ParquetWriterTest, MultiColumnWithNulls) auto col4_mask = cudf::test::make_counting_transform_iterator(0, [](auto i) { return (i >= 40 || i <= 60); }); auto col5_mask = cudf::test::make_counting_transform_iterator(0, [](auto i) { return (i > 80); }); + auto col6_mask = cudf::test::make_counting_transform_iterator(0, [](auto i) { return (i % 5); }); + auto col7_mask = + cudf::test::make_counting_transform_iterator(0, [](auto i) { return (i != 55); }); // column_wrapper col0{ // col0_data.begin(), col0_data.end(), col0_mask}; @@ -355,6 +383,8 @@ TEST_F(ParquetWriterTest, MultiColumnWithNulls) column_wrapper col3{col3_data.begin(), col3_data.end(), col3_mask}; column_wrapper col4{col4_data.begin(), col4_data.end(), col4_mask}; column_wrapper col5{col5_data.begin(), col5_data.end(), col5_mask}; + column_wrapper col6{col6_data, col6_data + num_rows, col6_mask}; + column_wrapper col7{col7_data, col7_data + num_rows, col7_mask}; cudf_io::table_metadata expected_metadata; // expected_metadata.column_names.emplace_back("bools"); @@ -363,6 +393,8 @@ TEST_F(ParquetWriterTest, MultiColumnWithNulls) expected_metadata.column_names.emplace_back("int32s"); expected_metadata.column_names.emplace_back("floats"); expected_metadata.column_names.emplace_back("doubles"); + expected_metadata.column_names.emplace_back("decimal32s"); + expected_metadata.column_names.emplace_back("decimal64s"); std::vector> cols; // cols.push_back(col0.release()); @@ -371,13 +403,18 @@ TEST_F(ParquetWriterTest, MultiColumnWithNulls) cols.push_back(col3.release()); cols.push_back(col4.release()); cols.push_back(col5.release()); + cols.push_back(col6.release()); + cols.push_back(col7.release()); auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(5, expected->num_columns()); + EXPECT_EQ(7, expected->num_columns()); auto filepath = temp_env->get_temp_filepath("MultiColumnWithNulls.parquet"); cudf_io::parquet_writer_options out_opts = cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()) .metadata(&expected_metadata); + std::vector precisions = {9, 20}; + out_opts.set_decimal_precision(precisions); + cudf_io::write_parquet(out_opts); cudf_io::parquet_reader_options in_opts = @@ -1282,6 +1319,61 @@ TEST_F(ParquetChunkedWriterTest, ReadRowGroupsError) EXPECT_THROW(cudf_io::read_parquet(read_opts), cudf::logic_error); } +TEST_F(ParquetChunkedWriterTest, DecimalWrite) +{ + constexpr cudf::size_type num_rows = 500; + auto seq_col0 = random_values(num_rows); + auto seq_col1 = random_values(num_rows); + + auto valids = cudf::test::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); + + auto col0 = cudf::test::fixed_point_column_wrapper{ + seq_col0.begin(), seq_col0.end(), valids, numeric::scale_type{5}}; + auto col1 = cudf::test::fixed_point_column_wrapper{ + seq_col1.begin(), seq_col1.end(), valids, numeric::scale_type{-9}}; + + auto table = table_view({col0, col1}); + + auto filepath = temp_env->get_temp_filepath("DecimalWrite.parquet"); + cudf_io::chunked_parquet_writer_options args = + cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath}); + + // verify failure if no decimal precision given + auto state = cudf_io::write_parquet_chunked_begin(args); + EXPECT_THROW(cudf_io::write_parquet_chunked(table, state), cudf::logic_error); + + // verify failure if too small a precision is given + std::vector precisions{7, 1}; + args.set_decimal_precision_data(precisions); + state = cudf_io::write_parquet_chunked_begin(args); + EXPECT_THROW(cudf_io::write_parquet_chunked(table, state), cudf::logic_error); + + // verify failure if too few precisions given + precisions.pop_back(); + args.set_decimal_precision_data(precisions); + state = cudf_io::write_parquet_chunked_begin(args); + EXPECT_THROW(cudf_io::write_parquet_chunked(table, state), cudf::logic_error); + + // verify failure if too many precisions given + precisions = {7, 14, 11}; + args.set_decimal_precision_data(precisions); + state = cudf_io::write_parquet_chunked_begin(args); + EXPECT_THROW(cudf_io::write_parquet_chunked(table, state), cudf::logic_error); + + // write correctly + precisions.pop_back(); + args.set_decimal_precision_data(precisions); + state = cudf_io::write_parquet_chunked_begin(args); + cudf_io::write_parquet_chunked(table, state); + cudf_io::write_parquet_chunked_end(state); + + cudf_io::parquet_reader_options read_opts = + cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath}); + auto result = cudf_io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, table); +} + TYPED_TEST(ParquetChunkedWriterNumericTypeTest, UnalignedSize) { // write out two 31 row tables and make sure they get