From 6d484983580d3cc13b914893c982b4bb7fe9829a Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 16 Dec 2020 00:42:21 +0000 Subject: [PATCH 1/5] Adding support to decimal writing. In order to write decimal values to parquet, the user must pass in a vector of uint8's that indicate the precision of the columns. --- cpp/include/cudf/fixed_point/fixed_point.hpp | 18 ++-- cpp/include/cudf/io/detail/parquet.hpp | 2 +- cpp/include/cudf/io/parquet.hpp | 28 ++++++ cpp/include/cudf_test/column_wrapper.hpp | 2 +- cpp/src/io/functions.cpp | 7 +- cpp/src/io/parquet/chunked_state.hpp | 5 +- cpp/src/io/parquet/writer_impl.cu | 54 +++++++++-- cpp/src/io/parquet/writer_impl.hpp | 2 +- cpp/tests/io/parquet_test.cpp | 97 +++++++++++++++++++- 9 files changed, 195 insertions(+), 20 deletions(-) diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index c8f9a73a1d6..07555903701 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -520,12 +520,18 @@ class fixed_point { */ explicit operator std::string() const { - int const n = std::pow(10, -_scale); - int const f = _value % n; - auto const num_zeros = std::max(0, (-_scale - static_cast(std::to_string(f).size()))); - auto const zeros = num_zeros <= 0 ? std::string("") : std::string(num_zeros, '0'); - return std::to_string(_value / n) + std::string(".") + zeros + - std::to_string(std::abs(_value) % n); + if (_scale < 0) { + int const n = std::pow(10, -_scale); + int const f = _value % n; + auto const num_zeros = + std::max(0, (-_scale - static_cast(std::to_string(f).size()))); + auto const zeros = num_zeros <= 0 ? std::string("") : std::string(num_zeros, '0'); + return std::to_string(_value / n) + std::string(".") + zeros + + std::to_string(std::abs(_value) % n); + } else { + auto const zeros = _scale > 0 ? std::string(_scale, '0') : std::string(""); + return std::to_string(_value) + zeros; + } } }; // namespace numeric diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index 1769c72e1c8..ba726e64d6d 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -116,7 +116,7 @@ class writer { const table_metadata* metadata = nullptr, bool return_filemetadata = false, const std::string column_chunks_file_path = "", - bool int96_timestamps = false, + std::vector* decimal_precision = nullptr, rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index e6eebec42d8..1b164f63da2 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -401,6 +401,9 @@ class parquet_writer_options { bool _write_timestamps_as_int96 = false; // Column chunks file path to be set in the raw output metadata std::string _column_chunks_file_path; + /// vector of precision values for decimal writing. Exactly one entry + /// per decimal column. + std::vector* _decimal_precisions = nullptr; /** * @brief Constructor from sink and table. @@ -480,6 +483,11 @@ class parquet_writer_options { */ std::string get_column_chunks_file_path() const { return _column_chunks_file_path; } + /** + * @brief Returns a pointer to the decimal precision vector. + */ + std::vector* get_decimal_precisions() const { return _decimal_precisions; } + /** * @brief Sets metadata. * @@ -525,6 +533,11 @@ class parquet_writer_options { { _column_chunks_file_path.assign(file_path); } + + /** + * @brief Sets a pointer to the decimal precision vector. + */ + void set_decimal_precisions(std::vector* dp) { _decimal_precisions = dp; } }; class parquet_writer_options_builder { @@ -687,6 +700,8 @@ class chunked_parquet_writer_options { const table_metadata_with_nullability* _nullable_metadata = nullptr; // Parquet writes can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS. bool _write_timestamps_as_int96 = false; + // Optional decimal precision data - must be present if writing decimals + const std::vector* _decimal_precision = nullptr; /** * @brief Constructor from sink. @@ -728,6 +743,11 @@ class chunked_parquet_writer_options { return _nullable_metadata; } + /** + * @brief Returns decimal precision pointer. + */ + const std::vector* get_decimal_precision() const { return _decimal_precision; } + /** * @brief Returns `true` if timestamps will be written as INT96 */ @@ -743,6 +763,14 @@ class chunked_parquet_writer_options { _nullable_metadata = metadata; } + /** + * @brief Sets decimal precision data. + * + * @param v Vector of precision data flattened with exactly one entry per + * decimal column. + */ + void set_decimal_precision_data(const std::vector* v) { _decimal_precision = v; } + /** * @brief Sets the level of statistics in parquet_writer_options. * diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index 83aab53cc43..c12fa5a6124 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -513,7 +513,7 @@ class fixed_point_column_wrapper : public detail::column_wrapper { * @code{.cpp} * // Creates a non-nullable column of INT32 elements with 5 elements: {0, 2, 4, 6, 8} * auto elements = make_counting_transform_iterator(0, [](auto i) { return i * 2;}); - * auto w = fixed_width_column_wrapper(elements, elements + 5, scale_type{0}); + * auto w = fixed_point_column_wrapper(elements, elements + 5, scale_type{0}); * @endcode * * @tparam FixedPointRepIterator Iterator for fixed_point::rep diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 38335f79a5c..a5afd3419ca 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -363,7 +363,7 @@ std::unique_ptr> write_parquet(parquet_writer_options const options.get_metadata(), options.is_enabled_return_filemetadata(), options.get_column_chunks_file_path(), - options.is_enabled_int96_timestamps()); + options.get_decimal_precisions()); } /** @@ -400,7 +400,10 @@ std::shared_ptr write_parquet_chunked_begin( state->user_metadata = &state->user_metadata_with_nullability; } state->int96_timestamps = op.is_enabled_int96_timestamps(); - state->stream = 0; + if (op.get_decimal_precision() != nullptr) { + state->decimal_precisions = *op.get_decimal_precision(); + } + state->stream = 0; state->wp->write_chunked_begin(*state); return state; } diff --git a/cpp/src/io/parquet/chunked_state.hpp b/cpp/src/io/parquet/chunked_state.hpp index 5bbc5366f70..b8a204911ed 100644 --- a/cpp/src/io/parquet/chunked_state.hpp +++ b/cpp/src/io/parquet/chunked_state.hpp @@ -53,8 +53,11 @@ struct pq_chunked_state { /// only used in the write_chunked() case. copied from the (optionally) user supplied /// argument to write_parquet_chunked_begin() bool single_write_mode; - /// timestamps should be written as int96 types + /// timestamps should be written as int96 types bool int96_timestamps; + /// vector of precision values for decimal writing. Exactly one entry + /// per decimal column. + std::vector decimal_precisions; pq_chunked_state() = default; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 29c75aee847..a7ce251ceeb 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -164,6 +164,8 @@ class parquet_column_view { std::vector const &nullability, const table_metadata *metadata, bool int96_timestamps, + const std::vector &decimal_precision, + uint &decimal_precision_idx, rmm::cuda_stream_view stream) : _col(col), _leaf_col(get_leaf_col(col)), @@ -295,6 +297,28 @@ class parquet_column_view { _converted_type = ConvertedType::UTF8; _stats_dtype = statistics_dtype::dtype_string; break; + case cudf::type_id::DECIMAL32: + _physical_type = Type::INT32; + _converted_type = ConvertedType::DECIMAL; + _stats_dtype = statistics_dtype::dtype_int32; + _decimal_scale = -_leaf_col.type().scale(); // parquet and cudf disagree about scale signs + CUDF_EXPECTS(decimal_precision.size() > decimal_precision_idx, + "Not enough decimal precision values passed for data!"); + CUDF_EXPECTS(decimal_precision[decimal_precision_idx] > _decimal_scale, + "Precision must be greater than scale!"); + _decimal_precision = decimal_precision[decimal_precision_idx++]; + break; + case cudf::type_id::DECIMAL64: + _physical_type = Type::INT64; + _converted_type = ConvertedType::DECIMAL; + _stats_dtype = statistics_dtype::dtype_decimal64; + _decimal_scale = -_leaf_col.type().scale(); // parquet and cudf disagree about scale signs + CUDF_EXPECTS(decimal_precision.size() > decimal_precision_idx, + "Not enough decimal precision values passed for data!"); + CUDF_EXPECTS(decimal_precision[decimal_precision_idx] > _decimal_scale, + "Precision must be greater than scale!"); + _decimal_precision = decimal_precision[decimal_precision_idx++]; + break; default: _physical_type = UNDEFINED_TYPE; _stats_dtype = dtype_none; @@ -381,6 +405,8 @@ class parquet_column_view { uint32_t const *nulls() const noexcept { return _nulls; } size_type offset() const noexcept { return _offset; } bool level_nullable(size_t level) const { return _nullability[level]; } + int32_t decimal_scale() const noexcept { return _decimal_scale; } + uint8_t decimal_precision() const noexcept { return _decimal_precision; } // List related data column_view cudf_col() const noexcept { return _col; } @@ -466,6 +492,10 @@ class parquet_column_view { // String-related members rmm::device_buffer _indexes; + + // Decimal-related members + int32_t _decimal_scale = 0; + uint8_t _decimal_precision = 0; }; void writer::impl::init_page_fragments(hostdevice_vector &frag, @@ -656,11 +686,12 @@ std::unique_ptr> writer::impl::write( const table_metadata *metadata, bool return_filemetadata, const std::string &column_chunks_file_path, - bool int96_timestamps, + std::vector *decimal_precisions, rmm::cuda_stream_view stream) { pq_chunked_state state{metadata, SingleWriteMode::YES, int96_timestamps, stream}; + if (decimal_precisions != nullptr) { state.decimal_precisions = *decimal_precisions; } write_chunked_begin(state); write_chunk(table, state); return write_chunked_end(state, return_filemetadata, column_chunks_file_path); @@ -697,6 +728,8 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) ? std::vector>{} : get_per_column_nullability(table, state.user_metadata_with_nullability.column_nullable); + uint decimal_precision_idx = 0; + for (auto it = table.begin(); it < table.end(); ++it) { const auto col = *it; const auto current_id = parquet_columns.size(); @@ -714,9 +747,14 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) this_column_nullability, state.user_metadata, state.int96_timestamps, + state.decimal_precisions, + decimal_precision_idx, state.stream); } + CUDF_EXPECTS(decimal_precision_idx == state.decimal_precisions.size(), + "Too many decimal precision values!"); + // first call. setup metadata. num_rows will get incremented as write_chunk is // called multiple times. // Calculate the sum of depths of all list columns @@ -767,7 +805,9 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) list_schema[nesting_depth * 2].type = physical_type; list_schema[nesting_depth * 2].converted_type = physical_type == parquet::Type::INT96 ? ConvertedType::UNKNOWN : col.converted_type(); - list_schema[nesting_depth * 2].num_children = 0; + list_schema[nesting_depth * 2].num_children = 0; + list_schema[nesting_depth * 2].decimal_precision = col.decimal_precision(); + list_schema[nesting_depth * 2].decimal_scale = col.decimal_scale(); std::vector path_in_schema; std::transform( @@ -790,8 +830,10 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) ? OPTIONAL : REQUIRED; - col_schema.name = col.name(); - col_schema.num_children = 0; // Leaf node + col_schema.name = col.name(); + col_schema.num_children = 0; // Leaf node + col_schema.decimal_precision = col.decimal_precision(); + col_schema.decimal_scale = col.decimal_scale(); this_table_schema.push_back(std::move(col_schema)); } @@ -1225,11 +1267,11 @@ std::unique_ptr> writer::write(table_view const &table, const table_metadata *metadata, bool return_filemetadata, const std::string column_chunks_file_path, - bool int96_timestamps, + std::vector *decimal_precisions, rmm::cuda_stream_view stream) { return _impl->write( - table, metadata, return_filemetadata, column_chunks_file_path, int96_timestamps, stream); + table, metadata, return_filemetadata, column_chunks_file_path, decimal_precisions, stream); } // Forward to implementation diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 75130c1881d..8450d8ac4ce 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -88,7 +88,7 @@ class writer::impl { const table_metadata* metadata, bool return_filemetadata, const std::string& column_chunks_file_path, - bool int96_timestamps, + std::vector* decimal_precisions, rmm::cuda_stream_view stream); /** diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index ada2eadaa31..39e60708a94 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -286,6 +287,14 @@ TEST_F(ParquetWriterTest, MultiColumn) auto col3_data = random_values(num_rows); auto col4_data = random_values(num_rows); auto col5_data = random_values(num_rows); + auto col6_vals = random_values(num_rows); + auto col7_vals = random_values(num_rows); + auto col6_data = cudf::test::make_counting_transform_iterator(0, [col6_vals](auto i) { + return numeric::decimal32{col6_vals[i], numeric::scale_type{5}}; + }); + auto col7_data = cudf::test::make_counting_transform_iterator(0, [col7_vals](auto i) { + return numeric::decimal64{col7_vals[i], numeric::scale_type{-5}}; + }); auto validity = cudf::test::make_counting_transform_iterator(0, [](auto i) { return true; }); // column_wrapper col0{ @@ -295,6 +304,8 @@ TEST_F(ParquetWriterTest, MultiColumn) column_wrapper col3{col3_data.begin(), col3_data.end(), validity}; column_wrapper col4{col4_data.begin(), col4_data.end(), validity}; column_wrapper col5{col5_data.begin(), col5_data.end(), validity}; + column_wrapper col6{col6_data, col6_data + num_rows, validity}; + column_wrapper col7{col7_data, col7_data + num_rows, validity}; cudf_io::table_metadata expected_metadata; // expected_metadata.column_names.emplace_back("bools"); @@ -303,6 +314,8 @@ TEST_F(ParquetWriterTest, MultiColumn) expected_metadata.column_names.emplace_back("int32s"); expected_metadata.column_names.emplace_back("floats"); expected_metadata.column_names.emplace_back("doubles"); + expected_metadata.column_names.emplace_back("decimal32s"); + expected_metadata.column_names.emplace_back("decimal64s"); std::vector> cols; // cols.push_back(col0.release()); @@ -311,13 +324,17 @@ TEST_F(ParquetWriterTest, MultiColumn) cols.push_back(col3.release()); cols.push_back(col4.release()); cols.push_back(col5.release()); + cols.push_back(col6.release()); + cols.push_back(col7.release()); auto expected = std::make_unique(std::move(cols)); - EXPECT_EQ(5, expected->num_columns()); + EXPECT_EQ(7, expected->num_columns()); auto filepath = temp_env->get_temp_filepath("MultiColumn.parquet"); cudf_io::parquet_writer_options out_opts = cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()) .metadata(&expected_metadata); + std::vector precisions = {10, 20}; + out_opts.set_decimal_precisions(&precisions); cudf_io::write_parquet(out_opts); cudf_io::parquet_reader_options in_opts = @@ -338,6 +355,14 @@ TEST_F(ParquetWriterTest, MultiColumnWithNulls) auto col3_data = random_values(num_rows); auto col4_data = random_values(num_rows); auto col5_data = random_values(num_rows); + auto col6_vals = random_values(num_rows); + auto col7_vals = random_values(num_rows); + auto col6_data = cudf::test::make_counting_transform_iterator(0, [col6_vals](auto i) { + return numeric::decimal32{col6_vals[i], numeric::scale_type{-2}}; + }); + auto col7_data = cudf::test::make_counting_transform_iterator(0, [col7_vals](auto i) { + return numeric::decimal64{col7_vals[i], numeric::scale_type{-8}}; + }); // auto col0_mask = cudf::test::make_counting_transform_iterator( // 0, [](auto i) { return (i % 2); }); auto col1_mask = cudf::test::make_counting_transform_iterator(0, [](auto i) { return (i < 10); }); @@ -347,6 +372,9 @@ TEST_F(ParquetWriterTest, MultiColumnWithNulls) auto col4_mask = cudf::test::make_counting_transform_iterator(0, [](auto i) { return (i >= 40 || i <= 60); }); auto col5_mask = cudf::test::make_counting_transform_iterator(0, [](auto i) { return (i > 80); }); + auto col6_mask = cudf::test::make_counting_transform_iterator(0, [](auto i) { return (i % 5); }); + auto col7_mask = + cudf::test::make_counting_transform_iterator(0, [](auto i) { return (i != 55); }); // column_wrapper col0{ // col0_data.begin(), col0_data.end(), col0_mask}; @@ -355,6 +383,8 @@ TEST_F(ParquetWriterTest, MultiColumnWithNulls) column_wrapper col3{col3_data.begin(), col3_data.end(), col3_mask}; column_wrapper col4{col4_data.begin(), col4_data.end(), col4_mask}; column_wrapper col5{col5_data.begin(), col5_data.end(), col5_mask}; + column_wrapper col6{col6_data, col6_data + num_rows, col6_mask}; + column_wrapper col7{col7_data, col7_data + num_rows, col7_mask}; cudf_io::table_metadata expected_metadata; // expected_metadata.column_names.emplace_back("bools"); @@ -363,6 +393,8 @@ TEST_F(ParquetWriterTest, MultiColumnWithNulls) expected_metadata.column_names.emplace_back("int32s"); expected_metadata.column_names.emplace_back("floats"); expected_metadata.column_names.emplace_back("doubles"); + expected_metadata.column_names.emplace_back("decimal32s"); + expected_metadata.column_names.emplace_back("decimal64s"); std::vector> cols; // cols.push_back(col0.release()); @@ -371,13 +403,18 @@ TEST_F(ParquetWriterTest, MultiColumnWithNulls) cols.push_back(col3.release()); cols.push_back(col4.release()); cols.push_back(col5.release()); + cols.push_back(col6.release()); + cols.push_back(col7.release()); auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(5, expected->num_columns()); + EXPECT_EQ(7, expected->num_columns()); auto filepath = temp_env->get_temp_filepath("MultiColumnWithNulls.parquet"); cudf_io::parquet_writer_options out_opts = cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()) .metadata(&expected_metadata); + std::vector precisions = {9, 20}; + out_opts.set_decimal_precisions(&precisions); + cudf_io::write_parquet(out_opts); cudf_io::parquet_reader_options in_opts = @@ -1282,6 +1319,62 @@ TEST_F(ParquetChunkedWriterTest, ReadRowGroupsError) EXPECT_THROW(cudf_io::read_parquet(read_opts), cudf::logic_error); } +TEST_F(ParquetChunkedWriterTest, DecimalWrite) +{ + constexpr cudf::size_type num_rows = 500; + auto seq_col0 = random_values(num_rows); + auto seq_col1 = random_values(num_rows); + + auto valids = cudf::test::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + auto col0 = cudf::test::fixed_point_column_wrapper{ + seq_col0.begin(), seq_col0.end(), valids, numeric::scale_type{5}}; + auto col1 = cudf::test::fixed_point_column_wrapper{ + seq_col1.begin(), seq_col1.end(), valids, numeric::scale_type{-9}}; + + auto table = table_view({col0, col1}); + + auto filepath = temp_env->get_temp_filepath("DecimalWrite.parquet"); + cudf_io::chunked_parquet_writer_options args = + cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath}); + + // verify failure if no decimal precision given + auto state = cudf_io::write_parquet_chunked_begin(args); + EXPECT_THROW(cudf_io::write_parquet_chunked(table, state), cudf::logic_error); + + // verify failure if too small a precision is given + std::vector precisions{7, 1}; + args.set_decimal_precision_data(&precisions); + state = cudf_io::write_parquet_chunked_begin(args); + EXPECT_THROW(cudf_io::write_parquet_chunked(table, state), cudf::logic_error); + + // verify failure if too few precisions given + precisions.pop_back(); + args.set_decimal_precision_data(&precisions); + state = cudf_io::write_parquet_chunked_begin(args); + EXPECT_THROW(cudf_io::write_parquet_chunked(table, state), cudf::logic_error); + + // verify failure if too many precisions given + precisions = {7, 14, 11}; + args.set_decimal_precision_data(&precisions); + state = cudf_io::write_parquet_chunked_begin(args); + EXPECT_THROW(cudf_io::write_parquet_chunked(table, state), cudf::logic_error); + + // write correctly + precisions.pop_back(); + args.set_decimal_precision_data(&precisions); + state = cudf_io::write_parquet_chunked_begin(args); + cudf_io::write_parquet_chunked(table, state); + cudf_io::write_parquet_chunked_end(state); + + cudf_io::parquet_reader_options read_opts = + cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath}); + auto result = cudf_io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, table); +} + TYPED_TEST(ParquetChunkedWriterNumericTypeTest, UnalignedSize) { // write out two 31 row tables and make sure they get From 852f8c1247c0526bbc5f2bb7c2c4cff455d857d2 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 17 Dec 2020 14:31:35 -0500 Subject: [PATCH 2/5] Apply suggestions from code review Co-authored-by: Vukasin Milovanovic --- cpp/include/cudf/fixed_point/fixed_point.hpp | 4 ++-- cpp/tests/io/parquet_test.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index 07555903701..d71f7ce005f 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -525,11 +525,11 @@ class fixed_point { int const f = _value % n; auto const num_zeros = std::max(0, (-_scale - static_cast(std::to_string(f).size()))); - auto const zeros = num_zeros <= 0 ? std::string("") : std::string(num_zeros, '0'); + auto const zeros = std::string(num_zeros, '0'); return std::to_string(_value / n) + std::string(".") + zeros + std::to_string(std::abs(_value) % n); } else { - auto const zeros = _scale > 0 ? std::string(_scale, '0') : std::string(""); + auto const zeros = std::string(_scale, '0'); return std::to_string(_value) + zeros; } } diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 39e60708a94..97da0a5f16d 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -1326,7 +1326,7 @@ TEST_F(ParquetChunkedWriterTest, DecimalWrite) auto seq_col1 = random_values(num_rows); auto valids = cudf::test::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0 ? true : false; }); + 0, [](auto i) { return i % 2 == 0; }); auto col0 = cudf::test::fixed_point_column_wrapper{ seq_col0.begin(), seq_col0.end(), valids, numeric::scale_type{5}}; From 7b10c637095080539b19f2cd1185a29e1f41ece7 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 17 Dec 2020 22:10:52 +0000 Subject: [PATCH 3/5] updates based on review comments --- cpp/include/cudf/io/detail/parquet.hpp | 10 +++++----- cpp/include/cudf/io/parquet.hpp | 14 +++++++------- cpp/src/io/functions.cpp | 8 +++----- cpp/src/io/parquet/chunked_state.hpp | 12 +++++++----- cpp/src/io/parquet/writer_impl.cu | 14 +++++++------- cpp/src/io/parquet/writer_impl.hpp | 2 +- cpp/tests/io/parquet_test.cpp | 15 +++++++-------- 7 files changed, 37 insertions(+), 38 deletions(-) diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index ba726e64d6d..163d8c9d735 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -113,11 +113,11 @@ class writer { */ std::unique_ptr> write( table_view const& table, - const table_metadata* metadata = nullptr, - bool return_filemetadata = false, - const std::string column_chunks_file_path = "", - std::vector* decimal_precision = nullptr, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); + const table_metadata* metadata = nullptr, + bool return_filemetadata = false, + const std::string column_chunks_file_path = "", + std::vector const& decimal_precision = {}, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Begins the chunked/streamed write process. diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 1b164f63da2..f67dae0218b 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -402,8 +402,8 @@ class parquet_writer_options { // Column chunks file path to be set in the raw output metadata std::string _column_chunks_file_path; /// vector of precision values for decimal writing. Exactly one entry - /// per decimal column. - std::vector* _decimal_precisions = nullptr; + /// per decimal column. Optional unless decimals are being written. + std::vector _decimal_precisions; /** * @brief Constructor from sink and table. @@ -486,7 +486,7 @@ class parquet_writer_options { /** * @brief Returns a pointer to the decimal precision vector. */ - std::vector* get_decimal_precisions() const { return _decimal_precisions; } + std::vector const& get_decimal_precisions() const { return _decimal_precisions; } /** * @brief Sets metadata. @@ -537,7 +537,7 @@ class parquet_writer_options { /** * @brief Sets a pointer to the decimal precision vector. */ - void set_decimal_precisions(std::vector* dp) { _decimal_precisions = dp; } + void set_decimal_precisions(std::vector const& dp) { _decimal_precisions = dp; } }; class parquet_writer_options_builder { @@ -701,7 +701,7 @@ class chunked_parquet_writer_options { // Parquet writes can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS. bool _write_timestamps_as_int96 = false; // Optional decimal precision data - must be present if writing decimals - const std::vector* _decimal_precision = nullptr; + std::vector _decimal_precision = {}; /** * @brief Constructor from sink. @@ -746,7 +746,7 @@ class chunked_parquet_writer_options { /** * @brief Returns decimal precision pointer. */ - const std::vector* get_decimal_precision() const { return _decimal_precision; } + std::vector const& get_decimal_precision() const { return _decimal_precision; } /** * @brief Returns `true` if timestamps will be written as INT96 @@ -769,7 +769,7 @@ class chunked_parquet_writer_options { * @param v Vector of precision data flattened with exactly one entry per * decimal column. */ - void set_decimal_precision_data(const std::vector* v) { _decimal_precision = v; } + void set_decimal_precision_data(std::vector const& v) { _decimal_precision = v; } /** * @brief Sets the level of statistics in parquet_writer_options. diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index a5afd3419ca..4fa26ef813c 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -399,11 +399,9 @@ std::shared_ptr write_parquet_chunked_begin( state->user_metadata_with_nullability = *op.get_nullable_metadata(); state->user_metadata = &state->user_metadata_with_nullability; } - state->int96_timestamps = op.is_enabled_int96_timestamps(); - if (op.get_decimal_precision() != nullptr) { - state->decimal_precisions = *op.get_decimal_precision(); - } - state->stream = 0; + state->int96_timestamps = op.is_enabled_int96_timestamps(); + state->_decimal_precisions = op.get_decimal_precision(); + state->stream = 0; state->wp->write_chunked_begin(*state); return state; } diff --git a/cpp/src/io/parquet/chunked_state.hpp b/cpp/src/io/parquet/chunked_state.hpp index b8a204911ed..e78bd50d36c 100644 --- a/cpp/src/io/parquet/chunked_state.hpp +++ b/cpp/src/io/parquet/chunked_state.hpp @@ -57,18 +57,20 @@ struct pq_chunked_state { bool int96_timestamps; /// vector of precision values for decimal writing. Exactly one entry /// per decimal column. - std::vector decimal_precisions; + std::vector _decimal_precisions; pq_chunked_state() = default; pq_chunked_state(table_metadata const* metadata, - SingleWriteMode mode = SingleWriteMode::NO, - bool write_int96_timestamps = false, - rmm::cuda_stream_view stream = rmm::cuda_stream_default) + SingleWriteMode mode = SingleWriteMode::NO, + bool write_int96_timestamps = false, + std::vector const& decimal_precisions = {}, + rmm::cuda_stream_view stream = rmm::cuda_stream_default) : stream{stream}, user_metadata{metadata}, single_write_mode{mode == SingleWriteMode::YES}, - int96_timestamps(write_int96_timestamps) + int96_timestamps(write_int96_timestamps), + _decimal_precisions(decimal_precisions) { } }; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index a7ce251ceeb..9dc60a72f8f 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -164,7 +164,7 @@ class parquet_column_view { std::vector const &nullability, const table_metadata *metadata, bool int96_timestamps, - const std::vector &decimal_precision, + std::vector const &decimal_precision, uint &decimal_precision_idx, rmm::cuda_stream_view stream) : _col(col), @@ -686,12 +686,12 @@ std::unique_ptr> writer::impl::write( const table_metadata *metadata, bool return_filemetadata, const std::string &column_chunks_file_path, - std::vector *decimal_precisions, + std::vector const &decimal_precisions, rmm::cuda_stream_view stream) { - pq_chunked_state state{metadata, SingleWriteMode::YES, int96_timestamps, stream}; + pq_chunked_state state{ + metadata, SingleWriteMode::YES, int96_timestamps, decimal_precisions, stream}; - if (decimal_precisions != nullptr) { state.decimal_precisions = *decimal_precisions; } write_chunked_begin(state); write_chunk(table, state); return write_chunked_end(state, return_filemetadata, column_chunks_file_path); @@ -747,12 +747,12 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) this_column_nullability, state.user_metadata, state.int96_timestamps, - state.decimal_precisions, + state._decimal_precisions, decimal_precision_idx, state.stream); } - CUDF_EXPECTS(decimal_precision_idx == state.decimal_precisions.size(), + CUDF_EXPECTS(decimal_precision_idx == state._decimal_precisions.size(), "Too many decimal precision values!"); // first call. setup metadata. num_rows will get incremented as write_chunk is @@ -1267,7 +1267,7 @@ std::unique_ptr> writer::write(table_view const &table, const table_metadata *metadata, bool return_filemetadata, const std::string column_chunks_file_path, - std::vector *decimal_precisions, + std::vector const &decimal_precisions, rmm::cuda_stream_view stream) { return _impl->write( diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 8450d8ac4ce..ae6446efb95 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -88,7 +88,7 @@ class writer::impl { const table_metadata* metadata, bool return_filemetadata, const std::string& column_chunks_file_path, - std::vector* decimal_precisions, + std::vector const& decimal_precisions, rmm::cuda_stream_view stream); /** diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 97da0a5f16d..c464a4847a2 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -334,7 +334,7 @@ TEST_F(ParquetWriterTest, MultiColumn) cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()) .metadata(&expected_metadata); std::vector precisions = {10, 20}; - out_opts.set_decimal_precisions(&precisions); + out_opts.set_decimal_precisions(precisions); cudf_io::write_parquet(out_opts); cudf_io::parquet_reader_options in_opts = @@ -413,7 +413,7 @@ TEST_F(ParquetWriterTest, MultiColumnWithNulls) cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()) .metadata(&expected_metadata); std::vector precisions = {9, 20}; - out_opts.set_decimal_precisions(&precisions); + out_opts.set_decimal_precisions(precisions); cudf_io::write_parquet(out_opts); @@ -1325,8 +1325,7 @@ TEST_F(ParquetChunkedWriterTest, DecimalWrite) auto seq_col0 = random_values(num_rows); auto seq_col1 = random_values(num_rows); - auto valids = cudf::test::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0; }); + auto valids = cudf::test::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); auto col0 = cudf::test::fixed_point_column_wrapper{ seq_col0.begin(), seq_col0.end(), valids, numeric::scale_type{5}}; @@ -1345,25 +1344,25 @@ TEST_F(ParquetChunkedWriterTest, DecimalWrite) // verify failure if too small a precision is given std::vector precisions{7, 1}; - args.set_decimal_precision_data(&precisions); + args.set_decimal_precision_data(precisions); state = cudf_io::write_parquet_chunked_begin(args); EXPECT_THROW(cudf_io::write_parquet_chunked(table, state), cudf::logic_error); // verify failure if too few precisions given precisions.pop_back(); - args.set_decimal_precision_data(&precisions); + args.set_decimal_precision_data(precisions); state = cudf_io::write_parquet_chunked_begin(args); EXPECT_THROW(cudf_io::write_parquet_chunked(table, state), cudf::logic_error); // verify failure if too many precisions given precisions = {7, 14, 11}; - args.set_decimal_precision_data(&precisions); + args.set_decimal_precision_data(precisions); state = cudf_io::write_parquet_chunked_begin(args); EXPECT_THROW(cudf_io::write_parquet_chunked(table, state), cudf::logic_error); // write correctly precisions.pop_back(); - args.set_decimal_precision_data(&precisions); + args.set_decimal_precision_data(precisions); state = cudf_io::write_parquet_chunked_begin(args); cudf_io::write_parquet_chunked(table, state); cudf_io::write_parquet_chunked_end(state); From 23294c0a2bcee7aa69af2dc41edec542f530809c Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Mon, 4 Jan 2021 19:41:26 +0000 Subject: [PATCH 4/5] Review change and some comment fixes to update to current code --- cpp/include/cudf/io/parquet.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index f67dae0218b..f5527b748e9 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -484,7 +484,7 @@ class parquet_writer_options { std::string get_column_chunks_file_path() const { return _column_chunks_file_path; } /** - * @brief Returns a pointer to the decimal precision vector. + * @brief Returns a constant reference to the decimal precision vector. */ std::vector const& get_decimal_precisions() const { return _decimal_precisions; } @@ -535,9 +535,9 @@ class parquet_writer_options { } /** - * @brief Sets a pointer to the decimal precision vector. + * @brief Sets the decimal precision vector data. */ - void set_decimal_precisions(std::vector const& dp) { _decimal_precisions = dp; } + void set_decimal_precisions(std::vector dp) { _decimal_precisions = std::move(dp); } }; class parquet_writer_options_builder { From a1e1bcc16c7a9e49c7a97c5001725ac1c5652fd1 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Mon, 4 Jan 2021 21:32:25 +0000 Subject: [PATCH 5/5] normalizing on singular `decimal_precision` and fixing a comment typo --- cpp/include/cudf/io/parquet.hpp | 6 +++--- cpp/include/cudf_test/column_wrapper.hpp | 2 +- cpp/src/io/functions.cpp | 8 ++++---- cpp/src/io/parquet/chunked_state.hpp | 12 ++++++------ cpp/src/io/parquet/writer_impl.cu | 4 ++-- cpp/tests/io/parquet_test.cpp | 4 ++-- 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index f5527b748e9..177c2da8a9d 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -403,7 +403,7 @@ class parquet_writer_options { std::string _column_chunks_file_path; /// vector of precision values for decimal writing. Exactly one entry /// per decimal column. Optional unless decimals are being written. - std::vector _decimal_precisions; + std::vector _decimal_precision; /** * @brief Constructor from sink and table. @@ -486,7 +486,7 @@ class parquet_writer_options { /** * @brief Returns a constant reference to the decimal precision vector. */ - std::vector const& get_decimal_precisions() const { return _decimal_precisions; } + std::vector const& get_decimal_precision() const { return _decimal_precision; } /** * @brief Sets metadata. @@ -537,7 +537,7 @@ class parquet_writer_options { /** * @brief Sets the decimal precision vector data. */ - void set_decimal_precisions(std::vector dp) { _decimal_precisions = std::move(dp); } + void set_decimal_precision(std::vector dp) { _decimal_precision = std::move(dp); } }; class parquet_writer_options_builder { diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index c12fa5a6124..3dd14984d03 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -511,7 +511,7 @@ class fixed_point_column_wrapper : public detail::column_wrapper { * * Example: * @code{.cpp} - * // Creates a non-nullable column of INT32 elements with 5 elements: {0, 2, 4, 6, 8} + * // Creates a non-nullable column of DECIMAL32 elements with 5 elements: {0, 2, 4, 6, 8} * auto elements = make_counting_transform_iterator(0, [](auto i) { return i * 2;}); * auto w = fixed_point_column_wrapper(elements, elements + 5, scale_type{0}); * @endcode diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 4fa26ef813c..4c50eb0362c 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -363,7 +363,7 @@ std::unique_ptr> write_parquet(parquet_writer_options const options.get_metadata(), options.is_enabled_return_filemetadata(), options.get_column_chunks_file_path(), - options.get_decimal_precisions()); + options.get_decimal_precision()); } /** @@ -399,9 +399,9 @@ std::shared_ptr write_parquet_chunked_begin( state->user_metadata_with_nullability = *op.get_nullable_metadata(); state->user_metadata = &state->user_metadata_with_nullability; } - state->int96_timestamps = op.is_enabled_int96_timestamps(); - state->_decimal_precisions = op.get_decimal_precision(); - state->stream = 0; + state->int96_timestamps = op.is_enabled_int96_timestamps(); + state->_decimal_precision = op.get_decimal_precision(); + state->stream = 0; state->wp->write_chunked_begin(*state); return state; } diff --git a/cpp/src/io/parquet/chunked_state.hpp b/cpp/src/io/parquet/chunked_state.hpp index e78bd50d36c..d6758efe417 100644 --- a/cpp/src/io/parquet/chunked_state.hpp +++ b/cpp/src/io/parquet/chunked_state.hpp @@ -57,20 +57,20 @@ struct pq_chunked_state { bool int96_timestamps; /// vector of precision values for decimal writing. Exactly one entry /// per decimal column. - std::vector _decimal_precisions; + std::vector _decimal_precision; pq_chunked_state() = default; pq_chunked_state(table_metadata const* metadata, - SingleWriteMode mode = SingleWriteMode::NO, - bool write_int96_timestamps = false, - std::vector const& decimal_precisions = {}, - rmm::cuda_stream_view stream = rmm::cuda_stream_default) + SingleWriteMode mode = SingleWriteMode::NO, + bool write_int96_timestamps = false, + std::vector const& decimal_precision = {}, + rmm::cuda_stream_view stream = rmm::cuda_stream_default) : stream{stream}, user_metadata{metadata}, single_write_mode{mode == SingleWriteMode::YES}, int96_timestamps(write_int96_timestamps), - _decimal_precisions(decimal_precisions) + _decimal_precision(decimal_precision) { } }; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 9dc60a72f8f..a71a3fd9c05 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -747,12 +747,12 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) this_column_nullability, state.user_metadata, state.int96_timestamps, - state._decimal_precisions, + state._decimal_precision, decimal_precision_idx, state.stream); } - CUDF_EXPECTS(decimal_precision_idx == state._decimal_precisions.size(), + CUDF_EXPECTS(decimal_precision_idx == state._decimal_precision.size(), "Too many decimal precision values!"); // first call. setup metadata. num_rows will get incremented as write_chunk is diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index c464a4847a2..89cbb82c726 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -334,7 +334,7 @@ TEST_F(ParquetWriterTest, MultiColumn) cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()) .metadata(&expected_metadata); std::vector precisions = {10, 20}; - out_opts.set_decimal_precisions(precisions); + out_opts.set_decimal_precision(precisions); cudf_io::write_parquet(out_opts); cudf_io::parquet_reader_options in_opts = @@ -413,7 +413,7 @@ TEST_F(ParquetWriterTest, MultiColumnWithNulls) cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()) .metadata(&expected_metadata); std::vector precisions = {9, 20}; - out_opts.set_decimal_precisions(precisions); + out_opts.set_decimal_precision(precisions); cudf_io::write_parquet(out_opts);