From 94d64015e103790f2909c4dfc1652abdcd1f8aa4 Mon Sep 17 00:00:00 2001 From: seidl Date: Fri, 16 Feb 2024 11:23:36 -0800 Subject: [PATCH 01/14] add ability to request encodings per column --- cpp/include/cudf/io/types.hpp | 47 ++++++++++++++ cpp/src/io/functions.cpp | 7 +++ cpp/src/io/parquet/page_enc.cu | 21 ++++++- cpp/src/io/parquet/parquet_common.hpp | 1 + cpp/src/io/parquet/parquet_gpu.hpp | 9 +-- cpp/src/io/parquet/writer_impl.cu | 69 ++++++++++++++++++-- cpp/tests/io/parquet_writer_test.cpp | 90 +++++++++++++++++++++++++++ 7 files changed, 233 insertions(+), 11 deletions(-) diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index 3208a81cd63..89fb3646638 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -99,6 +99,20 @@ enum statistics_freq { STATISTICS_COLUMN = 3, ///< Full column and offset indices. Implies STATISTICS_ROWGROUP }; +/** + * @brief Valid parquet encodings for use with `column_in_metadata::set_encoding()` + */ +struct parquet_encoding { + static std::string const PLAIN; ///< Use plain encoding + static std::string const DICTIONARY; ///< Use dictionary encoding + static std::string const + DELTA_BINARY_PACKED; ///< Use DELTA_BINARY_PACKED encoding (only valid for integer columns) + static std::string const DELTA_LENGTH_BYTE_ARRAY; ///< Use DELTA_LENGTH_BYTE_ARRAY encoding (only + ///< valid for BYTE_ARRAY columns) + static std::string const DELTA_BYTE_ARRAY; ///< Use DELTA_BYTE_ARRAY encoding (only valid for + ///< BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY columns) +}; + /** * @brief Statistics about compression performed by a writer. */ @@ -585,6 +599,7 @@ class column_in_metadata { std::optional _decimal_precision; std::optional _parquet_field_id; std::vector children; + std::optional _encoding; public: column_in_metadata() = default; @@ -701,6 +716,22 @@ class column_in_metadata { return *this; } + /** + * @brief Sets the encoding to use for this column. + * + * This is just a request, and the encoder may still choose to use a different encoding + * depending on resource constraints. Use the constants defined in the `parquet_encoding` + * struct. + * + * @param encoding The encoding to use + * @return this for chaining + */ + column_in_metadata& set_encoding(std::string const& encoding) noexcept + { + _encoding = encoding; + return *this; + } + /** * @brief Get reference to a child of this column * @@ -806,6 +837,22 @@ class column_in_metadata { * @return Boolean indicating whether to encode this column as binary data */ [[nodiscard]] bool is_enabled_output_as_binary() const noexcept { return _output_as_binary; } + + /** + * @brief Get whether the encoding has been set for this column. + * + * @return Boolean indicating whether and encoding has been set for this column + */ + [[nodiscard]] bool is_encoding_set() const noexcept { return _encoding.has_value(); } + + /** + * @brief Get the encoding that was set for this column. + * + * @throws std::bad_optional_access If encoding was not set for this + * column. Check using `is_encoding_set()` first. + * @return The encoding that was set for this column + */ + [[nodiscard]] std::string get_encoding() const { return _encoding.value(); } }; /** diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 42f2fd02d52..6cf4133915c 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -39,6 +39,13 @@ #include namespace cudf::io { + +std::string const parquet_encoding::PLAIN = "PLAIN"; +std::string const parquet_encoding::DICTIONARY = "DICTIONARY"; +std::string const parquet_encoding::DELTA_BINARY_PACKED = "DELTA_BINARY_PACKED"; +std::string const parquet_encoding::DELTA_LENGTH_BYTE_ARRAY = "DELTA_LENGTH_BYTE_ARRAY"; +std::string const parquet_encoding::DELTA_BYTE_ARRAY = "DELTA_BYTE_ARRAY"; + // Returns builder for csv_reader_options csv_reader_options_builder csv_reader_options::builder(source_info src) { diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 2f351edd2b9..3514cc09dd9 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -577,8 +577,10 @@ CUDF_KERNEL void __launch_bounds__(128) auto const physical_type = col_g.physical_type; auto const type_id = col_g.leaf_column->type().id(); auto const is_use_delta = - write_v2_headers && !ck_g.use_dictionary && - (physical_type == INT32 || physical_type == INT64 || physical_type == BYTE_ARRAY); + col_g.requested_encoding == Encoding::DELTA_BINARY_PACKED || + col_g.requested_encoding == Encoding::DELTA_LENGTH_BYTE_ARRAY || + (write_v2_headers && !ck_g.use_dictionary && + (physical_type == INT32 || physical_type == INT64 || physical_type == BYTE_ARRAY)); if (t < 32) { uint32_t fragments_in_chunk = 0; @@ -789,7 +791,20 @@ CUDF_KERNEL void __launch_bounds__(128) if (t == 0) { if (not pages.empty()) { // set encoding - if (is_use_delta) { + if (col_g.requested_encoding != Encoding::UNDEFINED) { + switch (col_g.requested_encoding) { + case Encoding::PLAIN: page_g.kernel_mask = encode_kernel_mask::PLAIN; break; + case Encoding::RLE_DICTIONARY: + page_g.kernel_mask = encode_kernel_mask::DICTIONARY; + break; + case Encoding::DELTA_BINARY_PACKED: + page_g.kernel_mask = encode_kernel_mask::DELTA_BINARY; + break; + case Encoding::DELTA_LENGTH_BYTE_ARRAY: + page_g.kernel_mask = encode_kernel_mask::DELTA_LENGTH_BA; + break; + } + } else if (is_use_delta) { // TODO(ets): at some point make a more intelligent decision on this. DELTA_LENGTH_BA // should always be preferred over PLAIN, but DELTA_BINARY is a different matter. // If the delta encoding size is going to be close to 32 bits anyway, then plain diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp index 8507eca047e..6751091756e 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -92,6 +92,7 @@ enum class Encoding : uint8_t { RLE_DICTIONARY = 8, BYTE_STREAM_SPLIT = 9, NUM_ENCODINGS = 10, + UNDEFINED = 255, }; /** diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index b215cd7a20b..2a6e46fe28b 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -460,10 +460,11 @@ struct parquet_column_device_view : stats_column_desc { size_type const* level_offsets; //!< Offset array for per-row pre-calculated rep/def level values uint8_t const* rep_values; //!< Pre-calculated repetition level values uint8_t const* def_values; //!< Pre-calculated definition level values - uint8_t const* nullability; //!< Array of nullability of each nesting level. e.g. nullable[0] is - //!< nullability of parent_column. May be different from - //!< col.nullable() in case of chunked writing. - bool output_as_byte_array; //!< Indicates this list column is being written as a byte array + uint8_t const* nullability; //!< Array of nullability of each nesting level. e.g. nullable[0] is + //!< nullability of parent_column. May be different from + //!< col.nullable() in case of chunked writing. + bool output_as_byte_array; //!< Indicates this list column is being written as a byte array + Encoding requested_encoding; //!< User specified encoding for this column. }; struct EncColumnChunk; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 3dcc9716579..fbf4e444f9a 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -259,6 +259,24 @@ bool is_col_fixed_width(column_view const& column) return is_fixed_width(column.type()); } +// Convert encoding passed in by user to the correct enum value. Returns `std::nullopt` +// if passed an unsupported (or mistyped) encoding. +std::optional string_to_encoding(std::string const& encoding) +{ + if (encoding == parquet_encoding::PLAIN) { + return Encoding::PLAIN; + } else if (encoding == parquet_encoding::DICTIONARY) { + return Encoding::RLE_DICTIONARY; + } else if (encoding == parquet_encoding::DELTA_BINARY_PACKED) { + return Encoding::DELTA_BINARY_PACKED; + } else if (encoding == parquet_encoding::DELTA_LENGTH_BYTE_ARRAY) { + return Encoding::DELTA_LENGTH_BYTE_ARRAY; + } else if (encoding == parquet_encoding::DELTA_BYTE_ARRAY) { + return Encoding::DELTA_BYTE_ARRAY; + } + return std::nullopt; +} + /** * @brief Extends SchemaElement to add members required in constructing parquet_column_view * @@ -268,11 +286,13 @@ bool is_col_fixed_width(column_view const& column) * 2. stats_dtype: datatype for statistics calculation required for the data stream of a leaf node. * 3. ts_scale: scale to multiply or divide timestamp by in order to convert timestamp to parquet * supported types + * 4. requested_encoding: A user provided encoding to use for the column. */ struct schema_tree_node : public SchemaElement { cudf::detail::LinkedColPtr leaf_column; statistics_dtype stats_dtype; int32_t ts_scale; + std::optional requested_encoding; // TODO(fut): Think about making schema a class that holds a vector of schema_tree_nodes. The // function construct_schema_tree could be its constructor. It can have method to get the per @@ -589,7 +609,7 @@ std::vector construct_schema_tree( std::function add_schema = [&](cudf::detail::LinkedColPtr const& col, column_in_metadata& col_meta, size_t parent_idx) { - bool col_nullable = is_col_nullable(col, col_meta, write_mode); + bool const col_nullable = is_col_nullable(col, col_meta, write_mode); auto set_field_id = [&schema, parent_idx](schema_tree_node& s, column_in_metadata const& col_meta) { @@ -605,6 +625,40 @@ std::vector construct_schema_tree( return child_col_type == type_id::UINT8; }; + // only call this after col_schema.type has been set + auto set_encoding = [&schema, parent_idx](schema_tree_node& s, + column_in_metadata const& col_meta) { + if (schema[parent_idx].name != "list" and col_meta.is_encoding_set()) { + auto enc = string_to_encoding(col_meta.get_encoding()); + + // do some validation on the requested encoding + // TODO(ets): should we print a warning or error out if the requested encoding is + // invalid? for now just silently fall back to the default encoder. + if (!enc.has_value() || !is_supported_encoding(enc.value())) { return; } + + switch (enc.value()) { + case Encoding::DELTA_BINARY_PACKED: + if (s.type != Type::INT32 && s.type != Type::INT64) { return; } + break; + + case Encoding::DELTA_LENGTH_BYTE_ARRAY: + if (s.type != Type::BYTE_ARRAY) { return; } + break; + + // TODO(ets): this will be caught by the check for supported encodings above...leaving + // this in for when we add this encoding. + case Encoding::DELTA_BYTE_ARRAY: + if (s.type != Type::BYTE_ARRAY && s.type != Type::FIXED_LEN_BYTE_ARRAY) { return; } + break; + + default: break; + } + + // requested encoding seems to be ok, set it + s.requested_encoding = enc; + } + }; + // There is a special case for a list column with one byte column child. This column can // have a special flag that indicates we write this out as binary instead of a list. This is a // more efficient storage mechanism for a single-depth list of bytes, but is a departure from @@ -627,6 +681,7 @@ std::vector construct_schema_tree( col_schema.parent_idx = parent_idx; col_schema.leaf_column = col; set_field_id(col_schema, col_meta); + set_encoding(col_schema, col_meta); col_schema.output_as_byte_array = col_meta.is_enabled_output_as_binary(); schema.push_back(col_schema); } else if (col->type().id() == type_id::STRUCT) { @@ -762,6 +817,7 @@ std::vector construct_schema_tree( col_schema.parent_idx = parent_idx; col_schema.leaf_column = col; set_field_id(col_schema, col_meta); + set_encoding(col_schema, col_meta); schema.push_back(col_schema); } }; @@ -951,6 +1007,9 @@ parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_stream desc.nullability = _d_nullability.data(); desc.max_def_level = _max_def_level; desc.max_rep_level = _max_rep_level; + // FIXME(ets): need to add some validation that the requested encoding matches the + // column type. can't ask for delta byte array for an int column, for instance. + desc.requested_encoding = schema_node.requested_encoding.value_or(Encoding::UNDEFINED); return desc; } @@ -1170,9 +1229,11 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, std::vector> hash_maps_storage; hash_maps_storage.reserve(h_chunks.size()); for (auto& chunk : h_chunks) { - if (col_desc[chunk.col_desc_id].physical_type == Type::BOOLEAN || - (col_desc[chunk.col_desc_id].output_as_byte_array && - col_desc[chunk.col_desc_id].physical_type == Type::BYTE_ARRAY)) { + auto const& chunk_col_desc = col_desc[chunk.col_desc_id]; + if (chunk_col_desc.physical_type == Type::BOOLEAN || + (chunk_col_desc.output_as_byte_array && chunk_col_desc.physical_type == Type::BYTE_ARRAY) || + (chunk_col_desc.requested_encoding != Encoding::UNDEFINED && + chunk_col_desc.requested_encoding != Encoding::RLE_DICTIONARY)) { chunk.use_dictionary = false; } else { chunk.use_dictionary = true; diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index 34061cb7bf8..e0625402e5f 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -1426,6 +1426,96 @@ TEST_F(ParquetWriterTest, RowGroupMetadata) static_cast(num_rows * sizeof(column_type))); } +TEST_F(ParquetWriterTest, UserRequestedEncodings) +{ + constexpr int num_rows = 500; + + auto const ones = thrust::make_constant_iterator(1); + auto const col = + cudf::test::fixed_width_column_wrapper{ones, ones + num_rows, no_nulls()}; + + auto const strings = thrust::make_constant_iterator("string"); + auto const string_col = + cudf::test::strings_column_wrapper(strings, strings + num_rows, no_nulls()); + + auto const table = table_view( + {col, col, col, col, col, string_col, string_col, string_col, string_col, string_col}); + + cudf::io::table_input_metadata table_metadata(table); + table_metadata.column_metadata[0].set_name("int_plain"); + table_metadata.column_metadata[0].set_encoding(cudf::io::parquet_encoding::PLAIN); + table_metadata.column_metadata[1].set_name("int_dict"); + table_metadata.column_metadata[1].set_encoding(cudf::io::parquet_encoding::DICTIONARY); + table_metadata.column_metadata[2].set_name("int_delta_binary_packed"); + table_metadata.column_metadata[2].set_encoding(cudf::io::parquet_encoding::DELTA_BINARY_PACKED); + table_metadata.column_metadata[3].set_name("int_delta_length_byte_array"); + table_metadata.column_metadata[3].set_encoding( + cudf::io::parquet_encoding::DELTA_LENGTH_BYTE_ARRAY); + table_metadata.column_metadata[4].set_name("int_bogus"); + table_metadata.column_metadata[4].set_encoding("no such encoding"); + table_metadata.column_metadata[5].set_name("string_plain"); + table_metadata.column_metadata[5].set_encoding(cudf::io::parquet_encoding::PLAIN); + table_metadata.column_metadata[6].set_name("string_dict"); + table_metadata.column_metadata[6].set_encoding(cudf::io::parquet_encoding::DICTIONARY); + table_metadata.column_metadata[7].set_name("string_delta_length_byte_array"); + table_metadata.column_metadata[7].set_encoding( + cudf::io::parquet_encoding::DELTA_LENGTH_BYTE_ARRAY); + table_metadata.column_metadata[8].set_name("string_delta_binary_packed"); + table_metadata.column_metadata[8].set_encoding(cudf::io::parquet_encoding::DELTA_BINARY_PACKED); + table_metadata.column_metadata[9].set_name("string_bogus"); + table_metadata.column_metadata[9].set_encoding("no such encoding"); + + for (auto& col_meta : table_metadata.column_metadata) { + col_meta.set_nullability(false); + } + + auto const filepath = temp_env->get_temp_filepath("UserRequestedEncodings.parquet"); + cudf::io::parquet_writer_options opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, table) + .metadata(table_metadata) + .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) + .compression(cudf::io::compression_type::ZSTD); + cudf::io::write_parquet(opts); + + // check page headers to make sure each column is encoded with the appropriate encoder + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + read_footer(source, &fmd); + + // no nulls and no repetition, so the only encoding used should be for the data. + // since we're writing v1, both dict and data pages should use PLAIN_DICTIONARY. + // requested plain + EXPECT_EQ(fmd.row_groups[0].columns[0].meta_data.encodings[0], + cudf::io::parquet::detail::Encoding::PLAIN); + // requested dictionary + EXPECT_EQ(fmd.row_groups[0].columns[1].meta_data.encodings[0], + cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY); + // requested delta_binary_packed + EXPECT_EQ(fmd.row_groups[0].columns[2].meta_data.encodings[0], + cudf::io::parquet::detail::Encoding::DELTA_BINARY_PACKED); + // requested delta_length_byte_array, but should fall back to dictionary + EXPECT_EQ(fmd.row_groups[0].columns[3].meta_data.encodings[0], + cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY); + // requested nonsense, but should fall back to dictionary + EXPECT_EQ(fmd.row_groups[0].columns[4].meta_data.encodings[0], + cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY); + // requested plain + EXPECT_EQ(fmd.row_groups[0].columns[5].meta_data.encodings[0], + cudf::io::parquet::detail::Encoding::PLAIN); + // requested dictionary + EXPECT_EQ(fmd.row_groups[0].columns[6].meta_data.encodings[0], + cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY); + // requested delta_length_byte_array + EXPECT_EQ(fmd.row_groups[0].columns[7].meta_data.encodings[0], + cudf::io::parquet::detail::Encoding::DELTA_LENGTH_BYTE_ARRAY); + // requested delta_binary_packed, but should fall back to dictionary + EXPECT_EQ(fmd.row_groups[0].columns[8].meta_data.encodings[0], + cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY); + // requested nonsense, but should fall back to dictionary + EXPECT_EQ(fmd.row_groups[0].columns[9].meta_data.encodings[0], + cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY); +} + ///////////////////////////////////////////////////////////// // custom mem mapped data sink that supports device writes template From c77ecb15c9e452482e96cba6987d761efc359ff2 Mon Sep 17 00:00:00 2001 From: seidl Date: Fri, 16 Feb 2024 17:07:55 -0800 Subject: [PATCH 02/14] remove out of date fixme --- cpp/src/io/parquet/writer_impl.cu | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index fbf4e444f9a..4a205322153 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1004,11 +1004,9 @@ parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_stream desc.level_bits = CompactProtocolReader::NumRequiredBits(max_rep_level()) << 4 | CompactProtocolReader::NumRequiredBits(max_def_level()); - desc.nullability = _d_nullability.data(); - desc.max_def_level = _max_def_level; - desc.max_rep_level = _max_rep_level; - // FIXME(ets): need to add some validation that the requested encoding matches the - // column type. can't ask for delta byte array for an int column, for instance. + desc.nullability = _d_nullability.data(); + desc.max_def_level = _max_def_level; + desc.max_rep_level = _max_rep_level; desc.requested_encoding = schema_node.requested_encoding.value_or(Encoding::UNDEFINED); return desc; } From d5c2fdd649a8a134b50452b02a352bc9c7bb0fbb Mon Sep 17 00:00:00 2001 From: seidl Date: Fri, 16 Feb 2024 17:12:47 -0800 Subject: [PATCH 03/14] fix delta_byte_array case --- cpp/src/io/parquet/writer_impl.cu | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 4a205322153..18759f964f8 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -645,11 +645,8 @@ std::vector construct_schema_tree( if (s.type != Type::BYTE_ARRAY) { return; } break; - // TODO(ets): this will be caught by the check for supported encodings above...leaving - // this in for when we add this encoding. - case Encoding::DELTA_BYTE_ARRAY: - if (s.type != Type::BYTE_ARRAY && s.type != Type::FIXED_LEN_BYTE_ARRAY) { return; } - break; + // this is not caught by the check for supported encodings above + case Encoding::DELTA_BYTE_ARRAY: return; default: break; } From dfe853b02d4b00c952edf33151f169e36ca4a66a Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 19 Feb 2024 15:00:50 -0800 Subject: [PATCH 04/14] clean up definition of is_use_delta --- cpp/src/io/parquet/page_enc.cu | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 3514cc09dd9..fcf68e03a7c 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -574,13 +574,14 @@ CUDF_KERNEL void __launch_bounds__(128) // if writing delta encoded values, we're going to need to know the data length to get a guess // at the worst case number of bytes needed to encode. - auto const physical_type = col_g.physical_type; - auto const type_id = col_g.leaf_column->type().id(); - auto const is_use_delta = - col_g.requested_encoding == Encoding::DELTA_BINARY_PACKED || - col_g.requested_encoding == Encoding::DELTA_LENGTH_BYTE_ARRAY || - (write_v2_headers && !ck_g.use_dictionary && - (physical_type == INT32 || physical_type == INT64 || physical_type == BYTE_ARRAY)); + auto const physical_type = col_g.physical_type; + auto const type_id = col_g.leaf_column->type().id(); + auto const is_requested_delta = col_g.requested_encoding == Encoding::DELTA_BINARY_PACKED || + col_g.requested_encoding == Encoding::DELTA_LENGTH_BYTE_ARRAY; + auto const is_fallback_to_delta = + !ck_g.use_dictionary && write_v2_headers && + (physical_type == INT32 || physical_type == INT64 || physical_type == BYTE_ARRAY); + auto const is_use_delta = is_requested_delta || is_fallback_to_delta; if (t < 32) { uint32_t fragments_in_chunk = 0; From 3445f59a1af0807c7f0e05ff0dbac5251d01d601 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 27 Feb 2024 10:41:55 -0800 Subject: [PATCH 05/14] refactor to use enum rather than strings for setting encodings --- cpp/include/cudf/io/types.hpp | 41 +++++++++---------- cpp/src/io/functions.cpp | 6 --- cpp/src/io/parquet/page_enc.cu | 19 ++++----- cpp/src/io/parquet/parquet_gpu.hpp | 11 +++--- cpp/src/io/parquet/writer_impl.cu | 59 +++++++++++----------------- cpp/tests/io/parquet_writer_test.cpp | 26 ++++++------ 6 files changed, 70 insertions(+), 92 deletions(-) diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index 89fb3646638..adeb64c4024 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -100,17 +100,23 @@ enum statistics_freq { }; /** - * @brief Valid parquet encodings for use with `column_in_metadata::set_encoding()` + * @brief Valid encodings for use with `column_in_metadata::set_encoding()` */ -struct parquet_encoding { - static std::string const PLAIN; ///< Use plain encoding - static std::string const DICTIONARY; ///< Use dictionary encoding - static std::string const - DELTA_BINARY_PACKED; ///< Use DELTA_BINARY_PACKED encoding (only valid for integer columns) - static std::string const DELTA_LENGTH_BYTE_ARRAY; ///< Use DELTA_LENGTH_BYTE_ARRAY encoding (only - ///< valid for BYTE_ARRAY columns) - static std::string const DELTA_BYTE_ARRAY; ///< Use DELTA_BYTE_ARRAY encoding (only valid for - ///< BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY columns) +enum class column_encoding { + // common encodings + NOT_SET = -1, ///< No encoding has been requested + DICTIONARY, ///< Use dictionary encoding + // parquet encodings + PLAIN, ///< Use plain encoding + DELTA_BINARY_PACKED, ///< Use DELTA_BINARY_PACKED encoding (only valid for integer columns) + DELTA_LENGTH_BYTE_ARRAY, ///< Use DELTA_LENGTH_BYTE_ARRAY encoding (only + ///< valid for BYTE_ARRAY columns) + DELTA_BYTE_ARRAY, ///< Use DELTA_BYTE_ARRAY encoding (only valid for + ///< BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY columns) + // orc encodings + DIRECT, ///< Use DIRECT encoding + DIRECT_V2, ///< Use DIRECT_V2 encoding + DICTIONARY_V2, ///< Use DICTIONARY_V2 encoding }; /** @@ -599,7 +605,7 @@ class column_in_metadata { std::optional _decimal_precision; std::optional _parquet_field_id; std::vector children; - std::optional _encoding; + column_encoding _encoding = column_encoding::NOT_SET; public: column_in_metadata() = default; @@ -726,7 +732,7 @@ class column_in_metadata { * @param encoding The encoding to use * @return this for chaining */ - column_in_metadata& set_encoding(std::string const& encoding) noexcept + column_in_metadata& set_encoding(column_encoding encoding) noexcept { _encoding = encoding; return *this; @@ -838,21 +844,12 @@ class column_in_metadata { */ [[nodiscard]] bool is_enabled_output_as_binary() const noexcept { return _output_as_binary; } - /** - * @brief Get whether the encoding has been set for this column. - * - * @return Boolean indicating whether and encoding has been set for this column - */ - [[nodiscard]] bool is_encoding_set() const noexcept { return _encoding.has_value(); } - /** * @brief Get the encoding that was set for this column. * - * @throws std::bad_optional_access If encoding was not set for this - * column. Check using `is_encoding_set()` first. * @return The encoding that was set for this column */ - [[nodiscard]] std::string get_encoding() const { return _encoding.value(); } + [[nodiscard]] column_encoding get_encoding() const { return _encoding; } }; /** diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 44095329ed6..0dae59112f2 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -40,12 +40,6 @@ namespace cudf::io { -std::string const parquet_encoding::PLAIN = "PLAIN"; -std::string const parquet_encoding::DICTIONARY = "DICTIONARY"; -std::string const parquet_encoding::DELTA_BINARY_PACKED = "DELTA_BINARY_PACKED"; -std::string const parquet_encoding::DELTA_LENGTH_BYTE_ARRAY = "DELTA_LENGTH_BYTE_ARRAY"; -std::string const parquet_encoding::DELTA_BYTE_ARRAY = "DELTA_BYTE_ARRAY"; - // Returns builder for csv_reader_options csv_reader_options_builder csv_reader_options::builder(source_info src) { diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 3985e110785..9a1dcb0ace3 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -571,10 +571,11 @@ CUDF_KERNEL void __launch_bounds__(128) // if writing delta encoded values, we're going to need to know the data length to get a guess // at the worst case number of bytes needed to encode. - auto const physical_type = col_g.physical_type; - auto const type_id = col_g.leaf_column->type().id(); - auto const is_requested_delta = col_g.requested_encoding == Encoding::DELTA_BINARY_PACKED || - col_g.requested_encoding == Encoding::DELTA_LENGTH_BYTE_ARRAY; + auto const physical_type = col_g.physical_type; + auto const type_id = col_g.leaf_column->type().id(); + auto const is_requested_delta = + col_g.requested_encoding == column_encoding::DELTA_BINARY_PACKED || + col_g.requested_encoding == column_encoding::DELTA_LENGTH_BYTE_ARRAY; auto const is_fallback_to_delta = !ck_g.use_dictionary && write_v2_headers && (physical_type == INT32 || physical_type == INT64 || physical_type == BYTE_ARRAY); @@ -789,16 +790,16 @@ CUDF_KERNEL void __launch_bounds__(128) if (t == 0) { if (not pages.empty()) { // set encoding - if (col_g.requested_encoding != Encoding::UNDEFINED) { + if (col_g.requested_encoding != column_encoding::NOT_SET) { switch (col_g.requested_encoding) { - case Encoding::PLAIN: page_g.kernel_mask = encode_kernel_mask::PLAIN; break; - case Encoding::RLE_DICTIONARY: + case column_encoding::PLAIN: page_g.kernel_mask = encode_kernel_mask::PLAIN; break; + case column_encoding::DICTIONARY: page_g.kernel_mask = encode_kernel_mask::DICTIONARY; break; - case Encoding::DELTA_BINARY_PACKED: + case column_encoding::DELTA_BINARY_PACKED: page_g.kernel_mask = encode_kernel_mask::DELTA_BINARY; break; - case Encoding::DELTA_LENGTH_BYTE_ARRAY: + case column_encoding::DELTA_LENGTH_BYTE_ARRAY: page_g.kernel_mask = encode_kernel_mask::DELTA_LENGTH_BA; break; } diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 5b9dd808d1a..80976c13372 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -33,6 +33,7 @@ #include #include +#include #include #include @@ -458,11 +459,11 @@ struct parquet_column_device_view : stats_column_desc { size_type const* level_offsets; //!< Offset array for per-row pre-calculated rep/def level values uint8_t const* rep_values; //!< Pre-calculated repetition level values uint8_t const* def_values; //!< Pre-calculated definition level values - uint8_t const* nullability; //!< Array of nullability of each nesting level. e.g. nullable[0] is - //!< nullability of parent_column. May be different from - //!< col.nullable() in case of chunked writing. - bool output_as_byte_array; //!< Indicates this list column is being written as a byte array - Encoding requested_encoding; //!< User specified encoding for this column. + uint8_t const* nullability; //!< Array of nullability of each nesting level. e.g. nullable[0] is + //!< nullability of parent_column. May be different from + //!< col.nullable() in case of chunked writing. + bool output_as_byte_array; //!< Indicates this list column is being written as a byte array + column_encoding requested_encoding; //!< User specified encoding for this column. }; struct EncColumnChunk; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index ddecc63339f..19ae6c13a68 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -258,24 +258,6 @@ bool is_col_fixed_width(column_view const& column) return is_fixed_width(column.type()); } -// Convert encoding passed in by user to the correct enum value. Returns `std::nullopt` -// if passed an unsupported (or mistyped) encoding. -std::optional string_to_encoding(std::string const& encoding) -{ - if (encoding == parquet_encoding::PLAIN) { - return Encoding::PLAIN; - } else if (encoding == parquet_encoding::DICTIONARY) { - return Encoding::RLE_DICTIONARY; - } else if (encoding == parquet_encoding::DELTA_BINARY_PACKED) { - return Encoding::DELTA_BINARY_PACKED; - } else if (encoding == parquet_encoding::DELTA_LENGTH_BYTE_ARRAY) { - return Encoding::DELTA_LENGTH_BYTE_ARRAY; - } else if (encoding == parquet_encoding::DELTA_BYTE_ARRAY) { - return Encoding::DELTA_BYTE_ARRAY; - } - return std::nullopt; -} - /** * @brief Extends SchemaElement to add members required in constructing parquet_column_view * @@ -291,7 +273,7 @@ struct schema_tree_node : public SchemaElement { cudf::detail::LinkedColPtr leaf_column; statistics_dtype stats_dtype; int32_t ts_scale; - std::optional requested_encoding; + column_encoding requested_encoding; // TODO(fut): Think about making schema a class that holds a vector of schema_tree_nodes. The // function construct_schema_tree could be its constructor. It can have method to get the per @@ -627,31 +609,36 @@ std::vector construct_schema_tree( // only call this after col_schema.type has been set auto set_encoding = [&schema, parent_idx](schema_tree_node& s, column_in_metadata const& col_meta) { - if (schema[parent_idx].name != "list" and col_meta.is_encoding_set()) { - auto enc = string_to_encoding(col_meta.get_encoding()); + s.requested_encoding = column_encoding::NOT_SET; - // do some validation on the requested encoding - // TODO(ets): should we print a warning or error out if the requested encoding is - // invalid? for now just silently fall back to the default encoder. - if (!enc.has_value() || !is_supported_encoding(enc.value())) { return; } - - switch (enc.value()) { - case Encoding::DELTA_BINARY_PACKED: + if (schema[parent_idx].name != "list" and + col_meta.get_encoding() != column_encoding::NOT_SET) { + // do some validation + switch (col_meta.get_encoding()) { + case column_encoding::DELTA_BINARY_PACKED: if (s.type != Type::INT32 && s.type != Type::INT64) { return; } break; - case Encoding::DELTA_LENGTH_BYTE_ARRAY: + case column_encoding::DELTA_LENGTH_BYTE_ARRAY: if (s.type != Type::BYTE_ARRAY) { return; } break; - // this is not caught by the check for supported encodings above - case Encoding::DELTA_BYTE_ARRAY: return; + // not yet supported for write (soon...) + case column_encoding::DELTA_BYTE_ARRAY: return; + + // supported parquet encodings + case column_encoding::PLAIN: + case column_encoding::DICTIONARY: break; - default: break; + // all others + default: + CUDF_LOG_WARN("Unsupported page encoding requested: {}", + static_cast(col_meta.get_encoding())); + return; } // requested encoding seems to be ok, set it - s.requested_encoding = enc; + s.requested_encoding = col_meta.get_encoding(); } }; @@ -1003,7 +990,7 @@ parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_stream desc.nullability = _d_nullability.data(); desc.max_def_level = _max_def_level; desc.max_rep_level = _max_rep_level; - desc.requested_encoding = schema_node.requested_encoding.value_or(Encoding::UNDEFINED); + desc.requested_encoding = schema_node.requested_encoding; return desc; } @@ -1226,8 +1213,8 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, auto const& chunk_col_desc = col_desc[chunk.col_desc_id]; if (chunk_col_desc.physical_type == Type::BOOLEAN || (chunk_col_desc.output_as_byte_array && chunk_col_desc.physical_type == Type::BYTE_ARRAY) || - (chunk_col_desc.requested_encoding != Encoding::UNDEFINED && - chunk_col_desc.requested_encoding != Encoding::RLE_DICTIONARY)) { + (chunk_col_desc.requested_encoding != column_encoding::NOT_SET && + chunk_col_desc.requested_encoding != column_encoding::DICTIONARY)) { chunk.use_dictionary = false; } else { chunk.use_dictionary = true; diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index 7cfc899f693..918b7bdd513 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -1443,27 +1443,25 @@ TEST_F(ParquetWriterTest, UserRequestedEncodings) cudf::io::table_input_metadata table_metadata(table); table_metadata.column_metadata[0].set_name("int_plain"); - table_metadata.column_metadata[0].set_encoding(cudf::io::parquet_encoding::PLAIN); + table_metadata.column_metadata[0].set_encoding(cudf::io::column_encoding::PLAIN); table_metadata.column_metadata[1].set_name("int_dict"); - table_metadata.column_metadata[1].set_encoding(cudf::io::parquet_encoding::DICTIONARY); + table_metadata.column_metadata[1].set_encoding(cudf::io::column_encoding::DICTIONARY); table_metadata.column_metadata[2].set_name("int_delta_binary_packed"); - table_metadata.column_metadata[2].set_encoding(cudf::io::parquet_encoding::DELTA_BINARY_PACKED); + table_metadata.column_metadata[2].set_encoding(cudf::io::column_encoding::DELTA_BINARY_PACKED); table_metadata.column_metadata[3].set_name("int_delta_length_byte_array"); table_metadata.column_metadata[3].set_encoding( - cudf::io::parquet_encoding::DELTA_LENGTH_BYTE_ARRAY); - table_metadata.column_metadata[4].set_name("int_bogus"); - table_metadata.column_metadata[4].set_encoding("no such encoding"); + cudf::io::column_encoding::DELTA_LENGTH_BYTE_ARRAY); + table_metadata.column_metadata[4].set_name("int_none"); table_metadata.column_metadata[5].set_name("string_plain"); - table_metadata.column_metadata[5].set_encoding(cudf::io::parquet_encoding::PLAIN); + table_metadata.column_metadata[5].set_encoding(cudf::io::column_encoding::PLAIN); table_metadata.column_metadata[6].set_name("string_dict"); - table_metadata.column_metadata[6].set_encoding(cudf::io::parquet_encoding::DICTIONARY); + table_metadata.column_metadata[6].set_encoding(cudf::io::column_encoding::DICTIONARY); table_metadata.column_metadata[7].set_name("string_delta_length_byte_array"); table_metadata.column_metadata[7].set_encoding( - cudf::io::parquet_encoding::DELTA_LENGTH_BYTE_ARRAY); + cudf::io::column_encoding::DELTA_LENGTH_BYTE_ARRAY); table_metadata.column_metadata[8].set_name("string_delta_binary_packed"); - table_metadata.column_metadata[8].set_encoding(cudf::io::parquet_encoding::DELTA_BINARY_PACKED); - table_metadata.column_metadata[9].set_name("string_bogus"); - table_metadata.column_metadata[9].set_encoding("no such encoding"); + table_metadata.column_metadata[8].set_encoding(cudf::io::column_encoding::DELTA_BINARY_PACKED); + table_metadata.column_metadata[9].set_name("string_none"); for (auto& col_meta : table_metadata.column_metadata) { col_meta.set_nullability(false); @@ -1496,7 +1494,7 @@ TEST_F(ParquetWriterTest, UserRequestedEncodings) // requested delta_length_byte_array, but should fall back to dictionary EXPECT_EQ(fmd.row_groups[0].columns[3].meta_data.encodings[0], cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY); - // requested nonsense, but should fall back to dictionary + // no request, should fall back to dictionary EXPECT_EQ(fmd.row_groups[0].columns[4].meta_data.encodings[0], cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY); // requested plain @@ -1511,7 +1509,7 @@ TEST_F(ParquetWriterTest, UserRequestedEncodings) // requested delta_binary_packed, but should fall back to dictionary EXPECT_EQ(fmd.row_groups[0].columns[8].meta_data.encodings[0], cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY); - // requested nonsense, but should fall back to dictionary + // no request, should fall back to dictionary EXPECT_EQ(fmd.row_groups[0].columns[9].meta_data.encodings[0], cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY); } From 64e32345c7aaf53dd39b7304821547f5fc7a5865 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 27 Feb 2024 10:45:32 -0800 Subject: [PATCH 06/14] clean up leftover cruft --- cpp/src/io/functions.cpp | 1 - cpp/src/io/parquet/parquet_common.hpp | 1 - 2 files changed, 2 deletions(-) diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 0dae59112f2..b8353d312fe 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -39,7 +39,6 @@ #include namespace cudf::io { - // Returns builder for csv_reader_options csv_reader_options_builder csv_reader_options::builder(source_info src) { diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp index 6751091756e..8507eca047e 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -92,7 +92,6 @@ enum class Encoding : uint8_t { RLE_DICTIONARY = 8, BYTE_STREAM_SPLIT = 9, NUM_ENCODINGS = 10, - UNDEFINED = 255, }; /** From 67e85de03f924170104063a89ff627ab63585aa5 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 27 Feb 2024 10:47:23 -0800 Subject: [PATCH 07/14] and a little more cruft --- cpp/src/io/parquet/parquet_gpu.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 80976c13372..e47c01058af 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -33,7 +33,6 @@ #include #include -#include #include #include From 3989641a85f9eb224ae32b733250d575509dd301 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 27 Feb 2024 11:03:03 -0800 Subject: [PATCH 08/14] clean up some boolean logic --- cpp/src/io/parquet/writer_impl.cu | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 19ae6c13a68..2668b4b20a0 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1211,10 +1211,14 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, hash_maps_storage.reserve(h_chunks.size()); for (auto& chunk : h_chunks) { auto const& chunk_col_desc = col_desc[chunk.col_desc_id]; - if (chunk_col_desc.physical_type == Type::BOOLEAN || - (chunk_col_desc.output_as_byte_array && chunk_col_desc.physical_type == Type::BYTE_ARRAY) || - (chunk_col_desc.requested_encoding != column_encoding::NOT_SET && - chunk_col_desc.requested_encoding != column_encoding::DICTIONARY)) { + auto const is_requested_non_dict = + chunk_col_desc.requested_encoding != column_encoding::NOT_SET && + chunk_col_desc.requested_encoding != column_encoding::DICTIONARY; + auto const is_type_non_dict = + chunk_col_desc.physical_type == Type::BOOLEAN || + (chunk_col_desc.output_as_byte_array && chunk_col_desc.physical_type == Type::BYTE_ARRAY); + + if (is_type_non_dict || is_requested_non_dict) { chunk.use_dictionary = false; } else { chunk.use_dictionary = true; From d5b451ee132f3cbadb71e54dcf61d3dbde085327 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 27 Feb 2024 11:21:10 -0800 Subject: [PATCH 09/14] warn on DELTA_BYTE_ARRAY --- cpp/src/io/parquet/writer_impl.cu | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 2668b4b20a0..12e0f83f6e2 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -623,13 +623,12 @@ std::vector construct_schema_tree( if (s.type != Type::BYTE_ARRAY) { return; } break; - // not yet supported for write (soon...) - case column_encoding::DELTA_BYTE_ARRAY: return; - // supported parquet encodings case column_encoding::PLAIN: case column_encoding::DICTIONARY: break; + // not yet supported for write (soon...) + case column_encoding::DELTA_BYTE_ARRAY: [[fallthrough]]; // all others default: CUDF_LOG_WARN("Unsupported page encoding requested: {}", From 45a9edc62fb2ddb91e564696563a048709b2bd3b Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 27 Feb 2024 12:58:56 -0800 Subject: [PATCH 10/14] suggested changes to warnings --- cpp/src/io/parquet/writer_impl.cu | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 12e0f83f6e2..11331340380 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -616,11 +616,21 @@ std::vector construct_schema_tree( // do some validation switch (col_meta.get_encoding()) { case column_encoding::DELTA_BINARY_PACKED: - if (s.type != Type::INT32 && s.type != Type::INT64) { return; } + if (s.type != Type::INT32 && s.type != Type::INT64) { + CUDF_LOG_WARN( + "DELTA_BINARY_PACKED encoding is only supported for INT32 and INT64 columns; the " + "requested encoding will be ignored"); + return; + } break; case column_encoding::DELTA_LENGTH_BYTE_ARRAY: - if (s.type != Type::BYTE_ARRAY) { return; } + if (s.type != Type::BYTE_ARRAY) { + CUDF_LOG_WARN( + "DELTA_LENGTH_BYTE_ARRAY encoding is only supported for BYTE_ARRAY columns; the " + "requested encoding will be ignored"); + return; + } break; // supported parquet encodings @@ -631,8 +641,9 @@ std::vector construct_schema_tree( case column_encoding::DELTA_BYTE_ARRAY: [[fallthrough]]; // all others default: - CUDF_LOG_WARN("Unsupported page encoding requested: {}", - static_cast(col_meta.get_encoding())); + CUDF_LOG_WARN( + "Unsupported page encoding requested: {}; the requested encoding will be ignored", + static_cast(col_meta.get_encoding())); return; } From 44473b0c1b94494c07b01008db5a5a62f3634c71 Mon Sep 17 00:00:00 2001 From: seidl Date: Mon, 4 Mar 2024 15:50:19 -0800 Subject: [PATCH 11/14] catch corner case where dict encoding is requested, but cannot be used --- cpp/src/io/parquet/page_enc.cu | 13 ++++++++++++- cpp/src/io/parquet/writer_impl.cu | 14 ++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 9a1dcb0ace3..508ec484457 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -794,7 +794,18 @@ CUDF_KERNEL void __launch_bounds__(128) switch (col_g.requested_encoding) { case column_encoding::PLAIN: page_g.kernel_mask = encode_kernel_mask::PLAIN; break; case column_encoding::DICTIONARY: - page_g.kernel_mask = encode_kernel_mask::DICTIONARY; + // user may have requested dict, but we may not be able to use it + // TODO: when DELTA_BYTE_ARRAY is added, rework the fallback logic so there + // isn't duplicated code here and below. + if (ck_g.use_dictionary) { + page_g.kernel_mask = encode_kernel_mask::DICTIONARY; + } else if (is_fallback_to_delta) { + page_g.kernel_mask = physical_type == BYTE_ARRAY + ? encode_kernel_mask::DELTA_LENGTH_BA + : encode_kernel_mask::DELTA_BINARY; + } else { + page_g.kernel_mask = encode_kernel_mask::PLAIN; + } break; case column_encoding::DELTA_BINARY_PACKED: page_g.kernel_mask = encode_kernel_mask::DELTA_BINARY; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 11331340380..e65fa03f36d 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1248,6 +1248,7 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, chunks.device_to_host_sync(stream); // Make decision about which chunks have dictionary + bool cannot_honor_request = false; for (auto& ck : h_chunks) { if (not ck.use_dictionary) { continue; } std::tie(ck.use_dictionary, ck.dict_rle_bits) = [&]() -> std::pair { @@ -1274,6 +1275,19 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, return {true, nbits}; }(); + // If dictionary encoding was requested, but it cannot be used, then print a warning. It will + // actually be disabled in gpuInitPages. + if (not ck.use_dictionary) { + auto const& chunk_col_desc = col_desc[ck.col_desc_id]; + if (chunk_col_desc.requested_encoding == column_encoding::DICTIONARY) { + cannot_honor_request = true; + } + } + } + + // warn if we have to ignore requested encoding + if (cannot_honor_request) { + CUDF_LOG_WARN("DICTIONARY encoding was requested, but resource constraints prevent its use"); } // TODO: (enh) Deallocate hash map storage for chunks that don't use dict and clear pointers. From 5405ddf4d49fdfc49c75b986da911f36ba9b230f Mon Sep 17 00:00:00 2001 From: seidl Date: Mon, 4 Mar 2024 16:43:06 -0800 Subject: [PATCH 12/14] add test and refactor UserRequestedEncodings test --- cpp/tests/io/parquet_writer_test.cpp | 110 ++++++++++++++++++--------- 1 file changed, 72 insertions(+), 38 deletions(-) diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index 918b7bdd513..f4da9f59b8c 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -1426,8 +1426,52 @@ TEST_F(ParquetWriterTest, RowGroupMetadata) static_cast(num_rows * sizeof(column_type))); } +TEST_F(ParquetWriterTest, UserRequestedDictFallback) +{ + constexpr int num_rows = 100; + constexpr char const* big_string = + "a " + "very very very very very very very very very very very very very very very very very very " + "very very very very very very very very very very very very very very very very very very " + "very very very very very very very very very very very very very very very very very very " + "very very very very very very very very very very very very very very very very very very " + "very very very very very very very very very very very very very very very very very very " + "very very very very very very very very very very very very very very very very very very " + "long string"; + + auto const max_dict_size = strlen(big_string) * num_rows / 2; + + auto elements1 = cudf::detail::make_counting_transform_iterator( + 0, [big_string](auto i) { return big_string + std::to_string(i); }); + auto const col1 = cudf::test::strings_column_wrapper(elements1, elements1 + num_rows); + auto const table = table_view({col1}); + + cudf::io::table_input_metadata table_metadata(table); + table_metadata.column_metadata[0] + .set_name("big_strings") + .set_encoding(cudf::io::column_encoding::DICTIONARY) + .set_nullability(false); + + auto const filepath = temp_env->get_temp_filepath("UserRequestedDictFallback.parquet"); + cudf::io::parquet_writer_options opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, table) + .metadata(table_metadata) + .max_dictionary_size(max_dict_size); + cudf::io::write_parquet(opts); + + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + read_footer(source, &fmd); + + // encoding should have fallen back to PLAIN + EXPECT_EQ(fmd.row_groups[0].columns[0].meta_data.encodings[0], + cudf::io::parquet::detail::Encoding::PLAIN); +} + TEST_F(ParquetWriterTest, UserRequestedEncodings) { + using cudf::io::column_encoding; + using cudf::io::parquet::detail::Encoding; constexpr int num_rows = 500; auto const ones = thrust::make_constant_iterator(1); @@ -1442,25 +1486,21 @@ TEST_F(ParquetWriterTest, UserRequestedEncodings) {col, col, col, col, col, string_col, string_col, string_col, string_col, string_col}); cudf::io::table_input_metadata table_metadata(table); - table_metadata.column_metadata[0].set_name("int_plain"); - table_metadata.column_metadata[0].set_encoding(cudf::io::column_encoding::PLAIN); - table_metadata.column_metadata[1].set_name("int_dict"); - table_metadata.column_metadata[1].set_encoding(cudf::io::column_encoding::DICTIONARY); - table_metadata.column_metadata[2].set_name("int_delta_binary_packed"); - table_metadata.column_metadata[2].set_encoding(cudf::io::column_encoding::DELTA_BINARY_PACKED); - table_metadata.column_metadata[3].set_name("int_delta_length_byte_array"); - table_metadata.column_metadata[3].set_encoding( - cudf::io::column_encoding::DELTA_LENGTH_BYTE_ARRAY); + + auto const set_meta = [&table_metadata](int idx, std::string const& name, column_encoding enc) { + table_metadata.column_metadata[idx].set_name(name).set_encoding(enc); + }; + + set_meta(0, "int_plain", column_encoding::PLAIN); + set_meta(1, "int_dict", column_encoding::DICTIONARY); + set_meta(2, "int_db", column_encoding::DELTA_BINARY_PACKED); + set_meta(3, "int_dlba", column_encoding::DELTA_LENGTH_BYTE_ARRAY); table_metadata.column_metadata[4].set_name("int_none"); - table_metadata.column_metadata[5].set_name("string_plain"); - table_metadata.column_metadata[5].set_encoding(cudf::io::column_encoding::PLAIN); - table_metadata.column_metadata[6].set_name("string_dict"); - table_metadata.column_metadata[6].set_encoding(cudf::io::column_encoding::DICTIONARY); - table_metadata.column_metadata[7].set_name("string_delta_length_byte_array"); - table_metadata.column_metadata[7].set_encoding( - cudf::io::column_encoding::DELTA_LENGTH_BYTE_ARRAY); - table_metadata.column_metadata[8].set_name("string_delta_binary_packed"); - table_metadata.column_metadata[8].set_encoding(cudf::io::column_encoding::DELTA_BINARY_PACKED); + + set_meta(5, "string_plain", column_encoding::PLAIN); + set_meta(6, "string_dict", column_encoding::DICTIONARY); + set_meta(7, "string_dlba", column_encoding::DELTA_LENGTH_BYTE_ARRAY); + set_meta(8, "string_db", column_encoding::DELTA_BINARY_PACKED); table_metadata.column_metadata[9].set_name("string_none"); for (auto& col_meta : table_metadata.column_metadata) { @@ -1482,36 +1522,30 @@ TEST_F(ParquetWriterTest, UserRequestedEncodings) // no nulls and no repetition, so the only encoding used should be for the data. // since we're writing v1, both dict and data pages should use PLAIN_DICTIONARY. + auto const expect_enc = [&fmd](int idx, cudf::io::parquet::detail::Encoding enc) { + EXPECT_EQ(fmd.row_groups[0].columns[idx].meta_data.encodings[0], enc); + }; + // requested plain - EXPECT_EQ(fmd.row_groups[0].columns[0].meta_data.encodings[0], - cudf::io::parquet::detail::Encoding::PLAIN); + expect_enc(0, Encoding::PLAIN); // requested dictionary - EXPECT_EQ(fmd.row_groups[0].columns[1].meta_data.encodings[0], - cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY); + expect_enc(1, Encoding::PLAIN_DICTIONARY); // requested delta_binary_packed - EXPECT_EQ(fmd.row_groups[0].columns[2].meta_data.encodings[0], - cudf::io::parquet::detail::Encoding::DELTA_BINARY_PACKED); + expect_enc(2, Encoding::DELTA_BINARY_PACKED); // requested delta_length_byte_array, but should fall back to dictionary - EXPECT_EQ(fmd.row_groups[0].columns[3].meta_data.encodings[0], - cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY); + expect_enc(3, Encoding::PLAIN_DICTIONARY); // no request, should fall back to dictionary - EXPECT_EQ(fmd.row_groups[0].columns[4].meta_data.encodings[0], - cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY); + expect_enc(4, Encoding::PLAIN_DICTIONARY); // requested plain - EXPECT_EQ(fmd.row_groups[0].columns[5].meta_data.encodings[0], - cudf::io::parquet::detail::Encoding::PLAIN); + expect_enc(5, Encoding::PLAIN); // requested dictionary - EXPECT_EQ(fmd.row_groups[0].columns[6].meta_data.encodings[0], - cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY); + expect_enc(6, Encoding::PLAIN_DICTIONARY); // requested delta_length_byte_array - EXPECT_EQ(fmd.row_groups[0].columns[7].meta_data.encodings[0], - cudf::io::parquet::detail::Encoding::DELTA_LENGTH_BYTE_ARRAY); + expect_enc(7, Encoding::DELTA_LENGTH_BYTE_ARRAY); // requested delta_binary_packed, but should fall back to dictionary - EXPECT_EQ(fmd.row_groups[0].columns[8].meta_data.encodings[0], - cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY); + expect_enc(8, Encoding::PLAIN_DICTIONARY); // no request, should fall back to dictionary - EXPECT_EQ(fmd.row_groups[0].columns[9].meta_data.encodings[0], - cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY); + expect_enc(9, Encoding::PLAIN_DICTIONARY); } TEST_F(ParquetWriterTest, DeltaBinaryStartsWithNulls) From 9882133d472bb640ab2ac6fbbd06ca1e9635b73a Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 5 Mar 2024 11:46:52 -0800 Subject: [PATCH 13/14] change enum name per review comment --- cpp/include/cudf/io/types.hpp | 6 +++--- cpp/src/io/parquet/page_enc.cu | 2 +- cpp/src/io/parquet/writer_impl.cu | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index adeb64c4024..5d86d004fdf 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -104,8 +104,8 @@ enum statistics_freq { */ enum class column_encoding { // common encodings - NOT_SET = -1, ///< No encoding has been requested - DICTIONARY, ///< Use dictionary encoding + USE_DEFAULT = -1, ///< No encoding has been requested, use default encoding + DICTIONARY, ///< Use dictionary encoding // parquet encodings PLAIN, ///< Use plain encoding DELTA_BINARY_PACKED, ///< Use DELTA_BINARY_PACKED encoding (only valid for integer columns) @@ -605,7 +605,7 @@ class column_in_metadata { std::optional _decimal_precision; std::optional _parquet_field_id; std::vector children; - column_encoding _encoding = column_encoding::NOT_SET; + column_encoding _encoding = column_encoding::USE_DEFAULT; public: column_in_metadata() = default; diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 508ec484457..617cb1d0992 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -790,7 +790,7 @@ CUDF_KERNEL void __launch_bounds__(128) if (t == 0) { if (not pages.empty()) { // set encoding - if (col_g.requested_encoding != column_encoding::NOT_SET) { + if (col_g.requested_encoding != column_encoding::USE_DEFAULT) { switch (col_g.requested_encoding) { case column_encoding::PLAIN: page_g.kernel_mask = encode_kernel_mask::PLAIN; break; case column_encoding::DICTIONARY: diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index e65fa03f36d..87c8b2f1611 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -609,10 +609,10 @@ std::vector construct_schema_tree( // only call this after col_schema.type has been set auto set_encoding = [&schema, parent_idx](schema_tree_node& s, column_in_metadata const& col_meta) { - s.requested_encoding = column_encoding::NOT_SET; + s.requested_encoding = column_encoding::USE_DEFAULT; if (schema[parent_idx].name != "list" and - col_meta.get_encoding() != column_encoding::NOT_SET) { + col_meta.get_encoding() != column_encoding::USE_DEFAULT) { // do some validation switch (col_meta.get_encoding()) { case column_encoding::DELTA_BINARY_PACKED: @@ -1222,7 +1222,7 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, for (auto& chunk : h_chunks) { auto const& chunk_col_desc = col_desc[chunk.col_desc_id]; auto const is_requested_non_dict = - chunk_col_desc.requested_encoding != column_encoding::NOT_SET && + chunk_col_desc.requested_encoding != column_encoding::USE_DEFAULT && chunk_col_desc.requested_encoding != column_encoding::DICTIONARY; auto const is_type_non_dict = chunk_col_desc.physical_type == Type::BOOLEAN || From ea63ec2e0a91abf67d7baf4e25983227f57b456c Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 5 Mar 2024 14:23:46 -0800 Subject: [PATCH 14/14] implement suggestion from review Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> --- cpp/include/cudf/io/types.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index 5d86d004fdf..64d627483e6 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -103,17 +103,17 @@ enum statistics_freq { * @brief Valid encodings for use with `column_in_metadata::set_encoding()` */ enum class column_encoding { - // common encodings + // Common encodings: USE_DEFAULT = -1, ///< No encoding has been requested, use default encoding DICTIONARY, ///< Use dictionary encoding - // parquet encodings + // Parquet encodings: PLAIN, ///< Use plain encoding DELTA_BINARY_PACKED, ///< Use DELTA_BINARY_PACKED encoding (only valid for integer columns) DELTA_LENGTH_BYTE_ARRAY, ///< Use DELTA_LENGTH_BYTE_ARRAY encoding (only ///< valid for BYTE_ARRAY columns) DELTA_BYTE_ARRAY, ///< Use DELTA_BYTE_ARRAY encoding (only valid for ///< BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY columns) - // orc encodings + // ORC encodings: DIRECT, ///< Use DIRECT encoding DIRECT_V2, ///< Use DIRECT_V2 encoding DICTIONARY_V2, ///< Use DICTIONARY_V2 encoding