From f24f0b528b16454a2b79182f77bb46a663ab2c25 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 22 Jan 2024 10:42:25 -0800 Subject: [PATCH] Work around incompatibilities between V2 page header handling and zStandard compression in Parquet writer (#14772) In the version 2 Parquet page header, neither the repetition nor definition level data is compressed. The current Parquet writer achieves this by offsetting the input buffers passed to nvcomp to skip this level data. Doing so can lead to mis-aligned data being passed to nvcomp (for zstd, input currently must be aligned on a 4 byte boundary). This PR is a short-term fix that will print an error and exit if zStandard compression is used with V2 page headers. This also fixes an underestimation of the maximum V2 page header size. Authors: - Ed Seidl (https://github.com/etseidl) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/14772 --- cpp/src/io/parquet/page_enc.cu | 14 ++++++++++++-- cpp/src/io/parquet/writer_impl.cu | 4 ++++ cpp/tests/io/parquet_writer_test.cpp | 15 +++++++++++++++ 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 12af5888d2f..3cc4fda695f 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -57,6 +57,13 @@ constexpr int num_encode_warps = encode_block_size / cudf::detail::warp_size; constexpr int rolling_idx(int pos) { return rolling_index(pos); } +// max V1 header size +// also valid for dict page header (V1 or V2) +constexpr int MAX_V1_HDR_SIZE = util::round_up_unsafe(27, 8); + +// max V2 header size +constexpr int MAX_V2_HDR_SIZE = util::round_up_unsafe(49, 8); + // do not truncate statistics constexpr int32_t NO_TRUNC_STATS = 0; @@ -534,6 +541,9 @@ CUDF_KERNEL void __launch_bounds__(128) uint32_t const t = threadIdx.x; auto const data_page_type = write_v2_headers ? PageType::DATA_PAGE_V2 : PageType::DATA_PAGE; + // Max page header size excluding statistics + auto const max_data_page_hdr_size = write_v2_headers ? MAX_V2_HDR_SIZE : MAX_V1_HDR_SIZE; + if (t == 0) { col_g = col_desc[blockIdx.x]; ck_g = chunks[blockIdx.y][blockIdx.x]; @@ -584,7 +594,7 @@ CUDF_KERNEL void __launch_bounds__(128) page_g.chunk = &chunks[blockIdx.y][blockIdx.x]; page_g.chunk_id = blockIdx.y * num_columns + blockIdx.x; page_g.hdr_size = 0; - page_g.max_hdr_size = 32; + page_g.max_hdr_size = MAX_V1_HDR_SIZE; page_g.max_data_size = ck_g.uniq_data_size; page_g.start_row = cur_row; page_g.num_rows = ck_g.num_dict_entries; @@ -684,7 +694,7 @@ CUDF_KERNEL void __launch_bounds__(128) page_g.chunk_id = blockIdx.y * num_columns + blockIdx.x; page_g.page_type = data_page_type; page_g.hdr_size = 0; - page_g.max_hdr_size = 32; // Max size excluding statistics + page_g.max_hdr_size = max_data_page_hdr_size; // Max size excluding statistics if (ck_g.stats) { uint32_t stats_hdr_len = 16; if (col_g.stats_dtype == dtype_string || col_g.stats_dtype == dtype_byte_array) { diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 90f52c0ee70..417577f7b89 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -2220,6 +2220,10 @@ writer::impl::~impl() { close(); } void writer::impl::init_state() { + // See issue #14781. Can remove this check once that is fixed. + CUDF_EXPECTS(not(_write_v2_headers and _compression == Compression::ZSTD), + "V2 page headers cannot be used with ZSTD compression"); + _current_chunk_offset.resize(_out_sink.size()); // Write file header file_header_s fhdr; diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index 9415e018c6a..946c0e23f08 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -1401,6 +1401,21 @@ TEST_F(ParquetWriterTest, EmptyMinStringStatistics) EXPECT_EQ(max_value, std::string(max_val)); } +// See #14772. +// zStandard compression cannot currently be used with V2 page headers due to buffer +// alignment issues. +// TODO: Remove this test when #14781 is closed. +TEST_F(ParquetWriterTest, ZstdWithV2Header) +{ + auto const expected = table_view{}; + + cudf::io::parquet_writer_options const out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{"14772.pq"}, expected) + .compression(cudf::io::compression_type::ZSTD) + .write_v2_headers(true); + EXPECT_THROW(cudf::io::write_parquet(out_opts), cudf::logic_error); +} + // custom mem mapped data sink that supports device writes template class custom_test_memmap_sink : public cudf::io::data_sink {