From 3d949b54cdc42c1fec77bd0bcf2a9a98547b172e Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 17 Jan 2024 13:24:48 -0800 Subject: [PATCH 1/8] workaround for v2 alignment issues --- cpp/src/io/parquet/writer_impl.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 90f52c0ee70..c95dd305933 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -2184,6 +2184,9 @@ writer::impl::impl(std::vector> sinks, if (options.get_metadata()) { _table_meta = std::make_unique(*options.get_metadata()); } + if (_write_v2_headers and _compression == Compression::ZSTD) { + CUDF_FAIL("V2 page headers cannot be used with ZSTD compression"); + } init_state(); } From b5fc00b757589a04934a704948c63354a279c3fb Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 17 Jan 2024 15:39:11 -0800 Subject: [PATCH 2/8] get rid of some (incorrect) magic numbers --- cpp/src/io/parquet/page_enc.cu | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 12af5888d2f..f83c37353ed 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -57,6 +57,13 @@ constexpr int num_encode_warps = encode_block_size / cudf::detail::warp_size; constexpr int rolling_idx(int pos) { return rolling_index(pos); } +// max V1 header size (27 rounded up to 8 byte boundary) +// also valid for dict page header (V1 or V2) +constexpr int MAX_V1_HDR_SIZE = 32; + +// max V2 header size (49 rounded up to 8 byte boundary) +constexpr int MAX_V2_HDR_SIZE = 56; + // do not truncate statistics constexpr int32_t NO_TRUNC_STATS = 0; @@ -534,6 +541,9 @@ CUDF_KERNEL void __launch_bounds__(128) uint32_t const t = threadIdx.x; auto const data_page_type = write_v2_headers ? PageType::DATA_PAGE_V2 : PageType::DATA_PAGE; + // Max page header size excluding statistics + auto const max_page_hdr_size = write_v2_headers ? MAX_V2_HDR_SIZE : MAX_V1_HDR_SIZE; + if (t == 0) { col_g = col_desc[blockIdx.x]; ck_g = chunks[blockIdx.y][blockIdx.x]; @@ -584,7 +594,7 @@ CUDF_KERNEL void __launch_bounds__(128) page_g.chunk = &chunks[blockIdx.y][blockIdx.x]; page_g.chunk_id = blockIdx.y * num_columns + blockIdx.x; page_g.hdr_size = 0; - page_g.max_hdr_size = 32; + page_g.max_hdr_size = MAX_V1_HDR_SIZE; page_g.max_data_size = ck_g.uniq_data_size; page_g.start_row = cur_row; page_g.num_rows = ck_g.num_dict_entries; @@ -684,7 +694,7 @@ CUDF_KERNEL void __launch_bounds__(128) page_g.chunk_id = blockIdx.y * num_columns + blockIdx.x; page_g.page_type = data_page_type; page_g.hdr_size = 0; - page_g.max_hdr_size = 32; // Max size excluding statistics + page_g.max_hdr_size = max_page_hdr_size; // Max size excluding statistics if (ck_g.stats) { uint32_t stats_hdr_len = 16; if (col_g.stats_dtype == dtype_string || col_g.stats_dtype == dtype_byte_array) { From 6cb3c0c1b0ceac0eb409a11014fb1057ce0c803d Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 17 Jan 2024 15:55:23 -0800 Subject: [PATCH 3/8] better name for const --- cpp/src/io/parquet/page_enc.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index f83c37353ed..52de8e8ac40 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -542,7 +542,7 @@ CUDF_KERNEL void __launch_bounds__(128) auto const data_page_type = write_v2_headers ? PageType::DATA_PAGE_V2 : PageType::DATA_PAGE; // Max page header size excluding statistics - auto const max_page_hdr_size = write_v2_headers ? MAX_V2_HDR_SIZE : MAX_V1_HDR_SIZE; + auto const max_data_page_hdr_size = write_v2_headers ? MAX_V2_HDR_SIZE : MAX_V1_HDR_SIZE; if (t == 0) { col_g = col_desc[blockIdx.x]; @@ -694,7 +694,7 @@ CUDF_KERNEL void __launch_bounds__(128) page_g.chunk_id = blockIdx.y * num_columns + blockIdx.x; page_g.page_type = data_page_type; page_g.hdr_size = 0; - page_g.max_hdr_size = max_page_hdr_size; // Max size excluding statistics + page_g.max_hdr_size = max_data_page_hdr_size; // Max size excluding statistics if (ck_g.stats) { uint32_t stats_hdr_len = 16; if (col_g.stats_dtype == dtype_string || col_g.stats_dtype == dtype_byte_array) { From 475f4ab5a138a756e1f6c0991e4f0a58e47ea701 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 17 Jan 2024 22:20:20 -0800 Subject: [PATCH 4/8] Apply suggestions from code review Co-authored-by: Vukasin Milovanovic --- cpp/src/io/parquet/page_enc.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 52de8e8ac40..fb2cd7e4842 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -57,12 +57,12 @@ constexpr int num_encode_warps = encode_block_size / cudf::detail::warp_size; constexpr int rolling_idx(int pos) { return rolling_index(pos); } -// max V1 header size (27 rounded up to 8 byte boundary) +// max V1 header size // also valid for dict page header (V1 or V2) -constexpr int MAX_V1_HDR_SIZE = 32; +constexpr int MAX_V1_HDR_SIZE = util::round_up_unsafe(27, 8); -// max V2 header size (49 rounded up to 8 byte boundary) -constexpr int MAX_V2_HDR_SIZE = 56; +// max V2 header size +constexpr int MAX_V2_HDR_SIZE = util::round_up_unsafe(49, 8); // do not truncate statistics constexpr int32_t NO_TRUNC_STATS = 0; From 57122c26c6a9cd8f4125e4830f293f437041d418 Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 18 Jan 2024 08:16:48 -0800 Subject: [PATCH 5/8] formatting --- cpp/src/io/parquet/page_enc.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index fb2cd7e4842..3cc4fda695f 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -57,7 +57,7 @@ constexpr int num_encode_warps = encode_block_size / cudf::detail::warp_size; constexpr int rolling_idx(int pos) { return rolling_index(pos); } -// max V1 header size +// max V1 header size // also valid for dict page header (V1 or V2) constexpr int MAX_V1_HDR_SIZE = util::round_up_unsafe(27, 8); From c7a2a59534812906181ed1ecff6f21dbcf07d72f Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 18 Jan 2024 08:23:57 -0800 Subject: [PATCH 6/8] move validation to init_state() --- cpp/src/io/parquet/writer_impl.cu | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index c95dd305933..d396762603d 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -2184,9 +2184,6 @@ writer::impl::impl(std::vector> sinks, if (options.get_metadata()) { _table_meta = std::make_unique(*options.get_metadata()); } - if (_write_v2_headers and _compression == Compression::ZSTD) { - CUDF_FAIL("V2 page headers cannot be used with ZSTD compression"); - } init_state(); } @@ -2223,6 +2220,10 @@ writer::impl::~impl() { close(); } void writer::impl::init_state() { + if (_write_v2_headers and _compression == Compression::ZSTD) { + CUDF_FAIL("V2 page headers cannot be used with ZSTD compression"); + } + _current_chunk_offset.resize(_out_sink.size()); // Write file header file_header_s fhdr; From 931f4be3da9888c6d44c705ec515d271040f71be Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 18 Jan 2024 08:48:08 -0800 Subject: [PATCH 7/8] add test --- cpp/tests/io/parquet_writer_test.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index 9415e018c6a..946c0e23f08 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -1401,6 +1401,21 @@ TEST_F(ParquetWriterTest, EmptyMinStringStatistics) EXPECT_EQ(max_value, std::string(max_val)); } +// See #14772. +// zStandard compression cannot currently be used with V2 page headers due to buffer +// alignment issues. +// TODO: Remove this test when #14781 is closed. +TEST_F(ParquetWriterTest, ZstdWithV2Header) +{ + auto const expected = table_view{}; + + cudf::io::parquet_writer_options const out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{"14772.pq"}, expected) + .compression(cudf::io::compression_type::ZSTD) + .write_v2_headers(true); + EXPECT_THROW(cudf::io::write_parquet(out_opts), cudf::logic_error); +} + // custom mem mapped data sink that supports device writes template class custom_test_memmap_sink : public cudf::io::data_sink { From b5e3d7144a9681a63c52e18ba343034fac5061ac Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 18 Jan 2024 11:18:36 -0800 Subject: [PATCH 8/8] use CUDF_EXPECTS and add a comment --- cpp/src/io/parquet/writer_impl.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index d396762603d..417577f7b89 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -2220,9 +2220,9 @@ writer::impl::~impl() { close(); } void writer::impl::init_state() { - if (_write_v2_headers and _compression == Compression::ZSTD) { - CUDF_FAIL("V2 page headers cannot be used with ZSTD compression"); - } + // See issue #14781. Can remove this check once that is fixed. + CUDF_EXPECTS(not(_write_v2_headers and _compression == Compression::ZSTD), + "V2 page headers cannot be used with ZSTD compression"); _current_chunk_offset.resize(_out_sink.size()); // Write file header