Skip to content

Commit

Permalink
Work around incompatibilities between V2 page header handling and zSt…
Browse files Browse the repository at this point in the history
…andard compression in Parquet writer (#14772)

In the version 2 Parquet page header, neither the repetition nor definition level data is compressed. The current Parquet writer achieves this by offsetting the input buffers passed to nvcomp to skip this level data. Doing so can lead to mis-aligned data being passed to nvcomp (for zstd, input currently must be aligned on a 4 byte boundary). This PR is a short-term fix that will print an error and exit if zStandard compression is used with V2 page headers. This also fixes an underestimation of the maximum V2 page header size.

Authors:
  - Ed Seidl (https://github.com/etseidl)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - David Wendt (https://github.com/davidwendt)

URL: #14772
  • Loading branch information
etseidl authored Jan 22, 2024
1 parent 8fdc62b commit f24f0b5
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 2 deletions.
14 changes: 12 additions & 2 deletions cpp/src/io/parquet/page_enc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,13 @@ constexpr int num_encode_warps = encode_block_size / cudf::detail::warp_size;

constexpr int rolling_idx(int pos) { return rolling_index<rle_buffer_size>(pos); }

// max V1 header size
// also valid for dict page header (V1 or V2)
constexpr int MAX_V1_HDR_SIZE = util::round_up_unsafe(27, 8);

// max V2 header size
constexpr int MAX_V2_HDR_SIZE = util::round_up_unsafe(49, 8);

// do not truncate statistics
constexpr int32_t NO_TRUNC_STATS = 0;

Expand Down Expand Up @@ -534,6 +541,9 @@ CUDF_KERNEL void __launch_bounds__(128)
uint32_t const t = threadIdx.x;
auto const data_page_type = write_v2_headers ? PageType::DATA_PAGE_V2 : PageType::DATA_PAGE;

// Max page header size excluding statistics
auto const max_data_page_hdr_size = write_v2_headers ? MAX_V2_HDR_SIZE : MAX_V1_HDR_SIZE;

if (t == 0) {
col_g = col_desc[blockIdx.x];
ck_g = chunks[blockIdx.y][blockIdx.x];
Expand Down Expand Up @@ -584,7 +594,7 @@ CUDF_KERNEL void __launch_bounds__(128)
page_g.chunk = &chunks[blockIdx.y][blockIdx.x];
page_g.chunk_id = blockIdx.y * num_columns + blockIdx.x;
page_g.hdr_size = 0;
page_g.max_hdr_size = 32;
page_g.max_hdr_size = MAX_V1_HDR_SIZE;
page_g.max_data_size = ck_g.uniq_data_size;
page_g.start_row = cur_row;
page_g.num_rows = ck_g.num_dict_entries;
Expand Down Expand Up @@ -684,7 +694,7 @@ CUDF_KERNEL void __launch_bounds__(128)
page_g.chunk_id = blockIdx.y * num_columns + blockIdx.x;
page_g.page_type = data_page_type;
page_g.hdr_size = 0;
page_g.max_hdr_size = 32; // Max size excluding statistics
page_g.max_hdr_size = max_data_page_hdr_size; // Max size excluding statistics
if (ck_g.stats) {
uint32_t stats_hdr_len = 16;
if (col_g.stats_dtype == dtype_string || col_g.stats_dtype == dtype_byte_array) {
Expand Down
4 changes: 4 additions & 0 deletions cpp/src/io/parquet/writer_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2220,6 +2220,10 @@ writer::impl::~impl() { close(); }

void writer::impl::init_state()
{
// See issue #14781. Can remove this check once that is fixed.
CUDF_EXPECTS(not(_write_v2_headers and _compression == Compression::ZSTD),
"V2 page headers cannot be used with ZSTD compression");

_current_chunk_offset.resize(_out_sink.size());
// Write file header
file_header_s fhdr;
Expand Down
15 changes: 15 additions & 0 deletions cpp/tests/io/parquet_writer_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1401,6 +1401,21 @@ TEST_F(ParquetWriterTest, EmptyMinStringStatistics)
EXPECT_EQ(max_value, std::string(max_val));
}

// See #14772.
// zStandard compression cannot currently be used with V2 page headers due to buffer
// alignment issues.
// TODO: Remove this test when #14781 is closed.
TEST_F(ParquetWriterTest, ZstdWithV2Header)
{
auto const expected = table_view{};

cudf::io::parquet_writer_options const out_opts =
cudf::io::parquet_writer_options::builder(cudf::io::sink_info{"14772.pq"}, expected)
.compression(cudf::io::compression_type::ZSTD)
.write_v2_headers(true);
EXPECT_THROW(cudf::io::write_parquet(out_opts), cudf::logic_error);
}

// custom mem mapped data sink that supports device writes
template <bool supports_device_writes>
class custom_test_memmap_sink : public cudf::io::data_sink {
Expand Down

0 comments on commit f24f0b5

Please sign in to comment.