From ef3ce4bc8db008f58249241c16c80f7e6e600fa9 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 22 Jan 2024 16:36:10 -0800 Subject: [PATCH] Fix total_byte_size in Parquet row group metadata (#14802) The `total_byte_size` field in the row group metadata should be "[t]otal byte size of all the uncompressed column data in this row group". cuDF currently populates this field with the _compressed_ size. This PR fixes that and adds a test. Authors: - Ed Seidl (https://github.com/etseidl) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Vyas Ramasubramani (https://github.com/vyasr) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/14802 --- cpp/src/io/parquet/writer_impl.cu | 2 +- cpp/tests/io/parquet_writer_test.cpp | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 417577f7b89..93b225dca1b 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -2074,7 +2074,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, need_sync = true; } - row_group.total_byte_size += ck.compressed_size; + row_group.total_byte_size += ck.bfr_size; column_chunk_meta.total_uncompressed_size = ck.bfr_size; column_chunk_meta.total_compressed_size = ck.compressed_size; } diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index 946c0e23f08..2df34c7928b 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -1401,6 +1401,33 @@ TEST_F(ParquetWriterTest, EmptyMinStringStatistics) EXPECT_EQ(max_value, std::string(max_val)); } +TEST_F(ParquetWriterTest, RowGroupMetadata) +{ + using column_type = int; + constexpr int num_rows = 1'000; + auto const ones = thrust::make_constant_iterator(1); + auto const col = + cudf::test::fixed_width_column_wrapper{ones, ones + num_rows, no_nulls()}; + auto const table = table_view({col}); + + auto const filepath = temp_env->get_temp_filepath("RowGroupMetadata.parquet"); + // force PLAIN encoding to make size calculation easier + cudf::io::parquet_writer_options opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, table) + .dictionary_policy(cudf::io::dictionary_policy::NEVER) + .compression(cudf::io::compression_type::ZSTD); + cudf::io::write_parquet(opts); + + // check row group metadata to make sure total_byte_size is the uncompressed value + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + read_footer(source, &fmd); + + ASSERT_GT(fmd.row_groups.size(), 0); + EXPECT_GE(fmd.row_groups[0].total_byte_size, + static_cast(num_rows * sizeof(column_type))); +} + // See #14772. // zStandard compression cannot currently be used with V2 page headers due to buffer // alignment issues. @@ -1416,6 +1443,7 @@ TEST_F(ParquetWriterTest, ZstdWithV2Header) EXPECT_THROW(cudf::io::write_parquet(out_opts), cudf::logic_error); } +///////////////////////////////////////////////////////////// // custom mem mapped data sink that supports device writes template class custom_test_memmap_sink : public cudf::io::data_sink {