diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 417577f7b89..93b225dca1b 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -2074,7 +2074,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, need_sync = true; } - row_group.total_byte_size += ck.compressed_size; + row_group.total_byte_size += ck.bfr_size; column_chunk_meta.total_uncompressed_size = ck.bfr_size; column_chunk_meta.total_compressed_size = ck.compressed_size; } diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index 946c0e23f08..2df34c7928b 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -1401,6 +1401,33 @@ TEST_F(ParquetWriterTest, EmptyMinStringStatistics) EXPECT_EQ(max_value, std::string(max_val)); } +TEST_F(ParquetWriterTest, RowGroupMetadata) +{ + using column_type = int; + constexpr int num_rows = 1'000; + auto const ones = thrust::make_constant_iterator(1); + auto const col = + cudf::test::fixed_width_column_wrapper{ones, ones + num_rows, no_nulls()}; + auto const table = table_view({col}); + + auto const filepath = temp_env->get_temp_filepath("RowGroupMetadata.parquet"); + // force PLAIN encoding to make size calculation easier + cudf::io::parquet_writer_options opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, table) + .dictionary_policy(cudf::io::dictionary_policy::NEVER) + .compression(cudf::io::compression_type::ZSTD); + cudf::io::write_parquet(opts); + + // check row group metadata to make sure total_byte_size is the uncompressed value + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + read_footer(source, &fmd); + + ASSERT_GT(fmd.row_groups.size(), 0); + EXPECT_GE(fmd.row_groups[0].total_byte_size, + static_cast(num_rows * sizeof(column_type))); +} + // See #14772. // zStandard compression cannot currently be used with V2 page headers due to buffer // alignment issues. @@ -1416,6 +1443,7 @@ TEST_F(ParquetWriterTest, ZstdWithV2Header) EXPECT_THROW(cudf::io::write_parquet(out_opts), cudf::logic_error); } +///////////////////////////////////////////////////////////// // custom mem mapped data sink that supports device writes template class custom_test_memmap_sink : public cudf::io::data_sink {