Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix total_byte_size in Parquet row group metadata #14802

Merged
merged 8 commits into from
Jan 23, 2024
Merged
2 changes: 1 addition & 1 deletion cpp/src/io/parquet/writer_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2074,7 +2074,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
need_sync = true;
}

row_group.total_byte_size += ck.compressed_size;
row_group.total_byte_size += ck.bfr_size;
column_chunk_meta.total_uncompressed_size = ck.bfr_size;
column_chunk_meta.total_compressed_size = ck.compressed_size;
}
Expand Down
25 changes: 25 additions & 0 deletions cpp/tests/io/parquet_writer_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1401,6 +1401,31 @@ TEST_F(ParquetWriterTest, EmptyMinStringStatistics)
EXPECT_EQ(max_value, std::string(max_val));
}

// Writes a single int column with ZSTD compression and dictionary encoding disabled, then reads
// the file footer back and checks that the first row group's total_byte_size is at least the raw
// size of the column values (num_rows * sizeof(int)), i.e. that it is not the compressed size.
TEST_F(ParquetWriterTest, RowGroupMetadata)
{
  constexpr int num_rows = 1'000;
  // every row holds the same value (1)
  auto const ones = thrust::make_constant_iterator(1);
  auto const col = cudf::test::fixed_width_column_wrapper<int>{ones, ones + num_rows, no_nulls()};
  auto const table = table_view({col});

  auto const filepath = temp_env->get_temp_filepath("RowGroupMetadata.parquet");
  // force PLAIN encoding to make size calculation easier
  cudf::io::parquet_writer_options opts =
    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, table)
      .dictionary_policy(cudf::io::dictionary_policy::NEVER)
      .compression(cudf::io::compression_type::ZSTD);
  cudf::io::write_parquet(opts);

  // check row group metadata to make sure total_byte_size is the uncompressed value
  auto const source = cudf::io::datasource::create(filepath);
  cudf::io::parquet::detail::FileMetaData fmd;
  read_footer(source, &fmd);

  // bail out before indexing row_groups[0] if the footer contains no row groups
  ASSERT_GT(fmd.row_groups.size(), 0);
  // lower bound only: num_rows * sizeof(int) counts just the column values
  // NOTE(review): presumably page headers add to the uncompressed total, hence GE rather than EQ
  EXPECT_GE(fmd.row_groups[0].total_byte_size, static_cast<int64_t>(num_rows * sizeof(int)));
}

/////////////////////////////////////////////////////////////
// custom mem mapped data sink that supports device writes
template <bool supports_device_writes>
class custom_test_memmap_sink : public cudf::io::data_sink {
Expand Down