Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix total_byte_size in Parquet row group metadata #14802

Merged
merged 8 commits into from
Jan 23, 2024
Merged
2 changes: 1 addition & 1 deletion cpp/src/io/parquet/writer_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2074,7 +2074,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
need_sync = true;
}

- row_group.total_byte_size += ck.compressed_size;
+ row_group.total_byte_size += ck.bfr_size;
column_chunk_meta.total_uncompressed_size = ck.bfr_size;
column_chunk_meta.total_compressed_size = ck.compressed_size;
}
Expand Down
28 changes: 28 additions & 0 deletions cpp/tests/io/parquet_writer_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1401,6 +1401,33 @@ TEST_F(ParquetWriterTest, EmptyMinStringStatistics)
EXPECT_EQ(max_value, std::string(max_val));
}

// Writes a single non-null integer column with ZSTD compression and dictionary encoding
// disabled, then reads the footer back and checks that the first row group's
// total_byte_size is at least the raw size of the column values, i.e. that it reflects
// the uncompressed size rather than the compressed one.
TEST_F(ParquetWriterTest, RowGroupMetadata)
{
  using value_type        = int;
  constexpr int row_count = 1'000;
  // smallest possible uncompressed size: one fixed-width value per row
  constexpr auto min_uncompressed_bytes = static_cast<int64_t>(row_count * sizeof(value_type));

  // a column in which every row holds the value 1
  auto const first_one = thrust::make_constant_iterator(1);
  auto const last_one  = first_one + row_count;
  auto const input_col =
    cudf::test::fixed_width_column_wrapper<value_type>{first_one, last_one, no_nulls()};
  auto const input = table_view({input_col});

  auto const out_path = temp_env->get_temp_filepath("RowGroupMetadata.parquet");

  // dictionary encoding is turned off to force PLAIN encoding, which keeps the
  // expected size easy to compute
  cudf::io::parquet_writer_options write_opts =
    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{out_path}, input)
      .dictionary_policy(cudf::io::dictionary_policy::NEVER)
      .compression(cudf::io::compression_type::ZSTD);
  cudf::io::write_parquet(write_opts);

  // pull the file footer back in and inspect the row group metadata
  cudf::io::parquet::detail::FileMetaData footer;
  auto const file_source = cudf::io::datasource::create(out_path);
  read_footer(file_source, &footer);

  // at least one row group must exist before indexing into the list
  ASSERT_GT(footer.row_groups.size(), 0);
  auto const& first_row_group = footer.row_groups[0];
  EXPECT_GE(first_row_group.total_byte_size, min_uncompressed_bytes);
}

// See #14772.
// Zstandard compression cannot currently be used with V2 page headers due to buffer
// alignment issues.
Expand All @@ -1416,6 +1443,7 @@ TEST_F(ParquetWriterTest, ZstdWithV2Header)
EXPECT_THROW(cudf::io::write_parquet(out_opts), cudf::logic_error);
}

/////////////////////////////////////////////////////////////
// custom mem mapped data sink that supports device writes
template <bool supports_device_writes>
class custom_test_memmap_sink : public cudf::io::data_sink {
Expand Down