Skip to content

Commit

Permalink
Fix reporting of row group size by parquet writer
Browse files — browse the repository at this point in the history
Output of parquet-tools
Before
Row group 25:  count: 2240892  26.32 B records  start: 1479039171  total(compressed): 56.243 MB total(uncompressed):56.243 MB

After
Row group 25:  count: 2244256  26.34 B records  start: 1479178837  total(compressed): 56.370 MB total(uncompressed):167.418 MB
  • Loading branch information
raunaqmorarka committed May 11, 2023
1 parent a212bf1 commit decd761
Showing 1 changed file with 3 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -374,9 +374,10 @@ Slice getFooter(List<RowGroup> rowGroups, MessageType messageType)

// Builds a RowGroup from the given column chunk metadata and appends it to rowGroupBuilder.
//
// NOTE(review): this is a rendered diff hunk whose +/- markers were lost, so the pre-commit and
// post-commit lines appear together (totalBytes is declared twice, rowGroupBuilder.add is called twice).
// The page header reports 3 additions and 2 deletions; the [old]/[new] tags below are the reading
// consistent with that count and with the commit message — confirm against the real diff.
private void updateRowGroups(List<ColumnMetaData> columnMetaData)
{
// [old] presumably removed: totalBytes was the sum of the *compressed* column sizes, which matches
// the "Before" output where total(uncompressed) equals total(compressed).
long totalBytes = columnMetaData.stream().mapToLong(ColumnMetaData::getTotal_compressed_size).sum();
// [new] presumably added: the compressed sum is kept under its own name...
long totalCompressedBytes = columnMetaData.stream().mapToLong(ColumnMetaData::getTotal_compressed_size).sum();
// [new] ...and totalBytes now sums the *uncompressed* column sizes.
long totalBytes = columnMetaData.stream().mapToLong(ColumnMetaData::getTotal_uncompressed_size).sum();
// Unchanged context: convert each ColumnMetaData to a ColumnChunk via toColumnChunk.
ImmutableList<org.apache.parquet.format.ColumnChunk> columnChunks = columnMetaData.stream().map(ParquetWriter::toColumnChunk).collect(toImmutableList());
// [old] presumably removed: row group built from (columnChunks, totalBytes, rows) only.
rowGroupBuilder.add(new RowGroup(columnChunks, totalBytes, rows));
// [new] presumably added: same constructor call, then the compressed total is set explicitly
// through setTotal_compressed_size.
rowGroupBuilder.add(new RowGroup(columnChunks, totalBytes, rows).setTotal_compressed_size(totalCompressedBytes));
}

private static org.apache.parquet.format.ColumnChunk toColumnChunk(ColumnMetaData metaData)
Expand Down

0 comments on commit decd761

Please sign in to comment.