From 35f6ebc0f7f1570d4e4bb8f404849267c0f9b02f Mon Sep 17 00:00:00 2001 From: Nikola Knezevic Date: Mon, 25 Sep 2023 13:41:20 +0200 Subject: [PATCH] Correctly account for page size Per https://github.com/xitongsys/parquet-go/issues/547#issuecomment-1730770158 In [parquet's thrift file](https://github.com/apache/parquet-format/blob/aeae80660c1d0c97314e9da837de1abdebd49c37/src/main/thrift/parquet.thrift#L642C1-L642C1), the `CompressedPageSize` field of `PageHeader` refers to ``` /** Compressed (and potentially encrypted) page size in bytes, not including this header **/ 3: required i32 compressed_page_size ``` So the following code only accounts for the page size without the header. ```go pageLocation.CompressedPageSize = page.Header.CompressedPageSize ``` While according to [parquet's thrift file](https://github.com/apache/parquet-format/blob/aeae80660c1d0c97314e9da837de1abdebd49c37/src/main/thrift/parquet.thrift#L926), the `CompressedPageSize` of `PageLocation` includes the size of the page header ``` /** * Size of the page, including header. 
Sum of compressed_page_size and header * length */ 2: required i32 compressed_page_size ``` Closes #547 --- writer/writer.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/writer/writer.go b/writer/writer.go index 3efaf5e4..f570c15c 100644 --- a/writer/writer.go +++ b/writer/writer.go @@ -440,6 +440,7 @@ func (pw *ParquetWriter) Flush(flag bool) error { } page := rowGroup.Chunks[k].Pages[l] + data := page.RawData //only record DataPage if page.Header.Type != parquet.PageType_DICTIONARY_PAGE { if page.Header.DataPageHeader == nil && page.Header.DataPageHeaderV2 == nil { @@ -473,14 +474,14 @@ func (pw *ParquetWriter) Flush(flag bool) error { pageLocation := parquet.NewPageLocation() pageLocation.Offset = pw.Offset pageLocation.FirstRowIndex = firstRowIndex - pageLocation.CompressedPageSize = page.Header.CompressedPageSize + pageLocation.CompressedPageSize = int32(len(data)) + // pageLocation.CompressedPageSize = pw.pageLocationHeaderSize + page.Header.CompressedPageSize offsetIndex.PageLocations = append(offsetIndex.PageLocations, pageLocation) firstRowIndex += int64(page.Header.DataPageHeader.NumValues) } - data := rowGroup.Chunks[k].Pages[l].RawData if _, err = pw.PFile.Write(data); err != nil { return err }