diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 5f50f0058..b7b4321ef 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -191,6 +191,52 @@ enum FieldRepetitionType { REPEATED = 2; } +/** + * A structure for capturing metadata for estimating the unencoded, + * uncompressed size of data written. This is useful for readers to estimate + * how much memory is needed to reconstruct data in their memory model and for + * fine grained filter pushdown on nested structures (the histograms contained + * in this structure can help determine the number of nulls at a particular + * nesting level and maximum length of lists). + */ +struct SizeStatistics { + /** + * The number of physical bytes stored for BYTE_ARRAY data values assuming + * no encoding. This is exclusive of the bytes needed to store the length of + * each byte array. In other words, this field is equivalent to the `(size + * of PLAIN-ENCODING the byte array values) - (4 bytes * number of values + * written)`. To determine unencoded sizes of other types readers can use + * schema information multiplied by the number of non-null and null values. + * The number of null/non-null values can be inferred from the histograms + * below. + * + * For example, if a column chunk is dictionary-encoded with dictionary + * ["a", "bc", "cde"], and a data page contains the indices [0, 0, 1, 2], + * then this value for that data page should be 7 (1 + 1 + 2 + 3). + * + * This field should only be set for types that use BYTE_ARRAY as their + * physical type. + */ + 1: optional i64 unencoded_byte_array_data_bytes; + /** + * When present, there is expected to be one element corresponding to each + * repetition (i.e. size=max repetition_level+1) where each element + * represents the number of times the repetition level was observed in the + * data. + * + * This field may be omitted if max_repetition_level is 0 without loss + * of information. + **/ + 2: optional list repetition_level_histogram; + /** + * Same as repetition_level_histogram except for definition levels. + * + * This field may be omitted if max_definition_level is 0 or 1 without + * loss of information. + **/ + 3: optional list definition_level_histogram; +} + /** * Statistics per row group and per page * All fields are optional. @@ -529,7 +575,7 @@ struct DataPageHeader { /** Encoding used for repetition levels **/ 4: required Encoding repetition_level_encoding; - /** Optional statistics for the data in this page**/ + /** Optional statistics for the data in this page **/ 5: optional Statistics statistics; } @@ -571,19 +617,19 @@ struct DataPageHeaderV2 { // repetition levels and definition levels are always using RLE (without size in it) - /** length of the definition levels */ + /** Length of the definition levels */ 5: required i32 definition_levels_byte_length; - /** length of the repetition levels */ + /** Length of the repetition levels */ 6: required i32 repetition_levels_byte_length; - /** whether the values are compressed. + /** Whether the values are compressed. Which means the section of the page between definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included) is compressed with the compression_codec. If missing it is considered compressed */ 7: optional bool is_compressed = true; - /** optional statistics for the data in this page **/ + /** Optional statistics for the data in this page **/ 8: optional Statistics statistics; } @@ -596,11 +642,11 @@ union BloomFilterAlgorithm { } /** Hash strategy type annotation. xxHash is an extremely fast non-cryptographic hash - * algorithm. It uses 64 bits version of xxHash. + * algorithm. It uses 64 bits version of xxHash. **/ struct XxHash {} -/** +/** * The hash function used in Bloom filter. This function takes the hash of a column value * using plain encoding. **/ @@ -764,6 +810,14 @@ struct ColumnMetaData { * in a single I/O. */ 15: optional i32 bloom_filter_length; + + /** + * Optional statistics to help estimate total memory when converted to in-memory + * representations. The histograms contained in these statistics can + * also be useful in some cases for more fine-grained nullability/list length + * filter pushdown. + */ + 16: optional SizeStatistics size_statistics; } struct EncryptionWithFooterKey { @@ -772,7 +826,7 @@ struct EncryptionWithFooterKey { struct EncryptionWithColumnKey { /** Column path in schema **/ 1: required list path_in_schema - + /** Retrieval metadata of column encryption key **/ 2: optional binary key_metadata } @@ -811,7 +865,7 @@ struct ColumnChunk { /** Crypto metadata of encrypted columns **/ 8: optional ColumnCryptoMetaData crypto_metadata - + /** Encrypted column metadata for this chunk **/ 9: optional binary encrypted_column_metadata } @@ -938,6 +992,13 @@ struct OffsetIndex { * that page_locations[i].first_row_index < page_locations[i+1].first_row_index. */ 1: required list page_locations + /** + * Unencoded/uncompressed size for BYTE_ARRAY types. + * + * See documention for unencoded_byte_array_data_bytes in SizeStatistics for + * more details on this field. + */ + 2: optional list unencoded_byte_array_data_bytes } /** @@ -977,6 +1038,25 @@ struct ColumnIndex { /** A list containing the number of null values for each page **/ 5: optional list null_counts + + /** + * Contains repetition level histograms for each page + * concatenated together. The repetition_level_histogram field on + * SizeStatistics contains more details. + * + * When present the length should always be (number of pages * + * (max_repetition_level + 1)) elements. + * + * Element 0 is the first element of the histogram for the first page. + * Element (max_repetition_level + 1) is the first element of the histogram + * for the second page. + **/ + 6: optional list repetition_level_histograms; + /** + * Same as repetition_level_histograms except for definitions levels. + **/ + 7: optional list definition_level_histograms; + } struct AesGcmV1 { @@ -985,7 +1065,7 @@ struct AesGcmV1 { /** Unique file identifier part of AAD suffix **/ 2: optional binary aad_file_unique - + /** In files encrypted with AAD prefix without storing it, * readers must supply the prefix **/ 3: optional bool supply_aad_prefix @@ -997,7 +1077,7 @@ struct AesGcmCtrV1 { /** Unique file identifier part of AAD suffix **/ 2: optional binary aad_file_unique - + /** In files encrypted with AAD prefix without storing it, * readers must supply the prefix **/ 3: optional bool supply_aad_prefix