From 5a5e2d3a75af57defafeb1d5ef59d449b1bd2dc0 Mon Sep 17 00:00:00 2001 From: Neville Li Date: Mon, 8 Aug 2022 22:45:48 -0400 Subject: [PATCH] Fix deprecated Parquet API usage Update usage of the following APIs in trino-parquet - Binary - ColumnChunkMetaData - ColumnDescriptor - Statistics --- .../parquet/TestParquetTimestampUtils.java | 2 +- .../TestTupleDomainParquetPredicate.java | 101 ++++++++++-------- .../BenchmarkTupleDomainParquetPredicate.java | 3 +- .../parquet/predicate/TestPredicateUtils.java | 7 +- .../parquet/reader/TestColumnReader.java | 5 +- 5 files changed, 67 insertions(+), 51 deletions(-) diff --git a/lib/trino-parquet/src/test/java/io/trino/parquet/TestParquetTimestampUtils.java b/lib/trino-parquet/src/test/java/io/trino/parquet/TestParquetTimestampUtils.java index f294b2f5ee0d..34c909b29f95 100644 --- a/lib/trino-parquet/src/test/java/io/trino/parquet/TestParquetTimestampUtils.java +++ b/lib/trino-parquet/src/test/java/io/trino/parquet/TestParquetTimestampUtils.java @@ -42,7 +42,7 @@ public void testInvalidBinaryLength() { try { byte[] invalidLengthBinaryTimestamp = new byte[8]; - decodeInt96Timestamp(Binary.fromByteArray(invalidLengthBinaryTimestamp)); + decodeInt96Timestamp(Binary.fromConstantByteArray(invalidLengthBinaryTimestamp)); } catch (TrinoException e) { assertEquals(e.getErrorCode(), NOT_SUPPORTED.toErrorCode()); diff --git a/lib/trino-parquet/src/test/java/io/trino/parquet/TestTupleDomainParquetPredicate.java b/lib/trino-parquet/src/test/java/io/trino/parquet/TestTupleDomainParquetPredicate.java index 5e93c676090d..c6a3342befe2 100644 --- a/lib/trino-parquet/src/test/java/io/trino/parquet/TestTupleDomainParquetPredicate.java +++ b/lib/trino-parquet/src/test/java/io/trino/parquet/TestTupleDomainParquetPredicate.java @@ -41,7 +41,6 @@ import org.apache.parquet.internal.column.columnindex.BoundaryOrder; import org.apache.parquet.internal.column.columnindex.ColumnIndex; import org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder; -import org.apache.parquet.io.api.Binary; import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit; import org.apache.parquet.schema.PrimitiveType; @@ -100,7 +99,6 @@ import static java.util.Collections.singletonList; import static java.util.Collections.singletonMap; import static java.util.concurrent.TimeUnit.MILLISECONDS; -import static org.apache.parquet.column.statistics.Statistics.getStatsBasedOnType; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; @@ -136,9 +134,11 @@ public void testBoolean() private static BooleanStatistics booleanColumnStats(boolean minimum, boolean maximum) { - BooleanStatistics statistics = new BooleanStatistics(); - statistics.setMinMax(minimum, maximum); - return statistics; + return (BooleanStatistics) Statistics.getBuilderForReading(Types.optional(PrimitiveTypeName.BOOLEAN).named("BooleanColumn")) + .withMin(BytesUtils.booleanToBytes(minimum)) + .withMax(BytesUtils.booleanToBytes(maximum)) + .withNumNulls(0) + .build(); } @Test @@ -350,9 +350,11 @@ public void testString() private static BinaryStatistics stringColumnStats(String minimum, String maximum) { - BinaryStatistics statistics = new BinaryStatistics(); - statistics.setMinMax(Binary.fromString(minimum), Binary.fromString(maximum)); - return statistics; + return (BinaryStatistics) Statistics.getBuilderForReading(Types.optional(BINARY).named("StringColumn")) + .withMin(minimum.getBytes(UTF_8)) + .withMax(maximum.getBytes(UTF_8)) + .withNumNulls(0) + .build(); } @Test @@ -514,9 +516,11 @@ private static long toEpochWithPrecision(LocalDateTime time, int precision) private static BinaryStatistics timestampColumnStats(LocalDateTime minimum, LocalDateTime maximum) { - BinaryStatistics statistics = new BinaryStatistics(); - statistics.setMinMax(Binary.fromConstantByteArray(toParquetEncoding(minimum)), Binary.fromConstantByteArray(toParquetEncoding(maximum))); - return statistics; + return (BinaryStatistics) Statistics.getBuilderForReading(Types.optional(BINARY).named("TimestampColumn")) + .withMin(toParquetEncoding(minimum)) + .withMax(toParquetEncoding(maximum)) + .withNumNulls(0) + .build(); } private static byte[] toParquetEncoding(LocalDateTime timestamp) @@ -544,9 +548,12 @@ public void testVarcharMatchesWithStatistics() RichColumnDescriptor column = new RichColumnDescriptor(columnDescriptor, new PrimitiveType(OPTIONAL, BINARY, "Test column")); TupleDomain effectivePredicate = getEffectivePredicate(column, createVarcharType(255), utf8Slice(value)); TupleDomainParquetPredicate parquetPredicate = new TupleDomainParquetPredicate(effectivePredicate, singletonList(column), UTC); - Statistics stats = getStatsBasedOnType(column.getPrimitiveType().getPrimitiveTypeName()); - stats.setNumNulls(1L); - stats.setMinMaxFromBytes(value.getBytes(UTF_8), value.getBytes(UTF_8)); + PrimitiveType type = column.getPrimitiveType(); + Statistics stats = Statistics.getBuilderForReading(type) + .withMin(value.getBytes(UTF_8)) + .withMax(value.getBytes(UTF_8)) + .withNumNulls(1L) + .build(); assertTrue(parquetPredicate.matches(2, ImmutableMap.of(column, stats), ID)); } @@ -582,7 +589,7 @@ public void testBigintMatchesWithStatistics() throws ParquetCorruptionException { RichColumnDescriptor column = new RichColumnDescriptor( - new ColumnDescriptor(new String[] {"path"}, INT64, 0, 0), + new ColumnDescriptor(new String[] {"path"}, Types.optional(INT64).named("Test column"), 0, 0), new PrimitiveType(OPTIONAL, INT64, "Test column")); TupleDomain effectivePredicate = TupleDomain.withColumnDomains(ImmutableMap.of( column, @@ -597,7 +604,7 @@ public void testBigintMatchesWithStatistics() @Test public void testVarcharMatchesWithDictionaryDescriptor() { - ColumnDescriptor columnDescriptor = new ColumnDescriptor(new String[] {"path"}, BINARY, 0, 0); + ColumnDescriptor columnDescriptor = new ColumnDescriptor(new String[] {"path"}, Types.optional(BINARY).named("Test column"), 0, 0); RichColumnDescriptor column = new RichColumnDescriptor(columnDescriptor, new PrimitiveType(OPTIONAL, BINARY, "Test column")); TupleDomain effectivePredicate = getEffectivePredicate(column, createVarcharType(255), EMPTY_SLICE); TupleDomainParquetPredicate parquetPredicate = new TupleDomainParquetPredicate(effectivePredicate, singletonList(column), UTC); @@ -616,7 +623,7 @@ public void testColumnIndexWithNullPages() asList(1L, 2L, 3L, 4L, 5L, 6L), toByteBufferList(null, 2L, null, 4L, null, 9L), toByteBufferList(null, 3L, null, 15L, null, 10L)); - ColumnDescriptor columnDescriptor = new ColumnDescriptor(new String[] {"path"}, INT64, 0, 0); + ColumnDescriptor columnDescriptor = new ColumnDescriptor(new String[] {"path"}, Types.optional(INT64).named("Test column"), 0, 0); RichColumnDescriptor column = new RichColumnDescriptor(columnDescriptor, new PrimitiveType(OPTIONAL, INT64, "Test column")); assertThat(getDomain(BIGINT, 200, columnIndex, new ParquetDataSourceId("test"), column, UTC)) .isEqualTo(Domain.create( @@ -634,7 +641,7 @@ private ColumnDescriptor createColumnDescriptor(PrimitiveTypeName typeName, Stri private TupleDomain getEffectivePredicate(RichColumnDescriptor column, VarcharType type, Slice value) { - ColumnDescriptor predicateColumn = new ColumnDescriptor(column.getPath(), column.getPrimitiveType().getPrimitiveTypeName(), 0, 0); + ColumnDescriptor predicateColumn = new ColumnDescriptor(column.getPath(), column.getPrimitiveType(), 0, 0); Domain predicateDomain = singleValue(type, value); Map predicateColumns = singletonMap(predicateColumn, predicateDomain); return withColumnDomains(predicateColumns); @@ -647,12 +654,11 @@ private static FloatStatistics floatColumnStats(float minimum, float maximum) private static FloatStatistics floatColumnStats(float minimum, float maximum, boolean hasNulls) { - FloatStatistics statistics = new FloatStatistics(); - statistics.setMinMax(minimum, maximum); - if (hasNulls) { - statistics.setNumNulls(1); - } - return statistics; + return (FloatStatistics) Statistics.getBuilderForReading(Types.optional(FLOAT).named("FloatColumn")) + .withMin(BytesUtils.longToBytes(Float.floatToRawIntBits(minimum))) + .withMax(BytesUtils.longToBytes(Float.floatToRawIntBits(maximum))) + .withNumNulls(hasNulls ? 1 : 0) + .build(); } private static DoubleStatistics doubleColumnStats(double minimum, double maximum) @@ -662,26 +668,29 @@ private static DoubleStatistics doubleColumnStats(double minimum, double maximum private static DoubleStatistics doubleColumnStats(double minimum, double maximum, boolean hasNulls) { - DoubleStatistics statistics = new DoubleStatistics(); - statistics.setMinMax(minimum, maximum); - if (hasNulls) { - statistics.setNumNulls(1); - } - return statistics; + return (DoubleStatistics) Statistics.getBuilderForReading(Types.optional(PrimitiveTypeName.DOUBLE).named("DoubleColumn")) + .withMin(BytesUtils.longToBytes(Double.doubleToLongBits(minimum))) + .withMax(BytesUtils.longToBytes(Double.doubleToLongBits(maximum))) + .withNumNulls(hasNulls ? 1 : 0) + .build(); } private static IntStatistics intColumnStats(int minimum, int maximum) { - IntStatistics statistics = new IntStatistics(); - statistics.setMinMax(minimum, maximum); - return statistics; + return (IntStatistics) Statistics.getBuilderForReading(Types.optional(INT32).named("IntColumn")) + .withMin(BytesUtils.intToBytes(minimum)) + .withMax(BytesUtils.intToBytes(maximum)) + .withNumNulls(0) + .build(); } private static LongStatistics longColumnStats(long minimum, long maximum) { - LongStatistics statistics = new LongStatistics(); - statistics.setMinMax(minimum, maximum); - return statistics; + return (LongStatistics) Statistics.getBuilderForReading(Types.optional(INT64).named("LongColumn")) + .withMin(BytesUtils.longToBytes(minimum)) + .withMax(BytesUtils.longToBytes(maximum)) + .withNumNulls(0) + .build(); } private static BinaryStatistics binaryColumnStats(long minimum, long maximum) @@ -691,18 +700,18 @@ private static BinaryStatistics binaryColumnStats(long minimum, long maximum) private static BinaryStatistics binaryColumnStats(BigInteger minimum, BigInteger maximum) { - BinaryStatistics statistics = new BinaryStatistics(); - statistics.setMinMax( - Binary.fromConstantByteArray(minimum.toByteArray()), - Binary.fromConstantByteArray(maximum.toByteArray())); - return statistics; + return (BinaryStatistics) Statistics.getBuilderForReading(Types.optional(BINARY).named("BinaryColumn")) + .withMin(minimum.toByteArray()) + .withMax(maximum.toByteArray()) + .withNumNulls(0) + .build(); } private static LongStatistics longOnlyNullsStats(long numNulls) { - LongStatistics statistics = new LongStatistics(); - statistics.setNumNulls(numNulls); - return statistics; + return (LongStatistics) Statistics.getBuilderForReading(Types.optional(INT64).named("LongColumn")) + .withNumNulls(numNulls) + .build(); } private DictionaryDescriptor floatDictionaryDescriptor(float... values) @@ -715,7 +724,7 @@ private DictionaryDescriptor floatDictionaryDescriptor(float... values) } } return new DictionaryDescriptor( - new ColumnDescriptor(new String[] {"dummy"}, new PrimitiveType(OPTIONAL, PrimitiveType.PrimitiveTypeName.FLOAT, 0, ""), 1, 1), + new ColumnDescriptor(new String[] {"dummy"}, new PrimitiveType(OPTIONAL, FLOAT, 0, "FloatColumn"), 1, 1), Optional.of(new DictionaryPage(Slices.wrappedBuffer(buf.toByteArray()), values.length, PLAIN_DICTIONARY))); } @@ -729,7 +738,7 @@ private DictionaryDescriptor doubleDictionaryDescriptor(double... values) } } return new DictionaryDescriptor( - new ColumnDescriptor(new String[] {"dummy"}, new PrimitiveType(OPTIONAL, PrimitiveType.PrimitiveTypeName.DOUBLE, 0, ""), 1, 1), + new ColumnDescriptor(new String[] {"dummy"}, new PrimitiveType(OPTIONAL, PrimitiveTypeName.DOUBLE, 0, "DoubleColumn"), 1, 1), Optional.of(new DictionaryPage(Slices.wrappedBuffer(buf.toByteArray()), values.length, PLAIN_DICTIONARY))); } diff --git a/lib/trino-parquet/src/test/java/io/trino/parquet/predicate/BenchmarkTupleDomainParquetPredicate.java b/lib/trino-parquet/src/test/java/io/trino/parquet/predicate/BenchmarkTupleDomainParquetPredicate.java index 2ac21d919cbc..2e0a192e2f5d 100644 --- a/lib/trino-parquet/src/test/java/io/trino/parquet/predicate/BenchmarkTupleDomainParquetPredicate.java +++ b/lib/trino-parquet/src/test/java/io/trino/parquet/predicate/BenchmarkTupleDomainParquetPredicate.java @@ -19,6 +19,7 @@ import io.trino.parquet.ParquetEncoding; import io.trino.spi.predicate.Domain; import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.schema.Types; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; import org.openjdk.jmh.annotations.Fork; @@ -91,7 +92,7 @@ private DictionaryDescriptor createBigintDictionary() } return new DictionaryDescriptor( - new ColumnDescriptor(new String[] {"path"}, INT64, 0, 0), + new ColumnDescriptor(new String[] {"path"}, Types.optional(INT64).named("Test column"), 0, 0), Optional.of( new DictionaryPage( slice, diff --git a/lib/trino-parquet/src/test/java/io/trino/parquet/predicate/TestPredicateUtils.java b/lib/trino-parquet/src/test/java/io/trino/parquet/predicate/TestPredicateUtils.java index 06a048d1318d..dfe102049b84 100644 --- a/lib/trino-parquet/src/test/java/io/trino/parquet/predicate/TestPredicateUtils.java +++ b/lib/trino-parquet/src/test/java/io/trino/parquet/predicate/TestPredicateUtils.java @@ -17,7 +17,10 @@ import org.apache.parquet.column.Encoding; import org.apache.parquet.column.EncodingStats; import org.apache.parquet.column.statistics.BinaryStatistics; +import org.apache.parquet.column.statistics.Statistics; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Types; import org.testng.annotations.Test; import java.util.Set; @@ -108,7 +111,9 @@ private ColumnChunkMetaData createColumnMetaDataV2(Encoding... dataEncodings) .addDictEncoding(PLAIN) .addDataEncodings(ImmutableSet.copyOf(dataEncodings)).build(); - return ColumnChunkMetaData.get(fromDotString("column"), BINARY, UNCOMPRESSED, encodingStats, encodingStats.getDataEncodings(), new BinaryStatistics(), 0, 0, 1, 1, 1); + PrimitiveType type = Types.optional(BINARY).named(""); + Statistics stats = Statistics.createStats(type); + return ColumnChunkMetaData.get(fromDotString("column"), type, UNCOMPRESSED, encodingStats, encodingStats.getDataEncodings(), stats, 0, 0, 1, 1, 1); } @SuppressWarnings("deprecation") diff --git a/lib/trino-parquet/src/test/java/io/trino/parquet/reader/TestColumnReader.java b/lib/trino-parquet/src/test/java/io/trino/parquet/reader/TestColumnReader.java index b9e1cda0282c..ae3052666fac 100644 --- a/lib/trino-parquet/src/test/java/io/trino/parquet/reader/TestColumnReader.java +++ b/lib/trino-parquet/src/test/java/io/trino/parquet/reader/TestColumnReader.java @@ -22,11 +22,12 @@ import io.trino.spi.block.Block; import org.apache.parquet.bytes.HeapByteBufferAllocator; import org.apache.parquet.column.ColumnDescriptor; -import org.apache.parquet.column.statistics.IntStatistics; +import org.apache.parquet.column.statistics.Statistics; import org.apache.parquet.column.values.plain.PlainValuesWriter; import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridEncoder; import org.apache.parquet.internal.filter2.columnindex.RowRanges; import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Types; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -248,7 +249,7 @@ private static DataPage createDataPage(RowRange pageRowRange) Slices.wrappedBuffer(encodePlainValues(values)), valueCount * 4, OptionalLong.of(start), - new IntStatistics(), + Statistics.createStats(Types.optional(INT32).named("TestColumn")), false); }