From 10138dc3212dcc1d22d94d89efddb71aa8b8d441 Mon Sep 17 00:00:00 2001 From: Dain Sundstrom Date: Thu, 13 Oct 2022 21:06:54 -0700 Subject: [PATCH 1/4] Simplify HiveType isSupported --- .../java/io/trino/plugin/hive/HiveType.java | 72 +++++++++++-------- 1 file changed, 43 insertions(+), 29 deletions(-) diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveType.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveType.java index 210d05c3cc94..5f16bb4a8099 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveType.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveType.java @@ -44,7 +44,6 @@ import static io.trino.plugin.hive.util.HiveTypeTranslator.UNION_FIELD_FIELD_PREFIX; import static io.trino.plugin.hive.util.HiveTypeTranslator.UNION_FIELD_TAG_NAME; import static io.trino.plugin.hive.util.HiveTypeTranslator.UNION_FIELD_TAG_TYPE; -import static io.trino.plugin.hive.util.HiveTypeTranslator.fromPrimitiveType; import static io.trino.plugin.hive.util.HiveTypeTranslator.toTypeInfo; import static io.trino.plugin.hive.util.HiveTypeTranslator.toTypeSignature; import static io.trino.plugin.hive.util.SerdeConstants.BIGINT_TYPE_NAME; @@ -162,35 +161,50 @@ public boolean isSupportedType(StorageFormat storageFormat) return isSupportedType(getTypeInfo(), storageFormat); } - public static boolean isSupportedType(TypeInfo typeInfo, StorageFormat storageFormat) + private static boolean isSupportedType(TypeInfo typeInfo, StorageFormat storageFormat) { - switch (typeInfo.getCategory()) { - case PRIMITIVE: - return fromPrimitiveType((PrimitiveTypeInfo) typeInfo) != null; - case MAP: - MapTypeInfo mapTypeInfo = (MapTypeInfo) typeInfo; - return isSupportedType(mapTypeInfo.getMapKeyTypeInfo(), storageFormat) && isSupportedType(mapTypeInfo.getMapValueTypeInfo(), storageFormat); - case LIST: - ListTypeInfo listTypeInfo = (ListTypeInfo) typeInfo; - return isSupportedType(listTypeInfo.getListElementTypeInfo(), storageFormat); - case STRUCT: - StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo; - return structTypeInfo.getAllStructFieldTypeInfos().stream() - .allMatch(fieldTypeInfo -> isSupportedType(fieldTypeInfo, storageFormat)); - case UNION: - // This feature (reading uniontypes as structs) has only been verified against Avro and ORC tables. Here's a discussion: - // 1. Avro tables are supported and verified. - // 2. ORC tables are supported and verified. - // 3. The Parquet format doesn't support uniontypes itself so there's no need to add support for it in Trino. - // 4. TODO: RCFile tables are not supported yet. - // 5. TODO: The support for Avro is done in SerDeUtils so it's possible that formats other than Avro are also supported. But verification is needed. - if (storageFormat.getSerde().equalsIgnoreCase(AVRO.getSerde()) || storageFormat.getSerde().equalsIgnoreCase(ORC.getSerde())) { - UnionTypeInfo unionTypeInfo = (UnionTypeInfo) typeInfo; - return unionTypeInfo.getAllUnionObjectTypeInfos().stream() - .allMatch(fieldTypeInfo -> isSupportedType(fieldTypeInfo, storageFormat)); - } - } - return false; + return switch (typeInfo.getCategory()) { + case PRIMITIVE -> isSupported((PrimitiveTypeInfo) typeInfo); + case MAP -> isSupportedType(((MapTypeInfo) typeInfo).getMapKeyTypeInfo(), storageFormat) && + isSupportedType(((MapTypeInfo) typeInfo).getMapValueTypeInfo(), storageFormat); + case LIST -> isSupportedType(((ListTypeInfo) typeInfo).getListElementTypeInfo(), storageFormat); + case STRUCT -> ((StructTypeInfo) typeInfo).getAllStructFieldTypeInfos().stream().allMatch(fieldTypeInfo -> isSupportedType(fieldTypeInfo, storageFormat)); + case UNION -> + // This feature (reading union types as structs) has only been verified against Avro and ORC tables. Here's a discussion: + // 1. Avro tables are supported and verified. + // 2. ORC tables are supported and verified. + // 3. The Parquet format doesn't support union types itself so there's no need to add support for it in Trino. + // 4. TODO: RCFile tables are not supported yet. + // 5. TODO: The support for Avro is done in SerDeUtils so it's possible that formats other than Avro are also supported. But verification is needed. + storageFormat.getSerde().equalsIgnoreCase(AVRO.getSerde()) || + storageFormat.getSerde().equalsIgnoreCase(ORC.getSerde()) || + ((UnionTypeInfo) typeInfo).getAllUnionObjectTypeInfos().stream().allMatch(fieldTypeInfo -> isSupportedType(fieldTypeInfo, storageFormat)); + }; + } + + private static boolean isSupported(PrimitiveTypeInfo typeInfo) + { + return switch (typeInfo.getPrimitiveCategory()) { + case BOOLEAN, + BYTE, + SHORT, + INT, + LONG, + FLOAT, + DOUBLE, + STRING, + VARCHAR, + CHAR, + DATE, + TIMESTAMP, + BINARY, + DECIMAL -> true; + case TIMESTAMPLOCALTZ, + INTERVAL_YEAR_MONTH, + INTERVAL_DAY_TIME, + VOID, + UNKNOWN -> false; + }; } @JsonCreator From a69b2348863964fe1a409847b2406513af4c6b67 Mon Sep 17 00:00:00 2001 From: Dain Sundstrom Date: Wed, 19 Apr 2023 15:37:19 -0700 Subject: [PATCH 2/4] Cleanup HiveType --- .../trino-hive/src/main/java/io/trino/plugin/hive/HiveType.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveType.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveType.java index 5f16bb4a8099..691579009aca 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveType.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/HiveType.java @@ -250,7 +250,7 @@ else if (typeInfo instanceof UnionTypeInfo unionTypeInfo) { try { if (fieldIndex == 0) { // union's tag field, defined in {@link io.trino.plugin.hive.util.HiveTypeTranslator#toTypeSignature} - return Optional.of(HiveType.toHiveType(UNION_FIELD_TAG_TYPE)); + return Optional.of(toHiveType(UNION_FIELD_TAG_TYPE)); } else { typeInfo = unionTypeInfo.getAllUnionObjectTypeInfos().get(fieldIndex - 1); From d49def930fc2ea82626689c2d12f1e9ab07bd187 Mon Sep 17 00:00:00 2001 From: Dain Sundstrom Date: Thu, 13 Oct 2022 21:43:31 -0700 Subject: [PATCH 3/4] Remove HiveUtil isStructuralType for HiveType --- .../java/io/trino/plugin/hive/GenericHiveRecordCursor.java | 2 +- .../src/main/java/io/trino/plugin/hive/util/HiveUtil.java | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/GenericHiveRecordCursor.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/GenericHiveRecordCursor.java index e67c25cd8136..7a9878fb7b7e 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/GenericHiveRecordCursor.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/GenericHiveRecordCursor.java @@ -540,7 +540,7 @@ else if (type instanceof VarcharType || VARBINARY.equals(type)) { else if (type instanceof CharType) { parseStringColumn(column); } - else if (isStructuralType(hiveTypes[column])) { + else if (isStructuralType(type)) { parseObjectColumn(column); } else if (DATE.equals(type)) { diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveUtil.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveUtil.java index 78976db0f36c..eaaa5a6d57fd 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveUtil.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveUtil.java @@ -38,7 +38,6 @@ import io.trino.plugin.hive.metastore.Column; import io.trino.plugin.hive.metastore.SortingColumn; import io.trino.plugin.hive.metastore.Table; -import io.trino.plugin.hive.type.Category; import io.trino.plugin.hive.type.StructTypeInfo; import io.trino.spi.ErrorCodeSupplier; import io.trino.spi.TrinoException; @@ -696,11 +695,6 @@ public static boolean isStructuralType(Type type) return (type instanceof ArrayType) || (type instanceof MapType) || (type instanceof RowType); } - public static boolean isStructuralType(HiveType hiveType) - { - return hiveType.getCategory() == Category.LIST || hiveType.getCategory() == Category.MAP || hiveType.getCategory() == Category.STRUCT || hiveType.getCategory() == Category.UNION; - } - public static boolean booleanPartitionKey(String value, String name) { if (value.equalsIgnoreCase("true")) { From ae8f8640812dde85a74cd2567091a8ee5628c5b4 Mon Sep 17 00:00:00 2001 From: Dain Sundstrom Date: Wed, 19 Apr 2023 15:44:38 -0700 Subject: [PATCH 4/4] Fix HiveUtil IntelliJ warnings --- .../io/trino/plugin/hive/util/HiveUtil.java | 46 +++++++++---------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveUtil.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveUtil.java index eaaa5a6d57fd..59ae442ba775 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveUtil.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveUtil.java @@ -25,7 +25,6 @@ import io.airlift.compress.lzo.LzopCodec; import io.airlift.slice.Slice; import io.airlift.slice.SliceUtf8; -import io.airlift.slice.Slices; import io.trino.hadoop.TextLineLengthLimitExceededException; import io.trino.hive.formats.compression.CompressionKind; import io.trino.orc.OrcWriterOptions; @@ -173,7 +172,6 @@ import static java.nio.charset.StandardCharsets.UTF_8; import static java.util.Locale.ENGLISH; import static java.util.Objects.requireNonNull; -import static java.util.stream.Collectors.joining; import static org.apache.hadoop.hive.serde2.ColumnProjectionUtils.READ_ALL_COLUMNS; import static org.apache.hadoop.hive.serde2.ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR; @@ -181,7 +179,7 @@ public final class HiveUtil { public static final String SPARK_TABLE_PROVIDER_KEY = "spark.sql.sources.provider"; public static final String DELTA_LAKE_PROVIDER = "delta"; - public static final String SPARK_TABLE_BUCKET_NUMBER_KEY = "spark.sql.sources.schema.numBuckets"; + private static final String SPARK_TABLE_BUCKET_NUMBER_KEY = "spark.sql.sources.schema.numBuckets"; public static final String ICEBERG_TABLE_TYPE_NAME = "table_type"; public static final String ICEBERG_TABLE_TYPE_VALUE = "iceberg"; @@ -309,7 +307,7 @@ private static void skipHeader(RecordReader reader, int headerCount } } - public static void setReadColumns(Configuration configuration, List readHiveColumnIndexes) + private static void setReadColumns(Configuration configuration, List readHiveColumnIndexes) { configuration.set(READ_COLUMN_IDS_CONF_STR, Joiner.on(',').join(readHiveColumnIndexes)); configuration.setBoolean(READ_ALL_COLUMNS, false); @@ -317,7 +315,7 @@ public static void setReadColumns(Configuration configuration, List rea private static void configureCompressionCodecs(JobConf jobConf) { - // add Airlift LZO and LZOP to head of codecs list so as to not override existing entries + // add Airlift LZO and LZOP to head of codecs list to not override existing entries List codecs = newArrayList(Splitter.on(",").trimResults().omitEmptyStrings().split(jobConf.get("io.compression.codecs", ""))); if (!codecs.contains(LzoCodec.class.getName())) { codecs.add(0, LzoCodec.class.getName()); @@ -325,7 +323,7 @@ private static void configureCompressionCodecs(JobConf jobConf) if (!codecs.contains(LzopCodec.class.getName())) { codecs.add(0, LzopCodec.class.getName()); } - jobConf.set("io.compression.codecs", codecs.stream().collect(joining(","))); + jobConf.set("io.compression.codecs", String.join(",", codecs)); } public static Optional getCompressionCodec(TextInputFormat inputFormat, Path file) @@ -399,7 +397,7 @@ public static String getInputFormatName(Properties schema) return name; } - public static long parseHiveDate(String value) + private static long parseHiveDate(String value) { LocalDateTime date = HIVE_DATE_PARSER.parseLocalDateTime(value); if (!date.toLocalTime().equals(LocalTime.MIDNIGHT)) { @@ -532,7 +530,7 @@ private static void validate(Deserializer deserializer) } } - public static boolean isHiveNull(byte[] bytes) + private static boolean isHiveNull(byte[] bytes) { return bytes.length == 2 && bytes[0] == '\\' && bytes[1] == 'N'; } @@ -684,7 +682,7 @@ public static NullableValue parsePartitionValue(String partitionName, String val if (isNull) { return NullableValue.asNull(type); } - return NullableValue.of(type, Slices.utf8Slice(value)); + return NullableValue.of(type, utf8Slice(value)); } throw new VerifyException(format("Unhandled type [%s] for partition: %s", type, partitionName)); @@ -695,7 +693,7 @@ public static boolean isStructuralType(Type type) return (type instanceof ArrayType) || (type instanceof MapType) || (type instanceof RowType); } - public static boolean booleanPartitionKey(String value, String name) + private static boolean booleanPartitionKey(String value, String name) { if (value.equalsIgnoreCase("true")) { return true; @@ -706,7 +704,7 @@ public static boolean booleanPartitionKey(String value, String name) throw new TrinoException(HIVE_INVALID_PARTITION_VALUE, format("Invalid partition value '%s' for BOOLEAN partition key: %s", value, name)); } - public static long bigintPartitionKey(String value, String name) + private static long bigintPartitionKey(String value, String name) { try { return parseLong(value); @@ -716,7 +714,7 @@ public static long bigintPartitionKey(String value, String name) } } - public static long integerPartitionKey(String value, String name) + private static long integerPartitionKey(String value, String name) { try { return parseInt(value); @@ -726,7 +724,7 @@ public static long integerPartitionKey(String value, String name) } } - public static long smallintPartitionKey(String value, String name) + private static long smallintPartitionKey(String value, String name) { try { return parseShort(value); @@ -736,7 +734,7 @@ public static long smallintPartitionKey(String value, String name) } } - public static long tinyintPartitionKey(String value, String name) + private static long tinyintPartitionKey(String value, String name) { try { return parseByte(value); @@ -746,7 +744,7 @@ public static long tinyintPartitionKey(String value, String name) } } - public static long floatPartitionKey(String value, String name) + private static long floatPartitionKey(String value, String name) { try { return floatToRawIntBits(parseFloat(value)); @@ -756,7 +754,7 @@ public static long floatPartitionKey(String value, String name) } } - public static double doublePartitionKey(String value, String name) + private static double doublePartitionKey(String value, String name) { try { return parseDouble(value); @@ -766,7 +764,7 @@ public static double doublePartitionKey(String value, String name) } } - public static long datePartitionKey(String value, String name) + private static long datePartitionKey(String value, String name) { try { return parseHiveDate(value); @@ -776,7 +774,7 @@ public static long datePartitionKey(String value, String name) } } - public static long timestampPartitionKey(String value, String name) + private static long timestampPartitionKey(String value, String name) { try { return parseHiveTimestamp(value); @@ -786,12 +784,12 @@ public static long timestampPartitionKey(String value, String name) } } - public static long shortDecimalPartitionKey(String value, DecimalType type, String name) + private static long shortDecimalPartitionKey(String value, DecimalType type, String name) { return decimalPartitionKey(value, type, name).unscaledValue().longValue(); } - public static Int128 longDecimalPartitionKey(String value, DecimalType type, String name) + private static Int128 longDecimalPartitionKey(String value, DecimalType type, String name) { return Int128.valueOf(decimalPartitionKey(value, type, name).unscaledValue()); } @@ -815,9 +813,9 @@ private static BigDecimal decimalPartitionKey(String value, DecimalType type, St } } - public static Slice varcharPartitionKey(String value, String name, Type columnType) + private static Slice varcharPartitionKey(String value, String name, Type columnType) { - Slice partitionKey = Slices.utf8Slice(value); + Slice partitionKey = utf8Slice(value); VarcharType varcharType = (VarcharType) columnType; if (!varcharType.isUnbounded() && SliceUtf8.countCodePoints(partitionKey) > varcharType.getBoundedLength()) { throw new TrinoException(HIVE_INVALID_PARTITION_VALUE, format("Invalid partition value '%s' for %s partition key: %s", value, columnType, name)); @@ -825,9 +823,9 @@ public static Slice varcharPartitionKey(String value, String name, Type columnTy return partitionKey; } - public static Slice charPartitionKey(String value, String name, Type columnType) + private static Slice charPartitionKey(String value, String name, Type columnType) { - Slice partitionKey = trimTrailingSpaces(Slices.utf8Slice(value)); + Slice partitionKey = trimTrailingSpaces(utf8Slice(value)); CharType charType = (CharType) columnType; if (SliceUtf8.countCodePoints(partitionKey) > charType.getLength()) { throw new TrinoException(HIVE_INVALID_PARTITION_VALUE, format("Invalid partition value '%s' for %s partition key: %s", value, columnType, name));