diff --git a/coral-common/src/main/java/com/linkedin/coral/common/TypeConverter.java b/coral-common/src/main/java/com/linkedin/coral/common/TypeConverter.java index 022538eaa..d1dc558af 100644 --- a/coral-common/src/main/java/com/linkedin/coral/common/TypeConverter.java +++ b/coral-common/src/main/java/com/linkedin/coral/common/TypeConverter.java @@ -140,7 +140,6 @@ public static RelDataType convert(StructTypeInfo structType, final RelDataTypeFa // The schema of output Struct conforms to https://github.com/trinodb/trino/pull/3483 // except we adopted "integer" for the type of "tag" field instead of "tinyint" in the Trino patch // for compatibility with other platforms that Iceberg currently doesn't support tinyint type. - // When the field count inside UnionTypeInfo is one, we surface the underlying RelDataType instead. // Note: this is subject to change in the future pending on the discussion in // https://mail-archives.apache.org/mod_mbox/iceberg-dev/202112.mbox/browser diff --git a/coral-schema/src/main/java/com/linkedin/coral/schema/avro/RelToAvroSchemaConverter.java b/coral-schema/src/main/java/com/linkedin/coral/schema/avro/RelToAvroSchemaConverter.java index 01109a372..a7165a779 100644 --- a/coral-schema/src/main/java/com/linkedin/coral/schema/avro/RelToAvroSchemaConverter.java +++ b/coral-schema/src/main/java/com/linkedin/coral/schema/avro/RelToAvroSchemaConverter.java @@ -1,5 +1,5 @@ /** - * Copyright 2019-2023 LinkedIn Corporation. All rights reserved. + * Copyright 2019-2024 LinkedIn Corporation. All rights reserved. * Licensed under the BSD-2 Clause license. * See LICENSE in the project root for license information. */ @@ -495,48 +495,58 @@ public RexNode visitRangeRef(RexRangeRef rexRangeRef) { public RexNode visitFieldAccess(RexFieldAccess rexFieldAccess) { RexNode referenceExpr = rexFieldAccess.getReferenceExpr(); - if (referenceExpr instanceof RexCall - && ((RexCall) referenceExpr).getOperator() instanceof SqlUserDefinedFunction) { - String oldFieldName = rexFieldAccess.getField().getName(); - String suggestNewFieldName = suggestedFieldNames.poll(); - String newFieldName = SchemaUtilities.getFieldName(oldFieldName, suggestNewFieldName); - - RelDataType fieldType = rexFieldAccess.getType(); - boolean isNullable = SchemaUtilities.isFieldNullable((RexCall) referenceExpr, inputSchema); - // TODO: add field documentation - SchemaUtilities.appendField(newFieldName, fieldType, null, fieldAssembler, isNullable); - } else { - Deque innerRecordNames = new LinkedList<>(); - while (!(referenceExpr instanceof RexInputRef)) { - if (referenceExpr instanceof RexCall - && ((RexCall) referenceExpr).getOperator().getName().equalsIgnoreCase("ITEM")) { - // While selecting `int_field` from `array_col:array>` using `array_col[x].int_field`, - // `rexFieldAccess` is like `ITEM($1, 1).int_field`, we need to set `referenceExpr` to be the first operand (`$1`) of `ITEM` function - referenceExpr = ((RexCall) referenceExpr).getOperands().get(0); - } else if (referenceExpr instanceof RexFieldAccess) { - // While selecting `int_field` from `struct_col:struct>` using `struct_col.inner_struct_col.int_field`, - // `rexFieldAccess` is like `$3.inner_struct_col.int_field`, we need to set `referenceExpr` to be the expr (`$3`) of itself. - // Besides, we need to store the field name (`inner_struct_col`) in `fieldNames` so that we can retrieve the correct inner struct from `topSchema` afterwards - innerRecordNames.push(((RexFieldAccess) referenceExpr).getField().getName()); - referenceExpr = ((RexFieldAccess) referenceExpr).getReferenceExpr(); - } else { - return super.visitFieldAccess(rexFieldAccess); - } + Deque innerRecordNames = new LinkedList<>(); + while (!(referenceExpr instanceof RexInputRef)) { + if (referenceExpr instanceof RexCall + && ((RexCall) referenceExpr).getOperator().getName().equalsIgnoreCase("ITEM")) { + // While selecting `int_field` from `array_col:array>` using `array_col[x].int_field`, + // `rexFieldAccess` is like `ITEM($1, 1).int_field`, we need to set `referenceExpr` to be the first operand (`$1`) of `ITEM` function + referenceExpr = ((RexCall) referenceExpr).getOperands().get(0); + } else if (referenceExpr instanceof RexCall + && ((RexCall) referenceExpr).getOperator() instanceof SqlUserDefinedFunction) { + // UDFs calls could potentially be doubly (or more) field-referenced, for example, `extract_union(baz).single.tag_0` + // where baz is a struct containing a uniontype field. In this case, we simply need to use derived type of the entire + // call. Note that this also takes care of the simple one layer field reference on a UDF call. + handleUDFFieldAccess(rexFieldAccess, (RexCall) referenceExpr); + return rexFieldAccess; + } else if (referenceExpr instanceof RexFieldAccess) { + // While selecting `int_field` from `struct_col:struct>` using `struct_col.inner_struct_col.int_field`, + // `rexFieldAccess` is like `$3.inner_struct_col.int_field`, we need to set `referenceExpr` to be the expr (`$3`) of itself. + // Besides, we need to store the field name (`inner_struct_col`) in `fieldNames` so that we can retrieve the correct inner struct from `topSchema` afterwards + innerRecordNames.push(((RexFieldAccess) referenceExpr).getField().getName()); + referenceExpr = ((RexFieldAccess) referenceExpr).getReferenceExpr(); + } else { + return super.visitFieldAccess(rexFieldAccess); } - - String oldFieldName = rexFieldAccess.getField().getName(); - String suggestNewFieldName = suggestedFieldNames.poll(); - String newFieldName = SchemaUtilities.getFieldName(oldFieldName, suggestNewFieldName); - Schema topSchema = inputSchema.getFields().get(((RexInputRef) referenceExpr).getIndex()).schema(); - - Schema.Field accessedField = getFieldFromTopSchema(topSchema, oldFieldName, innerRecordNames); - assert accessedField != null; - SchemaUtilities.appendField(newFieldName, accessedField, fieldAssembler); } + handleFieldAccess(rexFieldAccess, (RexInputRef) referenceExpr, innerRecordNames); return rexFieldAccess; } + private void handleFieldAccess(RexFieldAccess rexFieldAccess, RexInputRef referenceExpr, + Deque innerRecordNames) { + String oldFieldName = rexFieldAccess.getField().getName(); + String suggestNewFieldName = suggestedFieldNames.poll(); + String newFieldName = SchemaUtilities.getFieldName(oldFieldName, suggestNewFieldName); + + Schema topSchema = inputSchema.getFields().get(referenceExpr.getIndex()).schema(); + Schema.Field accessedField = getFieldFromTopSchema(topSchema, oldFieldName, innerRecordNames); + assert accessedField != null; + SchemaUtilities.appendField(newFieldName, accessedField, fieldAssembler); + } + + private void handleUDFFieldAccess(RexFieldAccess rexFieldAccess, RexCall referenceExpr) { + String oldFieldName = rexFieldAccess.getField().getName(); + String suggestNewFieldName = suggestedFieldNames.poll(); + String newFieldName = SchemaUtilities.getFieldName(oldFieldName, suggestNewFieldName); + + RelDataType fieldType = rexFieldAccess.getType(); + boolean isNullable = SchemaUtilities.isFieldNullable(referenceExpr, inputSchema); + // TODO: add field documentation + SchemaUtilities.appendField(newFieldName, fieldType, null, fieldAssembler, isNullable); + } + @Override public RexNode visitSubQuery(RexSubQuery rexSubQuery) { // TODO: implement this method diff --git a/coral-schema/src/test/java/com/linkedin/coral/schema/avro/TestUtils.java b/coral-schema/src/test/java/com/linkedin/coral/schema/avro/TestUtils.java index 56642a828..1b0f2bc3a 100644 --- a/coral-schema/src/test/java/com/linkedin/coral/schema/avro/TestUtils.java +++ b/coral-schema/src/test/java/com/linkedin/coral/schema/avro/TestUtils.java @@ -144,6 +144,9 @@ private static void initializeTables() { executeQuery("DROP TABLE IF EXISTS basedecimal"); executeQuery("CREATE TABLE IF NOT EXISTS basedecimal(decimal_col decimal(2,1))"); + + executeQuery( + "CREATE TABLE IF NOT EXISTS single_uniontypes(single uniontype, struct_col struct>)"); } private static void initializeUdfs() { diff --git a/coral-schema/src/test/java/com/linkedin/coral/schema/avro/ViewToAvroSchemaConverterTests.java b/coral-schema/src/test/java/com/linkedin/coral/schema/avro/ViewToAvroSchemaConverterTests.java index 4bf8130c5..d13046229 100644 --- a/coral-schema/src/test/java/com/linkedin/coral/schema/avro/ViewToAvroSchemaConverterTests.java +++ b/coral-schema/src/test/java/com/linkedin/coral/schema/avro/ViewToAvroSchemaConverterTests.java @@ -1,5 +1,5 @@ /** - * Copyright 2019-2023 LinkedIn Corporation. All rights reserved. + * Copyright 2019-2024 LinkedIn Corporation. All rights reserved. * Licensed under the BSD-2 Clause license. * See LICENSE in the project root for license information. */ @@ -560,6 +560,17 @@ public void testNullabliltyExtractUnionUDF() { Assert.assertEquals(actual.toString(true), TestUtils.loadSchema("testNullabilityExtractUnionUDF-expected.avsc")); } + @Test + public void testSingleUnionFieldReference() { + String sql = + "select extract_union(struct_col).single.tag_0 as single_in_struct, extract_union(single).tag_0 as single from single_uniontypes"; + ViewToAvroSchemaConverter viewToAvroSchemaConverter = ViewToAvroSchemaConverter.create(hiveMetastoreClient); + + Schema actual = viewToAvroSchemaConverter.toAvroSchema(sql); + + Assert.assertEquals(actual.toString(true), TestUtils.loadSchema("testSingleUnionFieldReference-expected.avsc")); + } + @Test(enabled = false) public void testRenameToLowercase() { String viewSql = "CREATE VIEW v AS " + "SELECT bc.Id AS id, bc.Array_Col AS array_col " + "FROM basecomplex bc " diff --git a/coral-schema/src/test/resources/testSingleUnionFieldReference-expected.avsc b/coral-schema/src/test/resources/testSingleUnionFieldReference-expected.avsc new file mode 100644 index 000000000..d68d1a248 --- /dev/null +++ b/coral-schema/src/test/resources/testSingleUnionFieldReference-expected.avsc @@ -0,0 +1,12 @@ +{ + "type" : "record", + "name" : "SingleUniontypes", + "namespace" : "default.single_uniontypes", + "fields" : [ { + "name" : "single_in_struct", + "type" : [ "null", "string" ] + }, { + "name" : "single", + "type" : [ "null", "string" ] + } ] +} \ No newline at end of file