Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for reading UniForm with Iceberg in Delta Lake #22311

Merged
merged 1 commit into from
Jun 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ private DeltaLakeSchemaSupport() {}
private static final String CHECK_CONSTRAINTS_FEATURE_NAME = "checkConstraints";
private static final String COLUMN_MAPPING_FEATURE_NAME = "columnMapping";
private static final String DELETION_VECTORS_FEATURE_NAME = "deletionVectors";
private static final String ICEBERG_COMPATIBILITY_V1_FEATURE_NAME = "icebergCompatV1";
private static final String ICEBERG_COMPATIBILITY_V2_FEATURE_NAME = "icebergCompatV2";
private static final String IDENTITY_COLUMNS_FEATURE_NAME = "identityColumns";
private static final String INVARIANTS_FEATURE_NAME = "invariants";
public static final String TIMESTAMP_NTZ_FEATURE_NAME = "timestampNtz";
Expand Down Expand Up @@ -184,6 +186,12 @@ public static boolean isDeletionVectorEnabled(MetadataEntry metadataEntry, Proto
public static ColumnMappingMode getColumnMappingMode(MetadataEntry metadata, ProtocolEntry protocolEntry)
{
if (protocolEntry.supportsReaderFeatures() || protocolEntry.supportsWriterFeatures()) {
if (protocolEntry.writerFeaturesContains(ICEBERG_COMPATIBILITY_V1_FEATURE_NAME) || protocolEntry.writerFeaturesContains(ICEBERG_COMPATIBILITY_V2_FEATURE_NAME)) {
String columnMappingMode = metadata.getConfiguration().get(COLUMN_MAPPING_MODE_CONFIGURATION_KEY);
ebyhr marked this conversation as resolved.
Show resolved Hide resolved
checkArgument(columnMappingMode != null && columnMappingMode.equals("name"), "Column mapping mode must be 'name' for Iceberg compatibility: %s", columnMappingMode);
return ColumnMappingMode.NAME;
}

boolean supportsColumnMappingReader = protocolEntry.readerFeaturesContains(COLUMN_MAPPING_FEATURE_NAME);
boolean supportsColumnMappingWriter = protocolEntry.writerFeaturesContains(COLUMN_MAPPING_FEATURE_NAME);
checkArgument(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,9 @@ public class TestDeltaLakeBasic
new ResourceTable("deletion_vectors", "databricks122/deletion_vectors"),
new ResourceTable("liquid_clustering", "deltalake/liquid_clustering"),
new ResourceTable("timestamp_ntz", "databricks131/timestamp_ntz"),
new ResourceTable("timestamp_ntz_partition", "databricks131/timestamp_ntz_partition"));
new ResourceTable("timestamp_ntz_partition", "databricks131/timestamp_ntz_partition"),
new ResourceTable("uniform_iceberg_v1", "databricks133/uniform_iceberg_v1"),
new ResourceTable("uniform_iceberg_v2", "databricks143/uniform_iceberg_v2"));

// The col-{uuid} pattern for delta.columnMapping.physicalName
private static final Pattern PHYSICAL_COLUMN_NAME_PATTERN = Pattern.compile("^col-[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$");
Expand Down Expand Up @@ -959,6 +961,26 @@ public void testLiquidClustering()
assertQueryFails("INSERT INTO liquid_clustering VALUES ('test 3', 2024, 3)", "Unsupported writer features: .*");
}

/**
 * Reads the {@code uniform_iceberg_v1} fixture table and checks that an INSERT into it
 * is rejected because of the {@code icebergCompatV1} writer feature.
 *
 * @see databricks133.uniform_iceberg_v1
 */
@Test
public void testUniFormIcebergV1()
{
    // Reading is expected to return the single row stored in the fixture
    String selectAll = "SELECT * FROM uniform_iceberg_v1";
    String expectedRows = "VALUES (1, 'test data')";
    assertQuery(selectAll, expectedRows);

    // Writing is expected to fail; \Q quotes the rest of the pattern so the brackets match literally
    String insertRow = "INSERT INTO uniform_iceberg_v1 VALUES (2, 'new data')";
    String expectedFailure = "\\QUnsupported writer features: [icebergCompatV1]";
    assertQueryFails(insertRow, expectedFailure);
}

/**
 * Reads the {@code uniform_iceberg_v2} fixture table and checks that an INSERT into it
 * is rejected because of the {@code icebergCompatV2} writer feature.
 *
 * @see databricks143.uniform_iceberg_v2
 */
@Test
public void testUniFormIcebergV2()
{
    // Reading is expected to return the single row stored in the fixture
    String selectAll = "SELECT * FROM uniform_iceberg_v2";
    String expectedRows = "VALUES (1, 'test data')";
    assertQuery(selectAll, expectedRows);

    // Writing is expected to fail; \Q quotes the rest of the pattern so the brackets match literally
    String insertRow = "INSERT INTO uniform_iceberg_v2 VALUES (2, 'new data')";
    String expectedFailure = "\\QUnsupported writer features: [icebergCompatV2]";
    assertQueryFails(insertRow, expectedFailure);
}

@Test
public void testCorruptedManagedTableLocation()
throws Exception
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
Data generated using Databricks 13.3 with Unity Catalog.
The `delta.universalFormat.enabledFormats` table property requires Unity Catalog.

```sql
CREATE TABLE main.default.test_uniform_iceberg_v1
(a integer, b string)
USING DELTA
TBLPROPERTIES (
'delta.enableIcebergCompatV1' = 'true',
'delta.universalFormat.enabledFormats' = 'iceberg'
);

INSERT INTO main.default.test_uniform_iceberg_v1 VALUES (1, 'test data');
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"commitInfo":{"timestamp":1717737493887,"userId":"7853186923043731","userName":"[email protected]","operation":"CREATE TABLE","operationParameters":{"partitionBy":"[]","description":null,"isManaged":"true","properties":"{\"delta.enableIcebergCompatV1\":\"true\",\"delta.universalFormat.enabledFormats\":\"iceberg\"}","statsOnLoad":false},"notebook":{"notebookId":"1841155838656679"},"clusterId":"0213-045432-cqrij0nb","isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{},"tags":{"restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/13.3.x-scala2.12","txnId":"a48af02b-55e8-411b-b278-876b33ea6270"}}
{"metaData":{"id":"2543f7c2-17c5-421d-b270-773af5654cec","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"a\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":1,\"delta.columnMapping.physicalName\":\"col-7cccf654-9a0b-463e-add7-da6899c1c97b\"}},{\"name\":\"b\",\"type\":\"string\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":2,\"delta.columnMapping.physicalName\":\"col-f272ebe6-6618-4a5c-8dd6-0793d2204587\"}}]}","partitionColumns":[],"configuration":{"delta.enableIcebergCompatV1":"true","delta.universalFormat.enabledFormats":"iceberg","delta.columnMapping.mode":"name","delta.columnMapping.maxColumnId":"2"},"createdTime":1717737492488}}
{"protocol":{"minReaderVersion":2,"minWriterVersion":7,"writerFeatures":["columnMapping","icebergCompatV1"]}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"commitInfo":{"timestamp":1717737560500,"userId":"7853186923043731","userName":"[email protected]","operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"notebook":{"notebookId":"1841155838656679"},"clusterId":"0213-045432-cqrij0nb","readVersion":0,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"1320"},"tags":{"restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/13.3.x-scala2.12","txnId":"18b114d7-eaa8-4279-87e3-9136a3ba510d"}}
{"add":{"path":"vL/part-00000-a3b8b550-b0f5-4585-beef-fc82aea5d233-c000.snappy.parquet","partitionValues":{},"size":1320,"modificationTime":1717737559000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"col-7cccf654-9a0b-463e-add7-da6899c1c97b\":1,\"col-f272ebe6-6618-4a5c-8dd6-0793d2204587\":\"test data\"},\"maxValues\":{\"col-7cccf654-9a0b-463e-add7-da6899c1c97b\":1,\"col-f272ebe6-6618-4a5c-8dd6-0793d2204587\":\"test data\"},\"nullCount\":{\"col-7cccf654-9a0b-463e-add7-da6899c1c97b\":0,\"col-f272ebe6-6618-4a5c-8dd6-0793d2204587\":0}}","tags":{"INSERTION_TIME":"1717737559000000","MIN_INSERTION_TIME":"1717737559000000","MAX_INSERTION_TIME":"1717737559000000","OPTIMIZE_TARGET_SIZE":"67108864"}}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
{
"format-version" : 1,
"table-uuid" : "0eb73468-144c-44fb-911c-8cee8bf441aa",
"location" : "s3://trino-ci-test/databricks-unity/af968baf-af29-4111-8bd7-4a67d3a3a39f/tables/867ed8f6-384c-42b7-a589-76551ed2dea1",
"last-updated-ms" : 1717737524993,
"last-column-id" : 2,
"schema" : {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "a",
"required" : false,
"type" : "int"
}, {
"id" : 2,
"name" : "b",
"required" : false,
"type" : "string"
} ]
},
"current-schema-id" : 0,
"schemas" : [ {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "a",
"required" : false,
"type" : "int"
}, {
"id" : 2,
"name" : "b",
"required" : false,
"type" : "string"
} ]
} ],
"partition-spec" : [ ],
"default-spec-id" : 0,
"partition-specs" : [ {
"spec-id" : 0,
"fields" : [ ]
} ],
"last-partition-id" : 999,
"default-sort-order-id" : 0,
"sort-orders" : [ {
"order-id" : 0,
"fields" : [ ]
} ],
"properties" : {
"schema.name-mapping.default" : "[ {\n \"field-id\" : 1,\n \"names\" : [ \"a\" ]\n}, {\n \"field-id\" : 2,\n \"names\" : [ \"b\" ]\n} ]",
"delta-timestamp" : "1717737495000",
"delta-version" : "-1"
},
"current-snapshot-id" : -1,
"refs" : { },
"snapshots" : [ ],
"statistics" : [ ],
"snapshot-log" : [ ],
"metadata-log" : [ ]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
{
"format-version" : 1,
"table-uuid" : "0eb73468-144c-44fb-911c-8cee8bf441aa",
"location" : "s3://trino-ci-test/databricks-unity/af968baf-af29-4111-8bd7-4a67d3a3a39f/tables/867ed8f6-384c-42b7-a589-76551ed2dea1",
"last-updated-ms" : 1717737525809,
"last-column-id" : 2,
"schema" : {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "a",
"required" : false,
"type" : "int"
}, {
"id" : 2,
"name" : "b",
"required" : false,
"type" : "string"
} ]
},
"current-schema-id" : 0,
"schemas" : [ {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "a",
"required" : false,
"type" : "int"
}, {
"id" : 2,
"name" : "b",
"required" : false,
"type" : "string"
} ]
} ],
"partition-spec" : [ ],
"default-spec-id" : 0,
"partition-specs" : [ {
"spec-id" : 0,
"fields" : [ ]
} ],
"last-partition-id" : 999,
"default-sort-order-id" : 0,
"sort-orders" : [ {
"order-id" : 0,
"fields" : [ ]
} ],
"properties" : {
"schema.name-mapping.default" : "[ {\n \"field-id\" : 1,\n \"names\" : [ \"a\" ]\n}, {\n \"field-id\" : 2,\n \"names\" : [ \"b\" ]\n} ]",
"delta-timestamp" : "1717737495000",
"delta-version" : "0"
},
"current-snapshot-id" : -1,
"refs" : { },
"snapshots" : [ ],
"statistics" : [ ],
"snapshot-log" : [ ],
"metadata-log" : [ {
"timestamp-ms" : 1717737524993,
"metadata-file" : "s3://trino-ci-test/databricks-unity/af968baf-af29-4111-8bd7-4a67d3a3a39f/tables/867ed8f6-384c-42b7-a589-76551ed2dea1/metadata/00000-a6f70674-5338-4b57-b07e-cd4b3253fa44.metadata.json"
} ]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
{
"format-version" : 1,
"table-uuid" : "0eb73468-144c-44fb-911c-8cee8bf441aa",
"location" : "s3://trino-ci-test/databricks-unity/af968baf-af29-4111-8bd7-4a67d3a3a39f/tables/867ed8f6-384c-42b7-a589-76551ed2dea1",
"last-updated-ms" : 1717737567454,
"last-column-id" : 2,
"schema" : {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "a",
"required" : false,
"type" : "int"
}, {
"id" : 2,
"name" : "b",
"required" : false,
"type" : "string"
} ]
},
"current-schema-id" : 0,
"schemas" : [ {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "a",
"required" : false,
"type" : "int"
}, {
"id" : 2,
"name" : "b",
"required" : false,
"type" : "string"
} ]
} ],
"partition-spec" : [ ],
"default-spec-id" : 0,
"partition-specs" : [ {
"spec-id" : 0,
"fields" : [ ]
} ],
"last-partition-id" : 999,
"default-sort-order-id" : 0,
"sort-orders" : [ {
"order-id" : 0,
"fields" : [ ]
} ],
"properties" : {
"schema.name-mapping.default" : "[ {\n \"field-id\" : 1,\n \"names\" : [ \"a\" ]\n}, {\n \"field-id\" : 2,\n \"names\" : [ \"b\" ]\n} ]",
"delta-timestamp" : "1717737561000",
"delta-version" : "1"
},
"current-snapshot-id" : 3448407241734536873,
"refs" : {
"main" : {
"snapshot-id" : 3448407241734536873,
"type" : "branch"
}
},
"snapshots" : [ {
"snapshot-id" : 3448407241734536873,
"timestamp-ms" : 1717737567285,
"summary" : {
"operation" : "append",
"added-data-files" : "1",
"added-records" : "1",
"added-files-size" : "1320",
"changed-partition-count" : "1",
"total-records" : "1",
"total-files-size" : "1320",
"total-data-files" : "1",
"total-delete-files" : "0",
"total-position-deletes" : "0",
"total-equality-deletes" : "0"
},
"manifest-list" : "s3://trino-ci-test/databricks-unity/af968baf-af29-4111-8bd7-4a67d3a3a39f/tables/867ed8f6-384c-42b7-a589-76551ed2dea1/metadata/snap-3448407241734536873-1-2dc6572c-92dd-4786-80d0-324b424124b3.avro",
"schema-id" : 0
} ],
"statistics" : [ ],
"snapshot-log" : [ {
"timestamp-ms" : 1717737567285,
"snapshot-id" : 3448407241734536873
} ],
"metadata-log" : [ {
"timestamp-ms" : 1717737524993,
"metadata-file" : "s3://trino-ci-test/databricks-unity/af968baf-af29-4111-8bd7-4a67d3a3a39f/tables/867ed8f6-384c-42b7-a589-76551ed2dea1/metadata/00000-a6f70674-5338-4b57-b07e-cd4b3253fa44.metadata.json"
}, {
"timestamp-ms" : 1717737525809,
"metadata-file" : "s3://trino-ci-test/databricks-unity/af968baf-af29-4111-8bd7-4a67d3a3a39f/tables/867ed8f6-384c-42b7-a589-76551ed2dea1/metadata/00001-15a4900d-0b87-4ca8-93cf-ebde86de39aa.metadata.json"
} ]
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
Data generated using Databricks 14.3 with Unity Catalog.
The `delta.universalFormat.enabledFormats` table property requires Unity Catalog.

```sql
CREATE TABLE main.default.test_uniform_iceberg_v2
(a integer, b string)
USING DELTA
TBLPROPERTIES (
'delta.enableIcebergCompatV2' = 'true',
'delta.universalFormat.enabledFormats' = 'iceberg'
);

INSERT INTO main.default.test_uniform_iceberg_v2 VALUES (1, 'test data');
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"commitInfo":{"timestamp":1717738469512,"userId":"7853186923043731","userName":"[email protected]","operation":"CREATE TABLE","operationParameters":{"partitionBy":"[]","description":null,"isManaged":"true","properties":"{\"delta.enableIcebergCompatV2\":\"true\",\"delta.universalFormat.enabledFormats\":\"iceberg\",\"delta.columnMapping.mode\":\"name\",\"delta.columnMapping.maxColumnId\":\"2\"}","statsOnLoad":false},"notebook":{"notebookId":"1841155838656679"},"clusterId":"0607-052725-dyf4xqms","isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{},"tags":{"restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/14.3.x-scala2.12","txnId":"0220e873-19b8-42cf-be60-c903515c9ca8"}}
{"metaData":{"id":"ba5c6d1b-f968-4204-bde0-f9ce04c5165b","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"a\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":1,\"delta.columnMapping.nested.ids\":{},\"delta.columnMapping.physicalName\":\"col-5d3fdc39-eb9a-484c-a266-50b74243b351\"}},{\"name\":\"b\",\"type\":\"string\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":2,\"delta.columnMapping.nested.ids\":{},\"delta.columnMapping.physicalName\":\"col-f0fe15be-af45-4422-8cb1-4ba97bd1af89\"}}]}","partitionColumns":[],"configuration":{"delta.enableIcebergCompatV2":"true","delta.universalFormat.enabledFormats":"iceberg","delta.columnMapping.mode":"name","delta.columnMapping.maxColumnId":"2"},"createdTime":1717738467564}}
{"protocol":{"minReaderVersion":2,"minWriterVersion":7,"writerFeatures":["columnMapping","icebergCompatV2"]}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"commitInfo":{"timestamp":1717738519331,"userId":"7853186923043731","userName":"[email protected]","operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"notebook":{"notebookId":"1841155838656679"},"clusterId":"0607-052725-dyf4xqms","readVersion":0,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"1360"},"tags":{"restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/14.3.x-scala2.12","txnId":"69ff1ee1-d2b0-4d51-8bbd-38290b97b55d"}}
{"add":{"path":"OZ/part-00000-01da98d5-5186-49d7-bd68-477cd393b4d4-c000.zstd.parquet","partitionValues":{},"size":1360,"modificationTime":1717738518000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"col-5d3fdc39-eb9a-484c-a266-50b74243b351\":1,\"col-f0fe15be-af45-4422-8cb1-4ba97bd1af89\":\"test data\"},\"maxValues\":{\"col-5d3fdc39-eb9a-484c-a266-50b74243b351\":1,\"col-f0fe15be-af45-4422-8cb1-4ba97bd1af89\":\"test data\"},\"nullCount\":{\"col-5d3fdc39-eb9a-484c-a266-50b74243b351\":0,\"col-f0fe15be-af45-4422-8cb1-4ba97bd1af89\":0}}","tags":{"MAX_INSERTION_TIME":"1717738518000000","INSERTION_TIME":"1717738518000000","ICEBERG_COMPAT_VERSION":"2","MIN_INSERTION_TIME":"1717738518000000","OPTIMIZE_TARGET_SIZE":"67108864"}}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
{
"format-version" : 1,
"table-uuid" : "3dcdec09-c735-4cf3-b5d1-9bfa9786ebaa",
"location" : "s3://trino-ci-test/databricks-unity/af968baf-af29-4111-8bd7-4a67d3a3a39f/tables/1cf271cf-2321-4fa7-8169-b611445faf5f",
"last-updated-ms" : 1717738503539,
"last-column-id" : 2,
"schema" : {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "a",
"required" : false,
"type" : "int"
}, {
"id" : 2,
"name" : "b",
"required" : false,
"type" : "string"
} ]
},
"current-schema-id" : 0,
"schemas" : [ {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "a",
"required" : false,
"type" : "int"
}, {
"id" : 2,
"name" : "b",
"required" : false,
"type" : "string"
} ]
} ],
"partition-spec" : [ ],
"default-spec-id" : 0,
"partition-specs" : [ {
"spec-id" : 0,
"fields" : [ ]
} ],
"last-partition-id" : 999,
"default-sort-order-id" : 0,
"sort-orders" : [ {
"order-id" : 0,
"fields" : [ ]
} ],
"properties" : {
"schema.name-mapping.default" : "[ {\n \"field-id\" : 1,\n \"names\" : [ \"a\" ]\n}, {\n \"field-id\" : 2,\n \"names\" : [ \"b\" ]\n} ]",
"delta-timestamp" : "1717738471000",
"delta-version" : "-1"
},
"current-snapshot-id" : -1,
"refs" : { },
"snapshots" : [ ],
"statistics" : [ ],
"snapshot-log" : [ ],
"metadata-log" : [ ]
}
Loading
Loading