Skip to content

Commit

Permalink
Add support for reading UniForm with Iceberg in Delta Lake
Browse files Browse the repository at this point in the history
  • Loading branch information
ebyhr committed Jun 9, 2024
1 parent f198b32 commit e14ae4d
Show file tree
Hide file tree
Showing 20 changed files with 505 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ private DeltaLakeSchemaSupport() {}
private static final String CHECK_CONSTRAINTS_FEATURE_NAME = "checkConstraints";
private static final String COLUMN_MAPPING_FEATURE_NAME = "columnMapping";
private static final String DELETION_VECTORS_FEATURE_NAME = "deletionVectors";
// Iceberg compatibility (UniForm) features; getColumnMappingMode looks these up among the protocol's writer features
private static final String ICEBERG_COMPATIBILITY_V1_FEATURE_NAME = "icebergCompatV1";
private static final String ICEBERG_COMPATIBILITY_V2_FEATURE_NAME = "icebergCompatV2";
private static final String IDENTITY_COLUMNS_FEATURE_NAME = "identityColumns";
private static final String INVARIANTS_FEATURE_NAME = "invariants";
public static final String TIMESTAMP_NTZ_FEATURE_NAME = "timestampNtz";
Expand Down Expand Up @@ -184,6 +186,12 @@ public static boolean isDeletionVectorEnabled(MetadataEntry metadataEntry, Proto
public static ColumnMappingMode getColumnMappingMode(MetadataEntry metadata, ProtocolEntry protocolEntry)
{
if (protocolEntry.supportsReaderFeatures() || protocolEntry.supportsWriterFeatures()) {
if (protocolEntry.writerFeaturesContains(ICEBERG_COMPATIBILITY_V1_FEATURE_NAME) || protocolEntry.writerFeaturesContains(ICEBERG_COMPATIBILITY_V2_FEATURE_NAME)) {
String columnMappingMode = metadata.getConfiguration().get(COLUMN_MAPPING_MODE_CONFIGURATION_KEY);
checkArgument(columnMappingMode != null && columnMappingMode.equals("name"), "Column mapping mode must be 'name' for Iceberg compatibility: %s", columnMappingMode);
return ColumnMappingMode.NAME;
}

boolean supportsColumnMappingReader = protocolEntry.readerFeaturesContains(COLUMN_MAPPING_FEATURE_NAME);
boolean supportsColumnMappingWriter = protocolEntry.writerFeaturesContains(COLUMN_MAPPING_FEATURE_NAME);
checkArgument(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,9 @@ public class TestDeltaLakeBasic
new ResourceTable("deletion_vectors", "databricks122/deletion_vectors"),
new ResourceTable("liquid_clustering", "deltalake/liquid_clustering"),
new ResourceTable("timestamp_ntz", "databricks131/timestamp_ntz"),
new ResourceTable("timestamp_ntz_partition", "databricks131/timestamp_ntz_partition"));
new ResourceTable("timestamp_ntz_partition", "databricks131/timestamp_ntz_partition"),
new ResourceTable("uniform_iceberg_v1", "databricks133/uniform_iceberg_v1"),
new ResourceTable("uniform_iceberg_v2", "databricks143/uniform_iceberg_v2"));

// The col-{uuid} pattern for delta.columnMapping.physicalName
private static final Pattern PHYSICAL_COLUMN_NAME_PATTERN = Pattern.compile("^col-[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$");
Expand Down Expand Up @@ -959,6 +961,26 @@ public void testLiquidClustering()
assertQueryFails("INSERT INTO liquid_clustering VALUES ('test 3', 2024, 3)", "Unsupported writer features: .*");
}

/**
 * Checks that the {@code uniform_iceberg_v1} resource table can be queried,
 * and that inserting into it fails because the {@code icebergCompatV1}
 * writer feature is reported as unsupported.
 *
 * @see databricks133.uniform_iceberg_v1
 */
@Test
public void testUniFormIcebergV1()
{
    String expectedRows = "VALUES (1, 'test data')";
    // \Q without a closing \E quotes the rest of the pattern, so the brackets are matched literally
    String expectedFailure = "\\QUnsupported writer features: [icebergCompatV1]";

    // Reading must return the single row present in the fixture
    assertQuery("SELECT * FROM uniform_iceberg_v1", expectedRows);

    // Writing must be rejected
    assertQueryFails("INSERT INTO uniform_iceberg_v1 VALUES (2, 'new data')", expectedFailure);
}

/**
 * Checks that the {@code uniform_iceberg_v2} resource table can be queried,
 * and that inserting into it fails because the {@code icebergCompatV2}
 * writer feature is reported as unsupported.
 *
 * @see databricks143.uniform_iceberg_v2
 */
@Test
public void testUniFormIcebergV2()
{
    String expectedRows = "VALUES (1, 'test data')";
    // \Q without a closing \E quotes the rest of the pattern, so the brackets are matched literally
    String expectedFailure = "\\QUnsupported writer features: [icebergCompatV2]";

    // Reading must return the single row present in the fixture
    assertQuery("SELECT * FROM uniform_iceberg_v2", expectedRows);

    // Writing must be rejected
    assertQueryFails("INSERT INTO uniform_iceberg_v2 VALUES (2, 'new data')", expectedFailure);
}

@Test
public void testCorruptedManagedTableLocation()
throws Exception
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
Data generated using Databricks Runtime 13.3 with Unity Catalog.
The `delta.universalFormat.enabledFormats` table property requires Unity Catalog.

```sql
CREATE TABLE main.default.test_uniform_iceberg_v1
(a integer, b string)
USING DELTA
TBLPROPERTIES (
'delta.enableIcebergCompatV1' = 'true',
'delta.universalFormat.enabledFormats' = 'iceberg'
);

INSERT INTO main.default.test_uniform_iceberg_v1 VALUES (1, 'test data');
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"commitInfo":{"timestamp":1717737493887,"userId":"7853186923043731","userName":"[email protected]","operation":"CREATE TABLE","operationParameters":{"partitionBy":"[]","description":null,"isManaged":"true","properties":"{\"delta.enableIcebergCompatV1\":\"true\",\"delta.universalFormat.enabledFormats\":\"iceberg\"}","statsOnLoad":false},"notebook":{"notebookId":"1841155838656679"},"clusterId":"0213-045432-cqrij0nb","isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{},"tags":{"restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/13.3.x-scala2.12","txnId":"a48af02b-55e8-411b-b278-876b33ea6270"}}
{"metaData":{"id":"2543f7c2-17c5-421d-b270-773af5654cec","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"a\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":1,\"delta.columnMapping.physicalName\":\"col-7cccf654-9a0b-463e-add7-da6899c1c97b\"}},{\"name\":\"b\",\"type\":\"string\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":2,\"delta.columnMapping.physicalName\":\"col-f272ebe6-6618-4a5c-8dd6-0793d2204587\"}}]}","partitionColumns":[],"configuration":{"delta.enableIcebergCompatV1":"true","delta.universalFormat.enabledFormats":"iceberg","delta.columnMapping.mode":"name","delta.columnMapping.maxColumnId":"2"},"createdTime":1717737492488}}
{"protocol":{"minReaderVersion":2,"minWriterVersion":7,"writerFeatures":["columnMapping","icebergCompatV1"]}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"commitInfo":{"timestamp":1717737560500,"userId":"7853186923043731","userName":"[email protected]","operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"notebook":{"notebookId":"1841155838656679"},"clusterId":"0213-045432-cqrij0nb","readVersion":0,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"1320"},"tags":{"restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/13.3.x-scala2.12","txnId":"18b114d7-eaa8-4279-87e3-9136a3ba510d"}}
{"add":{"path":"vL/part-00000-a3b8b550-b0f5-4585-beef-fc82aea5d233-c000.snappy.parquet","partitionValues":{},"size":1320,"modificationTime":1717737559000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"col-7cccf654-9a0b-463e-add7-da6899c1c97b\":1,\"col-f272ebe6-6618-4a5c-8dd6-0793d2204587\":\"test data\"},\"maxValues\":{\"col-7cccf654-9a0b-463e-add7-da6899c1c97b\":1,\"col-f272ebe6-6618-4a5c-8dd6-0793d2204587\":\"test data\"},\"nullCount\":{\"col-7cccf654-9a0b-463e-add7-da6899c1c97b\":0,\"col-f272ebe6-6618-4a5c-8dd6-0793d2204587\":0}}","tags":{"INSERTION_TIME":"1717737559000000","MIN_INSERTION_TIME":"1717737559000000","MAX_INSERTION_TIME":"1717737559000000","OPTIMIZE_TARGET_SIZE":"67108864"}}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
{
"format-version" : 1,
"table-uuid" : "0eb73468-144c-44fb-911c-8cee8bf441aa",
"location" : "s3://trino-ci-test/databricks-unity/af968baf-af29-4111-8bd7-4a67d3a3a39f/tables/867ed8f6-384c-42b7-a589-76551ed2dea1",
"last-updated-ms" : 1717737524993,
"last-column-id" : 2,
"schema" : {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "a",
"required" : false,
"type" : "int"
}, {
"id" : 2,
"name" : "b",
"required" : false,
"type" : "string"
} ]
},
"current-schema-id" : 0,
"schemas" : [ {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "a",
"required" : false,
"type" : "int"
}, {
"id" : 2,
"name" : "b",
"required" : false,
"type" : "string"
} ]
} ],
"partition-spec" : [ ],
"default-spec-id" : 0,
"partition-specs" : [ {
"spec-id" : 0,
"fields" : [ ]
} ],
"last-partition-id" : 999,
"default-sort-order-id" : 0,
"sort-orders" : [ {
"order-id" : 0,
"fields" : [ ]
} ],
"properties" : {
"schema.name-mapping.default" : "[ {\n \"field-id\" : 1,\n \"names\" : [ \"a\" ]\n}, {\n \"field-id\" : 2,\n \"names\" : [ \"b\" ]\n} ]",
"delta-timestamp" : "1717737495000",
"delta-version" : "-1"
},
"current-snapshot-id" : -1,
"refs" : { },
"snapshots" : [ ],
"statistics" : [ ],
"snapshot-log" : [ ],
"metadata-log" : [ ]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
{
"format-version" : 1,
"table-uuid" : "0eb73468-144c-44fb-911c-8cee8bf441aa",
"location" : "s3://trino-ci-test/databricks-unity/af968baf-af29-4111-8bd7-4a67d3a3a39f/tables/867ed8f6-384c-42b7-a589-76551ed2dea1",
"last-updated-ms" : 1717737525809,
"last-column-id" : 2,
"schema" : {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "a",
"required" : false,
"type" : "int"
}, {
"id" : 2,
"name" : "b",
"required" : false,
"type" : "string"
} ]
},
"current-schema-id" : 0,
"schemas" : [ {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "a",
"required" : false,
"type" : "int"
}, {
"id" : 2,
"name" : "b",
"required" : false,
"type" : "string"
} ]
} ],
"partition-spec" : [ ],
"default-spec-id" : 0,
"partition-specs" : [ {
"spec-id" : 0,
"fields" : [ ]
} ],
"last-partition-id" : 999,
"default-sort-order-id" : 0,
"sort-orders" : [ {
"order-id" : 0,
"fields" : [ ]
} ],
"properties" : {
"schema.name-mapping.default" : "[ {\n \"field-id\" : 1,\n \"names\" : [ \"a\" ]\n}, {\n \"field-id\" : 2,\n \"names\" : [ \"b\" ]\n} ]",
"delta-timestamp" : "1717737495000",
"delta-version" : "0"
},
"current-snapshot-id" : -1,
"refs" : { },
"snapshots" : [ ],
"statistics" : [ ],
"snapshot-log" : [ ],
"metadata-log" : [ {
"timestamp-ms" : 1717737524993,
"metadata-file" : "s3://trino-ci-test/databricks-unity/af968baf-af29-4111-8bd7-4a67d3a3a39f/tables/867ed8f6-384c-42b7-a589-76551ed2dea1/metadata/00000-a6f70674-5338-4b57-b07e-cd4b3253fa44.metadata.json"
} ]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
{
"format-version" : 1,
"table-uuid" : "0eb73468-144c-44fb-911c-8cee8bf441aa",
"location" : "s3://trino-ci-test/databricks-unity/af968baf-af29-4111-8bd7-4a67d3a3a39f/tables/867ed8f6-384c-42b7-a589-76551ed2dea1",
"last-updated-ms" : 1717737567454,
"last-column-id" : 2,
"schema" : {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "a",
"required" : false,
"type" : "int"
}, {
"id" : 2,
"name" : "b",
"required" : false,
"type" : "string"
} ]
},
"current-schema-id" : 0,
"schemas" : [ {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "a",
"required" : false,
"type" : "int"
}, {
"id" : 2,
"name" : "b",
"required" : false,
"type" : "string"
} ]
} ],
"partition-spec" : [ ],
"default-spec-id" : 0,
"partition-specs" : [ {
"spec-id" : 0,
"fields" : [ ]
} ],
"last-partition-id" : 999,
"default-sort-order-id" : 0,
"sort-orders" : [ {
"order-id" : 0,
"fields" : [ ]
} ],
"properties" : {
"schema.name-mapping.default" : "[ {\n \"field-id\" : 1,\n \"names\" : [ \"a\" ]\n}, {\n \"field-id\" : 2,\n \"names\" : [ \"b\" ]\n} ]",
"delta-timestamp" : "1717737561000",
"delta-version" : "1"
},
"current-snapshot-id" : 3448407241734536873,
"refs" : {
"main" : {
"snapshot-id" : 3448407241734536873,
"type" : "branch"
}
},
"snapshots" : [ {
"snapshot-id" : 3448407241734536873,
"timestamp-ms" : 1717737567285,
"summary" : {
"operation" : "append",
"added-data-files" : "1",
"added-records" : "1",
"added-files-size" : "1320",
"changed-partition-count" : "1",
"total-records" : "1",
"total-files-size" : "1320",
"total-data-files" : "1",
"total-delete-files" : "0",
"total-position-deletes" : "0",
"total-equality-deletes" : "0"
},
"manifest-list" : "s3://trino-ci-test/databricks-unity/af968baf-af29-4111-8bd7-4a67d3a3a39f/tables/867ed8f6-384c-42b7-a589-76551ed2dea1/metadata/snap-3448407241734536873-1-2dc6572c-92dd-4786-80d0-324b424124b3.avro",
"schema-id" : 0
} ],
"statistics" : [ ],
"snapshot-log" : [ {
"timestamp-ms" : 1717737567285,
"snapshot-id" : 3448407241734536873
} ],
"metadata-log" : [ {
"timestamp-ms" : 1717737524993,
"metadata-file" : "s3://trino-ci-test/databricks-unity/af968baf-af29-4111-8bd7-4a67d3a3a39f/tables/867ed8f6-384c-42b7-a589-76551ed2dea1/metadata/00000-a6f70674-5338-4b57-b07e-cd4b3253fa44.metadata.json"
}, {
"timestamp-ms" : 1717737525809,
"metadata-file" : "s3://trino-ci-test/databricks-unity/af968baf-af29-4111-8bd7-4a67d3a3a39f/tables/867ed8f6-384c-42b7-a589-76551ed2dea1/metadata/00001-15a4900d-0b87-4ca8-93cf-ebde86de39aa.metadata.json"
} ]
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
Data generated using Databricks Runtime 14.3 with Unity Catalog.
The `delta.universalFormat.enabledFormats` table property requires Unity Catalog.

```sql
CREATE TABLE main.default.test_uniform_iceberg_v2
(a integer, b string)
USING DELTA
TBLPROPERTIES (
'delta.enableIcebergCompatV2' = 'true',
'delta.universalFormat.enabledFormats' = 'iceberg'
);

INSERT INTO main.default.test_uniform_iceberg_v2 VALUES (1, 'test data');
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"commitInfo":{"timestamp":1717738469512,"userId":"7853186923043731","userName":"[email protected]","operation":"CREATE TABLE","operationParameters":{"partitionBy":"[]","description":null,"isManaged":"true","properties":"{\"delta.enableIcebergCompatV2\":\"true\",\"delta.universalFormat.enabledFormats\":\"iceberg\",\"delta.columnMapping.mode\":\"name\",\"delta.columnMapping.maxColumnId\":\"2\"}","statsOnLoad":false},"notebook":{"notebookId":"1841155838656679"},"clusterId":"0607-052725-dyf4xqms","isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{},"tags":{"restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/14.3.x-scala2.12","txnId":"0220e873-19b8-42cf-be60-c903515c9ca8"}}
{"metaData":{"id":"ba5c6d1b-f968-4204-bde0-f9ce04c5165b","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"a\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":1,\"delta.columnMapping.nested.ids\":{},\"delta.columnMapping.physicalName\":\"col-5d3fdc39-eb9a-484c-a266-50b74243b351\"}},{\"name\":\"b\",\"type\":\"string\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":2,\"delta.columnMapping.nested.ids\":{},\"delta.columnMapping.physicalName\":\"col-f0fe15be-af45-4422-8cb1-4ba97bd1af89\"}}]}","partitionColumns":[],"configuration":{"delta.enableIcebergCompatV2":"true","delta.universalFormat.enabledFormats":"iceberg","delta.columnMapping.mode":"name","delta.columnMapping.maxColumnId":"2"},"createdTime":1717738467564}}
{"protocol":{"minReaderVersion":2,"minWriterVersion":7,"writerFeatures":["columnMapping","icebergCompatV2"]}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"commitInfo":{"timestamp":1717738519331,"userId":"7853186923043731","userName":"[email protected]","operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"notebook":{"notebookId":"1841155838656679"},"clusterId":"0607-052725-dyf4xqms","readVersion":0,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"1360"},"tags":{"restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/14.3.x-scala2.12","txnId":"69ff1ee1-d2b0-4d51-8bbd-38290b97b55d"}}
{"add":{"path":"OZ/part-00000-01da98d5-5186-49d7-bd68-477cd393b4d4-c000.zstd.parquet","partitionValues":{},"size":1360,"modificationTime":1717738518000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"col-5d3fdc39-eb9a-484c-a266-50b74243b351\":1,\"col-f0fe15be-af45-4422-8cb1-4ba97bd1af89\":\"test data\"},\"maxValues\":{\"col-5d3fdc39-eb9a-484c-a266-50b74243b351\":1,\"col-f0fe15be-af45-4422-8cb1-4ba97bd1af89\":\"test data\"},\"nullCount\":{\"col-5d3fdc39-eb9a-484c-a266-50b74243b351\":0,\"col-f0fe15be-af45-4422-8cb1-4ba97bd1af89\":0}}","tags":{"MAX_INSERTION_TIME":"1717738518000000","INSERTION_TIME":"1717738518000000","ICEBERG_COMPAT_VERSION":"2","MIN_INSERTION_TIME":"1717738518000000","OPTIMIZE_TARGET_SIZE":"67108864"}}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
{
"format-version" : 1,
"table-uuid" : "3dcdec09-c735-4cf3-b5d1-9bfa9786ebaa",
"location" : "s3://trino-ci-test/databricks-unity/af968baf-af29-4111-8bd7-4a67d3a3a39f/tables/1cf271cf-2321-4fa7-8169-b611445faf5f",
"last-updated-ms" : 1717738503539,
"last-column-id" : 2,
"schema" : {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "a",
"required" : false,
"type" : "int"
}, {
"id" : 2,
"name" : "b",
"required" : false,
"type" : "string"
} ]
},
"current-schema-id" : 0,
"schemas" : [ {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "a",
"required" : false,
"type" : "int"
}, {
"id" : 2,
"name" : "b",
"required" : false,
"type" : "string"
} ]
} ],
"partition-spec" : [ ],
"default-spec-id" : 0,
"partition-specs" : [ {
"spec-id" : 0,
"fields" : [ ]
} ],
"last-partition-id" : 999,
"default-sort-order-id" : 0,
"sort-orders" : [ {
"order-id" : 0,
"fields" : [ ]
} ],
"properties" : {
"schema.name-mapping.default" : "[ {\n \"field-id\" : 1,\n \"names\" : [ \"a\" ]\n}, {\n \"field-id\" : 2,\n \"names\" : [ \"b\" ]\n} ]",
"delta-timestamp" : "1717738471000",
"delta-version" : "-1"
},
"current-snapshot-id" : -1,
"refs" : { },
"snapshots" : [ ],
"statistics" : [ ],
"snapshot-log" : [ ],
"metadata-log" : [ ]
}
Loading

0 comments on commit e14ae4d

Please sign in to comment.