class TableMetadataUpdateBuilder:
    """Applies a sequence of TableUpdate operations to a base TableMetadata.

    First-draft implementation that mutates a re-parsed copy of the base
    metadata model attribute-by-attribute.

    NOTE(review): pyiceberg metadata models are typically frozen/immutable —
    if that holds here, the attribute assignments below will raise at runtime;
    consider operating on a plain ``model_dump()`` dict instead. TODO confirm.
    """

    _base_metadata: TableMetadata          # working copy being mutated
    _updates: List[TableUpdate]            # updates applied so far, in order
    _last_added_schema_id: Optional[int]   # target of a "set current to last added" (-1)

    def __init__(self, base_metadata: TableMetadata) -> None:
        # Re-parse a dump of the base metadata so mutations never leak back
        # into the caller's model.
        self._base_metadata = TableMetadataUtil.parse_obj(copy(base_metadata.model_dump()))
        self._updates = []
        self._last_added_schema_id = None

    def _reuse_or_create_new_schema_id(self, new_schema: Schema) -> Tuple[int, bool]:
        """Return (schema_id, is_new): an equal schema's id is reused, else highest id + 1."""
        new_schema_id = self._base_metadata.current_schema_id
        for schema in self._base_metadata.schemas:
            if schema == new_schema:
                return schema.schema_id, False
            elif schema.schema_id >= new_schema_id:
                new_schema_id = schema.schema_id + 1
        return new_schema_id, True

    def _add_schema_internal(self, schema: Schema, last_column_id: int, update: TableUpdate) -> int:
        """Register ``schema`` in the metadata, record ``update``; return the assigned schema id."""
        if last_column_id < self._base_metadata.last_column_id:
            raise ValueError(f"Invalid last column id {last_column_id}, must be >= {self._base_metadata.last_column_id}")
        new_schema_id, is_new_schema = self._reuse_or_create_new_schema_id(schema)
        # BUG FIX: the early-return path applies when the schema already
        # exists (second tuple element True meant "new"), the original had
        # the condition inverted and also appended on the wrong branch below.
        if not is_new_schema and last_column_id == self._base_metadata.last_column_id:
            if self._last_added_schema_id is not None and any(
                prev.schema_.schema_id == new_schema_id for prev in self._updates if isinstance(prev, AddSchemaUpdate)
            ):
                self._last_added_schema_id = new_schema_id
            return new_schema_id

        self._base_metadata.last_column_id = last_column_id

        new_schema = (
            schema
            if new_schema_id == schema.schema_id
            # Re-number the schema; fields are passed positionally (Schema(*fields, ...)).
            else Schema(*schema.fields, schema_id=new_schema_id, identifier_field_ids=schema.identifier_field_ids)
        )

        if is_new_schema:
            self._base_metadata.schemas.append(new_schema)

        self._updates.append(update)
        self._last_added_schema_id = new_schema_id
        return new_schema_id

    def _set_current_schema(self, schema_id: int) -> None:
        """Make ``schema_id`` current; -1 means the schema added last by this builder."""
        if schema_id == -1:
            if self._last_added_schema_id is None:
                raise ValueError("Cannot set current schema to last added schema when no schema has been added")
            return self._set_current_schema(self._last_added_schema_id)

        if schema_id == self._base_metadata.current_schema_id:
            return

        # BUG FIX: next() without a default raises StopIteration, which made
        # the None check below unreachable.
        schema = next((s for s in self._base_metadata.schemas if s.schema_id == schema_id), None)
        if schema is None:
            raise ValueError(f"Schema with id {schema_id} does not exist")

        self._base_metadata.current_schema_id = schema_id
        if self._last_added_schema_id == schema_id:
            # Recorded as -1 so a replay of the update stream re-binds to
            # whatever schema was added last.
            self._updates.append(SetCurrentSchemaUpdate(schema_id=-1))
        else:
            self._updates.append(SetCurrentSchemaUpdate(schema_id=schema_id))

    @singledispatchmethod
    def update_table_metadata(self, update: TableUpdate) -> None:
        """Apply one update; dispatches on the concrete TableUpdate subclass."""
        raise TypeError(f"Unsupported update: {update}")

    @update_table_metadata.register(UpgradeFormatVersionUpdate)
    def _(self, update: UpgradeFormatVersionUpdate) -> None:
        if update.format_version > SUPPORTED_TABLE_FORMAT_VERSION:
            raise ValueError(f"Unsupported table format version: {update.format_version}")
        if update.format_version < self._base_metadata.format_version:
            raise ValueError(f"Cannot downgrade v{self._base_metadata.format_version} table to v{update.format_version}")
        if update.format_version == self._base_metadata.format_version:
            return
        # At this point the base metadata must be v1 and the target v2.
        if isinstance(self._base_metadata, TableMetadataV1):
            # BUG FIX: the original fell through to the raise even after a
            # successful v1 -> v2 upgrade.
            self._base_metadata = self._base_metadata.to_v2()
            return
        raise ValueError(f"Cannot upgrade v{self._base_metadata.format_version} table to v{update.format_version}")

    @update_table_metadata.register(AddSchemaUpdate)
    def _(self, update: AddSchemaUpdate) -> None:
        self._add_schema_internal(update.schema_, update.last_column_id, update)

    @update_table_metadata.register(SetCurrentSchemaUpdate)
    def _(self, update: SetCurrentSchemaUpdate) -> None:
        self._set_current_schema(update.schema_id)

    @update_table_metadata.register(AddSnapshotUpdate)
    def _(self, update: AddSnapshotUpdate) -> None:
        if len(self._base_metadata.schemas) == 0:
            raise ValueError("Attempting to add a snapshot before a schema is added")
        if len(self._base_metadata.partition_specs) == 0:
            raise ValueError("Attempting to add a snapshot before a partition spec is added")
        if len(self._base_metadata.sort_orders) == 0:
            raise ValueError("Attempting to add a snapshot before a sort order is added")
        if any(update.snapshot.snapshot_id == snapshot.snapshot_id for snapshot in self._base_metadata.snapshots):
            raise ValueError(f"Snapshot with id {update.snapshot.snapshot_id} already exists")
        if (
            self._base_metadata.format_version == 2
            and update.snapshot.sequence_number is not None
            and self._base_metadata.last_sequence_number is not None
            and update.snapshot.sequence_number <= self._base_metadata.last_sequence_number
            and update.snapshot.parent_snapshot_id is not None  # the very first snapshot may reuse the number
        ):
            raise ValueError(
                f"Cannot add snapshot with sequence number {update.snapshot.sequence_number} older than last sequence number {self._base_metadata.last_sequence_number}"
            )

        # BUG FIX: Snapshot exposes timestamp_ms, not timestamp.
        self._base_metadata.last_updated_ms = update.snapshot.timestamp_ms
        self._base_metadata.last_sequence_number = update.snapshot.sequence_number
        self._base_metadata.snapshots.append(update.snapshot)
        self._updates.append(update)

    @update_table_metadata.register(SetSnapshotRefUpdate)
    def _(self, update: SetSnapshotRefUpdate) -> None:
        # TODO: some of these validations could live on the SnapshotRef class.
        if update.type is None:
            raise ValueError("Snapshot ref type must be set")
        if update.min_snapshots_to_keep is not None and update.type == SnapshotRefType.TAG:
            raise ValueError("Cannot set min snapshots to keep for branch refs")
        if update.min_snapshots_to_keep is not None and update.min_snapshots_to_keep <= 0:
            # BUG FIX: the message said ">= 0" although 0 is rejected.
            raise ValueError("Minimum snapshots to keep must be > 0")
        if update.max_snapshot_age_ms is not None and update.type == SnapshotRefType.TAG:
            raise ValueError("Tags do not support setting maxSnapshotAgeMs")
        if update.max_snapshot_age_ms is not None and update.max_snapshot_age_ms <= 0:
            raise ValueError("Max snapshot age must be > 0 ms")
        if update.max_age_ref_ms is not None and update.max_age_ref_ms <= 0:
            raise ValueError("Max ref age must be > 0 ms")
        snapshot_ref = SnapshotRef(
            snapshot_id=update.snapshot_id,
            snapshot_ref_type=update.type,
            min_snapshots_to_keep=update.min_snapshots_to_keep,
            max_snapshot_age_ms=update.max_snapshot_age_ms,
            max_ref_age_ms=update.max_age_ref_ms,
        )
        # BUG FIX: SnapshotRef carries no ref name; the name comes from the update.
        existing_ref = self._base_metadata.refs.get(update.ref_name)
        if existing_ref is not None and existing_ref == snapshot_ref:
            return

        # BUG FIX: next() needs a default, otherwise StopIteration pre-empts
        # the None check below.
        snapshot = next(
            (s for s in self._base_metadata.snapshots if s.snapshot_id == snapshot_ref.snapshot_id), None
        )
        if snapshot is None:
            raise ValueError(f"Cannot set {update.ref_name} to unknown snapshot {snapshot_ref.snapshot_id}")

        # BUG FIX: the isinstance filter must test each prior update, not the
        # _updates list object itself (that was always False).
        if any(
            snapshot_ref.snapshot_id == prev_update.snapshot.snapshot_id
            for prev_update in self._updates
            if isinstance(prev_update, AddSnapshotUpdate)
        ):
            # BUG FIX: Snapshot.timestamp_ms, not .timestamp.
            self._base_metadata.last_updated_ms = snapshot.timestamp_ms

        if update.ref_name == MAIN_BRANCH:
            self._base_metadata.current_snapshot_id = snapshot_ref.snapshot_id
            self._base_metadata.snapshot_log.append(
                SnapshotLogEntry(
                    snapshot_id=snapshot_ref.snapshot_id,
                    timestamp_ms=self._base_metadata.last_updated_ms,
                )
            )

        self._base_metadata.refs[update.ref_name] = snapshot_ref
        self._updates.append(update)

    def build(self) -> TableMetadata:
        """Validate and parse the mutated metadata back into a fresh TableMetadata."""
        return TableMetadataUtil.parse_obj(self._base_metadata.model_dump())


class TableRequirement(IcebergBaseModel):
    """Base class for the validations a commit requires before it may be applied.

    NOTE(review): the method name ``validate`` shadows pydantic's
    ``BaseModel.validate`` classmethod — confirm this does not conflict with
    the pydantic version in use.
    """

    type: str

    @abstractmethod
    def validate(self, base_metadata: TableMetadata) -> None:
        """Validate the requirement against the base metadata.

        Raises:
            ValueError: If the requirement is not met.
        """
        ...
class AssertCreate(TableRequirement):
    """The table must not already exist; used for create transactions."""

    type: Literal["assert-create"] = Field(default="assert-create")

    def validate(self, base_metadata: TableMetadata) -> None:
        if base_metadata is not None:
            raise ValueError("Table already exists")


class AssertTableUUID(TableRequirement):
    """The table UUID must match the requirement's `uuid`."""

    type: Literal["assert-table-uuid"] = Field(default="assert-table-uuid")
    uuid: str

    def validate(self, base_metadata: TableMetadata) -> None:
        if self.uuid != base_metadata.uuid:
            raise ValueError(f"Table UUID does not match: {self.uuid} != {base_metadata.uuid}")


class AssertRefSnapshotId(TableRequirement):
    """The table branch or tag identified by the requirement's `ref` must reference the requirement's `snapshot-id`.

    if `snapshot-id` is `null` or missing, the ref must not already exist.
    """

    type: Literal["assert-ref-snapshot-id"] = Field(default="assert-ref-snapshot-id")
    ref: str
    snapshot_id: Optional[int] = Field(default=None, alias="snapshot-id")

    def validate(self, base_metadata: TableMetadata) -> None:
        snapshot_ref = base_metadata.refs.get(self.ref)
        if snapshot_ref is not None:
            ref_type = snapshot_ref.snapshot_ref_type
            if self.snapshot_id is None:
                # BUG FIX: `self.ref_tpe` was a typo (AttributeError at raise
                # time); use the ref type resolved from the existing ref.
                raise ValueError(f"Requirement failed: {ref_type} {self.ref} was created concurrently")
            elif self.snapshot_id != snapshot_ref.snapshot_id:
                raise ValueError(
                    f"Requirement failed: {ref_type} {self.ref} has changed: expected id {self.snapshot_id}, found {snapshot_ref.snapshot_id}"
                )
        elif self.snapshot_id is not None:
            raise ValueError(f"Requirement failed: branch or tag {self.ref} is missing, expected {self.snapshot_id}")


class AssertLastAssignedFieldId(TableRequirement):
    """The table's last assigned column id must match the requirement's `last-assigned-field-id`."""

    type: Literal["assert-last-assigned-field-id"] = Field(default="assert-last-assigned-field-id")
    last_assigned_field_id: int = Field(..., alias="last-assigned-field-id")

    def validate(self, base_metadata: TableMetadata) -> None:
        # Implemented in place of the NotImplementedError stub.
        # NOTE(review): assumes TableMetadata exposes last_column_id — confirm.
        if base_metadata.last_column_id != self.last_assigned_field_id:
            raise ValueError(
                f"Requirement failed: last assigned field id has changed: expected {self.last_assigned_field_id}, found {base_metadata.last_column_id}"
            )


class AssertCurrentSchemaId(TableRequirement):
    """The table's current schema id must match the requirement's `current-schema-id`."""

    type: Literal["assert-current-schema-id"] = Field(default="assert-current-schema-id")
    current_schema_id: int = Field(..., alias="current-schema-id")

    def validate(self, base_metadata: TableMetadata) -> None:
        if base_metadata.current_schema_id != self.current_schema_id:
            raise ValueError(
                f"Requirement failed: current schema id has changed: expected {self.current_schema_id}, found {base_metadata.current_schema_id}"
            )


class AssertLastAssignedPartitionId(TableRequirement):
    """The table's last assigned partition id must match the requirement's `last-assigned-partition-id`."""

    type: Literal["assert-last-assigned-partition-id"] = Field(default="assert-last-assigned-partition-id")
    last_assigned_partition_id: int = Field(..., alias="last-assigned-partition-id")

    def validate(self, base_metadata: TableMetadata) -> None:
        # NOTE(review): assumes TableMetadata exposes last_partition_id — confirm.
        if base_metadata.last_partition_id != self.last_assigned_partition_id:
            raise ValueError(
                f"Requirement failed: last assigned partition id has changed: expected {self.last_assigned_partition_id}, found {base_metadata.last_partition_id}"
            )


class AssertDefaultSpecId(TableRequirement):
    """The table's default spec id must match the requirement's `default-spec-id`."""

    type: Literal["assert-default-spec-id"] = Field(default="assert-default-spec-id")
    default_spec_id: int = Field(..., alias="default-spec-id")

    def validate(self, base_metadata: TableMetadata) -> None:
        if base_metadata.default_spec_id != self.default_spec_id:
            raise ValueError(
                f"Requirement failed: default spec id has changed: expected {self.default_spec_id}, found {base_metadata.default_spec_id}"
            )


class AssertDefaultSortOrderId(TableRequirement):
    """The table's default sort order id must match the requirement's `default-sort-order-id`."""

    type: Literal["assert-default-sort-order-id"] = Field(default="assert-default-sort-order-id")
    default_sort_order_id: int = Field(..., alias="default-sort-order-id")

    def validate(self, base_metadata: TableMetadata) -> None:
        if base_metadata.default_sort_order_id != self.default_sort_order_id:
            raise ValueError(
                f"Requirement failed: default sort order id has changed: expected {self.default_sort_order_id}, found {base_metadata.default_sort_order_id}"
            )


def update_table_metadata(base_metadata: TableMetadata, updates: Tuple[TableUpdate, ...]) -> TableMetadata:
    """Apply ``updates`` in order on top of ``base_metadata`` and return the new metadata.

    The base metadata is never mutated; the builder works on a copy.
    """
    builder = TableMetadataUpdateBuilder(base_metadata)
    for update in updates:
        builder.update_table_metadata(update)
    return builder.build()
class TableMetadataUpdateBuilder:
    """Applies a sequence of TableUpdate operations to table metadata.

    The pydantic metadata models are treated as immutable, so the builder
    works on the raw dict produced by ``model_dump()`` and re-parses it into
    a TableMetadata in ``build()``.  Keys follow the serialized, hyphenated
    form (e.g. "current-schema-id"); this assumes ``model_dump()`` emits
    alias keys — TODO confirm against IcebergBaseModel's config.  Everything
    stored back into the dict is itself dumped to plain data so the
    intermediate state stays uniformly "raw".
    """

    _base_metadata: Dict[str, Any]         # raw (serialized-form) metadata being mutated
    _updates: List[TableUpdate]            # updates applied so far, in order
    _last_added_schema_id: Optional[int]   # target of a "set current to last added" (-1)

    def __init__(self, base_metadata: TableMetadata) -> None:
        # model_dump() already yields a fresh structure; copy() only guards
        # the top-level dict against accidental aliasing.
        self._base_metadata = copy(base_metadata.model_dump())
        self._updates = []
        self._last_added_schema_id = None

    def _reuse_or_create_new_schema_id(self, new_schema: Schema) -> Tuple[int, bool]:
        """Return (schema_id, is_new): an equal schema's id is reused, else highest id + 1."""
        new_schema_id = self._base_metadata["current-schema-id"]
        for raw_schema in self._base_metadata["schemas"]:
            schema = Schema(**raw_schema)
            if schema == new_schema:
                return schema.schema_id, False
            elif schema.schema_id >= new_schema_id:
                new_schema_id = schema.schema_id + 1
        return new_schema_id, True

    def _add_schema_internal(self, schema: Schema, last_column_id: int, update: TableUpdate) -> int:
        """Add ``schema`` (re-numbered if needed), record ``update``; return the schema id used."""
        if last_column_id < self._base_metadata["last-column-id"]:
            raise ValueError(f"Invalid last column id {last_column_id}, must be >= {self._base_metadata['last-column-id']}")
        new_schema_id, is_new_schema = self._reuse_or_create_new_schema_id(schema)
        if not is_new_schema and last_column_id == self._base_metadata["last-column-id"]:
            # Schema already present and nothing else changes; only remember
            # it as "last added" when a prior update in this builder added it.
            if self._last_added_schema_id is not None and any(
                prev.schema_.schema_id == new_schema_id for prev in self._updates if isinstance(prev, AddSchemaUpdate)
            ):
                self._last_added_schema_id = new_schema_id
            return new_schema_id

        self._base_metadata["last-column-id"] = last_column_id

        new_schema = (
            schema
            if new_schema_id == schema.schema_id
            # Re-number the schema; fields are passed positionally (Schema(*fields, ...)).
            else Schema(*schema.fields, schema_id=new_schema_id, identifier_field_ids=schema.identifier_field_ids)
        )

        if is_new_schema:
            self._base_metadata["schemas"].append(new_schema.model_dump())

        self._updates.append(update)
        self._last_added_schema_id = new_schema_id
        return new_schema_id

    def _set_current_schema(self, schema_id: int) -> None:
        """Make ``schema_id`` the current schema; -1 means the schema added last by this builder."""
        if schema_id == -1:
            if self._last_added_schema_id is None:
                raise ValueError("Cannot set current schema to last added schema when no schema has been added")
            return self._set_current_schema(self._last_added_schema_id)

        if schema_id == self._base_metadata["current-schema-id"]:
            return

        schema = next(
            (Schema(**raw_schema) for raw_schema in self._base_metadata["schemas"] if raw_schema["schema-id"] == schema_id),
            None,
        )
        if schema is None:
            raise ValueError(f"Schema with id {schema_id} does not exist")

        self._base_metadata["current-schema-id"] = schema_id
        if self._last_added_schema_id == schema_id:
            # Recorded as -1 so replaying the update stream re-binds to
            # whatever schema was added last.
            self._updates.append(SetCurrentSchemaUpdate(schema_id=-1))
        else:
            self._updates.append(SetCurrentSchemaUpdate(schema_id=schema_id))

    @singledispatchmethod
    def update_table_metadata(self, update: TableUpdate) -> None:
        """Apply one update; dispatches on the concrete TableUpdate subclass."""
        raise TypeError(f"Unsupported update: {update}")

    @update_table_metadata.register(UpgradeFormatVersionUpdate)
    def _(self, update: UpgradeFormatVersionUpdate) -> None:
        current_format_version = self._base_metadata["format-version"]
        if update.format_version > SUPPORTED_TABLE_FORMAT_VERSION:
            raise ValueError(f"Unsupported table format version: {update.format_version}")
        if update.format_version < current_format_version:
            raise ValueError(f"Cannot downgrade v{current_format_version} table to v{update.format_version}")
        if update.format_version == current_format_version:
            return
        if current_format_version == 1 and update.format_version == 2:
            # BUG FIX: the original fell through to the raise below even
            # after a successful upgrade.
            # NOTE(review): a complete v1 -> v2 conversion may need more
            # fields (e.g. "last-sequence-number") — confirm build() accepts this.
            self._base_metadata["format-version"] = 2
            return
        raise ValueError(f"Cannot upgrade v{current_format_version} table to v{update.format_version}")

    @update_table_metadata.register(AddSchemaUpdate)
    def _(self, update: AddSchemaUpdate) -> None:
        self._add_schema_internal(update.schema_, update.last_column_id, update)

    @update_table_metadata.register(SetCurrentSchemaUpdate)
    def _(self, update: SetCurrentSchemaUpdate) -> None:
        self._set_current_schema(update.schema_id)

    @update_table_metadata.register(AddSnapshotUpdate)
    def _(self, update: AddSnapshotUpdate) -> None:
        if len(self._base_metadata["schemas"]) == 0:
            raise ValueError("Attempting to add a snapshot before a schema is added")
        if len(self._base_metadata["partition-specs"]) == 0:
            raise ValueError("Attempting to add a snapshot before a partition spec is added")
        if len(self._base_metadata["sort-orders"]) == 0:
            raise ValueError("Attempting to add a snapshot before a sort order is added")
        if any(
            update.snapshot.snapshot_id == Snapshot(**raw_snapshot).snapshot_id
            for raw_snapshot in self._base_metadata["snapshots"]
        ):
            raise ValueError(f"Snapshot with id {update.snapshot.snapshot_id} already exists")
        if (
            self._base_metadata["format-version"] == 2
            and update.snapshot.sequence_number is not None
            and self._base_metadata["last-sequence-number"] is not None
            and update.snapshot.sequence_number <= self._base_metadata["last-sequence-number"]
            and update.snapshot.parent_snapshot_id is not None  # the very first snapshot may reuse the number
        ):
            raise ValueError(
                f"Cannot add snapshot with sequence number {update.snapshot.sequence_number} older than last sequence number {self._base_metadata['last-sequence-number']}"
            )

        self._base_metadata["last-updated-ms"] = update.snapshot.timestamp_ms
        self._base_metadata["last-sequence-number"] = update.snapshot.sequence_number
        # BUG FIX: keep the raw-dict invariant — a model instance in this list
        # would break the Snapshot(**raw_snapshot) reads above and in the
        # snapshot-ref handler.
        self._base_metadata["snapshots"].append(update.snapshot.model_dump())
        self._updates.append(update)

    @update_table_metadata.register(SetSnapshotRefUpdate)
    def _(self, update: SetSnapshotRefUpdate) -> None:
        # TODO: some of these validations could live on the SnapshotRef class.
        if update.type is None:
            raise ValueError("Snapshot ref type must be set")
        if update.min_snapshots_to_keep is not None and update.type == SnapshotRefType.TAG:
            raise ValueError("Cannot set min snapshots to keep for branch refs")
        if update.min_snapshots_to_keep is not None and update.min_snapshots_to_keep <= 0:
            # BUG FIX: the message said ">= 0" although 0 is rejected.
            raise ValueError("Minimum snapshots to keep must be > 0")
        if update.max_snapshot_age_ms is not None and update.type == SnapshotRefType.TAG:
            raise ValueError("Tags do not support setting maxSnapshotAgeMs")
        if update.max_snapshot_age_ms is not None and update.max_snapshot_age_ms <= 0:
            raise ValueError("Max snapshot age must be > 0 ms")
        if update.max_age_ref_ms is not None and update.max_age_ref_ms <= 0:
            raise ValueError("Max ref age must be > 0 ms")
        snapshot_ref = SnapshotRef(
            snapshot_id=update.snapshot_id,
            snapshot_ref_type=update.type,
            min_snapshots_to_keep=update.min_snapshots_to_keep,
            max_snapshot_age_ms=update.max_snapshot_age_ms,
            max_ref_age_ms=update.max_age_ref_ms,
        )
        raw_snapshot_ref = snapshot_ref.model_dump()
        # BUG FIX: refs holds raw dicts, so compare (and later store) the
        # dumped form; comparing a model against a dict was always False,
        # defeating the no-op short-circuit.
        existing_ref = self._base_metadata["refs"].get(update.ref_name)
        if existing_ref is not None and existing_ref == raw_snapshot_ref:
            return

        snapshot = next(
            (
                Snapshot(**raw_snapshot)
                for raw_snapshot in self._base_metadata["snapshots"]
                if raw_snapshot["snapshot-id"] == snapshot_ref.snapshot_id
            ),
            None,
        )
        if snapshot is None:
            # BUG FIX: SnapshotRef has no ref_name attribute; use the update's.
            raise ValueError(f"Cannot set {update.ref_name} to unknown snapshot {snapshot_ref.snapshot_id}")

        # BUG FIX: the isinstance filter must test each prior update, not the
        # _updates list object itself, and Snapshot exposes timestamp_ms.
        if any(
            snapshot_ref.snapshot_id == prev_update.snapshot.snapshot_id
            for prev_update in self._updates
            if isinstance(prev_update, AddSnapshotUpdate)
        ):
            self._base_metadata["last-updated-ms"] = snapshot.timestamp_ms

        if update.ref_name == MAIN_BRANCH:
            self._base_metadata["current-snapshot-id"] = snapshot_ref.snapshot_id
            if self._base_metadata["last-updated-ms"] is None:
                self._base_metadata["last-updated-ms"] = datetime_to_millis(datetime.datetime.now().astimezone())
            self._base_metadata["snapshot-log"].append(
                SnapshotLogEntry(
                    snapshot_id=snapshot_ref.snapshot_id,
                    timestamp_ms=self._base_metadata["last-updated-ms"],
                ).model_dump()
            )

        self._base_metadata["refs"][update.ref_name] = raw_snapshot_ref
        self._updates.append(update)

    def build(self) -> TableMetadata:
        """Validate and parse the mutated raw metadata back into a TableMetadata."""
        return TableMetadataUtil.parse_obj(self._base_metadata)
def test_update_metadata_table_schema(table: Table) -> None:
    """Adding a column through a transaction yields a new current schema with id 2."""
    transaction = table.transaction()
    update = transaction.update_schema()
    update.add_column(path="b", field_type=IntegerType())
    update.commit()
    new_metadata = update_table_metadata(table.metadata, transaction._updates)  # pylint: disable=W0212

    apply_schema: Schema = next(schema for schema in new_metadata.schemas if schema.schema_id == 2)
    assert len(apply_schema.fields) == 4

    assert apply_schema == Schema(
        NestedField(field_id=1, name="x", field_type=LongType(), required=True),
        NestedField(field_id=2, name="y", field_type=LongType(), required=True, doc="comment"),
        NestedField(field_id=3, name="z", field_type=LongType(), required=True),
        NestedField(field_id=4, name="b", field_type=IntegerType(), required=False),
        identifier_field_ids=[1, 2],
    )
    assert apply_schema.schema_id == 2
    assert apply_schema.highest_field_id == 4
    assert new_metadata.current_schema_id == 2


def test_update_metadata_add_snapshot(table: Table) -> None:
    """An AddSnapshotUpdate appends the snapshot and advances timestamps/sequence numbers."""
    new_snapshot = Snapshot(
        snapshot_id=25,
        parent_snapshot_id=19,
        sequence_number=200,
        timestamp_ms=1602638573590,
        manifest_list="s3:/a/b/c.avro",
        summary=Summary(Operation.APPEND),
        schema_id=3,
    )

    new_metadata = update_table_metadata(table.metadata, (AddSnapshotUpdate(snapshot=new_snapshot),))
    assert len(new_metadata.snapshots) == 3
    assert new_metadata.snapshots[2] == new_snapshot
    assert new_metadata.last_sequence_number == new_snapshot.sequence_number
    assert new_metadata.last_updated_ms == new_snapshot.timestamp_ms


def test_update_metadata_set_snapshot_ref(table: Table) -> None:
    """Setting the main-branch ref moves the current snapshot and appends to the snapshot log."""
    update = SetSnapshotRefUpdate(
        ref_name="main",
        type="branch",
        snapshot_id=3051729675574597004,
        max_age_ref_ms=123123123,
        max_snapshot_age_ms=12312312312,
        min_snapshots_to_keep=1,
    )

    new_metadata = update_table_metadata(table.metadata, (update,))
    assert len(new_metadata.snapshot_log) == 3
    assert new_metadata.snapshot_log[2] == SnapshotLogEntry(
        snapshot_id=3051729675574597004,
        timestamp_ms=table.metadata.last_updated_ms,
    )
    assert new_metadata.current_snapshot_id == 3051729675574597004
    assert new_metadata.last_updated_ms == table.metadata.last_updated_ms
timestamp_ms=table.metadata.last_updated_ms + ) + assert new_metadata.current_snapshot_id == 3051729675574597004 + assert new_metadata.last_updated_ms == table.metadata.last_updated_ms def test_generate_snapshot_id(table: Table) -> None: From 8e8d39dacde067773d6d840b9bf65070399957a9 Mon Sep 17 00:00:00 2001 From: HonahX Date: Sun, 5 Nov 2023 22:48:10 -0800 Subject: [PATCH 03/27] implement _commit_table for glue --- pyiceberg/catalog/__init__.py | 21 +++++++- pyiceberg/catalog/glue.py | 92 ++++++++++++++++++++++++++++++----- pyiceberg/table/__init__.py | 5 +- tests/catalog/test_glue.py | 44 +++++++++++++++++ tests/conftest.py | 2 +- 5 files changed, 147 insertions(+), 17 deletions(-) diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index 2577a97bc1..505f57d2b5 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -572,8 +572,25 @@ def _write_metadata(metadata: TableMetadata, io: FileIO, metadata_path: str) -> ToOutputFile.table_metadata(metadata, io.new_output(metadata_path)) @staticmethod - def _get_metadata_location(location: str) -> str: - return f"{location}/metadata/00000-{uuid.uuid4()}.metadata.json" + def _get_metadata_location(location: str, new_version: int = 0) -> str: + if new_version < 0: + raise ValueError(f"Table metadata version: {new_version} cannot be negative") + version_str = f"{new_version:05d}" + return f"{location}/metadata/{version_str}-{uuid.uuid4()}.metadata.json" + + @staticmethod + def _parse_metadata_version(metadata_location: str) -> int: + try: + # Split the string by '/' and take the last part, then split by '-' and take the first part. + version_part = metadata_location.split('/')[-1].split('-')[0] + # Attempt to convert the version part to an integer. + return int(version_part) + except ValueError: + # Handle any ValueError that occurs if the conversion fails. + return -1 + except IndexError: + # Handle the case where the splits don't produce the expected number of parts. 
def _construct_parameters(
    metadata_location: str, glue_table: Optional[TableTypeDef] = None, prev_metadata_location: Optional[str] = None
) -> Properties:
    """Build the Glue table parameters for an Iceberg table.

    Starts from the existing Glue table's parameters (when given) so
    user-defined parameters survive a commit, then overwrites the Iceberg
    bookkeeping entries (table type, metadata locations).
    """
    # BUG FIX: take a copy — the original updated the boto3 response's
    # "Parameters" dict in place, mutating the caller's glue_table.
    new_parameters = dict(glue_table.get("Parameters", {})) if glue_table else {}
    new_parameters.update({TABLE_TYPE: ICEBERG.upper(), METADATA_LOCATION: metadata_location})
    if prev_metadata_location:
        new_parameters[PREVIOUS_METADATA_LOCATION] = prev_metadata_location
    return new_parameters


def _update_glue_table(self, database_name: str, table_name: str, table_input: TableInputTypeDef, version_id: str) -> None:
    """Update a Glue table, translating Glue errors into Iceberg exceptions.

    Passing ``VersionId`` makes the update optimistic: Glue rejects it with
    ConcurrentModificationException if the table changed since that version
    was read, which surfaces as CommitFailedException.
    """
    try:
        self.glue.update_table(DatabaseName=database_name, TableInput=table_input, VersionId=version_id)
    except self.glue.exceptions.EntityNotFoundException as e:
        raise NoSuchTableError(f"Table does not exist: {database_name}.{table_name}") from e
    except self.glue.exceptions.ConcurrentModificationException as e:
        raise CommitFailedException(
            f"Cannot commit {database_name}.{table_name} because Glue detected concurrent update"
        ) from e


def _get_glue_table(self, database_name: str, table_name: str) -> TableTypeDef:
    """Fetch the raw Glue table definition; raises NoSuchTableError if absent."""
    try:
        load_table_response = self.glue.get_table(DatabaseName=database_name, Name=table_name)
        return load_table_response["Table"]
    except self.glue.exceptions.EntityNotFoundException as e:
        raise NoSuchTableError(f"Table does not exist: {database_name}.{table_name}") from e
self._create_glue_table(database_name=database_name, table_name=table_name, table_input=table_input) @@ -248,7 +279,47 @@ def _commit_table(self, table_request: CommitTableRequest) -> CommitTableRespons Raises: NoSuchTableError: If a table with the given identifier does not exist. """ - raise NotImplementedError + # TODO: This is a workaround for issue: https://github.com/apache/iceberg-python/issues/123 + database_name, table_name = self.identifier_to_database_and_table( + tuple(table_request.identifier.namespace.root[1:] + [table_request.identifier.name]), NoSuchTableError + ) + current_glue_table = self._get_glue_table(database_name=database_name, table_name=table_name) + glue_table_version_id = current_glue_table.get("VersionId") + if glue_table_version_id is None: + raise ValueError(f"Cannot commit {database_name}.{table_name} because Glue table version id is missing") + current_table = self._convert_glue_to_iceberg(glue_table=current_glue_table) + base_metadata = current_table.metadata + + # Validate the update requirements + for requirement in table_request.requirements: + requirement.validate(base_metadata) + + updated_metadata = update_table_metadata(base_metadata, table_request.updates) + if updated_metadata == base_metadata: + # no changes, do nothing + return CommitTableResponse(metadata=base_metadata, metadata_location=current_table.metadata_location) + + # write new metadata + new_metadata_version = self._parse_metadata_version(current_table.metadata_location) + 1 + new_metadata_location = self._get_metadata_location(current_table.metadata.location, new_metadata_version) + self._write_metadata(updated_metadata, current_table.io, new_metadata_location) + + update_table_input = _construct_table_input( + table_name=table_name, + metadata_location=new_metadata_location, + properties=current_table.properties, + glue_table=current_glue_table, + prev_metadata_location=current_table.metadata_location, + ) + + self._update_glue_table( + 
database_name=database_name, + table_name=table_name, + table_input=update_table_input, + version_id=glue_table_version_id, + ) + + return CommitTableResponse(metadata=updated_metadata, metadata_location=new_metadata_location) def load_table(self, identifier: Union[str, Identifier]) -> Table: """Load the table's metadata and returns the table instance. @@ -266,12 +337,7 @@ def load_table(self, identifier: Union[str, Identifier]) -> Table: NoSuchTableError: If a table with the name does not exist, or the identifier is invalid. """ database_name, table_name = self.identifier_to_database_and_table(identifier, NoSuchTableError) - try: - load_table_response = self.glue.get_table(DatabaseName=database_name, Name=table_name) - except self.glue.exceptions.EntityNotFoundException as e: - raise NoSuchTableError(f"Table does not exist: {database_name}.{table_name}") from e - - return self._convert_glue_to_iceberg(load_table_response["Table"]) + return self._convert_glue_to_iceberg(self._get_glue_table(database_name=database_name, table_name=table_name)) def drop_table(self, identifier: Union[str, Identifier]) -> None: """Drop a table. 
diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 203825fbae..cdc4d5042a 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -620,7 +620,10 @@ class AssertCurrentSchemaId(TableRequirement): current_schema_id: int = Field(..., alias="current-schema-id") def validate(self, base_metadata: TableMetadata) -> None: - raise NotImplementedError("Not yet implemented") + if self.current_schema_id != base_metadata.current_schema_id: + raise ValueError( + f"Requirement failed: current schema id {base_metadata.current_schema_id} does not match {self.current_schema_id}" + ) class AssertLastAssignedPartitionId(TableRequirement): diff --git a/tests/catalog/test_glue.py b/tests/catalog/test_glue.py index 1d7027a216..130c3d4a1f 100644 --- a/tests/catalog/test_glue.py +++ b/tests/catalog/test_glue.py @@ -31,6 +31,7 @@ TableAlreadyExistsError, ) from pyiceberg.schema import Schema +from pyiceberg.types import IntegerType from tests.conftest import BUCKET_NAME, TABLE_METADATA_LOCATION_REGEX @@ -45,6 +46,7 @@ def test_create_table_with_database_location( table = test_catalog.create_table(identifier, table_schema_nested) assert table.identifier == (catalog_name,) + identifier assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) + assert test_catalog._parse_metadata_version(table.metadata_location) == 0 @mock_glue @@ -60,6 +62,7 @@ def test_create_table_with_default_warehouse( table = test_catalog.create_table(identifier, table_schema_nested) assert table.identifier == (catalog_name,) + identifier assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) + assert test_catalog._parse_metadata_version(table.metadata_location) == 0 @mock_glue @@ -75,6 +78,7 @@ def test_create_table_with_given_location( ) assert table.identifier == (catalog_name,) + identifier assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) + assert test_catalog._parse_metadata_version(table.metadata_location) == 0 @mock_glue @@ 
-100,6 +104,7 @@ def test_create_table_with_strips( table = test_catalog.create_table(identifier, table_schema_nested) assert table.identifier == (catalog_name,) + identifier assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) + assert test_catalog._parse_metadata_version(table.metadata_location) == 0 @mock_glue @@ -113,6 +118,7 @@ def test_create_table_with_strips_bucket_root( table_strip = test_catalog.create_table(identifier, table_schema_nested) assert table_strip.identifier == (catalog_name,) + identifier assert TABLE_METADATA_LOCATION_REGEX.match(table_strip.metadata_location) + assert test_catalog._parse_metadata_version(table_strip.metadata_location) == 0 @mock_glue @@ -151,6 +157,7 @@ def test_load_table( table = test_catalog.load_table(identifier) assert table.identifier == (catalog_name,) + identifier assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) + assert test_catalog._parse_metadata_version(table.metadata_location) == 0 @mock_glue @@ -176,6 +183,7 @@ def test_drop_table( table = test_catalog.load_table(identifier) assert table.identifier == (catalog_name,) + identifier assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) + assert test_catalog._parse_metadata_version(table.metadata_location) == 0 test_catalog.drop_table(identifier) with pytest.raises(NoSuchTableError): test_catalog.load_table(identifier) @@ -202,6 +210,7 @@ def test_rename_table( table = test_catalog.create_table(identifier, table_schema_nested) assert table.identifier == (catalog_name,) + identifier assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) + assert test_catalog._parse_metadata_version(table.metadata_location) == 0 test_catalog.rename_table(identifier, new_identifier) new_table = test_catalog.load_table(new_identifier) assert new_table.identifier == (catalog_name,) + new_identifier @@ -457,3 +466,38 @@ def test_passing_profile_name() -> None: mock_session.assert_called_with(**session_properties) assert 
test_catalog.glue is mock_session().client() + + +@mock_glue +def test_commit_table_update_schema( + _bucket_initialize: None, _patch_aiobotocore: None, table_schema_nested: Schema, database_name: str, table_name: str +) -> None: + catalog_name = "glue" + identifier = (database_name, table_name) + test_catalog = GlueCatalog( + catalog_name, **{"py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO", "warehouse": f"s3://{BUCKET_NAME}"} + ) + test_catalog.create_namespace(namespace=database_name) + table = test_catalog.create_table(identifier, table_schema_nested) + original_table_metadata = table.metadata + + assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) + assert test_catalog._parse_metadata_version(table.metadata_location) == 0 + assert original_table_metadata.current_schema_id == 0 + + transaction = table.transaction() + update = transaction.update_schema() + update.add_column(path="b", field_type=IntegerType()) + update.commit() + transaction.commit_transaction() + + updated_table_metadata = table.metadata + + assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) + assert test_catalog._parse_metadata_version(table.metadata_location) == 1 + assert updated_table_metadata.current_schema_id == 1 + assert len(updated_table_metadata.schemas) == 2 + new_schema = next(schema for schema in updated_table_metadata.schemas if schema.schema_id == 1) + assert new_schema + assert new_schema == update._apply() + assert new_schema.find_field("b").field_type == IntegerType() diff --git a/tests/conftest.py b/tests/conftest.py index 79c01dc747..b5081ecc47 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1585,7 +1585,7 @@ def database_list(database_name: str) -> List[str]: TABLE_METADATA_LOCATION_REGEX = re.compile( r"""s3://test_bucket/my_iceberg_database-[a-z]{20}.db/ my_iceberg_table-[a-z]{20}/metadata/ - 00000-[a-f0-9]{8}-?[a-f0-9]{4}-?4[a-f0-9]{3}-?[89ab][a-f0-9]{3}-?[a-f0-9]{12}.metadata.json""", + 
[0-9]{5}-[a-f0-9]{8}-?[a-f0-9]{4}-?4[a-f0-9]{3}-?[89ab][a-f0-9]{3}-?[a-f0-9]{12}.metadata.json""", re.X, ) From c3e13119e7fb7279f842450ac33b11734b9a9624 Mon Sep 17 00:00:00 2001 From: HonahX Date: Sat, 11 Nov 2023 00:22:28 -0800 Subject: [PATCH 04/27] implement apply_metadata_update which is simpler --- pyiceberg/table/__init__.py | 203 +++++++++++++++++++++++++++++++++++- 1 file changed, 199 insertions(+), 4 deletions(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 203825fbae..430d2cc0f9 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -23,7 +23,7 @@ from copy import copy from dataclasses import dataclass from enum import Enum -from functools import cached_property, singledispatchmethod +from functools import cached_property, singledispatch, singledispatchmethod from itertools import chain from typing import ( TYPE_CHECKING, @@ -357,6 +357,30 @@ class RemovePropertiesUpdate(TableUpdate): removals: List[str] +class TableMetadataUpdateContext: + updates: List[TableUpdate] + last_added_schema_id: Optional[int] + + def __init__(self) -> None: + self.updates = [] + self.last_added_schema_id = None + + def get_updates_by_action(self, update_type: TableUpdateAction) -> List[TableUpdate]: + return [update for update in self.updates if update.action == update_type] + + def is_added_snapshot(self, snapshot_id: int) -> bool: + return any( + update.snapshot.snapshot_id == snapshot_id + for update in self.updates + if update.action == TableUpdateAction.add_snapshot + ) + + def is_added_schema(self, schema_id: int) -> bool: + return any( + update.schema_.schema_id == schema_id for update in self.updates if update.action == TableUpdateAction.add_schema + ) + + class TableMetadataUpdateBuilder: _base_metadata: Dict[str, Any] _updates: List[TableUpdate] @@ -549,6 +573,176 @@ def build(self) -> TableMetadata: return TableMetadataUtil.parse_obj(self._base_metadata) +@singledispatch +def apply_table_update(update: 
TableUpdate, base_metadata: TableMetadata, context: TableMetadataUpdateContext) -> TableMetadata: + raise ValueError(f"Unsupported update: {update}") + + +@apply_table_update.register(UpgradeFormatVersionUpdate) +def _(update: UpgradeFormatVersionUpdate, base_metadata: TableMetadata, context: TableMetadataUpdateContext) -> TableMetadata: + current_format_version = base_metadata.format_version + if update.format_version > SUPPORTED_TABLE_FORMAT_VERSION: + raise ValueError(f"Unsupported table format version: {update.format_version}") + if update.format_version < current_format_version: + raise ValueError(f"Cannot downgrade v{current_format_version} table to v{update.format_version}") + if update.format_version == current_format_version: + return base_metadata + + if current_format_version == 1 and update.format_version == 2: + updated_metadata_data = copy(base_metadata.model_dump()) + updated_metadata_data["format-version"] = update.format_version + return TableMetadataUtil.parse_obj(updated_metadata_data) + + raise ValueError(f"Cannot upgrade v{current_format_version} table to v{update.format_version}") + + +@apply_table_update.register(AddSchemaUpdate) +def _(update: AddSchemaUpdate, base_metadata: TableMetadata, context: TableMetadataUpdateContext) -> TableMetadata: + def reuse_or_create_new_schema_id(new_schema: Schema) -> Tuple[int, bool]: + # if the schema already exists, use its id; otherwise use the highest id + 1 + new_schema_id = base_metadata.current_schema_id + for schema in base_metadata.schemas: + if schema == new_schema: + return schema.schema_id, True + elif schema.schema_id >= new_schema_id: + new_schema_id = schema.schema_id + 1 + return new_schema_id, False + + if update.last_column_id < base_metadata.last_column_id: + raise ValueError(f"Invalid last column id {update.last_column_id}, must be >= {base_metadata.last_column_id}") + new_schema_id, schema_found = reuse_or_create_new_schema_id(update.schema_) + if schema_found and update.last_column_id 
== base_metadata.last_column_id: + if context.last_added_schema_id is not None and context.is_added_schema(new_schema_id): + context.last_added_schema_id = new_schema_id + return base_metadata + + updated_metadata_data = copy(base_metadata.model_dump()) + updated_metadata_data["last-column-id"] = update.last_column_id + + new_schema = ( + update.schema_ + if new_schema_id == update.schema_.schema_id + # TODO: double check the parameter passing here, schema.fields may be interpreted as the **data fileds + else Schema(*update.schema_.fields, schema_id=new_schema_id, identifier_field_ids=update.schema_.identifier_field_ids) + ) + + if not schema_found: + updated_metadata_data["schemas"].append(new_schema.model_dump()) + + context.updates.append(update) + context.last_added_schema_id = new_schema_id + return TableMetadataUtil.parse_obj(updated_metadata_data) + + +@apply_table_update.register(SetCurrentSchemaUpdate) +def _(update: SetCurrentSchemaUpdate, base_metadata: TableMetadata, context: TableMetadataUpdateContext) -> TableMetadata: + if update.schema_id == -1: + if context.last_added_schema_id is None: + raise ValueError("Cannot set current schema to last added schema when no schema has been added") + return apply_table_update(SetCurrentSchemaUpdate(schema_id=context.last_added_schema_id), base_metadata, context) + + if update.schema_id == base_metadata.current_schema_id: + return base_metadata + + schema = next((schema for schema in base_metadata.schemas if schema.schema_id == update.schema_id), None) + if schema is None: + raise ValueError(f"Schema with id {update.schema_id} does not exist") + + # TODO: rebuild sort_order and partition_spec + # So it seems the rebuild just refresh the inner field which hold the schema and some naming check for partition_spec + # Seems this is not necessary in pyiceberg case wince + + updated_metadata_data = copy(base_metadata.model_dump()) + updated_metadata_data["current-schema-id"] = update.schema_id + if 
context.last_added_schema_id is not None and context.last_added_schema_id == update.schema_id: + context.updates.append(SetCurrentSchemaUpdate(schema_id=-1)) + else: + context.updates.append(update) + + return TableMetadataUtil.parse_obj(updated_metadata_data) + + +@apply_table_update.register(AddSnapshotUpdate) +def _(update: AddSnapshotUpdate, base_metadata: TableMetadata, context: TableMetadataUpdateContext) -> TableMetadata: + if len(base_metadata.schemas) == 0: + raise ValueError("Attempting to add a snapshot before a schema is added") + if len(base_metadata.partition_specs) == 0: + raise ValueError("Attempting to add a snapshot before a partition spec is added") + if len(base_metadata.sort_orders) == 0: + raise ValueError("Attempting to add a snapshot before a sort order is added") + if any(update.snapshot.snapshot_id == snapshot.snapshot_id for snapshot in base_metadata.snapshots): + raise ValueError(f"Snapshot with id {update.snapshot.snapshot_id} already exists") + if ( + base_metadata.format_version == 2 + and update.snapshot.sequence_number is not None + and update.snapshot.sequence_number <= base_metadata.last_sequence_number + and update.snapshot.parent_snapshot_id is not None + ): + raise ValueError( + f"Cannot add snapshot with sequence number {update.snapshot.sequence_number} older than last sequence number {base_metadata.last_sequence_number}" + ) + + updated_metadata_data = copy(base_metadata.model_dump()) + updated_metadata_data["last-updated-ms"] = update.snapshot.timestamp_ms + updated_metadata_data["last-sequence-number"] = update.snapshot.sequence_number + updated_metadata_data["snapshots"].append(update.snapshot.model_dump()) + context.updates.append(update) + return TableMetadataUtil.parse_obj(updated_metadata_data) + + +@apply_table_update.register(SetSnapshotRefUpdate) +def _(update: SetSnapshotRefUpdate, base_metadata: TableMetadata, context: TableMetadataUpdateContext) -> TableMetadata: + if update.type is None: + raise 
ValueError("Snapshot ref type must be set") + if update.min_snapshots_to_keep is not None and update.type == SnapshotRefType.TAG: + raise ValueError("Cannot set min snapshots to keep for branch refs") + if update.min_snapshots_to_keep is not None and update.min_snapshots_to_keep <= 0: + raise ValueError("Minimum snapshots to keep must be >= 0") + if update.max_snapshot_age_ms is not None and update.type == SnapshotRefType.TAG: + raise ValueError("Tags do not support setting maxSnapshotAgeMs") + if update.max_snapshot_age_ms is not None and update.max_snapshot_age_ms <= 0: + raise ValueError("Max snapshot age must be > 0 ms") + if update.max_age_ref_ms is not None and update.max_age_ref_ms <= 0: + raise ValueError("Max ref age must be > 0 ms") + snapshot_ref = SnapshotRef( + snapshot_id=update.snapshot_id, + snapshot_ref_type=update.type, + min_snapshots_to_keep=update.min_snapshots_to_keep, + max_snapshot_age_ms=update.max_snapshot_age_ms, + max_ref_age_ms=update.max_age_ref_ms, + ) + existing_ref = base_metadata.refs.get(update.ref_name) + if existing_ref is not None and existing_ref == snapshot_ref: + return base_metadata + + snapshot = next( + (snapshot for snapshot in base_metadata.snapshots if snapshot.snapshot_id == snapshot_ref.snapshot_id), + None, + ) + if snapshot is None: + raise ValueError(f"Cannot set {snapshot_ref.ref_name} to unknown snapshot {snapshot_ref.snapshot_id}") + + update_metadata_data = copy(base_metadata.model_dump()) + if context.is_added_snapshot(snapshot_ref.snapshot_id): + update_metadata_data["last-updated-ms"] = snapshot.timestamp + + if update.ref_name == MAIN_BRANCH: + update_metadata_data["current-snapshot-id"] = snapshot_ref.snapshot_id + # TODO: double-check if the default value of TableMetadata make the timestamp too early + # if base_metadata.last_updated_ms is None: + # update_metadata_data["last-updated-ms"] = datetime_to_millis(datetime.datetime.now().astimezone()) + update_metadata_data["snapshot-log"].append( + 
SnapshotLogEntry( + snapshot_id=snapshot_ref.snapshot_id, + timestamp_ms=update_metadata_data["last-updated-ms"], + ).model_dump() + ) + + update_metadata_data["refs"][update.ref_name] = snapshot_ref.model_dump() + context.updates.append(update) + return TableMetadataUtil.parse_obj(update_metadata_data) + + class TableRequirement(IcebergBaseModel): type: str @@ -681,10 +875,11 @@ class CommitTableResponse(IcebergBaseModel): def update_table_metadata(base_metadata: TableMetadata, updates: Tuple[TableUpdate, ...]) -> TableMetadata: - builder = TableMetadataUpdateBuilder(base_metadata) + context = TableMetadataUpdateContext() + new_metadata = base_metadata for update in updates: - builder.update_table_metadata(update) - return builder.build() + new_metadata = apply_table_update(update, new_metadata, context) + return new_metadata class Table: From 2b7a7d17aeecb929ae34ecd7ecf92d42982ed8f9 Mon Sep 17 00:00:00 2001 From: HonahX Date: Sat, 11 Nov 2023 00:24:25 -0800 Subject: [PATCH 05/27] remove old implementation --- pyiceberg/table/__init__.py | 196 +----------------------------------- 1 file changed, 1 insertion(+), 195 deletions(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 430d2cc0f9..1c99302713 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -16,14 +16,13 @@ # under the License. 
from __future__ import annotations -import datetime import itertools import uuid from abc import ABC, abstractmethod from copy import copy from dataclasses import dataclass from enum import Enum -from functools import cached_property, singledispatch, singledispatchmethod +from functools import cached_property, singledispatch from itertools import chain from typing import ( TYPE_CHECKING, @@ -96,7 +95,6 @@ StructType, ) from pyiceberg.utils.concurrent import ExecutorFactory -from pyiceberg.utils.datetime import datetime_to_millis if TYPE_CHECKING: import pandas as pd @@ -381,198 +379,6 @@ def is_added_schema(self, schema_id: int) -> bool: ) -class TableMetadataUpdateBuilder: - _base_metadata: Dict[str, Any] - _updates: List[TableUpdate] - _last_added_schema_id: Optional[int] - - def __init__(self, base_metadata: TableMetadata) -> None: - self._base_metadata = copy(base_metadata.model_dump()) - self._updates = [] - self._last_added_schema_id = None - - def _reuse_or_create_new_schema_id(self, new_schema: Schema) -> Tuple[int, bool]: - # if the schema already exists, use its id; otherwise use the highest id + 1 - new_schema_id = self._base_metadata["current-schema-id"] - for raw_schema in self._base_metadata["schemas"]: - schema = Schema(**raw_schema) - if schema == new_schema: - return schema.schema_id, False - elif schema.schema_id >= new_schema_id: - new_schema_id = schema.schema_id + 1 - return new_schema_id, True - - def _add_schema_internal(self, schema: Schema, last_column_id: int, update: TableUpdate) -> int: - if last_column_id < self._base_metadata["last-column-id"]: - raise ValueError(f"Invalid last column id {last_column_id}, must be >= {self._base_metadata['last-column-id']}") - new_schema_id, is_new_schema = self._reuse_or_create_new_schema_id(schema) - if not is_new_schema and last_column_id == self._base_metadata["last-column-id"]: - if self._last_added_schema_id is not None and any( - update.schema_.schema_id == new_schema_id for update in 
self._updates if isinstance(update, AddSchemaUpdate) - ): - self._last_added_schema_id = new_schema_id - return new_schema_id - - self._base_metadata["last-column-id"] = last_column_id - - new_schema = ( - schema - if new_schema_id == schema.schema_id - # TODO: double check the parameter passing here, schema.fields may be interpreted as the **data fileds - else Schema(*schema.fields, schema_id=new_schema_id, identifier_field_ids=schema.identifier_field_ids) - ) - - if is_new_schema: - self._base_metadata["schemas"].append(new_schema.model_dump()) - - self._updates.append(update) - self._last_added_schema_id = new_schema_id - return new_schema_id - - def _set_current_schema(self, schema_id: int) -> None: - if schema_id == -1: - if self._last_added_schema_id is None: - raise ValueError("Cannot set current schema to last added schema when no schema has been added") - return self._set_current_schema(self._last_added_schema_id) - - if schema_id == self._base_metadata["current-schema-id"]: - return - - schema = next( - (Schema(**raw_schema) for raw_schema in self._base_metadata["schemas"] if raw_schema["schema-id"] == schema_id), None - ) - if schema is None: - raise ValueError(f"Schema with id {schema_id} does not exist") - - # TODO: rebuild sort_order and partition_spec - # So it seems the rebuild just refresh the inner field which hold the schema and some naming check for partition_spec - # Seems this is not necessary in pyiceberg case wince - - self._base_metadata["current-schema-id"] = schema_id - if self._last_added_schema_id is not None and self._last_added_schema_id == schema_id: - self._updates.append(SetCurrentSchemaUpdate(schema_id=-1)) - else: - self._updates.append(SetCurrentSchemaUpdate(schema_id=schema_id)) - - @singledispatchmethod - def update_table_metadata(self, update: TableUpdate) -> None: - raise TypeError(f"Unsupported update: {update}") - - @update_table_metadata.register(UpgradeFormatVersionUpdate) - def _(self, update: 
UpgradeFormatVersionUpdate) -> None: - current_format_version = self._base_metadata["format-version"] - if update.format_version > SUPPORTED_TABLE_FORMAT_VERSION: - raise ValueError(f"Unsupported table format version: {update.format_version}") - if update.format_version < current_format_version: - raise ValueError(f"Cannot downgrade v{current_format_version} table to v{update.format_version}") - if update.format_version == current_format_version: - return - # At this point, the base_metadata is guaranteed to be v1 - self._base_metadata["format-version"] = 2 - - raise ValueError(f"Cannot upgrade v{current_format_version} table to v{update.format_version}") - - @update_table_metadata.register(AddSchemaUpdate) - def _(self, update: AddSchemaUpdate) -> None: - self._add_schema_internal(update.schema_, update.last_column_id, update) - - @update_table_metadata.register(SetCurrentSchemaUpdate) - def _(self, update: SetCurrentSchemaUpdate) -> None: - self._set_current_schema(update.schema_id) - - @update_table_metadata.register(AddSnapshotUpdate) - def _(self, update: AddSnapshotUpdate) -> None: - if len(self._base_metadata["schemas"]) == 0: - raise ValueError("Attempting to add a snapshot before a schema is added") - if len(self._base_metadata["partition-specs"]) == 0: - raise ValueError("Attempting to add a snapshot before a partition spec is added") - if len(self._base_metadata["sort-orders"]) == 0: - raise ValueError("Attempting to add a snapshot before a sort order is added") - if any( - update.snapshot.snapshot_id == Snapshot(**raw_snapshot).snapshot_id - for raw_snapshot in self._base_metadata["snapshots"] - ): - raise ValueError(f"Snapshot with id {update.snapshot.snapshot_id} already exists") - if ( - self._base_metadata["format-version"] == 2 - and update.snapshot.sequence_number is not None - and self._base_metadata["last-sequence-number"] is not None - and update.snapshot.sequence_number <= self._base_metadata["last-sequence-number"] - and 
update.snapshot.parent_snapshot_id is not None - ): - raise ValueError( - f"Cannot add snapshot with sequence number {update.snapshot.sequence_number} older than last sequence number {self._base_metadata['last-sequence-number']}" - ) - - self._base_metadata["last-updated-ms"] = update.snapshot.timestamp_ms - self._base_metadata["last-sequence-number"] = update.snapshot.sequence_number - self._base_metadata["snapshots"].append(update.snapshot) - self._updates.append(update) - - @update_table_metadata.register(SetSnapshotRefUpdate) - def _(self, update: SetSnapshotRefUpdate) -> None: - ## TODO: may be some of the validation could be added to SnapshotRef class - ## TODO: may be we need to make some of the field in this update as optional or we can remove some of the checks - if update.type is None: - raise ValueError("Snapshot ref type must be set") - if update.min_snapshots_to_keep is not None and update.type == SnapshotRefType.TAG: - raise ValueError("Cannot set min snapshots to keep for branch refs") - if update.min_snapshots_to_keep is not None and update.min_snapshots_to_keep <= 0: - raise ValueError("Minimum snapshots to keep must be >= 0") - if update.max_snapshot_age_ms is not None and update.type == SnapshotRefType.TAG: - raise ValueError("Tags do not support setting maxSnapshotAgeMs") - if update.max_snapshot_age_ms is not None and update.max_snapshot_age_ms <= 0: - raise ValueError("Max snapshot age must be > 0 ms") - if update.max_age_ref_ms is not None and update.max_age_ref_ms <= 0: - raise ValueError("Max ref age must be > 0 ms") - snapshot_ref = SnapshotRef( - snapshot_id=update.snapshot_id, - snapshot_ref_type=update.type, - min_snapshots_to_keep=update.min_snapshots_to_keep, - max_snapshot_age_ms=update.max_snapshot_age_ms, - max_ref_age_ms=update.max_age_ref_ms, - ) - existing_ref = self._base_metadata["refs"].get(update.ref_name) - if existing_ref is not None and existing_ref == snapshot_ref: - return - - snapshot = next( - ( - 
Snapshot(**raw_snapshot) - for raw_snapshot in self._base_metadata["snapshots"] - if raw_snapshot["snapshot-id"] == snapshot_ref.snapshot_id - ), - None, - ) - if snapshot is None: - raise ValueError(f"Cannot set {snapshot_ref.ref_name} to unknown snapshot {snapshot_ref.snapshot_id}") - - if any( - snapshot_ref.snapshot_id == prev_update.snapshot.snapshot_id - for prev_update in self._updates - if isinstance(self._updates, AddSnapshotUpdate) - ): - self._base_metadata["last-updated-ms"] = snapshot.timestamp - - if update.ref_name == MAIN_BRANCH: - self._base_metadata["current-snapshot-id"] = snapshot_ref.snapshot_id - # TODO: double-check if the default value of TableMetadata make the timestamp too early - if self._base_metadata["last-updated-ms"] is None: - self._base_metadata["last-updated-ms"] = datetime_to_millis(datetime.datetime.now().astimezone()) - self._base_metadata["snapshot-log"].append( - SnapshotLogEntry( - snapshot_id=snapshot_ref.snapshot_id, - timestamp_ms=self._base_metadata["last-updated-ms"], - ).model_dump() - ) - - self._base_metadata["refs"][update.ref_name] = snapshot_ref - self._updates.append(update) - - def build(self) -> TableMetadata: - return TableMetadataUtil.parse_obj(self._base_metadata) - - @singledispatch def apply_table_update(update: TableUpdate, base_metadata: TableMetadata, context: TableMetadataUpdateContext) -> TableMetadata: raise ValueError(f"Unsupported update: {update}") From 4fc25df27a3c974f9bf2d4dcbf940badce961dcf Mon Sep 17 00:00:00 2001 From: HonahX Date: Sat, 11 Nov 2023 00:26:13 -0800 Subject: [PATCH 06/27] re-organize method place --- pyiceberg/table/__init__.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 1c99302713..5a25edb283 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -549,6 +549,14 @@ def _(update: SetSnapshotRefUpdate, base_metadata: TableMetadata, context: Table return 
TableMetadataUtil.parse_obj(update_metadata_data) +def update_table_metadata(base_metadata: TableMetadata, updates: Tuple[TableUpdate, ...]) -> TableMetadata: + context = TableMetadataUpdateContext() + new_metadata = base_metadata + for update in updates: + new_metadata = apply_table_update(update, new_metadata, context) + return new_metadata + + class TableRequirement(IcebergBaseModel): type: str @@ -680,14 +688,6 @@ class CommitTableResponse(IcebergBaseModel): metadata_location: str = Field(alias="metadata-location") -def update_table_metadata(base_metadata: TableMetadata, updates: Tuple[TableUpdate, ...]) -> TableMetadata: - context = TableMetadataUpdateContext() - new_metadata = base_metadata - for update in updates: - new_metadata = apply_table_update(update, new_metadata, context) - return new_metadata - - class Table: identifier: Identifier = Field() metadata: TableMetadata From facb43b89815abca72d2ba0417861082104630fb Mon Sep 17 00:00:00 2001 From: HonahX Date: Sat, 11 Nov 2023 00:27:53 -0800 Subject: [PATCH 07/27] fix nit --- pyiceberg/table/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 5a25edb283..69e55e9774 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -406,13 +406,13 @@ def _(update: UpgradeFormatVersionUpdate, base_metadata: TableMetadata, context: def _(update: AddSchemaUpdate, base_metadata: TableMetadata, context: TableMetadataUpdateContext) -> TableMetadata: def reuse_or_create_new_schema_id(new_schema: Schema) -> Tuple[int, bool]: # if the schema already exists, use its id; otherwise use the highest id + 1 - new_schema_id = base_metadata.current_schema_id + result_schema_id = base_metadata.current_schema_id for schema in base_metadata.schemas: if schema == new_schema: return schema.schema_id, True - elif schema.schema_id >= new_schema_id: - new_schema_id = schema.schema_id + 1 - return new_schema_id, False + elif 
schema.schema_id >= result_schema_id: + result_schema_id = schema.schema_id + 1 + return result_schema_id, False if update.last_column_id < base_metadata.last_column_id: raise ValueError(f"Invalid last column id {update.last_column_id}, must be >= {base_metadata.last_column_id}") From 116c6fdadbdc4becc7acd4e5c23079ba5afa007a Mon Sep 17 00:00:00 2001 From: HonahX Date: Sat, 11 Nov 2023 10:27:48 -0800 Subject: [PATCH 08/27] fix test --- pyiceberg/table/__init__.py | 6 +++--- tests/table/test_init.py | 13 ++++++++++--- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 69e55e9774..833116beeb 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -325,7 +325,7 @@ class SetSnapshotRefUpdate(TableUpdate): ref_name: str = Field(alias="ref-name") type: Literal["tag", "branch"] snapshot_id: int = Field(alias="snapshot-id") - max_age_ref_ms: int = Field(alias="max-ref-age-ms") + max_ref_age_ms: int = Field(alias="max-ref-age-ms") max_snapshot_age_ms: int = Field(alias="max-snapshot-age-ms") min_snapshots_to_keep: int = Field(alias="min-snapshots-to-keep") @@ -508,14 +508,14 @@ def _(update: SetSnapshotRefUpdate, base_metadata: TableMetadata, context: Table raise ValueError("Tags do not support setting maxSnapshotAgeMs") if update.max_snapshot_age_ms is not None and update.max_snapshot_age_ms <= 0: raise ValueError("Max snapshot age must be > 0 ms") - if update.max_age_ref_ms is not None and update.max_age_ref_ms <= 0: + if update.max_ref_age_ms is not None and update.max_ref_age_ms <= 0: raise ValueError("Max ref age must be > 0 ms") snapshot_ref = SnapshotRef( snapshot_id=update.snapshot_id, snapshot_ref_type=update.type, min_snapshots_to_keep=update.min_snapshots_to_keep, max_snapshot_age_ms=update.max_snapshot_age_ms, - max_ref_age_ms=update.max_age_ref_ms, + max_ref_age_ms=update.max_ref_age_ms, ) existing_ref = base_metadata.refs.get(update.ref_name) if existing_ref is not 
None and existing_ref == snapshot_ref: diff --git a/tests/table/test_init.py b/tests/table/test_init.py index 7699fe1a6e..98f39408b6 100644 --- a/tests/table/test_init.py +++ b/tests/table/test_init.py @@ -45,7 +45,7 @@ UpdateSchema, _generate_snapshot_id, _match_deletes_to_datafile, - update_table_metadata, + update_table_metadata, SnapshotRef, ) from pyiceberg.table.metadata import INITIAL_SEQUENCE_NUMBER from pyiceberg.table.snapshots import ( @@ -547,7 +547,7 @@ def test_update_metadata_add_snapshot(table: Table) -> None: new_metadata = update_table_metadata(table.metadata, (AddSnapshotUpdate(snapshot=new_snapshot),)) assert len(new_metadata.snapshots) == 3 - assert new_metadata.snapshots[2] == new_snapshot + assert new_metadata.snapshots[-1] == new_snapshot assert new_metadata.last_sequence_number == new_snapshot.sequence_number assert new_metadata.last_updated_ms == new_snapshot.timestamp_ms @@ -557,7 +557,7 @@ def test_update_metadata_set_snapshot_ref(table: Table) -> None: ref_name="main", type="branch", snapshot_id=3051729675574597004, - max_age_ref_ms=123123123, + max_ref_age_ms=123123123, max_snapshot_age_ms=12312312312, min_snapshots_to_keep=1, ) @@ -569,6 +569,13 @@ def test_update_metadata_set_snapshot_ref(table: Table) -> None: ) assert new_metadata.current_snapshot_id == 3051729675574597004 assert new_metadata.last_updated_ms == table.metadata.last_updated_ms + assert new_metadata.refs[update.ref_name] == SnapshotRef( + snapshot_id=3051729675574597004, + snapshot_ref_type="branch", + min_snapshots_to_keep=1, + max_snapshot_age_ms=12312312312, + max_ref_age_ms=123123123, + ) def test_generate_snapshot_id(table: Table) -> None: From 66a4f46352b70421fecfceaf46f720e2c4b8c938 Mon Sep 17 00:00:00 2001 From: HonahX Date: Sat, 11 Nov 2023 17:25:45 -0800 Subject: [PATCH 09/27] add another test --- pyiceberg/table/__init__.py | 57 +++++++++++++++++++++++-------------- pyiceberg/table/metadata.py | 8 ++++++ tests/table/test_init.py | 32 ++++++++++++++++++++- 3 
files changed, 74 insertions(+), 23 deletions(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 833116beeb..743479f709 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -386,26 +386,31 @@ def apply_table_update(update: TableUpdate, base_metadata: TableMetadata, contex @apply_table_update.register(UpgradeFormatVersionUpdate) def _(update: UpgradeFormatVersionUpdate, base_metadata: TableMetadata, context: TableMetadataUpdateContext) -> TableMetadata: - current_format_version = base_metadata.format_version if update.format_version > SUPPORTED_TABLE_FORMAT_VERSION: raise ValueError(f"Unsupported table format version: {update.format_version}") - if update.format_version < current_format_version: - raise ValueError(f"Cannot downgrade v{current_format_version} table to v{update.format_version}") - if update.format_version == current_format_version: - return base_metadata - if current_format_version == 1 and update.format_version == 2: - updated_metadata_data = copy(base_metadata.model_dump()) - updated_metadata_data["format-version"] = update.format_version - return TableMetadataUtil.parse_obj(updated_metadata_data) + if update.format_version < base_metadata.format_version: + raise ValueError(f"Cannot downgrade v{base_metadata.format_version} table to v{update.format_version}") - raise ValueError(f"Cannot upgrade v{current_format_version} table to v{update.format_version}") + if update.format_version == base_metadata.format_version: + return base_metadata + + updated_metadata_data = copy(base_metadata.model_dump()) + updated_metadata_data["format-version"] = update.format_version + return TableMetadataUtil.parse_obj(updated_metadata_data) @apply_table_update.register(AddSchemaUpdate) def _(update: AddSchemaUpdate, base_metadata: TableMetadata, context: TableMetadataUpdateContext) -> TableMetadata: def reuse_or_create_new_schema_id(new_schema: Schema) -> Tuple[int, bool]: - # if the schema already exists, use its 
id; otherwise use the highest id + 1 + """Reuse schema id if schema already exists, otherwise create a new one. + + Args: + new_schema: The new schema to be added. + + Returns: + The new schema id and whether the schema already exists. + """ result_schema_id = base_metadata.current_schema_id for schema in base_metadata.schemas: if schema == new_schema: @@ -416,6 +421,7 @@ def reuse_or_create_new_schema_id(new_schema: Schema) -> Tuple[int, bool]: if update.last_column_id < base_metadata.last_column_id: raise ValueError(f"Invalid last column id {update.last_column_id}, must be >= {base_metadata.last_column_id}") + new_schema_id, schema_found = reuse_or_create_new_schema_id(update.schema_) if schema_found and update.last_column_id == base_metadata.last_column_id: if context.last_added_schema_id is not None and context.is_added_schema(new_schema_id): @@ -428,7 +434,6 @@ def reuse_or_create_new_schema_id(new_schema: Schema) -> Tuple[int, bool]: new_schema = ( update.schema_ if new_schema_id == update.schema_.schema_id - # TODO: double check the parameter passing here, schema.fields may be interpreted as the **data fileds else Schema(*update.schema_.fields, schema_id=new_schema_id, identifier_field_ids=update.schema_.identifier_field_ids) ) @@ -450,7 +455,7 @@ def _(update: SetCurrentSchemaUpdate, base_metadata: TableMetadata, context: Tab if update.schema_id == base_metadata.current_schema_id: return base_metadata - schema = next((schema for schema in base_metadata.schemas if schema.schema_id == update.schema_id), None) + schema = base_metadata.schema_by_id(update.schema_id) if schema is None: raise ValueError(f"Schema with id {update.schema_id} does not exist") @@ -460,6 +465,7 @@ def _(update: SetCurrentSchemaUpdate, base_metadata: TableMetadata, context: Tab updated_metadata_data = copy(base_metadata.model_dump()) updated_metadata_data["current-schema-id"] = update.schema_id + if context.last_added_schema_id is not None and context.last_added_schema_id == 
update.schema_id: context.updates.append(SetCurrentSchemaUpdate(schema_id=-1)) else: @@ -472,12 +478,16 @@ def _(update: SetCurrentSchemaUpdate, base_metadata: TableMetadata, context: Tab def _(update: AddSnapshotUpdate, base_metadata: TableMetadata, context: TableMetadataUpdateContext) -> TableMetadata: if len(base_metadata.schemas) == 0: raise ValueError("Attempting to add a snapshot before a schema is added") + if len(base_metadata.partition_specs) == 0: raise ValueError("Attempting to add a snapshot before a partition spec is added") + if len(base_metadata.sort_orders) == 0: raise ValueError("Attempting to add a snapshot before a sort order is added") - if any(update.snapshot.snapshot_id == snapshot.snapshot_id for snapshot in base_metadata.snapshots): + + if base_metadata.snapshot_by_id(update.snapshot.snapshot_id) is not None: raise ValueError(f"Snapshot with id {update.snapshot.snapshot_id} already exists") + if ( base_metadata.format_version == 2 and update.snapshot.sequence_number is not None @@ -500,16 +510,22 @@ def _(update: AddSnapshotUpdate, base_metadata: TableMetadata, context: TableMet def _(update: SetSnapshotRefUpdate, base_metadata: TableMetadata, context: TableMetadataUpdateContext) -> TableMetadata: if update.type is None: raise ValueError("Snapshot ref type must be set") + if update.min_snapshots_to_keep is not None and update.type == SnapshotRefType.TAG: raise ValueError("Cannot set min snapshots to keep for branch refs") + if update.min_snapshots_to_keep is not None and update.min_snapshots_to_keep <= 0: raise ValueError("Minimum snapshots to keep must be >= 0") + if update.max_snapshot_age_ms is not None and update.type == SnapshotRefType.TAG: raise ValueError("Tags do not support setting maxSnapshotAgeMs") + if update.max_snapshot_age_ms is not None and update.max_snapshot_age_ms <= 0: raise ValueError("Max snapshot age must be > 0 ms") + if update.max_ref_age_ms is not None and update.max_ref_age_ms <= 0: raise ValueError("Max ref age 
must be > 0 ms") + snapshot_ref = SnapshotRef( snapshot_id=update.snapshot_id, snapshot_ref_type=update.type, @@ -517,14 +533,12 @@ def _(update: SetSnapshotRefUpdate, base_metadata: TableMetadata, context: Table max_snapshot_age_ms=update.max_snapshot_age_ms, max_ref_age_ms=update.max_ref_age_ms, ) + existing_ref = base_metadata.refs.get(update.ref_name) if existing_ref is not None and existing_ref == snapshot_ref: return base_metadata - snapshot = next( - (snapshot for snapshot in base_metadata.snapshots if snapshot.snapshot_id == snapshot_ref.snapshot_id), - None, - ) + snapshot = base_metadata.snapshot_by_id(snapshot_ref.snapshot_id) if snapshot is None: raise ValueError(f"Cannot set {snapshot_ref.ref_name} to unknown snapshot {snapshot_ref.snapshot_id}") @@ -552,8 +566,10 @@ def _(update: SetSnapshotRefUpdate, base_metadata: TableMetadata, context: Table def update_table_metadata(base_metadata: TableMetadata, updates: Tuple[TableUpdate, ...]) -> TableMetadata: context = TableMetadataUpdateContext() new_metadata = base_metadata + for update in updates: new_metadata = apply_table_update(update, new_metadata, context) + return new_metadata @@ -800,10 +816,7 @@ def current_snapshot(self) -> Optional[Snapshot]: def snapshot_by_id(self, snapshot_id: int) -> Optional[Snapshot]: """Get the snapshot of this table with the given id, or None if there is no matching snapshot.""" - try: - return next(snapshot for snapshot in self.metadata.snapshots if snapshot.snapshot_id == snapshot_id) - except StopIteration: - return None + return self.metadata.snapshot_by_id(snapshot_id) # pylint: disable=W0212 def snapshot_by_name(self, name: str) -> Optional[Snapshot]: """Return the snapshot referenced by the given name or null if no such reference exists.""" diff --git a/pyiceberg/table/metadata.py b/pyiceberg/table/metadata.py index 271d40e25a..91a0dfad57 100644 --- a/pyiceberg/table/metadata.py +++ b/pyiceberg/table/metadata.py @@ -218,6 +218,14 @@ class 
TableMetadataCommonFields(IcebergBaseModel): There is always a main branch reference pointing to the current-snapshot-id even if the refs map is null.""" + def snapshot_by_id(self, snapshot_id: int) -> Optional[Snapshot]: + """Get the snapshot by id.""" + return next((snapshot for snapshot in self.snapshots if snapshot.snapshot_id == snapshot_id), None) + + def schema_by_id(self, schema_id: int) -> Optional[Schema]: + """Get the schema by id.""" + return next((schema for schema in self.schemas if schema.schema_id == schema_id), None) + class TableMetadataV1(TableMetadataCommonFields, IcebergBaseModel): """Represents version 1 of the Table Metadata. diff --git a/tests/table/test_init.py b/tests/table/test_init.py index 98f39408b6..e0a545cbe9 100644 --- a/tests/table/test_init.py +++ b/tests/table/test_init.py @@ -40,12 +40,15 @@ AddSnapshotUpdate, SetPropertiesUpdate, SetSnapshotRefUpdate, + SnapshotRef, StaticTable, Table, + TableMetadataUpdateContext, UpdateSchema, _generate_snapshot_id, _match_deletes_to_datafile, - update_table_metadata, SnapshotRef, + apply_table_update, + update_table_metadata, ) from pyiceberg.table.metadata import INITIAL_SEQUENCE_NUMBER from pyiceberg.table.snapshots import ( @@ -512,6 +515,33 @@ def test_add_nested_list_type_column(table: Table) -> None: assert new_schema.highest_field_id == 7 +def test_apply_add_schema_update(table: Table) -> None: + transaction = table.transaction() + update = transaction.update_schema() + update.add_column(path="b", field_type=IntegerType()) + update.commit() + + test_context = TableMetadataUpdateContext() + + new_table_metadata = apply_table_update( + transaction._updates[0], base_metadata=table.metadata, context=test_context + ) # pylint: disable=W0212 + assert len(new_table_metadata.schemas) == 3 + assert new_table_metadata.current_schema_id == 1 + assert len(test_context.updates) == 1 + assert test_context.updates[0] == transaction._updates[0] # pylint: disable=W0212 + assert 
test_context.last_added_schema_id == 2 + + new_table_metadata = apply_table_update( + transaction._updates[1], base_metadata=new_table_metadata, context=test_context + ) # pylint: disable=W0212 + assert len(new_table_metadata.schemas) == 3 + assert new_table_metadata.current_schema_id == 2 + assert len(test_context.updates) == 2 + assert test_context.updates[1] == transaction._updates[1] # pylint: disable=W0212 + assert test_context.last_added_schema_id == 2 + + def test_update_metadata_table_schema(table: Table) -> None: transaction = table.transaction() update = transaction.update_schema() From 2882d0db4815831ff2e40a0fbf297500635e95e8 Mon Sep 17 00:00:00 2001 From: HonahX Date: Sat, 11 Nov 2023 22:32:03 -0800 Subject: [PATCH 10/27] clear TODO --- pyiceberg/table/__init__.py | 46 ++++++++++++++++++++++++++----------- pyiceberg/table/metadata.py | 15 +++++++----- 2 files changed, 42 insertions(+), 19 deletions(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 743479f709..c9de95bdb3 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -16,6 +16,7 @@ # under the License. from __future__ import annotations +import datetime import itertools import uuid from abc import ABC, abstractmethod @@ -95,6 +96,7 @@ StructType, ) from pyiceberg.utils.concurrent import ExecutorFactory +from pyiceberg.utils.datetime import datetime_to_millis if TYPE_CHECKING: import pandas as pd @@ -381,7 +383,18 @@ def is_added_schema(self, schema_id: int) -> bool: @singledispatch def apply_table_update(update: TableUpdate, base_metadata: TableMetadata, context: TableMetadataUpdateContext) -> TableMetadata: - raise ValueError(f"Unsupported update: {update}") + """Apply a table update to the table metadata. + + Args: + update: The update to be applied. + base_metadata: The base metadata to be updated. + context: Contains previous updates, last_added_snapshot_id and other change tracking information in the current transaction. 
+ + Returns: + The updated metadata. + + """ + raise NotImplementedError(f"Unsupported table update: {update}") @apply_table_update.register(UpgradeFormatVersionUpdate) @@ -455,14 +468,10 @@ def _(update: SetCurrentSchemaUpdate, base_metadata: TableMetadata, context: Tab if update.schema_id == base_metadata.current_schema_id: return base_metadata - schema = base_metadata.schema_by_id(update.schema_id) + schema = base_metadata.schemas_by_id.get(update.schema_id) if schema is None: raise ValueError(f"Schema with id {update.schema_id} does not exist") - # TODO: rebuild sort_order and partition_spec - # So it seems the rebuild just refresh the inner field which hold the schema and some naming check for partition_spec - # Seems this is not necessary in pyiceberg case wince - updated_metadata_data = copy(base_metadata.model_dump()) updated_metadata_data["current-schema-id"] = update.schema_id @@ -485,7 +494,7 @@ def _(update: AddSnapshotUpdate, base_metadata: TableMetadata, context: TableMet if len(base_metadata.sort_orders) == 0: raise ValueError("Attempting to add a snapshot before a sort order is added") - if base_metadata.snapshot_by_id(update.snapshot.snapshot_id) is not None: + if base_metadata.snapshots_by_id.get(update.snapshot.snapshot_id) is not None: raise ValueError(f"Snapshot with id {update.snapshot.snapshot_id} already exists") if ( @@ -495,7 +504,8 @@ def _(update: AddSnapshotUpdate, base_metadata: TableMetadata, context: TableMet and update.snapshot.parent_snapshot_id is not None ): raise ValueError( - f"Cannot add snapshot with sequence number {update.snapshot.sequence_number} older than last sequence number {base_metadata.last_sequence_number}" + f"Cannot add snapshot with sequence number {update.snapshot.sequence_number} " + f"older than last sequence number {base_metadata.last_sequence_number}" ) updated_metadata_data = copy(base_metadata.model_dump()) @@ -538,19 +548,20 @@ def _(update: SetSnapshotRefUpdate, base_metadata: TableMetadata, context: 
Table if existing_ref is not None and existing_ref == snapshot_ref: return base_metadata - snapshot = base_metadata.snapshot_by_id(snapshot_ref.snapshot_id) + snapshot = base_metadata.snapshots_by_id.get(snapshot_ref.snapshot_id) if snapshot is None: raise ValueError(f"Cannot set {snapshot_ref.ref_name} to unknown snapshot {snapshot_ref.snapshot_id}") update_metadata_data = copy(base_metadata.model_dump()) + update_last_updated_ms = True if context.is_added_snapshot(snapshot_ref.snapshot_id): update_metadata_data["last-updated-ms"] = snapshot.timestamp + update_last_updated_ms = False if update.ref_name == MAIN_BRANCH: update_metadata_data["current-snapshot-id"] = snapshot_ref.snapshot_id - # TODO: double-check if the default value of TableMetadata make the timestamp too early - # if base_metadata.last_updated_ms is None: - # update_metadata_data["last-updated-ms"] = datetime_to_millis(datetime.datetime.now().astimezone()) + if update_last_updated_ms: + update_metadata_data["last-updated-ms"] = datetime_to_millis(datetime.datetime.now().astimezone()) update_metadata_data["snapshot-log"].append( SnapshotLogEntry( snapshot_id=snapshot_ref.snapshot_id, @@ -564,6 +575,15 @@ def _(update: SetSnapshotRefUpdate, base_metadata: TableMetadata, context: Table def update_table_metadata(base_metadata: TableMetadata, updates: Tuple[TableUpdate, ...]) -> TableMetadata: + """Update the table metadata with the given updates in one transaction. + + Args: + base_metadata: The base metadata to be updated. + updates: The updates in one transaction. + + Returns: + The updated metadata. 
+ """ context = TableMetadataUpdateContext() new_metadata = base_metadata @@ -816,7 +836,7 @@ def current_snapshot(self) -> Optional[Snapshot]: def snapshot_by_id(self, snapshot_id: int) -> Optional[Snapshot]: """Get the snapshot of this table with the given id, or None if there is no matching snapshot.""" - return self.metadata.snapshot_by_id(snapshot_id) # pylint: disable=W0212 + return self.metadata.snapshots_by_id.get(snapshot_id) def snapshot_by_name(self, name: str) -> Optional[Snapshot]: """Return the snapshot referenced by the given name or null if no such reference exists.""" diff --git a/pyiceberg/table/metadata.py b/pyiceberg/table/metadata.py index 91a0dfad57..cb3799bf1c 100644 --- a/pyiceberg/table/metadata.py +++ b/pyiceberg/table/metadata.py @@ -19,6 +19,7 @@ import datetime import uuid from copy import copy +from functools import cached_property from typing import ( Any, Dict, @@ -218,13 +219,15 @@ class TableMetadataCommonFields(IcebergBaseModel): There is always a main branch reference pointing to the current-snapshot-id even if the refs map is null.""" - def snapshot_by_id(self, snapshot_id: int) -> Optional[Snapshot]: - """Get the snapshot by id.""" - return next((snapshot for snapshot in self.snapshots if snapshot.snapshot_id == snapshot_id), None) + @cached_property + def snapshots_by_id(self) -> Dict[int, Snapshot]: + """Index the snapshots by snapshot_id.""" + return {snapshot.snapshot_id: snapshot for snapshot in self.snapshots} - def schema_by_id(self, schema_id: int) -> Optional[Schema]: - """Get the schema by id.""" - return next((schema for schema in self.schemas if schema.schema_id == schema_id), None) + @cached_property + def schemas_by_id(self) -> Dict[int, Schema]: + """Index the schemas by schema_id.""" + return {schema.schema_id: schema for schema in self.schemas} class TableMetadataV1(TableMetadataCommonFields, IcebergBaseModel): From 8a8d4ffed2f230c387c06e627692066deefc8072 Mon Sep 17 00:00:00 2001 From: HonahX Date: Sat, 11 Nov 
2023 23:35:37 -0800 Subject: [PATCH 11/27] add a combined test --- pyiceberg/table/__init__.py | 7 +- tests/conftest.py | 42 ++++++- tests/table/test_init.py | 209 ++++++++++++++++++++++------------- tests/table/test_metadata.py | 25 ----- 4 files changed, 178 insertions(+), 105 deletions(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index c9de95bdb3..5473561e76 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -365,9 +365,6 @@ def __init__(self) -> None: self.updates = [] self.last_added_schema_id = None - def get_updates_by_action(self, update_type: TableUpdateAction) -> List[TableUpdate]: - return [update for update in self.updates if update.action == update_type] - def is_added_snapshot(self, snapshot_id: int) -> bool: return any( update.snapshot.snapshot_id == snapshot_id @@ -410,6 +407,8 @@ def _(update: UpgradeFormatVersionUpdate, base_metadata: TableMetadata, context: updated_metadata_data = copy(base_metadata.model_dump()) updated_metadata_data["format-version"] = update.format_version + + context.updates.append(update) return TableMetadataUtil.parse_obj(updated_metadata_data) @@ -555,7 +554,7 @@ def _(update: SetSnapshotRefUpdate, base_metadata: TableMetadata, context: Table update_metadata_data = copy(base_metadata.model_dump()) update_last_updated_ms = True if context.is_added_snapshot(snapshot_ref.snapshot_id): - update_metadata_data["last-updated-ms"] = snapshot.timestamp + update_metadata_data["last-updated-ms"] = snapshot.timestamp_ms update_last_updated_ms = False if update.ref_name == MAIN_BRANCH: diff --git a/tests/conftest.py b/tests/conftest.py index 79c01dc747..68a10be6b7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -73,7 +73,7 @@ from pyiceberg.schema import Accessor, Schema from pyiceberg.serializers import ToOutputFile from pyiceberg.table import FileScanTask, Table -from pyiceberg.table.metadata import TableMetadataV2 +from pyiceberg.table.metadata import TableMetadataV1, 
TableMetadataV2 from pyiceberg.types import ( BinaryType, BooleanType, @@ -353,6 +353,32 @@ def all_avro_types() -> Dict[str, Any]: } +EXAMPLE_TABLE_METADATA_V1 = { + "format-version": 1, + "table-uuid": "d20125c8-7284-442c-9aea-15fee620737c", + "location": "s3://bucket/test/location", + "last-updated-ms": 1602638573874, + "last-column-id": 3, + "schema": { + "type": "struct", + "fields": [ + {"id": 1, "name": "x", "required": True, "type": "long"}, + {"id": 2, "name": "y", "required": True, "type": "long", "doc": "comment"}, + {"id": 3, "name": "z", "required": True, "type": "long"}, + ], + }, + "partition-spec": [{"name": "x", "transform": "identity", "source-id": 1, "field-id": 1000}], + "properties": {}, + "current-snapshot-id": -1, + "snapshots": [{"snapshot-id": 1925, "timestamp-ms": 1602638573822}], +} + + +@pytest.fixture(scope="session") +def example_table_metadata_v1() -> Dict[str, Any]: + return EXAMPLE_TABLE_METADATA_V1 + + EXAMPLE_TABLE_METADATA_V2 = { "format-version": 2, "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", @@ -1651,7 +1677,19 @@ def example_task(data_file: str) -> FileScanTask: @pytest.fixture -def table(example_table_metadata_v2: Dict[str, Any]) -> Table: +def table_v1(example_table_metadata_v1: Dict[str, Any]) -> Table: + table_metadata = TableMetadataV1(**example_table_metadata_v1) + return Table( + identifier=("database", "table"), + metadata=table_metadata, + metadata_location=f"{table_metadata.location}/uuid.metadata.json", + io=load_file_io(), + catalog=NoopCatalog("NoopCatalog"), + ) + + +@pytest.fixture +def table_v2(example_table_metadata_v2: Dict[str, Any]) -> Table: table_metadata = TableMetadataV2(**example_table_metadata_v2) return Table( identifier=("database", "table"), diff --git a/tests/table/test_init.py b/tests/table/test_init.py index e0a545cbe9..e9522108eb 100644 --- a/tests/table/test_init.py +++ b/tests/table/test_init.py @@ -85,8 +85,8 @@ ) -def test_schema(table: Table) -> None: - assert table.schema() == 
Schema( +def test_schema(table_v2: Table) -> None: + assert table_v2.schema() == Schema( NestedField(field_id=1, name="x", field_type=LongType(), required=True), NestedField(field_id=2, name="y", field_type=LongType(), required=True, doc="comment"), NestedField(field_id=3, name="z", field_type=LongType(), required=True), @@ -95,8 +95,8 @@ def test_schema(table: Table) -> None: ) -def test_schemas(table: Table) -> None: - assert table.schemas() == { +def test_schemas(table_v2: Table) -> None: + assert table_v2.schemas() == { 0: Schema( NestedField(field_id=1, name="x", field_type=LongType(), required=True), schema_id=0, @@ -112,20 +112,20 @@ def test_schemas(table: Table) -> None: } -def test_spec(table: Table) -> None: - assert table.spec() == PartitionSpec( +def test_spec(table_v2: Table) -> None: + assert table_v2.spec() == PartitionSpec( PartitionField(source_id=1, field_id=1000, transform=IdentityTransform(), name="x"), spec_id=0 ) -def test_specs(table: Table) -> None: - assert table.specs() == { +def test_specs(table_v2: Table) -> None: + assert table_v2.specs() == { 0: PartitionSpec(PartitionField(source_id=1, field_id=1000, transform=IdentityTransform(), name="x"), spec_id=0) } -def test_sort_order(table: Table) -> None: - assert table.sort_order() == SortOrder( +def test_sort_order(table_v2: Table) -> None: + assert table_v2.sort_order() == SortOrder( SortField(source_id=2, transform=IdentityTransform(), direction=SortDirection.ASC, null_order=NullOrder.NULLS_FIRST), SortField( source_id=3, @@ -137,8 +137,8 @@ def test_sort_order(table: Table) -> None: ) -def test_sort_orders(table: Table) -> None: - assert table.sort_orders() == { +def test_sort_orders(table_v2: Table) -> None: + assert table_v2.sort_orders() == { 3: SortOrder( SortField(source_id=2, transform=IdentityTransform(), direction=SortDirection.ASC, null_order=NullOrder.NULLS_FIRST), SortField( @@ -152,12 +152,12 @@ def test_sort_orders(table: Table) -> None: } -def test_location(table: Table) 
-> None: - assert table.location() == "s3://bucket/test/location" +def test_location(table_v2: Table) -> None: + assert table_v2.location() == "s3://bucket/test/location" -def test_current_snapshot(table: Table) -> None: - assert table.current_snapshot() == Snapshot( +def test_current_snapshot(table_v2: Table) -> None: + assert table_v2.current_snapshot() == Snapshot( snapshot_id=3055729675574597004, parent_snapshot_id=3051729675574597004, sequence_number=1, @@ -168,8 +168,8 @@ def test_current_snapshot(table: Table) -> None: ) -def test_snapshot_by_id(table: Table) -> None: - assert table.snapshot_by_id(3055729675574597004) == Snapshot( +def test_snapshot_by_id(table_v2: Table) -> None: + assert table_v2.snapshot_by_id(3055729675574597004) == Snapshot( snapshot_id=3055729675574597004, parent_snapshot_id=3051729675574597004, sequence_number=1, @@ -180,12 +180,12 @@ def test_snapshot_by_id(table: Table) -> None: ) -def test_snapshot_by_id_does_not_exist(table: Table) -> None: - assert table.snapshot_by_id(-1) is None +def test_snapshot_by_id_does_not_exist(table_v2: Table) -> None: + assert table_v2.snapshot_by_id(-1) is None -def test_snapshot_by_name(table: Table) -> None: - assert table.snapshot_by_name("test") == Snapshot( +def test_snapshot_by_name(table_v2: Table) -> None: + assert table_v2.snapshot_by_name("test") == Snapshot( snapshot_id=3051729675574597004, parent_snapshot_id=None, sequence_number=0, @@ -196,11 +196,11 @@ def test_snapshot_by_name(table: Table) -> None: ) -def test_snapshot_by_name_does_not_exist(table: Table) -> None: - assert table.snapshot_by_name("doesnotexist") is None +def test_snapshot_by_name_does_not_exist(table_v2: Table) -> None: + assert table_v2.snapshot_by_name("doesnotexist") is None -def test_repr(table: Table) -> None: +def test_repr(table_v2: Table) -> None: expected = """table( 1: x: required long, 2: y: required long (comment), @@ -209,37 +209,37 @@ def test_repr(table: Table) -> None: partition by: [x], sort order: [2 
ASC NULLS FIRST, bucket[4](3) DESC NULLS LAST], snapshot: Operation.APPEND: id=3055729675574597004, parent_id=3051729675574597004, schema_id=1""" - assert repr(table) == expected + assert repr(table_v2) == expected -def test_history(table: Table) -> None: - assert table.history() == [ +def test_history(table_v2: Table) -> None: + assert table_v2.history() == [ SnapshotLogEntry(snapshot_id=3051729675574597004, timestamp_ms=1515100955770), SnapshotLogEntry(snapshot_id=3055729675574597004, timestamp_ms=1555100955770), ] -def test_table_scan_select(table: Table) -> None: - scan = table.scan() +def test_table_scan_select(table_v2: Table) -> None: + scan = table_v2.scan() assert scan.selected_fields == ("*",) assert scan.select("a", "b").selected_fields == ("a", "b") assert scan.select("a", "c").select("a").selected_fields == ("a",) -def test_table_scan_row_filter(table: Table) -> None: - scan = table.scan() +def test_table_scan_row_filter(table_v2: Table) -> None: + scan = table_v2.scan() assert scan.row_filter == AlwaysTrue() assert scan.filter(EqualTo("x", 10)).row_filter == EqualTo("x", 10) assert scan.filter(EqualTo("x", 10)).filter(In("y", (10, 11))).row_filter == And(EqualTo("x", 10), In("y", (10, 11))) -def test_table_scan_ref(table: Table) -> None: - scan = table.scan() +def test_table_scan_ref(table_v2: Table) -> None: + scan = table_v2.scan() assert scan.use_ref("test").snapshot_id == 3051729675574597004 -def test_table_scan_ref_does_not_exists(table: Table) -> None: - scan = table.scan() +def test_table_scan_ref_does_not_exists(table_v2: Table) -> None: + scan = table_v2.scan() with pytest.raises(ValueError) as exc_info: _ = scan.use_ref("boom") @@ -247,8 +247,8 @@ def test_table_scan_ref_does_not_exists(table: Table) -> None: assert "Cannot scan unknown ref=boom" in str(exc_info.value) -def test_table_scan_projection_full_schema(table: Table) -> None: - scan = table.scan() +def test_table_scan_projection_full_schema(table_v2: Table) -> None: + scan = 
table_v2.scan() assert scan.select("x", "y", "z").projection() == Schema( NestedField(field_id=1, name="x", field_type=LongType(), required=True), NestedField(field_id=2, name="y", field_type=LongType(), required=True, doc="comment"), @@ -258,8 +258,8 @@ def test_table_scan_projection_full_schema(table: Table) -> None: ) -def test_table_scan_projection_single_column(table: Table) -> None: - scan = table.scan() +def test_table_scan_projection_single_column(table_v2: Table) -> None: + scan = table_v2.scan() assert scan.select("y").projection() == Schema( NestedField(field_id=2, name="y", field_type=LongType(), required=True, doc="comment"), schema_id=1, @@ -267,8 +267,8 @@ def test_table_scan_projection_single_column(table: Table) -> None: ) -def test_table_scan_projection_single_column_case_sensitive(table: Table) -> None: - scan = table.scan() +def test_table_scan_projection_single_column_case_sensitive(table_v2: Table) -> None: + scan = table_v2.scan() assert scan.with_case_sensitive(False).select("Y").projection() == Schema( NestedField(field_id=2, name="y", field_type=LongType(), required=True, doc="comment"), schema_id=1, @@ -276,8 +276,8 @@ def test_table_scan_projection_single_column_case_sensitive(table: Table) -> Non ) -def test_table_scan_projection_unknown_column(table: Table) -> None: - scan = table.scan() +def test_table_scan_projection_unknown_column(table_v2: Table) -> None: + scan = table_v2.scan() with pytest.raises(ValueError) as exc_info: _ = scan.select("a").projection() @@ -285,16 +285,16 @@ def test_table_scan_projection_unknown_column(table: Table) -> None: assert "Could not find column: 'a'" in str(exc_info.value) -def test_static_table_same_as_table(table: Table, metadata_location: str) -> None: +def test_static_table_same_as_table(table_v2: Table, metadata_location: str) -> None: static_table = StaticTable.from_metadata(metadata_location) assert isinstance(static_table, Table) - assert static_table.metadata == table.metadata + assert 
static_table.metadata == table_v2.metadata -def test_static_table_gz_same_as_table(table: Table, metadata_location_gz: str) -> None: +def test_static_table_gz_same_as_table(table_v2: Table, metadata_location_gz: str) -> None: static_table = StaticTable.from_metadata(metadata_location_gz) assert isinstance(static_table, Table) - assert static_table.metadata == table.metadata + assert static_table.metadata == table_v2.metadata def test_static_table_io_does_not_exist(metadata_location: str) -> None: @@ -415,8 +415,8 @@ def test_serialize_set_properties_updates() -> None: assert SetPropertiesUpdate(updates={"abc": "🤪"}).model_dump_json() == """{"action":"set-properties","updates":{"abc":"🤪"}}""" -def test_add_column(table: Table) -> None: - update = UpdateSchema(table) +def test_add_column(table_v2: Table) -> None: + update = UpdateSchema(table_v2) update.add_column(path="b", field_type=IntegerType()) apply_schema: Schema = update._apply() # pylint: disable=W0212 assert len(apply_schema.fields) == 4 @@ -432,7 +432,7 @@ def test_add_column(table: Table) -> None: assert apply_schema.highest_field_id == 4 -def test_add_primitive_type_column(table: Table) -> None: +def test_add_primitive_type_column(table_v2: Table) -> None: primitive_type: Dict[str, PrimitiveType] = { "boolean": BooleanType(), "int": IntegerType(), @@ -450,7 +450,7 @@ def test_add_primitive_type_column(table: Table) -> None: for name, type_ in primitive_type.items(): field_name = f"new_column_{name}" - update = UpdateSchema(table) + update = UpdateSchema(table_v2) update.add_column(path=field_name, field_type=type_, doc=f"new_column_{name}") new_schema = update._apply() # pylint: disable=W0212 @@ -459,10 +459,10 @@ def test_add_primitive_type_column(table: Table) -> None: assert field.doc == f"new_column_{name}" -def test_add_nested_type_column(table: Table) -> None: +def test_add_nested_type_column(table_v2: Table) -> None: # add struct type column field_name = "new_column_struct" - update = 
UpdateSchema(table) + update = UpdateSchema(table_v2) struct_ = StructType( NestedField(1, "lat", DoubleType()), NestedField(2, "long", DoubleType()), @@ -477,10 +477,10 @@ def test_add_nested_type_column(table: Table) -> None: assert schema_.highest_field_id == 6 -def test_add_nested_map_type_column(table: Table) -> None: +def test_add_nested_map_type_column(table_v2: Table) -> None: # add map type column field_name = "new_column_map" - update = UpdateSchema(table) + update = UpdateSchema(table_v2) map_ = MapType(1, StringType(), 2, IntegerType(), False) update.add_column(path=field_name, field_type=map_) new_schema = update._apply() # pylint: disable=W0212 @@ -489,10 +489,10 @@ def test_add_nested_map_type_column(table: Table) -> None: assert new_schema.highest_field_id == 6 -def test_add_nested_list_type_column(table: Table) -> None: +def test_add_nested_list_type_column(table_v2: Table) -> None: # add list type column field_name = "new_column_list" - update = UpdateSchema(table) + update = UpdateSchema(table_v2) list_ = ListType( element_id=101, element_type=StructType( @@ -515,8 +515,8 @@ def test_add_nested_list_type_column(table: Table) -> None: assert new_schema.highest_field_id == 7 -def test_apply_add_schema_update(table: Table) -> None: - transaction = table.transaction() +def test_apply_add_schema_update(table_v2: Table) -> None: + transaction = table_v2.transaction() update = transaction.update_schema() update.add_column(path="b", field_type=IntegerType()) update.commit() @@ -524,7 +524,7 @@ def test_apply_add_schema_update(table: Table) -> None: test_context = TableMetadataUpdateContext() new_table_metadata = apply_table_update( - transaction._updates[0], base_metadata=table.metadata, context=test_context + transaction._updates[0], base_metadata=table_v2.metadata, context=test_context ) # pylint: disable=W0212 assert len(new_table_metadata.schemas) == 3 assert new_table_metadata.current_schema_id == 1 @@ -542,12 +542,12 @@ def 
test_apply_add_schema_update(table: Table) -> None: assert test_context.last_added_schema_id == 2 -def test_update_metadata_table_schema(table: Table) -> None: - transaction = table.transaction() +def test_update_metadata_table_schema(table_v2: Table) -> None: + transaction = table_v2.transaction() update = transaction.update_schema() update.add_column(path="b", field_type=IntegerType()) update.commit() - new_metadata = update_table_metadata(table.metadata, transaction._updates) # pylint: disable=W0212 + new_metadata = update_table_metadata(table_v2.metadata, transaction._updates) # pylint: disable=W0212 apply_schema: Schema = next(schema for schema in new_metadata.schemas if schema.schema_id == 2) assert len(apply_schema.fields) == 4 @@ -564,7 +564,7 @@ def test_update_metadata_table_schema(table: Table) -> None: assert new_metadata.current_schema_id == 2 -def test_update_metadata_add_snapshot(table: Table) -> None: +def test_update_metadata_add_snapshot(table_v2: Table) -> None: new_snapshot = Snapshot( snapshot_id=25, parent_snapshot_id=19, @@ -575,14 +575,14 @@ def test_update_metadata_add_snapshot(table: Table) -> None: schema_id=3, ) - new_metadata = update_table_metadata(table.metadata, (AddSnapshotUpdate(snapshot=new_snapshot),)) + new_metadata = update_table_metadata(table_v2.metadata, (AddSnapshotUpdate(snapshot=new_snapshot),)) assert len(new_metadata.snapshots) == 3 assert new_metadata.snapshots[-1] == new_snapshot assert new_metadata.last_sequence_number == new_snapshot.sequence_number assert new_metadata.last_updated_ms == new_snapshot.timestamp_ms -def test_update_metadata_set_snapshot_ref(table: Table) -> None: +def test_update_metadata_set_snapshot_ref(table_v2: Table) -> None: update = SetSnapshotRefUpdate( ref_name="main", type="branch", @@ -592,13 +592,11 @@ def test_update_metadata_set_snapshot_ref(table: Table) -> None: min_snapshots_to_keep=1, ) - new_metadata = update_table_metadata(table.metadata, (update,)) + new_metadata = 
update_table_metadata(table_v2.metadata, (update,)) assert len(new_metadata.snapshot_log) == 3 - assert new_metadata.snapshot_log[2] == SnapshotLogEntry( - snapshot_id=3051729675574597004, timestamp_ms=table.metadata.last_updated_ms - ) + assert new_metadata.snapshot_log[2].snapshot_id == 3051729675574597004 assert new_metadata.current_snapshot_id == 3051729675574597004 - assert new_metadata.last_updated_ms == table.metadata.last_updated_ms + assert new_metadata.last_updated_ms > table_v2.metadata.last_updated_ms assert new_metadata.refs[update.ref_name] == SnapshotRef( snapshot_id=3051729675574597004, snapshot_ref_type="branch", @@ -608,6 +606,69 @@ def test_update_metadata_set_snapshot_ref(table: Table) -> None: ) -def test_generate_snapshot_id(table: Table) -> None: +def test_update_metadata_with_multiple_updates(table_v1: Table) -> None: + base_metadata = table_v1.metadata + transaction = table_v1.transaction() + transaction.upgrade_table_version(format_version=2) + + schema_update_1 = transaction.update_schema() + schema_update_1.add_column(path="b", field_type=IntegerType()) + schema_update_1.commit() + + test_updates = transaction._updates # pylint: disable=W0212 + + new_snapshot = Snapshot( + snapshot_id=25, + parent_snapshot_id=19, + sequence_number=200, + timestamp_ms=1602638573590, + manifest_list="s3:/a/b/c.avro", + summary=Summary(Operation.APPEND), + schema_id=3, + ) + + test_updates += ( + AddSnapshotUpdate(snapshot=new_snapshot), + SetSnapshotRefUpdate( + ref_name="main", + type="branch", + snapshot_id=25, + max_ref_age_ms=123123123, + max_snapshot_age_ms=12312312312, + min_snapshots_to_keep=1, + ), + ) + + new_metadata = update_table_metadata(base_metadata, test_updates) + + # UpgradeFormatVersionUpdate + assert new_metadata.format_version == 2 + + # UpdateSchema + assert len(new_metadata.schemas) == 2 + assert new_metadata.current_schema_id == 1 + assert new_metadata.schemas_by_id[new_metadata.current_schema_id].highest_field_id == 4 + + # 
AddSchemaUpdate + assert len(new_metadata.snapshots) == 2 + assert new_metadata.snapshots[-1] == new_snapshot + assert new_metadata.last_sequence_number == new_snapshot.sequence_number + assert new_metadata.last_updated_ms == new_snapshot.timestamp_ms + + # SetSnapshotRefUpdate + assert len(new_metadata.snapshot_log) == 1 + assert new_metadata.snapshot_log[0].snapshot_id == 25 + assert new_metadata.current_snapshot_id == 25 + assert new_metadata.last_updated_ms == 1602638573590 + assert new_metadata.refs["main"] == SnapshotRef( + snapshot_id=25, + snapshot_ref_type="branch", + min_snapshots_to_keep=1, + max_snapshot_age_ms=12312312312, + max_ref_age_ms=123123123, + ) + + +def test_generate_snapshot_id(table_v2: Table) -> None: assert isinstance(_generate_snapshot_id(), int) - assert isinstance(table.new_snapshot_id(), int) + assert isinstance(table_v2.new_snapshot_id(), int) diff --git a/tests/table/test_metadata.py b/tests/table/test_metadata.py index 2273843645..9f35dcbe8d 100644 --- a/tests/table/test_metadata.py +++ b/tests/table/test_metadata.py @@ -51,31 +51,6 @@ StructType, ) -EXAMPLE_TABLE_METADATA_V1 = { - "format-version": 1, - "table-uuid": "d20125c8-7284-442c-9aea-15fee620737c", - "location": "s3://bucket/test/location", - "last-updated-ms": 1602638573874, - "last-column-id": 3, - "schema": { - "type": "struct", - "fields": [ - {"id": 1, "name": "x", "required": True, "type": "long"}, - {"id": 2, "name": "y", "required": True, "type": "long", "doc": "comment"}, - {"id": 3, "name": "z", "required": True, "type": "long"}, - ], - }, - "partition-spec": [{"name": "x", "transform": "identity", "source-id": 1, "field-id": 1000}], - "properties": {}, - "current-snapshot-id": -1, - "snapshots": [{"snapshot-id": 1925, "timestamp-ms": 1602638573822}], -} - - -@pytest.fixture(scope="session") -def example_table_metadata_v1() -> Dict[str, Any]: - return EXAMPLE_TABLE_METADATA_V1 - def test_from_dict_v1(example_table_metadata_v1: Dict[str, Any]) -> None: """Test 
initialization of a TableMetadata instance from a dictionary""" From 1cfe9d20b7a189f39d5d566f4290e46b2ca888d5 Mon Sep 17 00:00:00 2001 From: HonahX Date: Sat, 11 Nov 2023 23:44:10 -0800 Subject: [PATCH 12/27] Fix merge conflict --- pyiceberg/table/__init__.py | 2 -- tests/conftest.py | 1 - 2 files changed, 3 deletions(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 6f659d05d1..f81b8e836c 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -77,8 +77,6 @@ TableMetadataUtil, ) from pyiceberg.table.refs import MAIN_BRANCH, SnapshotRef, SnapshotRefType -from pyiceberg.table.metadata import INITIAL_SEQUENCE_NUMBER, TableMetadata -from pyiceberg.table.refs import SnapshotRef from pyiceberg.table.snapshots import Snapshot, SnapshotLogEntry from pyiceberg.table.sorting import SortOrder from pyiceberg.typedef import ( diff --git a/tests/conftest.py b/tests/conftest.py index 3bb608fd53..11b371586c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -74,7 +74,6 @@ from pyiceberg.serializers import ToOutputFile from pyiceberg.table import FileScanTask, Table from pyiceberg.table.metadata import TableMetadataV1, TableMetadataV2 -from pyiceberg.table.metadata import TableMetadataV2 from pyiceberg.typedef import UTF8 from pyiceberg.types import ( BinaryType, From 7afe31819498c9ee317644443d2ec0aaef3ee34e Mon Sep 17 00:00:00 2001 From: HonahX Date: Thu, 7 Dec 2023 22:41:45 -0800 Subject: [PATCH 13/27] update table metadata merged --- pyiceberg/catalog/glue.py | 7 ++++--- pyiceberg/table/__init__.py | 7 ------- tests/catalog/test_glue.py | 1 + tests/conftest.py | 2 +- 4 files changed, 6 insertions(+), 11 deletions(-) diff --git a/pyiceberg/catalog/glue.py b/pyiceberg/catalog/glue.py index a9d2fb0251..e5d3dc6348 100644 --- a/pyiceberg/catalog/glue.py +++ b/pyiceberg/catalog/glue.py @@ -279,10 +279,11 @@ def _commit_table(self, table_request: CommitTableRequest) -> CommitTableRespons Raises: NoSuchTableError: If a table 
with the given identifier does not exist. """ - # TODO: This is a workaround for issue: https://github.com/apache/iceberg-python/issues/123 - database_name, table_name = self.identifier_to_database_and_table( - tuple(table_request.identifier.namespace.root[1:] + [table_request.identifier.name]), NoSuchTableError + identifier_tuple = self.identifier_to_tuple_without_catalog( + tuple(table_request.identifier.namespace.root + [table_request.identifier.name]) ) + database_name, table_name = self.identifier_to_database_and_table(identifier_tuple) + current_glue_table = self._get_glue_table(database_name=database_name, table_name=table_name) glue_table_version_id = current_glue_table.get("VersionId") if glue_table_version_id is None: diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 958aa5af4b..5b2707b02c 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -78,13 +78,6 @@ TableMetadataUtil, ) from pyiceberg.table.refs import MAIN_BRANCH, SnapshotRef -from pyiceberg.table.metadata import ( - INITIAL_SEQUENCE_NUMBER, - SUPPORTED_TABLE_FORMAT_VERSION, - TableMetadata, - TableMetadataUtil, -) -from pyiceberg.table.refs import MAIN_BRANCH, SnapshotRef, SnapshotRefType from pyiceberg.table.snapshots import Snapshot, SnapshotLogEntry from pyiceberg.table.sorting import SortOrder from pyiceberg.typedef import ( diff --git a/tests/catalog/test_glue.py b/tests/catalog/test_glue.py index 5c091d4617..2bcdc1570b 100644 --- a/tests/catalog/test_glue.py +++ b/tests/catalog/test_glue.py @@ -554,6 +554,7 @@ def test_commit_table_update_schema( transaction.commit_transaction() updated_table_metadata = table.metadata + print(table.metadata_location) assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) assert test_catalog._parse_metadata_version(table.metadata_location) == 1 diff --git a/tests/conftest.py b/tests/conftest.py index 367e08151e..87226015b1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ 
-1740,7 +1740,7 @@ def database_list(database_name: str) -> List[str]: TABLE_METADATA_LOCATION_REGEX = re.compile( r"""s3://test_bucket/my_iceberg_database-[a-z]{20}.db/ my_iceberg_table-[a-z]{20}/metadata/ - 00000-[a-f0-9]{8}-?[a-f0-9]{4}-?4[a-f0-9]{3}-?[89ab][a-f0-9]{3}-?[a-f0-9]{12}.metadata.json""", + [0-9]{5}-[a-f0-9]{8}-?[a-f0-9]{4}-?4[a-f0-9]{3}-?[89ab][a-f0-9]{3}-?[a-f0-9]{12}.metadata.json""", re.X, ) From f769101bbdd8062320faaab998c8319c5b75f418 Mon Sep 17 00:00:00 2001 From: HonahX Date: Sun, 10 Dec 2023 15:21:28 -0800 Subject: [PATCH 14/27] implement requirements validation --- pyiceberg/table/__init__.py | 58 +++++++++++++++++++- tests/table/test_init.py | 105 ++++++++++++++++++++++++++++++++++++ 2 files changed, 162 insertions(+), 1 deletion(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 4768706d1d..91f725ee02 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -540,18 +540,31 @@ def update_table_metadata(base_metadata: TableMetadata, updates: Tuple[TableUpda class TableRequirement(IcebergBaseModel): type: str + @abstractmethod + def validate(self, base_metadata: TableMetadata) -> None: + """Validate the requirement against the base metadata.""" + ... 
+ class AssertCreate(TableRequirement): """The table must not already exist; used for create transactions.""" type: Literal["assert-create"] = Field(default="assert-create") + def validate(self, base_metadata: Optional[TableMetadata]) -> None: + if base_metadata is not None: + raise ValueError("Table already exists") + class AssertTableUUID(TableRequirement): """The table UUID must match the requirement's `uuid`.""" type: Literal["assert-table-uuid"] = Field(default="assert-table-uuid") - uuid: str + uuid: uuid.UUID + + def validate(self, base_metadata: TableMetadata) -> None: + if self.uuid != base_metadata.table_uuid: + raise ValueError(f"Table UUID does not match: {self.uuid} != {base_metadata.table_uuid}") class AssertRefSnapshotId(TableRequirement): @@ -564,6 +577,19 @@ class AssertRefSnapshotId(TableRequirement): ref: str snapshot_id: Optional[int] = Field(default=None, alias="snapshot-id") + def validate(self, base_metadata: TableMetadata) -> None: + snapshot_ref = base_metadata.refs.get(self.ref) + if snapshot_ref is not None: + ref_type = snapshot_ref.snapshot_ref_type + if self.snapshot_id is None: + raise ValueError(f"Requirement failed: {ref_type} {self.ref} was created concurrently") + elif self.snapshot_id != snapshot_ref.snapshot_id: + raise ValueError( + f"Requirement failed: {ref_type} {self.ref} has changed: expected id {self.snapshot_id}, found {snapshot_ref.snapshot_id}" + ) + elif self.snapshot_id is not None: + raise ValueError(f"Requirement failed: branch or tag {self.ref} is missing, expected {self.snapshot_id}") + class AssertLastAssignedFieldId(TableRequirement): """The table's last assigned column id must match the requirement's `last-assigned-field-id`.""" @@ -571,6 +597,12 @@ class AssertLastAssignedFieldId(TableRequirement): type: Literal["assert-last-assigned-field-id"] = Field(default="assert-last-assigned-field-id") last_assigned_field_id: int = Field(..., alias="last-assigned-field-id") + def validate(self, base_metadata: 
TableMetadata) -> None: + if base_metadata.last_column_id != self.last_assigned_field_id: + raise ValueError( + f"Requirement failed: last assigned field id has changed: expected {self.last_assigned_field_id}, found {base_metadata.last_column_id}" + ) + class AssertCurrentSchemaId(TableRequirement): """The table's current schema id must match the requirement's `current-schema-id`.""" @@ -578,6 +610,12 @@ class AssertCurrentSchemaId(TableRequirement): type: Literal["assert-current-schema-id"] = Field(default="assert-current-schema-id") current_schema_id: int = Field(..., alias="current-schema-id") + def validate(self, base_metadata: TableMetadata) -> None: + if self.current_schema_id != base_metadata.current_schema_id: + raise ValueError( + f"Requirement failed: current schema id has changed: expected {self.current_schema_id}, found {base_metadata.current_schema_id}" + ) + class AssertLastAssignedPartitionId(TableRequirement): """The table's last assigned partition id must match the requirement's `last-assigned-partition-id`.""" @@ -585,6 +623,12 @@ class AssertLastAssignedPartitionId(TableRequirement): type: Literal["assert-last-assigned-partition-id"] = Field(default="assert-last-assigned-partition-id") last_assigned_partition_id: int = Field(..., alias="last-assigned-partition-id") + def validate(self, base_metadata: TableMetadata) -> None: + if base_metadata.last_partition_id != self.last_assigned_partition_id: + raise ValueError( + f"Requirement failed: last assigned partition id has changed: expected {self.last_assigned_partition_id}, found {base_metadata.last_partition_id}" + ) + class AssertDefaultSpecId(TableRequirement): """The table's default spec id must match the requirement's `default-spec-id`.""" @@ -592,6 +636,12 @@ class AssertDefaultSpecId(TableRequirement): type: Literal["assert-default-spec-id"] = Field(default="assert-default-spec-id") default_spec_id: int = Field(..., alias="default-spec-id") + def validate(self, base_metadata: TableMetadata) 
-> None: + if self.default_spec_id != base_metadata.default_spec_id: + raise ValueError( + f"Requirement failed: default spec id has changed: expected {self.default_spec_id}, found {base_metadata.default_spec_id}" + ) + class AssertDefaultSortOrderId(TableRequirement): """The table's default sort order id must match the requirement's `default-sort-order-id`.""" @@ -599,6 +649,12 @@ class AssertDefaultSortOrderId(TableRequirement): type: Literal["assert-default-sort-order-id"] = Field(default="assert-default-sort-order-id") default_sort_order_id: int = Field(..., alias="default-sort-order-id") + def validate(self, base_metadata: TableMetadata) -> None: + if self.default_sort_order_id != base_metadata.default_sort_order_id: + raise ValueError( + f"Requirement failed: default sort order id has changed: expected {self.default_sort_order_id}, found {base_metadata.default_sort_order_id}" + ) + class Namespace(IcebergRootModel[List[str]]): """Reference to one or more levels of a namespace.""" diff --git a/tests/table/test_init.py b/tests/table/test_init.py index 8d13a82f3a..713784e8d7 100644 --- a/tests/table/test_init.py +++ b/tests/table/test_init.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
# pylint:disable=redefined-outer-name +import uuid from copy import copy from typing import Dict @@ -39,6 +40,14 @@ from pyiceberg.schema import Schema from pyiceberg.table import ( AddSnapshotUpdate, + AssertCreate, + AssertCurrentSchemaId, + AssertDefaultSortOrderId, + AssertDefaultSpecId, + AssertLastAssignedFieldId, + AssertLastAssignedPartitionId, + AssertRefSnapshotId, + AssertTableUUID, SetPropertiesUpdate, SetSnapshotRefUpdate, SnapshotRef, @@ -721,3 +730,99 @@ def test_metadata_isolation_from_illegal_updates(table_v1: Table) -> None: def test_generate_snapshot_id(table_v2: Table) -> None: assert isinstance(_generate_snapshot_id(), int) assert isinstance(table_v2.new_snapshot_id(), int) + + +def test_assert_create(table_v2: Table) -> None: + AssertCreate().validate(None) + + with pytest.raises(ValueError, match="Table already exists"): + AssertCreate().validate(table_v2.metadata) + + +def test_assert_table_uuid(table_v2: Table) -> None: + base_metadata = table_v2.metadata + AssertTableUUID(uuid=base_metadata.table_uuid).validate(base_metadata) + + with pytest.raises( + ValueError, + match="Table UUID does not match: 9c12d441-03fe-4693-9a96-a0705ddf69c2 != 9c12d441-03fe-4693-9a96-a0705ddf69c1", + ): + AssertTableUUID(uuid=uuid.UUID("9c12d441-03fe-4693-9a96-a0705ddf69c2")).validate(base_metadata) + + +def test_assert_ref_snapshot_id(table_v2: Table) -> None: + base_metadata = table_v2.metadata + AssertRefSnapshotId(ref="main", snapshot_id=base_metadata.current_snapshot_id).validate(base_metadata) + + with pytest.raises( + ValueError, + match="Requirement failed: SnapshotRefType.BRANCH main was created concurrently", + ): + AssertRefSnapshotId(ref="main", snapshot_id=None).validate(base_metadata) + + with pytest.raises( + ValueError, + match="Requirement failed: SnapshotRefType.BRANCH main has changed: expected id 1, found 3055729675574597004", + ): + AssertRefSnapshotId(ref="main", snapshot_id=1).validate(base_metadata) + + with pytest.raises( + ValueError, + 
match="Requirement failed: branch or tag not_exist is missing, expected 1", + ): + AssertRefSnapshotId(ref="not_exist", snapshot_id=1).validate(base_metadata) + + +def test_assert_last_assigned_field_id(table_v2: Table) -> None: + base_metadata = table_v2.metadata + AssertLastAssignedFieldId(last_assigned_field_id=base_metadata.last_column_id).validate(base_metadata) + + with pytest.raises( + ValueError, + match="Requirement failed: last assigned field id has changed: expected 1, found 3", + ): + AssertLastAssignedFieldId(last_assigned_field_id=1).validate(base_metadata) + + +def test_assert_current_schema_id(table_v2: Table) -> None: + base_metadata = table_v2.metadata + AssertCurrentSchemaId(current_schema_id=base_metadata.current_schema_id).validate(base_metadata) + + with pytest.raises( + ValueError, + match="Requirement failed: current schema id has changed: expected 2, found 1", + ): + AssertCurrentSchemaId(current_schema_id=2).validate(base_metadata) + + +def test_last_assigned_partition_id(table_v2: Table) -> None: + base_metadata = table_v2.metadata + AssertLastAssignedPartitionId(last_assigned_partition_id=base_metadata.last_partition_id).validate(base_metadata) + + with pytest.raises( + ValueError, + match="Requirement failed: last assigned partition id has changed: expected 1, found 1000", + ): + AssertLastAssignedPartitionId(last_assigned_partition_id=1).validate(base_metadata) + + +def test_assert_default_spec_id(table_v2: Table) -> None: + base_metadata = table_v2.metadata + AssertDefaultSpecId(default_spec_id=base_metadata.default_spec_id).validate(base_metadata) + + with pytest.raises( + ValueError, + match="Requirement failed: default spec id has changed: expected 1, found 0", + ): + AssertDefaultSpecId(default_spec_id=1).validate(base_metadata) + + +def test_assert_default_sort_order_id(table_v2: Table) -> None: + base_metadata = table_v2.metadata + 
AssertDefaultSortOrderId(default_sort_order_id=base_metadata.default_sort_order_id).validate(base_metadata) + + with pytest.raises( + ValueError, + match="Requirement failed: default sort order id has changed: expected 1, found 3", + ): + AssertDefaultSortOrderId(default_sort_order_id=1).validate(base_metadata) From 4282d37abe734bb4a714574925cef133eb6663eb Mon Sep 17 00:00:00 2001 From: HonahX Date: Sun, 10 Dec 2023 15:27:40 -0800 Subject: [PATCH 15/27] change the exception to CommitFailedException --- pyiceberg/exceptions.py | 2 +- pyiceberg/table/__init__.py | 22 +++++++++++----------- tests/table/test_init.py | 21 +++++++++++---------- 3 files changed, 23 insertions(+), 22 deletions(-) diff --git a/pyiceberg/exceptions.py b/pyiceberg/exceptions.py index f555543723..64356b11a4 100644 --- a/pyiceberg/exceptions.py +++ b/pyiceberg/exceptions.py @@ -104,7 +104,7 @@ class GenericDynamoDbError(DynamoDbError): pass -class CommitFailedException(RESTError): +class CommitFailedException(Exception): """Commit failed, refresh and try again.""" diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 91f725ee02..8aceb03b95 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -44,7 +44,7 @@ from sortedcontainers import SortedList from typing_extensions import Annotated -from pyiceberg.exceptions import ResolveError, ValidationError +from pyiceberg.exceptions import CommitFailedException, ResolveError, ValidationError from pyiceberg.expressions import ( AlwaysTrue, And, @@ -553,7 +553,7 @@ class AssertCreate(TableRequirement): def validate(self, base_metadata: Optional[TableMetadata]) -> None: if base_metadata is not None: - raise ValueError("Table already exists") + raise CommitFailedException("Table already exists") class AssertTableUUID(TableRequirement): @@ -564,7 +564,7 @@ class AssertTableUUID(TableRequirement): def validate(self, base_metadata: TableMetadata) -> None: if self.uuid != base_metadata.table_uuid: - raise 
ValueError(f"Table UUID does not match: {self.uuid} != {base_metadata.table_uuid}") + raise CommitFailedException(f"Table UUID does not match: {self.uuid} != {base_metadata.table_uuid}") class AssertRefSnapshotId(TableRequirement): @@ -582,13 +582,13 @@ def validate(self, base_metadata: TableMetadata) -> None: if snapshot_ref is not None: ref_type = snapshot_ref.snapshot_ref_type if self.snapshot_id is None: - raise ValueError(f"Requirement failed: {ref_type} {self.ref} was created concurrently") + raise CommitFailedException(f"Requirement failed: {ref_type} {self.ref} was created concurrently") elif self.snapshot_id != snapshot_ref.snapshot_id: - raise ValueError( + raise CommitFailedException( f"Requirement failed: {ref_type} {self.ref} has changed: expected id {self.snapshot_id}, found {snapshot_ref.snapshot_id}" ) elif self.snapshot_id is not None: - raise ValueError(f"Requirement failed: branch or tag {self.ref} is missing, expected {self.snapshot_id}") + raise CommitFailedException(f"Requirement failed: branch or tag {self.ref} is missing, expected {self.snapshot_id}") class AssertLastAssignedFieldId(TableRequirement): @@ -599,7 +599,7 @@ class AssertLastAssignedFieldId(TableRequirement): def validate(self, base_metadata: TableMetadata) -> None: if base_metadata.last_column_id != self.last_assigned_field_id: - raise ValueError( + raise CommitFailedException( f"Requirement failed: last assigned field id has changed: expected {self.last_assigned_field_id}, found {base_metadata.last_column_id}" ) @@ -612,7 +612,7 @@ class AssertCurrentSchemaId(TableRequirement): def validate(self, base_metadata: TableMetadata) -> None: if self.current_schema_id != base_metadata.current_schema_id: - raise ValueError( + raise CommitFailedException( f"Requirement failed: current schema id has changed: expected {self.current_schema_id}, found {base_metadata.current_schema_id}" ) @@ -625,7 +625,7 @@ class AssertLastAssignedPartitionId(TableRequirement): def validate(self, 
base_metadata: TableMetadata) -> None: if base_metadata.last_partition_id != self.last_assigned_partition_id: - raise ValueError( + raise CommitFailedException( f"Requirement failed: last assigned partition id has changed: expected {self.last_assigned_partition_id}, found {base_metadata.last_partition_id}" ) @@ -638,7 +638,7 @@ class AssertDefaultSpecId(TableRequirement): def validate(self, base_metadata: TableMetadata) -> None: if self.default_spec_id != base_metadata.default_spec_id: - raise ValueError( + raise CommitFailedException( f"Requirement failed: default spec id has changed: expected {self.default_spec_id}, found {base_metadata.default_spec_id}" ) @@ -651,7 +651,7 @@ class AssertDefaultSortOrderId(TableRequirement): def validate(self, base_metadata: TableMetadata) -> None: if self.default_sort_order_id != base_metadata.default_sort_order_id: - raise ValueError( + raise CommitFailedException( f"Requirement failed: default sort order id has changed: expected {self.default_sort_order_id}, found {base_metadata.default_sort_order_id}" ) diff --git a/tests/table/test_init.py b/tests/table/test_init.py index 713784e8d7..c2f201134c 100644 --- a/tests/table/test_init.py +++ b/tests/table/test_init.py @@ -22,6 +22,7 @@ import pytest from sortedcontainers import SortedList +from pyiceberg.exceptions import CommitFailedException from pyiceberg.expressions import ( AlwaysTrue, And, @@ -735,7 +736,7 @@ def test_generate_snapshot_id(table_v2: Table) -> None: def test_assert_create(table_v2: Table) -> None: AssertCreate().validate(None) - with pytest.raises(ValueError, match="Table already exists"): + with pytest.raises(CommitFailedException, match="Table already exists"): AssertCreate().validate(table_v2.metadata) @@ -744,7 +745,7 @@ def test_assert_table_uuid(table_v2: Table) -> None: AssertTableUUID(uuid=base_metadata.table_uuid).validate(base_metadata) with pytest.raises( - ValueError, + CommitFailedException, match="Table UUID does not match: 
9c12d441-03fe-4693-9a96-a0705ddf69c2 != 9c12d441-03fe-4693-9a96-a0705ddf69c1", ): AssertTableUUID(uuid=uuid.UUID("9c12d441-03fe-4693-9a96-a0705ddf69c2")).validate(base_metadata) @@ -755,19 +756,19 @@ def test_assert_ref_snapshot_id(table_v2: Table) -> None: AssertRefSnapshotId(ref="main", snapshot_id=base_metadata.current_snapshot_id).validate(base_metadata) with pytest.raises( - ValueError, + CommitFailedException, match="Requirement failed: SnapshotRefType.BRANCH main was created concurrently", ): AssertRefSnapshotId(ref="main", snapshot_id=None).validate(base_metadata) with pytest.raises( - ValueError, + CommitFailedException, match="Requirement failed: SnapshotRefType.BRANCH main has changed: expected id 1, found 3055729675574597004", ): AssertRefSnapshotId(ref="main", snapshot_id=1).validate(base_metadata) with pytest.raises( - ValueError, + CommitFailedException, match="Requirement failed: branch or tag not_exist is missing, expected 1", ): AssertRefSnapshotId(ref="not_exist", snapshot_id=1).validate(base_metadata) @@ -778,7 +779,7 @@ def test_assert_last_assigned_field_id(table_v2: Table) -> None: AssertLastAssignedFieldId(last_assigned_field_id=base_metadata.last_column_id).validate(base_metadata) with pytest.raises( - ValueError, + CommitFailedException, match="Requirement failed: last assigned field id has changed: expected 1, found 3", ): AssertLastAssignedFieldId(last_assigned_field_id=1).validate(base_metadata) @@ -789,7 +790,7 @@ def test_assert_current_schema_id(table_v2: Table) -> None: AssertCurrentSchemaId(current_schema_id=base_metadata.current_schema_id).validate(base_metadata) with pytest.raises( - ValueError, + CommitFailedException, match="Requirement failed: current schema id has changed: expected 2, found 1", ): AssertCurrentSchemaId(current_schema_id=2).validate(base_metadata) @@ -800,7 +801,7 @@ def test_last_assigned_partition_id(table_v2: Table) -> None: 
AssertLastAssignedPartitionId(last_assigned_partition_id=base_metadata.last_partition_id).validate(base_metadata) with pytest.raises( - ValueError, + CommitFailedException, match="Requirement failed: last assigned partition id has changed: expected 1, found 1000", ): AssertLastAssignedPartitionId(last_assigned_partition_id=1).validate(base_metadata) @@ -811,7 +812,7 @@ def test_assert_default_spec_id(table_v2: Table) -> None: AssertDefaultSpecId(default_spec_id=base_metadata.default_spec_id).validate(base_metadata) with pytest.raises( - ValueError, + CommitFailedException, match="Requirement failed: default spec id has changed: expected 1, found 0", ): AssertDefaultSpecId(default_spec_id=1).validate(base_metadata) @@ -822,7 +823,7 @@ def test_assert_default_sort_order_id(table_v2: Table) -> None: AssertDefaultSortOrderId(default_sort_order_id=base_metadata.default_sort_order_id).validate(base_metadata) with pytest.raises( - ValueError, + CommitFailedException, match="Requirement failed: default sort order id has changed: expected 1, found 3", ): AssertDefaultSortOrderId(default_sort_order_id=1).validate(base_metadata) From 94cfc6914cd8aeb7c9a5bd453b0ee3d8e542c203 Mon Sep 17 00:00:00 2001 From: HonahX Date: Sun, 10 Dec 2023 15:28:52 -0800 Subject: [PATCH 16/27] add docstring --- pyiceberg/table/__init__.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 8aceb03b95..bf6102eeee 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -542,7 +542,14 @@ class TableRequirement(IcebergBaseModel): @abstractmethod def validate(self, base_metadata: TableMetadata) -> None: - """Validate the requirement against the base metadata.""" + """Validate the requirement against the base metadata. + + Args: + base_metadata: The base metadata to be validated against. + + Raises: + CommitFailedException: When the requirement is not met. + """ ... 
From 6d4efc870640f275007f94d8bbc9ac10c6fb7472 Mon Sep 17 00:00:00 2001 From: HonahX Date: Sun, 10 Dec 2023 17:34:09 -0800 Subject: [PATCH 17/27] use regex to parse the metadata version --- pyiceberg/catalog/__init__.py | 30 ++++++++++++++++++++---------- tests/catalog/test_glue.py | 1 - 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index a04dbae396..0e7b93e166 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -18,6 +18,7 @@ from __future__ import annotations import logging +import re import uuid from abc import ABC, abstractmethod from dataclasses import dataclass @@ -74,6 +75,8 @@ LOCATION = "location" EXTERNAL_TABLE = "EXTERNAL_TABLE" +TABLE_METADATA_FILE_NAME_REGEX = re.compile(r"""\d*-.*\.json""", re.X) + class CatalogType(Enum): REST = "rest" @@ -595,16 +598,23 @@ def _get_metadata_location(location: str, new_version: int = 0) -> str: @staticmethod def _parse_metadata_version(metadata_location: str) -> int: - try: - # Split the string by '/' and take the last part, then split by '-' and take the first part. - version_part = metadata_location.split('/')[-1].split('-')[0] - # Attempt to convert the version part to an integer. - return int(version_part) - except ValueError: - # Handle any ValueError that occurs if the conversion fails. - return -1 - except IndexError: - # Handle the case where the splits don't produce the expected number of parts. + """Parse the version from the metadata location. + + The version is the first part of the file name, before the first dash. + For example, the version of the metadata file + `s3://bucket/db/tb/metadata/00001-6c97e413-d51b-4538-ac70-12fe2a85cb83.metadata.json` + is 1. + + Args: + metadata_location (str): The location of the metadata file. + + Returns: + int: The version of the metadata file. 
-1 if the file name does not have valid version string + """ + file_name = metadata_location.split("/")[-1] + if TABLE_METADATA_FILE_NAME_REGEX.fullmatch(file_name): + return int(file_name.split("-")[0]) + else: return -1 def _get_updated_props_and_update_summary( diff --git a/tests/catalog/test_glue.py b/tests/catalog/test_glue.py index 2bcdc1570b..5c091d4617 100644 --- a/tests/catalog/test_glue.py +++ b/tests/catalog/test_glue.py @@ -554,7 +554,6 @@ def test_commit_table_update_schema( transaction.commit_transaction() updated_table_metadata = table.metadata - print(table.metadata_location) assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) assert test_catalog._parse_metadata_version(table.metadata_location) == 1 From 5efb1556b7b00c33e0cbff972ab76f79b70df6fa Mon Sep 17 00:00:00 2001 From: HonahX Date: Sun, 10 Dec 2023 17:51:32 -0800 Subject: [PATCH 18/27] fix lint issue --- pyiceberg/catalog/glue.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyiceberg/catalog/glue.py b/pyiceberg/catalog/glue.py index e5d3dc6348..ff763703d8 100644 --- a/pyiceberg/catalog/glue.py +++ b/pyiceberg/catalog/glue.py @@ -278,6 +278,7 @@ def _commit_table(self, table_request: CommitTableRequest) -> CommitTableRespons Raises: NoSuchTableError: If a table with the given identifier does not exist. + CommitFailedException: If the commit failed. 
""" identifier_tuple = self.identifier_to_tuple_without_catalog( tuple(table_request.identifier.namespace.root + [table_request.identifier.name]) @@ -287,7 +288,7 @@ def _commit_table(self, table_request: CommitTableRequest) -> CommitTableRespons current_glue_table = self._get_glue_table(database_name=database_name, table_name=table_name) glue_table_version_id = current_glue_table.get("VersionId") if glue_table_version_id is None: - raise ValueError(f"Cannot commit {database_name}.{table_name} because Glue table version id is missing") + raise CommitFailedException(f"Cannot commit {database_name}.{table_name} because Glue table version id is missing") current_table = self._convert_glue_to_iceberg(glue_table=current_glue_table) base_metadata = current_table.metadata From 413935e6f1a78df07655a69591bb5197ba498991 Mon Sep 17 00:00:00 2001 From: HonahX Date: Sun, 10 Dec 2023 19:37:14 -0800 Subject: [PATCH 19/27] fix CI issue --- pyiceberg/table/refs.py | 4 ++++ tests/table/test_init.py | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pyiceberg/table/refs.py b/pyiceberg/table/refs.py index 6f17880cac..df18fadd31 100644 --- a/pyiceberg/table/refs.py +++ b/pyiceberg/table/refs.py @@ -34,6 +34,10 @@ def __repr__(self) -> str: """Return the string representation of the SnapshotRefType class.""" return f"SnapshotRefType.{self.name}" + def __str__(self) -> str: + """Return the string representation of the SnapshotRefType class.""" + return self.value + class SnapshotRef(IcebergBaseModel): snapshot_id: int = Field(alias="snapshot-id") diff --git a/tests/table/test_init.py b/tests/table/test_init.py index c2f201134c..5beea50665 100644 --- a/tests/table/test_init.py +++ b/tests/table/test_init.py @@ -757,13 +757,13 @@ def test_assert_ref_snapshot_id(table_v2: Table) -> None: with pytest.raises( CommitFailedException, - match="Requirement failed: SnapshotRefType.BRANCH main was created concurrently", + match="Requirement failed: branch main was created 
concurrently", ): AssertRefSnapshotId(ref="main", snapshot_id=None).validate(base_metadata) with pytest.raises( CommitFailedException, - match="Requirement failed: SnapshotRefType.BRANCH main has changed: expected id 1, found 3055729675574597004", + match="Requirement failed: branch main has changed: expected id 1, found 3055729675574597004", ): AssertRefSnapshotId(ref="main", snapshot_id=1).validate(base_metadata) From e8666dc74379dc6c65b587b5d3885aa6e01fe774 Mon Sep 17 00:00:00 2001 From: HonahX Date: Sun, 10 Dec 2023 22:33:28 -0800 Subject: [PATCH 20/27] make base_metadata optional and add null check --- pyiceberg/table/__init__.py | 45 ++++++++++++++++++++++++------------- tests/table/test_init.py | 21 +++++++++++++++++ 2 files changed, 50 insertions(+), 16 deletions(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index bf6102eeee..e4ca71f0a1 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -541,7 +541,7 @@ class TableRequirement(IcebergBaseModel): type: str @abstractmethod - def validate(self, base_metadata: TableMetadata) -> None: + def validate(self, base_metadata: Optional[TableMetadata]) -> None: """Validate the requirement against the base metadata. 
Args: @@ -569,8 +569,10 @@ class AssertTableUUID(TableRequirement): type: Literal["assert-table-uuid"] = Field(default="assert-table-uuid") uuid: uuid.UUID - def validate(self, base_metadata: TableMetadata) -> None: - if self.uuid != base_metadata.table_uuid: + def validate(self, base_metadata: Optional[TableMetadata]) -> None: + if base_metadata is None: + raise CommitFailedException("Requirement failed: current table metadata is missing") + elif self.uuid != base_metadata.table_uuid: raise CommitFailedException(f"Table UUID does not match: {self.uuid} != {base_metadata.table_uuid}") @@ -584,9 +586,10 @@ class AssertRefSnapshotId(TableRequirement): ref: str snapshot_id: Optional[int] = Field(default=None, alias="snapshot-id") - def validate(self, base_metadata: TableMetadata) -> None: - snapshot_ref = base_metadata.refs.get(self.ref) - if snapshot_ref is not None: + def validate(self, base_metadata: Optional[TableMetadata]) -> None: + if base_metadata is None: + raise CommitFailedException("Requirement failed: current table metadata is missing") + elif snapshot_ref := base_metadata.refs.get(self.ref): ref_type = snapshot_ref.snapshot_ref_type if self.snapshot_id is None: raise CommitFailedException(f"Requirement failed: {ref_type} {self.ref} was created concurrently") @@ -604,8 +607,10 @@ class AssertLastAssignedFieldId(TableRequirement): type: Literal["assert-last-assigned-field-id"] = Field(default="assert-last-assigned-field-id") last_assigned_field_id: int = Field(..., alias="last-assigned-field-id") - def validate(self, base_metadata: TableMetadata) -> None: - if base_metadata.last_column_id != self.last_assigned_field_id: + def validate(self, base_metadata: Optional[TableMetadata]) -> None: + if base_metadata is None: + raise CommitFailedException("Requirement failed: current table metadata is missing") + elif base_metadata.last_column_id != self.last_assigned_field_id: raise CommitFailedException( f"Requirement failed: last assigned field id has changed: 
expected {self.last_assigned_field_id}, found {base_metadata.last_column_id}" ) @@ -617,8 +622,10 @@ class AssertCurrentSchemaId(TableRequirement): type: Literal["assert-current-schema-id"] = Field(default="assert-current-schema-id") current_schema_id: int = Field(..., alias="current-schema-id") - def validate(self, base_metadata: TableMetadata) -> None: - if self.current_schema_id != base_metadata.current_schema_id: + def validate(self, base_metadata: Optional[TableMetadata]) -> None: + if base_metadata is None: + raise CommitFailedException("Requirement failed: current table metadata is missing") + elif self.current_schema_id != base_metadata.current_schema_id: raise CommitFailedException( f"Requirement failed: current schema id has changed: expected {self.current_schema_id}, found {base_metadata.current_schema_id}" ) @@ -630,8 +637,10 @@ class AssertLastAssignedPartitionId(TableRequirement): type: Literal["assert-last-assigned-partition-id"] = Field(default="assert-last-assigned-partition-id") last_assigned_partition_id: int = Field(..., alias="last-assigned-partition-id") - def validate(self, base_metadata: TableMetadata) -> None: - if base_metadata.last_partition_id != self.last_assigned_partition_id: + def validate(self, base_metadata: Optional[TableMetadata]) -> None: + if base_metadata is None: + raise CommitFailedException("Requirement failed: current table metadata is missing") + elif base_metadata.last_partition_id != self.last_assigned_partition_id: raise CommitFailedException( f"Requirement failed: last assigned partition id has changed: expected {self.last_assigned_partition_id}, found {base_metadata.last_partition_id}" ) @@ -643,8 +652,10 @@ class AssertDefaultSpecId(TableRequirement): type: Literal["assert-default-spec-id"] = Field(default="assert-default-spec-id") default_spec_id: int = Field(..., alias="default-spec-id") - def validate(self, base_metadata: TableMetadata) -> None: - if self.default_spec_id != base_metadata.default_spec_id: + def 
validate(self, base_metadata: Optional[TableMetadata]) -> None: + if base_metadata is None: + raise CommitFailedException("Requirement failed: current table metadata is missing") + elif self.default_spec_id != base_metadata.default_spec_id: raise CommitFailedException( f"Requirement failed: default spec id has changed: expected {self.default_spec_id}, found {base_metadata.default_spec_id}" ) @@ -656,8 +667,10 @@ class AssertDefaultSortOrderId(TableRequirement): type: Literal["assert-default-sort-order-id"] = Field(default="assert-default-sort-order-id") default_sort_order_id: int = Field(..., alias="default-sort-order-id") - def validate(self, base_metadata: TableMetadata) -> None: - if self.default_sort_order_id != base_metadata.default_sort_order_id: + def validate(self, base_metadata: Optional[TableMetadata]) -> None: + if base_metadata is None: + raise CommitFailedException("Requirement failed: current table metadata is missing") + elif self.default_sort_order_id != base_metadata.default_sort_order_id: raise CommitFailedException( f"Requirement failed: default sort order id has changed: expected {self.default_sort_order_id}, found {base_metadata.default_sort_order_id}" ) diff --git a/tests/table/test_init.py b/tests/table/test_init.py index 5beea50665..04d467c318 100644 --- a/tests/table/test_init.py +++ b/tests/table/test_init.py @@ -744,6 +744,9 @@ def test_assert_table_uuid(table_v2: Table) -> None: base_metadata = table_v2.metadata AssertTableUUID(uuid=base_metadata.table_uuid).validate(base_metadata) + with pytest.raises(CommitFailedException, match="Requirement failed: current table metadata is missing"): + AssertTableUUID(uuid=uuid.UUID("9c12d441-03fe-4693-9a96-a0705ddf69c2")).validate(None) + with pytest.raises( CommitFailedException, match="Table UUID does not match: 9c12d441-03fe-4693-9a96-a0705ddf69c2 != 9c12d441-03fe-4693-9a96-a0705ddf69c1", @@ -755,6 +758,9 @@ def test_assert_ref_snapshot_id(table_v2: Table) -> None: base_metadata = 
table_v2.metadata AssertRefSnapshotId(ref="main", snapshot_id=base_metadata.current_snapshot_id).validate(base_metadata) + with pytest.raises(CommitFailedException, match="Requirement failed: current table metadata is missing"): + AssertRefSnapshotId(ref="main", snapshot_id=1).validate(None) + with pytest.raises( CommitFailedException, match="Requirement failed: branch main was created concurrently", @@ -778,6 +784,9 @@ def test_assert_last_assigned_field_id(table_v2: Table) -> None: base_metadata = table_v2.metadata AssertLastAssignedFieldId(last_assigned_field_id=base_metadata.last_column_id).validate(base_metadata) + with pytest.raises(CommitFailedException, match="Requirement failed: current table metadata is missing"): + AssertLastAssignedFieldId(last_assigned_field_id=1).validate(None) + with pytest.raises( CommitFailedException, match="Requirement failed: last assigned field id has changed: expected 1, found 3", @@ -789,6 +798,9 @@ def test_assert_current_schema_id(table_v2: Table) -> None: base_metadata = table_v2.metadata AssertCurrentSchemaId(current_schema_id=base_metadata.current_schema_id).validate(base_metadata) + with pytest.raises(CommitFailedException, match="Requirement failed: current table metadata is missing"): + AssertCurrentSchemaId(current_schema_id=1).validate(None) + with pytest.raises( CommitFailedException, match="Requirement failed: current schema id has changed: expected 2, found 1", @@ -800,6 +812,9 @@ def test_last_assigned_partition_id(table_v2: Table) -> None: base_metadata = table_v2.metadata AssertLastAssignedPartitionId(last_assigned_partition_id=base_metadata.last_partition_id).validate(base_metadata) + with pytest.raises(CommitFailedException, match="Requirement failed: current table metadata is missing"): + AssertLastAssignedPartitionId(last_assigned_partition_id=1).validate(None) + with pytest.raises( CommitFailedException, match="Requirement failed: last assigned partition id has changed: expected 1, found 1000", @@ -811,6 
+826,9 @@ def test_assert_default_spec_id(table_v2: Table) -> None: base_metadata = table_v2.metadata AssertDefaultSpecId(default_spec_id=base_metadata.default_spec_id).validate(base_metadata) + with pytest.raises(CommitFailedException, match="Requirement failed: current table metadata is missing"): + AssertDefaultSpecId(default_spec_id=1).validate(None) + with pytest.raises( CommitFailedException, match="Requirement failed: default spec id has changed: expected 1, found 0", @@ -822,6 +840,9 @@ def test_assert_default_sort_order_id(table_v2: Table) -> None: base_metadata = table_v2.metadata AssertDefaultSortOrderId(default_sort_order_id=base_metadata.default_sort_order_id).validate(base_metadata) + with pytest.raises(CommitFailedException, match="Requirement failed: current table metadata is missing"): + AssertDefaultSortOrderId(default_sort_order_id=1).validate(None) + with pytest.raises( CommitFailedException, match="Requirement failed: default sort order id has changed: expected 1, found 3", From 52ceaf8da0b91a9ecccd191f31c48beef3b92ab0 Mon Sep 17 00:00:00 2001 From: HonahX Date: Sun, 10 Dec 2023 22:42:07 -0800 Subject: [PATCH 21/27] make base_metadata optional and add null check --- pyiceberg/table/__init__.py | 45 ++++++++++++++++++++++++------------- tests/table/test_init.py | 21 +++++++++++++++++ 2 files changed, 50 insertions(+), 16 deletions(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index bf6102eeee..e4ca71f0a1 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -541,7 +541,7 @@ class TableRequirement(IcebergBaseModel): type: str @abstractmethod - def validate(self, base_metadata: TableMetadata) -> None: + def validate(self, base_metadata: Optional[TableMetadata]) -> None: """Validate the requirement against the base metadata. 
Args: @@ -569,8 +569,10 @@ class AssertTableUUID(TableRequirement): type: Literal["assert-table-uuid"] = Field(default="assert-table-uuid") uuid: uuid.UUID - def validate(self, base_metadata: TableMetadata) -> None: - if self.uuid != base_metadata.table_uuid: + def validate(self, base_metadata: Optional[TableMetadata]) -> None: + if base_metadata is None: + raise CommitFailedException("Requirement failed: current table metadata is missing") + elif self.uuid != base_metadata.table_uuid: raise CommitFailedException(f"Table UUID does not match: {self.uuid} != {base_metadata.table_uuid}") @@ -584,9 +586,10 @@ class AssertRefSnapshotId(TableRequirement): ref: str snapshot_id: Optional[int] = Field(default=None, alias="snapshot-id") - def validate(self, base_metadata: TableMetadata) -> None: - snapshot_ref = base_metadata.refs.get(self.ref) - if snapshot_ref is not None: + def validate(self, base_metadata: Optional[TableMetadata]) -> None: + if base_metadata is None: + raise CommitFailedException("Requirement failed: current table metadata is missing") + elif snapshot_ref := base_metadata.refs.get(self.ref): ref_type = snapshot_ref.snapshot_ref_type if self.snapshot_id is None: raise CommitFailedException(f"Requirement failed: {ref_type} {self.ref} was created concurrently") @@ -604,8 +607,10 @@ class AssertLastAssignedFieldId(TableRequirement): type: Literal["assert-last-assigned-field-id"] = Field(default="assert-last-assigned-field-id") last_assigned_field_id: int = Field(..., alias="last-assigned-field-id") - def validate(self, base_metadata: TableMetadata) -> None: - if base_metadata.last_column_id != self.last_assigned_field_id: + def validate(self, base_metadata: Optional[TableMetadata]) -> None: + if base_metadata is None: + raise CommitFailedException("Requirement failed: current table metadata is missing") + elif base_metadata.last_column_id != self.last_assigned_field_id: raise CommitFailedException( f"Requirement failed: last assigned field id has changed: 
expected {self.last_assigned_field_id}, found {base_metadata.last_column_id}" ) @@ -617,8 +622,10 @@ class AssertCurrentSchemaId(TableRequirement): type: Literal["assert-current-schema-id"] = Field(default="assert-current-schema-id") current_schema_id: int = Field(..., alias="current-schema-id") - def validate(self, base_metadata: TableMetadata) -> None: - if self.current_schema_id != base_metadata.current_schema_id: + def validate(self, base_metadata: Optional[TableMetadata]) -> None: + if base_metadata is None: + raise CommitFailedException("Requirement failed: current table metadata is missing") + elif self.current_schema_id != base_metadata.current_schema_id: raise CommitFailedException( f"Requirement failed: current schema id has changed: expected {self.current_schema_id}, found {base_metadata.current_schema_id}" ) @@ -630,8 +637,10 @@ class AssertLastAssignedPartitionId(TableRequirement): type: Literal["assert-last-assigned-partition-id"] = Field(default="assert-last-assigned-partition-id") last_assigned_partition_id: int = Field(..., alias="last-assigned-partition-id") - def validate(self, base_metadata: TableMetadata) -> None: - if base_metadata.last_partition_id != self.last_assigned_partition_id: + def validate(self, base_metadata: Optional[TableMetadata]) -> None: + if base_metadata is None: + raise CommitFailedException("Requirement failed: current table metadata is missing") + elif base_metadata.last_partition_id != self.last_assigned_partition_id: raise CommitFailedException( f"Requirement failed: last assigned partition id has changed: expected {self.last_assigned_partition_id}, found {base_metadata.last_partition_id}" ) @@ -643,8 +652,10 @@ class AssertDefaultSpecId(TableRequirement): type: Literal["assert-default-spec-id"] = Field(default="assert-default-spec-id") default_spec_id: int = Field(..., alias="default-spec-id") - def validate(self, base_metadata: TableMetadata) -> None: - if self.default_spec_id != base_metadata.default_spec_id: + def 
validate(self, base_metadata: Optional[TableMetadata]) -> None: + if base_metadata is None: + raise CommitFailedException("Requirement failed: current table metadata is missing") + elif self.default_spec_id != base_metadata.default_spec_id: raise CommitFailedException( f"Requirement failed: default spec id has changed: expected {self.default_spec_id}, found {base_metadata.default_spec_id}" ) @@ -656,8 +667,10 @@ class AssertDefaultSortOrderId(TableRequirement): type: Literal["assert-default-sort-order-id"] = Field(default="assert-default-sort-order-id") default_sort_order_id: int = Field(..., alias="default-sort-order-id") - def validate(self, base_metadata: TableMetadata) -> None: - if self.default_sort_order_id != base_metadata.default_sort_order_id: + def validate(self, base_metadata: Optional[TableMetadata]) -> None: + if base_metadata is None: + raise CommitFailedException("Requirement failed: current table metadata is missing") + elif self.default_sort_order_id != base_metadata.default_sort_order_id: raise CommitFailedException( f"Requirement failed: default sort order id has changed: expected {self.default_sort_order_id}, found {base_metadata.default_sort_order_id}" ) diff --git a/tests/table/test_init.py b/tests/table/test_init.py index 5beea50665..04d467c318 100644 --- a/tests/table/test_init.py +++ b/tests/table/test_init.py @@ -744,6 +744,9 @@ def test_assert_table_uuid(table_v2: Table) -> None: base_metadata = table_v2.metadata AssertTableUUID(uuid=base_metadata.table_uuid).validate(base_metadata) + with pytest.raises(CommitFailedException, match="Requirement failed: current table metadata is missing"): + AssertTableUUID(uuid=uuid.UUID("9c12d441-03fe-4693-9a96-a0705ddf69c2")).validate(None) + with pytest.raises( CommitFailedException, match="Table UUID does not match: 9c12d441-03fe-4693-9a96-a0705ddf69c2 != 9c12d441-03fe-4693-9a96-a0705ddf69c1", @@ -755,6 +758,9 @@ def test_assert_ref_snapshot_id(table_v2: Table) -> None: base_metadata = 
table_v2.metadata AssertRefSnapshotId(ref="main", snapshot_id=base_metadata.current_snapshot_id).validate(base_metadata) + with pytest.raises(CommitFailedException, match="Requirement failed: current table metadata is missing"): + AssertRefSnapshotId(ref="main", snapshot_id=1).validate(None) + with pytest.raises( CommitFailedException, match="Requirement failed: branch main was created concurrently", @@ -778,6 +784,9 @@ def test_assert_last_assigned_field_id(table_v2: Table) -> None: base_metadata = table_v2.metadata AssertLastAssignedFieldId(last_assigned_field_id=base_metadata.last_column_id).validate(base_metadata) + with pytest.raises(CommitFailedException, match="Requirement failed: current table metadata is missing"): + AssertLastAssignedFieldId(last_assigned_field_id=1).validate(None) + with pytest.raises( CommitFailedException, match="Requirement failed: last assigned field id has changed: expected 1, found 3", @@ -789,6 +798,9 @@ def test_assert_current_schema_id(table_v2: Table) -> None: base_metadata = table_v2.metadata AssertCurrentSchemaId(current_schema_id=base_metadata.current_schema_id).validate(base_metadata) + with pytest.raises(CommitFailedException, match="Requirement failed: current table metadata is missing"): + AssertCurrentSchemaId(current_schema_id=1).validate(None) + with pytest.raises( CommitFailedException, match="Requirement failed: current schema id has changed: expected 2, found 1", @@ -800,6 +812,9 @@ def test_last_assigned_partition_id(table_v2: Table) -> None: base_metadata = table_v2.metadata AssertLastAssignedPartitionId(last_assigned_partition_id=base_metadata.last_partition_id).validate(base_metadata) + with pytest.raises(CommitFailedException, match="Requirement failed: current table metadata is missing"): + AssertLastAssignedPartitionId(last_assigned_partition_id=1).validate(None) + with pytest.raises( CommitFailedException, match="Requirement failed: last assigned partition id has changed: expected 1, found 1000", @@ -811,6 
+826,9 @@ def test_assert_default_spec_id(table_v2: Table) -> None: base_metadata = table_v2.metadata AssertDefaultSpecId(default_spec_id=base_metadata.default_spec_id).validate(base_metadata) + with pytest.raises(CommitFailedException, match="Requirement failed: current table metadata is missing"): + AssertDefaultSpecId(default_spec_id=1).validate(None) + with pytest.raises( CommitFailedException, match="Requirement failed: default spec id has changed: expected 1, found 0", @@ -822,6 +840,9 @@ def test_assert_default_sort_order_id(table_v2: Table) -> None: base_metadata = table_v2.metadata AssertDefaultSortOrderId(default_sort_order_id=base_metadata.default_sort_order_id).validate(base_metadata) + with pytest.raises(CommitFailedException, match="Requirement failed: current table metadata is missing"): + AssertDefaultSortOrderId(default_sort_order_id=1).validate(None) + with pytest.raises( CommitFailedException, match="Requirement failed: default sort order id has changed: expected 1, found 3", From ccb787cf11871d6ed90ab82b6380a2a1f21c786e Mon Sep 17 00:00:00 2001 From: HonahX Date: Wed, 13 Dec 2023 23:13:38 -0800 Subject: [PATCH 22/27] add integration test --- tests/catalog/integration_test_glue.py | 29 ++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/catalog/integration_test_glue.py b/tests/catalog/integration_test_glue.py index 2689ef14d3..6f9a7ac998 100644 --- a/tests/catalog/integration_test_glue.py +++ b/tests/catalog/integration_test_glue.py @@ -31,6 +31,7 @@ TableAlreadyExistsError, ) from pyiceberg.schema import Schema +from pyiceberg.types import IntegerType from tests.conftest import clean_up, get_bucket_name, get_s3_path # The number of tables/databases used in list_table/namespace test @@ -261,3 +262,31 @@ def test_update_namespace_properties(test_catalog: Catalog, database_name: str) else: assert k in update_report.removed assert "updated test description" == 
test_catalog.load_namespace_properties(database_name)["comment"] + + +def test_commit_table_update_schema( + test_catalog: Catalog, table_schema_nested: Schema, database_name: str, table_name: str +) -> None: + identifier = (database_name, table_name) + test_catalog.create_namespace(namespace=database_name) + table = test_catalog.create_table(identifier, table_schema_nested) + original_table_metadata = table.metadata + + assert test_catalog._parse_metadata_version(table.metadata_location) == 0 + assert original_table_metadata.current_schema_id == 0 + + transaction = table.transaction() + update = transaction.update_schema() + update.add_column(path="b", field_type=IntegerType()) + update.commit() + transaction.commit_transaction() + + updated_table_metadata = table.metadata + + assert test_catalog._parse_metadata_version(table.metadata_location) == 1 + assert updated_table_metadata.current_schema_id == 1 + assert len(updated_table_metadata.schemas) == 2 + new_schema = next(schema for schema in updated_table_metadata.schemas if schema.schema_id == 1) + assert new_schema + assert new_schema == update._apply() + assert new_schema.find_field("b").field_type == IntegerType() From 5e78af9d4ca393358936502d46e024e6b4c7f391 Mon Sep 17 00:00:00 2001 From: HonahX Date: Fri, 15 Dec 2023 17:08:34 -0800 Subject: [PATCH 23/27] default skip-archive to true and comments --- pyiceberg/catalog/glue.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/pyiceberg/catalog/glue.py b/pyiceberg/catalog/glue.py index ff763703d8..f118d450fc 100644 --- a/pyiceberg/catalog/glue.py +++ b/pyiceberg/catalog/glue.py @@ -66,6 +66,13 @@ from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder from pyiceberg.typedef import EMPTY_DICT +# If Glue should skip archiving an old table version when creating a new version in a commit. 
By +# default, Glue archives all old table versions after an UpdateTable call, but Glue has a default +# max number of archived table versions (can be increased). So for streaming use case with lots +# of commits, it is recommended to set this value to true. +GLUE_SKIP_ARCHIVE = "glue.skip-archive" +GLUE_SKIP_ARCHIVE_DEFAULT = True + def _construct_parameters( metadata_location: str, glue_table: Optional[TableTypeDef] = None, prev_metadata_location: Optional[str] = None @@ -193,7 +200,12 @@ def _create_glue_table(self, database_name: str, table_name: str, table_input: T def _update_glue_table(self, database_name: str, table_name: str, table_input: TableInputTypeDef, version_id: str) -> None: try: - self.glue.update_table(DatabaseName=database_name, TableInput=table_input, VersionId=version_id) + self.glue.update_table( + DatabaseName=database_name, + TableInput=table_input, + SkipArchive=self.properties.get(GLUE_SKIP_ARCHIVE, GLUE_SKIP_ARCHIVE_DEFAULT), + VersionId=version_id, + ) except self.glue.exceptions.EntityNotFoundException as e: raise NoSuchTableError(f"Table does not exist: {database_name}.{table_name}") from e except self.glue.exceptions.ConcurrentModificationException as e: @@ -314,6 +326,8 @@ def _commit_table(self, table_request: CommitTableRequest) -> CommitTableRespons prev_metadata_location=current_table.metadata_location, ) + # Pass `version_id` to implement optimistic locking: it ensures updates are rejected if concurrent + # modifications occur. 
See more details at https://iceberg.apache.org/docs/latest/aws/#optimistic-locking self._update_glue_table( database_name=database_name, table_name=table_name, From 93f2cec6ec9afcfbee9e2ed8fbfad2318a3d0c76 Mon Sep 17 00:00:00 2001 From: HonahX Date: Fri, 15 Dec 2023 17:20:51 -0800 Subject: [PATCH 24/27] refactor tests --- tests/catalog/integration_test_glue.py | 4 ++++ tests/catalog/test_glue.py | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/catalog/integration_test_glue.py b/tests/catalog/integration_test_glue.py index 6f9a7ac998..99f0adac40 100644 --- a/tests/catalog/integration_test_glue.py +++ b/tests/catalog/integration_test_glue.py @@ -62,6 +62,7 @@ def test_create_table( assert table.identifier == (CATALOG_NAME,) + identifier metadata_location = table.metadata_location.split(get_bucket_name())[1][1:] s3.head_object(Bucket=get_bucket_name(), Key=metadata_location) + assert test_catalog._parse_metadata_version(table.metadata_location) == 0 def test_create_table_with_invalid_location(table_schema_nested: Schema, table_name: str, database_name: str) -> None: @@ -83,6 +84,7 @@ def test_create_table_with_default_location( assert table.identifier == (CATALOG_NAME,) + identifier metadata_location = table.metadata_location.split(get_bucket_name())[1][1:] s3.head_object(Bucket=get_bucket_name(), Key=metadata_location) + assert test_catalog._parse_metadata_version(table.metadata_location) == 0 def test_create_table_with_invalid_database(test_catalog: Catalog, table_schema_nested: Schema, table_name: str) -> None: @@ -106,6 +108,7 @@ def test_load_table(test_catalog: Catalog, table_schema_nested: Schema, table_na assert table.identifier == loaded_table.identifier assert table.metadata_location == loaded_table.metadata_location assert table.metadata == loaded_table.metadata + assert test_catalog._parse_metadata_version(table.metadata_location) == 0 def test_list_tables(test_catalog: Catalog, table_schema_nested: Schema, database_name: str, 
table_list: List[str]) -> None: @@ -127,6 +130,7 @@ def test_rename_table( new_table_name = f"rename-{table_name}" identifier = (database_name, table_name) table = test_catalog.create_table(identifier, table_schema_nested) + assert test_catalog._parse_metadata_version(table.metadata_location) == 0 assert table.identifier == (CATALOG_NAME,) + identifier new_identifier = (new_database_name, new_table_name) test_catalog.rename_table(identifier, new_identifier) diff --git a/tests/catalog/test_glue.py b/tests/catalog/test_glue.py index 5c091d4617..cb07b10dfa 100644 --- a/tests/catalog/test_glue.py +++ b/tests/catalog/test_glue.py @@ -199,7 +199,6 @@ def test_drop_table( table = test_catalog.load_table(identifier) assert table.identifier == (catalog_name,) + identifier assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) - assert test_catalog._parse_metadata_version(table.metadata_location) == 0 test_catalog.drop_table(identifier) with pytest.raises(NoSuchTableError): test_catalog.load_table(identifier) From 0d38337de5fab02a4fda7d2393a33337a56e3e33 Mon Sep 17 00:00:00 2001 From: HonahX Date: Wed, 20 Dec 2023 02:12:32 -0800 Subject: [PATCH 25/27] add doc and fix test after merge --- pyiceberg/catalog/__init__.py | 2 ++ tests/catalog/test_glue.py | 6 ++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index 3df8ed79e5..ac6fb4b6c3 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -604,6 +604,8 @@ def _parse_metadata_version(metadata_location: str) -> int: For example, the version of the metadata file `s3://bucket/db/tb/metadata/00001-6c97e413-d51b-4538-ac70-12fe2a85cb83.metadata.json` is 1. + If the path does not comply with the pattern, the version is defaulted to be -1, ensuring + that the next metadata file is treated as having version 0. Args: metadata_location (str): The location of the metadata file. 
diff --git a/tests/catalog/test_glue.py b/tests/catalog/test_glue.py index 54554b735c..2a33c8e23d 100644 --- a/tests/catalog/test_glue.py +++ b/tests/catalog/test_glue.py @@ -519,13 +519,11 @@ def test_passing_profile_name() -> None: @mock_glue def test_commit_table_update_schema( - _bucket_initialize: None, _patch_aiobotocore: None, table_schema_nested: Schema, database_name: str, table_name: str + _bucket_initialize: None, moto_endpoint_url: str, table_schema_nested: Schema, database_name: str, table_name: str ) -> None: catalog_name = "glue" identifier = (database_name, table_name) - test_catalog = GlueCatalog( - catalog_name, **{"py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO", "warehouse": f"s3://{BUCKET_NAME}"} - ) + test_catalog = GlueCatalog(catalog_name, **{"s3.endpoint": moto_endpoint_url, "warehouse": f"s3://{BUCKET_NAME}"}) test_catalog.create_namespace(namespace=database_name) table = test_catalog.create_table(identifier, table_schema_nested) original_table_metadata = table.metadata From dccad759c962c5152a765f19959afd4820d61f5d Mon Sep 17 00:00:00 2001 From: HonahX Date: Thu, 21 Dec 2023 18:25:34 -0800 Subject: [PATCH 26/27] make regex more robust, thanks Fokko! --- pyiceberg/catalog/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index ac6fb4b6c3..61e2bdd8f3 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -75,7 +75,7 @@ LOCATION = "location" EXTERNAL_TABLE = "EXTERNAL_TABLE" -TABLE_METADATA_FILE_NAME_REGEX = re.compile(r"""\d*-.*\.json""", re.X) +TABLE_METADATA_FILE_NAME_REGEX = re.compile(r"""(\d+)-.*\.json""", re.X) class CatalogType(Enum): @@ -614,8 +614,8 @@ def _parse_metadata_version(metadata_location: str) -> int: int: The version of the metadata file. 
-1 if the file name does not have valid version string """ file_name = metadata_location.split("/")[-1] - if TABLE_METADATA_FILE_NAME_REGEX.fullmatch(file_name): - return int(file_name.split("-")[0]) + if file_name_match := TABLE_METADATA_FILE_NAME_REGEX.fullmatch(file_name): + return int(file_name_match.group(1)) else: return -1 From bf06d26fcb2cd086f0a7f5025f7219f07c113767 Mon Sep 17 00:00:00 2001 From: HonahX Date: Mon, 1 Jan 2024 02:05:44 -0800 Subject: [PATCH 27/27] Fix review comments, thanks Patrick! --- pyiceberg/catalog/__init__.py | 17 +++++++++++++++-- pyiceberg/catalog/glue.py | 6 +++--- tests/conftest.py | 2 +- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index 61e2bdd8f3..4cb6c2ff4a 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -75,7 +75,16 @@ LOCATION = "location" EXTERNAL_TABLE = "EXTERNAL_TABLE" -TABLE_METADATA_FILE_NAME_REGEX = re.compile(r"""(\d+)-.*\.json""", re.X) +TABLE_METADATA_FILE_NAME_REGEX = re.compile( + r""" + (\d+) # version number + - # separator + ([\w-]{36}) # UUID (36 characters, including hyphens) + (?:\.\w+)? 
# optional codec name + \.metadata\.json # file extension + """, + re.X, +) class CatalogType(Enum): @@ -592,7 +601,7 @@ def _write_metadata(metadata: TableMetadata, io: FileIO, metadata_path: str) -> @staticmethod def _get_metadata_location(location: str, new_version: int = 0) -> str: if new_version < 0: - raise ValueError(f"Table metadata version: {new_version} cannot be negative") + raise ValueError(f"Table metadata version: `{new_version}` must be a non-negative integer") version_str = f"{new_version:05d}" return f"{location}/metadata/{version_str}-{uuid.uuid4()}.metadata.json" @@ -615,6 +624,10 @@ def _parse_metadata_version(metadata_location: str) -> int: """ file_name = metadata_location.split("/")[-1] if file_name_match := TABLE_METADATA_FILE_NAME_REGEX.fullmatch(file_name): + try: + uuid.UUID(file_name_match.group(2)) + except ValueError: + return -1 return int(file_name_match.group(1)) else: return -1 diff --git a/pyiceberg/catalog/glue.py b/pyiceberg/catalog/glue.py index f118d450fc..6cf9462b71 100644 --- a/pyiceberg/catalog/glue.py +++ b/pyiceberg/catalog/glue.py @@ -207,10 +207,10 @@ def _update_glue_table(self, database_name: str, table_name: str, table_input: T VersionId=version_id, ) except self.glue.exceptions.EntityNotFoundException as e: - raise NoSuchTableError(f"Table does not exist: {database_name}.{table_name}") from e + raise NoSuchTableError(f"Table does not exist: {database_name}.{table_name} (Glue table version {version_id})") from e except self.glue.exceptions.ConcurrentModificationException as e: raise CommitFailedException( - f"Cannot commit {database_name}.{table_name} because Glue detected concurrent update" + f"Cannot commit {database_name}.{table_name} because Glue detected concurrent update to table version {version_id}" ) from e def _get_glue_table(self, database_name: str, table_name: str) -> TableTypeDef: @@ -299,7 +299,7 @@ def _commit_table(self, table_request: CommitTableRequest) -> CommitTableRespons current_glue_table = 
self._get_glue_table(database_name=database_name, table_name=table_name) glue_table_version_id = current_glue_table.get("VersionId") - if glue_table_version_id is None: + if not glue_table_version_id: raise CommitFailedException(f"Cannot commit {database_name}.{table_name} because Glue table version id is missing") current_table = self._convert_glue_to_iceberg(glue_table=current_glue_table) base_metadata = current_table.metadata diff --git a/tests/conftest.py b/tests/conftest.py index 64ace326ec..e54f1f54d6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1586,7 +1586,7 @@ def fixture_aws_credentials() -> Generator[None, None, None]: os.environ.pop("AWS_DEFAULT_REGION") -MOTO_SERVER = ThreadedMotoServer(port=5000) +MOTO_SERVER = ThreadedMotoServer(ip_address="localhost", port=5000) def pytest_sessionfinish(