Write support #41

Merged (71 commits, Jan 18, 2024)

Changes shown are from 42 of the 71 commits.

Commits
7133054
WIP: Write
Fokko Oct 3, 2023
ffecf72
Add logic to generate a new snapshot-id
Fokko Oct 3, 2023
25eb597
WIP
Fokko Oct 4, 2023
4cd493e
Merge branch 'main' of github.com:apache/iceberg-python into fd-write
Fokko Oct 5, 2023
a726b1d
Construct a writer tree
Fokko Oct 4, 2023
b88f736
WIP
Fokko Oct 5, 2023
f53626d
WIP
Fokko Oct 6, 2023
0c665ef
Merge branch 'main' of github.com:apache/iceberg-python into fd-write
Fokko Oct 9, 2023
3f79dbd
Fix linting
Fokko Oct 9, 2023
02430bb
Make the tests pass
Fokko Oct 9, 2023
eb4dd62
Merge branch 'main' of github.com:apache/iceberg-python into fd-write
Fokko Oct 10, 2023
c891382
Add support for V2
Fokko Oct 10, 2023
aae5a57
pre-commit
Fokko Oct 10, 2023
cff3a1d
Move things outside of pyarrow.py
Fokko Oct 10, 2023
082387e
Append WIP
Fokko Oct 11, 2023
997b673
Merge branch 'main' of github.com:apache/iceberg-python into fd-write
Fokko Oct 11, 2023
9d52906
WIP
Fokko Oct 11, 2023
8893cf3
Merge branch 'main' of github.com:apache/iceberg-python into fd-write
Fokko Oct 11, 2023
55f27c9
Add v1 to v2 promotion tests
Fokko Oct 11, 2023
9a0096b
Add _MergeSnapshots
Fokko Oct 11, 2023
4f5b710
Work on the Summary
Fokko Oct 11, 2023
926d947
Add tests
Fokko Oct 12, 2023
50575a8
Add Snapshot logic and Summary generation
Fokko Oct 12, 2023
5482ae0
WIP
Fokko Oct 13, 2023
2fa01f4
WIP
Fokko Oct 13, 2023
580c824
Cleanup
Fokko Oct 13, 2023
254d7e8
Merge branch 'main' of github.com:apache/iceberg-python into fd-write
Fokko Oct 15, 2023
f4ae6c5
Merge branch 'main' of github.com:apache/iceberg-python into fd-write
Fokko Oct 16, 2023
760c0d4
Merge branch 'main' of github.com:apache/iceberg-python into fd-snaps…
Fokko Oct 23, 2023
3dba41a
Refactor it a bit
Fokko Oct 23, 2023
bcc5176
Cleanup
Fokko Oct 23, 2023
6d5fbb1
Merge branch 'main' of github.com:apache/iceberg-python into fd-write
Fokko Oct 23, 2023
3309129
Merge branch 'main' of github.com:apache/iceberg-python into fd-snaps…
Fokko Oct 25, 2023
12c4699
Comments
Fokko Oct 25, 2023
8ef1a06
Merge branch 'fd-snapshots' of github.com:Fokko/iceberg-python into f…
Fokko Oct 25, 2023
aabfb09
Cleanup
Fokko Oct 25, 2023
149c3ec
Merge branch 'main' of github.com:apache/iceberg-python into fd-write
Fokko Dec 9, 2023
17fd689
WIP
Fokko Dec 9, 2023
54e36ab
Update poetry
Fokko Dec 9, 2023
ab36ec3
Merge branch 'main' of github.com:apache/iceberg-python into fd-write
Fokko Dec 12, 2023
d6df342
Cleanup
Fokko Dec 12, 2023
1398a2f
Update error
Fokko Dec 12, 2023
c426068
Merge branch 'main' of github.com:apache/iceberg-python into fd-write
Fokko Dec 18, 2023
1861647
WIP
Fokko Dec 18, 2023
cebc781
Make the CI happy
Fokko Dec 18, 2023
4d0d11c
Merge branch 'main' of github.com:apache/iceberg-python into fd-write
Fokko Dec 18, 2023
abda552
Cleanup
Fokko Dec 18, 2023
3cd5829
Merge branch 'main' of github.com:apache/iceberg-python into fd-write
Fokko Dec 19, 2023
5f86b15
fix
Fokko Dec 19, 2023
5044da6
Merge branch 'main' of github.com:apache/iceberg-python into fd-write
Fokko Dec 20, 2023
e020efb
Fix test
Fokko Dec 20, 2023
0b42471
Merge branch 'main' into fd-write
Fokko Dec 21, 2023
a41abd0
Merge branch 'main' of github.com:apache/iceberg-python into fd-write
Fokko Dec 26, 2023
286cf47
Thanks Amogh!
Fokko Dec 26, 2023
559618c
Merge branch 'fd-write' of github.com:Fokko/iceberg-python into fd-write
Fokko Dec 26, 2023
4153e78
Merge branch 'main' of github.com:apache/iceberg-python into fd-write
Fokko Jan 3, 2024
54e75d6
Make the CI happy
Fokko Jan 3, 2024
158077c
Merge branch 'main' of github.com:apache/iceberg-python into fd-write
Fokko Jan 11, 2024
bbc0b35
Update lint
Fokko Jan 11, 2024
d441af9
Merge branch 'main' of github.com:apache/iceberg-python into fd-write
Fokko Jan 12, 2024
e395a8f
Update pyiceberg/table/snapshots.py
Fokko Jan 12, 2024
2a65357
Comments and fixing some bugs
Fokko Jan 15, 2024
a013f35
Merge branch 'fd-write' of github.com:Fokko/iceberg-python into fd-write
Fokko Jan 15, 2024
abc0741
Remove doc
Fokko Jan 15, 2024
b817a15
Fix the tests
Fokko Jan 15, 2024
48ba852
Refactor
Fokko Jan 15, 2024
664e113
Move to fast-appends
Fokko Jan 16, 2024
85ac0eb
Merge branch 'main' of github.com:apache/iceberg-python into fd-write
Fokko Jan 17, 2024
7e8c04f
Merge branch 'main' of github.com:apache/iceberg-python into fd-write
Fokko Jan 17, 2024
7baf3ec
keep track of deleted files
Fokko Jan 17, 2024
ab020b9
Comments
Fokko Jan 18, 2024
36 changes: 32 additions & 4 deletions poetry.lock


60 changes: 50 additions & 10 deletions pyiceberg/io/pyarrow.py
@@ -103,7 +103,11 @@
OutputFile,
OutputStream,
)
from pyiceberg.manifest import DataFile, FileFormat
from pyiceberg.manifest import (
DataFile,
DataFileContent,
FileFormat,
)
from pyiceberg.schema import (
PartnerAccessor,
PreOrderSchemaVisitor,
@@ -117,8 +121,9 @@
visit,
visit_with_partner,
)
from pyiceberg.table import WriteTask, _generate_datafile_filename
from pyiceberg.transforms import TruncateTransform
from pyiceberg.typedef import EMPTY_DICT, Properties
from pyiceberg.typedef import EMPTY_DICT, Properties, Record
from pyiceberg.types import (
BinaryType,
BooleanType,
@@ -1445,16 +1450,13 @@ def parquet_path_to_id_mapping(
def fill_parquet_file_metadata(
df: DataFile,
parquet_metadata: pq.FileMetaData,
file_size: int,
stats_columns: Dict[int, StatisticsCollector],
parquet_column_mapping: Dict[str, int],
) -> None:
"""
Compute and fill the following fields of the DataFile object.

- file_format
- record_count
- file_size_in_bytes
- column_sizes
- value_counts
- null_value_counts
@@ -1466,9 +1468,6 @@ def fill_parquet_file_metadata(
Args:
df (DataFile): A DataFile object representing the Parquet file for which metadata is to be filled.
parquet_metadata (pyarrow.parquet.FileMetaData): A pyarrow metadata object.
file_size (int): The total compressed file size cannot be retrieved from the metadata and hence has to
be passed here. Depending on the kind of file system and pyarrow library call used, different
ways to obtain this value might be appropriate.
stats_columns (Dict[int, StatisticsCollector]): The statistics gathering plan. It is required to
set the mode for column metrics collection
"""
@@ -1565,13 +1564,54 @@ def fill_parquet_file_metadata(
del upper_bounds[field_id]
del null_value_counts[field_id]

df.file_format = FileFormat.PARQUET
df.record_count = parquet_metadata.num_rows
df.file_size_in_bytes = file_size
df.column_sizes = column_sizes
df.value_counts = value_counts
df.null_value_counts = null_value_counts
df.nan_value_counts = nan_value_counts
df.lower_bounds = lower_bounds
df.upper_bounds = upper_bounds
df.split_offsets = split_offsets


def write_file(table: Table, tasks: Iterator[WriteTask]) -> Iterator[DataFile]:
task = next(tasks)

try:
_ = next(tasks)
# If there are more tasks, raise an exception
raise ValueError("Only unpartitioned writes are supported: https://github.com/apache/iceberg-python/issues/208")
except StopIteration:
pass

df = task.df

file_path = f'{table.location()}/data/{_generate_datafile_filename("parquet")}'

Contributor:
Can the _generate_datafile_filename method be a bit more robust? The UUID in the filename is not for uniqueness, it is to identify files from the same write. Here's a breakdown of the format used by Spark:

\d\d\d\d\d-\d-(UUID)-\d.(extension)
^^^^^^^^^^ ^^        ^^
|||||||||| ||        counter to distinguish files from the same task
|||||||||| unique task ID from the process (never repeats even for tasks that are retried)
  5-digit "partition number" that is the ordinal of a task within a Spark stage

Uniqueness is a combination of the task ID, the write-specific UUID, and the file counter for the task. The partition ordinal is used to preserve locality in file names.

It would be nice to expose some of these options in the WriteTask, like a unique UUID, unique task ID, and part ordinal. Then the counter would be handled locally.

Contributor Author (Fokko):
Keep in mind that we write a single file in the first iteration. I've moved the uuid and task-id to the WriteTask.

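A minimal sketch of that naming scheme, assuming a hypothetical WriteTask-like object that carries a per-write UUID and a task ID (WriteTaskSketch, write_uuid, task_id, partition_ordinal and counter are illustrative names, not the actual PyIceberg API):

import uuid
from dataclasses import dataclass


@dataclass
class WriteTaskSketch:
    write_uuid: uuid.UUID       # identifies all files produced by the same write
    task_id: int                # unique task ID within the process
    partition_ordinal: int = 0  # ordinal of the task, preserves locality in file names
    counter: int = 0            # distinguishes files written by the same task

    def generate_data_file_filename(self, extension: str) -> str:
        # Mirrors the Spark-style pattern: 00000-0-<write-uuid>-0.parquet
        name = f"{self.partition_ordinal:05d}-{self.task_id}-{self.write_uuid}-{self.counter}.{extension}"
        self.counter += 1
        return name


task = WriteTaskSketch(write_uuid=uuid.uuid4(), task_id=0)
print(task.generate_data_file_filename("parquet"))  # e.g. 00000-0-<uuid>-0.parquet
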
file_schema = schema_to_pyarrow(table.schema())

Contributor (@jqin61):
Hi Fokko! I am working with @syun64 to test out the upcoming write feature. During the test, we realized the field IDs are not being set in the written Parquet file.
To help illustrate this, I put together a diff against your working branch.

With the current behavior, the field IDs are not written correctly; the Parquet schema looks like:

<pyarrow._parquet.ParquetSchema object at 0x11c40c880>
required group field_id=-1 schema {
  optional binary field_id=-1 id (String);
  optional int32 field_id=-1 date (Date);
}

and after using a different metadata key for the field ID in the Arrow schema, the written Parquet schema looks like:

<pyarrow._parquet.ParquetSchema object at 0x11c40c880>
required group field_id=-1 schema {
  optional binary field_id=1 id (String);
  optional int32 field_id=2 date (Date);
}

We feel this is a peculiarity of pyarrow.parquet.ParquetWriter: the field IDs need to be defined in the metadata of the pyarrow schema under a specific key, "PARQUET:field_id", instead of plain "field_id".
Do you think we should use a different pyarrow schema when we write the Parquet file, with the metadata key prefixed with 'PARQUET:'?

Contributor Author (Fokko):
Thanks @jqin61 for testing this, as it is paramount that the field IDs are written properly. I'm able to reproduce this locally:

 <pyarrow._parquet.ParquetSchema object at 0x138782440>
required group field_id=-1 schema {
  optional double field_id=-1 lat;
  optional double field_id=-1 long;
}

After changing this to PARQUET:field_id, it is indeed fixed:

parq ~/Desktop/00000-0-f4a20311-0574-4d24-8b8e-2cdf747581af-0.parquet --schema 

 # Schema 
 <pyarrow._parquet.ParquetSchema object at 0x12087e340>
required group field_id=-1 schema {
  optional double field_id=1 lat;
  optional double field_id=2 long;
}

Thanks for flagging this!

Comment (@robtandy):
If I have

df = pa.Table.from_pylist([{'a':"hello"}, {'a':"world"}])

Should I expect df to have a pa.Schema that can be converted with pyarrow_to_schema, even the modified one presented in the diff above? I wasn't able to get either branch to work, since the schema of df above has no metadata for its fields.

Contributor Author (Fokko):
Hey @robtandy, thanks for chiming in here. I think the PyArrow schema should also include the field-id metadata. When you create a new table, it should re-assign the field IDs if they are missing.

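A small standalone check of the behaviour discussed above, using only stock PyArrow (the output path is arbitrary): the Parquet writer only emits field IDs when the Arrow field metadata uses the PARQUET:field_id key, so a sketch of the fix looks like this.

import pyarrow as pa
import pyarrow.parquet as pq

# Attach the field IDs under the metadata key PyArrow recognises when writing Parquet.
schema = pa.schema([
    pa.field("lat", pa.float64(), metadata={b"PARQUET:field_id": b"1"}),
    pa.field("long", pa.float64(), metadata={b"PARQUET:field_id": b"2"}),
])
tbl = pa.Table.from_pylist([{"lat": 52.37, "long": 4.89}], schema=schema)
pq.write_table(tbl, "/tmp/field_id_check.parquet")

# The printed Parquet schema should now show field_id=1 and field_id=2 instead of -1.
print(pq.ParquetFile("/tmp/field_id_check.parquet").schema)
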

collected_metrics: List[pq.FileMetaData] = []
fo = table.io.new_output(file_path)
with fo.create() as fos:
with pq.ParquetWriter(fos, schema=file_schema, version="1.0", metadata_collector=collected_metrics) as writer:
writer.write_table(df)

df = DataFile(

Contributor:
It's confusing that df is used for both the input dataframe and the output data file.

Contributor Author (Fokko):
I agree, we should not do that 👍

content=DataFileContent.DATA,
file_path=file_path,
file_format=FileFormat.PARQUET,
partition=Record(),
record_count=len(df),
file_size_in_bytes=len(fo),

Contributor:
This should also come from the write if possible, so we don't have an S3 request here.

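One way to avoid that extra request, sketched with plain PyArrow rather than the PR's IO classes: when the Parquet writer is handed an already-open stream, the stream position after the writer flushes its footer equals the compressed file size (this assumes the concrete output stream exposes tell(), which ordinary files and PyArrow's native streams do).

import pyarrow as pa
import pyarrow.parquet as pq

tbl = pa.Table.from_pylist([{"lat": 52.37, "long": 4.89}])

with open("/tmp/size_check.parquet", "wb") as fos:
    with pq.ParquetWriter(fos, schema=tbl.schema) as writer:
        writer.write_table(tbl)
    # The writer has written the footer but has not closed the stream we handed it,
    # so the current position is the total number of compressed bytes.
    file_size_in_bytes = fos.tell()

print(file_size_in_bytes)
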
# Just copy these from the table for now
sort_order_id=table.sort_order().order_id,

Contributor:
This isn't the sort order of the table at the time the file was written; it is the sort order that was used to write the records in the data file. If we can't guarantee that the records are in this order, we should not apply this metadata.

Contributor:
This should come from the caller (write task?) and default to None.

spec_id=table.spec().spec_id,

Contributor:
Since this is an unpartitioned write, we need to ensure that this is the unpartitioned spec in the table.

Contributor Author (Fokko):
We check if the partition spec is empty:

if len(self.spec().fields) > 0:
    raise ValueError("Currently only unpartitioned tables are supported")

Contributor (@rdblue, Jan 15, 2024):
Since write_file is a public method, we can't guarantee that the caller did this check. I agree that it is safe when called from append or overwrite, but a caller could use this method directly to create a data file for a partitioned table, right?

Wouldn't it be easy to just pass the spec ID and partition tuple (an empty Record) through WriteTask for now? I think it would make sense if a WriteTask were for a single partition.

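A sketch of that suggestion, with hypothetical field names: each WriteTask describes a single partition and carries the spec ID, the partition tuple (an empty Record for unpartitioned writes), and an optional sort order ID, so write_file would not have to read any of these from table-level state.

from dataclasses import dataclass
from typing import Optional

from pyiceberg.typedef import Record


@dataclass(frozen=True)
class PartitionedWriteTaskSketch:
    spec_id: int                         # spec that the produced data file belongs to
    partition: Record                    # Record() for an unpartitioned spec
    sort_order_id: Optional[int] = None  # only set when the rows were written in that order

The DataFile could then be built from task.spec_id, task.partition, and task.sort_order_id instead of table.spec() and table.sort_order(), which also covers the sort-order comment above.
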
equality_ids=table.schema().identifier_field_ids,

Contributor:
Data files are never written with equality_ids. That is only used for equality delete files. It should always be null for data files.

Contributor Author (Fokko):
Ah, I messed up here; I thought it was referring to the identifier_field_ids.

key_metadata=None,
)
fill_parquet_file_metadata(
df=df,
parquet_metadata=collected_metrics[0],
stats_columns=compute_statistics_plan(table.schema(), table.properties),
parquet_column_mapping=parquet_path_to_id_mapping(table.schema()),
)
return iter([df])
35 changes: 23 additions & 12 deletions pyiceberg/manifest.py
@@ -37,7 +37,7 @@
from pyiceberg.io import FileIO, InputFile, OutputFile
from pyiceberg.partitioning import PartitionSpec
from pyiceberg.schema import Schema
from pyiceberg.typedef import Record
from pyiceberg.typedef import EMPTY_DICT, Record
from pyiceberg.types import (
BinaryType,
BooleanType,
@@ -60,6 +60,8 @@
DEFAULT_BLOCK_SIZE = 67108864 # 64 * 1024 * 1024
DEFAULT_READ_VERSION: Literal[2] = 2

INITIAL_SEQUENCE_NUMBER = 0


class DataFileContent(int, Enum):
DATA = 0
@@ -504,7 +506,7 @@ def construct_partition_summaries(spec: PartitionSpec, schema: Schema, partition
NestedField(517, "content", IntegerType(), required=False, initial_default=ManifestContent.DATA),
NestedField(515, "sequence_number", LongType(), required=False, initial_default=0),
NestedField(516, "min_sequence_number", LongType(), required=False, initial_default=0),
NestedField(503, "added_snapshot_id", LongType(), required=False),
NestedField(503, "added_snapshot_id", LongType(), required=True),
NestedField(504, "added_files_count", IntegerType(), required=False),
NestedField(505, "existing_files_count", IntegerType(), required=False),
NestedField(506, "deleted_files_count", IntegerType(), required=False),
@@ -517,6 +519,7 @@

MANIFEST_FILE_SCHEMA_STRUCT = MANIFEST_FILE_SCHEMA.as_struct()


POSITIONAL_DELETE_SCHEMA = Schema(
NestedField(2147483546, "file_path", StringType()), NestedField(2147483545, "pos", IntegerType())
)
@@ -665,7 +668,9 @@ class ManifestWriter(ABC):
_min_data_sequence_number: Optional[int]
_partitions: List[Record]

def __init__(self, spec: PartitionSpec, schema: Schema, output_file: OutputFile, snapshot_id: int, meta: Dict[str, str]):
def __init__(
self, spec: PartitionSpec, schema: Schema, output_file: OutputFile, snapshot_id: int, meta: Dict[str, str] = EMPTY_DICT
) -> None:
self.closed = False
self._spec = spec
self._schema = schema
@@ -746,7 +751,7 @@ def to_manifest_file(self) -> ManifestFile:
existing_rows_count=self._existing_rows,
deleted_rows_count=self._deleted_rows,
partitions=construct_partition_summaries(self._spec, self._schema, self._partitions),
key_metadatas=None,
key_metadata=None,
)

def add_entry(self, entry: ManifestEntry) -> ManifestWriter:
@@ -851,7 +856,7 @@ class ManifestListWriter(ABC):
_commit_snapshot_id: int
_writer: AvroOutputFile[ManifestFile]

def __init__(self, output_file: OutputFile, meta: Dict[str, str]):
def __init__(self, output_file: OutputFile, meta: Dict[str, Any]):
self._output_file = output_file
self._meta = meta
self._manifest_files = []
@@ -884,7 +889,7 @@ def add_manifests(self, manifest_files: List[ManifestFile]) -> ManifestListWrite


class ManifestListWriterV1(ManifestListWriter):
def __init__(self, output_file: OutputFile, snapshot_id: int, parent_snapshot_id: int):
def __init__(self, output_file: OutputFile, snapshot_id: int, parent_snapshot_id: Optional[int]):
super().__init__(
output_file, {"snapshot-id": str(snapshot_id), "parent-snapshot-id": str(parent_snapshot_id), "format-version": "1"}
)
@@ -897,9 +902,11 @@ def prepare_manifest(self, manifest_file: ManifestFile) -> ManifestFile:

class ManifestListWriterV2(ManifestListWriter):
_commit_snapshot_id: int
_sequence_number: int
_sequence_number: Optional[int]

Contributor:
I think this is required for the manifest list. It is always known when we write the manifest list, because manifest lists are written for every commit attempt (after the sequence number has been updated).

Contributor Author (Fokko):
I agree, let me check!


def __init__(self, output_file: OutputFile, snapshot_id: int, parent_snapshot_id: int, sequence_number: int):
def __init__(
self, output_file: OutputFile, snapshot_id: int, parent_snapshot_id: Optional[int], sequence_number: Optional[int]
):
super().__init__(
output_file,
{
@@ -920,9 +927,9 @@ def prepare_manifest(self, manifest_file: ManifestFile) -> ManifestFile:
# To validate this, check that the snapshot id matches the current commit
if self._commit_snapshot_id != wrapped_manifest_file.added_snapshot_id:
raise ValueError(
f"Found unassigned sequence number for a manifest from snapshot: {wrapped_manifest_file.added_snapshot_id}"
f"Found unassigned sequence number for a manifest from snapshot: {self._commit_snapshot_id} != {wrapped_manifest_file.added_snapshot_id}"
)
wrapped_manifest_file.sequence_number = self._sequence_number
wrapped_manifest_file.sequence_number = self._sequence_number or INITIAL_SEQUENCE_NUMBER

Contributor:
I don't think this is correct because the sequence number should always be passed in.


if wrapped_manifest_file.min_sequence_number == UNASSIGNED_SEQ:
if self._commit_snapshot_id != wrapped_manifest_file.added_snapshot_id:
@@ -931,12 +938,16 @@ def prepare_manifest(self, manifest_file: ManifestFile) -> ManifestFile:
)
# if the min sequence number is not determined, then there was no assigned sequence number for any file
# written to the wrapped manifest. Replace the unassigned sequence number with the one for this commit
wrapped_manifest_file.min_sequence_number = self._sequence_number
wrapped_manifest_file.min_sequence_number = self._sequence_number or INITIAL_SEQUENCE_NUMBER
return wrapped_manifest_file


def write_manifest_list(
format_version: Literal[1, 2], output_file: OutputFile, snapshot_id: int, parent_snapshot_id: int, sequence_number: int
format_version: Literal[1, 2],
output_file: OutputFile,
snapshot_id: int,
parent_snapshot_id: Optional[int],
sequence_number: Optional[int],
) -> ManifestListWriter:
if format_version == 1:
return ManifestListWriterV1(output_file, snapshot_id, parent_snapshot_id)