From 500da555e078fde3d80c7a09f188ac32d69c540e Mon Sep 17 00:00:00 2001
From: Fokko
Date: Wed, 30 Oct 2024 10:27:03 +0100
Subject: [PATCH 1/2] Remove numpy as a hard dependency

With Arrow 18.0.0 numpy is not a dependency anymore:

https://github.com/apache/arrow/pull/44148

I think it would be good to also remove it from PyIceberg
---
 pyiceberg/io/pyarrow.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index aa27796081..fa96ab801b 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -57,7 +57,6 @@
 )
 from urllib.parse import urlparse
 
-import numpy as np
 import pyarrow as pa
 import pyarrow.compute as pc
 import pyarrow.dataset as ds
@@ -812,7 +811,15 @@ def _combine_positional_deletes(positional_deletes: List[pa.ChunkedArray], start
         all_chunks = positional_deletes[0]
     else:
         all_chunks = pa.chunked_array(itertools.chain(*[arr.chunks for arr in positional_deletes]))
-    return np.subtract(np.setdiff1d(np.arange(start_index, end_index), all_chunks, assume_unique=False), start_index)
+
+    # Create the full range array with pyarrow
+    full_range = pa.array(range(start_index, end_index))
+
+    # Filter out values in all_chunks from full_range
+    result = pc.filter(full_range, pc.invert(pc.is_in(full_range, value_set=all_chunks)))
+
+    # Subtract the start_index from each element in the result
+    return pc.subtract(result, pa.scalar(start_index))
 
 
 def pyarrow_to_schema(

From d6bb049eefba11463e6d519cef8ffd8a9b518d7a Mon Sep 17 00:00:00 2001
From: Fokko Driesprong
Date: Wed, 30 Oct 2024 19:21:59 +0100
Subject: [PATCH 2/2] Add link to issue

---
 pyiceberg/io/pyarrow.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index fa96ab801b..7f95815ee7 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -814,6 +814,8 @@ def _combine_positional_deletes(positional_deletes: List[pa.ChunkedArray], start
 
     # Create the full range array with pyarrow
     full_range = pa.array(range(start_index, end_index))
+    # When available, replace with Arrow generator to improve performance
+    # See https://github.com/apache/iceberg-python/issues/1271 for details
 
     # Filter out values in all_chunks from full_range
     result = pc.filter(full_range, pc.invert(pc.is_in(full_range, value_set=all_chunks)))