From dc89c13cf5c0ae87383fc29a8c2f9925fbff81c9 Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Sat, 19 Oct 2024 08:25:00 -0700 Subject: [PATCH 1/4] feature(array): implement Array.append changes the Array.resize to be an inplace operation --- src/zarr/core/array.py | 138 +++++++++++++++++--- tests/v3/test_array.py | 190 ++++++++++++++++++++++++++++ tests/v3/test_codecs/test_codecs.py | 3 +- 3 files changed, 312 insertions(+), 19 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index da477056ee..002f44819a 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -1,8 +1,9 @@ from __future__ import annotations import json +import warnings from asyncio import gather -from dataclasses import dataclass, field, replace +from dataclasses import dataclass, field from logging import getLogger from typing import TYPE_CHECKING, Any, Generic, Literal, cast, overload @@ -1106,15 +1107,15 @@ async def setitem( ) return await self._set_selection(indexer, value, prototype=prototype) - async def resize(self, new_shape: ChunkCoords, delete_outside_chunks: bool = True) -> Self: + async def resize(self, new_shape: ShapeLike, delete_outside_chunks: bool = True) -> None: + new_shape = parse_shapelike(new_shape) assert len(new_shape) == len(self.metadata.shape) new_metadata = self.metadata.update_shape(new_shape) - # Remove all chunks outside of the new shape - old_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(self.metadata.shape)) - new_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(new_shape)) - if delete_outside_chunks: + # Remove all chunks outside of the new shape + old_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(self.metadata.shape)) + new_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(new_shape)) async def _delete_key(key: str) -> None: await (self.store_path / key).delete() @@ -1130,7 +1131,61 @@ async def _delete_key(key: str) -> None: # Write new metadata await 
self._save_metadata(new_metadata) - return replace(self, metadata=new_metadata) + + # Update metadata (in place) + object.__setattr__(self, "metadata", new_metadata) + + async def append(self, data: npt.ArrayLike, axis: int = 0) -> ChunkCoords: + """Append `data` to `axis`. + + Parameters + ---------- + data : array-like + Data to be appended. + axis : int + Axis along which to append. + + Returns + ------- + new_shape : tuple + + Notes + ----- + The size of all dimensions other than `axis` must match between this + array and `data`. + """ + # ensure data is array-like + if not hasattr(data, "shape"): + data = np.asanyarray(data) + + self_shape_preserved = tuple(s for i, s in enumerate(self.shape) if i != axis) + data_shape_preserved = tuple(s for i, s in enumerate(data.shape) if i != axis) + if self_shape_preserved != data_shape_preserved: + raise ValueError( + "shape of data to append is not compatible with the array; " + "all dimensions must match except for the dimension being " + "appended" + ) + # remember old shape + old_shape = self.shape + + # determine new shape + new_shape = tuple( + self.shape[i] if i != axis else self.shape[i] + data.shape[i] + for i in range(len(self.shape)) + ) + + # resize + await self.resize(new_shape) + + # store data + append_selection = tuple( + slice(None) if i != axis else slice(old_shape[i], new_shape[i]) + for i in range(len(self.shape)) + ) + await self.setitem(append_selection, data) + + return new_shape async def update_attributes(self, new_attributes: dict[str, JSON]) -> Self: # metadata.attributes is "frozen" so we simply clear and update the dict @@ -1149,7 +1204,8 @@ async def info(self) -> None: raise NotImplementedError -@dataclass(frozen=True) +# TODO: Array can be a frozen data class again once property setters (e.g. shape) are removed +@dataclass(frozen=False) class Array: """Instantiate an array from an initialized store. 
@@ -1309,6 +1365,20 @@ def shape(self) -> ChunkCoords: """ return self._async_array.shape + @shape.setter + def shape(self, value: ChunkCoords) -> None: + """Sets the shape of the array by calling resize. + + .. deprecated:: 3.0.0 + Setting a shape using the shape setter is deprecated, use Array.resize instead. + """ + warnings.warn( + "Setting a shape using the shape setter is deprecated, use Array.resize instead.", + stacklevel=2, + category=DeprecationWarning, + ) + self.resize(value) + @property def chunks(self) -> ChunkCoords: """Returns a tuple of integers describing the length of each dimension of a chunk of the array. @@ -2766,18 +2836,18 @@ def blocks(self) -> BlockIndex: :func:`set_block_selection` for documentation and examples.""" return BlockIndex(self) - def resize(self, new_shape: ChunkCoords) -> Array: + def resize(self, new_shape: ShapeLike) -> None: """ Change the shape of the array by growing or shrinking one or more dimensions. - This method does not modify the original Array object. Instead, it returns a new Array - with the specified shape. + Parameters + ---------- + new_shape : tuple + New shape of the array. Notes ----- - When resizing an array, the data are not rearranged in any way. - If one or more dimensions are shrunk, any chunks falling outside the new array shape will be deleted from the underlying store. 
However, it is noteworthy that the chunks partially falling inside the new array @@ -2790,7 +2860,6 @@ def resize(self, new_shape: ChunkCoords) -> Array: >>> import zarr >>> z = zarr.zeros(shape=(10000, 10000), >>> chunk_shape=(1000, 1000), - >>> store=StorePath(MemoryStore(mode="w")), >>> dtype="i4",) >>> z.shape (10000, 10000) @@ -2803,10 +2872,43 @@ def resize(self, new_shape: ChunkCoords) -> Array: >>> z2.shape (50, 50) """ - resized = sync(self._async_array.resize(new_shape)) - # TODO: remove this cast when type inference improves - _resized = cast(AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata], resized) - return type(self)(_resized) + sync(self._async_array.resize(new_shape)) + + def append(self, data: npt.ArrayLike, axis: int = 0) -> ChunkCoords: + """Append `data` to `axis`. + + Parameters + ---------- + data : array-like + Data to be appended. + axis : int + Axis along which to append. + + Returns + ------- + new_shape : tuple + + Notes + ----- + The size of all dimensions other than `axis` must match between this + array and `data`. 
+ + Examples + -------- + >>> import numpy as np + >>> import zarr + >>> a = np.arange(10000000, dtype='i4').reshape(10000, 1000) + >>> z = zarr.array(a, chunks=(1000, 100)) + >>> z.shape + (10000, 1000) + >>> z.append(a) + (20000, 1000) + >>> z.append(np.vstack([a, a]), axis=1) + (20000, 2000) + >>> z.shape + (20000, 2000) + """ + return sync(self._async_array.append(data, axis=axis)) def update_attributes(self, new_attributes: dict[str, JSON]) -> Array: # TODO: remove this cast when type inference improves diff --git a/tests/v3/test_array.py b/tests/v3/test_array.py index 829a04d304..52d3db2f63 100644 --- a/tests/v3/test_array.py +++ b/tests/v3/test_array.py @@ -417,3 +417,193 @@ def test_update_attrs(zarr_format: int) -> None: arr2 = zarr.open_array(store=store, zarr_format=zarr_format) assert arr2.attrs["foo"] == "bar" + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("zarr_format", [2, 3]) +def test_resize_1d(store: MemoryStore, zarr_format: int) -> None: + z = zarr.create( + shape=105, chunks=10, dtype="i4", fill_value=0, store=store, zarr_format=zarr_format + ) + a = np.arange(105, dtype="i4") + z[:] = a + assert (105,) == z.shape + assert (105,) == z[:].shape + assert np.dtype("i4") == z.dtype + assert np.dtype("i4") == z[:].dtype + assert (10,) == z.chunks + np.testing.assert_array_equal(a, z[:]) + + z.resize(205) + assert (205,) == z.shape + assert (205,) == z[:].shape + assert np.dtype("i4") == z.dtype + assert np.dtype("i4") == z[:].dtype + assert (10,) == z.chunks + np.testing.assert_array_equal(a, z[:105]) + np.testing.assert_array_equal(np.zeros(100, dtype="i4"), z[105:]) + + z.resize(55) + assert (55,) == z.shape + assert (55,) == z[:].shape + assert np.dtype("i4") == z.dtype + assert np.dtype("i4") == z[:].dtype + assert (10,) == z.chunks + np.testing.assert_array_equal(a[:55], z[:]) + + # via shape setter + new_shape = (105,) + with pytest.warns(DeprecationWarning): + z.shape = new_shape + assert new_shape 
== z.shape + assert new_shape == z[:].shape + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("zarr_format", [2, 3]) +def test_resize_2d(store: MemoryStore, zarr_format: int) -> None: + z = zarr.create( + shape=(105, 105), + chunks=(10, 10), + dtype="i4", + fill_value=0, + store=store, + zarr_format=zarr_format, + ) + a = np.arange(105 * 105, dtype="i4").reshape((105, 105)) + z[:] = a + assert (105, 105) == z.shape + assert (105, 105) == z[:].shape + assert np.dtype("i4") == z.dtype + assert np.dtype("i4") == z[:].dtype + assert (10, 10) == z.chunks + np.testing.assert_array_equal(a, z[:]) + + z.resize((205, 205)) + assert (205, 205) == z.shape + assert (205, 205) == z[:].shape + assert np.dtype("i4") == z.dtype + assert np.dtype("i4") == z[:].dtype + assert (10, 10) == z.chunks + np.testing.assert_array_equal(a, z[:105, :105]) + np.testing.assert_array_equal(np.zeros((100, 205), dtype="i4"), z[105:, :]) + np.testing.assert_array_equal(np.zeros((205, 100), dtype="i4"), z[:, 105:]) + + z.resize((55, 55)) + assert (55, 55) == z.shape + assert (55, 55) == z[:].shape + assert np.dtype("i4") == z.dtype + assert np.dtype("i4") == z[:].dtype + assert (10, 10) == z.chunks + np.testing.assert_array_equal(a[:55, :55], z[:]) + + z.resize((55, 1)) + assert (55, 1) == z.shape + assert (55, 1) == z[:].shape + assert np.dtype("i4") == z.dtype + assert np.dtype("i4") == z[:].dtype + assert (10, 10) == z.chunks + np.testing.assert_array_equal(a[:55, :1], z[:]) + + z.resize((1, 55)) + assert (1, 55) == z.shape + assert (1, 55) == z[:].shape + assert np.dtype("i4") == z.dtype + assert np.dtype("i4") == z[:].dtype + assert (10, 10) == z.chunks + np.testing.assert_array_equal(a[:1, :10], z[:, :10]) + np.testing.assert_array_equal(np.zeros((1, 55 - 10), dtype="i4"), z[:, 10:55]) + + # via shape setter + new_shape = (105, 105) + with pytest.warns(DeprecationWarning): + z.shape = new_shape + assert new_shape == z.shape + assert new_shape == 
z[:].shape + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("zarr_format", [2, 3]) +def test_append_1d(store: MemoryStore, zarr_format: int) -> None: + a = np.arange(105) + z = zarr.create(shape=a.shape, chunks=10, dtype=a.dtype, store=store, zarr_format=zarr_format) + z[:] = a + assert a.shape == z.shape + assert a.dtype == z.dtype + assert (10,) == z.chunks + np.testing.assert_array_equal(a, z[:]) + + b = np.arange(105, 205) + e = np.append(a, b) + assert z.shape == (105,) + z.append(b) + assert e.shape == z.shape + assert e.dtype == z.dtype + assert (10,) == z.chunks + np.testing.assert_array_equal(e, z[:]) + + # check append handles array-like + c = [1, 2, 3] + f = np.append(e, c) + z.append(c) + assert f.shape == z.shape + assert f.dtype == z.dtype + assert (10,) == z.chunks + np.testing.assert_array_equal(f, z[:]) + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("zarr_format", [2, 3]) +def test_append_2d(store: MemoryStore, zarr_format: int) -> None: + a = np.arange(105 * 105, dtype="i4").reshape((105, 105)) + z = zarr.create( + shape=a.shape, chunks=(10, 10), dtype=a.dtype, store=store, zarr_format=zarr_format + ) + z[:] = a + assert a.shape == z.shape + assert a.dtype == z.dtype + assert (10, 10) == z.chunks + actual = z[:] + np.testing.assert_array_equal(a, actual) + + b = np.arange(105 * 105, 2 * 105 * 105, dtype="i4").reshape((105, 105)) + e = np.append(a, b, axis=0) + z.append(b) + assert e.shape == z.shape + assert e.dtype == z.dtype + assert (10, 10) == z.chunks + actual = z[:] + np.testing.assert_array_equal(e, actual) + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("zarr_format", [2, 3]) +def test_append_2d_axis(store: MemoryStore, zarr_format: int) -> None: + a = np.arange(105 * 105, dtype="i4").reshape((105, 105)) + z = zarr.create( + shape=a.shape, chunks=(10, 10), dtype=a.dtype, store=store, zarr_format=zarr_format + 
) + z[:] = a + assert a.shape == z.shape + assert a.dtype == z.dtype + assert (10, 10) == z.chunks + np.testing.assert_array_equal(a, z[:]) + + b = np.arange(105 * 105, 2 * 105 * 105, dtype="i4").reshape((105, 105)) + e = np.append(a, b, axis=1) + z.append(b, axis=1) + assert e.shape == z.shape + assert e.dtype == z.dtype + assert (10, 10) == z.chunks + np.testing.assert_array_equal(e, z[:]) + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("zarr_format", [2, 3]) +def test_append_bad_shape(store: MemoryStore, zarr_format: int) -> None: + a = np.arange(100) + z = zarr.create(shape=a.shape, chunks=10, dtype=a.dtype, store=store, zarr_format=zarr_format) + z[:] = a + b = a.reshape(10, 10) + with pytest.raises(ValueError): + z.append(b) diff --git a/tests/v3/test_codecs/test_codecs.py b/tests/v3/test_codecs/test_codecs.py index 7a5fb979a1..0f2f892915 100644 --- a/tests/v3/test_codecs/test_codecs.py +++ b/tests/v3/test_codecs/test_codecs.py @@ -371,8 +371,9 @@ async def test_resize(store: Store) -> None: assert await store.get(f"{path}/0.1", prototype=default_buffer_prototype()) is not None assert await store.get(f"{path}/1.0", prototype=default_buffer_prototype()) is not None - a = await a.resize((10, 12)) + await a.resize((10, 12)) assert a.metadata.shape == (10, 12) + assert a.shape == (10, 12) assert await store.get(f"{path}/0.0", prototype=default_buffer_prototype()) is not None assert await store.get(f"{path}/0.1", prototype=default_buffer_prototype()) is not None assert await store.get(f"{path}/1.0", prototype=default_buffer_prototype()) is None From ef6b919dccb64193a2128caa4a82e6bdbca0a05f Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Sun, 20 Oct 2024 19:56:39 -0700 Subject: [PATCH 2/4] better error message --- src/zarr/core/array.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index f338edccbf..c6c81ef3f3 100644 --- 
a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -1,7 +1,6 @@ from __future__ import annotations import json -import warnings from asyncio import gather from dataclasses import dataclass, field from itertools import starmap @@ -1161,9 +1160,11 @@ async def append(self, data: npt.ArrayLike, axis: int = 0) -> ChunkCoords: data_shape_preserved = tuple(s for i, s in enumerate(data.shape) if i != axis) if self_shape_preserved != data_shape_preserved: raise ValueError( - "shape of data to append is not compatible with the array; " - "all dimensions must match except for the dimension being " - "appended" + "shape of data to append is not compatible with the array. " + f"The shape of the data is ({data_shape_preserved}) " + f"and the shape of the array is ({self_shape_preserved}). " + "All dimensions must match except for the dimension being " + "appended." ) # remember old shape old_shape = self.shape @@ -1356,16 +1357,7 @@ def shape(self) -> ChunkCoords: @shape.setter def shape(self, value: ChunkCoords) -> None: - """Sets the shape of the array by calling resize. - - .. deprecated:: 3.0.0 - Setting a shape using the shape setter is deprecated, use Array.resize instead. 
- """ - warnings.warn( - "Setting a shape using the shape setter is deprecated, use Array.resize instead.", - stacklevel=2, - category=DeprecationWarning, - ) + """Sets the shape of the array by calling resize.""" self.resize(value) @property From 9095e04ad825e2461ad23edd2855adc592a8a8a9 Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Mon, 21 Oct 2024 09:59:16 -0700 Subject: [PATCH 3/4] no more warn --- tests/test_array.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/test_array.py b/tests/test_array.py index 52d3db2f63..a5ebe3a4a6 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -453,8 +453,7 @@ def test_resize_1d(store: MemoryStore, zarr_format: int) -> None: # via shape setter new_shape = (105,) - with pytest.warns(DeprecationWarning): - z.shape = new_shape + z.shape = new_shape assert new_shape == z.shape assert new_shape == z[:].shape @@ -516,8 +515,7 @@ def test_resize_2d(store: MemoryStore, zarr_format: int) -> None: # via shape setter new_shape = (105, 105) - with pytest.warns(DeprecationWarning): - z.shape = new_shape + z.shape = new_shape assert new_shape == z.shape assert new_shape == z[:].shape From fd376a3afce3687e58c7d500d284032837cba154 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 23 Oct 2024 13:32:29 +0000 Subject: [PATCH 4/4] style: pre-commit fixes --- tests/test_array.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_array.py b/tests/test_array.py index 97db2f220a..ae8e7f99c2 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -606,6 +606,7 @@ def test_append_bad_shape(store: MemoryStore, zarr_format: int) -> None: with pytest.raises(ValueError): z.append(b) + @pytest.mark.parametrize("order", ["C", "F", None]) @pytest.mark.parametrize("zarr_format", [2, 3]) @pytest.mark.parametrize("store", ["memory"], indirect=True)