Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[v3] Array.append #2413

Merged
merged 7 commits into from
Oct 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 112 additions & 18 deletions src/zarr/core/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import json
from asyncio import gather
from dataclasses import dataclass, field, replace
from dataclasses import dataclass, field
from itertools import starmap
from logging import getLogger
from typing import TYPE_CHECKING, Any, Generic, Literal, cast, overload
Expand Down Expand Up @@ -1104,15 +1104,15 @@ async def setitem(
)
return await self._set_selection(indexer, value, prototype=prototype)

async def resize(self, new_shape: ChunkCoords, delete_outside_chunks: bool = True) -> Self:
async def resize(self, new_shape: ShapeLike, delete_outside_chunks: bool = True) -> None:
new_shape = parse_shapelike(new_shape)
assert len(new_shape) == len(self.metadata.shape)
new_metadata = self.metadata.update_shape(new_shape)

# Remove all chunks outside of the new shape
old_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(self.metadata.shape))
new_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(new_shape))

if delete_outside_chunks:
# Remove all chunks outside of the new shape
old_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(self.metadata.shape))
new_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(new_shape))

async def _delete_key(key: str) -> None:
await (self.store_path / key).delete()
Expand All @@ -1128,7 +1128,63 @@ async def _delete_key(key: str) -> None:

# Write new metadata
await self._save_metadata(new_metadata)
return replace(self, metadata=new_metadata)

# Update metadata (in place)
object.__setattr__(self, "metadata", new_metadata)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@TomAugspurger - calling this out because it will impact the xarray use of resize... now in-place again :)


async def append(self, data: npt.ArrayLike, axis: int = 0) -> ChunkCoords:
    """Append `data` to `axis`.

    The array is resized in place so that it grows along `axis` by the
    length of `data` along that axis; `data` is then written into the
    newly added region.

    Parameters
    ----------
    data : array-like
        Data to be appended. Objects without a ``shape`` attribute are
        converted with ``np.asanyarray``.
    axis : int
        Axis along which to append. Negative values count from the last
        axis.

    Returns
    -------
    new_shape : tuple
        Shape of the array after the append.

    Raises
    ------
    ValueError
        If `axis` is out of bounds for this array, if `data` does not have
        the same number of dimensions as this array, or if any dimension
        other than `axis` differs between this array and `data`.

    Notes
    -----
    The size of all dimensions other than `axis` must match between this
    array and `data`.
    """
    # ensure data is array-like
    if not hasattr(data, "shape"):
        data = np.asanyarray(data)

    # remember old shape; self.shape changes once resize() has run
    old_shape = self.shape
    data_shape = tuple(data.shape)
    ndim = len(old_shape)

    # Validate and normalize the axis before anything is written. An axis
    # that matches no dimension index would otherwise slip through every
    # ``i != axis`` comparison below, leave the shape unchanged and make the
    # final setitem overwrite the whole array.
    if not -ndim <= axis < ndim:
        raise ValueError(f"axis {axis} is out of bounds for array of dimension {ndim}")
    if axis < 0:
        axis += ndim

    # A dimensionality mismatch can pass the preserved-shape comparison
    # (e.g. 0-d data appended to a 1-d array), so reject it explicitly.
    if len(data_shape) != ndim:
        raise ValueError(
            "shape of data to append is not compatible with the array. "
            f"The data has {len(data_shape)} dimension(s) "
            f"and the array has {ndim} dimension(s)."
        )

    self_shape_preserved = tuple(s for i, s in enumerate(old_shape) if i != axis)
    data_shape_preserved = tuple(s for i, s in enumerate(data_shape) if i != axis)
    if self_shape_preserved != data_shape_preserved:
        raise ValueError(
            "shape of data to append is not compatible with the array. "
            f"The shape of the data is ({data_shape_preserved}) "
            f"and the shape of the array is ({self_shape_preserved}). "
            "All dimensions must match except for the dimension being "
            "appended."
        )

    # determine new shape: only the append axis grows
    new_shape = tuple(
        size + data_shape[i] if i == axis else size for i, size in enumerate(old_shape)
    )

    # resize (updates the metadata of this array in place)
    await self.resize(new_shape)

    # store data in the region that the resize just added
    append_selection = tuple(
        slice(old_shape[i], new_shape[i]) if i == axis else slice(None) for i in range(ndim)
    )
    await self.setitem(append_selection, data)

    return new_shape

async def update_attributes(self, new_attributes: dict[str, JSON]) -> Self:
# metadata.attributes is "frozen" so we simply clear and update the dict
Expand All @@ -1147,7 +1203,8 @@ async def info(self) -> None:
raise NotImplementedError


@dataclass(frozen=True)
# TODO: Array can be a frozen data class again once property setters (e.g. shape) are removed
@dataclass(frozen=False)
class Array:
"""Instantiate an array from an initialized store."""

Expand Down Expand Up @@ -1297,6 +1354,11 @@ def shape(self) -> ChunkCoords:
"""
return self._async_array.shape

@shape.setter
def shape(self, value: ChunkCoords) -> None:
    """Sets the shape of the array by calling resize.

    Assigning ``arr.shape = value`` is equivalent to calling
    ``arr.resize(value)``.

    Parameters
    ----------
    value : tuple of int
        New shape of the array.
    """
    # Delegate to resize so that assignment and an explicit resize() call
    # follow exactly the same code path.
    self.resize(value)

@property
def chunks(self) -> ChunkCoords:
"""Returns a tuple of integers describing the length of each dimension of a chunk of the array.
Expand Down Expand Up @@ -2754,18 +2816,18 @@ def blocks(self) -> BlockIndex:
:func:`set_block_selection` for documentation and examples."""
return BlockIndex(self)

def resize(self, new_shape: ChunkCoords) -> Array:
def resize(self, new_shape: ShapeLike) -> None:
"""
Change the shape of the array by growing or shrinking one or more
dimensions.

This method does not modify the original Array object. Instead, it returns a new Array
with the specified shape.
Parameters
----------
new_shape : tuple
New shape of the array.

Notes
-----
When resizing an array, the data are not rearranged in any way.

If one or more dimensions are shrunk, any chunks falling outside the
new array shape will be deleted from the underlying store.
However, it is noteworthy that the chunks partially falling inside the new array
Expand All @@ -2778,7 +2840,6 @@ def resize(self, new_shape: ChunkCoords) -> Array:
>>> import zarr
>>> z = zarr.zeros(shape=(10000, 10000),
>>> chunk_shape=(1000, 1000),
>>> store=StorePath(MemoryStore(mode="w")),
>>> dtype="i4",)
>>> z.shape
(10000, 10000)
Expand All @@ -2791,10 +2852,43 @@ def resize(self, new_shape: ChunkCoords) -> Array:
>>> z2.shape
(50, 50)
"""
resized = sync(self._async_array.resize(new_shape))
# TODO: remove this cast when type inference improves
_resized = cast(AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata], resized)
return type(self)(_resized)
sync(self._async_array.resize(new_shape))

def append(self, data: npt.ArrayLike, axis: int = 0) -> ChunkCoords:
    """Append `data` to `axis`.

    This is the synchronous counterpart of the wrapped async array's
    ``append``: the call is forwarded to ``self._async_array.append`` and
    run through ``sync``. The array is resized in place.

    Parameters
    ----------
    data : array-like
        Data to be appended.
    axis : int
        Axis along which to append.

    Returns
    -------
    new_shape : tuple
        Shape of the array after the append.

    Raises
    ------
    ValueError
        If the shape of `data` is not compatible with this array, i.e. a
        dimension other than `axis` differs.

    Notes
    -----
    The size of all dimensions other than `axis` must match between this
    array and `data`.

    Examples
    --------
    >>> import numpy as np
    >>> import zarr
    >>> a = np.arange(10000000, dtype='i4').reshape(10000, 1000)
    >>> z = zarr.array(a, chunks=(1000, 100))
    >>> z.shape
    (10000, 1000)
    >>> z.append(a)
    (20000, 1000)
    >>> z.append(np.vstack([a, a]), axis=1)
    (20000, 2000)
    >>> z.shape
    (20000, 2000)
    """
    # All of the work (shape validation, resize, write) happens in the
    # async implementation; this wrapper only runs it to completion.
    return sync(self._async_array.append(data, axis=axis))

def update_attributes(self, new_attributes: dict[str, JSON]) -> Array:
# TODO: remove this cast when type inference improves
Expand Down
188 changes: 188 additions & 0 deletions tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,194 @@ def test_update_attrs(zarr_format: int) -> None:
assert arr2.attrs["foo"] == "bar"


@pytest.mark.parametrize("store", ["memory"], indirect=True)
@pytest.mark.parametrize("zarr_format", [2, 3])
def test_resize_1d(store: MemoryStore, zarr_format: int) -> None:
Comment on lines +422 to +424
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To fight test bloat, we should find a way to parametrize over dimensionality instead of writing separate 1d, 2d, etc. tests, but that's not a blocker here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agree! These tests were copied over from v2 so I'm going to leave them for now.

z = zarr.create(
shape=105, chunks=10, dtype="i4", fill_value=0, store=store, zarr_format=zarr_format
)
a = np.arange(105, dtype="i4")
z[:] = a
assert (105,) == z.shape
assert (105,) == z[:].shape
assert np.dtype("i4") == z.dtype
assert np.dtype("i4") == z[:].dtype
assert (10,) == z.chunks
np.testing.assert_array_equal(a, z[:])

z.resize(205)
assert (205,) == z.shape
assert (205,) == z[:].shape
assert np.dtype("i4") == z.dtype
assert np.dtype("i4") == z[:].dtype
assert (10,) == z.chunks
np.testing.assert_array_equal(a, z[:105])
np.testing.assert_array_equal(np.zeros(100, dtype="i4"), z[105:])

z.resize(55)
assert (55,) == z.shape
assert (55,) == z[:].shape
assert np.dtype("i4") == z.dtype
assert np.dtype("i4") == z[:].dtype
assert (10,) == z.chunks
np.testing.assert_array_equal(a[:55], z[:])

# via shape setter
new_shape = (105,)
z.shape = new_shape
assert new_shape == z.shape
assert new_shape == z[:].shape


@pytest.mark.parametrize("store", ["memory"], indirect=True)
@pytest.mark.parametrize("zarr_format", [2, 3])
def test_resize_2d(store: MemoryStore, zarr_format: int) -> None:
    """Grow and shrink a 2-D array, checking layout and contents after every step."""
    int32 = np.dtype("i4")
    arr = zarr.create(
        shape=(105, 105),
        chunks=(10, 10),
        dtype="i4",
        fill_value=0,
        store=store,
        zarr_format=zarr_format,
    )
    source = np.arange(105 * 105, dtype="i4").reshape((105, 105))
    arr[:] = source

    def check_layout(expected_shape: tuple[int, int]) -> None:
        # Shape, dtype and chunking must be reported consistently both by the
        # array object and by the data read back from it.
        assert expected_shape == arr.shape
        assert expected_shape == arr[:].shape
        assert int32 == arr.dtype
        assert int32 == arr[:].dtype
        assert (10, 10) == arr.chunks

    # freshly written array
    check_layout((105, 105))
    np.testing.assert_array_equal(source, arr[:])

    # grow in both dimensions: old data is kept, new region is fill_value
    arr.resize((205, 205))
    check_layout((205, 205))
    np.testing.assert_array_equal(source, arr[:105, :105])
    np.testing.assert_array_equal(np.zeros((100, 205), dtype="i4"), arr[105:, :])
    np.testing.assert_array_equal(np.zeros((205, 100), dtype="i4"), arr[:, 105:])

    # shrink in both dimensions
    arr.resize((55, 55))
    check_layout((55, 55))
    np.testing.assert_array_equal(source[:55, :55], arr[:])

    # shrink to a single column
    arr.resize((55, 1))
    check_layout((55, 1))
    np.testing.assert_array_equal(source[:55, :1], arr[:])

    # swap to a single row: only the first chunk column still holds old data
    arr.resize((1, 55))
    check_layout((1, 55))
    np.testing.assert_array_equal(source[:1, :10], arr[:, :10])
    np.testing.assert_array_equal(np.zeros((1, 55 - 10), dtype="i4"), arr[:, 10:55])

    # via shape setter
    target_shape = (105, 105)
    arr.shape = target_shape
    assert target_shape == arr.shape
    assert target_shape == arr[:].shape


@pytest.mark.parametrize("store", ["memory"], indirect=True)
@pytest.mark.parametrize("zarr_format", [2, 3])
def test_append_1d(store: MemoryStore, zarr_format: int) -> None:
    """Append an ndarray and then a plain list to a 1-D array."""
    initial = np.arange(105)
    arr = zarr.create(
        shape=initial.shape,
        chunks=10,
        dtype=initial.dtype,
        store=store,
        zarr_format=zarr_format,
    )
    arr[:] = initial
    assert initial.shape == arr.shape
    assert initial.dtype == arr.dtype
    assert (10,) == arr.chunks
    np.testing.assert_array_equal(initial, arr[:])

    # append a second ndarray
    extra = np.arange(105, 205)
    expected = np.concatenate((initial, extra))
    assert arr.shape == (105,)
    arr.append(extra)
    assert expected.shape == arr.shape
    assert expected.dtype == arr.dtype
    assert (10,) == arr.chunks
    np.testing.assert_array_equal(expected, arr[:])

    # check append handles array-like
    as_list = [1, 2, 3]
    expected_with_list = np.concatenate((expected, as_list))
    arr.append(as_list)
    assert expected_with_list.shape == arr.shape
    assert expected_with_list.dtype == arr.dtype
    assert (10,) == arr.chunks
    np.testing.assert_array_equal(expected_with_list, arr[:])


@pytest.mark.parametrize("store", ["memory"], indirect=True)
@pytest.mark.parametrize("zarr_format", [2, 3])
def test_append_2d(store: MemoryStore, zarr_format: int) -> None:
    """Append a 2-D block along the default axis (axis 0)."""
    side = 105
    initial = np.arange(side * side, dtype="i4").reshape((side, side))
    arr = zarr.create(
        shape=initial.shape,
        chunks=(10, 10),
        dtype=initial.dtype,
        store=store,
        zarr_format=zarr_format,
    )
    arr[:] = initial
    assert initial.shape == arr.shape
    assert initial.dtype == arr.dtype
    assert (10, 10) == arr.chunks
    np.testing.assert_array_equal(initial, arr[:])

    extra = np.arange(side * side, 2 * side * side, dtype="i4").reshape((side, side))
    expected = np.concatenate((initial, extra), axis=0)
    arr.append(extra)
    assert expected.shape == arr.shape
    assert expected.dtype == arr.dtype
    assert (10, 10) == arr.chunks
    np.testing.assert_array_equal(expected, arr[:])


@pytest.mark.parametrize("store", ["memory"], indirect=True)
@pytest.mark.parametrize("zarr_format", [2, 3])
def test_append_2d_axis(store: MemoryStore, zarr_format: int) -> None:
    """Append a 2-D block along an explicitly requested axis (axis 1)."""
    side = 105
    initial = np.arange(side * side, dtype="i4").reshape((side, side))
    arr = zarr.create(
        shape=initial.shape,
        chunks=(10, 10),
        dtype=initial.dtype,
        store=store,
        zarr_format=zarr_format,
    )
    arr[:] = initial
    assert initial.shape == arr.shape
    assert initial.dtype == arr.dtype
    assert (10, 10) == arr.chunks
    np.testing.assert_array_equal(initial, arr[:])

    extra = np.arange(side * side, 2 * side * side, dtype="i4").reshape((side, side))
    expected = np.concatenate((initial, extra), axis=1)
    arr.append(extra, axis=1)
    assert expected.shape == arr.shape
    assert expected.dtype == arr.dtype
    assert (10, 10) == arr.chunks
    np.testing.assert_array_equal(expected, arr[:])


@pytest.mark.parametrize("store", ["memory"], indirect=True)
@pytest.mark.parametrize("zarr_format", [2, 3])
def test_append_bad_shape(store: MemoryStore, zarr_format: int) -> None:
    """Appending data of a different dimensionality must raise ValueError."""
    flat = np.arange(100)
    arr = zarr.create(
        shape=flat.shape,
        chunks=10,
        dtype=flat.dtype,
        store=store,
        zarr_format=zarr_format,
    )
    arr[:] = flat

    # same data, but 2-D: incompatible with the 1-D array
    square = flat.reshape(10, 10)
    with pytest.raises(ValueError):
        arr.append(square)


@pytest.mark.parametrize("order", ["C", "F", None])
@pytest.mark.parametrize("zarr_format", [2, 3])
@pytest.mark.parametrize("store", ["memory"], indirect=True)
Expand Down
Loading