Use 3 numpy arrays for manifest internally (#107)

* change entries property to a structured array, add from_dict * fix validation * equals method * re-implemented concatenation through concatenation of the wrapped structured array * fixed manifest.from_kerchunk_dict * fixed kerchunk tests * change private attributes to 3 numpy arrays * add from_arrays method * to and from dict working again * fix dtype comparisons * depend on numpy release candidate * get concatenation and stacking working * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove manifest-level tests of concatenation * generalized create_manifestarray fixture * added tests of broadcasting * made basic broadcasting tests pass by calling np.broadcast_to on underlying numpy arrays * generalize fixture for creating scalar ManifestArrays * improve regression test for expanding scalar ManifestArray * remove now-unneeded scalar broadcasting logic * rewrite __eq__ method on ManifestArray * remove unused import * reinstate the ChunkManifest.__init__ method to accept dictionaries * hopefully fix broadcasting bug * add backwards compatibility for pre-numpy2.0 * depend on numpy>=2.0.0 * rewrote broadcast_to shape logic * remove faulty hypothesis strategies * remove hypothesis from dependencies * ignore remaining mypy errors * release notes * update dependencies in CI test env --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
zarr-developers · Jun 18, 2024 · c4d4325 · c4d4325
1 parent 27a6f94
commit c4d4325
Show file tree

Hide file tree

Showing 13 changed files with 399 additions and 333 deletions.
diff --git a/ci/environment.yml b/ci/environment.yml
@@ -7,10 +7,10 @@ dependencies:
   - h5py
   - hdf5
   - netcdf4
-  - xarray>=2024.5.0
+  - xarray>=2024.6.0
   - kerchunk>=0.2.5
   - pydantic
-  - numpy
+  - numpy>=2.0.0
   - ujson
   - packaging
   - universal_pathlib

diff --git a/docs/releases.rst b/docs/releases.rst
@@ -13,6 +13,8 @@ New Features
 Breaking changes
 ~~~~~~~~~~~~~~~~
 
+- Requires numpy 2.0 (for :pull:`107`).
+  By `Tom Nicholas <https://github.com/TomNicholas>`_.
 
 Deprecations
 ~~~~~~~~~~~~
@@ -29,6 +31,8 @@ Documentation
 Internal Changes
 ~~~~~~~~~~~~~~~~
 
+- Refactor `ChunkManifest` class to store chunk references internally using numpy arrays.
+  (:pull:`107`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
 - Mark tests which require network access so that they are only run when `--run-network-tests` is passed a command-line argument to pytest.
   (:pull:`144`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -21,11 +21,11 @@ classifiers = [
 requires-python = ">=3.10"
 dynamic = ["version"]
 dependencies = [
-    "xarray>=2024.5.0",
+    "xarray>=2024.06.0",
     "kerchunk>=0.2.5",
     "h5netcdf",
     "pydantic",
-    "numpy",
+    "numpy>=2.0.0",
     "ujson",
     "packaging",
     "universal-pathlib",

diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py
@@ -241,8 +241,8 @@ def variable_to_kerchunk_arr_refs(var: xr.Variable, var_name: str) -> KerchunkAr
         marr = var.data
 
         arr_refs: dict[str, str | list[str | int]] = {
-            str(chunk_key): chunk_entry.to_kerchunk()
-            for chunk_key, chunk_entry in marr.manifest.entries.items()
+            str(chunk_key): [entry["path"], entry["offset"], entry["length"]]
+            for chunk_key, entry in marr.manifest.dict().items()
         }
 
         zarray = marr.zarray

diff --git a/virtualizarr/manifests/__init__.py b/virtualizarr/manifests/__init__.py
@@ -2,9 +2,4 @@
 # This is just to avoid conflicting with some type of file called manifest that .gitignore recommends ignoring.
 
 from .array import ManifestArray  # type: ignore # noqa
-from .manifest import (  # type: ignore # noqa
-    ChunkEntry,
-    ChunkManifest,
-    concat_manifests,
-    stack_manifests,
-)
+from .manifest import ChunkEntry, ChunkManifest  # type: ignore # noqa
diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py
@@ -50,9 +50,12 @@ def __init__(
             _chunkmanifest = ChunkManifest(entries=chunkmanifest)
         else:
             raise TypeError(
-                f"chunkmanifest arg must be of type ChunkManifest, but got type {type(chunkmanifest)}"
+                f"chunkmanifest arg must be of type ChunkManifest or dict, but got type {type(chunkmanifest)}"
             )
 
+        # TODO check that the zarray shape and chunkmanifest shape are consistent with one another
+        # TODO also cover the special case of scalar arrays
+
         self._zarray = _zarray
         self._manifest = _chunkmanifest
 
@@ -140,7 +143,7 @@ def __eq__(  # type: ignore[override]
 
         Returns a numpy array of booleans.
         """
-        if isinstance(other, (int, float, bool)):
+        if isinstance(other, (int, float, bool, np.ndarray)):
             # TODO what should this do when comparing against numpy arrays?
             return np.full(shape=self.shape, fill_value=False, dtype=np.dtype(bool))
         elif not isinstance(other, ManifestArray):
@@ -164,9 +167,22 @@ def __eq__(  # type: ignore[override]
                     UserWarning,
                 )
 
-                # TODO do chunk-wise comparison
-                # TODO expand it into an element-wise result
-                return np.full(shape=self.shape, fill_value=False, dtype=np.dtype(bool))
+                # do chunk-wise comparison
+                equal_chunk_paths = self.manifest._paths == other.manifest._paths
+                equal_chunk_offsets = self.manifest._offsets == other.manifest._offsets
+                equal_chunk_lengths = self.manifest._lengths == other.manifest._lengths
+
+                equal_chunks = (
+                    equal_chunk_paths & equal_chunk_offsets & equal_chunk_lengths
+                )
+
+                if not equal_chunks.all():
+                    # TODO expand chunk-wise comparison into an element-wise result instead of just returning all False
+                    return np.full(
+                        shape=self.shape, fill_value=False, dtype=np.dtype(bool)
+                    )
+                else:
+                    raise RuntimeWarning("Should not be possible to get here")
 
     def astype(self, dtype: np.dtype, /, *, copy: bool = True) -> "ManifestArray":
         """Cannot change the dtype, but needed because xarray will call this even when it's a no-op."""

diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py
@@ -1,11 +1,10 @@
-import itertools
-from collections.abc import Callable, Iterable
-from typing import TYPE_CHECKING, cast
+from typing import TYPE_CHECKING, Callable, Iterable
 
 import numpy as np
 
-from ..zarr import Codec, ZArray
-from .manifest import concat_manifests, stack_manifests
+from virtualizarr.zarr import Codec, ceildiv
+
+from .manifest import ChunkManifest
 
 if TYPE_CHECKING:
     from .array import ManifestArray
@@ -125,21 +124,28 @@ def concatenate(
     new_shape = list(first_shape)
     new_shape[axis] = new_length_along_concat_axis
 
-    concatenated_manifest = concat_manifests(
-        [arr.manifest for arr in arrays],
+    # do concatenation of entries in manifest
+    concatenated_paths = np.concatenate(
+        [arr.manifest._paths for arr in arrays],
+        axis=axis,
+    )
+    concatenated_offsets = np.concatenate(
+        [arr.manifest._offsets for arr in arrays],
+        axis=axis,
+    )
+    concatenated_lengths = np.concatenate(
+        [arr.manifest._lengths for arr in arrays],
         axis=axis,
     )
+    concatenated_manifest = ChunkManifest.from_arrays(
+        paths=concatenated_paths,
+        offsets=concatenated_offsets,
+        lengths=concatenated_lengths,
+    )
 
-    new_zarray = ZArray(
-        chunks=first_arr.chunks,
-        compressor=first_arr.zarray.compressor,
-        dtype=first_arr.dtype,
-        fill_value=first_arr.zarray.fill_value,
-        filters=first_arr.zarray.filters,
-        shape=new_shape,
-        # TODO presumably these things should be checked for consistency across arrays too?
-        order=first_arr.zarray.order,
-        zarr_format=first_arr.zarray.zarr_format,
+    # chunk shape has not changed, there are just now more chunks along the concatenation axis
+    new_zarray = first_arr.zarray.replace(
+        shape=tuple(new_shape),
     )
 
     return ManifestArray(chunkmanifest=concatenated_manifest, zarray=new_zarray)
@@ -210,26 +216,33 @@ def stack(
     new_shape = list(first_shape)
     new_shape.insert(axis, length_along_new_stacked_axis)
 
-    stacked_manifest = stack_manifests(
-        [arr.manifest for arr in arrays],
+    # do stacking of entries in manifest
+    stacked_paths = np.stack(
+        [arr.manifest._paths for arr in arrays],
+        axis=axis,
+    )
+    stacked_offsets = np.stack(
+        [arr.manifest._offsets for arr in arrays],
+        axis=axis,
+    )
+    stacked_lengths = np.stack(
+        [arr.manifest._lengths for arr in arrays],
         axis=axis,
     )
+    stacked_manifest = ChunkManifest.from_arrays(
+        paths=stacked_paths,
+        offsets=stacked_offsets,
+        lengths=stacked_lengths,
+    )
 
-    # chunk size has changed because a length-1 axis has been inserted
+    # chunk shape has changed because a length-1 axis has been inserted
     old_chunks = first_arr.chunks
     new_chunks = list(old_chunks)
     new_chunks.insert(axis, 1)
 
-    new_zarray = ZArray(
-        chunks=new_chunks,
-        compressor=first_arr.zarray.compressor,
-        dtype=first_arr.dtype,
-        fill_value=first_arr.zarray.fill_value,
-        filters=first_arr.zarray.filters,
-        shape=new_shape,
-        # TODO presumably these things should be checked for consistency across arrays too?
-        order=first_arr.zarray.order,
-        zarr_format=first_arr.zarray.zarr_format,
+    new_zarray = first_arr.zarray.replace(
+        chunks=tuple(new_chunks),
+        shape=tuple(new_shape),
     )
 
     return ManifestArray(chunkmanifest=stacked_manifest, zarray=new_zarray)
@@ -254,74 +267,65 @@ def expand_dims(x: "ManifestArray", /, *, axis: int = 0) -> "ManifestArray":
 @implements(np.broadcast_to)
 def broadcast_to(x: "ManifestArray", /, shape: tuple[int, ...]) -> "ManifestArray":
     """
-    Broadcasts an array to a specified shape, by either manipulating chunk keys or copying chunk manifest entries.
+    Broadcasts a ManifestArray to a specified shape, by either adjusting chunk keys or copying chunk manifest entries.
     """
 
-    if len(x.shape) > len(shape):
-        raise ValueError("input operand has more dimensions than allowed")
-
-    # numpy broadcasting algorithm requires us to start by comparing the length of the final axes and work backwards
-    result = x
-    for axis, d, d_requested in itertools.zip_longest(
-        reversed(range(len(shape))), reversed(x.shape), reversed(shape), fillvalue=None
-    ):
-        # len(shape) check above ensures this can't be type None
-        d_requested = cast(int, d_requested)
-
-        if d == d_requested:
-            pass
-        elif d is None:
-            if result.shape == ():
-                # scalars are a special case because their manifests already have a chunk key with one dimension
-                # see https://github.com/TomNicholas/VirtualiZarr/issues/100#issuecomment-2097058282
-                result = _broadcast_scalar(result, new_axis_length=d_requested)
-            else:
-                # stack same array upon itself d_requested number of times, which inserts a new axis at axis=0
-                result = stack([result] * d_requested, axis=0)
-        elif d == 1:
-            # concatenate same array upon itself d_requested number of times along existing axis
-            result = concatenate([result] * d_requested, axis=axis)
-        else:
-            raise ValueError(
-                f"Array with shape {x.shape} cannot be broadcast to shape {shape}"
-            )
-
-    return result
-
-
-def _broadcast_scalar(x: "ManifestArray", new_axis_length: int) -> "ManifestArray":
-    """
-    Add an axis to a scalar ManifestArray, but without adding a new axis to the keys of the chunk manifest.
+    from .array import ManifestArray
 
-    This is not the same as concatenation, because there is no existing axis along which to concatenate.
-    It's also not the same as stacking, because we don't want to insert a new axis into the chunk keys.
+    new_shape = shape
 
-    Scalars are a special case because their manifests still have a chunk key with one dimension.
-    See https://github.com/TomNicholas/VirtualiZarr/issues/100#issuecomment-2097058282
-    """
+    # check its actually possible to broadcast to this new shape
+    mutually_broadcastable_shape = np.broadcast_shapes(x.shape, new_shape)
+    if mutually_broadcastable_shape != new_shape:
+        # we're not trying to broadcast both shapes to a third shape
+        raise ValueError(
+            f"array of shape {x.shape} cannot be broadcast to shape {new_shape}"
+        )
 
-    from .array import ManifestArray
+    # new chunk_shape is old chunk_shape with singleton dimensions pre-pended
+    # (chunk shape can never change by more than adding length-1 axes because each chunk represents a fixed number of array elements)
+    old_chunk_shape = x.chunks
+    new_chunk_shape = _prepend_singleton_dimensions(
+        old_chunk_shape, ndim=len(new_shape)
+    )
 
-    new_shape = (new_axis_length,)
-    new_chunks = (new_axis_length,)
+    # find new chunk grid shape by dividing new array shape by new chunk shape
+    new_chunk_grid_shape = tuple(
+        ceildiv(axis_length, chunk_length)
+        for axis_length, chunk_length in zip(new_shape, new_chunk_shape)
+    )
 
-    concatenated_manifest = concat_manifests(
-        [x.manifest] * new_axis_length,
-        axis=0,
+    # do broadcasting of entries in manifest
+    broadcasted_paths = np.broadcast_to(
+        x.manifest._paths,
+        shape=new_chunk_grid_shape,
+    )
+    broadcasted_offsets = np.broadcast_to(
+        x.manifest._offsets,
+        shape=new_chunk_grid_shape,
+    )
+    broadcasted_lengths = np.broadcast_to(
+        x.manifest._lengths,
+        shape=new_chunk_grid_shape,
+    )
+    broadcasted_manifest = ChunkManifest.from_arrays(
+        paths=broadcasted_paths,
+        offsets=broadcasted_offsets,
+        lengths=broadcasted_lengths,
     )
 
-    new_zarray = ZArray(
-        chunks=new_chunks,
-        compressor=x.zarray.compressor,
-        dtype=x.dtype,
-        fill_value=x.zarray.fill_value,
-        filters=x.zarray.filters,
+    new_zarray = x.zarray.replace(
+        chunks=new_chunk_shape,
         shape=new_shape,
-        order=x.zarray.order,
-        zarr_format=x.zarray.zarr_format,
     )
 
-    return ManifestArray(chunkmanifest=concatenated_manifest, zarray=new_zarray)
+    return ManifestArray(chunkmanifest=broadcasted_manifest, zarray=new_zarray)
+
+
+def _prepend_singleton_dimensions(shape: tuple[int, ...], ndim: int) -> tuple[int, ...]:
+    """Prepend as many new length-1 axes to shape as necessary such that the result has ndim number of axes."""
+    n_prepended_dims = ndim - len(shape)
+    return tuple([1] * n_prepended_dims + list(shape))
 
 
 # TODO broadcast_arrays, squeeze, permute_dims