Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: implicit fill value initialisation #2799

Merged
merged 7 commits into from
Feb 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/2799.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Enitialise empty chunks to the default fill value during writing and add default fill values for datetime, timedelta, structured, and other (void* fixed size) data types
34 changes: 17 additions & 17 deletions src/zarr/core/codec_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,19 @@ def resolve_batched(codec: Codec, chunk_specs: Iterable[ArraySpec]) -> Iterable[
return [codec.resolve_metadata(chunk_spec) for chunk_spec in chunk_specs]


def fill_value_or_default(chunk_spec: ArraySpec) -> Any:
fill_value = chunk_spec.fill_value
if fill_value is None:
# Zarr V2 allowed `fill_value` to be null in the metadata.
# Zarr V3 requires it to be set. This has already been
# validated when decoding the metadata, but we support reading
# Zarr V2 data and need to support the case where fill_value
# is None.
return _default_fill_value(dtype=chunk_spec.dtype)
else:
return fill_value


@dataclass(frozen=True)
class BatchedCodecPipeline(CodecPipeline):
"""Default codec pipeline.
Expand Down Expand Up @@ -247,17 +260,7 @@ async def read_batch(
if chunk_array is not None:
out[out_selection] = chunk_array
else:
fill_value = chunk_spec.fill_value

if fill_value is None:
# Zarr V2 allowed `fill_value` to be null in the metadata.
# Zarr V3 requires it to be set. This has already been
# validated when decoding the metadata, but we support reading
# Zarr V2 data and need to support the case where fill_value
# is None.
fill_value = _default_fill_value(dtype=chunk_spec.dtype)

out[out_selection] = fill_value
out[out_selection] = fill_value_or_default(chunk_spec)
else:
chunk_bytes_batch = await concurrent_map(
[
Expand All @@ -284,10 +287,7 @@ async def read_batch(
tmp = tmp.squeeze(axis=drop_axes)
out[out_selection] = tmp
else:
fill_value = chunk_spec.fill_value
if fill_value is None:
fill_value = _default_fill_value(dtype=chunk_spec.dtype)
out[out_selection] = fill_value
out[out_selection] = fill_value_or_default(chunk_spec)

def _merge_chunk_array(
self,
Expand All @@ -305,7 +305,7 @@ def _merge_chunk_array(
shape=chunk_spec.shape,
dtype=chunk_spec.dtype,
order=chunk_spec.order,
fill_value=chunk_spec.fill_value,
fill_value=fill_value_or_default(chunk_spec),
)
else:
chunk_array = existing_chunk_array.copy() # make a writable copy
Expand Down Expand Up @@ -394,7 +394,7 @@ async def _read_key(
chunk_array_batch.append(None) # type: ignore[unreachable]
else:
if not chunk_spec.config.write_empty_chunks and chunk_array.all_equal(
chunk_spec.fill_value
fill_value_or_default(chunk_spec)
):
chunk_array_batch.append(None)
else:
Expand Down
8 changes: 8 additions & 0 deletions src/zarr/core/metadata/v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,14 @@ def _default_fill_value(dtype: np.dtype[Any]) -> Any:
return b""
elif dtype.kind in "UO":
return ""
elif dtype.kind in "Mm":
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could use some keen eyes, I've never used these dtypes.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm also unfamiliar with those dtypes. perhaps you could find examples from the tests on the v2 branch?

return dtype.type("nat")
elif dtype.kind == "V":
if dtype.fields is not None:
default = tuple([_default_fill_value(field[0]) for field in dtype.fields.values()])
return np.array([default], dtype=dtype)
else:
return np.zeros(1, dtype=dtype)
else:
return dtype.type(0)

Expand Down
19 changes: 19 additions & 0 deletions tests/test_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,3 +313,22 @@ def test_structured_dtype_roundtrip(fill_value, tmp_path) -> None:
za[...] = a
za = zarr.open_array(store=array_path)
assert (a == za[:]).all()


@pytest.mark.parametrize("fill_value", [None, b"x"], ids=["no_fill", "fill"])
def test_other_dtype_roundtrip(fill_value, tmp_path) -> None:
a = np.array([b"a\0\0", b"bb", b"ccc"], dtype="V7")
array_path = tmp_path / "data.zarr"
za = zarr.create(
shape=(3,),
store=array_path,
chunks=(2,),
fill_value=fill_value,
zarr_format=2,
dtype=a.dtype,
)
if fill_value is not None:
assert (np.array([fill_value] * a.shape[0], dtype=a.dtype) == za[:]).all()
za[...] = a
za = zarr.open_array(store=array_path)
assert (a == za[:]).all()