Skip to content
This repository has been archived by the owner on Nov 12, 2024. It is now read-only.

Matt/icechunk encoding #2

Merged
merged 6 commits into from
Oct 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 14 additions & 6 deletions virtualizarr/tests/test_writers/test_icechunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def test_write_new_virtual_variable(
assert arr.order == "C"
assert arr.fill_value == 0
# TODO check compressor, filters?
#

# check array attrs
# TODO somehow this is broken by setting the dimension names???
Expand Down Expand Up @@ -113,7 +114,6 @@ def test_set_single_virtual_ref_without_encoding(
# note: we don't need to test that committing works, because now we have confirmed
# the refs are in the store (even uncommitted) it's icechunk's problem to manage them now.

@pytest.mark.xfail(reason="Test doesn't account for scale factor encoding yet")
def test_set_single_virtual_ref_with_encoding(
self, icechunk_filestore: "IcechunkStore", netcdf4_file: Path
):
Expand All @@ -136,7 +136,7 @@ def test_set_single_virtual_ref_with_encoding(
chunkmanifest=manifest,
zarray=zarray,
)
air = Variable(data=ma, dims=["time", "lat", "lon"])
air = Variable(data=ma, dims=["time", "lat", "lon"], encoding={"scale_factor": 0.01})
vds = Dataset(
{"air": air},
)
Expand All @@ -145,21 +145,29 @@ def test_set_single_virtual_ref_with_encoding(

root_group = group(store=icechunk_filestore)
air_array = root_group["air"]
print(air_array)

# check array metadata
assert air_array.shape == (2920, 25, 53)
assert air_array.chunks == (2920, 25, 53)
assert air_array.dtype == np.dtype("int16")
assert air_array.attrs['scale_factor'] == 0.01

# xarray performs this when cf_decoding is True, but we are not loading
# with xarray here so we scale it manually.
scale_factor = air_array.attrs['scale_factor']
scaled_air_array = air_array[:] * scale_factor

# check chunk references
# TODO we can't explicitly check that the path/offset/length is correct because icechunk doesn't yet expose any get_virtual_refs method

expected_ds = open_dataset(netcdf4_file)
expected_air_array = expected_ds["air"].to_numpy()
npt.assert_equal(air_array, expected_air_array)
npt.assert_equal(scaled_air_array, expected_air_array)

# note: we don't need to test that committing works, because now we have confirmed
# the refs are in the store (even uncommitted) it's icechunk's problem to manage them now.


# TODO get test with encoding working

# TODO test writing grids of multiple chunks

# TODO test writing to a group that isn't the root group
Expand Down
32 changes: 18 additions & 14 deletions virtualizarr/writers/icechunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
from typing import TYPE_CHECKING

import numpy as np
from xarray import Dataset
from xarray import Dataset, conventions
from xarray.backends.zarr import encode_zarr_attr_value
from xarray.core.variable import Variable
from zarr import Group

Expand All @@ -15,11 +16,11 @@

VALID_URI_PREFIXES = {
"s3://",
"gs://",
"azure://",
"r2://",
"cos://",
"minio://",
# "gs://",
# "azure://",
# "r2://",
# "cos://",
# "minio://",
"file:///",
}

Expand Down Expand Up @@ -50,7 +51,7 @@ def dataset_to_icechunk(ds: Dataset, store: "IcechunkStore") -> None:
# TODO this is Frozen, the API for setting attributes must be something else
# root_group.attrs = ds.attrs
for k, v in ds.attrs.items():
root_group.attrs[k] = v
root_group.attrs[k] = encode_zarr_attr_value(v)

asyncio.run(
write_variables_to_icechunk_group(
Expand Down Expand Up @@ -108,7 +109,6 @@ async def write_virtual_variable_to_icechunk(
var: Variable,
) -> None:
"""Write a single virtual variable into an icechunk store"""

ma = var.data
zarray = ma.zarray

Expand All @@ -118,17 +118,21 @@ async def write_virtual_variable_to_icechunk(
shape=zarray.shape,
chunk_shape=zarray.chunks,
dtype=encode_dtype(zarray.dtype),
codecs=zarray._v3_codec_pipeline(),
dimension_names=var.dims,
fill_value=zarray.fill_value,
# TODO fill_value?
# TODO order?
# TODO zarr format?
# TODO compressors?
)

# TODO it would be nice if we could assign directly to the .attrs property
for k, v in var.attrs.items():
arr.attrs[k] = v
# TODO we should probably be doing some encoding of those attributes?
arr.attrs["DIMENSION_NAMES"] = var.dims
arr.attrs[k] = encode_zarr_attr_value(v)
arr.attrs["DIMENSION_NAMES"] = encode_zarr_attr_value(var.dims)

_encoding_keys = {"_FillValue", "missing_value", "scale_factor", "add_offset"}
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Filters and compressors are packed into the codec pipeline, but CF encoding parameters (`_FillValue`, `missing_value`, `scale_factor`, `add_offset`) are not, so they must be written out explicitly as array attributes.

for k, v in var.encoding.items():
if k in _encoding_keys:
arr.attrs[k] = encode_zarr_attr_value(v)

await write_manifest_virtual_refs(
store=store,
Expand Down