diff --git a/virtualizarr/tests/test_writers/test_icechunk.py b/virtualizarr/tests/test_writers/test_icechunk.py index 6c905f0..72c4fe6 100644 --- a/virtualizarr/tests/test_writers/test_icechunk.py +++ b/virtualizarr/tests/test_writers/test_icechunk.py @@ -63,6 +63,7 @@ def test_write_new_virtual_variable( assert arr.order == "C" assert arr.fill_value == 0 # TODO check compressor, filters? + # # check array attrs # TODO somehow this is broken by setting the dimension names??? @@ -113,7 +114,6 @@ def test_set_single_virtual_ref_without_encoding( # note: we don't need to test that committing works, because now we have confirmed # the refs are in the store (even uncommitted) it's icechunk's problem to manage them now. - @pytest.mark.xfail(reason="Test doesn't account for scale factor encoding yet") def test_set_single_virtual_ref_with_encoding( self, icechunk_filestore: "IcechunkStore", netcdf4_file: Path ): @@ -136,7 +136,7 @@ def test_set_single_virtual_ref_with_encoding( chunkmanifest=manifest, zarray=zarray, ) - air = Variable(data=ma, dims=["time", "lat", "lon"]) + air = Variable(data=ma, dims=["time", "lat", "lon"], encoding={"scale_factor": 0.01}) vds = Dataset( {"air": air}, ) @@ -145,21 +145,29 @@ def test_set_single_virtual_ref_with_encoding( root_group = group(store=icechunk_filestore) air_array = root_group["air"] - print(air_array) + + # check array metadata + assert air_array.shape == (2920, 25, 53) + assert air_array.chunks == (2920, 25, 53) + assert air_array.dtype == np.dtype("int16") + assert air_array.attrs['scale_factor'] == 0.01 + + # xarray performs this when decode_cf=True, but we are not loading + # with xarray here so we scale it manually.
+ scale_factor = air_array.attrs['scale_factor'] + scaled_air_array = air_array[:] * scale_factor # check chunk references # TODO we can't explicitly check that the path/offset/length is correct because icechunk doesn't yet expose any get_virtual_refs method expected_ds = open_dataset(netcdf4_file) expected_air_array = expected_ds["air"].to_numpy() - npt.assert_equal(air_array, expected_air_array) + npt.assert_equal(scaled_air_array, expected_air_array) # note: we don't need to test that committing works, because now we have confirmed # the refs are in the store (even uncommitted) it's icechunk's problem to manage them now. -# TODO get test with encoding working - # TODO test writing grids of multiple chunks # TODO test writing to a group that isn't the root group diff --git a/virtualizarr/writers/icechunk.py b/virtualizarr/writers/icechunk.py index fea3e26..84fbe1b 100644 --- a/virtualizarr/writers/icechunk.py +++ b/virtualizarr/writers/icechunk.py @@ -2,7 +2,8 @@ from typing import TYPE_CHECKING import numpy as np -from xarray import Dataset +from xarray import Dataset, conventions +from xarray.backends.zarr import encode_zarr_attr_value from xarray.core.variable import Variable from zarr import Group @@ -15,11 +16,11 @@ VALID_URI_PREFIXES = { "s3://", - "gs://", - "azure://", - "r2://", - "cos://", - "minio://", + # "gs://", + # "azure://", + # "r2://", + # "cos://", + # "minio://", "file:///", } @@ -50,7 +51,7 @@ def dataset_to_icechunk(ds: Dataset, store: "IcechunkStore") -> None: # TODO this is Frozen, the API for setting attributes must be something else # root_group.attrs = ds.attrs for k, v in ds.attrs.items(): - root_group.attrs[k] = v + root_group.attrs[k] = encode_zarr_attr_value(v) asyncio.run( write_variables_to_icechunk_group( @@ -108,7 +109,6 @@ async def write_virtual_variable_to_icechunk( var: Variable, ) -> None: """Write a single virtual variable into an icechunk store""" - ma = var.data zarray = ma.zarray @@ -118,17 +118,21 @@ async def 
write_virtual_variable_to_icechunk( shape=zarray.shape, chunk_shape=zarray.chunks, dtype=encode_dtype(zarray.dtype), + codecs=zarray._v3_codec_pipeline(), + dimension_names=var.dims, + fill_value=zarray.fill_value, # TODO fill_value? - # TODO order? - # TODO zarr format? - # TODO compressors? ) # TODO it would be nice if we could assign directly to the .attrs property for k, v in var.attrs.items(): - arr.attrs[k] = v - # TODO we should probably be doing some encoding of those attributes? - arr.attrs["DIMENSION_NAMES"] = var.dims + arr.attrs[k] = encode_zarr_attr_value(v) + arr.attrs["DIMENSION_NAMES"] = encode_zarr_attr_value(var.dims) + + _encoding_keys = {"_FillValue", "missing_value", "scale_factor", "add_offset"} + for k, v in var.encoding.items(): + if k in _encoding_keys: + arr.attrs[k] = encode_zarr_attr_value(v) await write_manifest_virtual_refs( store=store,