From 2fe301279c4d0ed08ea651180aa26dac83ac3f45 Mon Sep 17 00:00:00 2001
From: Matthew Iannucci
Date: Wed, 2 Oct 2024 12:07:29 -0400
Subject: [PATCH 1/5] Work on encoding test

---
 virtualizarr/tests/test_writers/test_icechunk.py | 14 +++++++++++---
 virtualizarr/writers/icechunk.py                 |  6 ++++++
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/virtualizarr/tests/test_writers/test_icechunk.py b/virtualizarr/tests/test_writers/test_icechunk.py
index 3861e00..53f4288 100644
--- a/virtualizarr/tests/test_writers/test_icechunk.py
+++ b/virtualizarr/tests/test_writers/test_icechunk.py
@@ -63,6 +63,7 @@ def test_write_new_virtual_variable(
         assert arr.order == "C"
         assert arr.fill_value == 0
         # TODO check compressor, filters?
+        #

         # check array attrs
         # TODO somehow this is broken by setting the dimension names???
@@ -93,7 +94,7 @@ def test_set_single_virtual_ref(
             chunkmanifest=manifest,
             zarray=zarray,
         )
-        air = Variable(data=ma, dims=["time", "lat", "lon"])
+        air = Variable(data=ma, dims=["time", "lat", "lon"], encoding={"scale_factor": 0.01})
         vds = Dataset(
             {"air": air},
         )
@@ -102,13 +103,20 @@ def test_set_single_virtual_ref(
         root_group = group(store=icechunk_filestore)
         air_array = root_group["air"]
-        print(air_array)
+
+        # check array metadata
+        assert air_array.shape == (2920, 25, 53)
+        assert air_array.chunks == (2920, 25, 53)
+        assert air_array.dtype == np.dtype("int16")
+        assert air_array.attrs['scale_factor'] == 0.01
+
+        scale_factor = air_array.attrs['scale_factor']

         # check chunk references
         # TODO we can't explicitly check that the path/offset/length is correct because icechunk doesn't yet expose any get_virtual_refs method

         expected_ds = open_dataset(netcdf4_file)
-        expected_air_array = expected_ds["air"].to_numpy()
+        expected_air_array = expected_ds["air"].to_numpy() / scale_factor
         npt.assert_equal(air_array, expected_air_array)

         # note: we don't need to test that committing works, because now we have confirmed
diff --git a/virtualizarr/writers/icechunk.py b/virtualizarr/writers/icechunk.py
index fea3e26..f1c2f4f 100644
--- a/virtualizarr/writers/icechunk.py
+++ b/virtualizarr/writers/icechunk.py
@@ -118,6 +118,7 @@ async def write_virtual_variable_to_icechunk(
         shape=zarray.shape,
         chunk_shape=zarray.chunks,
         dtype=encode_dtype(zarray.dtype),
+        codecs=zarray._v3_codec_pipeline(),
         # TODO fill_value?
         # TODO order?
         # TODO zarr format?
@@ -130,6 +131,11 @@ async def write_virtual_variable_to_icechunk(
     # TODO it would be nice if we could assign directly to the .attrs property
     for k, v in var.attrs.items():
         arr.attrs[k] = v
     # TODO we should probably be doing some encoding of those attributes?
arr.attrs["DIMENSION_NAMES"] = var.dims + _encoding_keys = {"_FillValue", "missing_value", "scale_factor", "add_offset"} + for k, v in var.encoding.items(): + if k in _encoding_keys: + arr.attrs[k] = v + await write_manifest_virtual_refs( store=store, group=group, From 8aa6034cd39d657984d25c29104eb177a93606c2 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Wed, 2 Oct 2024 13:09:28 -0400 Subject: [PATCH 2/5] Update test to match --- virtualizarr/tests/test_writers/test_icechunk.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/virtualizarr/tests/test_writers/test_icechunk.py b/virtualizarr/tests/test_writers/test_icechunk.py index 03bc8b6..ae6837e 100644 --- a/virtualizarr/tests/test_writers/test_icechunk.py +++ b/virtualizarr/tests/test_writers/test_icechunk.py @@ -114,7 +114,6 @@ def test_set_single_virtual_ref_without_encoding( # note: we don't need to test that committing works, because now we have confirmed # the refs are in the store (even uncommitted) it's icechunk's problem to manage them now. - @pytest.mark.xfail(reason="Test doesn't account for scale factor encoding yet") def test_set_single_virtual_ref_with_encoding( self, icechunk_filestore: "IcechunkStore", netcdf4_file: Path ): @@ -152,15 +151,15 @@ def test_set_single_virtual_ref_with_encoding( assert air_array.chunks == (2920, 25, 53) assert air_array.dtype == np.dtype("int16") assert air_array.attrs['scale_factor'] == 0.01 - scale_factor = air_array.attrs['scale_factor'] + scaled_air_array = air_array[:] * scale_factor # check chunk references # TODO we can't explicitly check that the path/offset/length is correct because icechunk doesn't yet expose any get_virtual_refs method expected_ds = open_dataset(netcdf4_file) - expected_air_array = expected_ds["air"].to_numpy() / scale_factor - npt.assert_equal(air_array, expected_air_array) + expected_air_array = expected_ds["air"].to_numpy() + npt.assert_equal(scaled_air_array, expected_air_array) # note: we don't need to test that committing works, because now we have confirmed # the refs are in the store (even uncommitted) it's icechunk's problem to manage them now. From aa2d415cfeab567859987cbfd44f463833801fad Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Wed, 2 Oct 2024 13:16:45 -0400 Subject: [PATCH 3/5] Quick comment --- virtualizarr/tests/test_writers/test_icechunk.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/virtualizarr/tests/test_writers/test_icechunk.py b/virtualizarr/tests/test_writers/test_icechunk.py index ae6837e..72c4fe6 100644 --- a/virtualizarr/tests/test_writers/test_icechunk.py +++ b/virtualizarr/tests/test_writers/test_icechunk.py @@ -151,6 +151,9 @@ def test_set_single_virtual_ref_with_encoding( assert air_array.chunks == (2920, 25, 53) assert air_array.dtype == np.dtype("int16") assert air_array.attrs['scale_factor'] == 0.01 + + # xarray performs this when cf_decoding is True, but we are not loading + # with xarray here so we scale it manually. scale_factor = air_array.attrs['scale_factor'] scaled_air_array = air_array[:] * scale_factor @@ -165,8 +168,6 @@ def test_set_single_virtual_ref_with_encoding( # the refs are in the store (even uncommitted) it's icechunk's problem to manage them now. 

-# TODO get test with encoding working
-
 # TODO test writing grids of multiple chunks

 # TODO test writing to a group that isn't the root group

From 7e4e2ce0ebc864b8421e07c59d48eba317df94fe Mon Sep 17 00:00:00 2001
From: Matthew Iannucci
Date: Wed, 2 Oct 2024 14:44:08 -0400
Subject: [PATCH 4/5] more comprehensive

---
 virtualizarr/writers/icechunk.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/virtualizarr/writers/icechunk.py b/virtualizarr/writers/icechunk.py
index f1c2f4f..2b1454e 100644
--- a/virtualizarr/writers/icechunk.py
+++ b/virtualizarr/writers/icechunk.py
@@ -119,10 +119,9 @@ async def write_virtual_variable_to_icechunk(
         chunk_shape=zarray.chunks,
         dtype=encode_dtype(zarray.dtype),
         codecs=zarray._v3_codec_pipeline(),
+        dimension_names=var.dims,
+        fill_value=zarray.fill_value,
         # TODO fill_value?
-        # TODO order?
-        # TODO zarr format?
-        # TODO compressors?
     )

     # TODO it would be nice if we could assign directly to the .attrs property

From 9a03245991f86acd9fffb45b84aa902077ca86d3 Mon Sep 17 00:00:00 2001
From: Matthew Iannucci
Date: Thu, 3 Oct 2024 13:53:32 -0400
Subject: [PATCH 5/5] add attribute encoding

---
 virtualizarr/writers/icechunk.py | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/virtualizarr/writers/icechunk.py b/virtualizarr/writers/icechunk.py
index 2b1454e..84fbe1b 100644
--- a/virtualizarr/writers/icechunk.py
+++ b/virtualizarr/writers/icechunk.py
@@ -2,7 +2,8 @@
 from typing import TYPE_CHECKING

 import numpy as np
-from xarray import Dataset
+from xarray import Dataset, conventions
+from xarray.backends.zarr import encode_zarr_attr_value
 from xarray.core.variable import Variable
 from zarr import Group

@@ -15,11 +16,11 @@

 VALID_URI_PREFIXES = {
     "s3://",
-    "gs://",
-    "azure://",
-    "r2://",
-    "cos://",
-    "minio://",
+    # "gs://",
+    # "azure://",
+    # "r2://",
+    # "cos://",
+    # "minio://",
     "file:///",
 }

@@ -50,7 +51,7 @@ def dataset_to_icechunk(ds: Dataset, store: "IcechunkStore") -> None:
     # TODO this is Frozen, the API for setting attributes must be something else
     # root_group.attrs = ds.attrs
     for k, v in ds.attrs.items():
-        root_group.attrs[k] = v
+        root_group.attrs[k] = encode_zarr_attr_value(v)

     asyncio.run(
         write_variables_to_icechunk_group(
@@ -108,7 +109,6 @@ async def write_virtual_variable_to_icechunk(
     var: Variable,
 ) -> None:
     """Write a single virtual variable into an icechunk store"""
-
     ma = var.data
     zarray = ma.zarray

     # TODO it would be nice if we could assign directly to the .attrs property
     for k, v in var.attrs.items():
-        arr.attrs[k] = v
-    # TODO we should probably be doing some encoding of those attributes?
-    arr.attrs["DIMENSION_NAMES"] = var.dims
+        arr.attrs[k] = encode_zarr_attr_value(v)
+    arr.attrs["DIMENSION_NAMES"] = encode_zarr_attr_value(var.dims)

     _encoding_keys = {"_FillValue", "missing_value", "scale_factor", "add_offset"}
     for k, v in var.encoding.items():
         if k in _encoding_keys:
-            arr.attrs[k] = v
+            arr.attrs[k] = encode_zarr_attr_value(v)

     await write_manifest_virtual_refs(
         store=store,
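
The patches above only store the CF encoding attributes (scale_factor, add_offset, _FillValue, missing_value) on the icechunk array; anything reading the raw values back still has to apply them, which is what the test does by hand with scale_factor. A minimal sketch of that decoding step, assuming plain numpy values and a dict of attrs (the helper name decode_cf_attrs is illustrative and not part of these patches):

import numpy as np

def decode_cf_attrs(values: np.ndarray, attrs: dict) -> np.ndarray:
    # Mask fill/missing values, then unpack: decoded = raw * scale_factor + add_offset.
    # This roughly mirrors what xarray does on load when decode_cf=True.
    data = values.astype("float64")
    fill = attrs.get("_FillValue", attrs.get("missing_value"))
    if fill is not None:
        data = np.where(values == fill, np.nan, data)
    return data * attrs.get("scale_factor", 1.0) + attrs.get("add_offset", 0.0)

# e.g. for the "air" array in the tests above:
#     decoded = decode_cf_attrs(air_array[:], dict(air_array.attrs))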