Skip to content
This repository has been archived by the owner on Nov 12, 2024. It is now read-only.

Commit

Permalink
Merge pull request #2 from earth-mover/matt/icechunk-encoding
Browse files Browse the repository at this point in the history
Matt/icechunk encoding
  • Loading branch information
TomNicholas authored Oct 4, 2024
2 parents e9c1287 + 9a03245 commit 9676485
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 20 deletions.
20 changes: 14 additions & 6 deletions virtualizarr/tests/test_writers/test_icechunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def test_write_new_virtual_variable(
assert arr.order == "C"
assert arr.fill_value == 0
# TODO check compressor, filters?
#

# check array attrs
# TODO somehow this is broken by setting the dimension names???
Expand Down Expand Up @@ -113,7 +114,6 @@ def test_set_single_virtual_ref_without_encoding(
# note: we don't need to test that committing works, because now we have confirmed
# the refs are in the store (even uncommitted) it's icechunk's problem to manage them now.

@pytest.mark.xfail(reason="Test doesn't account for scale factor encoding yet")
def test_set_single_virtual_ref_with_encoding(
self, icechunk_filestore: "IcechunkStore", netcdf4_file: Path
):
Expand All @@ -136,7 +136,7 @@ def test_set_single_virtual_ref_with_encoding(
chunkmanifest=manifest,
zarray=zarray,
)
air = Variable(data=ma, dims=["time", "lat", "lon"])
air = Variable(data=ma, dims=["time", "lat", "lon"], encoding={"scale_factor": 0.01})
vds = Dataset(
{"air": air},
)
Expand All @@ -145,21 +145,29 @@ def test_set_single_virtual_ref_with_encoding(

root_group = group(store=icechunk_filestore)
air_array = root_group["air"]
print(air_array)

# check array metadata
assert air_array.shape == (2920, 25, 53)
assert air_array.chunks == (2920, 25, 53)
assert air_array.dtype == np.dtype("int16")
assert air_array.attrs['scale_factor'] == 0.01

# xarray performs this when cf_decoding is True, but we are not loading
# with xarray here so we scale it manually.
scale_factor = air_array.attrs['scale_factor']
scaled_air_array = air_array[:] * scale_factor

# check chunk references
# TODO we can't explicitly check that the path/offset/length is correct because icechunk doesn't yet expose any get_virtual_refs method

expected_ds = open_dataset(netcdf4_file)
expected_air_array = expected_ds["air"].to_numpy()
npt.assert_equal(air_array, expected_air_array)
npt.assert_equal(scaled_air_array, expected_air_array)

# note: we don't need to test that committing works, because now we have confirmed
# the refs are in the store (even uncommitted) it's icechunk's problem to manage them now.


# TODO get test with encoding working

# TODO test writing grids of multiple chunks

# TODO test writing to a group that isn't the root group
Expand Down
32 changes: 18 additions & 14 deletions virtualizarr/writers/icechunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
from typing import TYPE_CHECKING

import numpy as np
from xarray import Dataset
from xarray import Dataset, conventions
from xarray.backends.zarr import encode_zarr_attr_value
from xarray.core.variable import Variable
from zarr import Group

Expand All @@ -15,11 +16,11 @@

VALID_URI_PREFIXES = {
"s3://",
"gs://",
"azure://",
"r2://",
"cos://",
"minio://",
# "gs://",
# "azure://",
# "r2://",
# "cos://",
# "minio://",
"file:///",
}

Expand Down Expand Up @@ -50,7 +51,7 @@ def dataset_to_icechunk(ds: Dataset, store: "IcechunkStore") -> None:
# TODO this is Frozen, the API for setting attributes must be something else
# root_group.attrs = ds.attrs
for k, v in ds.attrs.items():
root_group.attrs[k] = v
root_group.attrs[k] = encode_zarr_attr_value(v)

asyncio.run(
write_variables_to_icechunk_group(
Expand Down Expand Up @@ -108,7 +109,6 @@ async def write_virtual_variable_to_icechunk(
var: Variable,
) -> None:
"""Write a single virtual variable into an icechunk store"""

ma = var.data
zarray = ma.zarray

Expand All @@ -118,17 +118,21 @@ async def write_virtual_variable_to_icechunk(
shape=zarray.shape,
chunk_shape=zarray.chunks,
dtype=encode_dtype(zarray.dtype),
codecs=zarray._v3_codec_pipeline(),
dimension_names=var.dims,
fill_value=zarray.fill_value,
# TODO fill_value?
# TODO order?
# TODO zarr format?
# TODO compressors?
)

# TODO it would be nice if we could assign directly to the .attrs property
for k, v in var.attrs.items():
arr.attrs[k] = v
# TODO we should probably be doing some encoding of those attributes?
arr.attrs["DIMENSION_NAMES"] = var.dims
arr.attrs[k] = encode_zarr_attr_value(v)
arr.attrs["DIMENSION_NAMES"] = encode_zarr_attr_value(var.dims)

_encoding_keys = {"_FillValue", "missing_value", "scale_factor", "add_offset"}
for k, v in var.encoding.items():
if k in _encoding_keys:
arr.attrs[k] = encode_zarr_attr_value(v)

await write_manifest_virtual_refs(
store=store,
Expand Down

0 comments on commit 9676485

Please sign in to comment.