zarr-developers · TomNicholas · Oct 22, 2024 · Sep 27, 2024 · Sep 27, 2024 · Sep 27, 2024
diff --git a/ci/upstream.yml b/ci/upstream.yml
@@ -24,7 +24,7 @@ dependencies:
   - fsspec
   - pip
   - pip:
-    - zarr==3.0.0b1  # beta release of zarr-python v3
+    - icechunk # Installs zarr v3 as dependency
     - git+https://github.com/pydata/xarray@zarr-v3  # zarr-v3 compatibility branch
     - git+https://github.com/zarr-developers/numcodecs@zarr3-codecs  # zarr-v3 compatibility branch
     # - git+https://github.com/fsspec/kerchunk@main  # kerchunk is currently incompatible with zarr-python v3 (https://github.com/fsspec/kerchunk/pull/516)
diff --git a/conftest.py b/conftest.py
@@ -1,6 +1,8 @@
 import h5py
+import numpy as np
 import pytest
 import xarray as xr
+from xarray.core.variable import Variable
 
 
 def pytest_addoption(parser):
@@ -96,3 +98,16 @@ def hdf5_scalar(tmpdir):
     dataset = f.create_dataset("scalar", data=0.1, dtype="float32")
     dataset.attrs["scalar"] = "true"
     return filepath
+
+
+@pytest.fixture
+def simple_netcdf4(tmpdir):
+    """Write a 3x4 int32 variable ``foo`` to ``simple.nc`` under tmpdir; return the path."""
+    filepath = f"{tmpdir}/simple.nc"
+
+    # values 0..11 laid out over dims ("x", "y")
+    values = np.arange(12, dtype=np.dtype("int32")).reshape(3, 4)
+    foo = Variable(data=values, dims=["x", "y"])
+
+    xr.Dataset({"foo": foo}).to_netcdf(filepath)
+    return filepath
diff --git a/docs/api.rst b/docs/api.rst
@@ -39,6 +39,7 @@ Serialization
 
     VirtualiZarrDatasetAccessor.to_kerchunk
     VirtualiZarrDatasetAccessor.to_zarr
+    VirtualiZarrDatasetAccessor.to_icechunk
 
 
 Rewriting

diff --git a/docs/releases.rst b/docs/releases.rst
@@ -31,6 +31,9 @@ New Features
 - Support empty files (:pull:`260`)
   By `Justus Magin <https://github.com/keewis>`_.
 
+- Can write virtual datasets to Icechunk stores using `virtualize.to_icechunk` (:pull:`256`)
+  By `Matt Iannucci <https://github.com/mpiannucci>`_.
+
 Breaking changes
 ~~~~~~~~~~~~~~~~
 

diff --git a/docs/usage.md b/docs/usage.md
@@ -396,6 +396,23 @@ combined_ds = xr.open_dataset('combined.parq', engine="kerchunk")
 
 By default references are placed in separate parquet file when the total number of references exceeds `record_size`. If there are fewer than `categorical_threshold` unique urls referenced by a particular variable, url will be stored as a categorical variable.
 
+### Writing to an Icechunk Store
+
+We can also write these references out as an [IcechunkStore](https://icechunk.io/). `Icechunk` is an open-source, cloud-native, transactional tensor storage engine that is compatible with zarr version 3. To export our virtual dataset to an `Icechunk` Store, we simply use the {py:meth}`ds.virtualize.to_icechunk <virtualizarr.xarray.VirtualiZarrDatasetAccessor.to_icechunk>` accessor method.
+
+```python
+# create an icechunk store
+from icechunk import IcechunkStore, StorageConfig, StoreConfig, VirtualRefConfig
+storage = StorageConfig.filesystem(str('combined'))
+store = IcechunkStore.create(storage=storage, mode="w", config=StoreConfig(
+    virtual_ref_config=VirtualRefConfig.s3_anonymous(region='us-east-1'),
+))
+
+combined_vds.virtualize.to_icechunk(store)
+```
+
+See the [Icechunk documentation](https://icechunk.io/icechunk-python/virtual/#creating-a-virtual-dataset-with-virtualizarr) for more details.
+
 ### Writing as Zarr
 
 Alternatively, we can write these references out as an actual Zarr store, at least one that is compliant with the [proposed "Chunk Manifest" ZEP](https://github.com/zarr-developers/zarr-specs/issues/287). To do this we simply use the {py:meth}`ds.virtualize.to_zarr <virtualizarr.xarray.VirtualiZarrDatasetAccessor.to_zarr>` accessor method.

diff --git a/virtualizarr/accessor.py b/virtualizarr/accessor.py
@@ -1,5 +1,6 @@
 from pathlib import Path
 from typing import (
+    TYPE_CHECKING,
     Callable,
     Literal,
     overload,
@@ -12,6 +13,9 @@
 from virtualizarr.writers.kerchunk import dataset_to_kerchunk_refs
 from virtualizarr.writers.zarr import dataset_to_zarr
 
+if TYPE_CHECKING:
+    from icechunk import IcechunkStore  # type: ignore[import-not-found]
+
 
 @register_dataset_accessor("virtualize")
 class VirtualiZarrDatasetAccessor:
@@ -39,6 +43,20 @@ def to_zarr(self, storepath: str) -> None:
         """
         dataset_to_zarr(self.ds, storepath)
 
+    def to_icechunk(self, store: "IcechunkStore") -> None:
+        """
+        Write an xarray dataset to an Icechunk store.
+
+        Any variables backed by ManifestArray objects will be written as virtual references, any other variables will be loaded into memory before their binary chunk data is written into the store.
+
+        Parameters
+        ----------
+        store : IcechunkStore
+        """
+        from virtualizarr.writers.icechunk import dataset_to_icechunk  # NOTE(review): function-scope import, presumably so icechunk stays optional — confirm
+
+        dataset_to_icechunk(self.ds, store)
+
     @overload
     def to_kerchunk(
         self, filepath: None, format: Literal["dict"]

diff --git a/virtualizarr/readers/zarr_v3.py b/virtualizarr/readers/zarr_v3.py
@@ -150,5 +150,7 @@ def _configurable_to_num_codec_config(configurable: dict) -> dict:
     """
     configurable_copy = configurable.copy()
     codec_id = configurable_copy.pop("name")
+    if codec_id.startswith("numcodecs."):
+        codec_id = codec_id[len("numcodecs.") :]
     configuration = configurable_copy.pop("configuration")
     return numcodecs.get_codec({"id": codec_id, **configuration}).get_config()
diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py
@@ -27,7 +27,7 @@ def test_kerchunk_roundtrip_in_memory_no_concat():
             chunks=(2, 2),
             compressor=None,
             filters=None,
-            fill_value=np.nan,
+            fill_value=None,
             order="C",
         ),
         chunkmanifest=manifest,

diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py
@@ -47,7 +47,7 @@ def test_create_manifestarray_from_kerchunk_refs(self):
         assert marr.chunks == (2, 3)
         assert marr.dtype == np.dtype("int64")
         assert marr.zarray.compressor is None
-        assert marr.zarray.fill_value is np.nan
+        assert marr.zarray.fill_value == 0
         assert marr.zarray.filters is None
         assert marr.zarray.order == "C"
 

diff --git a/virtualizarr/tests/test_readers/test_kerchunk.py b/virtualizarr/tests/test_readers/test_kerchunk.py
@@ -37,7 +37,7 @@ def test_dataset_from_df_refs():
 
     assert da.data.zarray.compressor is None
     assert da.data.zarray.filters is None
-    assert da.data.zarray.fill_value is np.nan
+    assert da.data.zarray.fill_value == 0
     assert da.data.zarray.order == "C"
 
     assert da.data.manifest.dict() == {

diff --git a/virtualizarr/tests/test_writers/conftest.py b/virtualizarr/tests/test_writers/conftest.py
@@ -0,0 +1,27 @@
+import numpy as np
+import pytest
+from xarray import Dataset
+from xarray.core.variable import Variable
+
+from virtualizarr.manifests import ChunkManifest, ManifestArray
+
+
+@pytest.fixture
+def vds_with_manifest_arrays() -> Dataset:
+    """Return a Dataset whose only variable ``a`` (dims x, y) wraps a 2x3 ManifestArray."""
+    manifest = ChunkManifest(
+        entries={"0.0": {"path": "/test.nc", "offset": 6144, "length": 48}}
+    )
+    zarray = {
+        "shape": (2, 3),
+        "dtype": np.dtype("<i8"),
+        "chunks": (2, 3),
+        "compressor": {"id": "zlib", "level": 1},
+        "filters": None,
+        "fill_value": 0,
+        "order": "C",
+        "zarr_format": 3,
+    }
+    marr = ManifestArray(chunkmanifest=manifest, zarray=zarray)
+    a = Variable(dims=["x", "y"], data=marr, attrs={"units": "km"})
+    return Dataset({"a": a}, attrs={"something": 0})