diff --git a/virtualizarr/writers/icechunk.py b/virtualizarr/writers/icechunk.py index 091835b..965a766 100644 --- a/virtualizarr/writers/icechunk.py +++ b/virtualizarr/writers/icechunk.py @@ -103,7 +103,8 @@ async def write_variable_to_icechunk( ) else: # TODO is writing loadable_variables just normal xarray ds.to_zarr? - raise NotImplementedError() + # raise NotImplementedError() + print("skipping non-virtual variable", name) async def write_virtual_variable_to_icechunk( @@ -117,13 +118,12 @@ async def write_virtual_variable_to_icechunk( zarray = ma.zarray # creates array if it doesn't already exist - codecs = zarray._v3_codec_pipeline() arr = group.require_array( name=name, shape=zarray.shape, chunk_shape=zarray.chunks, dtype=encode_dtype(zarray.dtype), - #codecs=codecs, + codecs=zarray._v3_codec_pipeline(), dimension_names=var.dims, fill_value=zarray.fill_value, # TODO fill_value? diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index f62b126..d047f91 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -74,8 +74,9 @@ def codec(self) -> Codec: @classmethod def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray": # coerce type of fill_value as kerchunk can be inconsistent with this + dtype = np.dtype(decoded_arr_refs_zarray["dtype"]) fill_value = decoded_arr_refs_zarray["fill_value"] - if fill_value is None or fill_value == "NaN" or fill_value == "nan": + if np.issubdtype(dtype, np.floating) and (fill_value is None or fill_value == "NaN" or fill_value == "nan"): fill_value = np.nan compressor = decoded_arr_refs_zarray["compressor"] @@ -86,7 +87,7 @@ def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray": return ZArray( chunks=tuple(decoded_arr_refs_zarray["chunks"]), compressor=compressor, - dtype=np.dtype(decoded_arr_refs_zarray["dtype"]), + dtype=dtype, fill_value=fill_value, filters=decoded_arr_refs_zarray["filters"], order=decoded_arr_refs_zarray["order"], @@ -140,7 +141,7 @@ def replace( replacements["zarr_format"] = zarr_format return dataclasses.replace(self, **replacements) - def _v3_codec_pipeline(self) -> list: + def _v3_codec_pipeline(self) -> Any: """ VirtualiZarr internally uses the `filters`, `compressor`, and `order` attributes from zarr v2, but to create conformant zarr v3 metadata those 3 must be turned into `codecs` objects. @@ -153,44 +154,46 @@ def _v3_codec_pipeline(self) -> list: post_compressor: Iterable[BytesBytesCodec] #optional ``` """ - if self.filters: - filter_codecs_configs = [ - numcodecs.get_codec(filter).get_config() for filter in self.filters - ] - filters = [ - dict(name=codec.pop("id"), configuration=codec) - for codec in filter_codecs_configs - ] - else: - filters = [] - - # Noting here that zarr v3 has very few codecs specificed in the official spec, - # and that there are far more codecs in `numcodecs`. We take a gamble and assume - # that the codec names and configuration are simply mapped into zarrv3 "configurables". - if self.compressor: - compressor = [_num_codec_config_to_configurable(self.compressor)] - else: - compressor = [] + try: + from zarr.core.metadata.v3 import parse_codecs + except ImportError: + raise ImportError( + "zarr v3 is required to generate v3 codec pipelines" + ) + + codec_configs = [] # https://zarr-specs.readthedocs.io/en/latest/v3/codecs/transpose/v1.0.html#transpose-codec-v1 # Either "C" or "F", defining the layout of bytes within each chunk of the array. # "C" means row-major order, i.e., the last dimension varies fastest; # "F" means column-major order, i.e., the first dimension varies fastest. - if self.order == "C": - order = tuple(range(len(self.shape))) - elif self.order == "F": + # For now, we only need transpose if the order is not "C" + if self.order == "F": order = tuple(reversed(range(len(self.shape)))) + transpose = dict(name="transpose", configuration=dict(order=order)) + codec_configs.append(transpose) - transpose = dict(name="transpose", configuration=dict(order=order)) # https://github.com/zarr-developers/zarr-python/pull/1944#issuecomment-2151994097 # "If no ArrayBytesCodec is supplied, we can auto-add a BytesCodec" bytes = dict( name="bytes", configuration={} ) # TODO need to handle endianess configuration + codec_configs.append(bytes) + + # Noting here that zarr v3 has very few codecs specificed in the official spec, + # and that there are far more codecs in `numcodecs`. We take a gamble and assume + # that the codec names and configuration are simply mapped into zarrv3 "configurables". + if self.filters: + codec_configs.extend([ + _num_codec_config_to_configurable(filter) for filter in self.filters + ]) + + if self.compressor: + codec_configs.append(_num_codec_config_to_configurable(self.compressor)) + + # convert the pipeline repr into actual v3 codec objects + codec_pipeline = parse_codecs(codec_configs) - # The order here is significant! - # [ArrayArray] -> ArrayBytes -> [BytesBytes] - codec_pipeline = [transpose, bytes] + compressor + filters return codec_pipeline @@ -213,4 +216,5 @@ def _num_codec_config_to_configurable(num_codec: dict) -> dict: Convert a numcodecs codec into a zarr v3 configurable. """ num_codec_copy = num_codec.copy() - return {"name": num_codec_copy.pop("id"), "configuration": num_codec_copy} + name = "numcodecs." + num_codec_copy.pop("id") + return {"name": name, "configuration": num_codec_copy}