
Commit

…Zarr into hdf5_reader
sharkinsspatial committed May 19, 2024
2 parents (c573800 + 207c4b5), commit ef0d7a8
Showing 7 changed files with 35 additions and 43 deletions.
16 changes: 4 additions & 12 deletions virtualizarr/readers/hdf.py
@@ -36,15 +36,11 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest:
         key_list = [0] * (len(dataset.shape) or 1)
         key = ".".join(map(str, key_list))
         chunk_entry = ChunkEntry(
-            path=path,
-            offset=dsid.get_offset(),
-            length=dsid.get_storage_size()
+            path=path, offset=dsid.get_offset(), length=dsid.get_storage_size()
         )
         chunk_key = ChunkKey(key)
         chunk_entries = {chunk_key: chunk_entry}
-        chunk_manifest = ChunkManifest(
-            entries=chunk_entries
-        )
+        chunk_manifest = ChunkManifest(entries=chunk_entries)
         return chunk_manifest
     else:
         num_chunks = dsid.get_num_chunks()
@@ -60,9 +56,7 @@ def get_key(blob):
 
         def store_chunk_entry(blob):
             chunk_entries[get_key(blob)] = ChunkEntry(
-                path=path,
-                offset=blob.byte_offset,
-                length=blob.size
+                path=path, offset=blob.byte_offset, length=blob.size
             )
 
         has_chunk_iter = callable(getattr(dsid, "chunk_iter", None))
@@ -72,9 +66,7 @@ def store_chunk_entry(blob):
             for index in range(num_chunks):
                 store_chunk_entry(dsid.get_chunk_info(index))
 
-        chunk_manifest = ChunkManifest(
-            entries=chunk_entries
-        )
+        chunk_manifest = ChunkManifest(entries=chunk_entries)
         return chunk_manifest
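
For orientation (not part of the commit): the hunks above drive h5py's low-level chunk-index API to record, for every chunk, where its bytes live on disk. A minimal standalone sketch of that technique, assuming a chunked dataset named "data" in a hypothetical file, could look like the following; it builds plain dicts rather than virtualizarr's ChunkEntry/ChunkManifest classes.

import h5py


def chunk_byte_ranges(path: str, dataset_name: str) -> dict:
    """Map Zarr-style chunk keys to {"path", "offset", "length"} byte ranges."""
    entries = {}
    with h5py.File(path, "r") as f:
        dataset = f[dataset_name]
        dsid = dataset.id
        for index in range(dsid.get_num_chunks()):
            blob = dsid.get_chunk_info(index)
            # chunk_offset is in array elements; divide by the chunk shape to get indices
            key = ".".join(
                str(a // b) for a, b in zip(blob.chunk_offset, dataset.chunks)
            )
            entries[key] = {"path": path, "offset": blob.byte_offset, "length": blob.size}
    return entries


# Usage (hypothetical file): chunk_byte_ranges("example.nc", "data")
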
26 changes: 16 additions & 10 deletions virtualizarr/readers/hdf_filters.py
@@ -6,9 +6,7 @@
 from numcodecs.abc import Codec
 from pydantic import BaseModel, validator
 
-_non_standard_filters = {
-    "gzip": "zlib"
-}
+_non_standard_filters = {"gzip": "zlib"}
 
 
 class BloscProperties(BaseModel):
@@ -20,12 +18,15 @@ class BloscProperties(BaseModel):
     @validator("cname", pre=True)
     def get_cname_from_code(cls, v):
         blosc_compressor_codes = {
-            value: key for key, value in hdf5plugin._filters.Blosc._Blosc__COMPRESSIONS.items()
+            value: key
+            for key, value in hdf5plugin._filters.Blosc._Blosc__COMPRESSIONS.items()
         }
         return blosc_compressor_codes[v]
 
 
-def _filter_to_codec(filter_id: str, filter_properties: Union[int, None, Tuple] = None) -> Codec:
+def _filter_to_codec(
+    filter_id: str, filter_properties: Union[int, None, Tuple] = None
+) -> Codec:
     id_int = None
     id_str = None
     try:
@@ -40,15 +41,20 @@ def _filter_to_codec(filter_id: str, filter_properties: Union[int, None, Tuple]
         id = id_str
     conf = {"id": id}
     if id == "zlib":
-        conf["level"] = filter_properties # type: ignore[assignment]
+        conf["level"] = filter_properties  # type: ignore[assignment]
     if id_int:
         filter = hdf5plugin.get_filters(id_int)[0]
         id = filter.filter_name
     if id == "blosc" and isinstance(filter_properties, tuple):
-        blosc_props = BloscProperties(**{k: v for k, v in
-                                         zip(BloscProperties.__fields__.keys(),
-                                             filter_properties[-4:])})
-        conf = blosc_props.model_dump() # type: ignore[assignment]
+        blosc_props = BloscProperties(
+            **{
+                k: v
+                for k, v in zip(
+                    BloscProperties.__fields__.keys(), filter_properties[-4:]
+                )
+            }
+        )
+        conf = blosc_props.model_dump()  # type: ignore[assignment]
         conf["id"] = id
 
     codec = registry.get_codec(conf)
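
As a side note (not from the diff): the gzip branch shown above ends in a numcodecs registry lookup, since HDF5's "gzip" filter corresponds to numcodecs' "zlib" codec and the filter's single integer property is the compression level. A small sketch of that resolution, with a round-trip check:

from numcodecs import registry

conf = {"id": "zlib", "level": 1}  # what _filter_to_codec builds for gzip with level 1
codec = registry.get_codec(conf)

data = b"bytes to round-trip"
assert bytes(codec.decode(codec.encode(data))) == data
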
18 changes: 9 additions & 9 deletions virtualizarr/tests/test_readers/conftest.py
@@ -138,12 +138,15 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request):
     filepath = f"{tmpdir}/{request.param}.nc"
     f = h5py.File(filepath, "w")
     if request.param == "gzip":
-        f.create_dataset(name="data", data=np_uncompressed, compression="gzip", compression_opts=1)
+        f.create_dataset(
+            name="data", data=np_uncompressed, compression="gzip", compression_opts=1
+        )
     if request.param == "blosc":
-        f.create_dataset(name="data", data=np_uncompressed,
-                         **hdf5plugin.Blosc(
-                             cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE
-                         ))
+        f.create_dataset(
+            name="data",
+            data=np_uncompressed,
+            **hdf5plugin.Blosc(cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE),
+        )
     return filepath
 
 
@@ -152,10 +155,7 @@ def filter_encoded_xarray_netcdf4_files(tmpdir, request):
     ds = xr.tutorial.open_dataset("air_temperature")
     encoding = {}
     if request.param == "gzip":
-        encoding_config = {
-            "zlib": True,
-            "complevel": 1
-        }
+        encoding_config = {"zlib": True, "complevel": 1}
 
     for var_name in ds.variables:
         encoding[var_name] = encoding_config
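
The encoding dict assembled in the second fixture is the standard xarray per-variable encoding mapping; the write call itself sits below the displayed hunk, so the file name and engine in this sketch are assumptions rather than the fixture's actual code.

import xarray as xr

ds = xr.tutorial.open_dataset("air_temperature")
encoding = {var: {"zlib": True, "complevel": 1} for var in ds.variables}
ds.to_netcdf("/tmp/air_temperature_gzip.nc", engine="netcdf4", encoding=encoding)
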
5 changes: 1 addition & 4 deletions virtualizarr/tests/test_readers/test_hdf.py
@@ -105,8 +105,5 @@ def test_groups_not_implemented(self, group_netcdf4_file):
             virtual_vars_from_hdf(group_netcdf4_file)
 
     def test_drop_variables(self, multiple_datasets_netcdf4_file):
-        variables = virtual_vars_from_hdf(
-            multiple_datasets_netcdf4_file,
-            ["data2"]
-        )
+        variables = virtual_vars_from_hdf(multiple_datasets_netcdf4_file, ["data2"])
         assert "data2" not in variables.keys()

2 changes: 1 addition & 1 deletion virtualizarr/tests/test_readers/test_hdf_filters.py
@@ -36,7 +36,7 @@ def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file):
         ds = f["data"]
         chunk_info = ds.id.get_chunk_info(0)
         codecs = codecs_from_dataset(ds)
-        with open(filter_encoded_netcdf4_file, 'rb') as file:
+        with open(filter_encoded_netcdf4_file, "rb") as file:
             file.seek(chunk_info.byte_offset)
             bytes_read = file.read(chunk_info.size)
             decoded = codecs[0].decode(bytes_read)
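
A self-contained sketch of the technique this test exercises, under assumed file name, dtype, and compression settings: write a gzip-compressed dataset, read one chunk's raw bytes straight from the file, and undo the filter (numcodecs' Zlib.decode performs the same zlib decompression).

import zlib

import h5py
import numpy as np

path = "/tmp/gzip_example.nc"
arr = np.arange(100.0).reshape(10, 10)
with h5py.File(path, "w") as f:
    f.create_dataset(
        "data", data=arr, chunks=(10, 10), compression="gzip", compression_opts=1
    )

with h5py.File(path, "r") as f:
    info = f["data"].id.get_chunk_info(0)  # single chunk covering the whole array

with open(path, "rb") as raw:
    raw.seek(info.byte_offset)
    chunk_bytes = raw.read(info.size)

decoded = zlib.decompress(chunk_bytes)
assert np.array_equal(np.frombuffer(decoded, dtype=arr.dtype).reshape(arr.shape), arr)
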
6 changes: 2 additions & 4 deletions virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -7,11 +7,9 @@
 
 
 class TestIntegration:
-    def test_filters_end_to_end(self, tmpdir,
-                                filter_encoded_xarray_netcdf4_files):
+    def test_filters_end_to_end(self, tmpdir, filter_encoded_xarray_netcdf4_files):
         virtual_ds = virtualizarr.open_virtual_dataset(
-            filter_encoded_xarray_netcdf4_files,
-            filetype=FileType("netcdf4")
+            filter_encoded_xarray_netcdf4_files, filetype=FileType("netcdf4")
         )
         kerchunk_file = f"{tmpdir}/kerchunk.json"
         virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json")
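
The round-trip read is cut off by the diff; one common way (not taken from this commit) to open a kerchunk JSON reference file such as the one written above is via fsspec's reference filesystem and xarray's zarr engine, with the file name here purely illustrative.

import fsspec
import xarray as xr

fs = fsspec.filesystem("reference", fo="kerchunk.json")
roundtrip = xr.open_dataset(fs.get_mapper(""), engine="zarr", consolidated=False)
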
5 changes: 2 additions & 3 deletions virtualizarr/xarray.py
@@ -20,8 +20,8 @@
     _automatically_determine_filetype,
 )
 from virtualizarr.manifests import ChunkManifest, ManifestArray
-from virtualizarr.utils import _fsspec_openfile_from_filepath
 from virtualizarr.readers.hdf import attrs_from_root_group, virtual_vars_from_hdf
+from virtualizarr.utils import _fsspec_openfile_from_filepath
 from virtualizarr.zarr import (
     attrs_from_zarr_group_json,
     dataset_to_zarr,
@@ -109,8 +109,7 @@ def open_virtual_dataset(
     if filetype.name.lower() == "netcdf4":
         print("wat")
         virtual_vars = virtual_vars_from_hdf(
-            path=filepath,
-            drop_variables=drop_variables
+            path=filepath, drop_variables=drop_variables
         )
         ds_attrs = attrs_from_root_group(path=filepath)
     if filetype == "zarr_v3":
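
Usage sketch for the netcdf4 code path touched above, mirroring the calls already present in the integration test; the file names, the dropped variable, and the FileType import path are assumptions for illustration.

import virtualizarr
from virtualizarr.kerchunk import FileType  # import path assumed

vds = virtualizarr.open_virtual_dataset(
    "example.nc", filetype=FileType("netcdf4"), drop_variables=["data2"]
)
vds.virtualize.to_kerchunk("example_refs.json", format="json")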
