From 5e66ad0a34726a55cfa4d1c1d51876766900a358 Mon Sep 17 00:00:00 2001 From: Timothy Hodson <34148978+thodson-usgs@users.noreply.github.com> Date: Thu, 31 Oct 2024 16:01:35 -0500 Subject: [PATCH 01/15] Update index.md (#275) Remove `fsspec` from `xr.open_dataset` usage example. --- docs/index.md | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/docs/index.md b/docs/index.md index 0e79418f..8dfa80df 100644 --- a/docs/index.md +++ b/docs/index.md @@ -59,14 +59,9 @@ virtual_ds.virtualize.to_kerchunk('combined.json', format='json') Now you can open your shiny new Zarr store instantly: ```python -fs = fsspec.filesystem('reference', fo='combined.json') -m = fs.get_mapper('') - -ds = xr.open_dataset(m, engine='kerchunk', chunks={}) # normal xarray.Dataset object, wrapping dask/numpy arrays etc. +ds = xr.open_dataset('combined.json', engine='kerchunk', chunks={}) # normal xarray.Dataset object, wrapping dask/numpy arrays etc. ``` -(Since we serialized the cached results using the kerchunk specification then opening this zarr store still requires using fsspec via the kerchunk xarray backend.) - No data has been loaded or copied in this process, we have merely created an on-disk lookup table that points xarray into the specific parts of the original netCDF files when it needs to read each chunk. See the [Usage docs page](#usage) for more details. From ba46a7733d9ffb13b40ac6ab2b6e055ca3a695d9 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Mon, 4 Nov 2024 10:10:47 -0700 Subject: [PATCH 02/15] remove unused ManifestBackendArray class (#282) --- virtualizarr/readers/common.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/virtualizarr/readers/common.py b/virtualizarr/readers/common.py index 9be2b45f..f6f5dff4 100644 --- a/virtualizarr/readers/common.py +++ b/virtualizarr/readers/common.py @@ -19,10 +19,9 @@ Variable, open_dataset, ) -from xarray.backends import AbstractDataStore, BackendArray +from xarray.backends import AbstractDataStore from xarray.core.indexes import PandasIndex -from virtualizarr.manifests import ManifestArray from virtualizarr.utils import _FsspecFSFromFilepath XArrayOpenT = str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore @@ -34,12 +33,6 @@ DataTree = Any -class ManifestBackendArray(ManifestArray, BackendArray): - """Using this prevents xarray from wrapping the KerchunkArray in ExplicitIndexingAdapter etc.""" - - ... 
- - def open_loadable_vars_and_indexes( filepath: str, loadable_variables, From ab23caaed6b530d66e41947c8a525d194f6b76e6 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Mon, 4 Nov 2024 10:54:09 -0700 Subject: [PATCH 03/15] Fix bug in RT of parquet detection (#278) * fix bug in RT of parquet * Update virtualizarr/readers/kerchunk.py Co-authored-by: Justus Magin * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * adds .parquet info to ValueError * Update kerchunk.py Co-authored-by: Tom Nicholas --------- Co-authored-by: Justus Magin Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Tom Nicholas --- docs/usage.md | 8 ++++---- virtualizarr/readers/kerchunk.py | 6 ++++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 30eab144..d9b292e0 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -385,13 +385,13 @@ Currently you can only serialize in-memory variables to kerchunk references if t When you have many chunks, the reference file can get large enough to be unwieldy as json. In that case the references can be instead stored as parquet. Again this uses kerchunk internally. ```python -combined_vds.virtualize.to_kerchunk('combined.parq', format='parquet') +combined_vds.virtualize.to_kerchunk('combined.parquet', format='parquet') ``` And again we can read these references using the "kerchunk" backend as if it were a regular Zarr store ```python -combined_ds = xr.open_dataset('combined.parq', engine="kerchunk") +combined_ds = xr.open_dataset('combined.parquet', engine="kerchunk") ``` By default references are placed in separate parquet file when the total number of references exceeds `record_size`. If there are fewer than `categorical_threshold` unique urls referenced by a particular variable, url will be stored as a categorical variable. @@ -444,9 +444,9 @@ You can open existing Kerchunk `json` or `parquet` references as Virtualizarr vi ```python -vds = open_virtual_dataset('combined.json', format='kerchunk') +vds = open_virtual_dataset('combined.json', filetype='kerchunk', indexes={}) # or -vds = open_virtual_dataset('combined.parquet', format='kerchunk') +vds = open_virtual_dataset('combined.parquet', filetype='kerchunk', indexes={}) ``` diff --git a/virtualizarr/readers/kerchunk.py b/virtualizarr/readers/kerchunk.py index 2f1ff4b2..4a41548c 100644 --- a/virtualizarr/readers/kerchunk.py +++ b/virtualizarr/readers/kerchunk.py @@ -38,7 +38,9 @@ def open_virtual_dataset( fs = _FsspecFSFromFilepath(filepath=filepath, reader_options=reader_options) # The kerchunk .parquet storage format isn't actually a parquet, but a directory that contains named parquets for each group/variable. - if fs.filepath.endswith("ref.parquet"): + if fs.filepath.endswith(".parquet") and fs.fs.isfile( + f"{fs.filepath}/.zmetadata" + ): from fsspec.implementations.reference import LazyReferenceMapper lrm = LazyReferenceMapper(filepath, fs.fs) @@ -61,7 +63,7 @@ def open_virtual_dataset( else: raise ValueError( - "The input Kerchunk reference did not seem to be in Kerchunk's JSON or Parquet spec: https://fsspec.github.io/kerchunk/spec.html. The Kerchunk format autodetection is quite flaky, so if your reference matches the Kerchunk spec feel free to open an issue: https://github.com/zarr-developers/VirtualiZarr/issues" + "The input Kerchunk reference did not seem to be in Kerchunk's JSON or Parquet spec: https://fsspec.github.io/kerchunk/spec.html. 
If your Kerchunk generated references are saved in parquet format, make sure the file extension is `.parquet`. The Kerchunk format autodetection is quite flaky, so if your reference matches the Kerchunk spec feel free to open an issue: https://github.com/zarr-developers/VirtualiZarr/issues" ) # TODO would be more efficient to drop these before converting them into ManifestArrays, i.e. drop them from the kerchunk refs dict From 3fa5cffefcd9af1f536d11fab81972a6e84554ad Mon Sep 17 00:00:00 2001 From: Ayush Nag <35325113+ayushnag@users.noreply.github.com> Date: Tue, 5 Nov 2024 09:56:27 -0800 Subject: [PATCH 04/15] Search for coord_names in separate_coords (#191) * find coord_names in vars * resolve merge conflict * add 2d coords test * add kerchunk dep and add 1d coord test --------- Co-authored-by: Tom Nicholas --- conftest.py | 9 +++++++++ virtualizarr/readers/common.py | 7 ++++++- virtualizarr/tests/test_backend.py | 22 ++++++++++++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/conftest.py b/conftest.py index 810fd833..55c07823 100644 --- a/conftest.py +++ b/conftest.py @@ -35,6 +35,15 @@ def netcdf4_file(tmpdir): return filepath +@pytest.fixture +def netcdf4_file_with_2d_coords(tmpdir): + ds = xr.tutorial.open_dataset("ROMS_example") + filepath = f"{tmpdir}/ROMS_example.nc" + ds.to_netcdf(filepath, format="NETCDF4") + ds.close() + return filepath + + @pytest.fixture def netcdf4_virtual_dataset(netcdf4_file): from virtualizarr import open_virtual_dataset diff --git a/virtualizarr/readers/common.py b/virtualizarr/readers/common.py index f6f5dff4..646d26ca 100644 --- a/virtualizarr/readers/common.py +++ b/virtualizarr/readers/common.py @@ -144,8 +144,13 @@ def separate_coords( coord_vars: dict[ str, tuple[Hashable, Any, dict[Any, Any], dict[Any, Any]] | Variable ] = {} + found_coord_names: set[str] = set() + # Search through variable attributes for coordinate names + for var in vars.values(): + if "coordinates" in var.attrs: + found_coord_names.update(var.attrs["coordinates"].split(" ")) for name, var in vars.items(): - if name in coord_names or var.dims == (name,): + if name in coord_names or var.dims == (name,) or name in found_coord_names: # use workaround to avoid creating IndexVariables described here https://github.com/pydata/xarray/pull/8107#discussion_r1311214263 if len(var.dims) == 1: dim1d, *_ = var.dims diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py index 43a6bbd8..e9b60814 100644 --- a/virtualizarr/tests/test_backend.py +++ b/virtualizarr/tests/test_backend.py @@ -156,6 +156,28 @@ def test_coordinate_variable_attrs_preserved(self, netcdf4_file): } +@requires_kerchunk +class TestDetermineCoords: + def test_infer_one_dimensional_coords(self, netcdf4_file): + vds = open_virtual_dataset(netcdf4_file, indexes={}) + assert set(vds.coords) == {"time", "lat", "lon"} + + def test_var_attr_coords(self, netcdf4_file_with_2d_coords): + vds = open_virtual_dataset(netcdf4_file_with_2d_coords, indexes={}) + + expected_dimension_coords = ["ocean_time", "s_rho"] + expected_2d_coords = ["lon_rho", "lat_rho", "h"] + expected_1d_non_dimension_coords = ["Cs_r"] + expected_scalar_coords = ["hc", "Vtransform"] + expected_coords = ( + expected_dimension_coords + + expected_2d_coords + + expected_1d_non_dimension_coords + + expected_scalar_coords + ) + assert set(vds.coords) == set(expected_coords) + + @network @requires_s3fs class TestReadFromS3: From 2316fcbff7cbfbf93faf1884ba4482908ae1d50e Mon Sep 17 00:00:00 2001 From: 
"pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 6 Nov 2024 08:54:52 -0700 Subject: [PATCH 05/15] [pre-commit.ci] pre-commit autoupdate (#283) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.6.9 → v0.7.2](https://github.com/astral-sh/ruff-pre-commit/compare/v0.6.9...v0.7.2) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3bae6a6c..9990375b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,7 +11,7 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: "v0.6.9" + rev: "v0.7.2" hooks: # Run the linter. - id: ruff From efbc4930ff80ff086b67192de589e71eff23bb1c Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Thu, 7 Nov 2024 09:40:44 -0700 Subject: [PATCH 06/15] Bump minimum Xarray dependency to 2024.10.0 (#284) * change upstream CI dependency * change non-upstream CI dependencies * change entire project depedendency * use explicit import of xarray.DataTree for type hint * release note --- ci/environment.yml | 2 +- ci/min-deps.yml | 2 +- ci/upstream.yml | 2 +- docs/releases.rst | 2 ++ pyproject.toml | 2 +- virtualizarr/readers/common.py | 10 ++-------- 6 files changed, 8 insertions(+), 12 deletions(-) diff --git a/ci/environment.yml b/ci/environment.yml index 883463a2..0bb5b366 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -7,7 +7,7 @@ dependencies: - h5py - hdf5 - netcdf4 - - xarray>=2024.6.0 + - xarray>=2024.10.0 - kerchunk>=0.2.5 - numpy>=2.0.0 - ujson diff --git a/ci/min-deps.yml b/ci/min-deps.yml index 7ca8c0b3..05778382 100644 --- a/ci/min-deps.yml +++ b/ci/min-deps.yml @@ -7,7 +7,7 @@ dependencies: - h5py - hdf5 - netcdf4 - - xarray>=2024.6.0 + - xarray>=2024.10.0 - numpy>=2.0.0 - numcodecs - packaging diff --git a/ci/upstream.yml b/ci/upstream.yml index 2c2680bc..035d76f8 100644 --- a/ci/upstream.yml +++ b/ci/upstream.yml @@ -3,6 +3,7 @@ channels: - conda-forge - nodefaults dependencies: + - xarray>=2024.10.0 - h5netcdf - h5py - hdf5 @@ -25,6 +26,5 @@ dependencies: - pip - pip: - icechunk # Installs zarr v3 as dependency - - git+https://github.com/pydata/xarray@zarr-v3 # zarr-v3 compatibility branch - git+https://github.com/zarr-developers/numcodecs@zarr3-codecs # zarr-v3 compatibility branch # - git+https://github.com/fsspec/kerchunk@main # kerchunk is currently incompatible with zarr-python v3 (https://github.com/fsspec/kerchunk/pull/516) diff --git a/docs/releases.rst b/docs/releases.rst index 93a5fec9..1ca594a1 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -42,6 +42,8 @@ Breaking changes - VirtualiZarr's `ZArray`, `ChunkEntry`, and `Codec` no longer subclass `pydantic.BaseModel` (:pull:`210`) - `ZArray`'s `__init__` signature has changed to match `zarr.Array`'s (:pull:`210`) +- Minimum required version of Xarray is now v2024.10.0. + (:pull:`284`) By `Tom Nicholas `_. 
Deprecations ~~~~~~~~~~~~ diff --git a/pyproject.toml b/pyproject.toml index d216b269..749afb94 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ classifiers = [ requires-python = ">=3.10" dynamic = ["version"] dependencies = [ - "xarray>=2024.06.0", + "xarray>=2024.10.0", "numpy>=2.0.0", "packaging", "universal-pathlib", diff --git a/virtualizarr/readers/common.py b/virtualizarr/readers/common.py index 646d26ca..1ad24629 100644 --- a/virtualizarr/readers/common.py +++ b/virtualizarr/readers/common.py @@ -4,7 +4,6 @@ from collections.abc import Iterable, Mapping, MutableMapping from io import BufferedIOBase from typing import ( - TYPE_CHECKING, Any, Hashable, Optional, @@ -14,6 +13,7 @@ from xarray import ( Coordinates, Dataset, + DataTree, Index, IndexVariable, Variable, @@ -26,12 +26,6 @@ XArrayOpenT = str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore -if TYPE_CHECKING: - try: - from xarray import DataTree # type: ignore[attr-defined] - except ImportError: - DataTree = Any - def open_loadable_vars_and_indexes( filepath: str, @@ -194,5 +188,5 @@ def open_virtual_datatree( decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, reader_options: Optional[dict] = None, - ) -> "DataTree": + ) -> DataTree: raise NotImplementedError() From 2d7b4ee3ff38a3363ea6de20799cce0c04f19769 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Thu, 7 Nov 2024 09:45:14 -0700 Subject: [PATCH 07/15] Dont write _ARRAY_DIMENSIONS to icechunk (#286) * dont write _ARRAY_DIMENSIONS * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * release note * change test * add xarray 2024.10.0 dependency to icechunk CI --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- docs/releases.rst | 2 ++ virtualizarr/tests/test_writers/test_icechunk.py | 2 +- virtualizarr/writers/icechunk.py | 1 - 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/releases.rst b/docs/releases.rst index 1ca594a1..1e2bdb90 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -59,6 +59,8 @@ Bug fixes - Fixed regression in `fill_value` handling for datetime dtypes making virtual Zarr stores unreadable (:pull:`206`) By `Timothy Hodson `_ +- Fixed bug with writing of `dimension_names` into zarr metadata. + (:pull:`286`) By `Tom Nicholas `_. 
Documentation ~~~~~~~~~~~~~ diff --git a/virtualizarr/tests/test_writers/test_icechunk.py b/virtualizarr/tests/test_writers/test_icechunk.py index f99b2718..7a22defa 100644 --- a/virtualizarr/tests/test_writers/test_icechunk.py +++ b/virtualizarr/tests/test_writers/test_icechunk.py @@ -68,7 +68,7 @@ def test_write_new_virtual_variable( # assert dict(arr.attrs) == {"units": "km"} # check dimensions - assert arr.attrs["_ARRAY_DIMENSIONS"] == ["x", "y"] + assert arr.metadata.dimension_names == ("x", "y") def test_set_single_virtual_ref_without_encoding( diff --git a/virtualizarr/writers/icechunk.py b/virtualizarr/writers/icechunk.py index 6dadbc08..0ba95a36 100644 --- a/virtualizarr/writers/icechunk.py +++ b/virtualizarr/writers/icechunk.py @@ -144,7 +144,6 @@ def write_virtual_variable_to_icechunk( # TODO it would be nice if we could assign directly to the .attrs property for k, v in var.attrs.items(): arr.attrs[k] = encode_zarr_attr_value(v) - arr.attrs["_ARRAY_DIMENSIONS"] = encode_zarr_attr_value(var.dims) _encoding_keys = {"_FillValue", "missing_value", "scale_factor", "add_offset"} for k, v in var.encoding.items(): From 4ae7a19c2bb2996dc4739dfd3ebbe32b17ac1658 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Thu, 7 Nov 2024 10:12:13 -0700 Subject: [PATCH 08/15] Fix release notes for v1.1.0 (#288) * add new section to release notes for unreleased additions * add release note for #191 * add release note for #266 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * pre-commit --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- docs/releases.rst | 49 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/docs/releases.rst b/docs/releases.rst index 1e2bdb90..cadbc855 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -1,36 +1,62 @@ Release notes ============= -.. _v1.0.1: +.. _v1.1.1: -v1.0.1 (unreleased) +v1.1.1 (unreleased) ------------------- New Features ~~~~~~~~~~~~ +Breaking changes +~~~~~~~~~~~~~~~~ + +- Minimum required version of Xarray is now v2024.10.0. + (:pull:`284`) By `Tom Nicholas `_. + +Deprecations +~~~~~~~~~~~~ + +Bug fixes +~~~~~~~~~ + +- Fixed bug with writing of `dimension_names` into zarr metadata. + (:pull:`286`) By `Tom Nicholas `_. +- Fixed bug causing CF-compliant variables not to be identified as coordinates (:pull:`191`) + By `Ayush Nag `_. + +Documentation +~~~~~~~~~~~~~ + +- FAQ answers on Icechunk compatibility, converting from existing Kerchunk references to Icechunk, and how to add a new reader for a custom file format. + (:pull:`266`) By `Tom Nicholas `_. + +Internal Changes +~~~~~~~~~~~~~~~~ + +.. _v1.1.0: + +v1.1.0 (22nd Oct 2024) +---------------------- + +New Features +~~~~~~~~~~~~ - Can open `kerchunk` reference files with ``open_virtual_dataset``. (:pull:`251`, :pull:`186`) By `Raphael Hagen `_ & `Kristen Thyng `_. - - Adds defaults for `open_virtual_dataset_from_v3_store` in (:pull:`234`) By `Raphael Hagen `_. - - New ``group`` option on ``open_virtual_dataset`` enables extracting specific HDF Groups. (:pull:`165`) By `Scott Henderson `_. - - Adds `decode_times` to open_virtual_dataset (:pull:`232`) By `Raphael Hagen `_. - - Add parser for the OPeNDAP DMR++ XML format and integration with open_virtual_dataset (:pull:`113`) By `Ayush Nag `_. - - Load scalar variables by default. (:pull:`205`) By `Gustavo Hidalgo `_. - - Support empty files (:pull:`260`) By `Justus Magin `_. 
- - Can write virtual datasets to Icechunk stores using `vitualize.to_icechunk` (:pull:`256`) By `Matt Iannucci `_. @@ -42,8 +68,6 @@ Breaking changes - VirtualiZarr's `ZArray`, `ChunkEntry`, and `Codec` no longer subclass `pydantic.BaseModel` (:pull:`210`) - `ZArray`'s `__init__` signature has changed to match `zarr.Array`'s (:pull:`210`) -- Minimum required version of Xarray is now v2024.10.0. - (:pull:`284`) By `Tom Nicholas `_. Deprecations ~~~~~~~~~~~~ @@ -59,8 +83,6 @@ Bug fixes - Fixed regression in `fill_value` handling for datetime dtypes making virtual Zarr stores unreadable (:pull:`206`) By `Timothy Hodson `_ -- Fixed bug with writing of `dimension_names` into zarr metadata. - (:pull:`286`) By `Tom Nicholas `_. Documentation ~~~~~~~~~~~~~ @@ -68,7 +90,6 @@ Documentation - Adds virtualizarr + coiled serverless example notebook (:pull:`223`) By `Raphael Hagen `_. - Internal Changes ~~~~~~~~~~~~~~~~ From 9e7d4302a330a5c2a82fbe96aec75903b4f9112e Mon Sep 17 00:00:00 2001 From: Ayush Nag <35325113+ayushnag@users.noreply.github.com> Date: Fri, 8 Nov 2024 14:29:22 -0800 Subject: [PATCH 09/15] dmrpp root and nested group parsing fix (#265) * add root group and nested group support * refactor functions for readability * upgrade test suite * resolve conflicts * fix mypy types * update dmrpp default fill_val * update dmrpp default datapath * add dedent() to pytest fixtures * update function docs and releases --------- Co-authored-by: Tom Nicholas --- docs/releases.rst | 2 + virtualizarr/backend.py | 4 +- virtualizarr/readers/dmrpp.py | 512 ++++++------------ virtualizarr/tests/test_readers/test_dmrpp.py | 348 +++++++++++- 4 files changed, 511 insertions(+), 355 deletions(-) diff --git a/docs/releases.rst b/docs/releases.rst index cadbc855..56f3ac90 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -21,6 +21,8 @@ Deprecations Bug fixes ~~~~~~~~~ +- Handle root and nested groups with ``dmrpp`` backend (:pull:`265`) + By `Ayush Nag `_. - Fixed bug with writing of `dimension_names` into zarr metadata. (:pull:`286`) By `Tom Nicholas `_. - Fixed bug causing CF-compliant variables not to be identified as coordinates (:pull:`191`) diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py index 32403d04..fab010c7 100644 --- a/virtualizarr/backend.py +++ b/virtualizarr/backend.py @@ -127,10 +127,10 @@ def open_virtual_dataset( File path to open as a set of virtualized zarr arrays. filetype : FileType, default None Type of file to be opened. Used to determine which kerchunk file format backend to use. - Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'zarr_v3', 'kerchunk'}. + Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'dmrpp', 'zarr_v3', 'kerchunk'}. If not provided will attempt to automatically infer the correct filetype from header bytes. group : str, default is None - Path to the HDF5/netCDF4 group in the given file to open. Given as a str, supported by filetypes “netcdf4” and “hdf5”. + Path to the HDF5/netCDF4 group in the given file to open. Given as a str, supported by filetypes “netcdf4”, “hdf5”, and "dmrpp". drop_variables: list[str], default is None Variables in the file to drop before returning. 
loadable_variables: list[str], default is None diff --git a/virtualizarr/readers/dmrpp.py b/virtualizarr/readers/dmrpp.py index c9095c7e..5859ca92 100644 --- a/virtualizarr/readers/dmrpp.py +++ b/virtualizarr/readers/dmrpp.py @@ -1,7 +1,6 @@ -import os import warnings -from collections import defaultdict from collections.abc import Mapping +from pathlib import Path from typing import Any, Iterable, Optional from xml.etree import ElementTree as ET @@ -36,15 +35,15 @@ def open_virtual_dataset( "Specifying `loadable_variables` or auto-creating indexes with `indexes=None` is not supported for dmrpp files." ) - if group: - raise NotImplementedError() - fpath = _FsspecFSFromFilepath( filepath=filepath, reader_options=reader_options ).open_file() - parser = DMRParser(fpath.read(), data_filepath=filepath.strip(".dmrpp")) - vds = parser.parse_dataset() + parser = DMRParser( + root=ET.parse(fpath).getroot(), + data_filepath=filepath.removesuffix(".dmrpp"), + ) + vds = parser.parse_dataset(group=group, indexes=indexes) return vds.drop_vars(drop_variables) @@ -60,12 +59,12 @@ class DMRParser: """ # DAP and DMRPP XML namespaces - _ns = { + _NS = { "dap": "http://xml.opendap.org/ns/DAP/4.0#", - "dmr": "http://xml.opendap.org/dap/dmrpp/1.0.0#", + "dmrpp": "http://xml.opendap.org/dap/dmrpp/1.0.0#", } # DAP data types to numpy data types - _dap_np_dtype = { + _DAP_NP_DTYPE = { "Byte": "uint8", "UByte": "uint8", "Int8": "int8", @@ -82,24 +81,24 @@ class DMRParser: "String": "object", } # Default zlib compression value - _default_zlib_value = 6 + _DEFAULT_ZLIB_VALUE = 6 # Encoding keys that should be removed from attributes and placed in xarray encoding dict - _encoding_keys = {"_FillValue", "missing_value", "scale_factor", "add_offset"} + _ENCODING_KEYS = {"_FillValue", "missing_value", "scale_factor", "add_offset"} - def __init__(self, dmr: str, data_filepath: Optional[str] = None): + def __init__(self, root: ET.Element, data_filepath: Optional[str] = None): """ - Initialize the DMRParser with the given DMR data and data file path. + Initialize the DMRParser with the given DMR++ file contents and source data file path. Parameters ---------- - dmr : str - The DMR file contents as a string. + dmrpp_str : str + The dmrpp file contents as a string. data_filepath : str, optional The path to the actual data file that will be set in the chunk manifests. - If None, the data file path is taken from the DMR file. + If None, the data file path is taken from the DMR++ file. """ - self.root = ET.fromstring(dmr) + self.root = root self.data_filepath = ( data_filepath if data_filepath is not None else self.root.attrib["name"] ) @@ -145,170 +144,57 @@ def parse_dataset(self, group=None, indexes: Mapping[str, Index] = {}) -> Datase Data variables: d_8_chunks (phony_dim_0, phony_dim_1, phony_dim_2) float32 4MB ManifestA... 
""" + group_tags = self.root.findall("dap:Group", self._NS) if group is not None: - # group = "/" + group.strip("/") # ensure group is in form "/a/b" - group = os.path.normpath(group).removeprefix( - "/" - ) # ensure group is in form "a/b/c" - if self._is_hdf5(self.root): - return self._parse_hdf5_dataset(self.root, group, indexes) - if self.data_filepath.endswith(".nc"): - return self._parse_netcdf4_dataset(self.root, group, indexes) - raise ValueError("DMR file must be HDF5 or netCDF4 based") - - def _parse_netcdf4_dataset( - self, - root: ET.Element, - group: Optional[str] = None, - indexes: Mapping[str, Index] = {}, - ) -> Dataset: - """ - Parse the dataset from the netcdf4 based dmrpp with groups, starting at the given group. - Set root to the given group. - - Parameters - ---------- - root : ET.Element - The root element of the DMR file. - - group : str - The group to parse. If None, and no groups are present, the dataset is parsed. - If None and groups are present, the first group is parsed. + group = Path(group) + if not group.is_absolute(): + group = Path("/") / group + if len(group_tags) == 0: + warnings.warn("No groups found in DMR++ file; ignoring group parameter") + else: + all_groups = self._split_groups(self.root) + if group in all_groups: + return self._parse_dataset(all_groups[group], indexes) + else: + raise ValueError(f"Group {group} not found in DMR++ file") + return self._parse_dataset(self.root, indexes) - Returns - ------- - xr.Dataset - """ - group_tags = root.findall("dap:Group", self._ns) - if len(group_tags) == 0: - if group is not None: - # no groups found and group specified -> warning - warnings.warn( - "No groups found in NetCDF4 DMR file; ignoring group parameter" - ) - # no groups found and no group specified -> parse dataset - return self._parse_dataset(root, indexes) - all_groups = self._split_netcdf4(root) - if group is None: - # groups found and no group specified -> parse first group - return self._parse_dataset(group_tags[0], indexes) - if group in all_groups: - # groups found and group specified -> parse specified group - return self._parse_dataset(all_groups[group], indexes) - else: - # groups found and specified group not found -> error - raise ValueError(f"Group {group} not found in NetCDF4 DMR file") - - def _split_netcdf4(self, root: ET.Element) -> dict[str, ET.Element]: + def find_node_fqn(self, fqn: str) -> ET.Element: """ - Split the input element into several ET.Elements by netcdf4 group - E.g. {"left": , "right": } + Find the element in the root element by converting the fully qualified name to an xpath query. - Parameters - ---------- - root : ET.Element - The root element of the DMR file. - - Returns - ------- - dict[str, ET.Element] - """ - group_tags = root.findall("dap:Group", self._ns) - all_groups: dict[str, ET.Element] = defaultdict( - lambda: ET.Element(root.tag, root.attrib) - ) - for group_tag in group_tags: - all_groups[os.path.normpath(group_tag.attrib["name"])] = group_tag - return all_groups - - def _is_hdf5(self, root: ET.Element) -> bool: - """Check if the DMR file is HDF5 based.""" - if root.find(".//dap:Attribute[@name='fullnamepath']", self._ns) is not None: - return True - if root.find("./dap:Attribute[@name='HDF5_GLOBAL']", self._ns) is not None: - return True - return False - - def _parse_hdf5_dataset( - self, - root: ET.Element, - group: Optional[str] = None, - indexes: Mapping[str, Index] = {}, - ) -> Dataset: - """ - Parse the dataset from the HDF5 based dmrpp with groups, starting at the given group. 
- Set root to the given group. + E.g. fqn = "/a/b" --> root.find("./*[@name='a']/*[@name='b']") + See more about OPeNDAP fully qualified names (FQN) here: https://docs.opendap.org/index.php/DAP4:_Specification_Volume_1#Fully_Qualified_Names Parameters ---------- - root : ET.Element - The root element of the DMR file. - - group : str - The group to parse. If None, and no groups are present, the dataset is parsed. - If None and groups are present, the first group is parsed. - - indexes : Mapping[str, Index], default is {} - Indexes to use on the returned xarray Dataset. - Default is {} which will avoid creating any indexes + fqn : str + The fully qualified name of an element. E.g. "/a/b" Returns ------- - xr.Dataset - """ - all_groups = self._split_hdf5(root=root) - if len(all_groups) == 0: - raise ValueError("No groups found in HDF based dmrpp file") - if group is None: - # pick a random group if no group is specified - group = next(iter(all_groups)) - attrs = {} - for attr_tag in root.iterfind("dap:Attribute", self._ns): - if attr_tag.attrib["type"] != "Container": - attrs.update(self._parse_attribute(attr_tag)) - if group in all_groups: - # replace aliased variable names with original names: gt1r_heights -> heights - orignames = self._find_original_names(all_groups[group]) - vds = self._parse_dataset(all_groups[group], indexes) - # Only one group so found attrs are global attrs - if len(all_groups) == 1: - vds.attrs.update(attrs) - return vds.rename(orignames) - raise ValueError(f"Group {group} not found in HDF5 dmrpp file") - - def _find_original_names(self, root: ET.Element) -> dict[str, str]: - """ - Find the original variable names from the HDF based groups. E.g. gt1r_heights -> heights + ET.Element + The matching node found within the root element. - E.g. if the variable name is 'gt1r_heights', the original name is 'heights' from the group 'gt1r'. - - Parameters - ---------- - root : ET.Element - The root element of the DMR file. - - Returns - ------- - dict[str, str] + Raises + ------ + ValueError + If the fully qualified name is not found in the root element. """ + if fqn == "/": + return self.root + elements = fqn.strip("/").split("/") # /a/b/ --> ['a', 'b'] + xpath_segments = [f"*[@name='{element}']" for element in elements] + xpath_query = "./" + "/".join(xpath_segments) # "./[*[@name='a']/*[@name='b']" + element = self.root.find(xpath_query, self._NS) + if element is None: + raise ValueError(f"Path {fqn} not found in provided root") + return element - orignames: dict[str, str] = {} - vars_tags: list[ET.Element] = [] - for dap_dtype in self._dap_np_dtype: - vars_tags += root.findall(f"dap:{dap_dtype}", self._ns) - for var_tag in vars_tags: - origname_tag = var_tag.find( - "./dap:Attribute[@name='origname']/dap:Value", self._ns - ) - if origname_tag is not None and origname_tag.text is not None: - orignames[var_tag.attrib["name"]] = origname_tag.text - return orignames - - def _split_hdf5(self, root: ET.Element) -> dict[str, ET.Element]: + def _split_groups(self, root: ET.Element) -> dict[Path, ET.Element]: """ - Split the input element into several ET.Elements by HDF5 group - E.g. {"gtr1/heights": , "gtr1/temperatures": }. Builds up new elements - each with dimensions, variables, and attributes. + Split the input element into several ET.Elements by name. + E.g. 
{"/": , "left": , "right": } Parameters ---------- @@ -317,86 +203,81 @@ def _split_hdf5(self, root: ET.Element) -> dict[str, ET.Element]: Returns ------- - dict[str, ET.Element] - """ - # Add all variable, dimension, and attribute tags to their respective groups - groups_roots: dict[str, ET.Element] = defaultdict( - lambda: ET.Element(root.tag, root.attrib) - ) - group_dims: dict[str, set[str]] = defaultdict( - set - ) # {"gt1r/heights": {"dim1", "dim2", ...}} - vars_tags: list[ET.Element] = [] - for dap_dtype in self._dap_np_dtype: - vars_tags += root.findall(f"dap:{dap_dtype}", self._ns) - # Variables - for var_tag in vars_tags: - fullname_tag = var_tag.find( - "./dap:Attribute[@name='fullnamepath']/dap:Value", self._ns - ) - if fullname_tag is not None and fullname_tag.text is not None: - # '/gt1r/heights/ph_id_pulse' -> 'gt1r/heights' - group_name = os.path.dirname(fullname_tag.text).removeprefix("/") - groups_roots[group_name].append(var_tag) - dim_tags = var_tag.findall("dap:Dim", self._ns) - dims = self._parse_multi_dims(dim_tags) - group_dims[group_name].update(dims.keys()) - # Dimensions - for dim_tag in root.iterfind("dap:Dimension", self._ns): - for g, d in group_dims.items(): - if dim_tag.attrib["name"] in d: - groups_roots[g].append(dim_tag) - # Attributes - container_attr_tag = root.find("dap:Attribute[@name='HDF5_GLOBAL']", self._ns) - if container_attr_tag is None: - attrs_tags = root.findall("dap:Attribute", self._ns) - for attr_tag in attrs_tags: - fullname_tag = attr_tag.find( - "./dap:Attribute[@name='fullnamepath']/dap:Value", self._ns - ) - if fullname_tag is not None and fullname_tag.text is not None: - group_name = os.path.dirname(fullname_tag.text).removeprefix("/") - # Add all attributes to the new dataset - groups_roots[group_name].extend(attr_tag) - else: - groups_roots[next(iter(groups_roots))].extend(container_attr_tag) - return groups_roots + dict[Path, ET.Element] + """ + all_groups: dict[Path, ET.Element] = {} + dataset_tags = [ + d for d in root if d.tag != "{" + self._NS["dap"] + "}" + "Group" + ] + if len(dataset_tags) > 0: + all_groups[Path("/")] = ET.Element(root.tag, root.attrib) + all_groups[Path("/")].extend(dataset_tags) + all_groups.update(self._split_groups_recursive(root, Path("/"))) + return all_groups + + def _split_groups_recursive( + self, root: ET.Element, current_path=Path("") + ) -> dict[Path, ET.Element]: + group_dict: dict[Path, ET.Element] = {} + for g in root.iterfind("dap:Group", self._NS): + new_path = current_path / Path(g.attrib["name"]) + dataset_tags = [ + d for d in g if d.tag != "{" + self._NS["dap"] + "}" + "Group" + ] + group_dict[new_path] = ET.Element(g.tag, g.attrib) + group_dict[new_path].extend(dataset_tags) + group_dict.update(self._split_groups_recursive(g, new_path)) + return group_dict def _parse_dataset( self, root: ET.Element, indexes: Mapping[str, Index] = {} ) -> Dataset: """ - Parse the dataset using the root element of the DMR file. + Parse the dataset using the root element of the DMR++ file. Parameters ---------- root : ET.Element - The root element of the DMR file. + The root element of the DMR++ file. 
Returns ------- xr.Dataset """ # Dimension names and sizes - dim_tags = root.findall("dap:Dimension", self._ns) - dataset_dims = self._parse_multi_dims(dim_tags) + dims: dict[str, int] = {} + dimension_tags = self._find_dimension_tags(root) + for dim in dimension_tags: + dims.update(self._parse_dim(dim)) # Data variables and coordinates - coord_names = self._find_coord_names(root) - # if no coord_names are found or coords don't include dims, dims are used as coords - if len(coord_names) == 0 or len(coord_names) < len(dataset_dims): - coord_names = set(dataset_dims.keys()) + coord_names: set[str] = set() + coord_tags = root.findall( + ".//dap:Attribute[@name='coordinates']/dap:Value", self._NS + ) + for c in coord_tags: + if c.text is not None: + coord_names.update(c.text.split(" ")) # Seperate and parse coords + data variables coord_vars: dict[str, Variable] = {} data_vars: dict[str, Variable] = {} for var_tag in self._find_var_tags(root): - variable = self._parse_variable(var_tag, dataset_dims) - if var_tag.attrib["name"] in coord_names: + variable = self._parse_variable(var_tag) + # Either coordinates are explicitly defined or 1d variable with same name as dimension is a coordinate + if var_tag.attrib["name"] in coord_names or ( + len(variable.dims) == 1 and variable.dims[0] == var_tag.attrib["name"] + ): coord_vars[var_tag.attrib["name"]] = variable else: data_vars[var_tag.attrib["name"]] = variable # Attributes attrs: dict[str, str] = {} - for attr_tag in self.root.iterfind("dap:Attribute", self._ns): + # Look for an attribute tag called "HDF5_GLOBAL" and unpack it + hdf5_global_attrs = root.find("dap:Attribute[@name='HDF5_GLOBAL']", self._NS) + if hdf5_global_attrs is not None: + # Remove the container attribute and add its children to the root dataset + root.remove(hdf5_global_attrs) + root.extend(hdf5_global_attrs) + for attr_tag in root.iterfind("dap:Attribute", self._NS): attrs.update(self._parse_attribute(attr_tag)) return Dataset( data_vars=data_vars, @@ -406,58 +287,28 @@ def _parse_dataset( def _find_var_tags(self, root: ET.Element) -> list[ET.Element]: """ - Find all variable tags in the DMR file. Also known as array tags. + Find all variable tags in the DMR++ file. Also known as array tags. Tags are labeled with the DAP data type. E.g. , , Parameters ---------- root : ET.Element - The root element of the DMR file. + The root element of the DMR++ file. Returns ------- list[ET.Element] """ vars_tags: list[ET.Element] = [] - for dap_dtype in self._dap_np_dtype: - vars_tags += root.findall(f"dap:{dap_dtype}", self._ns) + for dap_dtype in self._DAP_NP_DTYPE: + vars_tags += root.findall(f"dap:{dap_dtype}", self._NS) return vars_tags - def _find_coord_names(self, root: ET.Element) -> set[str]: - """ - Find the name of all coordinates in root. Checks inside all variables and global attributes. - - Parameters - ---------- - root : ET.Element - The root element of the DMR file. - - Returns - ------- - set[str] : The set of unique coordinate names. 
- """ - # Check for coordinate names within each variable attributes - coord_names: set[str] = set() - for var_tag in self._find_var_tags(root): - coord_tag = var_tag.find( - "./dap:Attribute[@name='coordinates']/dap:Value", self._ns - ) - if coord_tag is not None and coord_tag.text is not None: - coord_names.update(coord_tag.text.split(" ")) - for map_tag in var_tag.iterfind("dap:Map", self._ns): - coord_names.add(map_tag.attrib["name"].removeprefix("/")) - # Check for coordinate names in a global attribute - coord_tag = var_tag.find("./dap:Attribute[@name='coordinates']", self._ns) - if coord_tag is not None and coord_tag.text is not None: - coord_names.update(coord_tag.text.split(" ")) - return coord_names - - def _parse_dim(self, root: ET.Element) -> dict[str, int | None]: + def _parse_dim(self, root: ET.Element) -> dict[str, int]: """ Parse single or tag If the tag has no name attribute, it is a phony dimension. E.g. --> {"phony_dim": 300} - If the tag has no size attribute, it is an unlimited dimension. E.g. --> {"time": None} If the tag has both name and size attributes, it is a regular dimension. E.g. --> {"lat": 1447} Parameters @@ -472,98 +323,84 @@ def _parse_dim(self, root: ET.Element) -> dict[str, int | None]: """ if "name" not in root.attrib and "size" in root.attrib: return {"phony_dim": int(root.attrib["size"])} - if "name" in root.attrib and "size" not in root.attrib: - return {os.path.basename(root.attrib["name"]): None} if "name" in root.attrib and "size" in root.attrib: - return {os.path.basename(root.attrib["name"]): int(root.attrib["size"])} + return {Path(root.attrib["name"]).name: int(root.attrib["size"])} raise ValueError("Not enough information to parse Dim/Dimension tag") - def _parse_multi_dims( - self, dim_tags: list[ET.Element], global_dims: dict[str, int] = {} - ) -> dict: + def _find_dimension_tags(self, root: ET.Element) -> list[ET.Element]: """ - Parse multiple or tags. Generally tags are found within dmrpp variable tags. + Find the all tags with dimension information. - Returns best possible matching of {dimension: shape} present in the list and global_dims. E.g tags=(Dim("lat", None), Dim("lon", None)) and global_dims={"lat": 100, "lon": 100, "time": 5} --> {"lat": 100, "lon": 100} - - E.g. tags=(Dim("time", None), Dim("", 200)) and global_dims={"lat": 100, "lon": 100, "time": 5} --> {"time": 5, "phony_dim0": 200} - - This function is often used to fill in missing sizes from the global_dims. E.g. Variable tags may contain only dimension names and not sizes. If the {name: size} matching is known from the global_dims, it is used to fill in the missing sizes. + First attempts to find Dimension tags, then falls back to Dim tags. + If Dim tags are found, the fully qualified name is used to find the corresponding Dimension tag. Parameters ---------- - dim_tags : tuple[ET.Element] - A tuple of ElementTree Elements representing dimensions in the DMR file. - - global_dims : dict - A dictionary of dimension names and sizes. E.g. {"time": 1, "lat": 1447, "lon": 2895} + root : ET.Element + An ElementTree Element from a DMR++ file. Returns ------- - dict - E.g. 
{"time": 1, "lat": 1447, "lon": 2895} + list[ET.Element] """ - dims: dict[str, int | None] = {} - for dim_tag in dim_tags: - dim: dict[str, int | None] = self._parse_dim(dim_tag) - if "phony_dim" in dim: - dims["phony_dim_" + str(len(dims))] = dim["phony_dim"] - else: - dims.update(dim) - for name, size in list(dims.items()): - if name in global_dims and size is None: - dims[name] = global_dims[name] - return dims - - def _parse_variable( - self, var_tag: ET.Element, dataset_dims: dict[str, int] - ) -> Variable: + dimension_tags = root.findall("dap:Dimension", self._NS) + if not dimension_tags: + # Dim tags contain a fully qualified name that references a Dimension tag elsewhere in the DMR++ + dim_tags = root.findall("dap:Dim", self._NS) + for d in dim_tags: + dimension_tag = self.find_node_fqn(d.attrib["name"]) + if dimension_tag is not None: + dimension_tags.append(dimension_tag) + return dimension_tags + + def _parse_variable(self, var_tag: ET.Element) -> Variable: """ - Parse a variable from a DMR tag. + Parse a variable from a DMR++ tag. Parameters ---------- var_tag : ET.Element - An ElementTree Element representing a variable in the DMR file. Will have DAP dtype as tag. - - dataset_dims : dict - A dictionary of dimension names and sizes. E.g. {"time": 1, "lat": 1447, "lon": 2895} - Must contain at least all the dimensions used by the variable. Necessary since the variable - metadata only contains the dimension names and not the sizes. + An ElementTree Element representing a variable in the DMR++ file. Will have DAP dtype as tag. E.g. Returns ------- xr.Variable """ - # Dimension names - dim_tags = var_tag.findall("dap:Dim", self._ns) - dim_shapes = self._parse_multi_dims(dim_tags, dataset_dims) + # Dimension info + dims: dict[str, int] = {} + dimension_tags = self._find_dimension_tags(var_tag) + if not dimension_tags: + raise ValueError("Variable has no dimensions") + for dim in dimension_tags: + dims.update(self._parse_dim(dim)) # convert DAP dtype to numpy dtype dtype = np.dtype( - self._dap_np_dtype[var_tag.tag.removeprefix("{" + self._ns["dap"] + "}")] + self._DAP_NP_DTYPE[var_tag.tag.removeprefix("{" + self._NS["dap"] + "}")] ) # Chunks and Filters filters = None - shape: tuple[int, ...] = tuple(dim_shapes.values()) + shape: tuple[int, ...] 
= tuple(dims.values()) chunks_shape = shape - chunks_tag = var_tag.find("dmr:chunks", self._ns) + chunks_tag = var_tag.find("dmrpp:chunks", self._NS) if chunks_tag is not None: # Chunks - found_chunk_dims = self._parse_chunks_dimensions(chunks_tag) - chunks_shape = found_chunk_dims if found_chunk_dims is not None else shape + chunk_dim_text = chunks_tag.findtext( + "dmrpp:chunkDimensionSizes", namespaces=self._NS + ) + if chunk_dim_text is not None: + # 1 1447 2895 -> (1, 1447, 2895) + chunks_shape = tuple(map(int, chunk_dim_text.split())) + else: + chunks_shape = shape chunkmanifest = self._parse_chunks(chunks_tag, chunks_shape) # Filters filters = self._parse_filters(chunks_tag, dtype) # Attributes attrs: dict[str, Any] = {} - for attr_tag in var_tag.iterfind("dap:Attribute", self._ns): + for attr_tag in var_tag.iterfind("dap:Attribute", self._NS): attrs.update(self._parse_attribute(attr_tag)) # Fill value is placed in encoding and thus removed from attributes - fill_value = attrs.pop("_FillValue", 0.0) - # Remove attributes only used for parsing logic - attrs.pop("fullnamepath", None) - attrs.pop("origname", None) - attrs.pop("coordinates", None) + fill_value = attrs.pop("_FillValue", None) # create ManifestArray and ZArray zarray = ZArray( chunks=chunks_shape, @@ -574,14 +411,13 @@ def _parse_variable( shape=shape, ) marr = ManifestArray(zarray=zarray, chunkmanifest=chunkmanifest) - encoding = {k: attrs.get(k) for k in self._encoding_keys if k in attrs} - return Variable( - dims=dim_shapes.keys(), data=marr, attrs=attrs, encoding=encoding - ) + encoding = {k: attrs.get(k) for k in self._ENCODING_KEYS if k in attrs} + return Variable(dims=dims.keys(), data=marr, attrs=attrs, encoding=encoding) def _parse_attribute(self, attr_tag: ET.Element) -> dict[str, Any]: """ - Parse an attribute from a DMR attr tag. Converts the attribute value to a native python type. + Parse an attribute from a DMR++ attr tag. Converts the attribute value to a native python type. + Raises an exception if nested attributes are passed. Container attributes must be unwrapped in the parent function. Parameters ---------- @@ -595,8 +431,13 @@ def _parse_attribute(self, attr_tag: ET.Element) -> dict[str, Any]: attr: dict[str, Any] = {} values = [] if "type" in attr_tag.attrib and attr_tag.attrib["type"] == "Container": - return attr - dtype = np.dtype(self._dap_np_dtype[attr_tag.attrib["type"]]) + # DMR++ build information that is not part of the dataset + if attr_tag.attrib["name"] == "build_dmrpp_metadata": + return {} + raise ValueError( + "Nested attributes cannot be assigned to a variable or dataset" + ) + dtype = np.dtype(self._DAP_NP_DTYPE[attr_tag.attrib["type"]]) # if multiple Value tags are present, store as "key": "[v1, v2, ...]" for value_tag in attr_tag: # cast attribute to native python type using dmr provided dtype @@ -605,6 +446,7 @@ def _parse_attribute(self, attr_tag: ET.Element) -> dict[str, Any]: if dtype != np.object_ else value_tag.text ) + # "*" may represent nan values in DMR++ if val == "*": val = np.nan values.append(val) @@ -615,7 +457,7 @@ def _parse_filters( self, chunks_tag: ET.Element, dtype: np.dtype ) -> list[dict] | None: """ - Parse filters from a DMR chunks tag. + Parse filters from a DMR++ chunks tag. 
Parameters ---------- @@ -643,7 +485,7 @@ def _parse_filters( "id": "zlib", "level": int( chunks_tag.attrib.get( - "deflateLevel", self._default_zlib_value + "deflateLevel", self._DEFAULT_ZLIB_VALUE ) ), } @@ -651,33 +493,11 @@ def _parse_filters( return filters return None - def _parse_chunks_dimensions( - self, chunks_tag: ET.Element - ) -> tuple[int, ...] | None: - """ - Parse the chunk dimensions from a DMR chunks tag. Returns None if no chunk dimensions are found. - - Parameters - ---------- - chunks_tag : ET.Element - An ElementTree Element with a tag. - - Returns - ------- - tuple[int, ...] | None - - """ - chunk_dim_tag = chunks_tag.find("dmr:chunkDimensionSizes", self._ns) - if chunk_dim_tag is not None and chunk_dim_tag.text is not None: - # 1 1447 2895 -> (1, 1447, 2895) - return tuple(map(int, chunk_dim_tag.text.split())) - return None - def _parse_chunks( self, chunks_tag: ET.Element, chunks_shape: tuple[int, ...] ) -> ChunkManifest: """ - Parse the chunk manifest from a DMR chunks tag. + Parse the chunk manifest from a DMR++ chunks tag. Parameters ---------- @@ -696,7 +516,7 @@ def _parse_chunks( [0 for i in range(len(chunks_shape))] if chunks_shape else [0] ) chunk_key_template = ".".join(["{}" for i in range(len(default_num))]) - for chunk_tag in chunks_tag.iterfind("dmr:chunk", self._ns): + for chunk_tag in chunks_tag.iterfind("dmrpp:chunk", self._NS): chunk_num = default_num if "chunkPositionInArray" in chunk_tag.attrib: # "[0,1023,10235]" -> ["0","1023","10235"] diff --git a/virtualizarr/tests/test_readers/test_dmrpp.py b/virtualizarr/tests/test_readers/test_dmrpp.py index d2b19d60..cbafc40f 100644 --- a/virtualizarr/tests/test_readers/test_dmrpp.py +++ b/virtualizarr/tests/test_readers/test_dmrpp.py @@ -1,22 +1,356 @@ +import textwrap +from pathlib import Path +from xml.etree import ElementTree as ET + +import numpy as np import pytest import xarray as xr +import xarray.testing as xrt from virtualizarr import open_virtual_dataset +from virtualizarr.manifests.manifest import ChunkManifest +from virtualizarr.readers.dmrpp import DMRParser from virtualizarr.tests import network urls = [ ( - "netcdf4", - "https://github.com/OPENDAP/bes/raw/3e518f6dc2f625b0b83cfb6e6fd5275e4d6dcef1/modules/dmrpp_module/data/dmrpp/chunked_threeD.h5", - "dmrpp", - "https://github.com/OPENDAP/bes/raw/3e518f6dc2f625b0b83cfb6e6fd5275e4d6dcef1/modules/dmrpp_module/data/dmrpp/chunked_threeD.h5.dmrpp", + "https://its-live-data.s3-us-west-2.amazonaws.com/test-space/cloud-experiments/dmrpp/20240826090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc", + "https://its-live-data.s3-us-west-2.amazonaws.com/test-space/cloud-experiments/dmrpp/20240826090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc.dmrpp", ) + # TODO: later add MUR, SWOT, TEMPO and others by using kerchunk JSON to read refs (rather than reading the whole netcdf file) ] +@pytest.fixture +def basic_dmrpp() -> DMRParser: + xml_str = """\ + + + + + + + + + grid x-axis + + + + + + + + + grid y-axis + + + + + + + + + grid z-axis + + + + + + + + + + analysed sea surface temperature + + + 1 + 2 + 3 + + + -32768 + + + 298.14999999999998 + + + 0.001 + + + x y z + + + 360 720 + + + + + + + + + + + + + + + + + mask + + + + + + + CF-1.6 + + + Sample Dataset + + + """ + return DMRParser(root=ET.fromstring(textwrap.dedent(xml_str))) + + +@pytest.fixture +def nested_groups_dmrpp() -> DMRParser: + xml_str = """\ + + + + + + + + + + + + + + + + + + + + + + + test + + + + + + + + + test + + + + + + + + + + + + + + + + + """ + return 
DMRParser(root=ET.fromstring(textwrap.dedent(xml_str))) + + @network -@pytest.mark.parametrize("data_type, data_url, dmrpp_type, dmrpp_url", urls) -def test_dmrpp_reader(data_type, data_url, dmrpp_type, dmrpp_url): - result = open_virtual_dataset(dmrpp_url, indexes={}, filetype=dmrpp_type) +@pytest.mark.parametrize("data_url, dmrpp_url", urls) +@pytest.mark.skip(reason="Fill_val mismatch") +def test_NASA_dmrpp(data_url, dmrpp_url): + result = open_virtual_dataset(dmrpp_url, indexes={}, filetype="dmrpp") expected = open_virtual_dataset(data_url, indexes={}) xr.testing.assert_identical(result, expected) + + +@pytest.mark.parametrize( + "dmrpp_fixture, fqn_path, expected_xpath", + [ + ("basic_dmrpp", "/", "."), + ("basic_dmrpp", "/data", "./*[@name='data']"), + ("basic_dmrpp", "/data/items", "./*[@name='data']/*[@name='items']"), + ( + "nested_groups_dmrpp", + "/group1/group2/area", + "./*[@name='group1']/*[@name='group2']/*[@name='area']", + ), + ], +) +def test_find_node_fqn(request, dmrpp_fixture, fqn_path, expected_xpath): + parser_instance = request.getfixturevalue(dmrpp_fixture) + result = parser_instance.find_node_fqn(fqn_path) + expected = parser_instance.root.find(expected_xpath, parser_instance._NS) + assert result == expected + + +@pytest.mark.parametrize( + "dmrpp_fixture, group_path", + [ + ("basic_dmrpp", "/"), + ("nested_groups_dmrpp", "/"), + ("nested_groups_dmrpp", "/group1"), + ("nested_groups_dmrpp", "/group1/group2"), + ], +) +def test_split_groups(request, dmrpp_fixture, group_path): + dmrpp_instance = request.getfixturevalue(dmrpp_fixture) + # get all tags in a dataset (so all tags excluding nested groups) + dataset_tags = lambda x: [ + d for d in x if d.tag != "{" + dmrpp_instance._NS["dap"] + "}" + "Group" + ] + # check that contents of the split groups dataset match contents of the original dataset + result_tags = dataset_tags( + dmrpp_instance._split_groups(dmrpp_instance.root)[Path(group_path)] + ) + expected_tags = dataset_tags(dmrpp_instance.find_node_fqn(group_path)) + assert result_tags == expected_tags + + +def test_parse_dataset(basic_dmrpp, nested_groups_dmrpp): + vds = basic_dmrpp.parse_dataset() + assert vds.sizes == {"x": 720, "y": 1440, "z": 3} + assert vds.data_vars.keys() == {"data", "mask"} + assert vds.data_vars["data"].dims == ("x", "y") + assert vds.attrs == {"Conventions": "CF-1.6", "title": "Sample Dataset"} + assert vds.coords.keys() == {"x", "y", "z"} + vds_root_implicit = nested_groups_dmrpp.parse_dataset() + vds_root = nested_groups_dmrpp.parse_dataset(group="/") + xrt.assert_identical(vds_root_implicit, vds_root) + assert vds_root.sizes == {"a": 10, "b": 10} + assert vds_root.coords.keys() == {"a", "b"} + vds_g1 = nested_groups_dmrpp.parse_dataset(group="/group1") + assert vds_g1.sizes == {"x": 720, "y": 1440} + assert vds_g1.coords.keys() == {"x", "y"} + vds_g2 = nested_groups_dmrpp.parse_dataset(group="/group1/group2") + assert vds_g2.sizes == {"x": 720, "y": 1440} + assert vds_g2.data_vars.keys() == {"area"} + assert vds_g2.data_vars["area"].dims == ("x", "y") + + +@pytest.mark.parametrize( + "dim_path, expected", + [ + ("/a", {"a": 10}), + ("/group1/x", {"x": 720}), + ], +) +def test_parse_dim(nested_groups_dmrpp, dim_path, expected): + result = nested_groups_dmrpp._parse_dim(nested_groups_dmrpp.find_node_fqn(dim_path)) + assert result == expected + + +@pytest.mark.parametrize("dim_path", ["/", "/mask"]) +def test_find_dimension_tags(basic_dmrpp, dim_path): + # Check that Dimension tags match Dimension tags from the root + # Check that 
Dim tags reference the same Dimension tags from the root + assert basic_dmrpp._find_dimension_tags( + basic_dmrpp.find_node_fqn(dim_path) + ) == basic_dmrpp.root.findall("dap:Dimension", basic_dmrpp._NS) + + +def test_parse_variable(basic_dmrpp): + var = basic_dmrpp._parse_variable(basic_dmrpp.find_node_fqn("/data")) + assert var.dtype == "float32" + assert var.dims == ("x", "y") + assert var.shape == (720, 1440) + assert var.data.zarray.chunks == (360, 720) + assert var.data.zarray.fill_value == -32768 + assert var.encoding == {"add_offset": 298.15, "scale_factor": 0.001} + assert var.attrs == { + "long_name": "analysed sea surface temperature", + "items": [1, 2, 3], + "coordinates": "x y z", + "add_offset": 298.15, + "scale_factor": 0.001, + } + + +@pytest.mark.parametrize( + "attr_path, expected", + [ + ("data/long_name", {"long_name": "analysed sea surface temperature"}), + ("data/items", {"items": [1, 2, 3]}), + ("data/_FillValue", {"_FillValue": -32768}), + ], +) +def test_parse_attribute(basic_dmrpp, attr_path, expected): + result = basic_dmrpp._parse_attribute(basic_dmrpp.find_node_fqn(attr_path)) + assert result == expected + + +@pytest.mark.parametrize( + "var_path, dtype, expected_filters", + [ + ( + "/data", + np.dtype("float32"), + [ + {"elementsize": np.dtype("float32").itemsize, "id": "shuffle"}, + {"id": "zlib", "level": 5}, + ], + ), + ( + "/mask", + np.dtype("float32"), + [{"elementsize": np.dtype("float32").itemsize, "id": "shuffle"}], + ), + ], +) +def test_parse_filters(basic_dmrpp, var_path, dtype, expected_filters): + chunks_tag = basic_dmrpp.find_node_fqn(var_path).find( + "dmrpp:chunks", basic_dmrpp._NS + ) + result = basic_dmrpp._parse_filters(chunks_tag, dtype) + assert result == expected_filters + + +@pytest.mark.parametrize( + "var_path, chunk_shape, expected_lengths, expected_offsets, expected_paths", + [ + ( + "/data", + (360, 720), + np.full((3, 3), 4083, dtype=np.uint64), + (np.arange(9, dtype=np.uint64) * 4083 + 40762).reshape(3, 3), + np.full((3, 3), "test.dmrpp", dtype=np.dtypes.StringDType), + ), + ( + "/mask", + (720, 1440), + np.array([4], dtype=np.uint64), + np.array([41276], dtype=np.uint64), + np.array(["test.dmrpp"], dtype=np.dtypes.StringDType), + ), + ], +) +def test_parse_chunks( + basic_dmrpp, + var_path, + chunk_shape, + expected_lengths, + expected_offsets, + expected_paths, +): + chunks_tag = basic_dmrpp.find_node_fqn(var_path).find( + "dmrpp:chunks", basic_dmrpp._NS + ) + result = basic_dmrpp._parse_chunks(chunks_tag, chunk_shape) + expected = ChunkManifest.from_arrays( + lengths=expected_lengths, offsets=expected_offsets, paths=expected_paths + ) + assert result == expected From 3bf3f78442d7c3f9978aa2fcb2ad05601820152a Mon Sep 17 00:00:00 2001 From: Josh Moore Date: Thu, 14 Nov 2024 16:24:52 +0100 Subject: [PATCH 10/15] Update README.md (#294) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index dc581297..d4e2dbe8 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ **VirtualiZarr creates virtual Zarr stores for cloud-friendly access to archival data, using familiar xarray syntax.** -VirtualiZarr (pronounced like "virtualize" but more piratey) grew out of [discussions](https://github.com/fsspec/kerchunk/issues/377) on the [kerchunk repository](https://github.com/fsspec/kerchunk), and is an attempt to provide the game-changing power of kerchunk in a zarr-native way, and with a familiar array-like API. 
+VirtualiZarr (pronounced like "virtualizer" but more piratey) grew out of [discussions](https://github.com/fsspec/kerchunk/issues/377) on the [kerchunk repository](https://github.com/fsspec/kerchunk), and is an attempt to provide the game-changing power of kerchunk in a zarr-native way, and with a familiar array-like API. You now have a choice between using VirtualiZarr and Kerchunk: VirtualiZarr provides [almost all the same features](https://virtualizarr.readthedocs.io/en/latest/faq.html#how-do-virtualizarr-and-kerchunk-compare) as Kerchunk. From fe39115027685d820169b474754819134e8a69eb Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Fri, 15 Nov 2024 15:41:40 -0500 Subject: [PATCH 11/15] Remove numcodecs specific install (#301) --- ci/upstream.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/upstream.yml b/ci/upstream.yml index 035d76f8..51b5b8dc 100644 --- a/ci/upstream.yml +++ b/ci/upstream.yml @@ -26,5 +26,4 @@ dependencies: - pip - pip: - icechunk # Installs zarr v3 as dependency - - git+https://github.com/zarr-developers/numcodecs@zarr3-codecs # zarr-v3 compatibility branch # - git+https://github.com/fsspec/kerchunk@main # kerchunk is currently incompatible with zarr-python v3 (https://github.com/fsspec/kerchunk/pull/516) From cb9951c521e861781f11242ae66131b60dfd24d9 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 15 Nov 2024 14:39:41 -0700 Subject: [PATCH 12/15] Update contributors guide (#298) * releases * release versions * what happens when you publish * extra step to add empty release notes * need to install package in docs env * correct name of docs env * how to open html build on MacOS * link to code of conduct * text explanation * explain --run-network-tests * release note --- docs/contributing.md | 22 +++++++++++++++++----- docs/releases.rst | 2 ++ 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/docs/contributing.md b/docs/contributing.md index 4028dcaf..4d6b0fcf 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -1,7 +1,11 @@ # Contributing +Contributions are welcome and encouraged! We ask only that all contributors follow the [Zarr Developers Code of Conduct](https://github.com/zarr-developers/.github/blob/main/CODE_OF_CONDUCT.md). + ## Contributing code +Before opening a PR to contribute code you should check that your changes work by running the test suite locally. + ```bash mamba env create -f ci/environment.yml mamba activate virtualizarr-tests @@ -11,13 +15,18 @@ python -m pip install -e . --no-deps python -m pytest ./virtualizarr --run-network-tests --cov=./ --cov-report=xml --verbose ``` +The `--run-network-tests` argument is optional - it will run additional tests that require downloading files over the network. Skip this if you want the tests to run faster or you have no internet access. + ## Contributing documentation +Whilst the CI will build the updated documentation for each PR, it can also be useful to check that the documentation has rendered as expected by building it locally. + ### Build the documentation locally ```bash mamba env create -f ci/doc.yml -mamba activate docs +mamba activate virtualizarr-docs +pip install -e . 
# From project's root - needed to generate API docs cd docs # From project's root rm -rf generated make clean @@ -26,14 +35,17 @@ make html ### Access the documentation locally -Open `docs/_build/html/index.html` in a web browser +Open `docs/_build/html/index.html` in a web browser (on MacOS you can do this from the terminal using `open docs/_build/html/index.html`). ## Making a release -1. Navigate to the [https://github.com/zarr-developers/virtualizarr/releases](https://github.com/zarr-developers/virtualizarr/releases) release page. +Anyone with commit privileges to the repository can issue a release. + +1. Navigate to the [https://github.com/zarr-developers/virtualizarr/releases](https://github.com/zarr-developers/virtualizarr/releases) releases page. 2. Select draft a new release. 3. Select 'Choose a tag', then 'create a new tag' -4. Enter the name for the new tag following the [EffVer](https://jacobtomlinson.dev/effver/) versioning scheme (e.g., releasing v0.2.0 as the next release denotes that “some small effort may be required to make sure this version works for you”). +4. Enter the name for the new tag following the [EffVer](https://jacobtomlinson.dev/effver/) versioning scheme (e.g., releasing v0.2.0 as the next release after v0.1.0 denotes that “some small effort may be required to make sure this version works for you”). 4. Click 'Generate Release Notes' to draft notes based on merged pull requests. 5. Edit the draft release notes for consistency. -6. Publish the release. +6. Select 'Publish' to publish the release. This should automatically upload the new release to PyPI and Conda-Forge. +7. Create and merge a PR to add a new empty section to the `docs/releases.rst` for the next release in the future. diff --git a/docs/releases.rst b/docs/releases.rst index 56f3ac90..42d92743 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -33,6 +33,8 @@ Documentation - FAQ answers on Icechunk compatibility, converting from existing Kerchunk references to Icechunk, and how to add a new reader for a custom file format. (:pull:`266`) By `Tom Nicholas `_. +- Minor improvements to the Contributing Guide. + (:pull:`298`) By `Tom Nicholas `_. 
Internal Changes ~~~~~~~~~~~~~~~~ From 545cae7fc6bb03fa5df55b0f7febb9c23bff1b21 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 15 Nov 2024 14:54:35 -0700 Subject: [PATCH 13/15] Clarify which features are currently available in FAQ (#296) * clarifies which readers actually work * removes tiff from auto-detection for now * release notes * xfail tiff filetype detection test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix emojis * clarify how to combine in coordinate order * add line for generating references from zarr v3 store * typo * add link to icechunk * add table entry for icechunk * add dmr++ table entry * remove reference to xarray backend for virtualizarr that doesn't exist * mention icechunk in overall explanation * don't imply that all virtualizarr readers use kerchunk * use crosses to indicate features kerchunk doesn't have * add table entry for a HDF4 reader * add table entries on how to rename vars/dims * add table entry for renaming paths in manifest * add warning emojis to the parallelization ideas to indicate they are as-yet untested * actually kerchunk does support renaming filepaths in the manifest * remove rogue | * remove redundant link * specify filetype kwarg needed in open_virtual_dataset * add table entry on how to open existing kerchunk references * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- docs/faq.md | 50 ++++++++++++++++++------------ docs/releases.rst | 2 ++ virtualizarr/backend.py | 3 +- virtualizarr/tests/test_backend.py | 5 +-- 4 files changed, 35 insertions(+), 25 deletions(-) diff --git a/docs/faq.md b/docs/faq.md index 81f55aa3..a0274620 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -4,30 +4,35 @@ I'm glad you asked! We can think of the problem of providing virtualized zarr-like access to a set of legacy files in some other format as a series of steps: -1) **Read byte ranges** - We use the various [kerchunk file format backends](https://fsspec.github.io/kerchunk/reference.html#file-format-backends) to determine which byte ranges within a given legacy file would have to be read in order to get a specific chunk of data we want. -2) **Construct a representation of a single file (or array within a file)** - Kerchunk's backends return a nested dictionary representing an entire file, but we instead immediately parse this dict and wrap it up into a set of `ManifestArray` objects. The record of where to look to find the file and the byte ranges is stored under the `ManifestArray.manifest` attribute, in a `ChunkManifest` object. Both steps (1) and (2) are handled by the `'virtualizarr'` xarray backend, which returns one `xarray.Dataset` object per file, each wrapping multiple `ManifestArray` instances (as opposed to e.g. numpy/dask arrays). +1) **Read byte ranges** - We use various [virtualizarr readers](https://github.com/zarr-developers/VirtualiZarr/tree/main/virtualizarr/readers) to determine which byte ranges within a given legacy file would have to be read in order to get a specific chunk of data we want. Several of these readers work by calling one of the [kerchunk file format backends](https://fsspec.github.io/kerchunk/reference.html#file-format-backends) and parsing the output. 
+2) **Construct a representation of a single file (or array within a file)** - Kerchunk's backends return a nested dictionary representing an entire file, but we instead immediately parse this dict and wrap it up into a set of `ManifestArray` objects. The record of where to look to find the file and the byte ranges is stored under the `ManifestArray.manifest` attribute, in a `ChunkManifest` object. Both steps (1) and (2) are handled by the `virtualizarr.open_virtual_dataset`, which returns one `xarray.Dataset` object for the given file, which wraps multiple `ManifestArray` instances (as opposed to e.g. numpy/dask arrays). 3) **Deduce the concatenation order** - The desired order of concatenation can either be inferred from the order in which the datasets are supplied (which is what `xr.combined_nested` assumes), or it can be read from the coordinate data in the files (which is what `xr.combine_by_coords` does). If the ordering information is not present as a coordinate (e.g. because it's in the filename), a pre-processing step might be required. 4) **Check that the desired concatenation is valid** - Whether called explicitly by the user or implicitly via `xr.combine_nested/combine_by_coords/open_mfdataset`, `xr.concat` is used to concatenate/stack the wrapped `ManifestArray` objects. When doing this xarray will spend time checking that the array objects and any coordinate indexes can be safely aligned and concatenated. Along with opening files, and loading coordinates in step (3), this is the main reason why `xr.open_mfdataset` can take a long time to return a dataset created from a large number of files. 5) **Combine into one big dataset** - `xr.concat` dispatches to the `concat/stack` methods of the underlying `ManifestArray` objects. These perform concatenation by merging their respective Chunk Manifests. Using xarray's `combine_*` methods means that we can handle multi-dimensional concatenations as well as merging many different variables. -6) **Serialize the combined result to disk** - The resultant `xr.Dataset` object wraps `ManifestArray` objects which contain the complete list of byte ranges for every chunk we might want to read. We now serialize this information to disk, either using the [kerchunk specification](https://fsspec.github.io/kerchunk/spec.html#version-1), or in future we plan to use [new Zarr extensions](https://github.com/zarr-developers/zarr-specs/issues/287) to write valid Zarr stores directly. -7) **Open the virtualized dataset from disk** - The virtualized zarr store can now be read from disk, skipping all the work we did above. Chunk reads from this store will be redirected to read the corresponding bytes in the original legacy files. +6) **Serialize the combined result to disk** - The resultant `xr.Dataset` object wraps `ManifestArray` objects which contain the complete list of byte ranges for every chunk we might want to read. We now serialize this information to disk, either using the [Kerchunk specification](https://fsspec.github.io/kerchunk/spec.html#version-1), or the [Icechunk specification](https://icechunk.io/spec/). +7) **Open the virtualized dataset from disk** - The virtualized zarr store can now be read from disk, avoiding redoing all the work we did above and instead just opening all the virtualized data immediately. Chunk reads will be redirected to read the corresponding bytes in the original legacy files. 
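To make step (2) above concrete, here is a minimal sketch of the in-memory representation, built by hand from the plain-dictionary form that `ChunkManifest` documents (the `s3://bucket/foo.nc` path and the byte offsets/lengths are illustrative placeholders, and the import location is assumed from this repository's package layout):

```python
# Sketch only: construct a ChunkManifest for a small three-dimensional chunk
# grid (the grid shape is inferred from the chunk keys) directly from the
# documented dict-of-byte-ranges form.
from virtualizarr.manifests import ChunkManifest

entries = {
    "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100},
    "0.0.1": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100},
    "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100},
    "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100},
}

manifest = ChunkManifest(entries)
print(manifest.dict())  # round-trips back to the plain-dict form shown above
```

A `ManifestArray` then wraps such a manifest together with the array's Zarr metadata (its `.manifest` and `.zarray` attributes), which is what lets xarray's `concat`/`merge` machinery in steps (4) and (5) combine virtual variables purely by merging manifests, without ever reading chunk data.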
-The above steps would also be performed using the `kerchunk` library alone, but because (3), (4), (5), and (6) are all performed by the `kerchunk.combine.MultiZarrToZarr` function, and no internal abstractions are exposed, kerchunk's design is much less modular, and the use cases are limited by kerchunk's API surface. +The above steps could also be performed using the `kerchunk` library alone, but because (3), (4), (5), and (6) are all performed by the `kerchunk.combine.MultiZarrToZarr` function, and no internal abstractions are exposed, kerchunk's design is much less modular, and the use cases are limited by kerchunk's API surface. ## How do VirtualiZarr and Kerchunk compare? -You now have a choice between using VirtualiZarr and Kerchunk: VirtualiZarr provides [almost all the same features](https://virtualizarr.readthedocs.io/en/latest/faq.html#how-do-virtualizarr-and-kerchunk-compare) as Kerchunk. +You now have a choice between using VirtualiZarr and Kerchunk: VirtualiZarr provides almost all the same features as Kerchunk. Users of kerchunk may find the following comparison table useful, which shows which features of kerchunk map on to which features of VirtualiZarr. + | Component / Feature | Kerchunk | VirtualiZarr | | ------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ | | **Generation of references from archival files (1)** | | | -| From a netCDF4/HDF5 file | `kerchunk.hdf.SingleHdf5ToZarr` | `open_virtual_dataset`, via `kerchunk.hdf.SingleHdf5ToZarr` or potentially `hidefix` | -| From a netCDF3 file | `kerchunk.netCDF3.NetCDF3ToZarr` | `open_virtual_dataset`, via `kerchunk.netCDF3.NetCDF3ToZarr` | -| From a COG / tiff file | `kerchunk.tiff.tiff_to_zarr` | `open_virtual_dataset`, via `kerchunk.tiff.tiff_to_zarr` or potentially `cog3pio` | -| From a Zarr v2 store | `kerchunk.zarr.ZarrToZarr` | `open_virtual_dataset`, via `kerchunk.zarr.ZarrToZarr` ? | -| From a GRIB2 file | `kerchunk.grib2.scan_grib` | `open_virtual_datatree`, via `kerchunk.grib2.scan_grib` ? 
| -| From a FITS file | `kerchunk.fits.process_file` | `open_virtual_dataset`, via `kerchunk.fits.process_file` | +| From a netCDF4/HDF5 file | `kerchunk.hdf.SingleHdf5ToZarr` | `open_virtual_dataset(..., filetype='hdf5')`, via `kerchunk.hdf.SingleHdf5ToZarr` | +| From a netCDF3 file | `kerchunk.netCDF3.NetCDF3ToZarr` | `open_virtual_dataset(..., filetype='netcdf3')`, via `kerchunk.netCDF3.NetCDF3ToZarr` | +| From a COG / tiff file | `kerchunk.tiff.tiff_to_zarr` | `open_virtual_dataset(..., filetype='tiff')`, via `kerchunk.tiff.tiff_to_zarr` or potentially `tifffile` (❌ Not yet implemented - see [issue #291](https://github.com/zarr-developers/VirtualiZarr/issues/291)) | +| From a Zarr v2 store | `kerchunk.zarr.ZarrToZarr` | `open_virtual_dataset(..., filetype='zarr')` (❌ Not yet implemented - see [issue #262](https://github.com/zarr-developers/VirtualiZarr/issues/262)) | +| From a Zarr v3 store | ❌ | `open_virtual_dataset(..., filetype='zarr')` (❌ Not yet implemented - see [issue #262](https://github.com/zarr-developers/VirtualiZarr/issues/262)) | +| From a GRIB2 file | `kerchunk.grib2.scan_grib` | `open_virtual_datatree(..., filetype='grib')` (❌ Not yet implemented - see [issue #11](https://github.com/zarr-developers/VirtualiZarr/issues/11)) | +| From a FITS file | `kerchunk.fits.process_file` | `open_virtual_dataset(..., filetype='fits')`, via `kerchunk.fits.process_file` | +| From a HDF4 file | `kerchunk.hdf4.HDF4ToZarr` | `open_virtual_dataset(..., filetype='hdf4')`, via `kerchunk.hdf4.HDF4ToZarr` (❌ Not yet implemented - see [issue #216](https://github.com/zarr-developers/VirtualiZarr/issues/216)) | +| From a [DMR++](https://opendap.github.io/DMRpp-wiki/DMRpp.html) metadata file | ❌ | `open_virtual_dataset(..., filetype='dmrpp')`, via `virtualizarr.readers.dmrpp.DMRParser` | +| From existing kerchunk JSON/parquet references | `kerchunk.combine.MultiZarrToZarr(append=True)` | `open_virtual_dataset(..., filetype='kerchunk')` | | **In-memory representation (2)** | | | | In-memory representation of byte ranges for single array | Part of a "reference `dict`" with keys for each chunk in array | `ManifestArray` instance (wrapping a `ChunkManifest` instance) | | In-memory representation of actual data values | Encoded bytes directly serialized into the "reference `dict`", created on a per-chunk basis using the `inline_threshold` kwarg | `numpy.ndarray` instances, created on a per-variable basis using the `loadable_variables` kwarg | @@ -35,15 +40,22 @@ Users of kerchunk may find the following comparison table useful, which shows wh | **Manipulation of in-memory references (3, 4 & 5)** | | | | Combining references to multiple arrays representing different variables | `kerchunk.combine.MultiZarrToZarr` | `xarray.merge` | | Combining references to multiple arrays representing the same variable | `kerchunk.combine.MultiZarrToZarr` using the `concat_dims` kwarg | `xarray.concat` | -| Combining references in coordinate order | `kerchunk.combine.MultiZarrToZarr` using the `coo_map` kwarg | `xarray.combine_by_coords` with in-memory xarray indexes created by loading coordinate variables first | -| Combining along multiple dimensions without coordinate data | n/a | `xarray.combine_nested` | -| **Parallelization** | | | -| Parallelized generation of references | Wrapping kerchunk's opener inside `dask.delayed` | Wrapping `open_virtual_dataset` inside `dask.delayed` but eventually instead using `xarray.open_mfdataset(..., parallel=True)` | -| Parallelized combining 
of references (tree-reduce) | `kerchunk.combine.auto_dask` | Wrapping `ManifestArray` objects within `dask.array.Array` objects inside `xarray.Dataset` to use dask's `concatenate` | +| Combining references in coordinate order | `kerchunk.combine.MultiZarrToZarr` using the `coo_map` kwarg | `xarray.combine_by_coords` with in-memory coordinate variables loaded via the `loadable_variables` kwarg | +| Combining along multiple dimensions without coordinate data | ❌ | `xarray.combine_nested` | +| Dropping variables | `kerchunk.combine.drop` | `xarray.Dataset.drop_vars`, or `open_virtual_dataset(..., drop_variables=...)` | +| Renaming variables | ❌ | `xarray.Dataset.rename_vars` | +| Renaming dimensions | ❌ | `xarray.Dataset.rename_dims` | +| Renaming manifest file paths | `kerchunk.utils.rename_target` | `vds.virtualize.rename_paths` | +| Splitting uncompressed data into chunks | `kerchunk.utils.subchunk` | `xarray.Dataset.chunk` (❌ Not yet implemented - see [PR #199](https://github.com/zarr-developers/VirtualiZarr/pull/199)) +| Selecting specific chunks | ❌ | `xarray.Dataset.isel` (❌ Not yet implemented - see [issue #51](https://github.com/zarr-developers/VirtualiZarr/issues/51)) | +**Parallelization** | | | +| Parallelized generation of references | Wrapping kerchunk's opener inside `dask.delayed` | Wrapping `open_virtual_dataset` inside `dask.delayed` (⚠️ Untested) +| Parallelized combining of references (tree-reduce) | `kerchunk.combine.auto_dask` | Wrapping `ManifestArray` objects within `dask.array.Array` objects inside `xarray.Dataset` to use dask's `concatenate` (⚠️ Untested) | | **On-disk serialization (6) and reading (7)** | | | | Kerchunk reference format as JSON | `ujson.dumps(h5chunks.translate())` , then read using an `fsspec.filesystem` mapper | `ds.virtualize.to_kerchunk('combined.json', format='JSON')` , then read using an `fsspec.filesystem` mapper | | Kerchunk reference format as parquet | `df.refs_to_dataframe(out_dict, "combined.parq")`, then read using an `fsspec` `ReferenceFileSystem` mapper | `ds.virtualize.to_kerchunk('combined.parq', format=parquet')` , then read using an `fsspec` `ReferenceFileSystem` mapper | -| Zarr v3 store with `manifest.json` files | n/a | `ds.virtualize.to_zarr()`, then read via any Zarr v3 reader which implements the manifest storage transformer ZEP | +| Zarr v3 store with `manifest.json` files | ❌ | `ds.virtualize.to_zarr()`, then read via any Zarr v3 reader which implements the manifest storage transformer ZEP | +| [Icechunk](https://icechunk.io/) store | ❌ | `ds.virtualize.to_icechunk()`, then read back via xarray (requires zarr-python v3). | ## Why a new project? @@ -71,7 +83,7 @@ If you see other opportunities then we would love to hear your ideas! ## Is this compatible with Icechunk? -Yes! VirtualiZarr allows you to ingest data as virtual references and write those references into an Icechunk Store. See the [Icechunk documentation on creating virtaul datasets.](https://icechunk.io/icechunk-python/virtual/#creating-a-virtual-dataset-with-virtualizarr) +Yes! VirtualiZarr allows you to ingest data as virtual references and write those references into an [Icechunk](https://icechunk.io/) Store. See the [Icechunk documentation on creating virtual datasets.](https://icechunk.io/icechunk-python/virtual/#creating-a-virtual-dataset-with-virtualizarr) ## I already have Kerchunked data, do I have to redo that work? 
diff --git a/docs/releases.rst b/docs/releases.rst index 42d92743..cd30f128 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -33,6 +33,8 @@ Documentation - FAQ answers on Icechunk compatibility, converting from existing Kerchunk references to Icechunk, and how to add a new reader for a custom file format. (:pull:`266`) By `Tom Nicholas `_. +- Clarify which readers actually currently work in FAQ, and temporarily remove tiff from the auto-detection. + (:issue:`291`, :pull:`296`) By `Tom Nicholas `_. - Minor improvements to the Contributing Guide. (:pull:`298`) By `Tom Nicholas `_. diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py index fab010c7..3b7195cb 100644 --- a/virtualizarr/backend.py +++ b/virtualizarr/backend.py @@ -16,7 +16,6 @@ HDF5VirtualBackend, KerchunkVirtualBackend, NetCDF3VirtualBackend, - TIFFVirtualBackend, ZarrV3VirtualBackend, ) from virtualizarr.utils import _FsspecFSFromFilepath, check_for_collisions @@ -30,7 +29,7 @@ "netcdf3": NetCDF3VirtualBackend, "hdf5": HDF5VirtualBackend, "netcdf4": HDF5VirtualBackend, # note this is the same as for hdf5 - "tiff": TIFFVirtualBackend, + # "tiff": TIFFVirtualBackend, "fits": FITSVirtualBackend, } diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py index e9b60814..b1ddeee4 100644 --- a/virtualizarr/tests/test_backend.py +++ b/virtualizarr/tests/test_backend.py @@ -13,7 +13,6 @@ from virtualizarr.manifests import ManifestArray from virtualizarr.tests import ( has_astropy, - has_tifffile, network, requires_kerchunk, requires_s3fs, @@ -233,9 +232,7 @@ class TestReadFromURL: pytest.param( "tiff", "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/lcmap_tiny_cog_2020.tif", - marks=pytest.mark.skipif( - not has_tifffile, reason="package tifffile is not available" - ), + marks=pytest.mark.xfail(reason="not yet implemented"), ), pytest.param( "fits", From 09e47529f57a5b5114ebf1a5448789d98d95bc97 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 15 Nov 2024 17:05:23 -0700 Subject: [PATCH 14/15] Fix sphinx warnings (#300) --- .readthedocs.yml | 1 + docs/conf.py | 11 ++++++++++- virtualizarr/manifests/manifest.py | 24 ++++++++++++------------ 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/.readthedocs.yml b/.readthedocs.yml index 08a0fa94..db97ed7c 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -13,6 +13,7 @@ build: # Build documentation in the doc/ directory with Sphinx sphinx: configuration: docs/conf.py + fail_on_warning: true # Optionally declare the Python requirements required to build your docs conda: diff --git a/docs/conf.py b/docs/conf.py index d5312069..b3c482ec 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -18,12 +18,18 @@ "myst_nb", "sphinx.ext.autodoc", "sphinx.ext.autosummary", + "sphinx.ext.extlinks", "sphinx_copybutton", "sphinx_togglebutton", "sphinx_design", "sphinx.ext.napoleon", ] +extlinks = { + "issue": ("https://github.com/zarr-developers/virtualizarr/issues/%s", "GH%s"), + "pull": ("https://github.com/zarr-developers/virtualizarr/pull/%s", "PR%s"), + "discussion": ("https://github.com/zarr-developers/virtualizarr/discussions/%s", "D%s"), +} # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] @@ -49,6 +55,9 @@ # If true, `todo` and `todoList` produce output, else they produce nothing. 
todo_include_todos = False +# -- Myst Options ------------------------------------------------- + +myst_heading_anchors = 3 # -- Options for HTML output ------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output @@ -78,7 +87,7 @@ 'custom.css', ] -html_logo = "_static/_future_logo.png" +# html_logo = "_static/_future_logo.png" html_static_path = ["_static"] diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 1933844a..38743f9b 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -70,12 +70,12 @@ class ChunkManifest: The manifest can be converted to or from a dictionary which looks like this - { - "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, - "0.0.1": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100}, - "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, - "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, - } + | { + | "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, + | "0.0.1": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100}, + | "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, + | "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, + | } using the .__init__() and .dict() methods, so users of this class can think of the manifest as if it were a dict mapping zarr chunk keys to byte ranges. @@ -98,12 +98,12 @@ def __init__(self, entries: dict, shape: tuple[int, ...] | None = None) -> None: entries: dict Chunk keys and byte range information, as a dictionary of the form - { - "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, - "0.0.1": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100}, - "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, - "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, - } + | { + | "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, + | "0.0.1": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100}, + | "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, + | "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, + | } """ if shape is None and not entries: raise ValueError("need a chunk grid shape if no chunks given") From c7970e222edc7ae1f29f77f0b1097538f631bed7 Mon Sep 17 00:00:00 2001 From: Doug Latornell Date: Mon, 18 Nov 2024 08:31:35 -0800 Subject: [PATCH 15/15] Update pkg install in docs contribution guide (#304) * Update pkg install in docs contribution guide Changed the command to install the project for docs development to use `python -m pip`. This ensures that the Python interpreter from the activated environment is used when installing the package in editable mode. * Add PR#304 to docs section of v1.1.1 Release Notes --- docs/contributing.md | 2 +- docs/releases.rst | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/contributing.md b/docs/contributing.md index 4d6b0fcf..92cf9c9b 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -26,7 +26,7 @@ Whilst the CI will build the updated documentation for each PR, it can also be u ```bash mamba env create -f ci/doc.yml mamba activate virtualizarr-docs -pip install -e . # From project's root - needed to generate API docs +python -m pip install -e . 
# From project's root - needed to generate API docs cd docs # From project's root rm -rf generated make clean diff --git a/docs/releases.rst b/docs/releases.rst index cd30f128..31fa06c1 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -37,6 +37,8 @@ Documentation (:issue:`291`, :pull:`296`) By `Tom Nicholas `_. - Minor improvements to the Contributing Guide. (:pull:`298`) By `Tom Nicholas `_. +- More minor improvements to the Contributing Guide. + (:pull:`304`) By `Doug Latornell `_. Internal Changes ~~~~~~~~~~~~~~~~