diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index c6d23541a67..aeac92250b6 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -92,7 +92,7 @@ jobs: shell: bash -l {0} env: CONDA_ENV_FILE: ci/requirements/environment.yml - PYTHON_VERSION: "3.11" + PYTHON_VERSION: "3.12" steps: - uses: actions/checkout@v4 diff --git a/ci/install-upstream-wheels.sh b/ci/install-upstream-wheels.sh index af8a21c1dbb..a2c41242963 100755 --- a/ci/install-upstream-wheels.sh +++ b/ci/install-upstream-wheels.sh @@ -45,15 +45,15 @@ python -m pip install \ --pre \ --upgrade \ pyarrow -# manually install `pint` to pull in new dependencies -python -m pip install --upgrade pint +# manually install `pint`, `donfig`, and `crc32c` to pull in new dependencies +python -m pip install --upgrade pint donfig crc32c python -m pip install \ --no-deps \ --upgrade \ git+https://github.com/dask/dask \ git+https://github.com/dask/dask-expr \ git+https://github.com/dask/distributed \ - git+https://github.com/zarr-developers/zarr.git@main \ + git+https://github.com/zarr-developers/zarr \ git+https://github.com/Unidata/cftime \ git+https://github.com/pypa/packaging \ git+https://github.com/hgrecco/pint \ diff --git a/doc/api.rst b/doc/api.rst index 308f040244f..63427447d53 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -656,6 +656,7 @@ This interface echoes that of ``xarray.Dataset``. DataTree.has_attrs DataTree.is_empty DataTree.is_hollow + DataTree.chunksizes Dictionary Interface -------------------- @@ -968,6 +969,10 @@ DataTree methods DataTree.to_dict DataTree.to_netcdf DataTree.to_zarr + DataTree.chunk + DataTree.load + DataTree.compute + DataTree.persist .. .. diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst index 5687e1399cc..f4b3e5ab9f6 100644 --- a/doc/user-guide/io.rst +++ b/doc/user-guide/io.rst @@ -823,8 +823,9 @@ For example: .. ipython:: python import zarr + from numcodecs.blosc import Blosc - compressor = zarr.Blosc(cname="zstd", clevel=3, shuffle=2) + compressor = Blosc(cname="zstd", clevel=3, shuffle=2) ds.to_zarr("foo.zarr", encoding={"foo": {"compressor": compressor}}) .. note:: diff --git a/doc/whats-new.rst b/doc/whats-new.rst index eede5b88e0a..8b9319f5197 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -14,34 +14,18 @@ What's New np.random.seed(123456) -.. _whats-new.2024.09.1: +.. _whats-new.2024.10.1: -v2024.09.1 (unreleased) ------------------------ +v.2024.10.1 (unreleased) +------------------------ New Features ~~~~~~~~~~~~ -- ``DataTree`` related functionality is now exposed in the main ``xarray`` public - API. This includes: ``xarray.DataTree``, ``xarray.open_datatree``, ``xarray.open_groups``, - ``xarray.map_over_datasets``, ``xarray.group_subtrees``, - ``xarray.register_datatree_accessor`` and ``xarray.testing.assert_isomorphic``. - By `Owen Littlejohns `_, - `Eni Awowale `_, - `Matt Savoie `_, - `Stephan Hoyer `_ and - `Tom Nicholas `_. -- A migration guide for users of the prototype `xarray-contrib/datatree repository `_ has been added, and can be found in the `DATATREE_MIGRATION_GUIDE.md` file in the repository root. - By `Tom Nicholas `_. -- Added zarr backends for :py:func:`open_groups` (:issue:`9430`, :pull:`9469`). - By `Eni Awowale `_. +- Added :py:meth:`DataTree.persist` method (:issue:`9675`, :pull:`9682`). + By `Sam Levang `_. 
- Support lazy grouping by dask arrays, and allow specifying ordered groups with ``UniqueGrouper(labels=["a", "b", "c"])`` (:issue:`2852`, :issue:`757`). By `Deepak Cherian `_. -- Added support for vectorized interpolation using additional interpolators - from the ``scipy.interpolate`` module (:issue:`9049`, :pull:`9526`). - By `Holly Mandel `_. -- Implement handling of complex numbers (netcdf4/h5netcdf) and enums (h5netcdf) (:issue:`9246`, :issue:`3297`, :pull:`9509`). - By `Kai Mühlbauer `_. Breaking changes ~~~~~~~~~~~~~~~~ @@ -55,6 +39,62 @@ Deprecations provide expected group labels using the ``labels`` kwarg to a grouper object such as :py:class:`grouper.UniqueGrouper` or :py:class:`grouper.BinGrouper`. +Bug fixes +~~~~~~~~~ + +- Fix inadvertent deep-copying of child data in DataTree. + By `Stephan Hoyer `_. + +Documentation +~~~~~~~~~~~~~ + + +Internal Changes +~~~~~~~~~~~~~~~~ +- ``persist`` methods now route through the :py:class:`xr.core.parallelcompat.ChunkManagerEntrypoint` (:pull:`9682`). + By `Sam Levang `_. + +.. _whats-new.2024.10.0: + +v2024.10.0 (Oct 24th, 2024) +--------------------------- + +This release brings official support for `xarray.DataTree`, and compatibility with zarr-python v3! + +Aside from these two huge features, it also improves support for vectorised interpolation and fixes various bugs. + +Thanks to the 31 contributors to this release: +Alfonso Ladino, DWesl, Deepak Cherian, Eni, Etienne Schalk, Holly Mandel, Ilan Gold, Illviljan, Joe Hamman, Justus Magin, Kai Mühlbauer, Karl Krauth, Mark Harfouche, Martey Dodoo, Matt Savoie, Maximilian Roos, Patrick Hoefler, Peter Hill, Renat Sibgatulin, Ryan Abernathey, Spencer Clark, Stephan Hoyer, Tom Augspurger, Tom Nicholas, Vecko, Virgile Andreani, Yvonne Fröhlich, carschandler, joseph nowak, mgunyho and owenlittlejohns + +New Features +~~~~~~~~~~~~ +- ``DataTree`` related functionality is now exposed in the main ``xarray`` public + API. This includes: ``xarray.DataTree``, ``xarray.open_datatree``, ``xarray.open_groups``, + ``xarray.map_over_datasets``, ``xarray.group_subtrees``, + ``xarray.register_datatree_accessor`` and ``xarray.testing.assert_isomorphic``. + By `Owen Littlejohns `_, + `Eni Awowale `_, + `Matt Savoie `_, + `Stephan Hoyer `_, + `Tom Nicholas `_, + `Justus Magin `_, and + `Alfonso Ladino `_. +- A migration guide for users of the prototype `xarray-contrib/datatree repository `_ has been added, and can be found in the `DATATREE_MIGRATION_GUIDE.md` file in the repository root. + By `Tom Nicholas `_. +- Support for Zarr-Python 3 (:issue:`95515`, :pull:`9552`). + By `Tom Augspurger `_, + `Ryan Abernathey `_ and + `Joe Hamman `_. +- Added zarr backends for :py:func:`open_groups` (:issue:`9430`, :pull:`9469`). + By `Eni Awowale `_. +- Added support for vectorized interpolation using additional interpolators + from the ``scipy.interpolate`` module (:issue:`9049`, :pull:`9526`). + By `Holly Mandel `_. +- Implement handling of complex numbers (netcdf4/h5netcdf) and enums (h5netcdf) (:issue:`9246`, :issue:`3297`, :pull:`9509`). + By `Kai Mühlbauer `_. +- Fix passing missing arguments to when opening hdf5 and netCDF4 datatrees + (:issue:`9427`, :pull: `9428`). + By `Alfonso Ladino `_. Bug fixes ~~~~~~~~~ @@ -78,6 +118,8 @@ Bug fixes `_. - Fix binning by multiple variables where some bins have no observations. (:issue:`9630`). By `Deepak Cherian `_. +- Fix issue where polyfit wouldn't handle non-dimension coordinates. (:issue:`4375`, :pull:`9369`) + By `Karl Krauth `_. 
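The headline entry above adds ``DataTree.persist``; together with the ``chunk``/``load``/``compute``/``chunksizes`` entries added to ``doc/api.rst`` earlier in this diff, it can be exercised roughly as follows. This is a minimal sketch, not part of the patch: it assumes dask is installed, and the names and data are purely illustrative.

```python
import numpy as np
import xarray as xr

# Build a small two-node tree and back it with dask chunks.
tree = xr.DataTree.from_dict(
    {
        "/": xr.Dataset({"foo": ("x", np.arange(10))}),
        "/child": xr.Dataset({"bar": ("x", np.arange(10) ** 2)}),
    }
).chunk({"x": 5})

print(tree.chunksizes)      # inspect the chunk layout across the tree

persisted = tree.persist()  # evaluate now, keep chunked (dask-backed) arrays
loaded = tree.compute()     # return a fully in-memory copy of every node
```

``persist`` here is the method that, per the Internal Changes note above, is routed through the ``ChunkManagerEntrypoint``.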
Documentation ~~~~~~~~~~~~~ @@ -88,12 +130,9 @@ Documentation By `Owen Littlejohns `_, `Matt Savoie `_, and `Tom Nicholas `_. - - Internal Changes ~~~~~~~~~~~~~~~~ - .. _whats-new.2024.09.0: v2024.09.0 (Sept 11, 2024) @@ -161,17 +200,12 @@ Bug fixes date "0001-01-01". (:issue:`9108`, :pull:`9116`) By `Spencer Clark `_ and `Deepak Cherian `_. -- Fix issue where polyfit wouldn't handle non-dimension coordinates. (:issue:`4375`, :pull:`9369`) - By `Karl Krauth `_. - Fix issue with passing parameters to ZarrStore.open_store when opening datatree in zarr format (:issue:`9376`, :pull:`9377`). By `Alfonso Ladino `_ - Fix deprecation warning that was raised when calling ``np.array`` on an ``xr.DataArray`` in NumPy 2.0 (:issue:`9312`, :pull:`9393`) By `Andrew Scherer `_. -- Fix passing missing arguments to when opening hdf5 and netCDF4 datatrees - (:issue:`9427`, :pull: `9428`). - By `Alfonso Ladino `_. - Fix support for using ``pandas.DateOffset``, ``pandas.Timedelta``, and ``datetime.timedelta`` objects as ``resample`` frequencies (:issue:`9408`, :pull:`9413`). diff --git a/pyproject.toml b/pyproject.toml index 8dad98444ac..7c0126399bd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,7 @@ accel = ["scipy", "bottleneck", "numbagg", "numba>=0.54", "flox", "opt_einsum"] complete = ["xarray[accel,etc,io,parallel,viz]"] dev = [ "hypothesis", + "jinja2", "mypy", "pre-commit", "pytest", @@ -49,7 +50,7 @@ dev = [ "sphinx_autosummary_accessors", "xarray[complete]", ] -io = ["netCDF4", "h5netcdf", "scipy", 'pydap; python_version<"3.10"', "zarr<3", "fsspec", "cftime", "pooch"] +io = ["netCDF4", "h5netcdf", "scipy", 'pydap; python_version<"3.10"', "zarr", "fsspec", "cftime", "pooch"] etc = ["sparse"] parallel = ["dask[complete]"] viz = ["cartopy", "matplotlib", "nc-time-axis", "seaborn"] @@ -124,6 +125,7 @@ module = [ "nc_time_axis.*", "netCDF4.*", "netcdftime.*", + "numcodecs.*", "opt_einsum.*", "pint.*", "pooch.*", diff --git a/xarray/backends/api.py b/xarray/backends/api.py index b367da586c7..a52e73701ab 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -33,6 +33,7 @@ _normalize_path, ) from xarray.backends.locks import _get_scheduler +from xarray.backends.zarr import _zarr_v3 from xarray.core import indexing from xarray.core.combine import ( _infer_concat_order_from_positions, @@ -41,7 +42,9 @@ ) from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset, _get_chunk, _maybe_chunk +from xarray.core.datatree import DataTree from xarray.core.indexes import Index +from xarray.core.treenode import group_subtrees from xarray.core.types import NetcdfWriteModes, ZarrWriteModes from xarray.core.utils import is_remote_uri from xarray.namedarray.daskmanager import DaskManager @@ -74,7 +77,6 @@ T_NetcdfTypes = Literal[ "NETCDF4", "NETCDF4_CLASSIC", "NETCDF3_64BIT", "NETCDF3_CLASSIC" ] - from xarray.core.datatree import DataTree DATAARRAY_NAME = "__xarray_dataarray_name__" DATAARRAY_VARIABLE = "__xarray_dataarray_variable__" @@ -254,16 +256,22 @@ def _get_mtime(filename_or_obj): return mtime -def _protect_dataset_variables_inplace(dataset, cache): +def _protect_dataset_variables_inplace(dataset: Dataset, cache: bool) -> None: for name, variable in dataset.variables.items(): if name not in dataset._indexes: # no need to protect IndexVariable objects + data: indexing.ExplicitlyIndexedNDArrayMixin data = indexing.CopyOnWriteArray(variable._data) if cache: data = indexing.MemoryCachedArray(data) variable.data = data +def _protect_datatree_variables_inplace(tree: 
DataTree, cache: bool) -> None: + for node in tree.subtree: + _protect_dataset_variables_inplace(node, cache) + + def _finalize_store(write, store): """Finalize this store by explicitly syncing and closing""" del write # ensure writing is done first @@ -414,6 +422,58 @@ def _dataset_from_backend_dataset( return ds +def _datatree_from_backend_datatree( + backend_tree, + filename_or_obj, + engine, + chunks, + cache, + overwrite_encoded_chunks, + inline_array, + chunked_array_type, + from_array_kwargs, + **extra_tokens, +): + if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}: + raise ValueError( + f"chunks must be an int, dict, 'auto', or None. Instead found {chunks}." + ) + + _protect_datatree_variables_inplace(backend_tree, cache) + if chunks is None: + tree = backend_tree + else: + tree = DataTree.from_dict( + { + path: _chunk_ds( + node.dataset, + filename_or_obj, + engine, + chunks, + overwrite_encoded_chunks, + inline_array, + chunked_array_type, + from_array_kwargs, + **extra_tokens, + ) + for path, [node] in group_subtrees(backend_tree) + }, + name=backend_tree.name, + ) + + for path, [node] in group_subtrees(backend_tree): + tree[path].set_close(node._close) + + # Ensure source filename always stored in dataset object + if "source" not in tree.encoding: + path = getattr(filename_or_obj, "path", filename_or_obj) + + if isinstance(path, str | os.PathLike): + tree.encoding["source"] = _normalize_path(path) + + return tree + + def open_dataset( filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore, *, @@ -838,7 +898,22 @@ def open_dataarray( def open_datatree( filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore, + *, engine: T_Engine = None, + chunks: T_Chunks = None, + cache: bool | None = None, + decode_cf: bool | None = None, + mask_and_scale: bool | Mapping[str, bool] | None = None, + decode_times: bool | Mapping[str, bool] | None = None, + decode_timedelta: bool | Mapping[str, bool] | None = None, + use_cftime: bool | Mapping[str, bool] | None = None, + concat_characters: bool | Mapping[str, bool] | None = None, + decode_coords: Literal["coordinates", "all"] | bool | None = None, + drop_variables: str | Iterable[str] | None = None, + inline_array: bool = False, + chunked_array_type: str | None = None, + from_array_kwargs: dict[str, Any] | None = None, + backend_kwargs: dict[str, Any] | None = None, **kwargs, ) -> DataTree: """ @@ -848,29 +923,217 @@ def open_datatree( ---------- filename_or_obj : str, Path, file-like, or DataStore Strings and Path objects are interpreted as a path to a netCDF file or Zarr store. - engine : str, optional - Xarray backend engine to use. Valid options include `{"netcdf4", "h5netcdf", "zarr"}`. - **kwargs : dict - Additional keyword arguments passed to :py:func:`~xarray.open_dataset` for each group. + engine : {"netcdf4", "h5netcdf", "zarr", None}, \ + installed backend or xarray.backends.BackendEntrypoint, optional + Engine to use when reading files. If not provided, the default engine + is chosen based on available dependencies, with a preference for + "netcdf4". A custom backend class (a subclass of ``BackendEntrypoint``) + can also be used. + chunks : int, dict, 'auto' or None, default: None + If provided, used to load the data into dask arrays. + + - ``chunks="auto"`` will use dask ``auto`` chunking taking into account the + engine preferred chunks. + - ``chunks=None`` skips using dask, which is generally faster for + small arrays. 
+ - ``chunks=-1`` loads the data with dask using a single chunk for all arrays. + - ``chunks={}`` loads the data with dask using the engine's preferred chunk + size, generally identical to the format's chunk size. If not available, a + single chunk for all arrays. + + See dask chunking for more details. + cache : bool, optional + If True, cache data loaded from the underlying datastore in memory as + NumPy arrays when accessed to avoid reading from the underlying data- + store multiple times. Defaults to True unless you specify the `chunks` + argument to use dask, in which case it defaults to False. Does not + change the behavior of coordinates corresponding to dimensions, which + always load their data from disk into a ``pandas.Index``. + decode_cf : bool, optional + Whether to decode these variables, assuming they were saved according + to CF conventions. + mask_and_scale : bool or dict-like, optional + If True, replace array values equal to `_FillValue` with NA and scale + values according to the formula `original_values * scale_factor + + add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are + taken from variable attributes (if they exist). If the `_FillValue` or + `missing_value` attribute contains multiple values a warning will be + issued and all array values matching one of the multiple values will + be replaced by NA. Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + decode_times : bool or dict-like, optional + If True, decode times encoded in the standard NetCDF datetime format + into datetime objects. Otherwise, leave them encoded as numbers. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + decode_timedelta : bool or dict-like, optional + If True, decode variables and coordinates with time units in + {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} + into timedelta objects. If False, leave them encoded as numbers. + If None (default), assume the same value of decode_time. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + use_cftime: bool or dict-like, optional + Only relevant if encoded dates come from a standard calendar + (e.g. "gregorian", "proleptic_gregorian", "standard", or not + specified). If None (default), attempt to decode times to + ``np.datetime64[ns]`` objects; if this is not possible, decode times to + ``cftime.datetime`` objects. If True, always decode times to + ``cftime.datetime`` objects, regardless of whether or not they can be + represented using ``np.datetime64[ns]`` objects. If False, always + decode times to ``np.datetime64[ns]`` objects; if this is not possible + raise an error. Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + concat_characters : bool or dict-like, optional + If True, concatenate along the last dimension of character arrays to + form string arrays. Dimensions will only be concatenated over (and + removed) if they have no corresponding variable and if they are only + used as the last dimension of character arrays. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. 
+ This keyword may not be supported by all the backends. + decode_coords : bool or {"coordinates", "all"}, optional + Controls which variables are set as coordinate variables: + + - "coordinates" or True: Set variables referred to in the + ``'coordinates'`` attribute of the datasets or individual variables + as coordinate variables. + - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and + other attributes as coordinate variables. + + Only existing variables can be set as coordinates. Missing variables + will be silently ignored. + drop_variables: str or iterable of str, optional + A variable or list of variables to exclude from being parsed from the + dataset. This may be useful to drop variables with problems or + inconsistent values. + inline_array: bool, default: False + How to include the array in the dask task graph. + By default(``inline_array=False``) the array is included in a task by + itself, and each chunk refers to that task by its key. With + ``inline_array=True``, Dask will instead inline the array directly + in the values of the task graph. See :py:func:`dask.array.from_array`. + chunked_array_type: str, optional + Which chunked array type to coerce this datasets' arrays to. + Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. + Experimental API that should not be relied upon. + from_array_kwargs: dict + Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create + chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. + For example if :py:func:`dask.array.Array` objects are used for chunking, additional kwargs will be passed + to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. + backend_kwargs: dict + Additional keyword arguments passed on to the engine open function, + equivalent to `**kwargs`. + **kwargs: dict + Additional keyword arguments passed on to the engine open function. + For example: + + - 'group': path to the group in the given file to open as the root group as + a str. + - 'lock': resource lock to use when reading data from disk. Only + relevant when using dask or another form of parallelism. By default, + appropriate locks are chosen to safely read and write files with the + currently active dask scheduler. Supported by "netcdf4", "h5netcdf", + "scipy". + + See engine open function for kwargs accepted by each specific engine. + Returns ------- - xarray.DataTree + tree : DataTree + The newly created datatree. + + Notes + ----- + ``open_datatree`` opens the file with read-only access. When you modify + values of a DataTree, even one linked to files on disk, only the in-memory + copy you are manipulating in xarray is modified: the original file on disk + is never touched. 
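A usage sketch matching the ``chunks`` semantics documented above; the store path and group layout are hypothetical, and dask is only needed for the chunked variant.

```python
import xarray as xr

# Lazily open every group of a (hypothetical) Zarr store as one DataTree,
# letting the engine's preferred chunk sizes drive the dask chunking.
tree = xr.open_datatree("example.zarr", engine="zarr", chunks={})

# chunks=None (the default) skips dask entirely, which is usually faster
# for small arrays.
eager = xr.open_datatree("example.zarr", engine="zarr", chunks=None)
```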
+ + See Also + -------- + xarray.open_groups + xarray.open_dataset """ + if cache is None: + cache = chunks is None + + if backend_kwargs is not None: + kwargs.update(backend_kwargs) + if engine is None: engine = plugins.guess_engine(filename_or_obj) + if from_array_kwargs is None: + from_array_kwargs = {} + backend = plugins.get_backend(engine) - return backend.open_datatree(filename_or_obj, **kwargs) + decoders = _resolve_decoders_kwargs( + decode_cf, + open_backend_dataset_parameters=(), + mask_and_scale=mask_and_scale, + decode_times=decode_times, + decode_timedelta=decode_timedelta, + concat_characters=concat_characters, + use_cftime=use_cftime, + decode_coords=decode_coords, + ) + overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None) + + backend_tree = backend.open_datatree( + filename_or_obj, + drop_variables=drop_variables, + **decoders, + **kwargs, + ) + + tree = _datatree_from_backend_datatree( + backend_tree, + filename_or_obj, + engine, + chunks, + cache, + overwrite_encoded_chunks, + inline_array, + chunked_array_type, + from_array_kwargs, + drop_variables=drop_variables, + **decoders, + **kwargs, + ) + + return tree def open_groups( filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore, + *, engine: T_Engine = None, + chunks: T_Chunks = None, + cache: bool | None = None, + decode_cf: bool | None = None, + mask_and_scale: bool | Mapping[str, bool] | None = None, + decode_times: bool | Mapping[str, bool] | None = None, + decode_timedelta: bool | Mapping[str, bool] | None = None, + use_cftime: bool | Mapping[str, bool] | None = None, + concat_characters: bool | Mapping[str, bool] | None = None, + decode_coords: Literal["coordinates", "all"] | bool | None = None, + drop_variables: str | Iterable[str] | None = None, + inline_array: bool = False, + chunked_array_type: str | None = None, + from_array_kwargs: dict[str, Any] | None = None, + backend_kwargs: dict[str, Any] | None = None, **kwargs, ) -> dict[str, Dataset]: """ Open and decode a file or file-like object, creating a dictionary containing one xarray Dataset for each group in the file. + Useful for an HDF file ("netcdf4" or "h5netcdf") containing many groups that are not alignable with their parents and cannot be opened directly with ``open_datatree``. It is encouraged to use this function to inspect your data, then make the necessary changes to make the structure coercible to a `DataTree` object before calling `DataTree.from_dict()` and proceeding with your analysis. @@ -879,26 +1142,200 @@ def open_groups( ---------- filename_or_obj : str, Path, file-like, or DataStore Strings and Path objects are interpreted as a path to a netCDF file. - engine : str, optional - Xarray backend engine to use. Valid options include `{"netcdf4", "h5netcdf"}`. - **kwargs : dict - Additional keyword arguments passed to :py:func:`~xarray.open_dataset` for each group. + Parameters + ---------- + filename_or_obj : str, Path, file-like, or DataStore + Strings and Path objects are interpreted as a path to a netCDF file or Zarr store. + engine : {"netcdf4", "h5netcdf", "zarr", None}, \ + installed backend or xarray.backends.BackendEntrypoint, optional + Engine to use when reading files. If not provided, the default engine + is chosen based on available dependencies, with a preference for + "netcdf4". A custom backend class (a subclass of ``BackendEntrypoint``) + can also be used. + chunks : int, dict, 'auto' or None, default: None + If provided, used to load the data into dask arrays. 
+ + - ``chunks="auto"`` will use dask ``auto`` chunking taking into account the + engine preferred chunks. + - ``chunks=None`` skips using dask, which is generally faster for + small arrays. + - ``chunks=-1`` loads the data with dask using a single chunk for all arrays. + - ``chunks={}`` loads the data with dask using the engine's preferred chunk + size, generally identical to the format's chunk size. If not available, a + single chunk for all arrays. + + See dask chunking for more details. + cache : bool, optional + If True, cache data loaded from the underlying datastore in memory as + NumPy arrays when accessed to avoid reading from the underlying data- + store multiple times. Defaults to True unless you specify the `chunks` + argument to use dask, in which case it defaults to False. Does not + change the behavior of coordinates corresponding to dimensions, which + always load their data from disk into a ``pandas.Index``. + decode_cf : bool, optional + Whether to decode these variables, assuming they were saved according + to CF conventions. + mask_and_scale : bool or dict-like, optional + If True, replace array values equal to `_FillValue` with NA and scale + values according to the formula `original_values * scale_factor + + add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are + taken from variable attributes (if they exist). If the `_FillValue` or + `missing_value` attribute contains multiple values a warning will be + issued and all array values matching one of the multiple values will + be replaced by NA. Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + decode_times : bool or dict-like, optional + If True, decode times encoded in the standard NetCDF datetime format + into datetime objects. Otherwise, leave them encoded as numbers. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + decode_timedelta : bool or dict-like, optional + If True, decode variables and coordinates with time units in + {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} + into timedelta objects. If False, leave them encoded as numbers. + If None (default), assume the same value of decode_time. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + use_cftime: bool or dict-like, optional + Only relevant if encoded dates come from a standard calendar + (e.g. "gregorian", "proleptic_gregorian", "standard", or not + specified). If None (default), attempt to decode times to + ``np.datetime64[ns]`` objects; if this is not possible, decode times to + ``cftime.datetime`` objects. If True, always decode times to + ``cftime.datetime`` objects, regardless of whether or not they can be + represented using ``np.datetime64[ns]`` objects. If False, always + decode times to ``np.datetime64[ns]`` objects; if this is not possible + raise an error. Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + concat_characters : bool or dict-like, optional + If True, concatenate along the last dimension of character arrays to + form string arrays. 
Dimensions will only be concatenated over (and + removed) if they have no corresponding variable and if they are only + used as the last dimension of character arrays. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + decode_coords : bool or {"coordinates", "all"}, optional + Controls which variables are set as coordinate variables: + + - "coordinates" or True: Set variables referred to in the + ``'coordinates'`` attribute of the datasets or individual variables + as coordinate variables. + - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and + other attributes as coordinate variables. + + Only existing variables can be set as coordinates. Missing variables + will be silently ignored. + drop_variables: str or iterable of str, optional + A variable or list of variables to exclude from being parsed from the + dataset. This may be useful to drop variables with problems or + inconsistent values. + inline_array: bool, default: False + How to include the array in the dask task graph. + By default(``inline_array=False``) the array is included in a task by + itself, and each chunk refers to that task by its key. With + ``inline_array=True``, Dask will instead inline the array directly + in the values of the task graph. See :py:func:`dask.array.from_array`. + chunked_array_type: str, optional + Which chunked array type to coerce this datasets' arrays to. + Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. + Experimental API that should not be relied upon. + from_array_kwargs: dict + Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create + chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. + For example if :py:func:`dask.array.Array` objects are used for chunking, additional kwargs will be passed + to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. + backend_kwargs: dict + Additional keyword arguments passed on to the engine open function, + equivalent to `**kwargs`. + **kwargs: dict + Additional keyword arguments passed on to the engine open function. + For example: + + - 'group': path to the group in the given file to open as the root group as + a str. + - 'lock': resource lock to use when reading data from disk. Only + relevant when using dask or another form of parallelism. By default, + appropriate locks are chosen to safely read and write files with the + currently active dask scheduler. Supported by "netcdf4", "h5netcdf", + "scipy". + + See engine open function for kwargs accepted by each specific engine. Returns ------- - dict[str, xarray.Dataset] + groups : dict of str to xarray.Dataset + The groups as Dataset objects + + Notes + ----- + ``open_groups`` opens the file with read-only access. When you modify + values of a Dataset, even one linked to files on disk, only the in-memory + copy you are manipulating in xarray is modified: the original file on disk + is never touched. 
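The workflow this docstring recommends — inspect the groups first, then assemble a tree with ``DataTree.from_dict`` once the hierarchy is alignable — looks roughly like this; the file name and engine are hypothetical placeholders.

```python
import xarray as xr

# Look at each group as an independent Dataset first...
groups = xr.open_groups("example.h5", engine="h5netcdf")
for path, ds in groups.items():
    print(path, list(ds.data_vars))

# ...then, after any needed restructuring, build the tree explicitly.
tree = xr.DataTree.from_dict(groups)
```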
See Also -------- - open_datatree() - DataTree.from_dict() + xarray.open_datatree + xarray.open_dataset + xarray.DataTree.from_dict """ + if cache is None: + cache = chunks is None + + if backend_kwargs is not None: + kwargs.update(backend_kwargs) + if engine is None: engine = plugins.guess_engine(filename_or_obj) + if from_array_kwargs is None: + from_array_kwargs = {} + backend = plugins.get_backend(engine) - return backend.open_groups_as_dict(filename_or_obj, **kwargs) + decoders = _resolve_decoders_kwargs( + decode_cf, + open_backend_dataset_parameters=(), + mask_and_scale=mask_and_scale, + decode_times=decode_times, + decode_timedelta=decode_timedelta, + concat_characters=concat_characters, + use_cftime=use_cftime, + decode_coords=decode_coords, + ) + overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None) + + backend_groups = backend.open_groups_as_dict( + filename_or_obj, + drop_variables=drop_variables, + **decoders, + **kwargs, + ) + + groups = { + name: _dataset_from_backend_dataset( + backend_ds, + filename_or_obj, + engine, + chunks, + cache, + overwrite_encoded_chunks, + inline_array, + chunked_array_type, + from_array_kwargs, + drop_variables=drop_variables, + **decoders, + **kwargs, + ) + for name, backend_ds in backend_groups.items() + } + + return groups def open_mfdataset( @@ -1685,6 +2122,7 @@ def to_zarr( safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, + zarr_format: int | None = None, write_empty_chunks: bool | None = None, chunkmanager_store_kwargs: dict[str, Any] | None = None, ) -> backends.ZarrStore | Delayed: @@ -1703,21 +2141,28 @@ def to_zarr( store = _normalize_path(store) chunk_store = _normalize_path(chunk_store) + kwargs = {} if storage_options is None: mapper = store chunk_mapper = chunk_store else: - from fsspec import get_mapper - if not isinstance(store, str): raise ValueError( f"store must be a string to use storage_options. Got {type(store)}" ) - mapper = get_mapper(store, **storage_options) - if chunk_store is not None: - chunk_mapper = get_mapper(chunk_store, **storage_options) - else: + + if _zarr_v3(): + kwargs["storage_options"] = storage_options + mapper = store chunk_mapper = chunk_store + else: + from fsspec import get_mapper + + mapper = get_mapper(store, **storage_options) + if chunk_store is not None: + chunk_mapper = get_mapper(chunk_store, **storage_options) + else: + chunk_mapper = chunk_store if encoding is None: encoding = {} @@ -1747,13 +2192,6 @@ def to_zarr( # validate Dataset keys, DataArray names _validate_dataset_names(dataset) - if zarr_version is None: - # default to 2 if store doesn't specify its version (e.g. 
a path) - zarr_version = int(getattr(store, "_store_version", 2)) - - if consolidated is None and zarr_version > 2: - consolidated = False - if mode == "r+": already_consolidated = consolidated consolidate_on_close = False @@ -1773,7 +2211,9 @@ def to_zarr( safe_chunks=safe_chunks, stacklevel=4, # for Dataset.to_zarr() zarr_version=zarr_version, + zarr_format=zarr_format, write_empty=write_empty_chunks, + **kwargs, ) if region is not None: diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 12382c3f39b..8d1d089a913 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -135,7 +135,6 @@ def _iter_nc_groups(root, parent="/"): yield str(parent) for path, group in root.groups.items(): gpath = parent / path - yield str(gpath) yield from _iter_nc_groups(group, parent=gpath) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 888489c0c04..f51f5873817 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -494,6 +494,7 @@ def open_datatree( driver_kwds=driver_kwds, **kwargs, ) + return datatree_from_dict_with_io_cleanup(groups_dict) def open_groups_as_dict( @@ -556,7 +557,10 @@ def open_groups_as_dict( decode_timedelta=decode_timedelta, ) - group_name = str(NodePath(path_group)) + if group: + group_name = str(NodePath(path_group).relative_to(parent)) + else: + group_name = str(NodePath(path_group)) groups_dict[group_name] = group_ds return groups_dict diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index b4609e626b5..7a08a1da8d4 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -729,6 +729,7 @@ def open_datatree( autoclose=autoclose, **kwargs, ) + return datatree_from_dict_with_io_cleanup(groups_dict) def open_groups_as_dict( @@ -789,7 +790,10 @@ def open_groups_as_dict( use_cftime=use_cftime, decode_timedelta=decode_timedelta, ) - group_name = str(NodePath(path_group)) + if group: + group_name = str(NodePath(path_group).relative_to(parent)) + else: + group_name = str(NodePath(path_group)) groups_dict[group_name] = group_ds return groups_dict diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 06ec4c9b30d..5090ec7728e 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1,10 +1,12 @@ from __future__ import annotations +import base64 import json import os +import struct import warnings from collections.abc import Iterable -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Literal import numpy as np import pandas as pd @@ -27,10 +29,12 @@ FrozenDict, HiddenKeyDict, close_on_error, + emit_user_level_warning, ) from xarray.core.variable import Variable from xarray.namedarray.parallelcompat import guess_chunkmanager from xarray.namedarray.pycompat import integer_types +from xarray.namedarray.utils import module_available if TYPE_CHECKING: from io import BufferedIOBase @@ -41,8 +45,61 @@ from xarray.core.dataset import Dataset from xarray.core.datatree import DataTree + +def _zarr_v3() -> bool: + # TODO: switch to "3" once Zarr V3 is released + return module_available("zarr", minversion="2.99") + + # need some special secret attributes to tell us the dimensions DIMENSION_KEY = "_ARRAY_DIMENSIONS" +ZarrFormat = Literal[2, 3] + + +class FillValueCoder: + """Handle custom logic to safely encode and decode fill values in Zarr. + Possibly redundant with logic in xarray/coding/variables.py but needs to be + isolated from NetCDF-specific logic. 
+ """ + + @classmethod + def encode(cls, value: int | float | str | bytes, dtype: np.dtype[Any]) -> Any: + if dtype.kind in "S": + # byte string, this implies that 'value' must also be `bytes` dtype. + assert isinstance(value, bytes) + return base64.standard_b64encode(value).decode() + elif dtype.kind in "b": + # boolean + return bool(value) + elif dtype.kind in "iu": + # todo: do we want to check for decimals? + return int(value) + elif dtype.kind in "f": + return base64.standard_b64encode(struct.pack("