diff --git a/doc/io.rst b/doc/io.rst
index 5119bac1a79..bc6e2ecd2d2 100644
--- a/doc/io.rst
+++ b/doc/io.rst
@@ -874,6 +874,38 @@ store is already present at that path, an error will be raised, preventing it
 from being overwritten. To override this behavior and overwrite an existing
 store, add ``mode='w'`` when invoking :py:meth:`~Dataset.to_zarr`.
 
+It is also possible to append to an existing store. For that, set
+``append_dim`` to the name of the dimension along which to append. ``mode``
+can be omitted as it will internally be set to ``'a'``.
+
+.. ipython:: python
+    :suppress:
+
+    ! rm -rf path/to/directory.zarr
+
+.. ipython:: python
+    :okexcept:
+
+    ds1 = xr.Dataset(
+        {"foo": (("x", "y", "t"), np.random.rand(4, 5, 2))},
+        coords={
+            "x": [10, 20, 30, 40],
+            "y": [1, 2, 3, 4, 5],
+            "t": pd.date_range("2001-01-01", periods=2),
+        },
+    )
+    ds1.to_zarr("path/to/directory.zarr")
+    ds2 = xr.Dataset(
+        {"foo": (("x", "y", "t"), np.random.rand(4, 5, 2))},
+        coords={
+            "x": [10, 20, 30, 40],
+            "y": [1, 2, 3, 4, 5],
+            "t": pd.date_range("2001-01-03", periods=2),
+        },
+    )
+    ds2.to_zarr("path/to/directory.zarr", append_dim="t")
+
+
 To store variable length strings, convert them to object arrays first with
 ``dtype=object``.
 
@@ -890,8 +922,28 @@ Cloud Storage Buckets
 
 It is possible to read and write xarray datasets directly from / to cloud
 storage buckets using zarr. This example uses the `gcsfs`_ package to provide
-a ``MutableMapping`` interface to `Google Cloud Storage`_, which we can then
-pass to xarray::
+an interface to `Google Cloud Storage`_.
+
+From v0.16.2: general `fsspec`_ URLs are parsed and the store is set up for you
+automatically when reading, such that you can open a dataset in a single
+call. You should include any arguments to the storage backend under the
+key ``storage_options``, as part of ``backend_kwargs``.
+
+.. code:: python
+
+    ds_gcs = xr.open_dataset(
+        "gcs:///path.zarr",
+        backend_kwargs={"storage_options": {"project": "", "token": None}},
+        engine="zarr",
+    )
+
+This also works with ``open_mfdataset``, allowing you to pass a list of paths or
+a URL to be interpreted as a glob string.
+
+For older versions, and for writing, you must explicitly set up a ``MutableMapping``
+instance and pass it, as follows:
+
+.. code:: python
 
     import gcsfs
     fs = gcsfs.GCSFileSystem(project='', token=None)
@@ -901,6 +953,9 @@ pass to xarray::
     # read it back
     ds_gcs = xr.open_zarr(gcsmap)
 
+(or use the utility function ``fsspec.get_mapper()``).
+
+.. _fsspec: https://filesystem-spec.readthedocs.io/en/latest/
 .. _Zarr: http://zarr.readthedocs.io/
 .. _Amazon S3: https://aws.amazon.com/s3/
 .. _Google Cloud Storage: https://cloud.google.com/storage/
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 308ed33f0bc..0c391c28ddc 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -59,6 +59,11 @@ New Features
   ``CFTimeIndex.repr``. (:issue:`2416`, :pull:`4597`)
   By `Aaron Spring `_.
 
+- :py:func:`open_dataset` and :py:func:`open_mfdataset` now accept ``fsspec`` URLs
+  (including globs for the latter) for ``engine="zarr"``, and so allow reading from
+  many remote and other file systems (:pull:`4461`)
+  By `Martin Durant `_.
+
 Bug fixes
 ~~~~~~~~~
 
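A minimal, self-contained sketch of the user-facing behaviour the documentation above describes (illustrative only, not part of the patch): it uses fsspec's in-memory filesystem so it runs without cloud credentials, the store name ``memory://demo.zarr`` is a placeholder, and a recent zarr with ``FSStore`` support is assumed. Against a real bucket you would additionally pass ``backend_kwargs={"storage_options": {...}}`` as shown in the docs.

.. code:: python

    import fsspec
    import numpy as np
    import xarray as xr

    # write a small dataset to an in-memory fsspec store (stand-in for a cloud bucket)
    ds = xr.Dataset({"foo": ("x", np.arange(4))}, coords={"x": [10, 20, 30, 40]})
    ds.to_zarr(fsspec.get_mapper("memory://demo.zarr"))

    # with this change, the same store can be opened directly from its fsspec URL
    ds_back = xr.open_dataset("memory://demo.zarr", engine="zarr")

    # open_mfdataset additionally accepts glob URLs for engine="zarr"
    ds_all = xr.open_mfdataset("memory://demo*.zarr", engine="zarr")
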
diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 0b9b5046cb9..f8c95a1b40b 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -340,6 +340,7 @@ def open_dataset(
         ends with .gz, in which case the file is gunzipped and opened with
         scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like
         objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF).
+        Also supports arbitrary ``fsspec`` URLs (only for the ``"zarr"`` backend).
     group : str, optional
         Path to the netCDF4 group in the given file to open (only works for
         netCDF4 files).
@@ -400,7 +401,10 @@
     backend_kwargs: dict, optional
         A dictionary of keyword arguments to pass on to the backend. This
         may be useful when backend options would improve performance or
-        allow user control of dataset processing.
+        allow user control of dataset processing. When using an ``fsspec``
+        path for the filename, the key ``storage_options`` can be used
+        here to configure the backend storage instance. Alternatively, a
+        pre-configured filesystem instance can be supplied with the key ``fs``.
     use_cftime: bool, optional
         Only relevant if encoded dates come from a standard calendar
         (e.g. "gregorian", "proleptic_gregorian", "standard", or not
@@ -548,11 +552,14 @@ def maybe_decode_store(store, chunks):
             ds2._file_obj = ds._file_obj
         return ds2
 
-    filename_or_obj = _normalize_path(filename_or_obj)
+    fs = backend_kwargs.get("fs", None)
+    if fs is None:
+        filename_or_obj = _normalize_path(filename_or_obj)
 
     if isinstance(filename_or_obj, AbstractDataStore):
         store = filename_or_obj
     else:
+        backend_kwargs = backend_kwargs.copy()
         if engine is None:
             engine = _autodetect_engine(filename_or_obj)
 
@@ -564,9 +571,15 @@ def maybe_decode_store(store, chunks):
 
         if engine == "zarr":
             backend_kwargs = backend_kwargs.copy()
+            backend_kwargs.pop("fs", None)
             overwrite_encoded_chunks = backend_kwargs.pop(
                 "overwrite_encoded_chunks", None
             )
+            extra_kwargs["mode"] = "r"
+            extra_kwargs["group"] = group
+            if fs is not None:
+                filename_or_obj = fs.get_mapper(filename_or_obj)
+                backend_kwargs.pop("storage_options", None)
         opener = _get_backend_cls(engine)
         store = opener(filename_or_obj, **extra_kwargs, **backend_kwargs)
 
@@ -786,7 +799,8 @@ def open_mfdataset(
         files to open. Paths can be given as strings or as pathlib Paths. If
         concatenation along more than one dimension is desired, then ``paths`` must be a
         nested list-of-lists (see ``combine_nested`` for details). (A string glob will
-        be expanded to a 1-dimensional list.)
+        be expanded to a 1-dimensional list.) When ``engine="zarr"`` is used, the
+        path(s) can be of any type understood by ``fsspec``.
     chunks : int or dict, optional
         Dictionary with keys given by dimension names and values given by chunk sizes.
         In general, these should divide the dimensions of each dataset. If int, chunk
@@ -907,13 +921,27 @@ def open_mfdataset(
     """
     if isinstance(paths, str):
         if is_remote_uri(paths):
-            raise ValueError(
-                "cannot do wild-card matching for paths that are remote URLs: "
-                "{!r}. Instead, supply paths as an explicit list of strings.".format(
-                    paths
+            if engine != "zarr":
+                raise ValueError(
+                    "cannot do wild-card matching for paths that are remote URLs: "
+                    "{!r}. Instead, supply paths as an explicit list of strings.".format(
+                        paths
+                    )
                 )
-            )
-        paths = sorted(glob(paths))
+            else:
+                import fsspec  # type: ignore
+
+                backend_kwargs = kwargs.get("backend_kwargs", {})
+                storage_options = backend_kwargs.get("storage_options", None)
+
+                fs, _, _ = fsspec.core.get_fs_token_paths(
+                    paths, storage_options=storage_options
+                )
+                paths = fs.expand_path(paths)
+                backend_kwargs["fs"] = fs
+                kwargs["backend_kwargs"] = backend_kwargs
+        else:
+            paths = sorted(glob(paths))
     else:
         paths = [str(p) if isinstance(p, Path) else p for p in paths]
 
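The ``open_mfdataset`` branch above leans on two fsspec calls, ``fsspec.core.get_fs_token_paths`` and ``AbstractFileSystem.expand_path``. A small sketch of what they do (illustrative only, not part of the patch), again using the in-memory filesystem and made-up store names:

.. code:: python

    import fsspec

    # create two placeholder entries in fsspec's in-memory filesystem
    fs = fsspec.filesystem("memory")
    for name in ["out1.zarr", "out2.zarr"]:
        with fs.open(f"/{name}", "wb") as f:
            f.write(b"placeholder")

    # resolve the URL to a filesystem instance; storage_options (if any) are
    # forwarded to the filesystem constructor, mirroring the code above
    fs2, _, _ = fsspec.core.get_fs_token_paths(
        "memory://out*.zarr", storage_options=None
    )

    # expand the glob into the concrete matching paths; open_mfdataset then opens
    # each path with the pre-configured filesystem passed via backend_kwargs["fs"]
    print(fs2.expand_path("memory://out*.zarr"))  # the two matching paths
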
diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py
index 7e4605c42ce..88fe611999c 100644
--- a/xarray/backends/apiv2.py
+++ b/xarray/backends/apiv2.py
@@ -214,7 +214,9 @@ def open_dataset(
     if backend_kwargs is None:
         backend_kwargs = {}
 
-    filename_or_obj = _normalize_path(filename_or_obj)
+    if "fs" not in backend_kwargs:
+        # do *not* mangle paths meant for a specific file system (set up in open_mfdataset)
+        filename_or_obj = _normalize_path(filename_or_obj)
 
     if engine is None:
         engine = _autodetect_engine(filename_or_obj)
diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py
index 9827c345239..e8dd44216d4 100644
--- a/xarray/backends/zarr.py
+++ b/xarray/backends/zarr.py
@@ -280,6 +280,7 @@ def open_group(
         consolidated=False,
         consolidate_on_close=False,
         chunk_store=None,
+        storage_options=None,
         append_dim=None,
         write_region=None,
     ):
@@ -289,6 +290,8 @@ def open_group(
         if chunk_store:
             open_kwargs["chunk_store"] = chunk_store
+        if storage_options:
+            open_kwargs["storage_options"] = storage_options
 
         if consolidated:
             # TODO: an option to pass the metadata_key keyword
             zarr_group = zarr.open_consolidated(store, **open_kwargs)
@@ -706,8 +709,12 @@ def open_backend_dataset_zarr(
     consolidated=False,
     consolidate_on_close=False,
     chunk_store=None,
+    fs=None,
 ):
 
+    if fs is not None:
+        filename_or_obj = fs.get_mapper(filename_or_obj)
+
     store = ZarrStore.open_group(
         filename_or_obj,
         group=group,
diff --git a/xarray/core/utils.py b/xarray/core/utils.py
index 05e6ee8716b..7b5db30c68f 100644
--- a/xarray/core/utils.py
+++ b/xarray/core/utils.py
@@ -608,7 +608,7 @@ def close_on_error(f):
 
 
 def is_remote_uri(path: str) -> bool:
-    return bool(re.search(r"^https?\://", path))
+    return bool(re.search(r"^[a-z][a-z0-9]*(\://|\:\:)", path))
 
 
 def is_grib_path(path: str) -> bool:
diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py
index 7c18f1a8c8a..6e883fd90f5 100644
--- a/xarray/tests/__init__.py
+++ b/xarray/tests/__init__.py
@@ -73,6 +73,7 @@ def LooseVersion(vstring):
 has_nc_time_axis, requires_nc_time_axis = _importorskip("nc_time_axis")
 has_rasterio, requires_rasterio = _importorskip("rasterio")
 has_zarr, requires_zarr = _importorskip("zarr")
+has_fsspec, requires_fsspec = _importorskip("fsspec")
 has_iris, requires_iris = _importorskip("iris")
 has_cfgrib, requires_cfgrib = _importorskip("cfgrib")
 has_numbagg, requires_numbagg = _importorskip("numbagg")
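The broadened ``is_remote_uri`` pattern above now treats any ``scheme://`` URL, as well as chained fsspec protocols written with ``::``, as remote, rather than only ``http``/``https``. A quick illustration with made-up paths (not part of the patch):

.. code:: python

    import re

    pattern = r"^[a-z][a-z0-9]*(\://|\:\:)"
    for path in [
        "https://example.com/data.nc",   # matched before and after the change
        "gcs://bucket/store.zarr",       # now treated as remote
        "memory://out1.zarr",            # now treated as remote
        "simplecache::s3://bucket/key",  # chained fsspec protocols also match
        "/local/path/store.zarr",        # still treated as local
    ]:
        print(path, bool(re.search(pattern, path)))
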
"getitems" + ): + pytest.skip("zarr too old") + + ds = open_dataset(os.path.join(os.path.dirname(__file__), "data", "example_1.nc")) + + m = fsspec.filesystem("memory") + mm = m.get_mapper("out1.zarr") + ds.to_zarr(mm) # old interface + ds0 = ds.copy() + ds0["time"] = ds.time + pd.to_timedelta("1 day") + mm = m.get_mapper("out2.zarr") + ds0.to_zarr(mm) # old interface + + url = "memory://out2.zarr" + ds2 = open_dataset(url, engine="zarr") + assert ds0 == ds2 + + url = "memory://out*.zarr" + ds2 = open_mfdataset(url, engine="zarr") + assert xr.concat([ds, ds0], dim="time") == ds2 + + @requires_h5netcdf def test_load_single_value_h5netcdf(tmp_path): """Test that numeric single-element vector attributes are handled fine.