From 8bf415a15cc17995df5afa9af27409d903d48dcc Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 16 Feb 2021 16:18:05 -0500 Subject: [PATCH] Allow fsspec URLs in open_(mf)dataset (#4823) Co-authored-by: keewis Co-authored-by: Ray Bell --- ci/requirements/environment.yml | 1 + ci/requirements/py38-all-but-dask.yml | 2 ++ doc/io.rst | 35 +++++++++++++++--- doc/whats-new.rst | 5 +++ setup.cfg | 2 ++ xarray/backends/api.py | 27 ++++++++++++-- xarray/backends/zarr.py | 16 ++++++++- xarray/core/utils.py | 7 +++- xarray/tests/__init__.py | 1 + xarray/tests/test_backends.py | 52 ++++++++++++++++++++++++++- 10 files changed, 138 insertions(+), 10 deletions(-) diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index 0f59d9570c8..1bbe349ab21 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -3,6 +3,7 @@ channels: - conda-forge - nodefaults dependencies: + - aiobotocore - boto3 - bottleneck - cartopy diff --git a/ci/requirements/py38-all-but-dask.yml b/ci/requirements/py38-all-but-dask.yml index 51ec48cc6b1..07dc6344a25 100644 --- a/ci/requirements/py38-all-but-dask.yml +++ b/ci/requirements/py38-all-but-dask.yml @@ -4,6 +4,8 @@ channels: - nodefaults dependencies: - python=3.8 + - black + - aiobotocore - boto3 - bottleneck - cartopy diff --git a/doc/io.rst b/doc/io.rst index 2e46879929b..b97f1f5a699 100644 --- a/doc/io.rst +++ b/doc/io.rst @@ -890,17 +890,44 @@ Cloud Storage Buckets It is possible to read and write xarray datasets directly from / to cloud storage buckets using zarr. This example uses the `gcsfs`_ package to provide -a ``MutableMapping`` interface to `Google Cloud Storage`_, which we can then -pass to xarray:: +an interface to `Google Cloud Storage`_. + +From v0.16.2: general `fsspec`_ URLs are parsed and the store set up for you +automatically when reading, such that you can open a dataset in a single +call. You should include any arguments to the storage backend as the +key ``storage_options``, part of ``backend_kwargs``. + +.. code:: python + + ds_gcs = xr.open_dataset( + "gcs:///path.zarr", + backend_kwargs={ + "storage_options": {"project": "", "token": None} + }, + engine="zarr", + ) + + +This also works with ``open_mfdataset``, allowing you to pass a list of paths or +a URL to be interpreted as a glob string. + +For older versions, and for writing, you must explicitly set up a ``MutableMapping`` +instance and pass this, as follows: + +.. code:: python import gcsfs - fs = gcsfs.GCSFileSystem(project='', token=None) - gcsmap = gcsfs.mapping.GCSMap('', gcs=fs, check=True, create=False) + + fs = gcsfs.GCSFileSystem(project="", token=None) + gcsmap = gcsfs.mapping.GCSMap("", gcs=fs, check=True, create=False) # write to the bucket ds.to_zarr(store=gcsmap) # read it back ds_gcs = xr.open_zarr(gcsmap) +(or use the utility function ``fsspec.get_mapper()``). + +.. _fsspec: https://filesystem-spec.readthedocs.io/en/latest/ .. _Zarr: http://zarr.readthedocs.io/ .. _Amazon S3: https://aws.amazon.com/s3/ .. _Google Cloud Storage: https://cloud.google.com/storage/ diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 4b06003b630..ef4abb15129 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -76,6 +76,11 @@ New Features in the form of kwargs as well as a dict, like most similar methods. By `Maximilian Roos `_. +- :py:func:`open_dataset` and :py:func:`open_mfdataset` now accept ``fsspec`` URLs + (including globs for the latter) for ``engine="zarr"``, and so allow reading from + many remote and other file systems (:pull:`4461`) + By `Martin Durant `_ + Bug fixes ~~~~~~~~~ - :py:meth:`DataArray.resample` and :py:meth:`Dataset.resample` do not trigger computations anymore if :py:meth:`Dataset.weighted` or :py:meth:`DataArray.weighted` are applied (:issue:`4625`, :pull:`4668`). By `Julius Busecke `_. diff --git a/setup.cfg b/setup.cfg index 72d28d3ca6f..231865d7788 100644 --- a/setup.cfg +++ b/setup.cfg @@ -185,6 +185,8 @@ ignore_missing_imports = True ignore_missing_imports = True [mypy-distributed.*] ignore_missing_imports = True +[mypy-fsspec.*] +ignore_missing_imports = True [mypy-h5netcdf.*] ignore_missing_imports = True [mypy-h5py.*] diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 81314588784..0791d1cdaf1 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -643,7 +643,9 @@ def open_dataarray( backend_kwargs: dict, optional A dictionary of keyword arguments to pass on to the backend. This may be useful when backend options would improve performance or - allow user control of dataset processing. + allow user control of dataset processing. If using fsspec URLs, + include the key "storage_options" to pass arguments to the + storage layer. use_cftime: bool, optional Only relevant if encoded dates come from a standard calendar (e.g. "gregorian", "proleptic_gregorian", "standard", or not @@ -869,14 +871,33 @@ def open_mfdataset( .. [2] http://xarray.pydata.org/en/stable/dask.html#chunking-and-performance """ if isinstance(paths, str): - if is_remote_uri(paths): + if is_remote_uri(paths) and engine == "zarr": + try: + from fsspec.core import get_fs_token_paths + except ImportError as e: + raise ImportError( + "The use of remote URLs for opening zarr requires the package fsspec" + ) from e + + fs, _, _ = get_fs_token_paths( + paths, + mode="rb", + storage_options=kwargs.get("backend_kwargs", {}).get( + "storage_options", {} + ), + expand=False, + ) + paths = fs.glob(fs._strip_protocol(paths)) # finds directories + paths = [fs.get_mapper(path) for path in paths] + elif is_remote_uri(paths): raise ValueError( "cannot do wild-card matching for paths that are remote URLs: " "{!r}. Instead, supply paths as an explicit list of strings.".format( paths ) ) - paths = sorted(glob(_normalize_path(paths))) + else: + paths = sorted(glob(_normalize_path(paths))) else: paths = [str(p) if isinstance(p, Path) else p for p in paths] diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 04fdeac6450..074572169ce 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1,5 +1,6 @@ import os import pathlib +from distutils.version import LooseVersion import numpy as np @@ -295,6 +296,7 @@ def open_group( consolidated=False, consolidate_on_close=False, chunk_store=None, + storage_options=None, append_dim=None, write_region=None, ): @@ -303,7 +305,15 @@ def open_group( if isinstance(store, pathlib.Path): store = os.fspath(store) - open_kwargs = dict(mode=mode, synchronizer=synchronizer, path=group) + open_kwargs = dict( + mode=mode, + synchronizer=synchronizer, + path=group, + ) + if LooseVersion(zarr.__version__) >= "2.5.0": + open_kwargs["storage_options"] = storage_options + elif storage_options: + raise ValueError("Storage options only compatible with zarr>=2.5.0") if chunk_store: open_kwargs["chunk_store"] = chunk_store @@ -537,6 +547,7 @@ def open_zarr( consolidated=False, overwrite_encoded_chunks=False, chunk_store=None, + storage_options=None, decode_timedelta=None, use_cftime=None, **kwargs, @@ -649,6 +660,7 @@ def open_zarr( "consolidated": consolidated, "overwrite_encoded_chunks": overwrite_encoded_chunks, "chunk_store": chunk_store, + "storage_options": storage_options, } ds = open_dataset( @@ -687,6 +699,7 @@ def open_dataset( consolidated=False, consolidate_on_close=False, chunk_store=None, + storage_options=None, ): store = ZarrStore.open_group( filename_or_obj, @@ -696,6 +709,7 @@ def open_dataset( consolidated=consolidated, consolidate_on_close=consolidate_on_close, chunk_store=chunk_store, + storage_options=storage_options, ) store_entrypoint = StoreBackendEntrypoint() diff --git a/xarray/core/utils.py b/xarray/core/utils.py index ced688f32dd..9648458ec6d 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -645,7 +645,12 @@ def close_on_error(f): def is_remote_uri(path: str) -> bool: - return bool(re.search(r"^https?\://", path)) + """Finds URLs of the form protocol:// or protocol:: + + This also matches for http[s]://, which were the only remote URLs + supported in <=v0.16.2. + """ + return bool(re.search(r"^[a-z][a-z0-9]*(\://|\:\:)", path)) def read_magic_number(filename_or_obj, count=8): diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index a7761aefa3d..4b47e1d2c7e 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -74,6 +74,7 @@ def LooseVersion(vstring): has_nc_time_axis, requires_nc_time_axis = _importorskip("nc_time_axis") has_rasterio, requires_rasterio = _importorskip("rasterio") has_zarr, requires_zarr = _importorskip("zarr") +has_fsspec, requires_fsspec = _importorskip("fsspec") has_iris, requires_iris = _importorskip("iris") has_cfgrib, requires_cfgrib = _importorskip("cfgrib") has_numbagg, requires_numbagg = _importorskip("numbagg") diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 3750c0715ae..75e0edb4fb2 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -54,6 +54,7 @@ requires_cfgrib, requires_cftime, requires_dask, + requires_fsspec, requires_h5netcdf, requires_netCDF4, requires_pseudonetcdf, @@ -3040,10 +3041,17 @@ def test_open_mfdataset(self): with raises_regex(IOError, "no files to open"): open_mfdataset("foo-bar-baz-*.nc") - with raises_regex(ValueError, "wild-card"): open_mfdataset("http://some/remote/uri") + @requires_fsspec + def test_open_mfdataset_no_files(self): + pytest.importorskip("aiobotocore") + + # glob is attempted as of #4823, but finds no files + with raises_regex(OSError, "no files"): + open_mfdataset("http://some/remote/uri", engine="zarr") + def test_open_mfdataset_2d(self): original = Dataset({"foo": (["x", "y"], np.random.randn(10, 8))}) with create_tmp_file() as tmp1: @@ -4799,6 +4807,48 @@ def test_extract_zarr_variable_encoding(): ) +@requires_zarr +@requires_fsspec +def test_open_fsspec(): + import fsspec + import zarr + + if not hasattr(zarr.storage, "FSStore") or not hasattr( + zarr.storage.FSStore, "getitems" + ): + pytest.skip("zarr too old") + + ds = open_dataset(os.path.join(os.path.dirname(__file__), "data", "example_1.nc")) + + m = fsspec.filesystem("memory") + mm = m.get_mapper("out1.zarr") + ds.to_zarr(mm) # old interface + ds0 = ds.copy() + ds0["time"] = ds.time + pd.to_timedelta("1 day") + mm = m.get_mapper("out2.zarr") + ds0.to_zarr(mm) # old interface + + # single dataset + url = "memory://out2.zarr" + ds2 = open_dataset(url, engine="zarr") + assert ds0 == ds2 + + # single dataset with caching + url = "simplecache::memory://out2.zarr" + ds2 = open_dataset(url, engine="zarr") + assert ds0 == ds2 + + # multi dataset + url = "memory://out*.zarr" + ds2 = open_mfdataset(url, engine="zarr") + assert xr.concat([ds, ds0], dim="time") == ds2 + + # multi dataset with caching + url = "simplecache::memory://out*.zarr" + ds2 = open_mfdataset(url, engine="zarr") + assert xr.concat([ds, ds0], dim="time") == ds2 + + @requires_h5netcdf def test_load_single_value_h5netcdf(tmp_path): """Test that numeric single-element vector attributes are handled fine.