diff --git a/argopy/utils/checkers.py b/argopy/utils/checkers.py index e5ce2a20..b105d694 100644 --- a/argopy/utils/checkers.py +++ b/argopy/utils/checkers.py @@ -441,7 +441,7 @@ def check_index_cols(column_names: list, convention: str = "ar_index_global_prof return column_names -def check_gdac_path(path, errors="ignore"): # noqa: C901 +def check_gdac_path(path, errors:str="ignore", ignore_knowns:bool=False): # noqa: C901 """Check if a path has the expected GDAC structure Expected GDAC structure:: @@ -469,8 +469,11 @@ def check_gdac_path(path, errors="ignore"): # noqa: C901 ---------- path: str Path name to check, including access protocol - errors: str - "ignore" or "raise" (or "warn") + errors: str, default="ignore" + Determine how check procedure errors are handled: "ignore", "raise" or "warn" + ignore_knowns: bool, default=False + Should the checking procedure be by-passed for the internal list of known GDACs. + Set this to True to check if a known GDAC is connected or not. Returns ------- @@ -481,7 +484,7 @@ def check_gdac_path(path, errors="ignore"): # noqa: C901 :class:`argopy.stores.gdacfs`, :meth:`argopy.utils.list_gdac_servers` """ - if path in list_gdac_servers(): + if path in list_gdac_servers() and not ignore_knowns: return True else: @@ -644,7 +647,7 @@ def isAPIconnected(src="erddap", data=True): if src in list_src and getattr(list_src[src], "api_server_check", None): if src == 'gdac': - return check_gdac_path(list_src[src].api_server_check) + return check_gdac_path(list_src[src].api_server_check, ignore_knowns=True) else: return isalive(list_src[src].api_server_check) else: diff --git a/argopy/utils/locals.py b/argopy/utils/locals.py index 6e0b169f..ede7881c 100644 --- a/argopy/utils/locals.py +++ b/argopy/utils/locals.py @@ -184,6 +184,7 @@ def show_versions(file=sys.stdout, conda=False): # noqa: C901 [ ("boto3", get_version), ("h5netcdf", get_version), + ("numcodecs", get_version), ("s3fs", get_version), ("kerchunk", get_version), ("zarr", 
get_version), diff --git a/argopy/xarray.py b/argopy/xarray.py index 26cca9a3..c4ea4448 100644 --- a/argopy/xarray.py +++ b/argopy/xarray.py @@ -5,7 +5,9 @@ import pandas as pd import xarray as xr import logging +from typing import Union from xarray.backends import BackendEntrypoint # For xarray > 0.18 +from xarray.backends import ZarrStore try: import gsw @@ -14,6 +16,15 @@ except ModuleNotFoundError: with_gsw = False +try: + from dask.delayed import Delayed + + with_dask = True +except ModuleNotFoundError: + with_dask = False + Delayed = lambda x: x + + from .utils import is_list_of_strings from .utils import ( cast_Argo_variable_type, @@ -1929,6 +1940,44 @@ def list_WMO(self): """Return all possible WMO as a list""" return to_list(np.unique(self._obj["PLATFORM_NUMBER"].values)) + def to_zarr(self, *args, **kwargs) -> Union[ZarrStore, Delayed]: + """Write Argo dataset content to a zarr group + + Before write operation is delegated to :class:`xarray.Dataset.to_zarr`, we perform the following: + + - Ensure all variables are appropriately cast. + - If the ``encoding`` argument is not specified, we automatically add a ``Blosc(cname="zstd", clevel=3, shuffle=2)`` compression to all variables. Set `encoding=None` for no compression. + + Parameters + ---------- + *args, **kwargs: + Passed to :class:`xarray.Dataset.to_zarr`. 
+ + Returns + ------- + The output from :class:`xarray.Dataset.to_zarr` call + + See Also + -------- + :class:`xarray.Dataset.to_zarr`, :class:`numcodecs.blosc.Blosc` + """ + + # Ensure that all variables are cast appropriately + # (those already cast are not changed) + self._obj = self.cast_types() + + # Add zarr compression to encoding: + if "encoding" not in kwargs: + from numcodecs import Blosc + compressor = Blosc(cname="zstd", clevel=3, shuffle=2) + encoding = {} + for v in self._obj: + encoding.update({v: {"compressor": compressor}}) + kwargs.update({'encoding': encoding}) + + # Convert to a zarr file using compression: + return self._obj.to_zarr(*args, **kwargs) + def open_Argo_dataset(filename_or_obj): ds = xr.open_dataset(filename_or_obj, decode_cf=1, use_cftime=0, mask_and_scale=1) diff --git a/cli/show_versions b/cli/show_versions index 86d7d589..2e14dba3 100755 --- a/cli/show_versions +++ b/cli/show_versions @@ -169,6 +169,7 @@ def show_versions(file=sys.stdout, conda=False, free=False, core=False): # noqa [ ("zarr", get_version), ("boto3", get_version), + ("numcodecs", get_version), ("s3fs", get_version), ("kerchunk", get_version), ] diff --git a/docs/api-hidden.rst b/docs/api-hidden.rst index 767379da..850e7a4c 100644 --- a/docs/api-hidden.rst +++ b/docs/api-hidden.rst @@ -330,6 +330,7 @@ argopy.xarray.ArgoAccessor.list_WMO_CYC argopy.xarray.ArgoAccessor.N_POINTS argopy.xarray.ArgoAccessor.N_PROF + argopy.xarray.ArgoAccessor.to_zarr argopy.xarray.ArgoEngine diff --git a/docs/api.rst b/docs/api.rst index c602d908..84a22cdb 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -207,6 +207,7 @@ Misc Dataset.argo.uid Dataset.argo.cast_types Dataset.argo.N_POINTS + Dataset.argo.to_zarr Utilities diff --git a/docs/conf.py b/docs/conf.py index 4c0988f2..41391cf9 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -404,4 +404,5 @@ 'boto3': ('https://boto3.amazonaws.com/v1/documentation/api/latest/', None), 's3fs': ('https://s3fs.readthedocs.io/en/latest/', None), 
'kerchunk': ('https://fsspec.github.io/kerchunk/', None), + 'numcodecs': ('https://numcodecs.readthedocs.io/en/stable/', None), } diff --git a/docs/energy.rst b/docs/energy.rst index 05af8e47..da940f4e 100644 --- a/docs/energy.rst +++ b/docs/energy.rst @@ -32,6 +32,7 @@ All branches are also monitored. Their metrics can be summed to compute each new - `Energy used by upstream CI tests running daily and on each commit in the master branch`_ + .. |energyused_CItests| image:: https://api.green-coding.io/v1/ci/badge/get?repo=euroargodev/argopy&branch=master&workflow=22344160&mode=totals :target: https://metrics.green-coding.io/ci.html?repo=euroargodev/argopy&branch=master&workflow=22344160 diff --git a/docs/whats-new.rst b/docs/whats-new.rst index 1b1cf1db..0ea22eb0 100644 --- a/docs/whats-new.rst +++ b/docs/whats-new.rst @@ -56,6 +56,18 @@ With more details: Internals ^^^^^^^^^ +- **Support Argo dataset export to zarr**. Provide preliminary support to export Argo datasets to zarr files (local or remote). (:pr:`423`) by |gmaze|. + +.. code-block:: python + :caption: Export to zarr + + from argopy import DataFetcher + ds = DataFetcher(src='gdac').float(6903091).to_xarray() + # then: + ds.argo.to_zarr("6903091_prof.zarr") + # or: + ds.argo.to_zarr("s3://argopy/sample-data/6903091_prof.zarr") + - **Open netcdf files lazily**. We now provide low-level support for opening a netcdf Argo dataset lazily with `kerchunk `_. Simply use the new option ``lazy=True`` with a :class:`stores.httpstore.open_dataset` or :class:`stores.s3store.open_dataset`. (:pr:`385`) by |gmaze|. .. code-block:: python