zarr-developers · TomNicholas · Aug 27, 2024 · Aug 26, 2024 · Aug 26, 2024 · Aug 27, 2024
diff --git a/docs/releases.rst b/docs/releases.rst
@@ -9,6 +9,9 @@ v1.0.1 (unreleased)
 New Features
 ~~~~~~~~~~~~
 
+- Adds `decode_times` to open_virtual_dataset (:pull:`232`)
+  By `Raphael Hagen <https://github.com/norlandrhagen>`_.
+
 - Add parser for the OPeNDAP DMR++ XML format and integration with open_virtual_dataset (:pull:`113`)
   By `Ayush Nag <https://github.com/ayushnag>`_.
 
@@ -17,17 +20,18 @@ New Features
 
 Breaking changes
 ~~~~~~~~~~~~~~~~
-
 - Serialize valid ZarrV3 metadata and require full compressor numcodec config (for :pull:`193`)
   By `Gustavo Hidalgo <https://github.com/ghidalgo3>`_.
 - VirtualiZarr's `ZArray`, `ChunkEntry`, and `Codec` no longer subclass
   `pydantic.BaseModel` (:pull:`210`)
 - `ZArray`'s `__init__` signature has changed to match `zarr.Array`'s (:pull:`xxx`)
 
-
 Deprecations
 ~~~~~~~~~~~~
 
+- Depreciates cftime_variables in open_virtual_dataset in favor of decode_times. (:pull:`232`)
+  By `Raphael Hagen <https://github.com/norlandrhagen>`_.
+
 Bug fixes
 ~~~~~~~~~
 

diff --git a/docs/usage.md b/docs/usage.md
@@ -306,8 +306,8 @@ Dimensions:  (time: 2920, lat: 25, lon: 53)
 Coordinates:
     lat      (lat) float32 100B ManifestArray<shape=(25,), dtype=float32, chu...
     lon      (lon) float32 212B ManifestArray<shape=(53,), dtype=float32, chu...
-  * time     (time) float32 12kB 1.867e+06 1.867e+06 ... 1.885e+06 1.885e+06
-Data variables:
+  * time     (time) datetime64[ns] 23kB 2013-01-01 ... 2014-12-31T18:00:00
+  Data variables:
     air      (time, lat, lon) float64 31MB ...
 Attributes:
     Conventions:  COARDS
@@ -325,13 +325,15 @@ Loading variables can be useful in a few scenarios:
 
 ### CF-encoded time variables
 
-Notice that the `time` variable that was loaded above does not have the expected dtype. To correctly decode time variables according to the CF conventions (like `xr.open_dataset` does by default), you need to include them in an additional keyword argument `cftime_variables`:
+To correctly decode time variables according to the CF conventions, you need to pass `time` to `loadable_variables` and ensure the `decode_times` argument of `open_virtual_dataset` is set to True (`decode_times` defaults to None).
+
+
 
 ```python
 vds = open_virtual_dataset(
     'air.nc',
     loadable_variables=['air', 'time'],
-    cftime_variables=['time'],
+    decode_times=True,
     indexes={},
 )
 ```
@@ -352,7 +354,7 @@ Attributes:
     title:        4x daily NMC reanalysis (1948)
 ```
 
-Now the loaded time variable has a `datetime64[ns]` dtype. Any variables listed as `cftime_variables` must also be listed as `loadable_variables`.
+
 
 ## Writing virtual stores to disk
 

diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py
@@ -12,7 +12,6 @@
 
 import xarray as xr
 from xarray.backends import AbstractDataStore, BackendArray
-from xarray.coding.times import CFDatetimeCoder
 from xarray.core.indexes import Index, PandasIndex
 from xarray.core.variable import IndexVariable
 
@@ -54,6 +53,7 @@ def open_virtual_dataset(
     filetype: FileType | None = None,
     drop_variables: Iterable[str] | None = None,
     loadable_variables: Iterable[str] | None = None,
+    decode_times: bool | None = None,
     cftime_variables: Iterable[str] | None = None,
     indexes: Mapping[str, Index] | None = None,
     virtual_array_class=ManifestArray,
@@ -79,10 +79,9 @@ def open_virtual_dataset(
     loadable_variables: list[str], default is None
         Variables in the file to open as lazy numpy/dask arrays instead of instances of virtual_array_class.
         Default is to open all variables as virtual arrays (i.e. ManifestArray).
-    cftime_variables : list[str], default is None
-        Interpret the value of specified vars using cftime, returning a datetime.
-        These will be automatically re-encoded with cftime. This list must be a subset
-        of ``loadable_variables``.
+    decode_times: bool | None, default is None
+        Bool that is passed into Xarray's open_dataset. Allows time to be decoded into a datetime object.
+    cftime_variables: Iterable[str] | None = None
     indexes : Mapping[str, Index], default is None
         Indexes to use on the returned xarray Dataset.
         Default is None, which will read any 1D coordinate data to create in-memory Pandas indexes.
@@ -99,6 +98,15 @@ def open_virtual_dataset(
     vds
         An xarray Dataset containing instances of virtual_array_cls for each variable, or normal lazily indexed arrays for each variable in loadable_variables.
     """
+
+    if cftime_variables is not None:
+        # It seems like stacklevel=2 is req to surface this warning.
+        warnings.warn(
+            "cftime_variables is depreciated and will be ignored. Pass decode_times=True and loadable_variables=['time'] to decode time values to datetime objects.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
     loadable_vars: dict[str, xr.Variable]
     virtual_vars: dict[str, xr.Variable]
     vars: dict[str, xr.Variable]
@@ -119,20 +127,6 @@ def open_virtual_dataset(
     if common:
         raise ValueError(f"Cannot both load and drop variables {common}")
 
-    if cftime_variables is None:
-        cftime_variables = []
-    elif isinstance(cftime_variables, str):
-        cftime_variables = [cftime_variables]
-    else:
-        cftime_variables = list(cftime_variables)
-
-    if diff := (set(cftime_variables) - set(loadable_variables)):
-        missing_str = ", ".join([f"'{v}'" for v in diff])
-        raise ValueError(
-            "All ``cftime_variables`` must be included in ``loadable_variables`` "
-            f"({missing_str} not in ``loadable_variables``)"
-        )
-
     if virtual_array_class is not ManifestArray:
         raise NotImplementedError()
 
@@ -150,9 +144,9 @@ def open_virtual_dataset(
     elif filetype == FileType.dmrpp:
         from virtualizarr.readers.dmrpp import DMRParser
 
-        if loadable_variables != [] or cftime_variables != [] or indexes is None:
+        if loadable_variables != [] or indexes is None:
             raise NotImplementedError(
-                "Specifying `loadable_variables`, `cftime_variables` or auto-creating indexes with `indexes=None` is not supported for dmrpp files."
+                "Specifying `loadable_variables` or auto-creating indexes with `indexes=None` is not supported for dmrpp files."
             )
 
         fpath = _fsspec_openfile_from_filepath(
@@ -202,7 +196,7 @@ def open_virtual_dataset(
             ds = xr.open_dataset(
                 cast(XArrayOpenT, fpath),
                 drop_variables=drop_variables,
-                decode_times=False,
+                decode_times=decode_times,
             )
 
             if indexes is None:
@@ -225,10 +219,6 @@ def open_virtual_dataset(
                 if name in loadable_variables
             }
 
-            for name in cftime_variables:
-                var = loadable_vars[name]
-                loadable_vars[name] = CFDatetimeCoder().decode(var, name=name)
-
             # if we only read the indexes we can just close the file right away as nothing is lazy
             if loadable_vars == {}:
                 ds.close()

diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py
@@ -82,7 +82,7 @@ def test_no_indexes(self, netcdf4_file):
     def test_create_default_indexes(self, netcdf4_file):
         with pytest.warns(UserWarning, match="will create in-memory pandas indexes"):
             vds = open_virtual_dataset(netcdf4_file, indexes=None)
-        ds = open_dataset(netcdf4_file, decode_times=False)
+        ds = open_dataset(netcdf4_file, decode_times=True)
 
         # TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812
         assert index_mappings_equal(vds.xindexes, ds.xindexes)
@@ -104,6 +104,31 @@ def index_mappings_equal(indexes1: Mapping[str, Index], indexes2: Mapping[str, I
     return True
 
 
+def test_cftime_index(tmpdir):
+    """Ensure a virtual dataset contains the same indexes as an Xarray dataset"""
+    # Note: Test was created to debug: https://github.com/zarr-developers/VirtualiZarr/issues/168
+    ds = xr.Dataset(
+        data_vars={
+            "tasmax": (["time", "lat", "lon"], np.random.rand(2, 18, 36)),
+        },
+        coords={
+            "time": np.array(["2023-01-01", "2023-01-02"], dtype="datetime64[ns]"),
+            "lat": np.arange(-90, 90, 10),
+            "lon": np.arange(-180, 180, 10),
+        },
+        attrs={"attr1_key": "attr1_val"},
+    )
+    ds.to_netcdf(f"{tmpdir}/tmp.nc")
+    vds = open_virtual_dataset(
+        f"{tmpdir}/tmp.nc", loadable_variables=["time", "lat", "lon"], indexes={}
+    )
+    # TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812
+    assert index_mappings_equal(vds.xindexes, ds.xindexes)
+    assert list(ds.coords) == list(vds.coords)
+    assert vds.dims == ds.dims
+    assert vds.attrs == ds.attrs
+
+
 class TestOpenVirtualDatasetAttrs:
     def test_drop_array_dimensions(self, netcdf4_file):
         # regression test for GH issue #150
@@ -211,7 +236,7 @@ def test_loadable_variables(self, netcdf4_file):
             else:
                 assert isinstance(vds[name].data, ManifestArray), name
 
-        full_ds = xr.open_dataset(netcdf4_file, decode_times=False)
+        full_ds = xr.open_dataset(netcdf4_file, decode_times=True)
 
         for name in full_ds.variables:
             if name in vars_to_load:
@@ -246,10 +271,3 @@ def test_open_dataset_with_scalar(self, hdf5_scalar, tmpdir):
         vds = open_virtual_dataset(hdf5_scalar)
         assert vds.scalar.dims == ()
         assert vds.scalar.attrs == {"scalar": "true"}
-
-
-def test_cftime_variables_must_be_in_loadable_variables(tmpdir):
-    ds = xr.Dataset(data_vars={"time": ["2024-06-21"]})
-    ds.to_netcdf(f"{tmpdir}/scalar.nc")
-    with pytest.raises(ValueError, match="'time' not in"):
-        open_virtual_dataset(f"{tmpdir}/scalar.nc", cftime_variables=["time"])
diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py
@@ -92,13 +92,11 @@ def test_kerchunk_roundtrip_concat(self, tmpdir, format, decode_times, time_vars
             f"{tmpdir}/air1.nc",
             indexes={},
             loadable_variables=time_vars,
-            cftime_variables=time_vars,
         )
         vds2 = open_virtual_dataset(
             f"{tmpdir}/air2.nc",
             indexes={},
             loadable_variables=time_vars,
-            cftime_variables=time_vars,
         )
 
         if decode_times is False: