diff --git a/doc/whats-new.rst b/doc/whats-new.rst index b7e632bdfb7..3cc2efde599 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -44,6 +44,8 @@ Bug fixes - Fix matplotlib raising a UserWarning when plotting a scatter plot with an unfilled marker (:issue:`7313`, :pull:`7318`). By `Jimmy Westling `_. +- Improved performance in ``open_dataset`` for datasets with large object arrays (:issue:`7484`, :pull:`7494`). + By `Alex Goodman `_ and `Deepak Cherian `_. Documentation ~~~~~~~~~~~~~ diff --git a/xarray/coding/calendar_ops.py b/xarray/coding/calendar_ops.py index 06f57757619..dc2f95b832e 100644 --- a/xarray/coding/calendar_ops.py +++ b/xarray/coding/calendar_ops.py @@ -147,7 +147,7 @@ def convert_calendar( from xarray.core.dataarray import DataArray time = obj[dim] - if not _contains_datetime_like_objects(time): + if not _contains_datetime_like_objects(time.variable): raise ValueError(f"Coordinate {dim} must contain datetime objects.") use_cftime = _should_cftime_be_used(time, calendar, use_cftime) @@ -319,8 +319,8 @@ def interp_calendar(source, target, dim="time"): target = DataArray(target, dims=(dim,), name=dim) if not _contains_datetime_like_objects( - source[dim] - ) or not _contains_datetime_like_objects(target): + source[dim].variable + ) or not _contains_datetime_like_objects(target.variable): raise ValueError( f"Both 'source.{dim}' and 'target' must contain datetime objects." ) diff --git a/xarray/coding/cftime_offsets.py b/xarray/coding/cftime_offsets.py index bc3e3545892..792724ecc79 100644 --- a/xarray/coding/cftime_offsets.py +++ b/xarray/coding/cftime_offsets.py @@ -1267,7 +1267,7 @@ def date_range_like(source, calendar, use_cftime=None): if not isinstance(source, (pd.DatetimeIndex, CFTimeIndex)) and ( isinstance(source, DataArray) and (source.ndim != 1) - or not _contains_datetime_like_objects(source) + or not _contains_datetime_like_objects(source.variable) ): raise ValueError( "'source' must be a 1D array of datetime objects for inferring its range." diff --git a/xarray/coding/frequencies.py b/xarray/coding/frequencies.py index fef2f5a8319..4d24327aa2f 100644 --- a/xarray/coding/frequencies.py +++ b/xarray/coding/frequencies.py @@ -79,11 +79,12 @@ def infer_freq(index): If there are fewer than three values or the index is not 1D. """ from xarray.core.dataarray import DataArray + from xarray.core.variable import Variable if isinstance(index, (DataArray, pd.Series)): if index.ndim != 1: raise ValueError("'index' must be 1D") - elif not _contains_datetime_like_objects(DataArray(index)): + elif not _contains_datetime_like_objects(Variable("dim", index)): raise ValueError("'index' must contain datetime-like objects") dtype = np.asarray(index).dtype if dtype == "datetime64[ns]": diff --git a/xarray/core/accessor_dt.py b/xarray/core/accessor_dt.py index 118cbcb7ac5..b261bb26d23 100644 --- a/xarray/core/accessor_dt.py +++ b/xarray/core/accessor_dt.py @@ -574,7 +574,7 @@ def __new__(cls, obj: T_DataArray) -> CombinedDatetimelikeAccessor: # we need to choose which parent (datetime or timedelta) is # appropriate. Since we're checking the dtypes anyway, we'll just # do all the validation here. - if not _contains_datetime_like_objects(obj): + if not _contains_datetime_like_objects(obj.variable): raise TypeError( "'.dt' accessor only available for " "DataArray with datetime64 timedelta64 dtype or " diff --git a/xarray/core/common.py b/xarray/core/common.py index 3a73f463ea9..d980e622763 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -11,6 +11,7 @@ import pandas as pd from xarray.core import dtypes, duck_array_ops, formatting, formatting_html, ops +from xarray.core.indexing import BasicIndexer, ExplicitlyIndexed from xarray.core.options import OPTIONS, _get_keep_attrs from xarray.core.pycompat import is_duck_dask_array from xarray.core.utils import Frozen, either_dict_or_kwargs, is_scalar @@ -40,6 +41,7 @@ ScalarOrArray, SideOptions, T_DataWithCoords, + T_Variable, ) from xarray.core.variable import Variable @@ -1770,31 +1772,27 @@ def is_np_timedelta_like(dtype: DTypeLike) -> bool: return np.issubdtype(dtype, np.timedelta64) -def _contains_cftime_datetimes(array) -> bool: - """Check if an array contains cftime.datetime objects""" +def _contains_cftime_datetimes(array: Any) -> bool: + """Check if a array inside a Variable contains cftime.datetime objects""" if cftime is None: return False - else: - if array.dtype == np.dtype("O") and array.size > 0: - sample = np.asarray(array).flat[0] - if is_duck_dask_array(sample): - sample = sample.compute() - if isinstance(sample, np.ndarray): - sample = sample.item() - return isinstance(sample, cftime.datetime) - else: - return False + if array.dtype == np.dtype("O") and array.size > 0: + first_idx = (0,) * array.ndim + if isinstance(array, ExplicitlyIndexed): + first_idx = BasicIndexer(first_idx) + sample = array[first_idx] + return isinstance(np.asarray(sample).item(), cftime.datetime) + + return False -def contains_cftime_datetimes(var) -> bool: + +def contains_cftime_datetimes(var: T_Variable) -> bool: """Check if an xarray.Variable contains cftime.datetime objects""" - if var.dtype == np.dtype("O") and var.size > 0: - return _contains_cftime_datetimes(var.data) - else: - return False + return _contains_cftime_datetimes(var._data) -def _contains_datetime_like_objects(var) -> bool: +def _contains_datetime_like_objects(var: T_Variable) -> bool: """Check if a variable contains datetime like objects (either np.datetime64, np.timedelta64, or cftime.datetime) """ diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 1fe65a7e831..eda790694af 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -143,7 +143,18 @@ def __init__(self, array): self.array = array def __getitem__(self, key): - raise UnexpectedDataAccess("Tried accessing data") + raise UnexpectedDataAccess("Tried accessing data.") + + def __array__(self): + raise UnexpectedDataAccess("Tried accessing data.") + + +class FirstElementAccessibleArray(InaccessibleArray): + def __getitem__(self, key): + tuple_idxr = key.tuple + if len(tuple_idxr) > 1: + raise UnexpectedDataAccess("Tried accessing more than one element.") + return self.array[tuple_idxr] class ReturnItem: diff --git a/xarray/tests/test_accessor_dt.py b/xarray/tests/test_accessor_dt.py index aabdf6a5c75..ef91257c4d9 100644 --- a/xarray/tests/test_accessor_dt.py +++ b/xarray/tests/test_accessor_dt.py @@ -418,7 +418,6 @@ def test_calendar_cftime(data) -> None: assert data.time.dt.calendar == expected -@requires_cftime def test_calendar_datetime64_2d() -> None: data = xr.DataArray(np.zeros((4, 5), dtype="datetime64[ns]"), dims=("x", "y")) assert data.dt.calendar == "proleptic_gregorian" diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 91daabd12d5..bc6b095fc4e 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2560,7 +2560,7 @@ def test_open_zarr_use_cftime(self) -> None: ds_a = xr.open_zarr(store_target, **self.version_kwargs) assert_identical(ds, ds_a) ds_b = xr.open_zarr(store_target, use_cftime=True, **self.version_kwargs) - assert xr.coding.times.contains_cftime_datetimes(ds_b.time) + assert xr.coding.times.contains_cftime_datetimes(ds_b.time.variable) def test_write_read_select_write(self) -> None: # Test for https://github.com/pydata/xarray/issues/4084 diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index 5bf23819d87..f7579c4b488 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -65,13 +65,13 @@ def test_CFMaskCoder_missing_value() -> None: expected.attrs["missing_value"] = -9999 decoded = xr.decode_cf(expected.to_dataset()) - encoded, _ = xr.conventions.cf_encoder(decoded, decoded.attrs) + encoded, _ = xr.conventions.cf_encoder(decoded.variables, decoded.attrs) assert_equal(encoded["tmpk"], expected.variable) decoded.tmpk.encoding["_FillValue"] = -9940 with pytest.raises(ValueError): - encoded, _ = xr.conventions.cf_encoder(decoded, decoded.attrs) + encoded, _ = xr.conventions.cf_encoder(decoded.variables, decoded.attrs) @requires_dask diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 0746a949cc8..580de878fe6 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -31,6 +31,7 @@ from xarray.core.common import contains_cftime_datetimes from xarray.testing import assert_equal, assert_identical from xarray.tests import ( + FirstElementAccessibleArray, arm_xfail, assert_array_equal, assert_no_warnings, @@ -787,35 +788,35 @@ def times_3d(times): @requires_cftime def test_contains_cftime_datetimes_1d(data) -> None: - assert contains_cftime_datetimes(data.time) + assert contains_cftime_datetimes(data.time.variable) @requires_cftime @requires_dask def test_contains_cftime_datetimes_dask_1d(data) -> None: - assert contains_cftime_datetimes(data.time.chunk()) + assert contains_cftime_datetimes(data.time.variable.chunk()) @requires_cftime def test_contains_cftime_datetimes_3d(times_3d) -> None: - assert contains_cftime_datetimes(times_3d) + assert contains_cftime_datetimes(times_3d.variable) @requires_cftime @requires_dask def test_contains_cftime_datetimes_dask_3d(times_3d) -> None: - assert contains_cftime_datetimes(times_3d.chunk()) + assert contains_cftime_datetimes(times_3d.variable.chunk()) @pytest.mark.parametrize("non_cftime_data", [DataArray([]), DataArray([1, 2])]) def test_contains_cftime_datetimes_non_cftimes(non_cftime_data) -> None: - assert not contains_cftime_datetimes(non_cftime_data) + assert not contains_cftime_datetimes(non_cftime_data.variable) @requires_dask @pytest.mark.parametrize("non_cftime_data", [DataArray([]), DataArray([1, 2])]) def test_contains_cftime_datetimes_non_cftimes_dask(non_cftime_data) -> None: - assert not contains_cftime_datetimes(non_cftime_data.chunk()) + assert not contains_cftime_datetimes(non_cftime_data.variable.chunk()) @requires_cftime @@ -1176,3 +1177,17 @@ def test_scalar_unit() -> None: variable = Variable(("x", "y"), np.array([[0, 1], [2, 3]]), {"units": np.nan}) result = coding.times.CFDatetimeCoder().decode(variable) assert np.isnan(result.attrs["units"]) + + +@requires_cftime +def test_contains_cftime_lazy() -> None: + import cftime + + from xarray.core.common import _contains_cftime_datetimes + + times = np.array( + [cftime.DatetimeGregorian(1, 1, 2, 0), cftime.DatetimeGregorian(1, 1, 2, 0)], + dtype=object, + ) + array = FirstElementAccessibleArray(times) + assert _contains_cftime_datetimes(array) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 836d30b60b8..ed1abea5fbe 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -6219,7 +6219,7 @@ def test_to_and_from_iris(self) -> None: original_coord = original.coords[orginal_key] assert coord.var_name == original_coord.name assert_array_equal( - coord.points, CFDatetimeCoder().encode(original_coord).values + coord.points, CFDatetimeCoder().encode(original_coord.variable).values ) assert actual.coord_dims(coord) == original.get_axis_num( original.coords[coord.var_name].dims @@ -6295,7 +6295,7 @@ def test_to_and_from_iris_dask(self) -> None: original_coord = original.coords[orginal_key] assert coord.var_name == original_coord.name assert_array_equal( - coord.points, CFDatetimeCoder().encode(original_coord).values + coord.points, CFDatetimeCoder().encode(original_coord.variable).values ) assert actual.coord_dims(coord) == original.get_axis_num( original.coords[coord.var_name].dims