Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update contains_cftime_datetimes to avoid loading entire variable array #7494

Merged
merged 23 commits into from
Mar 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ Bug fixes
- Fix matplotlib raising a UserWarning when plotting a scatter plot
with an unfilled marker (:issue:`7313`, :pull:`7318`).
By `Jimmy Westling <https://github.com/illviljan>`_.
- Improved performance in ``open_dataset`` for datasets with large object arrays (:issue:`7484`, :pull:`7494`).
By `Alex Goodman <https://github.com/agoodm>`_ and `Deepak Cherian <https://github.com/dcherian>`_.

Documentation
~~~~~~~~~~~~~
Expand Down
6 changes: 3 additions & 3 deletions xarray/coding/calendar_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def convert_calendar(
from xarray.core.dataarray import DataArray

time = obj[dim]
if not _contains_datetime_like_objects(time):
if not _contains_datetime_like_objects(time.variable):
raise ValueError(f"Coordinate {dim} must contain datetime objects.")

use_cftime = _should_cftime_be_used(time, calendar, use_cftime)
Expand Down Expand Up @@ -319,8 +319,8 @@ def interp_calendar(source, target, dim="time"):
target = DataArray(target, dims=(dim,), name=dim)

if not _contains_datetime_like_objects(
source[dim]
) or not _contains_datetime_like_objects(target):
source[dim].variable
) or not _contains_datetime_like_objects(target.variable):
raise ValueError(
f"Both 'source.{dim}' and 'target' must contain datetime objects."
)
Expand Down
2 changes: 1 addition & 1 deletion xarray/coding/cftime_offsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -1267,7 +1267,7 @@ def date_range_like(source, calendar, use_cftime=None):
if not isinstance(source, (pd.DatetimeIndex, CFTimeIndex)) and (
isinstance(source, DataArray)
and (source.ndim != 1)
or not _contains_datetime_like_objects(source)
or not _contains_datetime_like_objects(source.variable)
):
raise ValueError(
"'source' must be a 1D array of datetime objects for inferring its range."
Expand Down
3 changes: 2 additions & 1 deletion xarray/coding/frequencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,12 @@ def infer_freq(index):
If there are fewer than three values or the index is not 1D.
"""
from xarray.core.dataarray import DataArray
from xarray.core.variable import Variable

if isinstance(index, (DataArray, pd.Series)):
if index.ndim != 1:
raise ValueError("'index' must be 1D")
elif not _contains_datetime_like_objects(DataArray(index)):
elif not _contains_datetime_like_objects(Variable("dim", index)):
raise ValueError("'index' must contain datetime-like objects")
dtype = np.asarray(index).dtype
if dtype == "datetime64[ns]":
Expand Down
2 changes: 1 addition & 1 deletion xarray/core/accessor_dt.py
Original file line number Diff line number Diff line change
Expand Up @@ -574,7 +574,7 @@ def __new__(cls, obj: T_DataArray) -> CombinedDatetimelikeAccessor:
# we need to choose which parent (datetime or timedelta) is
# appropriate. Since we're checking the dtypes anyway, we'll just
# do all the validation here.
if not _contains_datetime_like_objects(obj):
if not _contains_datetime_like_objects(obj.variable):
raise TypeError(
"'.dt' accessor only available for "
"DataArray with datetime64 timedelta64 dtype or "
Expand Down
34 changes: 16 additions & 18 deletions xarray/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import pandas as pd

from xarray.core import dtypes, duck_array_ops, formatting, formatting_html, ops
from xarray.core.indexing import BasicIndexer, ExplicitlyIndexed
from xarray.core.options import OPTIONS, _get_keep_attrs
from xarray.core.pycompat import is_duck_dask_array
from xarray.core.utils import Frozen, either_dict_or_kwargs, is_scalar
Expand Down Expand Up @@ -40,6 +41,7 @@
ScalarOrArray,
SideOptions,
T_DataWithCoords,
T_Variable,
)
from xarray.core.variable import Variable

Expand Down Expand Up @@ -1770,31 +1772,27 @@ def is_np_timedelta_like(dtype: DTypeLike) -> bool:
return np.issubdtype(dtype, np.timedelta64)


def _contains_cftime_datetimes(array) -> bool:
"""Check if an array contains cftime.datetime objects"""
def _contains_cftime_datetimes(array: Any) -> bool:
"""Check if a array inside a Variable contains cftime.datetime objects"""
if cftime is None:
return False
else:
if array.dtype == np.dtype("O") and array.size > 0:
sample = np.asarray(array).flat[0]
if is_duck_dask_array(sample):
sample = sample.compute()
if isinstance(sample, np.ndarray):
sample = sample.item()
return isinstance(sample, cftime.datetime)
else:
return False

if array.dtype == np.dtype("O") and array.size > 0:
first_idx = (0,) * array.ndim
if isinstance(array, ExplicitlyIndexed):
first_idx = BasicIndexer(first_idx)
sample = array[first_idx]
return isinstance(np.asarray(sample).item(), cftime.datetime)

return False

def contains_cftime_datetimes(var) -> bool:

def contains_cftime_datetimes(var: T_Variable) -> bool:
"""Check if an xarray.Variable contains cftime.datetime objects"""
if var.dtype == np.dtype("O") and var.size > 0:
return _contains_cftime_datetimes(var.data)
else:
return False
return _contains_cftime_datetimes(var._data)


def _contains_datetime_like_objects(var) -> bool:
def _contains_datetime_like_objects(var: T_Variable) -> bool:
"""Check if a variable contains datetime like objects (either
np.datetime64, np.timedelta64, or cftime.datetime)
"""
Expand Down
13 changes: 12 additions & 1 deletion xarray/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,18 @@ def __init__(self, array):
self.array = array

def __getitem__(self, key):
raise UnexpectedDataAccess("Tried accessing data")
raise UnexpectedDataAccess("Tried accessing data.")

def __array__(self):
raise UnexpectedDataAccess("Tried accessing data.")


class FirstElementAccessibleArray(InaccessibleArray):
def __getitem__(self, key):
tuple_idxr = key.tuple
if len(tuple_idxr) > 1:
raise UnexpectedDataAccess("Tried accessing more than one element.")
return self.array[tuple_idxr]


class ReturnItem:
Expand Down
1 change: 0 additions & 1 deletion xarray/tests/test_accessor_dt.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,6 @@ def test_calendar_cftime(data) -> None:
assert data.time.dt.calendar == expected


@requires_cftime
def test_calendar_datetime64_2d() -> None:
data = xr.DataArray(np.zeros((4, 5), dtype="datetime64[ns]"), dims=("x", "y"))
assert data.dt.calendar == "proleptic_gregorian"
Expand Down
2 changes: 1 addition & 1 deletion xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -2560,7 +2560,7 @@ def test_open_zarr_use_cftime(self) -> None:
ds_a = xr.open_zarr(store_target, **self.version_kwargs)
assert_identical(ds, ds_a)
ds_b = xr.open_zarr(store_target, use_cftime=True, **self.version_kwargs)
assert xr.coding.times.contains_cftime_datetimes(ds_b.time)
assert xr.coding.times.contains_cftime_datetimes(ds_b.time.variable)

def test_write_read_select_write(self) -> None:
# Test for https://github.com/pydata/xarray/issues/4084
Expand Down
4 changes: 2 additions & 2 deletions xarray/tests/test_coding.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,13 @@ def test_CFMaskCoder_missing_value() -> None:
expected.attrs["missing_value"] = -9999

decoded = xr.decode_cf(expected.to_dataset())
encoded, _ = xr.conventions.cf_encoder(decoded, decoded.attrs)
encoded, _ = xr.conventions.cf_encoder(decoded.variables, decoded.attrs)
dcherian marked this conversation as resolved.
Show resolved Hide resolved

assert_equal(encoded["tmpk"], expected.variable)

decoded.tmpk.encoding["_FillValue"] = -9940
with pytest.raises(ValueError):
encoded, _ = xr.conventions.cf_encoder(decoded, decoded.attrs)
encoded, _ = xr.conventions.cf_encoder(decoded.variables, decoded.attrs)


@requires_dask
Expand Down
27 changes: 21 additions & 6 deletions xarray/tests/test_coding_times.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from xarray.core.common import contains_cftime_datetimes
from xarray.testing import assert_equal, assert_identical
from xarray.tests import (
FirstElementAccessibleArray,
arm_xfail,
assert_array_equal,
assert_no_warnings,
Expand Down Expand Up @@ -787,35 +788,35 @@ def times_3d(times):

@requires_cftime
def test_contains_cftime_datetimes_1d(data) -> None:
assert contains_cftime_datetimes(data.time)
assert contains_cftime_datetimes(data.time.variable)


@requires_cftime
@requires_dask
def test_contains_cftime_datetimes_dask_1d(data) -> None:
assert contains_cftime_datetimes(data.time.chunk())
assert contains_cftime_datetimes(data.time.variable.chunk())


@requires_cftime
def test_contains_cftime_datetimes_3d(times_3d) -> None:
assert contains_cftime_datetimes(times_3d)
assert contains_cftime_datetimes(times_3d.variable)


@requires_cftime
@requires_dask
def test_contains_cftime_datetimes_dask_3d(times_3d) -> None:
assert contains_cftime_datetimes(times_3d.chunk())
assert contains_cftime_datetimes(times_3d.variable.chunk())


@pytest.mark.parametrize("non_cftime_data", [DataArray([]), DataArray([1, 2])])
def test_contains_cftime_datetimes_non_cftimes(non_cftime_data) -> None:
assert not contains_cftime_datetimes(non_cftime_data)
assert not contains_cftime_datetimes(non_cftime_data.variable)


@requires_dask
@pytest.mark.parametrize("non_cftime_data", [DataArray([]), DataArray([1, 2])])
def test_contains_cftime_datetimes_non_cftimes_dask(non_cftime_data) -> None:
assert not contains_cftime_datetimes(non_cftime_data.chunk())
assert not contains_cftime_datetimes(non_cftime_data.variable.chunk())


@requires_cftime
Expand Down Expand Up @@ -1176,3 +1177,17 @@ def test_scalar_unit() -> None:
variable = Variable(("x", "y"), np.array([[0, 1], [2, 3]]), {"units": np.nan})
result = coding.times.CFDatetimeCoder().decode(variable)
assert np.isnan(result.attrs["units"])


@requires_cftime
def test_contains_cftime_lazy() -> None:
import cftime

from xarray.core.common import _contains_cftime_datetimes

times = np.array(
[cftime.DatetimeGregorian(1, 1, 2, 0), cftime.DatetimeGregorian(1, 1, 2, 0)],
dtype=object,
)
array = FirstElementAccessibleArray(times)
assert _contains_cftime_datetimes(array)
4 changes: 2 additions & 2 deletions xarray/tests/test_dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -6219,7 +6219,7 @@ def test_to_and_from_iris(self) -> None:
original_coord = original.coords[orginal_key]
assert coord.var_name == original_coord.name
assert_array_equal(
coord.points, CFDatetimeCoder().encode(original_coord).values
coord.points, CFDatetimeCoder().encode(original_coord.variable).values
)
assert actual.coord_dims(coord) == original.get_axis_num(
original.coords[coord.var_name].dims
Expand Down Expand Up @@ -6295,7 +6295,7 @@ def test_to_and_from_iris_dask(self) -> None:
original_coord = original.coords[orginal_key]
assert coord.var_name == original_coord.name
assert_array_equal(
coord.points, CFDatetimeCoder().encode(original_coord).values
coord.points, CFDatetimeCoder().encode(original_coord.variable).values
)
assert actual.coord_dims(coord) == original.get_axis_num(
original.coords[coord.var_name].dims
Expand Down