Skip to content

Commit

Permalink
Update contains_cftime_datetimes to avoid loading entire variable arr…
Browse files Browse the repository at this point in the history
…ay (#7494)

* Update contains_cftime_datetimes to avoid loading entire variable array

* Update whats-new.rst

* Convert arrays to variable instead for better control

* fix mypy?

* Update common.py

* Update xarray/core/common.py

Co-authored-by: Mathias Hauser <[email protected]>

* Update common.py

remove _variable_contains_cftime_datetimes

* Avoid creating variable.

* Add test

* minimize diff

* Update tests.

* address comment

* Fix test

* Fix whats-new

* Fix more tests

* More fixes

* fix iris tests

---------

Co-authored-by: Illviljan <[email protected]>
Co-authored-by: Mathias Hauser <[email protected]>
Co-authored-by: dcherian <[email protected]>
Co-authored-by: Deepak Cherian <[email protected]>
  • Loading branch information
5 people authored Mar 7, 2023
1 parent 830ee6d commit 798f4d4
Show file tree
Hide file tree
Showing 12 changed files with 63 additions and 37 deletions.
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ Bug fixes
- Fix matplotlib raising a UserWarning when plotting a scatter plot
with an unfilled marker (:issue:`7313`, :pull:`7318`).
By `Jimmy Westling <https://github.com/illviljan>`_.
- Improved performance in ``open_dataset`` for datasets with large object arrays (:issue:`7484`, :pull:`7494`).
By `Alex Goodman <https://github.com/agoodm>`_ and `Deepak Cherian <https://github.com/dcherian>`_.

Documentation
~~~~~~~~~~~~~
Expand Down
6 changes: 3 additions & 3 deletions xarray/coding/calendar_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def convert_calendar(
from xarray.core.dataarray import DataArray

time = obj[dim]
if not _contains_datetime_like_objects(time):
if not _contains_datetime_like_objects(time.variable):
raise ValueError(f"Coordinate {dim} must contain datetime objects.")

use_cftime = _should_cftime_be_used(time, calendar, use_cftime)
Expand Down Expand Up @@ -319,8 +319,8 @@ def interp_calendar(source, target, dim="time"):
target = DataArray(target, dims=(dim,), name=dim)

if not _contains_datetime_like_objects(
source[dim]
) or not _contains_datetime_like_objects(target):
source[dim].variable
) or not _contains_datetime_like_objects(target.variable):
raise ValueError(
f"Both 'source.{dim}' and 'target' must contain datetime objects."
)
Expand Down
2 changes: 1 addition & 1 deletion xarray/coding/cftime_offsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -1267,7 +1267,7 @@ def date_range_like(source, calendar, use_cftime=None):
if not isinstance(source, (pd.DatetimeIndex, CFTimeIndex)) and (
isinstance(source, DataArray)
and (source.ndim != 1)
or not _contains_datetime_like_objects(source)
or not _contains_datetime_like_objects(source.variable)
):
raise ValueError(
"'source' must be a 1D array of datetime objects for inferring its range."
Expand Down
3 changes: 2 additions & 1 deletion xarray/coding/frequencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,12 @@ def infer_freq(index):
If there are fewer than three values or the index is not 1D.
"""
from xarray.core.dataarray import DataArray
from xarray.core.variable import Variable

if isinstance(index, (DataArray, pd.Series)):
if index.ndim != 1:
raise ValueError("'index' must be 1D")
elif not _contains_datetime_like_objects(DataArray(index)):
elif not _contains_datetime_like_objects(Variable("dim", index)):
raise ValueError("'index' must contain datetime-like objects")
dtype = np.asarray(index).dtype
if dtype == "datetime64[ns]":
Expand Down
2 changes: 1 addition & 1 deletion xarray/core/accessor_dt.py
Original file line number Diff line number Diff line change
Expand Up @@ -574,7 +574,7 @@ def __new__(cls, obj: T_DataArray) -> CombinedDatetimelikeAccessor:
# we need to choose which parent (datetime or timedelta) is
# appropriate. Since we're checking the dtypes anyway, we'll just
# do all the validation here.
if not _contains_datetime_like_objects(obj):
if not _contains_datetime_like_objects(obj.variable):
raise TypeError(
"'.dt' accessor only available for "
"DataArray with datetime64 timedelta64 dtype or "
Expand Down
34 changes: 16 additions & 18 deletions xarray/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import pandas as pd

from xarray.core import dtypes, duck_array_ops, formatting, formatting_html, ops
from xarray.core.indexing import BasicIndexer, ExplicitlyIndexed
from xarray.core.options import OPTIONS, _get_keep_attrs
from xarray.core.pycompat import is_duck_dask_array
from xarray.core.utils import Frozen, either_dict_or_kwargs, is_scalar
Expand Down Expand Up @@ -40,6 +41,7 @@
ScalarOrArray,
SideOptions,
T_DataWithCoords,
T_Variable,
)
from xarray.core.variable import Variable

Expand Down Expand Up @@ -1770,31 +1772,27 @@ def is_np_timedelta_like(dtype: DTypeLike) -> bool:
return np.issubdtype(dtype, np.timedelta64)


def _contains_cftime_datetimes(array) -> bool:
"""Check if an array contains cftime.datetime objects"""
def _contains_cftime_datetimes(array: Any) -> bool:
"""Check if a array inside a Variable contains cftime.datetime objects"""
if cftime is None:
return False
else:
if array.dtype == np.dtype("O") and array.size > 0:
sample = np.asarray(array).flat[0]
if is_duck_dask_array(sample):
sample = sample.compute()
if isinstance(sample, np.ndarray):
sample = sample.item()
return isinstance(sample, cftime.datetime)
else:
return False

if array.dtype == np.dtype("O") and array.size > 0:
first_idx = (0,) * array.ndim
if isinstance(array, ExplicitlyIndexed):
first_idx = BasicIndexer(first_idx)
sample = array[first_idx]
return isinstance(np.asarray(sample).item(), cftime.datetime)

return False

def contains_cftime_datetimes(var) -> bool:

def contains_cftime_datetimes(var: T_Variable) -> bool:
"""Check if an xarray.Variable contains cftime.datetime objects"""
if var.dtype == np.dtype("O") and var.size > 0:
return _contains_cftime_datetimes(var.data)
else:
return False
return _contains_cftime_datetimes(var._data)


def _contains_datetime_like_objects(var) -> bool:
def _contains_datetime_like_objects(var: T_Variable) -> bool:
"""Check if a variable contains datetime like objects (either
np.datetime64, np.timedelta64, or cftime.datetime)
"""
Expand Down
13 changes: 12 additions & 1 deletion xarray/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,18 @@ def __init__(self, array):
self.array = array

def __getitem__(self, key):
raise UnexpectedDataAccess("Tried accessing data")
raise UnexpectedDataAccess("Tried accessing data.")

def __array__(self):
raise UnexpectedDataAccess("Tried accessing data.")


class FirstElementAccessibleArray(InaccessibleArray):
def __getitem__(self, key):
tuple_idxr = key.tuple
if len(tuple_idxr) > 1:
raise UnexpectedDataAccess("Tried accessing more than one element.")
return self.array[tuple_idxr]


class ReturnItem:
Expand Down
1 change: 0 additions & 1 deletion xarray/tests/test_accessor_dt.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,6 @@ def test_calendar_cftime(data) -> None:
assert data.time.dt.calendar == expected


@requires_cftime
def test_calendar_datetime64_2d() -> None:
data = xr.DataArray(np.zeros((4, 5), dtype="datetime64[ns]"), dims=("x", "y"))
assert data.dt.calendar == "proleptic_gregorian"
Expand Down
2 changes: 1 addition & 1 deletion xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -2560,7 +2560,7 @@ def test_open_zarr_use_cftime(self) -> None:
ds_a = xr.open_zarr(store_target, **self.version_kwargs)
assert_identical(ds, ds_a)
ds_b = xr.open_zarr(store_target, use_cftime=True, **self.version_kwargs)
assert xr.coding.times.contains_cftime_datetimes(ds_b.time)
assert xr.coding.times.contains_cftime_datetimes(ds_b.time.variable)

def test_write_read_select_write(self) -> None:
# Test for https://github.com/pydata/xarray/issues/4084
Expand Down
4 changes: 2 additions & 2 deletions xarray/tests/test_coding.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,13 @@ def test_CFMaskCoder_missing_value() -> None:
expected.attrs["missing_value"] = -9999

decoded = xr.decode_cf(expected.to_dataset())
encoded, _ = xr.conventions.cf_encoder(decoded, decoded.attrs)
encoded, _ = xr.conventions.cf_encoder(decoded.variables, decoded.attrs)

assert_equal(encoded["tmpk"], expected.variable)

decoded.tmpk.encoding["_FillValue"] = -9940
with pytest.raises(ValueError):
encoded, _ = xr.conventions.cf_encoder(decoded, decoded.attrs)
encoded, _ = xr.conventions.cf_encoder(decoded.variables, decoded.attrs)


@requires_dask
Expand Down
27 changes: 21 additions & 6 deletions xarray/tests/test_coding_times.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from xarray.core.common import contains_cftime_datetimes
from xarray.testing import assert_equal, assert_identical
from xarray.tests import (
FirstElementAccessibleArray,
arm_xfail,
assert_array_equal,
assert_no_warnings,
Expand Down Expand Up @@ -787,35 +788,35 @@ def times_3d(times):

@requires_cftime
def test_contains_cftime_datetimes_1d(data) -> None:
assert contains_cftime_datetimes(data.time)
assert contains_cftime_datetimes(data.time.variable)


@requires_cftime
@requires_dask
def test_contains_cftime_datetimes_dask_1d(data) -> None:
assert contains_cftime_datetimes(data.time.chunk())
assert contains_cftime_datetimes(data.time.variable.chunk())


@requires_cftime
def test_contains_cftime_datetimes_3d(times_3d) -> None:
assert contains_cftime_datetimes(times_3d)
assert contains_cftime_datetimes(times_3d.variable)


@requires_cftime
@requires_dask
def test_contains_cftime_datetimes_dask_3d(times_3d) -> None:
assert contains_cftime_datetimes(times_3d.chunk())
assert contains_cftime_datetimes(times_3d.variable.chunk())


@pytest.mark.parametrize("non_cftime_data", [DataArray([]), DataArray([1, 2])])
def test_contains_cftime_datetimes_non_cftimes(non_cftime_data) -> None:
assert not contains_cftime_datetimes(non_cftime_data)
assert not contains_cftime_datetimes(non_cftime_data.variable)


@requires_dask
@pytest.mark.parametrize("non_cftime_data", [DataArray([]), DataArray([1, 2])])
def test_contains_cftime_datetimes_non_cftimes_dask(non_cftime_data) -> None:
assert not contains_cftime_datetimes(non_cftime_data.chunk())
assert not contains_cftime_datetimes(non_cftime_data.variable.chunk())


@requires_cftime
Expand Down Expand Up @@ -1176,3 +1177,17 @@ def test_scalar_unit() -> None:
variable = Variable(("x", "y"), np.array([[0, 1], [2, 3]]), {"units": np.nan})
result = coding.times.CFDatetimeCoder().decode(variable)
assert np.isnan(result.attrs["units"])


@requires_cftime
def test_contains_cftime_lazy() -> None:
import cftime

from xarray.core.common import _contains_cftime_datetimes

times = np.array(
[cftime.DatetimeGregorian(1, 1, 2, 0), cftime.DatetimeGregorian(1, 1, 2, 0)],
dtype=object,
)
array = FirstElementAccessibleArray(times)
assert _contains_cftime_datetimes(array)
4 changes: 2 additions & 2 deletions xarray/tests/test_dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -6219,7 +6219,7 @@ def test_to_and_from_iris(self) -> None:
original_coord = original.coords[orginal_key]
assert coord.var_name == original_coord.name
assert_array_equal(
coord.points, CFDatetimeCoder().encode(original_coord).values
coord.points, CFDatetimeCoder().encode(original_coord.variable).values
)
assert actual.coord_dims(coord) == original.get_axis_num(
original.coords[coord.var_name].dims
Expand Down Expand Up @@ -6295,7 +6295,7 @@ def test_to_and_from_iris_dask(self) -> None:
original_coord = original.coords[orginal_key]
assert coord.var_name == original_coord.name
assert_array_equal(
coord.points, CFDatetimeCoder().encode(original_coord).values
coord.points, CFDatetimeCoder().encode(original_coord.variable).values
)
assert actual.coord_dims(coord) == original.get_axis_num(
original.coords[coord.var_name].dims
Expand Down

0 comments on commit 798f4d4

Please sign in to comment.