diff --git a/doc/user-guide/weather-climate.rst b/doc/user-guide/weather-climate.rst index 3c957978acf..793da9d1bdd 100644 --- a/doc/user-guide/weather-climate.rst +++ b/doc/user-guide/weather-climate.rst @@ -233,7 +233,7 @@ For data indexed by a :py:class:`~xarray.CFTimeIndex` xarray currently supports: .. ipython:: python - da.resample(time="81T", closed="right", label="right", base=3).mean() + da.resample(time="81T", closed="right", label="right", offset="3T").mean() .. _Timestamp-valid range: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timestamp-limitations .. _ISO 8601 standard: https://en.wikipedia.org/wiki/ISO_8601 diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 3cc2efde599..52e58782a5b 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -34,6 +34,13 @@ Breaking changes Deprecations ~~~~~~~~~~~~ +- Following pandas, the ``base`` and ``loffset`` parameters of + :py:meth:`xr.DataArray.resample` and :py:meth:`xr.Dataset.resample` have been + deprecated and will be removed in a future version of xarray. Using the + ``origin`` or ``offset`` parameters is recommended as a replacement for using + the ``base`` parameter and using time offset arithmetic is recommended as a + replacement for using the ``loffset`` parameter (:pull:`8459`). By `Spencer + Clark <https://github.com/spencerkclark>`_. 
Bug fixes diff --git a/xarray/core/common.py b/xarray/core/common.py index d980e622763..af935ae15d2 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -13,8 +13,14 @@ from xarray.core import dtypes, duck_array_ops, formatting, formatting_html, ops from xarray.core.indexing import BasicIndexer, ExplicitlyIndexed from xarray.core.options import OPTIONS, _get_keep_attrs +from xarray.core.pdcompat import _convert_base_to_offset from xarray.core.pycompat import is_duck_dask_array -from xarray.core.utils import Frozen, either_dict_or_kwargs, is_scalar +from xarray.core.utils import ( + Frozen, + either_dict_or_kwargs, + emit_user_level_warning, + is_scalar, +) try: import cftime @@ -845,6 +851,12 @@ def _resample( For frequencies that evenly subdivide 1 day, the "origin" of the aggregated intervals. For example, for "24H" frequency, base could range from 0 through 23. + + .. deprecated:: 2023.03.0 + Following pandas, the ``base`` parameter is deprecated in favor + of the ``origin`` and ``offset`` parameters, and will be removed + in a future version of xarray. + origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, pd.Timestamp, datetime.datetime, np.datetime64, or cftime.datetime, default 'start_day' The datetime on which to adjust the grouping. The timezone of origin must match the timezone of the index. @@ -860,6 +872,12 @@ def _resample( loffset : timedelta or str, optional Offset used to adjust the resampled time labels. Some pandas date offset strings are supported. + + .. deprecated:: 2023.03.0 + Following pandas, the ``loffset`` parameter is deprecated in favor + of using time offset arithmetic, and will be removed in a future + version of xarray. + restore_coord_dims : bool, optional If True, also restore the dimension order of multi-dimensional coordinates. @@ -930,8 +948,8 @@ def _resample( """ # TODO support non-string indexer after removing the old API. 
- from xarray.coding.cftimeindex import CFTimeIndex from xarray.core.dataarray import DataArray + from xarray.core.groupby import TimeResampleGrouper from xarray.core.resample import RESAMPLE_DIM if keep_attrs is not None: @@ -961,28 +979,36 @@ def _resample( dim_name: Hashable = dim dim_coord = self[dim] - if isinstance(self._indexes[dim_name].to_pandas_index(), CFTimeIndex): - from xarray.core.resample_cftime import CFTimeGrouper - - grouper = CFTimeGrouper( - freq=freq, - closed=closed, - label=label, - base=base, - loffset=loffset, - origin=origin, - offset=offset, + if loffset is not None: + emit_user_level_warning( + "Following pandas, the `loffset` parameter to resample will be deprecated " + "in a future version of xarray. Switch to using time offset arithmetic.", + FutureWarning, ) - else: - grouper = pd.Grouper( - freq=freq, - closed=closed, - label=label, - base=base, - offset=offset, - origin=origin, - loffset=loffset, + + if base is not None: + emit_user_level_warning( + "Following pandas, the `base` parameter to resample will be deprecated in " + "a future version of xarray. 
Switch to using `origin` or `offset` instead.", + FutureWarning, ) + + if base is not None and offset is not None: + raise ValueError("base and offset cannot be present at the same time") + + if base is not None: + index = self._indexes[dim_name].to_pandas_index() + offset = _convert_base_to_offset(base, freq, index) + + grouper = TimeResampleGrouper( + freq=freq, + closed=closed, + label=label, + origin=origin, + offset=offset, + loffset=loffset, + ) + group = DataArray( dim_coord, coords=dim_coord.coords, dims=dim_coord.dims, name=RESAMPLE_DIM ) diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 5bfa0229af5..15694b41219 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -40,6 +40,7 @@ from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset + from xarray.core.types import DatetimeLike, SideOptions from xarray.core.utils import Frozen GroupKey = Any @@ -245,7 +246,10 @@ def _unique_and_monotonic(group: T_Group) -> bool: return index.is_unique and index.is_monotonic_increasing -def _apply_loffset(grouper, result): +def _apply_loffset( + loffset: str | pd.DateOffset | datetime.timedelta | pd.Timedelta, + result: pd.Series | pd.DataFrame, +): """ (copied from pandas) if loffset is set, offset the result index @@ -258,17 +262,25 @@ def _apply_loffset(grouper, result): result : Series or DataFrame the result of resample """ + # pd.Timedelta is a subclass of datetime.timedelta so we do not need to + # include it in instance checks. + if not isinstance(loffset, (str, pd.DateOffset, datetime.timedelta)): + raise ValueError( + f"`loffset` must be a str, pd.DateOffset, datetime.timedelta, or pandas.Timedelta object. " + f"Got {loffset}." 
+ ) + + if isinstance(loffset, str): + loffset = pd.tseries.frequencies.to_offset(loffset) needs_offset = ( - isinstance(grouper.loffset, (pd.DateOffset, datetime.timedelta)) + isinstance(loffset, (pd.DateOffset, datetime.timedelta)) and isinstance(result.index, pd.DatetimeIndex) and len(result.index) > 0 ) if needs_offset: - result.index = result.index + grouper.loffset - - grouper.loffset = None + result.index = result.index + loffset class GroupBy(Generic[T_Xarray]): @@ -530,14 +542,7 @@ def __repr__(self) -> str: ) def _get_index_and_items(self, index, grouper): - from xarray.core.resample_cftime import CFTimeGrouper - - s = pd.Series(np.arange(index.size), index) - if isinstance(grouper, CFTimeGrouper): - first_items = grouper.first_items(index) - else: - first_items = s.groupby(grouper).first() - _apply_loffset(grouper, first_items) + first_items = grouper.first_items(index) full_index = first_items.index if first_items.isnull().any(): first_items = first_items.dropna() @@ -1365,3 +1370,50 @@ class DatasetGroupBy( # type: ignore[misc] ImplementsDatasetReduce, ): __slots__ = () + + +class TimeResampleGrouper: + def __init__( + self, + freq: str, + closed: SideOptions | None, + label: SideOptions | None, + origin: str | DatetimeLike, + offset: pd.Timedelta | datetime.timedelta | str | None, + loffset: datetime.timedelta | str | None, + ): + self.freq = freq + self.closed = closed + self.label = label + self.origin = origin + self.offset = offset + self.loffset = loffset + + def first_items(self, index): + from xarray import CFTimeIndex + from xarray.core.resample_cftime import CFTimeGrouper + + if isinstance(index, CFTimeIndex): + grouper = CFTimeGrouper( + freq=self.freq, + closed=self.closed, + label=self.label, + origin=self.origin, + offset=self.offset, + loffset=self.loffset, + ) + return grouper.first_items(index) + else: + s = pd.Series(np.arange(index.size), index) + grouper = pd.Grouper( + freq=self.freq, + closed=self.closed, + label=self.label, + 
origin=self.origin, + offset=self.offset, + ) + + first_items = s.groupby(grouper).first() + if self.loffset is not None: + _apply_loffset(self.loffset, first_items) + return first_items diff --git a/xarray/core/pdcompat.py b/xarray/core/pdcompat.py index 018bb19b871..b20a96bb8d6 100644 --- a/xarray/core/pdcompat.py +++ b/xarray/core/pdcompat.py @@ -38,6 +38,10 @@ from enum import Enum from typing import Literal +import pandas as pd + +from xarray.coding import cftime_offsets + def count_not_none(*args) -> int: """Compute the number of non-None arguments. @@ -68,3 +72,22 @@ def __repr__(self) -> str: _NoDefault.no_default ) # Sentinel indicating the default value following pandas NoDefault = Literal[_NoDefault.no_default] # For typing following pandas + + +def _convert_base_to_offset(base, freq, index): + """Required until we officially deprecate the base argument to resample. This + translates a provided `base` argument to an `offset` argument, following logic + from pandas. + """ + from xarray.coding.cftimeindex import CFTimeIndex + + if isinstance(index, pd.DatetimeIndex): + freq = pd.tseries.frequencies.to_offset(freq) + if isinstance(freq, pd.offsets.Tick): + return pd.Timedelta(base * freq.nanos // freq.n) + elif isinstance(index, CFTimeIndex): + freq = cftime_offsets.to_offset(freq) + if isinstance(freq, cftime_offsets.Tick): + return base * freq.as_timedelta() // freq.n + else: + raise ValueError("Can only resample using a DatetimeIndex or CFTimeIndex.") diff --git a/xarray/core/resample_cftime.py b/xarray/core/resample_cftime.py index 7fdd372ec74..920a6873814 100644 --- a/xarray/core/resample_cftime.py +++ b/xarray/core/resample_cftime.py @@ -71,7 +71,6 @@ def __init__( freq: str | BaseCFTimeOffset, closed: SideOptions | None = None, label: SideOptions | None = None, - base: int | None = None, loffset: str | datetime.timedelta | BaseCFTimeOffset | None = None, origin: str | CFTimeDatetime = "start_day", offset: str | datetime.timedelta | None = None, @@ 
-79,10 +78,6 @@ def __init__( self.offset: datetime.timedelta | None self.closed: SideOptions self.label: SideOptions - - if base is not None and offset is not None: - raise ValueError("base and offset cannot be provided at the same time") - self.freq = to_offset(freq) self.loffset = loffset self.origin = origin @@ -122,9 +117,6 @@ def __init__( else: self.label = label - if base is not None and isinstance(self.freq, Tick): - offset = type(self.freq)(n=base % self.freq.n).as_timedelta() - if offset is not None: try: self.offset = _convert_offset_to_timedelta(offset) @@ -150,6 +142,16 @@ def first_items(self, index: CFTimeIndex): index, self.freq, self.closed, self.label, self.origin, self.offset ) if self.loffset is not None: + if not isinstance( + self.loffset, (str, datetime.timedelta, BaseCFTimeOffset) + ): + # BaseCFTimeOffset is not public API so we do not include it in + # the error message for now. + raise ValueError( + f"`loffset` must be a str or datetime.timedelta object. " + f"Got {self.loffset}." 
+ ) + if isinstance(self.loffset, datetime.timedelta): labels = labels + self.loffset else: diff --git a/xarray/tests/test_cftimeindex_resample.py b/xarray/tests/test_cftimeindex_resample.py index 5f818b7663d..07bc14f8983 100644 --- a/xarray/tests/test_cftimeindex_resample.py +++ b/xarray/tests/test_cftimeindex_resample.py @@ -8,6 +8,7 @@ import pytest import xarray as xr +from xarray.core.pdcompat import _convert_base_to_offset from xarray.core.resample_cftime import CFTimeGrouper cftime = pytest.importorskip("cftime") @@ -130,17 +131,18 @@ def test_resample(freqs, closed, label, base, offset) -> None: da_datetimeindex = da(datetime_index) da_cftimeindex = da(cftime_index) - compare_against_pandas( - da_datetimeindex, - da_cftimeindex, - resample_freq, - closed=closed, - label=label, - base=base, - offset=offset, - origin=origin, - loffset=loffset, - ) + with pytest.warns(FutureWarning, match="`loffset` parameter"): + compare_against_pandas( + da_datetimeindex, + da_cftimeindex, + resample_freq, + closed=closed, + label=label, + base=base, + offset=offset, + origin=origin, + loffset=loffset, + ) @pytest.mark.parametrize( @@ -245,3 +247,43 @@ def test_timedelta_offset() -> None: timedelta_result = da_cftime.resample(time="2D", offset=timedelta).mean() string_result = da_cftime.resample(time="2D", offset=string).mean() xr.testing.assert_identical(timedelta_result, string_result) + + +@pytest.mark.parametrize("loffset", ["12H", datetime.timedelta(hours=-12)]) +def test_resample_loffset_cftimeindex(loffset) -> None: + datetimeindex = pd.date_range("2000-01-01", freq="6H", periods=10) + da_datetimeindex = xr.DataArray(np.arange(10), [("time", datetimeindex)]) + + cftimeindex = xr.cftime_range("2000-01-01", freq="6H", periods=10) + da_cftimeindex = xr.DataArray(np.arange(10), [("time", cftimeindex)]) + + with pytest.warns(FutureWarning, match="`loffset` parameter"): + result = da_cftimeindex.resample(time="24H", loffset=loffset).mean() + expected = 
da_datetimeindex.resample(time="24H", loffset=loffset).mean() + + result["time"] = result.xindexes["time"].to_pandas_index().to_datetimeindex() + xr.testing.assert_identical(result, expected) + + +def test_resample_invalid_loffset_cftimeindex() -> None: + times = xr.cftime_range("2000-01-01", freq="6H", periods=10) + da = xr.DataArray(np.arange(10), [("time", times)]) + + with pytest.raises(ValueError): + da.resample(time="24H", loffset=1) # type: ignore + + +@pytest.mark.parametrize(("base", "freq"), [(1, "10S"), (17, "3H"), (15, "5U")]) +def test__convert_base_to_offset(base, freq): + # Verify that the cftime_offset adapted version of _convert_base_to_offset + # produces the same result as the pandas version. + datetimeindex = pd.date_range("2000", periods=2) + cftimeindex = xr.cftime_range("2000", periods=2) + pandas_result = _convert_base_to_offset(base, freq, datetimeindex) + cftime_result = _convert_base_to_offset(base, freq, cftimeindex) + assert pandas_result.to_pytimedelta() == cftime_result + + +def test__convert_base_to_offset_invalid_index(): + with pytest.raises(ValueError, match="Can only resample"): + _convert_base_to_offset(1, "12H", pd.Index([0])) diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index cec37560d8f..a7d98405017 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -1,5 +1,6 @@ from __future__ import annotations +import datetime import warnings import numpy as np @@ -16,6 +17,7 @@ assert_equal, assert_identical, create_test_data, + has_pandas_version_two, requires_dask, requires_flox, requires_scipy, @@ -1475,14 +1477,6 @@ def test_resample(self): actual = array.resample(time="24H").reduce(np.mean) assert_identical(expected, actual) - # Our use of `loffset` may change if we align our API with pandas' changes. 
- # ref https://github.com/pydata/xarray/pull/4537 - actual = array.resample(time="24H", loffset="-12H").mean() - expected_ = array.to_series().resample("24H").mean() - expected_.index += to_offset("-12H") - expected = DataArray.from_series(expected_) - assert_identical(actual, expected) - with pytest.raises(ValueError, match=r"index must be monotonic"): array[[2, 0, 1]].resample(time="1D") @@ -1802,12 +1796,15 @@ def test_upsample_interpolate_dask(self, chunked_time): # done here due to floating point arithmetic assert_allclose(expected, actual, rtol=1e-16) + @pytest.mark.skipif(has_pandas_version_two, reason="requires pandas < 2.0.0") def test_resample_base(self) -> None: times = pd.date_range("2000-01-01T02:03:01", freq="6H", periods=10) array = DataArray(np.arange(10), [("time", times)]) base = 11 - actual = array.resample(time="24H", base=base).mean() + + with pytest.warns(FutureWarning, match="the `base` parameter to resample"): + actual = array.resample(time="24H", base=base).mean() expected = DataArray(array.to_series().resample("24H", base=base).mean()) assert_identical(expected, actual) @@ -1829,6 +1826,32 @@ def test_resample_origin(self) -> None: expected = DataArray(array.to_series().resample("24H", origin=origin).mean()) assert_identical(expected, actual) + @pytest.mark.skipif(has_pandas_version_two, reason="requires pandas < 2.0.0") + @pytest.mark.parametrize( + "loffset", + [ + "-12H", + datetime.timedelta(hours=-12), + pd.Timedelta(hours=-12), + pd.DateOffset(hours=-12), + ], + ) + def test_resample_loffset(self, loffset) -> None: + times = pd.date_range("2000-01-01", freq="6H", periods=10) + array = DataArray(np.arange(10), [("time", times)]) + + with pytest.warns(FutureWarning, match="`loffset` parameter"): + actual = array.resample(time="24H", loffset=loffset).mean() + expected = DataArray(array.to_series().resample("24H", loffset=loffset).mean()) + assert_identical(actual, expected) + + def test_resample_invalid_loffset(self) -> None: + times = 
pd.date_range("2000-01-01", freq="6H", periods=10) + array = DataArray(np.arange(10), [("time", times)]) + + with pytest.raises(ValueError, match="`loffset` must be"): + array.resample(time="24H", loffset=1).mean() # type: ignore + class TestDatasetResample: def test_resample_and_first(self):