From 91fda3799a3c6d0efb30335cb6f854c36d0970b2 Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Mon, 28 Nov 2022 18:38:52 -0500 Subject: [PATCH] Enable `origin` and `offset` arguments in `resample` (#7284) * Initial work toward enabling origin and offset arguments in resample * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix _convert_offset_to_timedelta * Reduce number of tests * Address initial review comments * Add more typing information * Make cftime import lazy * Fix module_available import and test * Remove old origin argument * Add type annotations for resample_cftime.py * Add None as a possibility for closed and label * Add what's new entry * Add missing type annotation * Delete added line * Fix typing errors * Add comment and test for as_timedelta stub * Remove old code * [test-upstream] Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Deepak Cherian --- doc/whats-new.rst | 4 +- xarray/coding/cftime_offsets.py | 4 + xarray/core/common.py | 42 +++- xarray/core/dataarray.py | 21 +- xarray/core/dataset.py | 21 +- xarray/core/resample_cftime.py | 259 +++++++++++++++++----- xarray/core/types.py | 8 +- xarray/tests/test_cftime_offsets.py | 6 + xarray/tests/test_cftimeindex_resample.py | 171 ++++++++++---- xarray/tests/test_groupby.py | 27 +++ 10 files changed, 456 insertions(+), 107 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index b8a2f47bcf8..48113862c67 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -21,7 +21,9 @@ v2022.11.1 (unreleased) New Features ~~~~~~~~~~~~ - +- Enable using `offset` and `origin` arguments in :py:meth:`DataArray.resample` + and :py:meth:`Dataset.resample` (:issue:`7266`, :pull:`7284`). By `Spencer + Clark `_. - Add experimental support for Zarr's in-progress V3 specification. (:pull:`6475`). By `Gregory Lee `_ and `Joe Hamman `_. 
diff --git a/xarray/coding/cftime_offsets.py b/xarray/coding/cftime_offsets.py index a029f39c7b8..04b2d773e2e 100644 --- a/xarray/coding/cftime_offsets.py +++ b/xarray/coding/cftime_offsets.py @@ -207,6 +207,10 @@ def __mul__(self, other): return new_self * other return type(self)(n=other * self.n) + def as_timedelta(self): + """All Tick subclasses must implement an as_timedelta method.""" + raise NotImplementedError + def _get_day_of_month(other, day_option): """Find the day in `other`'s month that satisfies a BaseCFTimeOffset's diff --git a/xarray/core/common.py b/xarray/core/common.py index b613db9926d..d1387d62e99 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -44,7 +44,13 @@ from .indexes import Index from .resample import Resample from .rolling_exp import RollingExp - from .types import DTypeLikeSave, ScalarOrArray, SideOptions, T_DataWithCoords + from .types import ( + DatetimeLike, + DTypeLikeSave, + ScalarOrArray, + SideOptions, + T_DataWithCoords, + ) from .variable import Variable DTypeMaybeMapping = Union[DTypeLikeSave, Mapping[Any, DTypeLikeSave]] @@ -817,7 +823,9 @@ def _resample( skipna: bool | None, closed: SideOptions | None, label: SideOptions | None, - base: int, + base: int | None, + offset: pd.Timedelta | datetime.timedelta | str | None, + origin: str | DatetimeLike, keep_attrs: bool | None, loffset: datetime.timedelta | str | None, restore_coord_dims: bool | None, @@ -845,6 +853,18 @@ def _resample( For frequencies that evenly subdivide 1 day, the "origin" of the aggregated intervals. For example, for "24H" frequency, base could range from 0 through 23. + origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, pd.Timestamp, datetime.datetime, np.datetime64, or cftime.datetime, default 'start_day' + The datetime on which to adjust the grouping. The timezone of origin + must match the timezone of the index. 
+ + If a datetime is not used, these values are also supported: + - 'epoch': `origin` is 1970-01-01 + - 'start': `origin` is the first value of the timeseries + - 'start_day': `origin` is the first day at midnight of the timeseries + - 'end': `origin` is the last value of the timeseries + - 'end_day': `origin` is the ceiling midnight of the last day + offset : pd.Timedelta, datetime.timedelta, or str, default is None + An offset timedelta added to the origin. loffset : timedelta or str, optional Offset used to adjust the resampled time labels. Some pandas date offset strings are supported. @@ -960,10 +980,24 @@ def _resample( if isinstance(self._indexes[dim_name].to_pandas_index(), CFTimeIndex): from .resample_cftime import CFTimeGrouper - grouper = CFTimeGrouper(freq, closed, label, base, loffset) + grouper = CFTimeGrouper( + freq=freq, + closed=closed, + label=label, + base=base, + loffset=loffset, + origin=origin, + offset=offset, + ) else: grouper = pd.Grouper( - freq=freq, closed=closed, label=label, base=base, loffset=loffset + freq=freq, + closed=closed, + label=label, + base=base, + offset=offset, + origin=origin, + loffset=loffset, ) group = DataArray( dim_coord, coords=dim_coord.coords, dims=dim_coord.dims, name=RESAMPLE_DIM diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index caa68bfae5c..6eac634bfff 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -78,6 +78,7 @@ from .rolling import DataArrayCoarsen, DataArrayRolling from .types import ( CoarsenBoundaryOptions, + DatetimeLike, DatetimeUnitOptions, Dims, ErrorOptions, @@ -6531,7 +6532,9 @@ def resample( skipna: bool | None = None, closed: SideOptions | None = None, label: SideOptions | None = None, - base: int = 0, + base: int | None = None, + offset: pd.Timedelta | datetime.timedelta | str | None = None, + origin: str | DatetimeLike = "start_day", keep_attrs: bool | None = None, loffset: datetime.timedelta | str | None = None, restore_coord_dims: bool | None = 
None, @@ -6555,10 +6558,22 @@ def resample( Side of each interval to treat as closed. label : {"left", "right"}, optional Side of each interval to use for labeling. - base : int, default = 0 + base : int, optional For frequencies that evenly subdivide 1 day, the "origin" of the aggregated intervals. For example, for "24H" frequency, base could range from 0 through 23. + origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, pd.Timestamp, datetime.datetime, np.datetime64, or cftime.datetime, default 'start_day' + The datetime on which to adjust the grouping. The timezone of origin + must match the timezone of the index. + + If a datetime is not used, these values are also supported: + - 'epoch': `origin` is 1970-01-01 + - 'start': `origin` is the first value of the timeseries + - 'start_day': `origin` is the first day at midnight of the timeseries + - 'end': `origin` is the last value of the timeseries + - 'end_day': `origin` is the ceiling midnight of the last day + offset : pd.Timedelta, datetime.timedelta, or str, default is None + An offset timedelta added to the origin. loffset : timedelta or str, optional Offset used to adjust the resampled time labels. Some pandas date offset strings are supported. 
@@ -6640,6 +6655,8 @@ def resample( closed=closed, label=label, base=base, + offset=offset, + origin=origin, keep_attrs=keep_attrs, loffset=loffset, restore_coord_dims=restore_coord_dims, diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 4e7a2b5603b..4f376bdf811 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -107,6 +107,7 @@ CoarsenBoundaryOptions, CombineAttrsOptions, CompatOptions, + DatetimeLike, DatetimeUnitOptions, Dims, ErrorOptions, @@ -9128,7 +9129,9 @@ def resample( skipna: bool | None = None, closed: SideOptions | None = None, label: SideOptions | None = None, - base: int = 0, + base: int | None = None, + offset: pd.Timedelta | datetime.timedelta | str | None = None, + origin: str | DatetimeLike = "start_day", keep_attrs: bool | None = None, loffset: datetime.timedelta | str | None = None, restore_coord_dims: bool | None = None, @@ -9152,10 +9155,22 @@ def resample( Side of each interval to treat as closed. label : {"left", "right"}, optional Side of each interval to use for labeling. - base : int, default = 0 + base : int, optional For frequencies that evenly subdivide 1 day, the "origin" of the aggregated intervals. For example, for "24H" frequency, base could range from 0 through 23. + origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, pd.Timestamp, datetime.datetime, np.datetime64, or cftime.datetime, default 'start_day' + The datetime on which to adjust the grouping. The timezone of origin + must match the timezone of the index. + + If a datetime is not used, these values are also supported: + - 'epoch': `origin` is 1970-01-01 + - 'start': `origin` is the first value of the timeseries + - 'start_day': `origin` is the first day at midnight of the timeseries + - 'end': `origin` is the last value of the timeseries + - 'end_day': `origin` is the ceiling midnight of the last day + offset : pd.Timedelta, datetime.timedelta, or str, default is None + An offset timedelta added to the origin. 
loffset : timedelta or str, optional Offset used to adjust the resampled time labels. Some pandas date offset strings are supported. @@ -9190,6 +9205,8 @@ def resample( closed=closed, label=label, base=base, + offset=offset, + origin=origin, keep_attrs=keep_attrs, loffset=loffset, restore_coord_dims=restore_coord_dims, diff --git a/xarray/core/resample_cftime.py b/xarray/core/resample_cftime.py index 11eceda77ee..da21fdd17cf 100644 --- a/xarray/core/resample_cftime.py +++ b/xarray/core/resample_cftime.py @@ -38,21 +38,27 @@ from __future__ import annotations import datetime +import typing import numpy as np import pandas as pd from ..coding.cftime_offsets import ( - CFTIME_TICKS, + BaseCFTimeOffset, Day, MonthEnd, QuarterEnd, + Tick, YearEnd, cftime_range, normalize_date, to_offset, ) from ..coding.cftimeindex import CFTimeIndex +from .types import SideOptions + +if typing.TYPE_CHECKING: + from .types import CFTimeDatetime class CFTimeGrouper: @@ -60,25 +66,77 @@ class CFTimeGrouper: single method, the only one required for resampling in xarray. 
It cannot be used in a call to groupby like a pandas.Grouper object can.""" - def __init__(self, freq, closed=None, label=None, base=0, loffset=None): + def __init__( + self, + freq: str | BaseCFTimeOffset, + closed: SideOptions | None = None, + label: SideOptions | None = None, + base: int | None = None, + loffset: str | datetime.timedelta | BaseCFTimeOffset | None = None, + origin: str | CFTimeDatetime = "start_day", + offset: str | datetime.timedelta | None = None, + ): + self.offset: datetime.timedelta | None + self.closed: SideOptions + self.label: SideOptions + + if base is not None and offset is not None: + raise ValueError("base and offset cannot be provided at the same time") + self.freq = to_offset(freq) - self.closed = closed - self.label = label - self.base = base self.loffset = loffset + self.origin = origin if isinstance(self.freq, (MonthEnd, QuarterEnd, YearEnd)): - if self.closed is None: + if closed is None: self.closed = "right" - if self.label is None: + else: + self.closed = closed + if label is None: self.label = "right" + else: + self.label = label + else: + # The backward resample sets ``closed`` to ``'right'`` by default + # since the last value should be considered as the edge point for + # the last bin. When origin in "end" or "end_day", the value for a + # specific ``cftime.datetime`` index stands for the resample result + # from the current ``cftime.datetime`` minus ``freq`` to the current + # ``cftime.datetime`` with a right close. 
+ if self.origin in ["end", "end_day"]: + if closed is None: + self.closed = "right" + else: + self.closed = closed + if label is None: + self.label = "right" + else: + self.label = label + else: + if closed is None: + self.closed = "left" + else: + self.closed = closed + if label is None: + self.label = "left" + else: + self.label = label + + if base is not None and isinstance(self.freq, Tick): + offset = type(self.freq)(n=base % self.freq.n).as_timedelta() + + if offset is not None: + try: + self.offset = _convert_offset_to_timedelta(offset) + except (ValueError, AttributeError) as error: + raise ValueError( + f"offset must be a datetime.timedelta object or an offset string " + f"that can be converted to a timedelta. Got {offset} instead." + ) from error else: - if self.closed is None: - self.closed = "left" - if self.label is None: - self.label = "left" + self.offset = None - def first_items(self, index): + def first_items(self, index: CFTimeIndex): """Meant to reproduce the results of the following grouper = pandas.Grouper(...) @@ -89,7 +147,7 @@ def first_items(self, index): """ datetime_bins, labels = _get_time_bins( - index, self.freq, self.closed, self.label, self.base + index, self.freq, self.closed, self.label, self.origin, self.offset ) if self.loffset is not None: if isinstance(self.loffset, datetime.timedelta): @@ -111,7 +169,14 @@ def first_items(self, index): return first_items.where(non_duplicate) -def _get_time_bins(index, freq, closed, label, base): +def _get_time_bins( + index: CFTimeIndex, + freq: BaseCFTimeOffset, + closed: SideOptions, + label: SideOptions, + origin: str | CFTimeDatetime, + offset: datetime.timedelta | None, +): """Obtain the bins and their respective labels for resampling operations. Parameters @@ -122,18 +187,26 @@ def _get_time_bins(index, freq, closed, label, base): The offset object representing target conversion a.k.a. 
resampling frequency (e.g., 'MS', '2D', 'H', or '3T' with coding.cftime_offsets.to_offset() applied to it). - closed : 'left' or 'right', optional + closed : 'left' or 'right' Which side of bin interval is closed. The default is 'left' for all frequency offsets except for 'M' and 'A', which have a default of 'right'. - label : 'left' or 'right', optional + label : 'left' or 'right' Which bin edge label to label bucket with. The default is 'left' for all frequency offsets except for 'M' and 'A', which have a default of 'right'. - base : int, optional - For frequencies that evenly subdivide 1 day, the "origin" of the - aggregated intervals. For example, for '5min' frequency, base could - range from 0 through 4. Defaults to 0. + origin : {'epoch', 'start', 'start_day', 'end', 'end_day'} or cftime.datetime, default 'start_day' + The datetime on which to adjust the grouping. The timezone of origin + must match the timezone of the index. + + If a datetime is not used, these values are also supported: + - 'epoch': `origin` is 1970-01-01 + - 'start': `origin` is the first value of the timeseries + - 'start_day': `origin` is the first day at midnight of the timeseries + - 'end': `origin` is the last value of the timeseries + - 'end_day': `origin` is the ceiling midnight of the last day + offset : datetime.timedelta, default is None + An offset timedelta added to the origin. 
Returns ------- @@ -154,7 +227,7 @@ def _get_time_bins(index, freq, closed, label, base): return datetime_bins, labels first, last = _get_range_edges( - index.min(), index.max(), freq, closed=closed, base=base + index.min(), index.max(), freq, closed=closed, origin=origin, offset=offset ) datetime_bins = labels = cftime_range( freq=freq, start=first, end=last, name=index.name @@ -172,7 +245,13 @@ def _get_time_bins(index, freq, closed, label, base): return datetime_bins, labels -def _adjust_bin_edges(datetime_bins, offset, closed, index, labels): +def _adjust_bin_edges( + datetime_bins: np.ndarray, + freq: BaseCFTimeOffset, + closed: SideOptions, + index: CFTimeIndex, + labels: np.ndarray, +): """This is required for determining the bin edges resampling with daily frequencies greater than one day, month end, and year end frequencies. @@ -207,8 +286,8 @@ def _adjust_bin_edges(datetime_bins, offset, closed, index, labels): This is also required for daily frequencies longer than one day and year-end frequencies. """ - is_super_daily = isinstance(offset, (MonthEnd, QuarterEnd, YearEnd)) or ( - isinstance(offset, Day) and offset.n > 1 + is_super_daily = isinstance(freq, (MonthEnd, QuarterEnd, YearEnd)) or ( + isinstance(freq, Day) and freq.n > 1 ) if is_super_daily: if closed == "right": @@ -220,7 +299,14 @@ def _adjust_bin_edges(datetime_bins, offset, closed, index, labels): return datetime_bins, labels -def _get_range_edges(first, last, offset, closed="left", base=0): +def _get_range_edges( + first: CFTimeDatetime, + last: CFTimeDatetime, + freq: BaseCFTimeOffset, + closed: SideOptions = "left", + origin: str | CFTimeDatetime = "start_day", + offset: datetime.timedelta | None = None, +): """Get the correct starting and ending datetimes for the resampled CFTimeIndex range. @@ -232,16 +318,24 @@ def _get_range_edges(first, last, offset, closed="left", base=0): last : cftime.datetime Uncorrected ending datetime object for resampled CFTimeIndex range. 
Usually the max of the original CFTimeIndex. - offset : xarray.coding.cftime_offsets.BaseCFTimeOffset + freq : xarray.coding.cftime_offsets.BaseCFTimeOffset The offset object representing target conversion a.k.a. resampling frequency. Contains information on offset type (e.g. Day or 'D') and offset magnitude (e.g., n = 3). - closed : 'left' or 'right', optional + closed : 'left' or 'right' Which side of bin interval is closed. Defaults to 'left'. - base : int, optional - For frequencies that evenly subdivide 1 day, the "origin" of the - aggregated intervals. For example, for '5min' frequency, base could - range from 0 through 4. Defaults to 0. + origin : {'epoch', 'start', 'start_day', 'end', 'end_day'} or cftime.datetime, default 'start_day' + The datetime on which to adjust the grouping. The timezone of origin + must match the timezone of the index. + + If a datetime is not used, these values are also supported: + - 'epoch': `origin` is 1970-01-01 + - 'start': `origin` is the first value of the timeseries + - 'start_day': `origin` is the first day at midnight of the timeseries + - 'end': `origin` is the last value of the timeseries + - 'end_day': `origin` is the ceiling midnight of the last day + offset : datetime.timedelta, default is None + An offset timedelta added to the origin. Returns ------- @@ -250,21 +344,28 @@ def _get_range_edges(first, last, offset, closed="left", base=0): last : cftime.datetime Corrected ending datetime object for resampled CFTimeIndex range. 
""" - if isinstance(offset, CFTIME_TICKS): + if isinstance(freq, Tick): first, last = _adjust_dates_anchored( - first, last, offset, closed=closed, base=base + first, last, freq, closed=closed, origin=origin, offset=offset ) return first, last else: first = normalize_date(first) last = normalize_date(last) - first = offset.rollback(first) if closed == "left" else first - offset - last = last + offset + first = freq.rollback(first) if closed == "left" else first - freq + last = last + freq return first, last -def _adjust_dates_anchored(first, last, offset, closed="right", base=0): +def _adjust_dates_anchored( + first: CFTimeDatetime, + last: CFTimeDatetime, + freq: Tick, + closed: SideOptions = "right", + origin: str | CFTimeDatetime = "start_day", + offset: datetime.timedelta | None = None, +): """First and last offsets should be calculated from the start day to fix an error cause by resampling across multiple days when a one day period is not a multiple of the frequency. @@ -276,16 +377,24 @@ def _adjust_dates_anchored(first, last, offset, closed="right", base=0): A datetime object representing the start of a CFTimeIndex range. last : cftime.datetime A datetime object representing the end of a CFTimeIndex range. - offset : xarray.coding.cftime_offsets.BaseCFTimeOffset + freq : xarray.coding.cftime_offsets.BaseCFTimeOffset The offset object representing target conversion a.k.a. resampling frequency. Contains information on offset type (e.g. Day or 'D') and offset magnitude (e.g., n = 3). - closed : 'left' or 'right', optional + closed : 'left' or 'right' Which side of bin interval is closed. Defaults to 'right'. - base : int, optional - For frequencies that evenly subdivide 1 day, the "origin" of the - aggregated intervals. For example, for '5min' frequency, base could - range from 0 through 4. Defaults to 0. + origin : {'epoch', 'start', 'start_day', 'end', 'end_day'} or cftime.datetime, default 'start_day' + The datetime on which to adjust the grouping. 
The timezone of origin + must match the timezone of the index. + + If a datetime is not used, these values are also supported: + - 'epoch': `origin` is 1970-01-01 + - 'start': `origin` is the first value of the timeseries + - 'start_day': `origin` is the first day at midnight of the timeseries + - 'end': `origin` is the last value of the timeseries + - 'end_day': `origin` is the ceiling midnight of the last day + offset : datetime.timedelta, default is None + An offset timedelta added to the origin. Returns ------- @@ -296,33 +405,59 @@ def _adjust_dates_anchored(first, last, offset, closed="right", base=0): A datetime object representing the end of a date range that has been adjusted to fix resampling errors. """ + import cftime + + if origin == "start_day": + origin_date = normalize_date(first) + elif origin == "start": + origin_date = first + elif origin == "epoch": + origin_date = type(first)(1970, 1, 1) + elif origin in ["end", "end_day"]: + origin_last = last if origin == "end" else _ceil_via_cftimeindex(last, "D") + sub_freq_times = (origin_last - first) // freq.as_timedelta() + if closed == "left": + sub_freq_times += 1 + first = origin_last - sub_freq_times * freq + origin_date = first + elif isinstance(origin, cftime.datetime): + origin_date = origin + else: + raise ValueError( + f"origin must be one of {{'epoch', 'start_day', 'start', 'end', 'end_day'}} " + f"or a cftime.datetime object. Got {origin}." 
+ ) + + if offset is not None: + origin_date = origin_date + offset + + foffset = (first - origin_date) % freq.as_timedelta() + loffset = (last - origin_date) % freq.as_timedelta() - base = base % offset.n - start_day = normalize_date(first) - base_td = type(offset)(n=base).as_timedelta() - start_day += base_td - foffset = exact_cftime_datetime_difference(start_day, first) % offset.as_timedelta() - loffset = exact_cftime_datetime_difference(start_day, last) % offset.as_timedelta() if closed == "right": if foffset.total_seconds() > 0: fresult = first - foffset else: - fresult = first - offset.as_timedelta() + fresult = first - freq.as_timedelta() if loffset.total_seconds() > 0: - lresult = last + (offset.as_timedelta() - loffset) + lresult = last + (freq.as_timedelta() - loffset) else: lresult = last else: - fresult = first - foffset if foffset.total_seconds() > 0 else first + if foffset.total_seconds() > 0: + fresult = first - foffset + else: + fresult = first + if loffset.total_seconds() > 0: - lresult = last + (offset.as_timedelta() - loffset) + lresult = last + (freq.as_timedelta() - loffset) else: - lresult = last + offset.as_timedelta() + lresult = last + freq return fresult, lresult -def exact_cftime_datetime_difference(a, b): +def exact_cftime_datetime_difference(a: CFTimeDatetime, b: CFTimeDatetime): """Exact computation of b - a Assumes: @@ -360,3 +495,19 @@ def exact_cftime_datetime_difference(a, b): seconds = int(round(seconds.total_seconds())) microseconds = b.microsecond - a.microsecond return datetime.timedelta(seconds=seconds, microseconds=microseconds) + + +def _convert_offset_to_timedelta( + offset: datetime.timedelta | str | BaseCFTimeOffset, +) -> datetime.timedelta: + if isinstance(offset, datetime.timedelta): + return offset + elif isinstance(offset, (str, Tick)): + return to_offset(offset).as_timedelta() + else: + raise ValueError + + +def _ceil_via_cftimeindex(date: CFTimeDatetime, freq: str | BaseCFTimeOffset): + index = CFTimeIndex([date]) 
+ return index.ceil(freq).item() diff --git a/xarray/core/types.py b/xarray/core/types.py index 7579148e4c2..adf046dabb2 100644 --- a/xarray/core/types.py +++ b/xarray/core/types.py @@ -1,5 +1,6 @@ from __future__ import annotations +import datetime from typing import ( TYPE_CHECKING, Any, @@ -17,6 +18,7 @@ ) import numpy as np +import pandas as pd from packaging.version import Version if TYPE_CHECKING: @@ -82,7 +84,11 @@ def dtype(self) -> np.dtype: # anything with a dtype attribute _SupportsDType, ] - + try: + from cftime import datetime as CFTimeDatetime + except ImportError: + CFTimeDatetime = Any + DatetimeLike = Union[pd.Timestamp, datetime.datetime, np.datetime64, CFTimeDatetime] else: Self: Any = None DTypeLikeSave: Any = None diff --git a/xarray/tests/test_cftime_offsets.py b/xarray/tests/test_cftime_offsets.py index 075393e84e7..d28f4594559 100644 --- a/xarray/tests/test_cftime_offsets.py +++ b/xarray/tests/test_cftime_offsets.py @@ -1385,3 +1385,9 @@ def test_date_range_like_errors(): match="'source' must be a 1D array of datetime objects for inferring its range.", ): date_range_like(da, "noleap") + + +def as_timedelta_not_implemented_error(): + tick = Tick() + with pytest.raises(NotImplementedError): + tick.as_timedelta() diff --git a/xarray/tests/test_cftimeindex_resample.py b/xarray/tests/test_cftimeindex_resample.py index 35447a39f3c..e780421e09e 100644 --- a/xarray/tests/test_cftimeindex_resample.py +++ b/xarray/tests/test_cftimeindex_resample.py @@ -9,7 +9,7 @@ import xarray as xr from xarray.core.resample_cftime import CFTimeGrouper -pytest.importorskip("cftime") +cftime = pytest.importorskip("cftime") # Create a list of pairs of similar-length initial and resample frequencies @@ -50,7 +50,63 @@ ] -def da(index): +def compare_against_pandas( + da_datetimeindex, + da_cftimeindex, + freq, + closed=None, + label=None, + base=None, + offset=None, + origin=None, + loffset=None, +) -> None: + if isinstance(origin, tuple): + origin_pandas = 
pd.Timestamp(datetime.datetime(*origin)) + origin_cftime = cftime.DatetimeGregorian(*origin) + else: + origin_pandas = origin + origin_cftime = origin + + try: + result_datetimeindex = da_datetimeindex.resample( + time=freq, + closed=closed, + label=label, + base=base, + loffset=loffset, + offset=offset, + origin=origin_pandas, + ).mean() + except ValueError: + with pytest.raises(ValueError): + da_cftimeindex.resample( + time=freq, + closed=closed, + label=label, + base=base, + loffset=loffset, + origin=origin_cftime, + offset=offset, + ).mean() + else: + result_cftimeindex = da_cftimeindex.resample( + time=freq, + closed=closed, + label=label, + base=base, + loffset=loffset, + origin=origin_cftime, + offset=offset, + ).mean() + # TODO (benbovy - flexible indexes): update when CFTimeIndex is a xarray Index subclass + result_cftimeindex["time"] = ( + result_cftimeindex.xindexes["time"].to_pandas_index().to_datetimeindex() + ) + xr.testing.assert_identical(result_cftimeindex, result_datetimeindex) + + +def da(index) -> xr.DataArray: return xr.DataArray( np.arange(100.0, 100.0 + index.size), coords=[index], dims=["time"] ) @@ -59,53 +115,31 @@ def da(index): @pytest.mark.parametrize("freqs", FREQS, ids=lambda x: "{}->{}".format(*x)) @pytest.mark.parametrize("closed", [None, "left", "right"]) @pytest.mark.parametrize("label", [None, "left", "right"]) -@pytest.mark.parametrize("base", [24, 31]) -def test_resample(freqs, closed, label, base) -> None: +@pytest.mark.parametrize( + ("base", "offset"), [(24, None), (31, None), (None, "5S")], ids=lambda x: f"{x}" +) +def test_resample(freqs, closed, label, base, offset) -> None: initial_freq, resample_freq = freqs start = "2000-01-01T12:07:01" + loffset = "12H" + origin = "start" index_kwargs = dict(start=start, periods=5, freq=initial_freq) datetime_index = pd.date_range(**index_kwargs) cftime_index = xr.cftime_range(**index_kwargs) + da_datetimeindex = da(datetime_index) + da_cftimeindex = da(cftime_index) - loffset = "12H" 
- try: - da_datetime = ( - da(datetime_index) - .resample( - time=resample_freq, - closed=closed, - label=label, - base=base, - loffset=loffset, - ) - .mean() - ) - except ValueError: - with pytest.raises(ValueError): - da(cftime_index).resample( - time=resample_freq, - closed=closed, - label=label, - base=base, - loffset=loffset, - ).mean() - else: - da_cftime = ( - da(cftime_index) - .resample( - time=resample_freq, - closed=closed, - label=label, - base=base, - loffset=loffset, - ) - .mean() - ) - # TODO (benbovy - flexible indexes): update when CFTimeIndex is a xarray Index subclass - da_cftime["time"] = ( - da_cftime.xindexes["time"].to_pandas_index().to_datetimeindex() - ) - xr.testing.assert_identical(da_cftime, da_datetime) + compare_against_pandas( + da_datetimeindex, + da_cftimeindex, + resample_freq, + closed=closed, + label=label, + base=base, + offset=offset, + origin=origin, + loffset=loffset, + ) @pytest.mark.parametrize( @@ -153,3 +187,54 @@ def test_calendars(calendar) -> None: # TODO (benbovy - flexible indexes): update when CFTimeIndex is a xarray Index subclass da_cftime["time"] = da_cftime.xindexes["time"].to_pandas_index().to_datetimeindex() xr.testing.assert_identical(da_cftime, da_datetime) + + +@pytest.mark.parametrize("closed", ["left", "right"]) +@pytest.mark.parametrize( + "origin", + ["start_day", "start", "end", "end_day", "epoch", (1970, 1, 1, 3, 2)], + ids=lambda x: f"{x}", +) +def test_origin(closed, origin) -> None: + initial_freq, resample_freq = ("3H", "9H") + start = "1969-12-31T12:07:01" + index_kwargs = dict(start=start, periods=12, freq=initial_freq) + datetime_index = pd.date_range(**index_kwargs) + cftime_index = xr.cftime_range(**index_kwargs) + da_datetimeindex = da(datetime_index) + da_cftimeindex = da(cftime_index) + + compare_against_pandas( + da_datetimeindex, + da_cftimeindex, + resample_freq, + closed=closed, + origin=origin, + ) + + +def test_base_and_offset_error(): + cftime_index = xr.cftime_range("2000", 
periods=5) + da_cftime = da(cftime_index) + with pytest.raises(ValueError, match="base and offset cannot"): + da_cftime.resample(time="2D", base=3, offset="5S") + + +@pytest.mark.parametrize("offset", ["foo", "5MS", 10]) +def test_invalid_offset_error(offset) -> None: + cftime_index = xr.cftime_range("2000", periods=5) + da_cftime = da(cftime_index) + with pytest.raises(ValueError, match="offset must be"): + da_cftime.resample(time="2D", offset=offset) + + +def test_timedelta_offset() -> None: + timedelta = datetime.timedelta(seconds=5) + string = "5S" + + cftime_index = xr.cftime_range("2000", periods=5) + da_cftime = da(cftime_index) + + timedelta_result = da_cftime.resample(time="2D", offset=timedelta).mean() + string_result = da_cftime.resample(time="2D", offset=string).mean() + xr.testing.assert_identical(timedelta_result, string_result) diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index d647c82a76b..063dc22e633 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -1810,6 +1810,33 @@ def test_upsample_interpolate_dask(self, chunked_time): # done here due to floating point arithmetic assert_allclose(expected, actual, rtol=1e-16) + def test_resample_base(self) -> None: + times = pd.date_range("2000-01-01T02:03:01", freq="6H", periods=10) + array = DataArray(np.arange(10), [("time", times)]) + + base = 11 + actual = array.resample(time="24H", base=base).mean() + expected = DataArray(array.to_series().resample("24H", base=base).mean()) + assert_identical(expected, actual) + + def test_resample_offset(self) -> None: + times = pd.date_range("2000-01-01T02:03:01", freq="6H", periods=10) + array = DataArray(np.arange(10), [("time", times)]) + + offset = pd.Timedelta("11H") + actual = array.resample(time="24H", offset=offset).mean() + expected = DataArray(array.to_series().resample("24H", offset=offset).mean()) + assert_identical(expected, actual) + + def test_resample_origin(self) -> None: + times = 
pd.date_range("2000-01-01T02:03:01", freq="6H", periods=10) + array = DataArray(np.arange(10), [("time", times)]) + + origin = "start" + actual = array.resample(time="24H", origin=origin).mean() + expected = DataArray(array.to_series().resample("24H", origin=origin).mean()) + assert_identical(expected, actual) + class TestDatasetResample: def test_resample_and_first(self):