diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index f619284437b..ac326342115 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -52,6 +52,13 @@ Deprecations

 New Features
 ~~~~~~~~~~~~
+- Xarray now leverages updates as of cftime version 1.4.1, which enable exact I/O
+  roundtripping of ``cftime.datetime`` objects (:pull:`4758`).
+  By `Spencer Clark <https://github.com/spencerkclark>`_.
+- :py:meth:`~xarray.cftime_range` and :py:meth:`DataArray.resample` now support
+  millisecond (``"L"`` or ``"ms"``) and microsecond (``"U"`` or ``"us"``) frequencies
+  for ``cftime.datetime`` coordinates (:issue:`4097`, :pull:`4758`).
+  By `Spencer Clark <https://github.com/spencerkclark>`_.
 - Significantly higher ``unstack`` performance on numpy-backed arrays which contain
   missing values; 8x faster in our benchmark, and 2x faster than pandas.
   (:pull:`4746`);
diff --git a/xarray/coding/cftime_offsets.py b/xarray/coding/cftime_offsets.py
index 177a0fd831b..c25d5296c41 100644
--- a/xarray/coding/cftime_offsets.py
+++ b/xarray/coding/cftime_offsets.py
@@ -576,6 +576,26 @@ def __apply__(self, other):
         return other + self.as_timedelta()


+class Millisecond(BaseCFTimeOffset):
+    _freq = "L"
+
+    def as_timedelta(self):
+        return timedelta(milliseconds=self.n)
+
+    def __apply__(self, other):
+        return other + self.as_timedelta()
+
+
+class Microsecond(BaseCFTimeOffset):
+    _freq = "U"
+
+    def as_timedelta(self):
+        return timedelta(microseconds=self.n)
+
+    def __apply__(self, other):
+        return other + self.as_timedelta()
+
+
 _FREQUENCIES = {
     "A": YearEnd,
     "AS": YearBegin,
@@ -590,6 +610,10 @@ def __apply__(self, other):
     "T": Minute,
     "min": Minute,
     "S": Second,
+    "L": Millisecond,
+    "ms": Millisecond,
+    "U": Microsecond,
+    "us": Microsecond,
     "AS-JAN": partial(YearBegin, month=1),
     "AS-FEB": partial(YearBegin, month=2),
     "AS-MAR": partial(YearBegin, month=3),
@@ -824,7 +848,7 @@ def cftime_range(
       `ISO-8601 format <https://en.wikipedia.org/wiki/ISO_8601>`_.
     - It supports many, but not all, frequencies supported by
       ``pandas.date_range``. For example it does not currently support any of
-      the business-related, semi-monthly, or sub-second frequencies.
+      the business-related or semi-monthly frequencies.
     - Compound sub-monthly frequencies are not supported, e.g. '1H1min', as
       these can easily be written in terms of the finest common resolution,
       e.g. '61min'.
@@ -855,6 +879,10 @@ def cftime_range(
     +--------+--------------------------+
     | S      | Second frequency         |
     +--------+--------------------------+
+    | L, ms  | Millisecond frequency    |
+    +--------+--------------------------+
+    | U, us  | Microsecond frequency    |
+    +--------+--------------------------+

     Any multiples of the following anchored offsets are also supported.
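
For illustration (this aside is not part of the patch), a minimal sketch of how the new sub-second
frequencies registered above are used, assuming cftime is installed; the dates and calendar chosen
here are arbitrary:

    import xarray as xr

    # "ms"/"L" map to the new Millisecond offset and "us"/"U" to Microsecond via the
    # _FREQUENCIES registry extended in this patch.
    times_ms = xr.cftime_range("2000-01-01", periods=3, freq="ms", calendar="noleap")
    times_us = xr.cftime_range("2000-01-01", periods=3, freq="2us")
    print(times_ms)
    print(times_us)

Multiples such as "2ms" or "2us" parse to Millisecond(n=2) / Microsecond(n=2), as the new
to_offset test cases further down show.
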
diff --git a/xarray/coding/times.py b/xarray/coding/times.py
index 39ad2f57c1e..a1822393dc1 100644
--- a/xarray/coding/times.py
+++ b/xarray/coding/times.py
@@ -1,6 +1,6 @@
 import re
 import warnings
-from datetime import datetime
+from datetime import datetime, timedelta
 from distutils.version import LooseVersion
 from functools import partial

@@ -35,6 +35,26 @@
     "D": int(1e9) * 60 * 60 * 24,
 }

+_US_PER_TIME_DELTA = {
+    "microseconds": 1,
+    "milliseconds": 1_000,
+    "seconds": 1_000_000,
+    "minutes": 60 * 1_000_000,
+    "hours": 60 * 60 * 1_000_000,
+    "days": 24 * 60 * 60 * 1_000_000,
+}
+
+_NETCDF_TIME_UNITS_CFTIME = [
+    "days",
+    "hours",
+    "minutes",
+    "seconds",
+    "milliseconds",
+    "microseconds",
+]
+
+_NETCDF_TIME_UNITS_NUMPY = _NETCDF_TIME_UNITS_CFTIME + ["nanoseconds"]
+
 TIME_UNITS = frozenset(
     [
         "days",
@@ -225,9 +245,7 @@ def decode_cf_datetime(num_dates, units, calendar=None, use_cftime=None):
             if calendar in _STANDARD_CALENDARS:
                 dates = cftime_to_nptime(dates)
     elif use_cftime:
-        dates = _decode_datetime_with_cftime(
-            flat_num_dates.astype(float), units, calendar
-        )
+        dates = _decode_datetime_with_cftime(flat_num_dates, units, calendar)
     else:
         dates = _decode_datetime_with_pandas(flat_num_dates, units, calendar)

@@ -262,25 +280,33 @@ def decode_cf_timedelta(num_timedeltas, units):
     return result.reshape(num_timedeltas.shape)


+def _unit_timedelta_cftime(units):
+    return timedelta(microseconds=_US_PER_TIME_DELTA[units])
+
+
+def _unit_timedelta_numpy(units):
+    numpy_units = _netcdf_to_numpy_timeunit(units)
+    return np.timedelta64(_NS_PER_TIME_DELTA[numpy_units], "ns")
+
+
 def _infer_time_units_from_diff(unique_timedeltas):
-    # Note that the modulus operator was only implemented for np.timedelta64
-    # arrays as of NumPy version 1.16.0. Once our minimum version of NumPy
-    # supported is greater than or equal to this we will no longer need to cast
-    # unique_timedeltas to a TimedeltaIndex. In the meantime, however, the
-    # modulus operator works for TimedeltaIndex objects.
-    unique_deltas_as_index = pd.TimedeltaIndex(unique_timedeltas)
-    for time_unit in [
-        "days",
-        "hours",
-        "minutes",
-        "seconds",
-        "milliseconds",
-        "microseconds",
-        "nanoseconds",
-    ]:
-        delta_ns = _NS_PER_TIME_DELTA[_netcdf_to_numpy_timeunit(time_unit)]
-        unit_delta = np.timedelta64(delta_ns, "ns")
-        if np.all(unique_deltas_as_index % unit_delta == np.timedelta64(0, "ns")):
+    if unique_timedeltas.dtype == np.dtype("O"):
+        time_units = _NETCDF_TIME_UNITS_CFTIME
+        unit_timedelta = _unit_timedelta_cftime
+        zero_timedelta = timedelta(microseconds=0)
+        timedeltas = unique_timedeltas
+    else:
+        time_units = _NETCDF_TIME_UNITS_NUMPY
+        unit_timedelta = _unit_timedelta_numpy
+        zero_timedelta = np.timedelta64(0, "ns")
+        # Note that the modulus operator was only implemented for np.timedelta64
+        # arrays as of NumPy version 1.16.0. Once our minimum version of NumPy
+        # supported is greater than or equal to this we will no longer need to cast
+        # unique_timedeltas to a TimedeltaIndex. In the meantime, however, the
+        # modulus operator works for TimedeltaIndex objects.
+        timedeltas = pd.TimedeltaIndex(unique_timedeltas)
+    for time_unit in time_units:
+        if np.all(timedeltas % unit_timedelta(time_unit) == zero_timedelta):
             return time_unit
     return "seconds"

@@ -309,10 +335,6 @@ def infer_datetime_units(dates):
         reference_date = dates[0] if len(dates) > 0 else "1970-01-01"
         reference_date = format_cftime_datetime(reference_date)
     unique_timedeltas = np.unique(np.diff(dates))
-    if unique_timedeltas.dtype == np.dtype("O"):
-        # Convert to np.timedelta64 objects using pandas to work around a
-        # NumPy casting bug: https://github.com/numpy/numpy/issues/11096
-        unique_timedeltas = to_timedelta_unboxed(unique_timedeltas)
     units = _infer_time_units_from_diff(unique_timedeltas)
     return f"{units} since {reference_date}"

diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py
index 7c18f1a8c8a..a7761aefa3d 100644
--- a/xarray/tests/__init__.py
+++ b/xarray/tests/__init__.py
@@ -68,6 +68,7 @@ def LooseVersion(vstring):
 has_pseudonetcdf, requires_pseudonetcdf = _importorskip("PseudoNetCDF")
 has_cftime, requires_cftime = _importorskip("cftime")
 has_cftime_1_1_0, requires_cftime_1_1_0 = _importorskip("cftime", minversion="1.1.0.0")
+has_cftime_1_4_1, requires_cftime_1_4_1 = _importorskip("cftime", minversion="1.4.1")
 has_dask, requires_dask = _importorskip("dask")
 has_bottleneck, requires_bottleneck = _importorskip("bottleneck")
 has_nc_time_axis, requires_nc_time_axis = _importorskip("nc_time_axis")
diff --git a/xarray/tests/test_cftime_offsets.py b/xarray/tests/test_cftime_offsets.py
index 3efcf8039c6..b1ecf059f2f 100644
--- a/xarray/tests/test_cftime_offsets.py
+++ b/xarray/tests/test_cftime_offsets.py
@@ -10,6 +10,8 @@
     BaseCFTimeOffset,
     Day,
     Hour,
+    Microsecond,
+    Millisecond,
     Minute,
     MonthBegin,
     MonthEnd,
@@ -181,6 +183,14 @@ def test_to_offset_offset_input(offset):
         ("2min", Minute(n=2)),
         ("S", Second()),
         ("2S", Second(n=2)),
+        ("L", Millisecond(n=1)),
+        ("2L", Millisecond(n=2)),
+        ("ms", Millisecond(n=1)),
+        ("2ms", Millisecond(n=2)),
+        ("U", Microsecond(n=1)),
+        ("2U", Microsecond(n=2)),
+        ("us", Microsecond(n=1)),
+        ("2us", Microsecond(n=2)),
     ],
     ids=_id_func,
 )
@@ -299,6 +309,8 @@ def test_to_cftime_datetime_error_type_error():
     Hour(),
     Minute(),
     Second(),
+    Millisecond(),
+    Microsecond(),
 ]
 _EQ_TESTS_B = [
     BaseCFTimeOffset(n=2),
@@ -316,6 +328,8 @@ def test_to_cftime_datetime_error_type_error():
     Hour(n=2),
     Minute(n=2),
     Second(n=2),
+    Millisecond(n=2),
+    Microsecond(n=2),
 ]


@@ -340,6 +354,8 @@ def test_neq(a, b):
     Hour(n=2),
     Minute(n=2),
     Second(n=2),
+    Millisecond(n=2),
+    Microsecond(n=2),
 ]


@@ -360,6 +376,8 @@ def test_eq(a, b):
     (Hour(), Hour(n=3)),
     (Minute(), Minute(n=3)),
     (Second(), Second(n=3)),
+    (Millisecond(), Millisecond(n=3)),
+    (Microsecond(), Microsecond(n=3)),
 ]


@@ -387,6 +405,8 @@ def test_rmul(offset, expected):
         (Hour(), Hour(n=-1)),
         (Minute(), Minute(n=-1)),
         (Second(), Second(n=-1)),
+        (Millisecond(), Millisecond(n=-1)),
+        (Microsecond(), Microsecond(n=-1)),
     ],
     ids=_id_func,
 )
@@ -399,6 +419,8 @@ def test_neg(offset, expected):
     (Hour(n=2), (1, 1, 1, 2)),
     (Minute(n=2), (1, 1, 1, 0, 2)),
     (Second(n=2), (1, 1, 1, 0, 0, 2)),
+    (Millisecond(n=2), (1, 1, 1, 0, 0, 0, 2000)),
+    (Microsecond(n=2), (1, 1, 1, 0, 0, 0, 2)),
 ]


@@ -427,6 +449,8 @@ def test_radd_sub_monthly(offset, expected_date_args, calendar):
         (Hour(n=2), (1, 1, 2, 22)),
         (Minute(n=2), (1, 1, 2, 23, 58)),
         (Second(n=2), (1, 1, 2, 23, 59, 58)),
+        (Millisecond(n=2), (1, 1, 2, 23, 59, 59, 998000)),
+        (Microsecond(n=2), (1, 1, 2, 23, 59, 59, 999998)),
     ],
     ids=_id_func,
 )
@@ -802,6 +826,8 @@ def test_add_quarter_end_onOffset(
         ((1, 1, 1), Hour(), True),
         ((1, 1, 1), Minute(), True),
         ((1, 1, 1), Second(), True),
+        ((1, 1, 1), Millisecond(), True),
+        ((1, 1, 1), Microsecond(), True),
     ],
     ids=_id_func,
 )
@@ -865,6 +891,8 @@ def test_onOffset_month_or_quarter_or_year_end(
         (Hour(), (1, 3, 2, 1, 1), (1, 3, 2, 1, 1)),
         (Minute(), (1, 3, 2, 1, 1, 1), (1, 3, 2, 1, 1, 1)),
         (Second(), (1, 3, 2, 1, 1, 1, 1), (1, 3, 2, 1, 1, 1, 1)),
+        (Millisecond(), (1, 3, 2, 1, 1, 1, 1000), (1, 3, 2, 1, 1, 1, 1000)),
+        (Microsecond(), (1, 3, 2, 1, 1, 1, 1), (1, 3, 2, 1, 1, 1, 1)),
     ],
     ids=_id_func,
 )
@@ -914,6 +942,8 @@ def test_rollforward(calendar, offset, initial_date_args, partial_expected_date_
         (Hour(), (1, 3, 2, 1, 1), (1, 3, 2, 1, 1)),
         (Minute(), (1, 3, 2, 1, 1, 1), (1, 3, 2, 1, 1, 1)),
         (Second(), (1, 3, 2, 1, 1, 1, 1), (1, 3, 2, 1, 1, 1, 1)),
+        (Millisecond(), (1, 3, 2, 1, 1, 1, 1000), (1, 3, 2, 1, 1, 1, 1000)),
+        (Microsecond(), (1, 3, 2, 1, 1, 1, 1), (1, 3, 2, 1, 1, 1, 1)),
     ],
     ids=_id_func,
 )
diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py
index d8412f82374..eda32d31148 100644
--- a/xarray/tests/test_coding_times.py
+++ b/xarray/tests/test_coding_times.py
@@ -1,4 +1,5 @@
 import warnings
+from datetime import timedelta
 from itertools import product

 import numpy as np
@@ -6,7 +7,15 @@
 import pytest
 from pandas.errors import OutOfBoundsDatetime

-from xarray import DataArray, Dataset, Variable, coding, conventions, decode_cf
+from xarray import (
+    DataArray,
+    Dataset,
+    Variable,
+    cftime_range,
+    coding,
+    conventions,
+    decode_cf,
+)
 from xarray.coding.times import (
     _encode_datetime_with_cftime,
     cftime_to_nptime,
@@ -19,7 +28,15 @@
 from xarray.core.common import contains_cftime_datetimes
 from xarray.testing import assert_equal

-from . import arm_xfail, assert_array_equal, has_cftime, requires_cftime, requires_dask
+from . import (
+    arm_xfail,
+    assert_array_equal,
+    has_cftime,
+    has_cftime_1_4_1,
+    requires_cftime,
+    requires_cftime_1_4_1,
+    requires_dask,
+)

 _NON_STANDARD_CALENDARS_SET = {
     "noleap",
@@ -973,8 +990,13 @@ def test_decode_ambiguous_time_warns(calendar):

 @pytest.mark.parametrize("encoding_units", FREQUENCIES_TO_ENCODING_UNITS.values())
 @pytest.mark.parametrize("freq", FREQUENCIES_TO_ENCODING_UNITS.keys())
-def test_encode_cf_datetime_defaults_to_correct_dtype(encoding_units, freq):
-    times = pd.date_range("2000", periods=3, freq=freq)
+@pytest.mark.parametrize("date_range", [pd.date_range, cftime_range])
+def test_encode_cf_datetime_defaults_to_correct_dtype(encoding_units, freq, date_range):
+    if not has_cftime_1_4_1 and date_range == cftime_range:
+        pytest.skip("Test requires cftime 1.4.1.")
+    if (freq == "N" or encoding_units == "nanoseconds") and date_range == cftime_range:
+        pytest.skip("Nanosecond frequency is not valid for cftime dates.")
+    times = date_range("2000", periods=3, freq=freq)
     units = f"{encoding_units} since 2000-01-01"
     encoded, _, _ = coding.times.encode_cf_datetime(times, units)

@@ -987,7 +1009,7 @@ def test_encode_cf_datetime_defaults_to_correct_dtype(encoding_units, freq):

 @pytest.mark.parametrize("freq", FREQUENCIES_TO_ENCODING_UNITS.keys())
-def test_encode_decode_roundtrip(freq):
+def test_encode_decode_roundtrip_datetime64(freq):
     # See GH 4045. Prior to GH 4684 this test would fail for frequencies of
     # "S", "L", "U", and "N".
initial_time = pd.date_range("1678-01-01", periods=1) @@ -998,6 +1020,19 @@ def test_encode_decode_roundtrip(freq): assert_equal(variable, decoded) +@requires_cftime_1_4_1 +@pytest.mark.parametrize("freq", ["U", "L", "S", "T", "H", "D"]) +def test_encode_decode_roundtrip_cftime(freq): + initial_time = cftime_range("0001", periods=1) + times = initial_time.append( + cftime_range("0001", periods=2, freq=freq) + timedelta(days=291000 * 365) + ) + variable = Variable(["time"], times) + encoded = conventions.encode_cf_variable(variable) + decoded = conventions.decode_cf_variable("time", encoded, use_cftime=True) + assert_equal(variable, decoded) + + @requires_cftime def test__encode_datetime_with_cftime(): # See GH 4870. cftime versions > 1.4.0 required us to adapt the
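
For illustration (again outside the patch itself), the exact-roundtrip behaviour exercised by
test_encode_decode_roundtrip_cftime can be sketched with the same public entry points the test
uses; the dates, calendar, and frequency below are assumptions, and cftime >= 1.4.1 is needed
for the equality to be exact:

    from xarray import Variable, cftime_range, conventions
    from xarray.testing import assert_equal

    # A cftime-backed time coordinate with microsecond spacing.
    times = cftime_range("2000-01-01", periods=3, freq="us", calendar="noleap")
    variable = Variable(["time"], times)

    # Encoding infers "microseconds since ..." units; decoding with
    # use_cftime=True should reproduce the original values exactly.
    encoded = conventions.encode_cf_variable(variable)
    decoded = conventions.decode_cf_variable("time", encoded, use_cftime=True)
    assert_equal(variable, decoded)
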