From 637fb879ac1b24d0e9676a4d49a93d894c65cad0 Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Sat, 12 Dec 2020 10:50:50 -0500 Subject: [PATCH 01/18] Use integers when possible to encode/decode times --- xarray/coding/times.py | 29 ++++++++++++---- xarray/tests/test_coding_times.py | 55 ++++++++++++++++++++++++++++++- 2 files changed, 77 insertions(+), 7 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 59f8b89743a..99a5912727a 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -26,6 +26,7 @@ _STANDARD_CALENDARS = {"standard", "gregorian", "proleptic_gregorian"} _NS_PER_TIME_DELTA = { + "ns": 1, "us": int(1e3), "ms": int(1e6), "s": int(1e9), @@ -44,6 +45,7 @@ def _netcdf_to_numpy_timeunit(units): if not units.endswith("s"): units = "%ss" % units return { + "nanoseconds": "ns", "microseconds": "us", "milliseconds": "ms", "seconds": "s", @@ -162,10 +164,9 @@ def _decode_datetime_with_pandas(flat_num_dates, units, calendar): # Cast input dates to integers of nanoseconds because `pd.to_datetime` # works much faster when dealing with integers # make _NS_PER_TIME_DELTA an array to ensure type upcasting - flat_num_dates_ns_int = ( - flat_num_dates.astype(np.float64) * _NS_PER_TIME_DELTA[delta] - ).astype(np.int64) - + flat_num_dates_ns_int = (flat_num_dates * _NS_PER_TIME_DELTA[delta]).astype( + np.int64 + ) return (pd.to_timedelta(flat_num_dates_ns_int, "ns") + ref_date).values @@ -252,7 +253,15 @@ def decode_cf_timedelta(num_timedeltas, units): def _infer_time_units_from_diff(unique_timedeltas): - for time_unit in ["days", "hours", "minutes", "seconds"]: + for time_unit in [ + "days", + "hours", + "minutes", + "seconds", + "milliseconds", + "microseconds", + "nanoseconds", + ]: delta_ns = _NS_PER_TIME_DELTA[_netcdf_to_numpy_timeunit(time_unit)] unit_delta = np.timedelta64(delta_ns, "ns") diffs = unique_timedeltas / unit_delta @@ -416,7 +425,15 @@ def encode_cf_datetime(dates, units=None, calendar=None): # Wrap the dates in a DatetimeIndex to do the subtraction to ensure # an OverflowError is raised if the ref_date is too far away from # dates to be encoded (GH 2272). - num = (pd.DatetimeIndex(dates.ravel()) - ref_date) / time_delta + dates_as_index = pd.DatetimeIndex(dates.ravel()) + time_deltas = dates_as_index - ref_date + + # Use floor division if the time_delta evenly divides all differences + # to preserve integer dtype if possible. + if np.all(time_deltas % time_delta == np.timedelta64(0, delta_units)): + num = time_deltas // time_delta + else: + num = time_deltas / time_delta num = num.values.reshape(dates.shape) except (OutOfBoundsDatetime, OverflowError): diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index d35cad019b7..56478498c37 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -6,7 +6,7 @@ import pytest from pandas.errors import OutOfBoundsDatetime -from xarray import DataArray, Dataset, Variable, coding, decode_cf +from xarray import DataArray, Dataset, Variable, coding, conventions, decode_cf from xarray.coding.times import ( cftime_to_nptime, decode_cf_datetime, @@ -958,3 +958,56 @@ def test_decode_ambiguous_time_warns(calendar): assert not record np.testing.assert_array_equal(result, expected) + + +@pytest.mark.parametrize( + ("freq", "units"), + [ + ("D", "days"), + ("D", "hours"), + ("D", "minutes"), + ("D", "seconds"), + ("D", "milliseconds"), + ("D", "microseconds"), + ("D", "nanoseconds"), + ("H", "hours"), + ("H", "minutes"), + ("H", "seconds"), + ("H", "milliseconds"), + ("H", "microseconds"), + ("H", "nanoseconds"), + ("T", "minutes"), + ("T", "seconds"), + ("T", "milliseconds"), + ("T", "microseconds"), + ("T", "nanoseconds"), + ("S", "seconds"), + ("S", "milliseconds"), + ("S", "microseconds"), + ("S", "nanoseconds"), + ("L", "milliseconds"), + ("L", "microseconds"), + ("L", "nanoseconds"), + ("U", "microseconds"), + ("U", "nanoseconds"), + ("N", "nanoseconds"), + ], +) +def test_encode_cf_datetime_preserves_integer_units(freq, units): + times = pd.date_range("2000", periods=3, freq=freq) + cf_units = f"{units} since 2000-01-01" + encoded, _, _ = coding.times.encode_cf_datetime(times, cf_units) + assert encoded.dtype == np.int64 + + +@pytest.mark.parametrize( + "encoding", + [{"units": "nanoseconds since 1900-01-01"}, {}], + ids=["explicit-units", "no-units"], +) +def test_nanosecond_resolution_roundtrip(encoding): + times = pd.date_range("2000", periods=12, freq="337N") + variable = Variable(["time"], times, encoding=encoding) + encoded = conventions.encode_cf_variable(variable) + decoded = conventions.decode_cf_variable("time", encoded) + assert_equal(variable, decoded) From e2efadbe3b5e2f130123d0b12109abd71aa461cc Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Sat, 12 Dec 2020 14:39:06 -0500 Subject: [PATCH 02/18] Improve tests --- xarray/tests/test_coding_times.py | 73 ++++++++++++++----------------- 1 file changed, 34 insertions(+), 39 deletions(-) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 56478498c37..b31f4f1b27b 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -960,44 +960,39 @@ def test_decode_ambiguous_time_warns(calendar): np.testing.assert_array_equal(result, expected) -@pytest.mark.parametrize( - ("freq", "units"), - [ - ("D", "days"), - ("D", "hours"), - ("D", "minutes"), - ("D", "seconds"), - ("D", "milliseconds"), - ("D", "microseconds"), - ("D", "nanoseconds"), - ("H", "hours"), - ("H", "minutes"), - ("H", "seconds"), - ("H", "milliseconds"), - ("H", "microseconds"), - ("H", "nanoseconds"), - ("T", "minutes"), - ("T", "seconds"), - ("T", "milliseconds"), - ("T", "microseconds"), - ("T", "nanoseconds"), - ("S", "seconds"), - ("S", "milliseconds"), - ("S", "microseconds"), - ("S", "nanoseconds"), - ("L", "milliseconds"), - ("L", "microseconds"), - ("L", "nanoseconds"), - ("U", "microseconds"), - ("U", "nanoseconds"), - ("N", "nanoseconds"), - ], -) -def test_encode_cf_datetime_preserves_integer_units(freq, units): - times = pd.date_range("2000", periods=3, freq=freq) - cf_units = f"{units} since 2000-01-01" - encoded, _, _ = coding.times.encode_cf_datetime(times, cf_units) - assert encoded.dtype == np.int64 +ENCODING_UNITS = [ + "days", + "hours", + "minutes", + "seconds", + "milliseconds", + "microseconds", + "nanoseconds", +] +NUMPY_FREQUENCIES = [ + np.timedelta64(1, "D"), + np.timedelta64(1, "h"), + np.timedelta64(1, "m"), + np.timedelta64(1, "s"), + np.timedelta64(1, "ms"), + np.timedelta64(1, "us"), + np.timedelta64(1, "ns"), +] + + +@pytest.mark.parametrize("encoding_units", ENCODING_UNITS) +@pytest.mark.parametrize("data_freq", NUMPY_FREQUENCIES, ids=lambda x: f"{x!r}") +def test_encode_cf_datetime_defaults_to_correct_dtype(encoding_units, data_freq): + times = pd.date_range("2000", periods=3, freq=pd.to_timedelta(data_freq)) + units = f"{encoding_units} since 2000-01-01" + encoded, _, _ = coding.times.encode_cf_datetime(times, units) + + numpy_timeunit = coding.times._netcdf_to_numpy_timeunit(encoding_units) + cf_units_as_timedelta = np.timedelta64(1, numpy_timeunit) + if data_freq >= cf_units_as_timedelta: + assert encoded.dtype == np.int64 + else: + assert encoded.dtype == np.float64 @pytest.mark.parametrize( @@ -1005,7 +1000,7 @@ def test_encode_cf_datetime_preserves_integer_units(freq, units): [{"units": "nanoseconds since 1900-01-01"}, {}], ids=["explicit-units", "no-units"], ) -def test_nanosecond_resolution_roundtrip(encoding): +def test_encode_decode_roundtrip(encoding): times = pd.date_range("2000", periods=12, freq="337N") variable = Variable(["time"], times, encoding=encoding) encoded = conventions.encode_cf_variable(variable) From 6a14aa09b146007506509ef3ef08ca7a15afe1c4 Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Sat, 12 Dec 2020 14:51:02 -0500 Subject: [PATCH 03/18] Further improvements to tests --- xarray/tests/test_coding_times.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index b31f4f1b27b..548caee3aed 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -995,14 +995,11 @@ def test_encode_cf_datetime_defaults_to_correct_dtype(encoding_units, data_freq) assert encoded.dtype == np.float64 -@pytest.mark.parametrize( - "encoding", - [{"units": "nanoseconds since 1900-01-01"}, {}], - ids=["explicit-units", "no-units"], -) -def test_encode_decode_roundtrip(encoding): - times = pd.date_range("2000", periods=12, freq="337N") - variable = Variable(["time"], times, encoding=encoding) +@pytest.mark.parametrize("freq", ["N", "U", "L", "S", "T", "H", "D"]) +def test_encode_decode_roundtrip(freq): + initial_time = pd.date_range("1678-01-01", periods=1) + times = initial_time.append(pd.date_range("1968", periods=12, freq=freq)) + variable = Variable(["time"], times) encoded = conventions.encode_cf_variable(variable) decoded = conventions.decode_cf_variable("time", encoded) assert_equal(variable, decoded) From 9b8cfdaf129d000611a91cfc29a28dc3f53c8a8e Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Sat, 12 Dec 2020 15:31:25 -0500 Subject: [PATCH 04/18] Remove optimization in favor of maximum correctness --- xarray/coding/times.py | 20 +++++--------------- xarray/tests/test_coding_times.py | 31 +++++++++++++++++++++---------- 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 99a5912727a..380f286c261 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -153,21 +153,11 @@ def _decode_datetime_with_pandas(flat_num_dates, units, calendar): # strings, in which case we fall back to using cftime raise OutOfBoundsDatetime - # fixes: https://github.com/pydata/pandas/issues/14068 - # these lines check if the the lowest or the highest value in dates - # cause an OutOfBoundsDatetime (Overflow) error - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "invalid value encountered", RuntimeWarning) - pd.to_timedelta(flat_num_dates.min(), delta) + ref_date - pd.to_timedelta(flat_num_dates.max(), delta) + ref_date - - # Cast input dates to integers of nanoseconds because `pd.to_datetime` - # works much faster when dealing with integers - # make _NS_PER_TIME_DELTA an array to ensure type upcasting - flat_num_dates_ns_int = (flat_num_dates * _NS_PER_TIME_DELTA[delta]).astype( - np.int64 - ) - return (pd.to_timedelta(flat_num_dates_ns_int, "ns") + ref_date).values + # Use pd.to_timedelta to safely cast integer or float values to timedeltas, + # and add those to a Timestamp to safely produce a DatetimeIndex. This + # ensures that we do not encounter integer overflow at any point in the + # process without raising an OutOfBoundsDatetime or OverflowError. + return (pd.to_timedelta(flat_num_dates, delta) + ref_date).values def decode_cf_datetime(num_dates, units, calendar=None, use_cftime=None): diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 548caee3aed..d1967c72ea0 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -99,10 +99,12 @@ def test_cf_datetime(num_dates, units, calendar): if min_y >= 1678 and max_y < 2262: expected = cftime_to_nptime(expected) + print(expected) with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") actual = coding.times.decode_cf_datetime(num_dates, units, calendar) + print(actual) abs_diff = np.asarray(abs(actual - expected)).ravel() abs_diff = pd.to_timedelta(abs_diff.tolist()).to_numpy() @@ -479,27 +481,36 @@ def test_decoded_cf_datetime_array_2d(): assert_array_equal(np.asarray(result), expected) +PANDAS_FREQUENCIES_TO_ENCODING_UNITS = { + "N": "nanoseconds", + "U": "microseconds", + "L": "milliseconds", + "S": "seconds", + "T": "minutes", + "H": "hours", + "D": "days" +} + + +@pytest.mark.parametrize(("freq", "units"), PANDAS_FREQUENCIES_TO_ENCODING_UNITS.items()) +def test_infer_datetime_units(freq, units): + dates = pd.date_range("2000", periods=2, freq=freq) + expected = f"{units} since 2000-01-01 00:00:00" + assert expected == coding.times.infer_datetime_units(dates) + + @pytest.mark.parametrize( ["dates", "expected"], [ - (pd.date_range("1900-01-01", periods=5), "days since 1900-01-01 00:00:00"), - ( - pd.date_range("1900-01-01 12:00:00", freq="H", periods=2), - "hours since 1900-01-01 12:00:00", - ), ( pd.to_datetime(["1900-01-01", "1900-01-02", "NaT"]), "days since 1900-01-01 00:00:00", ), - ( - pd.to_datetime(["1900-01-01", "1900-01-02T00:00:00.005"]), - "seconds since 1900-01-01 00:00:00", - ), (pd.to_datetime(["NaT", "1900-01-01"]), "days since 1900-01-01 00:00:00"), (pd.to_datetime(["NaT"]), "days since 1970-01-01 00:00:00"), ], ) -def test_infer_datetime_units(dates, expected): +def test_infer_datetime_units_with_NaT(dates, expected): assert expected == coding.times.infer_datetime_units(dates) From ce4bfb587aaef3e096b94b6dc3062fdc283aa24a Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Sat, 12 Dec 2020 15:39:01 -0500 Subject: [PATCH 05/18] Remove print statements --- xarray/tests/test_coding_times.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index d1967c72ea0..a74d3f57dbd 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -99,12 +99,10 @@ def test_cf_datetime(num_dates, units, calendar): if min_y >= 1678 and max_y < 2262: expected = cftime_to_nptime(expected) - print(expected) with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") actual = coding.times.decode_cf_datetime(num_dates, units, calendar) - print(actual) abs_diff = np.asarray(abs(actual - expected)).ravel() abs_diff = pd.to_timedelta(abs_diff.tolist()).to_numpy() From 032b577f033b017a3cfd8ea5b1cd3a4799cdaf6d Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Sat, 12 Dec 2020 16:00:41 -0500 Subject: [PATCH 06/18] Restore optimization --- xarray/coding/times.py | 18 +++++++++++++++--- xarray/tests/test_coding_times.py | 6 ++++-- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 380f286c261..052ebed69a6 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -153,11 +153,23 @@ def _decode_datetime_with_pandas(flat_num_dates, units, calendar): # strings, in which case we fall back to using cftime raise OutOfBoundsDatetime - # Use pd.to_timedelta to safely cast integer or float values to timedeltas, + # To avoid integer overflow when converting to nanosecond units for integer + # dtypes smaller than np.int64 cast all integer-dtype arrays to np.int64 + # (GH #2002). + if flat_num_dates.dtype.kind == "i": + flat_num_dates = flat_num_dates.astype(np.int64) + + # Cast input ordinals to integers of nanoseconds because pd.to_timedelta + # works much faster when dealing with integers (GH #1399). + flat_num_dates_ns_int = (flat_num_dates * _NS_PER_TIME_DELTA[delta]).astype( + np.int64 + ) + + # Use pd.to_timedelta to safely cast integer values to timedeltas, # and add those to a Timestamp to safely produce a DatetimeIndex. This # ensures that we do not encounter integer overflow at any point in the - # process without raising an OutOfBoundsDatetime or OverflowError. - return (pd.to_timedelta(flat_num_dates, delta) + ref_date).values + # process without raising OutOfBoundsDatetime. + return (pd.to_timedelta(flat_num_dates_ns_int, "ns") + ref_date).values def decode_cf_datetime(num_dates, units, calendar=None, use_cftime=None): diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index a74d3f57dbd..7110bb48bb2 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -486,11 +486,13 @@ def test_decoded_cf_datetime_array_2d(): "S": "seconds", "T": "minutes", "H": "hours", - "D": "days" + "D": "days", } -@pytest.mark.parametrize(("freq", "units"), PANDAS_FREQUENCIES_TO_ENCODING_UNITS.items()) +@pytest.mark.parametrize( + ("freq", "units"), PANDAS_FREQUENCIES_TO_ENCODING_UNITS.items() +) def test_infer_datetime_units(freq, units): dates = pd.date_range("2000", periods=2, freq=freq) expected = f"{units} since 2000-01-01 00:00:00" From f3a870f11189f91a5387e4ecaba3058226685c91 Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Sat, 12 Dec 2020 16:29:16 -0500 Subject: [PATCH 07/18] Add a what's new entry --- doc/whats-new.rst | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 433e0185a91..c7b6c70b8cd 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -32,7 +32,14 @@ Bug fixes ~~~~~~~~~ - :py:func:`merge` with ``combine_attrs='override'`` makes a copy of the attrs (:issue:`4627`). - +- By default, when possible, xarray will now always use values of type ``int64`` when encoding + and decoding ``numpy.datetime64[ns]`` datetimes. This ensures that maximum + precision and accuracy is maintained in the round-tripping process + (:issue:`4045`). It also enables encoding and decoding standard calendar + dates with time units of nanoseconds (:pull:`4400`). By `Spencer Clark + `_ and `Mark Harfouche `_. + + Documentation ~~~~~~~~~~~~~ - start a list of external I/O integrating with ``xarray`` (:issue:`683`, :pull:`4566`). From 2740483e4da52995eb7839bbd91c74c84e1e9e73 Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Sat, 12 Dec 2020 16:39:34 -0500 Subject: [PATCH 08/18] Add test for decoding timedeltas with nanosecond units too --- xarray/coding/times.py | 10 +++++++++- xarray/tests/test_coding_times.py | 1 + 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 052ebed69a6..e01815a500c 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -36,7 +36,15 @@ } TIME_UNITS = frozenset( - ["days", "hours", "minutes", "seconds", "milliseconds", "microseconds"] + [ + "days", + "hours", + "minutes", + "seconds", + "milliseconds", + "microseconds", + "nanoseconds", + ] ) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 7110bb48bb2..092268c9d7b 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -546,6 +546,7 @@ def test_infer_cftime_datetime_units(calendar, date_args, expected): ("1h", "hours", np.int64(1)), ("1ms", "milliseconds", np.int64(1)), ("1us", "microseconds", np.int64(1)), + ("1ns", "nanoseconds", np.int64(1)), (["NaT", "0s", "1s"], None, [np.nan, 0, 1]), (["30m", "60m"], "hours", [0.5, 1.0]), ("NaT", "days", np.nan), From 0c2b41b5789ffadb9d80b894a7c8ab3778ecaaec Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Sat, 12 Dec 2020 16:48:21 -0500 Subject: [PATCH 09/18] Some minor cleanups --- doc/whats-new.rst | 4 ++-- xarray/tests/test_coding_times.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index c7b6c70b8cd..ceeb4d16c56 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -34,8 +34,8 @@ Bug fixes - :py:func:`merge` with ``combine_attrs='override'`` makes a copy of the attrs (:issue:`4627`). - By default, when possible, xarray will now always use values of type ``int64`` when encoding and decoding ``numpy.datetime64[ns]`` datetimes. This ensures that maximum - precision and accuracy is maintained in the round-tripping process - (:issue:`4045`). It also enables encoding and decoding standard calendar + precision and accuracy are maintained in the round-tripping process + (:issue:`4045`, :pull:`4684`). It also enables encoding and decoding standard calendar dates with time units of nanoseconds (:pull:`4400`). By `Spencer Clark `_ and `Mark Harfouche `_. diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 092268c9d7b..85fa4fd8a70 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -1000,8 +1000,8 @@ def test_encode_cf_datetime_defaults_to_correct_dtype(encoding_units, data_freq) encoded, _, _ = coding.times.encode_cf_datetime(times, units) numpy_timeunit = coding.times._netcdf_to_numpy_timeunit(encoding_units) - cf_units_as_timedelta = np.timedelta64(1, numpy_timeunit) - if data_freq >= cf_units_as_timedelta: + encoding_units_as_timedelta = np.timedelta64(1, numpy_timeunit) + if data_freq >= encoding_units_as_timedelta: assert encoded.dtype == np.int64 else: assert encoded.dtype == np.float64 From db487cef951e80f131803528c1eedacd352eef63 Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Sat, 12 Dec 2020 17:08:23 -0500 Subject: [PATCH 10/18] Add comment to motivate new test --- xarray/coding/times.py | 2 +- xarray/tests/test_coding_times.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index e01815a500c..950245c077a 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -438,7 +438,7 @@ def encode_cf_datetime(dates, units=None, calendar=None): dates_as_index = pd.DatetimeIndex(dates.ravel()) time_deltas = dates_as_index - ref_date - # Use floor division if the time_delta evenly divides all differences + # Use floor division if time_delta evenly divides all differences # to preserve integer dtype if possible. if np.all(time_deltas % time_delta == np.timedelta64(0, delta_units)): num = time_deltas // time_delta diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 85fa4fd8a70..93ac9b12521 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -1009,8 +1009,10 @@ def test_encode_cf_datetime_defaults_to_correct_dtype(encoding_units, data_freq) @pytest.mark.parametrize("freq", ["N", "U", "L", "S", "T", "H", "D"]) def test_encode_decode_roundtrip(freq): + # See GH #4045. Prior to #4684 this test would fail for frequencies of "S", + # "L", "U", and "N". initial_time = pd.date_range("1678-01-01", periods=1) - times = initial_time.append(pd.date_range("1968", periods=12, freq=freq)) + times = initial_time.append(pd.date_range("1968", periods=2, freq=freq)) variable = Variable(["time"], times) encoded = conventions.encode_cf_variable(variable) decoded = conventions.decode_cf_variable("time", encoded) From 480388224b5d45026a265f99f6893b1e80cb72c8 Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Sat, 12 Dec 2020 19:49:51 -0500 Subject: [PATCH 11/18] Add some print statements to try and debug things on Windows --- xarray/coding/times.py | 6 ++++++ xarray/tests/test_coding_times.py | 2 ++ 2 files changed, 8 insertions(+) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 950245c077a..4af034f8f55 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -164,14 +164,20 @@ def _decode_datetime_with_pandas(flat_num_dates, units, calendar): # To avoid integer overflow when converting to nanosecond units for integer # dtypes smaller than np.int64 cast all integer-dtype arrays to np.int64 # (GH #2002). + print(f"flat_num_dates dtype prior to casting {flat_num_dates.dtype}") if flat_num_dates.dtype.kind == "i": flat_num_dates = flat_num_dates.astype(np.int64) + print(f"flat_num_dates dtype after casting {flat_num_dates.dtype}") # Cast input ordinals to integers of nanoseconds because pd.to_timedelta # works much faster when dealing with integers (GH #1399). + print("flat_num_dates_ns_int prior to casting") + print(flat_num_dates * _NS_PER_TIME_DELTA[delta]) flat_num_dates_ns_int = (flat_num_dates * _NS_PER_TIME_DELTA[delta]).astype( np.int64 ) + print("flat_num_dates_ns_int after casting") + print(flat_num_dates_ns_int) # Use pd.to_timedelta to safely cast integer values to timedeltas, # and add those to a Timestamp to safely produce a DatetimeIndex. This diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 93ac9b12521..e082e993081 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -1016,4 +1016,6 @@ def test_encode_decode_roundtrip(freq): variable = Variable(["time"], times) encoded = conventions.encode_cf_variable(variable) decoded = conventions.decode_cf_variable("time", encoded) + print(encoded) + print(decoded) assert_equal(variable, decoded) From ce6d072616a1af878224ae7ac1947ccfeb725d5e Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Sat, 12 Dec 2020 20:57:25 -0500 Subject: [PATCH 12/18] xfail round-trip test on Windows; remove print statements --- xarray/coding/times.py | 6 ------ xarray/tests/test_coding_times.py | 6 ++++-- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 4af034f8f55..950245c077a 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -164,20 +164,14 @@ def _decode_datetime_with_pandas(flat_num_dates, units, calendar): # To avoid integer overflow when converting to nanosecond units for integer # dtypes smaller than np.int64 cast all integer-dtype arrays to np.int64 # (GH #2002). - print(f"flat_num_dates dtype prior to casting {flat_num_dates.dtype}") if flat_num_dates.dtype.kind == "i": flat_num_dates = flat_num_dates.astype(np.int64) - print(f"flat_num_dates dtype after casting {flat_num_dates.dtype}") # Cast input ordinals to integers of nanoseconds because pd.to_timedelta # works much faster when dealing with integers (GH #1399). - print("flat_num_dates_ns_int prior to casting") - print(flat_num_dates * _NS_PER_TIME_DELTA[delta]) flat_num_dates_ns_int = (flat_num_dates * _NS_PER_TIME_DELTA[delta]).astype( np.int64 ) - print("flat_num_dates_ns_int after casting") - print(flat_num_dates_ns_int) # Use pd.to_timedelta to safely cast integer values to timedeltas, # and add those to a Timestamp to safely produce a DatetimeIndex. This diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index e082e993081..0482bdfe8e8 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -16,6 +16,7 @@ from xarray.coding.variables import SerializationWarning from xarray.conventions import _update_bounds_attributes, cf_encoder from xarray.core.common import contains_cftime_datetimes +from xarray.tests.test_backends import ON_WINDOWS from xarray.testing import assert_equal from . import arm_xfail, assert_array_equal, has_cftime, requires_cftime, requires_dask @@ -1008,6 +1009,9 @@ def test_encode_cf_datetime_defaults_to_correct_dtype(encoding_units, data_freq) @pytest.mark.parametrize("freq", ["N", "U", "L", "S", "T", "H", "D"]) +@pytest.mark.xfail( + ON_WINDOWS, reason="We do not expect exact round-tripping on Windows" +) def test_encode_decode_roundtrip(freq): # See GH #4045. Prior to #4684 this test would fail for frequencies of "S", # "L", "U", and "N". @@ -1016,6 +1020,4 @@ def test_encode_decode_roundtrip(freq): variable = Variable(["time"], times) encoded = conventions.encode_cf_variable(variable) decoded = conventions.decode_cf_variable("time", encoded) - print(encoded) - print(decoded) assert_equal(variable, decoded) From 41d24f18760e91ac99c6f7c15b0f168ac62473f9 Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Sat, 12 Dec 2020 21:01:46 -0500 Subject: [PATCH 13/18] Don't xfail Windows tests for now; we should figure why they fail --- xarray/tests/test_coding_times.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 0482bdfe8e8..93ac9b12521 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -16,7 +16,6 @@ from xarray.coding.variables import SerializationWarning from xarray.conventions import _update_bounds_attributes, cf_encoder from xarray.core.common import contains_cftime_datetimes -from xarray.tests.test_backends import ON_WINDOWS from xarray.testing import assert_equal from . import arm_xfail, assert_array_equal, has_cftime, requires_cftime, requires_dask @@ -1009,9 +1008,6 @@ def test_encode_cf_datetime_defaults_to_correct_dtype(encoding_units, data_freq) @pytest.mark.parametrize("freq", ["N", "U", "L", "S", "T", "H", "D"]) -@pytest.mark.xfail( - ON_WINDOWS, reason="We do not expect exact round-tripping on Windows" -) def test_encode_decode_roundtrip(freq): # See GH #4045. Prior to #4684 this test would fail for frequencies of "S", # "L", "U", and "N". From 57814f77df2bc1431233b05ee41e4fb681c4662c Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Sun, 13 Dec 2020 15:17:05 -0500 Subject: [PATCH 14/18] Fix things on Windows --- xarray/coding/times.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 950245c077a..8ba1ef13d7e 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -274,8 +274,7 @@ def _infer_time_units_from_diff(unique_timedeltas): ]: delta_ns = _NS_PER_TIME_DELTA[_netcdf_to_numpy_timeunit(time_unit)] unit_delta = np.timedelta64(delta_ns, "ns") - diffs = unique_timedeltas / unit_delta - if np.all(diffs == diffs.astype(int)): + if np.all(unique_timedeltas % unit_delta == np.timedelta64(0, "ns")): return time_unit return "seconds" From 3ff3cd8cadcf7450f9a4dd627b780a88023605a0 Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Sun, 13 Dec 2020 15:33:33 -0500 Subject: [PATCH 15/18] Use pandas for divisiblity check for older NumPy compatibility --- xarray/coding/times.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 8ba1ef13d7e..5a307a28542 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -263,6 +263,7 @@ def decode_cf_timedelta(num_timedeltas, units): def _infer_time_units_from_diff(unique_timedeltas): + unique_deltas_as_index = pd.TimedeltaIndex(unique_timedeltas) for time_unit in [ "days", "hours", @@ -273,8 +274,8 @@ def _infer_time_units_from_diff(unique_timedeltas): "nanoseconds", ]: delta_ns = _NS_PER_TIME_DELTA[_netcdf_to_numpy_timeunit(time_unit)] - unit_delta = np.timedelta64(delta_ns, "ns") - if np.all(unique_timedeltas % unit_delta == np.timedelta64(0, "ns")): + unit_delta = pd.to_timedelta(delta_ns, "ns") + if np.all(unique_deltas_as_index % unit_delta == np.timedelta64(0, "ns")): return time_unit return "seconds" From 367500874fd394b544e98bcd46aedf729f860225 Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Sun, 13 Dec 2020 18:07:25 -0500 Subject: [PATCH 16/18] Reduce changes needed; improve comments --- xarray/coding/times.py | 13 +++++++--- xarray/tests/test_coding_times.py | 42 ++++++++----------------------- 2 files changed, 19 insertions(+), 36 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 5a307a28542..3d104ede2a4 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -163,12 +163,12 @@ def _decode_datetime_with_pandas(flat_num_dates, units, calendar): # To avoid integer overflow when converting to nanosecond units for integer # dtypes smaller than np.int64 cast all integer-dtype arrays to np.int64 - # (GH #2002). + # (GH 2002). if flat_num_dates.dtype.kind == "i": flat_num_dates = flat_num_dates.astype(np.int64) # Cast input ordinals to integers of nanoseconds because pd.to_timedelta - # works much faster when dealing with integers (GH #1399). + # works much faster when dealing with integers (GH 1399). flat_num_dates_ns_int = (flat_num_dates * _NS_PER_TIME_DELTA[delta]).astype( np.int64 ) @@ -263,6 +263,11 @@ def decode_cf_timedelta(num_timedeltas, units): def _infer_time_units_from_diff(unique_timedeltas): + # Note that the modulus operator was only implemented for np.timedelta64 + # arrays as of NumPy version 1.16.0. Once our minimum version of NumPy + # supported is greater than or equal to this we will no longer need to cast + # unique_timedeltas to a TimedeltaIndex. In the meantime, however, the + # modulus operator works for TimedeltaIndex objects. unique_deltas_as_index = pd.TimedeltaIndex(unique_timedeltas) for time_unit in [ "days", @@ -274,7 +279,7 @@ def _infer_time_units_from_diff(unique_timedeltas): "nanoseconds", ]: delta_ns = _NS_PER_TIME_DELTA[_netcdf_to_numpy_timeunit(time_unit)] - unit_delta = pd.to_timedelta(delta_ns, "ns") + unit_delta = np.timedelta64(delta_ns, "ns") if np.all(unique_deltas_as_index % unit_delta == np.timedelta64(0, "ns")): return time_unit return "seconds" @@ -439,7 +444,7 @@ def encode_cf_datetime(dates, units=None, calendar=None): time_deltas = dates_as_index - ref_date # Use floor division if time_delta evenly divides all differences - # to preserve integer dtype if possible. + # to preserve integer dtype if possible (GH 4045). if np.all(time_deltas % time_delta == np.timedelta64(0, delta_units)): num = time_deltas // time_delta else: diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 93ac9b12521..dfd558f737e 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -479,7 +479,7 @@ def test_decoded_cf_datetime_array_2d(): assert_array_equal(np.asarray(result), expected) -PANDAS_FREQUENCIES_TO_ENCODING_UNITS = { +FREQUENCIES_TO_ENCODING_UNITS = { "N": "nanoseconds", "U": "microseconds", "L": "milliseconds", @@ -490,9 +490,7 @@ def test_decoded_cf_datetime_array_2d(): } -@pytest.mark.parametrize( - ("freq", "units"), PANDAS_FREQUENCIES_TO_ENCODING_UNITS.items() -) +@pytest.mark.parametrize(("freq", "units"), FREQUENCIES_TO_ENCODING_UNITS.items()) def test_infer_datetime_units(freq, units): dates = pd.date_range("2000", periods=2, freq=freq) expected = f"{units} since 2000-01-01 00:00:00" @@ -972,45 +970,25 @@ def test_decode_ambiguous_time_warns(calendar): np.testing.assert_array_equal(result, expected) -ENCODING_UNITS = [ - "days", - "hours", - "minutes", - "seconds", - "milliseconds", - "microseconds", - "nanoseconds", -] -NUMPY_FREQUENCIES = [ - np.timedelta64(1, "D"), - np.timedelta64(1, "h"), - np.timedelta64(1, "m"), - np.timedelta64(1, "s"), - np.timedelta64(1, "ms"), - np.timedelta64(1, "us"), - np.timedelta64(1, "ns"), -] - - -@pytest.mark.parametrize("encoding_units", ENCODING_UNITS) -@pytest.mark.parametrize("data_freq", NUMPY_FREQUENCIES, ids=lambda x: f"{x!r}") -def test_encode_cf_datetime_defaults_to_correct_dtype(encoding_units, data_freq): - times = pd.date_range("2000", periods=3, freq=pd.to_timedelta(data_freq)) +@pytest.mark.parametrize("encoding_units", FREQUENCIES_TO_ENCODING_UNITS.values()) +@pytest.mark.parametrize("freq", FREQUENCIES_TO_ENCODING_UNITS.keys()) +def test_encode_cf_datetime_defaults_to_correct_dtype(encoding_units, freq): + times = pd.date_range("2000", periods=3, freq=freq) units = f"{encoding_units} since 2000-01-01" encoded, _, _ = coding.times.encode_cf_datetime(times, units) numpy_timeunit = coding.times._netcdf_to_numpy_timeunit(encoding_units) encoding_units_as_timedelta = np.timedelta64(1, numpy_timeunit) - if data_freq >= encoding_units_as_timedelta: + if pd.to_timedelta(1, freq) >= encoding_units_as_timedelta: assert encoded.dtype == np.int64 else: assert encoded.dtype == np.float64 -@pytest.mark.parametrize("freq", ["N", "U", "L", "S", "T", "H", "D"]) +@pytest.mark.parametrize("freq", FREQUENCIES_TO_ENCODING_UNITS.keys()) def test_encode_decode_roundtrip(freq): - # See GH #4045. Prior to #4684 this test would fail for frequencies of "S", - # "L", "U", and "N". + # See GH 4045. Prior to GH 4684 this test would fail for frequencies of + # "S", "L", "U", and "N". initial_time = pd.date_range("1678-01-01", periods=1) times = initial_time.append(pd.date_range("1968", periods=2, freq=freq)) variable = Variable(["time"], times) From ed2bce65180316b340102d7ad5529e8dd1712297 Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Sun, 13 Dec 2020 18:12:00 -0500 Subject: [PATCH 17/18] Checking remainder against zero nanoseconds is more straightforward It probably doesn't really matter though. --- xarray/coding/times.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 3d104ede2a4..3d877a169f5 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -445,7 +445,7 @@ def encode_cf_datetime(dates, units=None, calendar=None): # Use floor division if time_delta evenly divides all differences # to preserve integer dtype if possible (GH 4045). - if np.all(time_deltas % time_delta == np.timedelta64(0, delta_units)): + if np.all(time_deltas % time_delta == np.timedelta64(0, "ns")): num = time_deltas // time_delta else: num = time_deltas / time_delta From 450037de70a040cfc4bb2ee0786e5ad88506750d Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Wed, 16 Dec 2020 06:17:20 -0500 Subject: [PATCH 18/18] Add a note to the breaking changes section --- doc/whats-new.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index ceeb4d16c56..60d1de94506 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -22,6 +22,11 @@ v0.16.3 (unreleased) Breaking changes ~~~~~~~~~~~~~~~~ +- As a result of :pull:`4684` the default units encoding for + datetime-like values (``np.datetime64[ns]`` or ``cftime.datetime``) will now + always be set such that ``int64`` values can be used. In the past, no units + finer than "seconds" were chosen, which would sometimes mean that ``float64`` + values were required, which would lead to inaccurate I/O round-trips. New Features