From 453b6310ad43a4fe508ba80aa3ed6110b60aee0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 17 Dec 2024 10:57:39 +0100 Subject: [PATCH 1/7] enhance and move iso-8601-parser to coding.times --- xarray/coding/cftime_offsets.py | 3 +- xarray/coding/cftimeindex.py | 73 +------------------------------- xarray/coding/times.py | 70 ++++++++++++++++++++++++++++++ xarray/tests/test_cftimeindex.py | 4 +- 4 files changed, 77 insertions(+), 73 deletions(-) diff --git a/xarray/coding/cftime_offsets.py b/xarray/coding/cftime_offsets.py index 2cd8eccd6f3..1f75d4a8613 100644 --- a/xarray/coding/cftime_offsets.py +++ b/xarray/coding/cftime_offsets.py @@ -53,9 +53,10 @@ import pandas as pd from packaging.version import Version -from xarray.coding.cftimeindex import CFTimeIndex, _parse_iso8601_with_reso +from xarray.coding.cftimeindex import CFTimeIndex from xarray.coding.times import ( _is_standard_calendar, + _parse_iso8601_with_reso, _should_cftime_be_used, convert_time_or_go_back, format_cftime_datetime, diff --git a/xarray/coding/cftimeindex.py b/xarray/coding/cftimeindex.py index 0494952fc9c..2285c5db99d 100644 --- a/xarray/coding/cftimeindex.py +++ b/xarray/coding/cftimeindex.py @@ -42,7 +42,6 @@ from __future__ import annotations import math -import re import warnings from datetime import timedelta from typing import TYPE_CHECKING, Any @@ -53,6 +52,8 @@ from xarray.coding.times import ( _STANDARD_CALENDARS, + _parse_iso8601_with_reso, + _parse_iso8601_without_reso, cftime_to_nptime, infer_calendar_name, ) @@ -78,71 +79,6 @@ OUT_OF_BOUNDS_TIMEDELTA_ERRORS = (OverflowError,) -def named(name, pattern): - return "(?P<" + name + ">" + pattern + ")" - - -def optional(x): - return "(?:" + x + ")?" - - -def trailing_optional(xs): - if not xs: - return "" - return xs[0] + optional(trailing_optional(xs[1:])) - - -def build_pattern(date_sep=r"\-", datetime_sep=r"T", time_sep=r"\:", micro_sep=r"."): - pieces = [ - (None, "year", r"\d{4}"), - (date_sep, "month", r"\d{2}"), - (date_sep, "day", r"\d{2}"), - (datetime_sep, "hour", r"\d{2}"), - (time_sep, "minute", r"\d{2}"), - (time_sep, "second", r"\d{2}"), - (micro_sep, "microsecond", r"\d{1,6}"), - ] - pattern_list = [] - for sep, name, sub_pattern in pieces: - pattern_list.append((sep if sep else "") + named(name, sub_pattern)) - # TODO: allow timezone offsets? - return "^" + trailing_optional(pattern_list) + "$" - - -_BASIC_PATTERN = build_pattern(date_sep="", time_sep="") -_EXTENDED_PATTERN = build_pattern() -_CFTIME_PATTERN = build_pattern(datetime_sep=" ") -_PATTERNS = [_BASIC_PATTERN, _EXTENDED_PATTERN, _CFTIME_PATTERN] - - -def parse_iso8601_like(datetime_string): - for pattern in _PATTERNS: - match = re.match(pattern, datetime_string) - if match: - return match.groupdict() - raise ValueError( - f"no ISO-8601 or cftime-string-like match for string: {datetime_string}" - ) - - -def _parse_iso8601_with_reso(date_type, timestr): - _ = attempt_import("cftime") - - default = date_type(1, 1, 1) - result = parse_iso8601_like(timestr) - replace = {} - - for attr in ["year", "month", "day", "hour", "minute", "second", "microsecond"]: - value = result.get(attr, None) - if value is not None: - if attr == "microsecond": - # convert match string into valid microsecond value - value = 10 ** (6 - len(value)) * int(value) - replace[attr] = int(value) - resolution = attr - return default.replace(**replace), resolution - - def _parsed_string_to_bounds(date_type, resolution, parsed): """Generalization of pandas.tseries.index.DatetimeIndex._parsed_string_to_bounds @@ -811,11 +747,6 @@ def is_leap_year(self): return func(self.year, calendar=self.calendar) -def _parse_iso8601_without_reso(date_type, datetime_str): - date, _ = _parse_iso8601_with_reso(date_type, datetime_str) - return date - - def _parse_array_of_cftime_strings(strings, date_type): """Create a numpy array from an array of strings. diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 4622298e152..2bd001e1588 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -189,6 +189,76 @@ def _unpack_netcdf_time_units(units: str) -> tuple[str, str]: return delta_units, ref_date +def named(name, pattern): + return "(?P<" + name + ">" + pattern + ")" + + +def optional(x): + return "(?:" + x + ")?" + + +def trailing_optional(xs): + if not xs: + return "" + return xs[0] + optional(trailing_optional(xs[1:])) + + +def build_pattern(date_sep=r"\-", datetime_sep=r"T", time_sep=r"\:", micro_sep=r"."): + pieces = [ + (None, "year", r"[+-]?\d{4,5}"), + (date_sep, "month", r"\d{2}"), + (date_sep, "day", r"\d{2}"), + (datetime_sep, "hour", r"\d{2}"), + (time_sep, "minute", r"\d{2}"), + (time_sep, "second", r"\d{2}"), + (micro_sep, "microsecond", r"\d{1,6}"), + ] + pattern_list = [] + for sep, name, sub_pattern in pieces: + pattern_list.append((sep if sep else "") + named(name, sub_pattern)) + # TODO: allow timezone offsets? + return "^" + trailing_optional(pattern_list) + "$" + + +_BASIC_PATTERN = build_pattern(date_sep="", time_sep="") +_EXTENDED_PATTERN = build_pattern() +_CFTIME_PATTERN = build_pattern(datetime_sep=" ") +_PATTERNS = [_BASIC_PATTERN, _EXTENDED_PATTERN, _CFTIME_PATTERN] + + +def parse_iso8601_like(datetime_string): + for pattern in _PATTERNS: + match = re.match(pattern, datetime_string) + if match: + return match.groupdict() + raise ValueError( + f"no ISO-8601 or cftime-string-like match for string: {datetime_string}" + ) + + +def _parse_iso8601_with_reso(date_type, timestr): + default = date_type(1, 1, 1) + result = parse_iso8601_like(timestr) + replace = {} + + for attr in ["year", "month", "day", "hour", "minute", "second", "microsecond"]: + value = result.get(attr, None) + if value is not None: + resolution = attr + if attr == "microsecond": + if len(value) <= 3: + resolution = "millisecond" + # convert match string into valid microsecond value + value = 10 ** (6 - len(value)) * int(value) + replace[attr] = int(value) + return default.replace(**replace), resolution + + +def _parse_iso8601_without_reso(date_type, datetime_str): + date, _ = _parse_iso8601_with_reso(date_type, datetime_str) + return date + + def _unpack_time_units_and_ref_date(units: str) -> tuple[str, pd.Timestamp]: # same us _unpack_netcdf_time_units but finalizes ref_date for # processing in encode_cf_datetime diff --git a/xarray/tests/test_cftimeindex.py b/xarray/tests/test_cftimeindex.py index d1fccc52a9a..1ef50cf925b 100644 --- a/xarray/tests/test_cftimeindex.py +++ b/xarray/tests/test_cftimeindex.py @@ -12,9 +12,11 @@ from xarray.coding.cftimeindex import ( CFTimeIndex, _parse_array_of_cftime_strings, - _parse_iso8601_with_reso, _parsed_string_to_bounds, assert_all_valid_date_type, +) +from xarray.coding.times import ( + _parse_iso8601_with_reso, parse_iso8601_like, ) from xarray.tests import ( From e358a30ce45d1369191e17c97b80e0036945eacc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 17 Dec 2024 10:59:48 +0100 Subject: [PATCH 2/7] enhance test for iso 8601 parser --- xarray/tests/test_cftimeindex.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/xarray/tests/test_cftimeindex.py b/xarray/tests/test_cftimeindex.py index 1ef50cf925b..fb061f4c89d 100644 --- a/xarray/tests/test_cftimeindex.py +++ b/xarray/tests/test_cftimeindex.py @@ -134,16 +134,26 @@ def date_dict( list(ISO8601_LIKE_STRING_TESTS.values()), ids=list(ISO8601_LIKE_STRING_TESTS.keys()), ) -def test_parse_iso8601_like(string, expected): - result = parse_iso8601_like(string) +@pytest.mark.parametrize("five", [False, True], ids=["4Y", "5Y"]) +@pytest.mark.parametrize("sign", ["", "+", "-"], ids=["None", "plus", "minus"]) +def test_parse_iso8601_like(five, sign, string, expected): + pre = "1" if five else "" + datestring = sign + pre + string + result = parse_iso8601_like(datestring) + expected = expected.copy() + expected.update(year=sign + pre + expected["year"]) assert result == expected - if result["microsecond"] is None: + # check malformed single digit addendum + # tests for year/month/day excluded as year can be 4 or 5 digits + if result["microsecond"] is None and result["hour"] is not None: with pytest.raises(ValueError): - parse_iso8601_like(string + "3") - if result["second"] is None: + parse_iso8601_like(datestring + "3") + + # check malformed floating point addendum + if result["second"] is None or result["microsecond"] is not None: with pytest.raises(ValueError): - parse_iso8601_like(string + ".3") + parse_iso8601_like(datestring + ".3") _CFTIME_CALENDARS = [ From c395bc82e565a197ba1e6f9323a1f4a7f7c38e67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 17 Dec 2024 12:00:44 +0100 Subject: [PATCH 3/7] add whats-new.rst entry --- doc/whats-new.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index ecf1702c356..f7c895bcd50 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -61,6 +61,8 @@ Internal Changes ~~~~~~~~~~~~~~~~ - Move non-CF related ``ensure_dtype_not_object`` from conventions to backends (:pull:`9828`). By `Kai Mühlbauer `_. +- Move ISO-8601 parser from coding.cftimeindex to coding.times to make it available there (prevents circular import) (:pull:`9899`). + By `Kai Mühlbauer `_. .. _whats-new.2024.11.0: From 88573ba48ac4ef82b2a3bdfb8fc84320b0341f7e Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 17 Dec 2024 08:23:09 -0700 Subject: [PATCH 4/7] add datetime property test --- properties/test_encode_decode.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/properties/test_encode_decode.py b/properties/test_encode_decode.py index e7eece7e81e..211c33ff6b5 100644 --- a/properties/test_encode_decode.py +++ b/properties/test_encode_decode.py @@ -11,10 +11,12 @@ # isort: split import hypothesis.extra.numpy as npst +import hypothesis.strategies as st import numpy as np from hypothesis import given import xarray as xr +from xarray.coding.times import _parse_iso8601_without_reso from xarray.testing.strategies import variables @@ -43,3 +45,10 @@ def test_CFScaleOffset_coder_roundtrip(original) -> None: coder = xr.coding.variables.CFScaleOffsetCoder() roundtripped = coder.decode(coder.encode(original)) xr.testing.assert_identical(original, roundtripped) + + +# TODO: add cftime.datetime +@given(dt=st.datetimes()) +def test_iso8601_decode(dt): + iso = dt.isoformat() + assert dt == _parse_iso8601_without_reso(type(dt), iso) From dd16f39a1b38b6d2d6d62935c1ef7bc259eaad55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 17 Dec 2024 17:06:26 +0100 Subject: [PATCH 5/7] Apply suggestions from code review Co-authored-by: Deepak Cherian --- xarray/tests/test_cftimeindex.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_cftimeindex.py b/xarray/tests/test_cftimeindex.py index fb061f4c89d..49988658550 100644 --- a/xarray/tests/test_cftimeindex.py +++ b/xarray/tests/test_cftimeindex.py @@ -134,7 +134,7 @@ def date_dict( list(ISO8601_LIKE_STRING_TESTS.values()), ids=list(ISO8601_LIKE_STRING_TESTS.keys()), ) -@pytest.mark.parametrize("five", [False, True], ids=["4Y", "5Y"]) +@pytest.mark.parametrize("five-digit-year", [False, True], ids=["4Y", "5Y"]) @pytest.mark.parametrize("sign", ["", "+", "-"], ids=["None", "plus", "minus"]) def test_parse_iso8601_like(five, sign, string, expected): pre = "1" if five else "" @@ -145,7 +145,11 @@ def test_parse_iso8601_like(five, sign, string, expected): assert result == expected # check malformed single digit addendum - # tests for year/month/day excluded as year can be 4 or 5 digits + # this check is only performed when we have at least "hour" given + # like "1999010101", where a single added digit should raise + # for "1999" (year), "199901" (month) and "19990101" (day) + # and a single added digit the string would just be interpreted + # as having a 5-digit year. if result["microsecond"] is None and result["hour"] is not None: with pytest.raises(ValueError): parse_iso8601_like(datestring + "3") From fc861604fb750977c19aaccdbf8f61bfd9a147fe Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 17 Dec 2024 16:07:54 +0000 Subject: [PATCH 6/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_cftimeindex.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_cftimeindex.py b/xarray/tests/test_cftimeindex.py index 49988658550..ef30516b468 100644 --- a/xarray/tests/test_cftimeindex.py +++ b/xarray/tests/test_cftimeindex.py @@ -145,10 +145,10 @@ def test_parse_iso8601_like(five, sign, string, expected): assert result == expected # check malformed single digit addendum - # this check is only performed when we have at least "hour" given + # this check is only performed when we have at least "hour" given # like "1999010101", where a single added digit should raise - # for "1999" (year), "199901" (month) and "19990101" (day) - # and a single added digit the string would just be interpreted + # for "1999" (year), "199901" (month) and "19990101" (day) + # and a single added digit the string would just be interpreted # as having a 5-digit year. if result["microsecond"] is None and result["hour"] is not None: with pytest.raises(ValueError): From 0af4ff1af950165f30a0ddbfce93bc8e7b8a078c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 17 Dec 2024 17:40:47 +0100 Subject: [PATCH 7/7] fix test --- xarray/tests/test_cftimeindex.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_cftimeindex.py b/xarray/tests/test_cftimeindex.py index ef30516b468..20d14b053cf 100644 --- a/xarray/tests/test_cftimeindex.py +++ b/xarray/tests/test_cftimeindex.py @@ -134,10 +134,10 @@ def date_dict( list(ISO8601_LIKE_STRING_TESTS.values()), ids=list(ISO8601_LIKE_STRING_TESTS.keys()), ) -@pytest.mark.parametrize("five-digit-year", [False, True], ids=["4Y", "5Y"]) +@pytest.mark.parametrize("five_digit_year", [False, True], ids=["4Y", "5Y"]) @pytest.mark.parametrize("sign", ["", "+", "-"], ids=["None", "plus", "minus"]) -def test_parse_iso8601_like(five, sign, string, expected): - pre = "1" if five else "" +def test_parse_iso8601_like(five_digit_year, sign, string, expected): + pre = "1" if five_digit_year else "" datestring = sign + pre + string result = parse_iso8601_like(datestring) expected = expected.copy()