From 3dadd8908da691604ce2350a477958251efaa44b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 27 Sep 2023 14:15:42 -0700 Subject: [PATCH 1/6] Fix to_datetime with format allowing out-of-range values --- python/cudf/cudf/core/column/string.py | 5 +++++ python/cudf/cudf/tests/test_datetime.py | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index fe21dc87bac..f17b6cadf34 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5665,6 +5665,11 @@ def _as_datetime_or_timedelta_column(self, dtype, format): if (self == "None").any(): raise ValueError("Could not convert `None` value to datetime") + if dtype.kind == "M": + valid = str_cast.istimestamp(self, format) + if not valid.all(): + raise ValueError(f"Column contains invalid data for {format=}") + casting_func = ( str_cast.timestamp2int if dtype.type == np.datetime64 diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 164856ed6f5..d5be569d7c1 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -804,6 +804,11 @@ def test_to_datetime_format(data, format, infer_datetime_format): assert_eq(actual, expected) +def test_to_datetime_data_out_of_range_for_format(): + with pytest.raises(ValueError): + cudf.to_datetime("2015-02-99", format="%Y-%m-%d") + + def test_datetime_can_cast_safely(): sr = cudf.Series( From c379ba45c7acaaacfa5aa9fbe74132c3f4b3eb84 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 27 Sep 2023 16:51:01 -0700 Subject: [PATCH 2/6] Trigger CI From 187b2454d8855e3c91abe0b8779c99abaf4404ee Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 28 Sep 2023 11:00:23 -0700 Subject: [PATCH 3/6] allow nat string --- python/cudf/cudf/core/column/string.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index f17b6cadf34..1afee62d9e5 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5665,8 +5665,10 @@ def _as_datetime_or_timedelta_column(self, dtype, format): if (self == "None").any(): raise ValueError("Could not convert `None` value to datetime") + is_nat = self == "NaT" if dtype.kind == "M": - valid = str_cast.istimestamp(self, format) + valid_ts = str_cast.istimestamp(self, format) + valid = valid_ts | is_nat if not valid.all(): raise ValueError(f"Column contains invalid data for {format=}") @@ -5677,9 +5679,8 @@ def _as_datetime_or_timedelta_column(self, dtype, format): ) result_col = casting_func(self, dtype, format) - boolean_match = self == "NaT" - if (boolean_match).any(): - result_col[boolean_match] = None + if is_nat.any(): + result_col[is_nat] = None return result_col From 251dd12d8f5773ac2e8b964899bb5379e51354a8 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 28 Sep 2023 18:27:01 -0700 Subject: [PATCH 4/6] Disabled mixed formats --- python/cudf/cudf/core/column/string.py | 12 ++++++++++++ python/cudf/cudf/tests/test_datetime.py | 5 +++++ python/cudf/cudf/tests/test_index.py | 2 +- 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 1afee62d9e5..c9f8cd7d0a3 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5666,6 +5666,18 @@ def _as_datetime_or_timedelta_column(self, dtype, format): raise ValueError("Could not convert `None` value to datetime") is_nat = self == "NaT" + all_same_length = ( + libstrings.count_characters(self).distinct_count(dropna=True) == 1 + ) + if not all_same_length: + # Unfortunately disables OK cases like: + # ["2020-01-01", "2020-01-01 00:00:00"] + # But currently incorrect for cases like (drops 10) + # ["2020-01-01", "2020-01-01 10:00:00"] + raise NotImplementedError( + "Cannot parse date-like strings with different formats" + ) + if dtype.kind == "M": valid_ts = str_cast.istimestamp(self, format) valid = valid_ts | is_nat diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index d5be569d7c1..f152cb6c606 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -809,6 +809,11 @@ def test_to_datetime_data_out_of_range_for_format(): cudf.to_datetime("2015-02-99", format="%Y-%m-%d") +def test_to_datetime_different_formats_notimplemented(): + with pytest.raises(NotImplementedError): + cudf.to_datetime(["2015-02-01", "2015-02-01 10:10:10"]) + + def test_datetime_can_cast_safely(): sr = cudf.Series( diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 29232f63e90..c731e0e4bec 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2272,7 +2272,7 @@ def test_index_nan_as_null(data, nan_idx, NA_idx, nan_as_null): [], ["this", "is"], [0, 19, 13], - ["2019-01-01 04:00:00", "2019-01-01 06:00:00", "2018-03-02"], + ["2019-01-01 04:00:00", "2019-01-01 06:00:00", "2018-03-02 10:00:00"], ], ) def test_isin_index(data, values): From 1805b298089f617cd39d97a7d4423233296470c6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 29 Sep 2023 13:07:09 -0700 Subject: [PATCH 5/6] Only for datetime type --- python/cudf/cudf/core/column/string.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index c9f8cd7d0a3..e4c7ecf84c3 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5666,19 +5666,19 @@ def _as_datetime_or_timedelta_column(self, dtype, format): raise ValueError("Could not convert `None` value to datetime") is_nat = self == "NaT" - all_same_length = ( - libstrings.count_characters(self).distinct_count(dropna=True) == 1 - ) - if not all_same_length: - # Unfortunately disables OK cases like: - # ["2020-01-01", "2020-01-01 00:00:00"] - # But currently incorrect for cases like (drops 10) - # ["2020-01-01", "2020-01-01 10:00:00"] - raise NotImplementedError( - "Cannot parse date-like strings with different formats" - ) - if dtype.kind == "M": + all_same_length = ( + libstrings.count_characters(self).distinct_count(dropna=True) + == 1 + ) + if not all_same_length: + # Unfortunately disables OK cases like: + # ["2020-01-01", "2020-01-01 00:00:00"] + # But currently incorrect for cases like (drops 10): + # ["2020-01-01", "2020-01-01 10:00:00"] + raise NotImplementedError( + "Cannot parse date-like strings with different formats" + ) valid_ts = str_cast.istimestamp(self, format) valid = valid_ts | is_nat if not valid.all(): From 6ebf1338409efbff9084d7b5208810be8d83adad Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 29 Sep 2023 13:28:46 -0700 Subject: [PATCH 6/6] Filter nat --- python/cudf/cudf/core/column/string.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index e4c7ecf84c3..eb86f555432 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5667,8 +5667,11 @@ def _as_datetime_or_timedelta_column(self, dtype, format): is_nat = self == "NaT" if dtype.kind == "M": + without_nat = self.apply_boolean_mask(is_nat.unary_operator("not")) all_same_length = ( - libstrings.count_characters(self).distinct_count(dropna=True) + libstrings.count_characters(without_nat).distinct_count( + dropna=True + ) == 1 ) if not all_same_length: