Skip to content

Commit

Permalink
Merge pull request #6 from mroeschke/bug/to_datetime/format_range
Browse files (browse the repository at this point in the history)
Fix to_datetime with format allowing out-of-range values
  • Loading branch information
mroeschke authored Oct 3, 2023
2 parents 8e6da87 + 75f7027 commit ac52602
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 4 deletions.
27 changes: 24 additions & 3 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -5665,16 +5665,37 @@ def _as_datetime_or_timedelta_column(self, dtype, format):
if (self == "None").any():
raise ValueError("Could not convert `None` value to datetime")

is_nat = self == "NaT"
if dtype.kind == "M":
without_nat = self.apply_boolean_mask(is_nat.unary_operator("not"))
all_same_length = (
libstrings.count_characters(without_nat).distinct_count(
dropna=True
)
== 1
)
if not all_same_length:
# Unfortunately disables OK cases like:
# ["2020-01-01", "2020-01-01 00:00:00"]
# But currently incorrect for cases like (drops 10):
# ["2020-01-01", "2020-01-01 10:00:00"]
raise NotImplementedError(
"Cannot parse date-like strings with different formats"
)
valid_ts = str_cast.istimestamp(self, format)
valid = valid_ts | is_nat
if not valid.all():
raise ValueError(f"Column contains invalid data for {format=}")

casting_func = (
str_cast.timestamp2int
if dtype.type == np.datetime64
else str_cast.timedelta2int
)
result_col = casting_func(self, dtype, format)

boolean_match = self == "NaT"
if (boolean_match).any():
result_col[boolean_match] = None
if is_nat.any():
result_col[is_nat] = None

return result_col

Expand Down
10 changes: 10 additions & 0 deletions python/cudf/cudf/tests/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -804,6 +804,16 @@ def test_to_datetime_format(data, format, infer_datetime_format):
assert_eq(actual, expected)


def test_to_datetime_data_out_of_range_for_format():
    """A day field of 99 cannot satisfy ``%d``; expect a ValueError."""
    out_of_range_date = "2015-02-99"
    with pytest.raises(ValueError):
        cudf.to_datetime(out_of_range_date, format="%Y-%m-%d")


def test_to_datetime_different_formats_notimplemented():
    """Date-only and date-time strings mixed in one input are rejected."""
    # One entry is a bare date, the other carries a time component, so the
    # two strings have different lengths / layouts.
    mixed_layouts = ["2015-02-01", "2015-02-01 10:10:10"]
    with pytest.raises(NotImplementedError):
        cudf.to_datetime(mixed_layouts)


def test_datetime_can_cast_safely():

sr = cudf.Series(
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2272,7 +2272,7 @@ def test_index_nan_as_null(data, nan_idx, NA_idx, nan_as_null):
[],
["this", "is"],
[0, 19, 13],
["2019-01-01 04:00:00", "2019-01-01 06:00:00", "2018-03-02"],
["2019-01-01 04:00:00", "2019-01-01 06:00:00", "2018-03-02 10:00:00"],
],
)
def test_isin_index(data, values):
Expand Down

0 comments on commit ac52602

Please sign in to comment.