Skip to content

Commit

Permalink
BUG: incorrect OutOfBoundsDatetime with non-nano dtype (pandas-dev#55756)
Browse files Browse the repository at this point in the history

* BUG: incorrect OutOfBoundsDatetime with non-nano dtype

* GH ref
  • Loading branch information
jbrockmendel authored Oct 30, 2023
1 parent 0287cde commit 2d2d67d
Show file tree
Hide file tree
Showing 16 changed files with 118 additions and 39 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,7 @@ Datetimelike
- Bug in addition or subtraction of :class:`BusinessDay` offset with ``offset`` attribute to non-nanosecond :class:`Index`, :class:`Series`, or :class:`DataFrame` column giving incorrect results (:issue:`55608`)
- Bug in addition or subtraction of :class:`DateOffset` objects with microsecond components to ``datetime64`` :class:`Index`, :class:`Series`, or :class:`DataFrame` columns with non-nanosecond resolution (:issue:`55595`)
- Bug in addition or subtraction of very large :class:`Tick` objects with :class:`Timestamp` or :class:`Timedelta` objects raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`)
- Bug in creating an :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`)
-

Timedelta
Expand Down
1 change: 1 addition & 0 deletions pandas/_libs/tslib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def array_to_datetime(
dayfirst: bool = ...,
yearfirst: bool = ...,
utc: bool = ...,
creso: int = ...,
) -> tuple[np.ndarray, tzinfo | None]: ...

# returned ndarray may be object dtype or datetime64[ns]
Expand Down
21 changes: 13 additions & 8 deletions pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ from pandas._libs.tslibs.conversion cimport (
get_datetime64_nanos,
parse_pydatetime,
)
from pandas._libs.tslibs.dtypes cimport npy_unit_to_abbrev
from pandas._libs.tslibs.nattype cimport (
NPY_NAT,
c_NaT as NaT,
Expand Down Expand Up @@ -277,6 +278,7 @@ def array_with_unit_to_datetime(
result, tz = array_to_datetime(
values.astype(object, copy=False),
errors=errors,
creso=NPY_FR_ns,
)
return result, tz

Expand Down Expand Up @@ -408,6 +410,7 @@ cpdef array_to_datetime(
bint dayfirst=False,
bint yearfirst=False,
bint utc=False,
NPY_DATETIMEUNIT creso=NPY_FR_ns,
):
"""
Converts a 1D array of date-like values to a numpy array of either:
Expand All @@ -434,6 +437,7 @@ cpdef array_to_datetime(
yearfirst parsing behavior when encountering datetime strings
utc : bool, default False
indicator whether the dates should be UTC
creso : NPY_DATETIMEUNIT, default NPY_FR_ns
Returns
-------
Expand All @@ -457,13 +461,14 @@ cpdef array_to_datetime(
set out_tzoffset_vals = set()
tzinfo tz_out = None
cnp.flatiter it = cnp.PyArray_IterNew(values)
NPY_DATETIMEUNIT creso = NPY_FR_ns
DatetimeParseState state = DatetimeParseState()
str reso_str

# specify error conditions
assert is_raise or is_ignore or is_coerce

result = np.empty((<object>values).shape, dtype="M8[ns]")
reso_str = npy_unit_to_abbrev(creso)
result = np.empty((<object>values).shape, dtype=f"M8[{reso_str}]")
iresult = result.view("i8").ravel()

for i in range(n):
Expand All @@ -480,11 +485,11 @@ cpdef array_to_datetime(
iresult[i] = parse_pydatetime(val, &dts, creso=creso)

elif PyDate_Check(val):
iresult[i] = pydate_to_dt64(val, &dts)
check_dts_bounds(&dts)
iresult[i] = pydate_to_dt64(val, &dts, reso=creso)
check_dts_bounds(&dts, creso)

elif is_datetime64_object(val):
iresult[i] = get_datetime64_nanos(val, NPY_FR_ns)
iresult[i] = get_datetime64_nanos(val, creso)

elif is_integer_object(val) or is_float_object(val):
# these must be ns unit by-definition
Expand All @@ -493,23 +498,23 @@ cpdef array_to_datetime(
iresult[i] = NPY_NAT
else:
# we now need to parse this as if unit='ns'
iresult[i] = cast_from_unit(val, "ns")
iresult[i] = cast_from_unit(val, "ns", out_reso=creso)

elif isinstance(val, str):
# string
if type(val) is not str:
# GH#32264 np.str_ object
val = str(val)

if parse_today_now(val, &iresult[i], utc):
if parse_today_now(val, &iresult[i], utc, creso):
# We can't _quite_ dispatch this to convert_str_to_tsobject
# bc there isn't a nice way to pass "utc"
continue

_ts = convert_str_to_tsobject(
val, None, unit="ns", dayfirst=dayfirst, yearfirst=yearfirst
)
_ts.ensure_reso(NPY_FR_ns, val)
_ts.ensure_reso(creso, val)

iresult[i] = _ts.value

Expand Down
4 changes: 3 additions & 1 deletion pandas/_libs/tslibs/conversion.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@ cdef int64_t get_datetime64_nanos(object val, NPY_DATETIMEUNIT reso) except? -1

cpdef datetime localize_pydatetime(datetime dt, tzinfo tz)
cdef int64_t cast_from_unit(object ts, str unit, NPY_DATETIMEUNIT out_reso=*) except? -1
cpdef (int64_t, int) precision_from_unit(str unit, NPY_DATETIMEUNIT out_reso=*)
cpdef (int64_t, int) precision_from_unit(
NPY_DATETIMEUNIT in_reso, NPY_DATETIMEUNIT out_reso=*
)

cdef maybe_localize_tso(_TSObject obj, tzinfo tz, NPY_DATETIMEUNIT reso)

Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/tslibs/conversion.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,6 @@ DT64NS_DTYPE: np.dtype
TD64NS_DTYPE: np.dtype

def precision_from_unit(
unit: str,
in_reso: int, # NPY_DATETIMEUNIT
) -> tuple[int, int]: ... # (int64_t, _)
def localize_pydatetime(dt: datetime, tz: tzinfo | None) -> datetime: ...
25 changes: 16 additions & 9 deletions pandas/_libs/tslibs/conversion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ cdef int64_t cast_from_unit(
cdef:
int64_t m
int p
NPY_DATETIMEUNIT in_reso

if unit in ["Y", "M"]:
if is_float_object(ts) and not ts.is_integer():
Expand All @@ -123,7 +124,14 @@ cdef int64_t cast_from_unit(
dt64obj = np.datetime64(ts, unit)
return get_datetime64_nanos(dt64obj, out_reso)

m, p = precision_from_unit(unit, out_reso)
in_reso = abbrev_to_npy_unit(unit)
if out_reso < in_reso and in_reso != NPY_DATETIMEUNIT.NPY_FR_GENERIC:
# We will end up rounding (always *down*), so don't need the fractional
# part of `ts`.
m, _ = precision_from_unit(out_reso, in_reso)
return (<int64_t>ts) // m

m, p = precision_from_unit(in_reso, out_reso)

# cast the unit, multiply base/frac separately
# to avoid precision issues from float -> int
Expand All @@ -146,8 +154,8 @@ cdef int64_t cast_from_unit(
) from err


cpdef inline (int64_t, int) precision_from_unit(
str unit,
cpdef (int64_t, int) precision_from_unit(
NPY_DATETIMEUNIT in_reso,
NPY_DATETIMEUNIT out_reso=NPY_DATETIMEUNIT.NPY_FR_ns,
):
"""
Expand All @@ -163,25 +171,24 @@ cpdef inline (int64_t, int) precision_from_unit(
int64_t m
int64_t multiplier
int p
NPY_DATETIMEUNIT reso = abbrev_to_npy_unit(unit)

if reso == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
reso = NPY_DATETIMEUNIT.NPY_FR_ns
if reso == NPY_DATETIMEUNIT.NPY_FR_Y:
if in_reso == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
in_reso = NPY_DATETIMEUNIT.NPY_FR_ns
if in_reso == NPY_DATETIMEUNIT.NPY_FR_Y:
# each 400 years we have 97 leap years, for an average of 97/400=.2425
# extra days each year. We get 31556952 by writing
# 3600*24*365.2425=31556952
multiplier = periods_per_second(out_reso)
m = multiplier * 31556952
elif reso == NPY_DATETIMEUNIT.NPY_FR_M:
elif in_reso == NPY_DATETIMEUNIT.NPY_FR_M:
# 2629746 comes from dividing the "Y" case by 12.
multiplier = periods_per_second(out_reso)
m = multiplier * 2629746
else:
# Careful: if get_conversion_factor raises, the exception does
# not propagate, instead we get a warning about an ignored exception.
# https://github.com/pandas-dev/pandas/pull/51483#discussion_r1115198951
m = get_conversion_factor(reso, out_reso)
m = get_conversion_factor(in_reso, out_reso)

p = <int>log10(m) # number of digits in 'm' minus 1
return m, p
Expand Down
4 changes: 3 additions & 1 deletion pandas/_libs/tslibs/strptime.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@ from cpython.datetime cimport (
)
from numpy cimport int64_t

from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT

cdef bint parse_today_now(str val, int64_t* iresult, bint utc)

cdef bint parse_today_now(str val, int64_t* iresult, bint utc, NPY_DATETIMEUNIT creso)


cdef class DatetimeParseState:
Expand Down
19 changes: 12 additions & 7 deletions pandas/_libs/tslibs/strptime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -111,22 +111,27 @@ def _test_format_is_iso(f: str) -> bool:
return format_is_iso(f)


cdef bint parse_today_now(str val, int64_t* iresult, bint utc):
cdef bint parse_today_now(
str val, int64_t* iresult, bint utc, NPY_DATETIMEUNIT creso
):
# We delay this check for as long as possible
# because it catches relatively rare cases
cdef:
_Timestamp ts

# Multiply by 1000 to convert to nanos, since these methods naturally have
# microsecond resolution
if val == "now":
if utc:
iresult[0] = Timestamp.utcnow()._value * 1000
ts = <_Timestamp>Timestamp.utcnow()
iresult[0] = ts._as_creso(creso)._value
else:
# GH#18705 make sure to_datetime("now") matches Timestamp("now")
# Note using Timestamp.now() is faster than Timestamp("now")
iresult[0] = Timestamp.now()._value * 1000
ts = <_Timestamp>Timestamp.now()
iresult[0] = ts._as_creso(creso)._value
return True
elif val == "today":
iresult[0] = Timestamp.today()._value * 1000
ts = <_Timestamp>Timestamp.today()
iresult[0] = ts._as_creso(creso)._value
return True
return False

Expand Down Expand Up @@ -363,7 +368,7 @@ def array_strptime(
check_dts_bounds(&dts)
continue

if parse_today_now(val, &iresult[i], utc):
if parse_today_now(val, &iresult[i], utc, NPY_FR_ns):
continue

# Some ISO formats can't be parsed by string_to_dts
Expand Down
4 changes: 1 addition & 3 deletions pandas/_libs/tslibs/timedeltas.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -303,18 +303,16 @@ cdef object ensure_td64ns(object ts):
cdef:
NPY_DATETIMEUNIT td64_unit
int64_t td64_value, mult
str unitstr

td64_unit = get_datetime64_unit(ts)
if (
td64_unit != NPY_DATETIMEUNIT.NPY_FR_ns
and td64_unit != NPY_DATETIMEUNIT.NPY_FR_GENERIC
):
unitstr = npy_unit_to_abbrev(td64_unit)

td64_value = cnp.get_timedelta64_value(ts)

mult = precision_from_unit(unitstr)[0]
mult = precision_from_unit(td64_unit)[0]
try:
# NB: cython#1381 this cannot be *=
td64_value = td64_value * mult
Expand Down
8 changes: 6 additions & 2 deletions pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2251,18 +2251,19 @@ def _sequence_to_dt64ns(
dayfirst=dayfirst,
yearfirst=yearfirst,
allow_object=False,
out_unit=out_unit or "ns",
)
copy = False
if tz and inferred_tz:
# two timezones: convert to intended from base UTC repr
assert converted.dtype == "i8"
# GH#42505
# by convention, these are _already_ UTC, e.g
result = converted.view(DT64NS_DTYPE)
result = converted.view(out_dtype)

elif inferred_tz:
tz = inferred_tz
result = converted.view(DT64NS_DTYPE)
result = converted.view(out_dtype)

else:
result, _ = _construct_from_dt64_naive(
Expand Down Expand Up @@ -2360,6 +2361,7 @@ def objects_to_datetime64ns(
utc: bool = False,
errors: DateTimeErrorChoices = "raise",
allow_object: bool = False,
out_unit: str = "ns",
):
"""
Convert data to array of timestamps.
Expand All @@ -2375,6 +2377,7 @@ def objects_to_datetime64ns(
allow_object : bool
Whether to return an object-dtype ndarray instead of raising if the
data contains more than one timezone.
out_unit : str, default "ns"
Returns
-------
Expand All @@ -2399,6 +2402,7 @@ def objects_to_datetime64ns(
utc=utc,
dayfirst=dayfirst,
yearfirst=yearfirst,
creso=abbrev_to_npy_unit(out_unit),
)

if tz_parsed is not None:
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/arrays/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
to_offset,
)
from pandas._libs.tslibs.conversion import precision_from_unit
from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit
from pandas._libs.tslibs.fields import (
get_timedelta_days,
get_timedelta_field,
Expand Down Expand Up @@ -1078,7 +1079,7 @@ def sequence_to_td64ns(
else:
mask = np.isnan(data)
# The next few lines are effectively a vectorized 'cast_from_unit'
m, p = precision_from_unit(unit or "ns")
m, p = precision_from_unit(abbrev_to_npy_unit(unit or "ns"))
with warnings.catch_warnings():
# Suppress RuntimeWarning about All-NaN slice
warnings.filterwarnings(
Expand Down
7 changes: 3 additions & 4 deletions pandas/core/dtypes/astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,11 +105,10 @@ def _astype_nansafe(
# then coerce to datetime64[ns] and use DatetimeArray.astype

if lib.is_np_dtype(dtype, "M"):
from pandas import to_datetime
from pandas.core.arrays import DatetimeArray

dti = to_datetime(arr.ravel())
dta = dti._data.reshape(arr.shape)
return dta.astype(dtype, copy=False)._ndarray
dta = DatetimeArray._from_sequence(arr, dtype=dtype)
return dta._ndarray

elif lib.is_np_dtype(dtype, "m"):
from pandas.core.construction import ensure_wrapped_if_datetimelike
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
timezones as libtimezones,
)
from pandas._libs.tslibs.conversion import precision_from_unit
from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit
from pandas._libs.tslibs.parsing import (
DateParseError,
guess_datetime_format,
Expand Down Expand Up @@ -550,7 +551,7 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index:
tz_parsed = None

elif arg.dtype.kind == "f":
mult, _ = precision_from_unit(unit)
mult, _ = precision_from_unit(abbrev_to_npy_unit(unit))

mask = np.isnan(arg) | (arg == iNaT)
fvalues = (arg * mult).astype("f8", copy=False)
Expand Down
7 changes: 6 additions & 1 deletion pandas/tests/frame/methods/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,12 @@ def test_astype_from_object_to_datetime_unit(self, unit):
["2017-01-01", "2017-01-02", "2017-02-03"],
]
df = DataFrame(vals, dtype=object)
with pytest.raises(TypeError, match="Cannot cast"):
msg = (
rf"Unexpected value for 'dtype': 'datetime64\[{unit}\]'. "
r"Must be 'datetime64\[s\]', 'datetime64\[ms\]', 'datetime64\[us\]', "
r"'datetime64\[ns\]' or DatetimeTZDtype"
)
with pytest.raises(ValueError, match=msg):
df.astype(f"M8[{unit}]")

@pytest.mark.parametrize("unit", ["Y", "M", "W", "D", "h", "m"])
Expand Down
Loading

0 comments on commit 2d2d67d

Please sign in to comment.