Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: nanoseconds and reso in dateutil paths #56051

Merged
merged 2 commits into from
Nov 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,8 @@ Datetimelike
- Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond :class:`DatetimeTZDtype` and inputs that would be out of bounds with nanosecond resolution incorrectly raising ``OutOfBoundsDatetime`` (:issue:`54620`)
- Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` (or :class:`DatetimeTZDtype`) from mixed-numeric inputs treating those as nanoseconds instead of as multiples of the dtype's unit (which would happen with non-mixed numeric inputs) (:issue:`56004`)
- Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`)
- Bug in parsing datetime strings with nanosecond resolution with non-ISO8601 formats incorrectly truncating sub-microsecond components (:issue:`56051`)
- Bug in parsing datetime strings with sub-second resolution and trailing zeros incorrectly inferring second or millisecond resolution (:issue:`55737`)
-

Timedelta
Expand Down
10 changes: 7 additions & 3 deletions pandas/_libs/tslibs/conversion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,7 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz,
npy_datetimestruct dts
int out_local = 0, out_tzoffset = 0, string_to_dts_failed
datetime dt
int64_t ival
int64_t ival, nanos = 0
NPY_DATETIMEUNIT out_bestunit, reso
_TSObject obj

Expand Down Expand Up @@ -560,10 +560,14 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz,
return obj

dt = parse_datetime_string(
ts, dayfirst=dayfirst, yearfirst=yearfirst, out_bestunit=&out_bestunit
ts,
dayfirst=dayfirst,
yearfirst=yearfirst,
out_bestunit=&out_bestunit,
nanos=&nanos,
)
reso = get_supported_reso(out_bestunit)
return convert_datetime_to_tsobject(dt, tz, nanos=0, reso=reso)
return convert_datetime_to_tsobject(dt, tz, nanos=nanos, reso=reso)

return convert_datetime_to_tsobject(dt, tz)

Expand Down
4 changes: 3 additions & 1 deletion pandas/_libs/tslibs/parsing.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from cpython.datetime cimport datetime
from numpy cimport int64_t

from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT

Expand All @@ -10,5 +11,6 @@ cdef datetime parse_datetime_string(
str date_string,
bint dayfirst,
bint yearfirst,
NPY_DATETIMEUNIT* out_bestunit
NPY_DATETIMEUNIT* out_bestunit,
int64_t* nanos,
)
58 changes: 47 additions & 11 deletions pandas/_libs/tslibs/parsing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ from numpy cimport (
PyArray_IterNew,
flatiter,
float64_t,
int64_t,
)

cnp.import_array()
Expand Down Expand Up @@ -272,8 +273,11 @@ def py_parse_datetime_string(
# parse_datetime_string cpdef bc it has a pointer argument)
cdef:
NPY_DATETIMEUNIT out_bestunit
int64_t nanos

return parse_datetime_string(date_string, dayfirst, yearfirst, &out_bestunit)
return parse_datetime_string(
date_string, dayfirst, yearfirst, &out_bestunit, &nanos
)


cdef datetime parse_datetime_string(
Expand All @@ -283,7 +287,8 @@ cdef datetime parse_datetime_string(
str date_string,
bint dayfirst,
bint yearfirst,
NPY_DATETIMEUNIT* out_bestunit
NPY_DATETIMEUNIT* out_bestunit,
int64_t* nanos,
):
"""
Parse datetime string, only returns datetime.
Expand Down Expand Up @@ -311,7 +316,7 @@ cdef datetime parse_datetime_string(
default = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
dt = dateutil_parse(date_string, default=default,
dayfirst=dayfirst, yearfirst=yearfirst,
ignoretz=False, out_bestunit=out_bestunit)
ignoretz=False, out_bestunit=out_bestunit, nanos=nanos)
return dt

dt = _parse_delimited_date(date_string, dayfirst, out_bestunit)
Expand All @@ -330,7 +335,7 @@ cdef datetime parse_datetime_string(

dt = dateutil_parse(date_string, default=_DEFAULT_DATETIME,
dayfirst=dayfirst, yearfirst=yearfirst,
ignoretz=False, out_bestunit=out_bestunit)
ignoretz=False, out_bestunit=out_bestunit, nanos=nanos)
return dt


Expand Down Expand Up @@ -436,7 +441,7 @@ def parse_datetime_string_with_reso(

parsed = dateutil_parse(date_string, _DEFAULT_DATETIME,
dayfirst=dayfirst, yearfirst=yearfirst,
ignoretz=False, out_bestunit=&out_bestunit)
ignoretz=False, out_bestunit=&out_bestunit, nanos=NULL)
reso = npy_unit_to_attrname[out_bestunit]
return parsed, reso

Expand Down Expand Up @@ -639,7 +644,8 @@ cdef datetime dateutil_parse(
bint ignoretz,
bint dayfirst,
bint yearfirst,
NPY_DATETIMEUNIT* out_bestunit
NPY_DATETIMEUNIT* out_bestunit,
int64_t* nanos,
):
""" lifted from dateutil to get resolution"""

Expand Down Expand Up @@ -671,11 +677,8 @@ cdef datetime dateutil_parse(
if reso is None:
raise DateParseError(f"Unable to parse datetime string: {timestr}")

if reso == "microsecond":
if repl["microsecond"] == 0:
reso = "second"
elif repl["microsecond"] % 1000 == 0:
reso = "millisecond"
if reso == "microsecond" and repl["microsecond"] % 1000 == 0:
reso = _find_subsecond_reso(timestr, nanos=nanos)

try:
ret = default.replace(**repl)
Expand Down Expand Up @@ -745,6 +748,38 @@ cdef datetime dateutil_parse(
return ret


# Matches a clock time written as "<digit>:MM:SS.<digits>" anywhere in the
# string: one digit (the last digit of the hour field), a colon, two digits,
# a colon, two digits, a literal dot, then one or more digits captured in the
# named group "frac".  Strings whose time part is not colon-separated in this
# way do not match.
cdef object _reso_pattern = re.compile(r"\d:\d{2}:\d{2}\.(?P<frac>\d+)")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are the first \ds always guaranteed to be separated by :?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dateutil supports some really weird formats (@MarcoGorelli and I have discussed moving away from using it at all), so I don't know. But I think this covers the vast majority of cases we care about.


cdef _find_subsecond_reso(str timestr, int64_t* nanos):
    """
    Infer the sub-second resolution from the fractional digits as written.

    GH#55737: the resolution is decided by how many digits follow the
    decimal point in an H:M:S.f pattern, so trailing zeros in the input
    string still count towards the resolution.

    Parameters
    ----------
    timestr : str
        The datetime string being parsed.
    nanos : int64_t*
        Output slot for the sub-microsecond component (0-999).  May be NULL,
        in which case nothing is written.  It is only written when the
        string has more than six fractional digits and the digits past the
        sixth are not all zeros; otherwise it is left untouched.

    Returns
    -------
    str
        One of "second", "millisecond", "microsecond" or "nanosecond".
    """
    found = _reso_pattern.search(timestr)
    if found is None:
        # No H:M:S.f pattern in the string.
        return "second"

    digits = found.group("frac")
    ndigits = len(digits)

    if ndigits <= 3:
        return "millisecond"
    if ndigits <= 6:
        return "microsecond"

    # More than six fractional digits: always nanosecond resolution.
    tail = digits[6:]
    if tail.count("0") == len(tail):
        # corner case where we haven't lost any data
        return "nanosecond"

    if nanos is not NULL:
        # Keep exactly three digits past the microseconds: right-pad with
        # zeros when 7 or 8 digits were given, and drop everything beyond
        # the ninth digit otherwise.
        # TODO: should we warn/raise in higher-than-nano cases?
        nanos[0] = int(tail[:3].ljust(3, "0"))
    return "nanosecond"


# ----------------------------------------------------------------------
# Parsing for type-inference

Expand Down Expand Up @@ -916,6 +951,7 @@ def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None:
yearfirst=False,
ignoretz=False,
out_bestunit=&out_bestunit,
nanos=NULL,
)
except (ValueError, OverflowError, InvalidOperation):
# In case the datetime can't be parsed, its format cannot be guessed
Expand Down
12 changes: 12 additions & 0 deletions pandas/tests/scalar/timestamp/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,18 @@ def test_constructor_str_infer_reso(self):
ts = Timestamp("300 June 1:30:01.300")
assert ts.unit == "ms"

# dateutil path -> don't drop trailing zeros
ts = Timestamp("01-01-2013T00:00:00.000000000+0000")
assert ts.unit == "ns"

ts = Timestamp("2016/01/02 03:04:05.001000 UTC")
assert ts.unit == "us"

# higher-than-nanosecond -> we drop the trailing bits
ts = Timestamp("01-01-2013T00:00:00.000000002100+0000")
assert ts == Timestamp("01-01-2013T00:00:00.000000002+0000")
assert ts.unit == "ns"


class TestTimestampConstructors:
def test_weekday_but_no_day_raises(self):
Expand Down
Loading