From 486bbf1c17d1ac0d64bf78d559a82df6b5e7b110 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 12 Sep 2023 16:30:30 -0400 Subject: [PATCH 1/5] first round of fixes --- .../src/vendored/ujson/lib/ultrajsonenc.c | 7 ++- pandas/_libs/tslibs/np_datetime.pyx | 62 ++++++++++++++----- 2 files changed, 53 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c index e3e710ce1b876..e3b4f76d573ff 100644 --- a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c @@ -763,7 +763,12 @@ void Buffer_AppendIntUnchecked(JSONObjectEncoder *enc, JSINT32 value) { void Buffer_AppendLongUnchecked(JSONObjectEncoder *enc, JSINT64 value) { char *wstr; - JSUINT64 uvalue = (value < 0) ? -value : value; + JSUINT64 uvalue; + if (value == INT64_MIN) { + uvalue = INT64_MAX + (uint64_t)1; + } else { + uvalue = (value < 0) ? -value : value; + } wstr = enc->offset; // Conversion. Number is reversed. diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 7b2ee68c73ad2..1a4a76a83e6fe 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -1,4 +1,3 @@ -cimport cython from cpython.datetime cimport ( PyDateTime_CheckExact, PyDateTime_DATE_GET_HOUR, @@ -18,6 +17,7 @@ from cpython.object cimport ( Py_LT, Py_NE, ) +from libc.stdint cimport INT64_MAX import_datetime() PandasDateTime_IMPORT @@ -545,7 +545,6 @@ cdef ndarray astype_round_check( return iresult -@cython.overflowcheck(True) cdef int64_t get_conversion_factor( NPY_DATETIMEUNIT from_unit, NPY_DATETIMEUNIT to_unit @@ -553,6 +552,7 @@ cdef int64_t get_conversion_factor( """ Find the factor by which we need to multiply to convert from from_unit to to_unit. """ + cdef int64_t value if ( from_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC or to_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC @@ -565,25 +565,55 @@ cdef int64_t get_conversion_factor( return 1 if from_unit == NPY_DATETIMEUNIT.NPY_FR_W: - return 7 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_D, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_D, to_unit) + if INT64_MAX // 7 < value: + raise OverflowError("result would overflow") + return 7 * value elif from_unit == NPY_DATETIMEUNIT.NPY_FR_D: - return 24 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_h, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_h, to_unit) + if INT64_MAX // 24 < value: + raise OverflowError("result would overflow") + return 24 * value elif from_unit == NPY_DATETIMEUNIT.NPY_FR_h: - return 60 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_m, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_m, to_unit) + if INT64_MAX // 60 < value: + raise OverflowError("result would overflow") + return 60 * value elif from_unit == NPY_DATETIMEUNIT.NPY_FR_m: - return 60 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_s, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_s, to_unit) + if INT64_MAX // 60 < value: + raise OverflowError("result would overflow") + return 60 * value elif from_unit == NPY_DATETIMEUNIT.NPY_FR_s: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ms, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ms, to_unit) + if INT64_MAX // 1000 < value: + raise OverflowError("result would overflow") + return 1000 * value elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ms: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_us, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_us, to_unit) + if INT64_MAX // 1000 < value: + raise OverflowError("result would overflow") + return 1000 * value elif from_unit == NPY_DATETIMEUNIT.NPY_FR_us: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ns, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ns, to_unit) + if INT64_MAX // 1000 < value: + raise OverflowError("result would overflow") + return 1000 * value elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ns: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ps, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ps, to_unit) + if INT64_MAX // 1000 < value: + raise OverflowError("result would overflow") + return 1000 * value elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ps: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_fs, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_fs, to_unit) + if INT64_MAX // 1000 < value: + raise OverflowError("result would overflow") + return 1000 * value elif from_unit == NPY_DATETIMEUNIT.NPY_FR_fs: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_as, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_as, to_unit) + if INT64_MAX // 1000 < value: + raise OverflowError("result would overflow") + return 1000 * value else: raise ValueError("Converting from M or Y units is not supported.") @@ -624,9 +654,11 @@ cdef int64_t convert_reso( else: # e.g. ns -> us, risk of overflow, but no risk of lossy rounding mult = get_conversion_factor(from_reso, to_reso) - with cython.overflowcheck(True): - # Note: caller is responsible for re-raising as OutOfBoundsTimedelta - res_value = value * mult + if INT64_MAX // mult < value: + raise OverflowError("result would overflow") + + # Note: caller is responsible for re-raising as OutOfBoundsTimedelta + res_value = value * mult return res_value From 6a20d07a037148d10c4eeda31a4fbe74941b3f40 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 12 Sep 2023 17:40:41 -0400 Subject: [PATCH 2/5] fix up includes --- pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c index e3b4f76d573ff..942bd0b518144 100644 --- a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c @@ -44,6 +44,7 @@ Numeric decoder derived from TCL library #include #include #include +#include #include #include #include @@ -765,7 +766,7 @@ void Buffer_AppendLongUnchecked(JSONObjectEncoder *enc, JSINT64 value) { char *wstr; JSUINT64 uvalue; if (value == INT64_MIN) { - uvalue = INT64_MAX + (uint64_t)1; + uvalue = INT64_MAX + UINT64_C(1); } else { uvalue = (value < 0) ? -value : value; } From 0dd45f05286fe262af3207b0659a4137d019a84f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 12 Sep 2023 17:56:51 -0400 Subject: [PATCH 3/5] updates --- pandas/_libs/tslibs/np_datetime.pyx | 37 +++++++++++++++++++---------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 1a4a76a83e6fe..cc18c09ae4379 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -552,7 +552,7 @@ cdef int64_t get_conversion_factor( """ Find the factor by which we need to multiply to convert from from_unit to to_unit. """ - cdef int64_t value + cdef int64_t value, overflow_limit if ( from_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC or to_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC @@ -566,52 +566,62 @@ cdef int64_t get_conversion_factor( if from_unit == NPY_DATETIMEUNIT.NPY_FR_W: value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_D, to_unit) - if INT64_MAX // 7 < value: + overflow_limit = INT64_MAX // 7 + if value > overflow_limit or value < -overflow_limit: raise OverflowError("result would overflow") return 7 * value elif from_unit == NPY_DATETIMEUNIT.NPY_FR_D: value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_h, to_unit) - if INT64_MAX // 24 < value: + overflow_limit = INT64_MAX // 24 + if value > overflow_limit or value < -overflow_limit: raise OverflowError("result would overflow") return 24 * value elif from_unit == NPY_DATETIMEUNIT.NPY_FR_h: value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_m, to_unit) - if INT64_MAX // 60 < value: + overflow_limit = INT64_MAX // 60 + if value > overflow_limit or value < -overflow_limit: raise OverflowError("result would overflow") return 60 * value elif from_unit == NPY_DATETIMEUNIT.NPY_FR_m: value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_s, to_unit) - if INT64_MAX // 60 < value: + overflow_limit = INT64_MAX // 60 + if value > overflow_limit or value < -overflow_limit: raise OverflowError("result would overflow") return 60 * value elif from_unit == NPY_DATETIMEUNIT.NPY_FR_s: value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ms, to_unit) - if INT64_MAX // 1000 < value: + overflow_limit = INT64_MAX // 1000 + if value > overflow_limit or value < -overflow_limit: raise OverflowError("result would overflow") return 1000 * value elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ms: value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_us, to_unit) - if INT64_MAX // 1000 < value: + overflow_limit = INT64_MAX // 1000 + if value > overflow_limit or value < -overflow_limit: raise OverflowError("result would overflow") return 1000 * value elif from_unit == NPY_DATETIMEUNIT.NPY_FR_us: value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ns, to_unit) - if INT64_MAX // 1000 < value: + overflow_limit = INT64_MAX // 1000 + if value > overflow_limit or value < -overflow_limit: raise OverflowError("result would overflow") return 1000 * value elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ns: value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ps, to_unit) - if INT64_MAX // 1000 < value: + overflow_limit = INT64_MAX // 1000 + if value > overflow_limit or value < -overflow_limit: raise OverflowError("result would overflow") return 1000 * value elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ps: value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_fs, to_unit) - if INT64_MAX // 1000 < value: + overflow_limit = INT64_MAX // 1000 + if value > overflow_limit or value < -overflow_limit: raise OverflowError("result would overflow") return 1000 * value elif from_unit == NPY_DATETIMEUNIT.NPY_FR_fs: value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_as, to_unit) - if INT64_MAX // 1000 < value: + overflow_limit = INT64_MAX // 1000 + if value > overflow_limit or value < -overflow_limit: raise OverflowError("result would overflow") return 1000 * value else: @@ -625,7 +635,7 @@ cdef int64_t convert_reso( bint round_ok, ) except? -1: cdef: - int64_t res_value, mult, div, mod + int64_t res_value, mult, div, mod, overflow_limit if from_reso == to_reso: return value @@ -654,7 +664,8 @@ cdef int64_t convert_reso( else: # e.g. ns -> us, risk of overflow, but no risk of lossy rounding mult = get_conversion_factor(from_reso, to_reso) - if INT64_MAX // mult < value: + overflow_limit = INT64_MAX // mult + if value > overflow_limit or value < -overflow_limit: raise OverflowError("result would overflow") # Note: caller is responsible for re-raising as OutOfBoundsTimedelta From 44f97908afb1f25fb3dc142865044e21acb615e4 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 13 Sep 2023 13:47:22 -0400 Subject: [PATCH 4/5] dedup logic --- pandas/_libs/tslibs/np_datetime.pyx | 58 +++++++++-------------------- 1 file changed, 17 insertions(+), 41 deletions(-) diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index cc18c09ae4379..c3d401474d9b7 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -552,7 +552,7 @@ cdef int64_t get_conversion_factor( """ Find the factor by which we need to multiply to convert from from_unit to to_unit. """ - cdef int64_t value, overflow_limit + cdef int64_t value, overflow_limit, factor if ( from_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC or to_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC @@ -566,67 +566,43 @@ cdef int64_t get_conversion_factor( if from_unit == NPY_DATETIMEUNIT.NPY_FR_W: value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_D, to_unit) - overflow_limit = INT64_MAX // 7 - if value > overflow_limit or value < -overflow_limit: - raise OverflowError("result would overflow") - return 7 * value + factor = 7 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_D: value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_h, to_unit) - overflow_limit = INT64_MAX // 24 - if value > overflow_limit or value < -overflow_limit: - raise OverflowError("result would overflow") - return 24 * value + factor = 24 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_h: value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_m, to_unit) - overflow_limit = INT64_MAX // 60 - if value > overflow_limit or value < -overflow_limit: - raise OverflowError("result would overflow") - return 60 * value + factor = 60 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_m: value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_s, to_unit) - overflow_limit = INT64_MAX // 60 - if value > overflow_limit or value < -overflow_limit: - raise OverflowError("result would overflow") - return 60 * value + factor = 60 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_s: value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ms, to_unit) - overflow_limit = INT64_MAX // 1000 - if value > overflow_limit or value < -overflow_limit: - raise OverflowError("result would overflow") - return 1000 * value + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ms: value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_us, to_unit) - overflow_limit = INT64_MAX // 1000 - if value > overflow_limit or value < -overflow_limit: - raise OverflowError("result would overflow") - return 1000 * value + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_us: value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ns, to_unit) - overflow_limit = INT64_MAX // 1000 - if value > overflow_limit or value < -overflow_limit: - raise OverflowError("result would overflow") - return 1000 * value + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ns: value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ps, to_unit) - overflow_limit = INT64_MAX // 1000 - if value > overflow_limit or value < -overflow_limit: - raise OverflowError("result would overflow") - return 1000 * value + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ps: value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_fs, to_unit) - overflow_limit = INT64_MAX // 1000 - if value > overflow_limit or value < -overflow_limit: - raise OverflowError("result would overflow") - return 1000 * value + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_fs: value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_as, to_unit) - overflow_limit = INT64_MAX // 1000 - if value > overflow_limit or value < -overflow_limit: - raise OverflowError("result would overflow") - return 1000 * value + factor = 1000 else: raise ValueError("Converting from M or Y units is not supported.") + overflow_limit = INT64_MAX // factor + if value > overflow_limit or value < -overflow_limit: + raise OverflowError("result would overflow") + + return factor * value + cdef int64_t convert_reso( int64_t value, From 24228ba8f6cc555f66f74e42baf9a75b40ffb4b8 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 13 Sep 2023 16:01:44 -0400 Subject: [PATCH 5/5] move comment --- pandas/_libs/tslibs/np_datetime.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index c3d401474d9b7..c3ee68e14a8d4 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -642,9 +642,9 @@ cdef int64_t convert_reso( mult = get_conversion_factor(from_reso, to_reso) overflow_limit = INT64_MAX // mult if value > overflow_limit or value < -overflow_limit: + # Note: caller is responsible for re-raising as OutOfBoundsTimedelta raise OverflowError("result would overflow") - # Note: caller is responsible for re-raising as OutOfBoundsTimedelta res_value = value * mult return res_value