From 59ba12ba56ba30639229556e886cca9b7096969f Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 7 Dec 2016 10:47:34 -0500 Subject: [PATCH] BUG: Prevent addition overflow with TimedeltaIndex Expands checked-add array addition introduced in gh-14237 to include all other addition cases (i.e. TimedeltaIndex and Timedelta). Follow-up to gh-14453. --- asv_bench/benchmarks/algorithms.py | 15 ++++- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/algorithms.py | 89 +++++++++++++++++++++++++ pandas/core/nanops.py | 55 --------------- pandas/tests/test_algos.py | 49 ++++++++++++++ pandas/tests/test_nanops.py | 22 ------ pandas/tseries/base.py | 8 ++- pandas/tseries/tdi.py | 4 +- pandas/tseries/tests/test_timedeltas.py | 14 ++++ 9 files changed, 175 insertions(+), 82 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index c4a6117c0704a6..2671bff9660ac1 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -17,13 +17,16 @@ def setup(self): self.float = pd.Float64Index(np.random.randn(N).repeat(5)) # Convenience naming. - self.checked_add = pd.core.nanops._checked_add_with_arr + self.checked_add = pd.core.algorithms.checked_add_with_arr self.arr = np.arange(1000000) self.arrpos = np.arange(1000000) self.arrneg = np.arange(-1000000, 0) self.arrmixed = np.array([1, -1]).repeat(500000) + self.arr_nan = np.random.choice([True, False], size=1000000) + self.arrmixed_nan = np.random.choice([True, False], size=1000000) + # match self.uniques = tm.makeStringIndex(1000).values self.all = self.uniques.repeat(10) @@ -64,6 +67,16 @@ def time_add_overflow_neg_arr(self): def time_add_overflow_mixed_arr(self): self.checked_add(self.arr, self.arrmixed) + def time_add_overflow_first_arg_nan(self): + self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan) + + def time_add_overflow_second_arg_nan(self): + self.checked_add(self.arr, self.arrmixed, b_mask=self.arrmixed_arr_nan) + + def time_add_overflow_both_arg_nan(self): + self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan, + b_mask=self.arrmixed_arr_nan) + class Hashing(object): goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 508093380ac811..7f3de710cb716c 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -145,6 +145,7 @@ Bug Fixes ~~~~~~~~~ - Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. Now raises error now with ``astype()`` for Series and DataFrames (:issue:`14265`) +- Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index effca6398419e7..0bebc13b226e08 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -27,6 +27,7 @@ _ensure_float64, _ensure_int64, is_list_like) +from pandas.compat.numpy import _np_version_under1p10 from pandas.types.missing import isnull import pandas.core.common as com @@ -40,6 +41,94 @@ # top-level algos # # --------------- # +def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None): + """ + Perform array addition that checks for underflow and overflow. + + Performs the addition of an int64 array and an int64 integer (or array) + but checks that they do not result in overflow first. For elements that + are indicated to be NaN, whether or not there is overflow for that element + is automatically ignored. + + Parameters + ---------- + arr : array addend. + b : array or scalar addend. + arr_mask : boolean array or None + array indicating which elements to exclude from checking + b_mask : boolean array or boolean or None + array or scalar indicating which element(s) to exclude from checking + + Returns + ------- + sum : An array for elements x + b for each element x in arr if b is + a scalar or an array for elements x + y for each element pair + (x, y) in (arr, b). + + Raises + ------ + OverflowError if any x + y exceeds the maximum or minimum int64 value. + """ + def _broadcast(arr_or_scalar, shape): + """ + Helper function to broadcast arrays / scalars to the desired shape. + + This function is compatible with different versions of NumPy and is + implemented for performance reasons. + """ + if _np_version_under1p10: + if lib.isscalar(arr_or_scalar): + out = np.empty(shape) + out.fill(arr_or_scalar) + else: + out = arr_or_scalar + else: + out = np.broadcast_to(arr_or_scalar, shape) + return out + + b2 = _broadcast(b, arr.shape) + if b_mask is not None: + b2_mask = _broadcast(b_mask, arr.shape) + else: + b2_mask = None + + # For elements that are NaN, regardless of their value, we should + # ignore whether they overflow or not when doing the checked add. + if arr_mask is not None and b2_mask is not None: + not_nan = np.logical_not(arr_mask | b2_mask) + elif arr_mask is not None: + not_nan = np.logical_not(arr_mask) + elif b_mask is not None: + not_nan = np.logical_not(b2_mask) + else: + not_nan = np.empty(arr.shape, dtype=bool) + not_nan.fill(True) + + # gh-14324: For each element in 'arr' and its corresponding element + # in 'b2', we check the sign of the element in 'b2'. If it is positive, + # we then check whether its sum with the element in 'arr' exceeds + # np.iinfo(np.int64).max. If so, we have an overflow error. If it + # it is negative, we then check whether its sum with the element in + # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow + # error as well. + mask1 = b2 > 0 + mask2 = b2 < 0 + + if not mask1.any(): + to_raise = ((np.iinfo(np.int64).min - b2 > arr) & not_nan).any() + elif not mask2.any(): + to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any() + else: + to_raise = (((np.iinfo(np.int64).max - + b2[mask1] < arr[mask1]) & not_nan[mask1]).any() or + ((np.iinfo(np.int64).min - + b2[mask2] > arr[mask2]) & not_nan[mask2]).any()) + + if to_raise: + raise OverflowError("Overflow in int64 addition") + return arr + b + + def match(to_match, values, na_sentinel=-1): """ Compute locations of to_match into values diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index d7d68ad536be50..a76e348b7dee2e 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -11,7 +11,6 @@ import pandas.hashtable as _hash from pandas import compat, lib, algos, tslib -from pandas.compat.numpy import _np_version_under1p10 from pandas.types.common import (_ensure_int64, _ensure_object, _ensure_float64, _get_dtype, is_float, is_scalar, @@ -810,57 +809,3 @@ def unique1d(values): table = _hash.PyObjectHashTable(len(values)) uniques = table.unique(_ensure_object(values)) return uniques - - -def _checked_add_with_arr(arr, b): - """ - Performs the addition of an int64 array and an int64 integer (or array) - but checks that they do not result in overflow first. - - Parameters - ---------- - arr : array addend. - b : array or scalar addend. - - Returns - ------- - sum : An array for elements x + b for each element x in arr if b is - a scalar or an array for elements x + y for each element pair - (x, y) in (arr, b). - - Raises - ------ - OverflowError if any x + y exceeds the maximum or minimum int64 value. - """ - # For performance reasons, we broadcast 'b' to the new array 'b2' - # so that it has the same size as 'arr'. - if _np_version_under1p10: - if lib.isscalar(b): - b2 = np.empty(arr.shape) - b2.fill(b) - else: - b2 = b - else: - b2 = np.broadcast_to(b, arr.shape) - - # gh-14324: For each element in 'arr' and its corresponding element - # in 'b2', we check the sign of the element in 'b2'. If it is positive, - # we then check whether its sum with the element in 'arr' exceeds - # np.iinfo(np.int64).max. If so, we have an overflow error. If it - # it is negative, we then check whether its sum with the element in - # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow - # error as well. - mask1 = b2 > 0 - mask2 = b2 < 0 - - if not mask1.any(): - to_raise = (np.iinfo(np.int64).min - b2 > arr).any() - elif not mask2.any(): - to_raise = (np.iinfo(np.int64).max - b2 < arr).any() - else: - to_raise = ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]).any() or - (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]).any()) - - if to_raise: - raise OverflowError("Overflow in int64 addition") - return arr + b diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index f89f41abd0d352..d0c909b9c1b30a 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1129,6 +1129,55 @@ def test_ensure_platform_int(): assert (result is arr) +def test_int64_add_overflow(): + # see gh-14068 + msg = "Overflow in int64 addition" + m = np.iinfo(np.int64).max + n = np.iinfo(np.int64).min + + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([m, m]), m) + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([m, m]), np.array([m, m])) + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([n, n]), n) + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([n, n]), np.array([n, n])) + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([m, n]), np.array([n, n])) + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), + arr_mask=np.array([False, True])) + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), + b_mask=np.array([False, True])) + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), + arr_mask=np.array([False, True]), + b_mask=np.array([False, True])) + with tm.assertRaisesRegexp(OverflowError, msg): + with tm.assert_produces_warning(RuntimeWarning): + algos.checked_add_with_arr(np.array([m, m]), + np.array([np.nan, m])) + + # Check that the nan boolean arrays override whether or not + # the addition overflows. We don't check the result but just + # the fact that an OverflowError is not raised. + with tm.assertRaises(AssertionError): + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), + arr_mask=np.array([True, True])) + with tm.assertRaises(AssertionError): + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), + b_mask=np.array([True, True])) + with tm.assertRaises(AssertionError): + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), + arr_mask=np.array([True, False]), + b_mask=np.array([False, True])) + + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index be634228b1b6ea..dd3a49de55d73d 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -1002,28 +1002,6 @@ def prng(self): return np.random.RandomState(1234) -def test_int64_add_overflow(): - # see gh-14068 - msg = "Overflow in int64 addition" - m = np.iinfo(np.int64).max - n = np.iinfo(np.int64).min - - with tm.assertRaisesRegexp(OverflowError, msg): - nanops._checked_add_with_arr(np.array([m, m]), m) - with tm.assertRaisesRegexp(OverflowError, msg): - nanops._checked_add_with_arr(np.array([m, m]), np.array([m, m])) - with tm.assertRaisesRegexp(OverflowError, msg): - nanops._checked_add_with_arr(np.array([n, n]), n) - with tm.assertRaisesRegexp(OverflowError, msg): - nanops._checked_add_with_arr(np.array([n, n]), np.array([n, n])) - with tm.assertRaisesRegexp(OverflowError, msg): - nanops._checked_add_with_arr(np.array([m, n]), np.array([n, n])) - with tm.assertRaisesRegexp(OverflowError, msg): - with tm.assert_produces_warning(RuntimeWarning): - nanops._checked_add_with_arr(np.array([m, m]), - np.array([np.nan, m])) - - if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s' diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index 4645ae24684ffb..cd8dc80e1d8035 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -16,6 +16,7 @@ ABCPeriodIndex, ABCIndexClass) from pandas.types.missing import isnull from pandas.core import common as com, algorithms +from pandas.core.algorithms import checked_add_with_arr from pandas.core.common import AbstractMethodError import pandas.formats.printing as printing @@ -684,7 +685,8 @@ def _add_delta_td(self, other): # return the i8 result view inc = tslib._delta_to_nanoseconds(other) - new_values = (self.asi8 + inc).view('i8') + new_values = checked_add_with_arr(self.asi8, inc, + arr_mask=self._isnan).view('i8') if self.hasnans: new_values[self._isnan] = tslib.iNaT return new_values.view('i8') @@ -699,7 +701,9 @@ def _add_delta_tdi(self, other): self_i8 = self.asi8 other_i8 = other.asi8 - new_values = self_i8 + other_i8 + new_values = checked_add_with_arr(self_i8, other_i8, + arr_mask=self._isnan, + b_mask=other._isnan) if self.hasnans or other.hasnans: mask = (self._isnan) | (other._isnan) new_values[mask] = tslib.iNaT diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index 7e77d8baf3b2c7..1585aac0c8eada 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -20,8 +20,8 @@ import pandas.compat as compat from pandas.compat import u from pandas.tseries.frequencies import to_offset +from pandas.core.algorithms import checked_add_with_arr from pandas.core.base import _shared_docs -from pandas.core.nanops import _checked_add_with_arr from pandas.indexes.base import _index_shared_docs import pandas.core.common as com import pandas.types.concat as _concat @@ -347,7 +347,7 @@ def _add_datelike(self, other): else: other = Timestamp(other) i8 = self.asi8 - result = _checked_add_with_arr(i8, other.value) + result = checked_add_with_arr(i8, other.value) result = self._maybe_mask_results(result, fill_value=tslib.iNaT) return DatetimeIndex(result, name=self.name, copy=False) diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index f0d14014d65597..7aaf690f0f024f 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -1964,6 +1964,20 @@ def test_add_overflow(self): with tm.assertRaisesRegexp(OverflowError, msg): Timestamp('2000') + to_timedelta([106580], 'D') + # These should not overflow! + exp = TimedeltaIndex([pd.NaT]) + result = to_timedelta([pd.NaT]) - Timedelta('1 days') + tm.assert_index_equal(result, exp) + + exp = TimedeltaIndex(['4 days', pd.NaT]) + result = to_timedelta(['5 days', pd.NaT]) - Timedelta('1 days') + tm.assert_index_equal(result, exp) + + exp = TimedeltaIndex([pd.NaT, pd.NaT, '5 hours']) + result = (to_timedelta([pd.NaT, '5 days', '1 hours']) + + to_timedelta(['7 seconds', pd.NaT, '4 hours'])) + tm.assert_index_equal(result, exp) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False)