From 8e21e60c4aabe4f7a37fccd921375c65a6a3e514 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 7 Dec 2016 10:47:34 -0500 Subject: [PATCH] BUG: Prevent addition overflow with TimedeltaIndex Expands checked-add array addition introduced in gh-14237 to include all other addition cases (i.e. TimedeltaIndex and TimeDelta). Follow-up to gh-14453. --- asv_bench/benchmarks/algorithms.py | 15 ++++++++++++ pandas/core/nanops.py | 31 ++++++++++++++++++++----- pandas/tests/test_nanops.py | 27 +++++++++++++++++++++ pandas/tseries/base.py | 8 +++++-- pandas/tseries/tests/test_timedeltas.py | 14 +++++++++++ 5 files changed, 87 insertions(+), 8 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 53b7d55368f6a0..2f0c612f73fe26 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -24,6 +24,11 @@ def setup(self): self.arrneg = np.arange(-1000000, 0) self.arrmixed = np.array([1, -1]).repeat(500000) + self.arr_nan = np.random.choice([True, False], size=1000000) + self.arrpos_nan = np.random.choice([True, False], size=1000000) + self.arrneg_nan = np.random.choice([True, False], size=1000000) + self.arrmixed_nan = np.random.choice([True, False], size=1000000) + def time_int_factorize(self): self.int.factorize() @@ -57,6 +62,16 @@ def time_add_overflow_neg_arr(self): def time_add_overflow_mixed_arr(self): self.checked_add(self.arr, self.arrmixed) + def time_add_overflow_first_arg_nan(self): + self.checked_add(self.arr, self.arrmixed, arr_nans=self.arr_nan) + + def time_add_overflow_second_arg_nan(self): + self.checked_add(self.arr, self.arrmixed, b_nans=self.arrmixed_arr_nan) + + def time_add_overflow_both_arg_nan(self): + self.checked_add(self.arr, self.arrmixed, arr_nans=self.arr_nan, + b_nans=self.arrmixed_arr_nan) + class hashing(object): goal_time = 0.2 diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index d7d68ad536be50..ed81bd4a8d5519 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -812,15 +812,21 @@ def unique1d(values): return uniques -def _checked_add_with_arr(arr, b): +def _checked_add_with_arr(arr, b, arr_nans=None, b_nans=None): """ + Perform array addition that checks for underflow and overflow. + Performs the addition of an int64 array and an int64 integer (or array) - but checks that they do not result in overflow first. + but checks that they do not result in overflow first. For elements that + are indicated to be NaN, whether or not there is overflow for that element + is automatically ignored. Parameters ---------- arr : array addend. b : array or scalar addend. + arr_nans : array indicating which elements are NaN + b_nans : array or scalar indicating which elements are NaN Returns ------- @@ -843,6 +849,17 @@ def _checked_add_with_arr(arr, b): else: b2 = np.broadcast_to(b, arr.shape) + # For elements that are NaN, regardless of their value, we should + # ignore whether they overflow or not when doing the checked add. + if arr_nans is not None and b_nans is not None: + not_nan = np.logical_not(arr_nans | b_nans) + elif arr_nans is not None: + not_nan = np.logical_not(arr_nans) + elif b_nans is not None: + not_nan = np.logical_not(b_nans) + else: + not_nan = np.array([True]) + # gh-14324: For each element in 'arr' and its corresponding element # in 'b2', we check the sign of the element in 'b2'. If it is positive, # we then check whether its sum with the element in 'arr' exceeds @@ -854,12 +871,14 @@ def _checked_add_with_arr(arr, b): mask2 = b2 < 0 if not mask1.any(): - to_raise = (np.iinfo(np.int64).min - b2 > arr).any() + to_raise = ((np.iinfo(np.int64).min - b2 > arr) & not_nan).any() elif not mask2.any(): - to_raise = (np.iinfo(np.int64).max - b2 < arr).any() + to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any() else: - to_raise = ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]).any() or - (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]).any()) + to_raise = (((np.iinfo(np.int64).max - + b2[mask1] < arr[mask1]) & not_nan[mask1]).any() or + ((np.iinfo(np.int64).min - + b2[mask2] > arr[mask2]) & not_nan[mask2]).any()) if to_raise: raise OverflowError("Overflow in int64 addition") diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index be634228b1b6ea..625b2121ceca43 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -1018,11 +1018,38 @@ def test_int64_add_overflow(): nanops._checked_add_with_arr(np.array([n, n]), np.array([n, n])) with tm.assertRaisesRegexp(OverflowError, msg): nanops._checked_add_with_arr(np.array([m, n]), np.array([n, n])) + with tm.assertRaisesRegexp(OverflowError, msg): + nanops._checked_add_with_arr(np.array([m, m]), np.array([m, m]), + arr_nans=np.array([False, True])) + with tm.assertRaisesRegexp(OverflowError, msg): + nanops._checked_add_with_arr(np.array([m, m]), np.array([m, m]), + b_nans=np.array([False, True])) + with tm.assertRaisesRegexp(OverflowError, msg): + nanops._checked_add_with_arr(np.array([m, m]), np.array([m, m]), + arr_nans=np.array([False, True]), + b_nans=np.array([False, True])) with tm.assertRaisesRegexp(OverflowError, msg): with tm.assert_produces_warning(RuntimeWarning): nanops._checked_add_with_arr(np.array([m, m]), np.array([np.nan, m])) + # Check that the nan boolean arrays override whether or not + # the addition overflows. We don't check the result but just + # the fact that an OverflowError is not raised. + with tm.assertRaises(AssertionError): + with tm.assertRaisesRegexp(OverflowError, msg): + nanops._checked_add_with_arr(np.array([m, m]), np.array([m, m]), + arr_nans=np.array([True, True])) + with tm.assertRaises(AssertionError): + with tm.assertRaisesRegexp(OverflowError, msg): + nanops._checked_add_with_arr(np.array([m, m]), np.array([m, m]), + b_nans=np.array([True, True])) + with tm.assertRaises(AssertionError): + with tm.assertRaisesRegexp(OverflowError, msg): + nanops._checked_add_with_arr(np.array([m, m]), np.array([m, m]), + arr_nans=np.array([True, False]), + b_nans=np.array([False, True])) + if __name__ == '__main__': import nose diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index 4645ae24684ffb..76429ca49e72f7 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -17,6 +17,7 @@ from pandas.types.missing import isnull from pandas.core import common as com, algorithms from pandas.core.common import AbstractMethodError +from pandas.core.nanops import _checked_add_with_arr import pandas.formats.printing as printing import pandas.tslib as tslib @@ -684,7 +685,8 @@ def _add_delta_td(self, other): # return the i8 result view inc = tslib._delta_to_nanoseconds(other) - new_values = (self.asi8 + inc).view('i8') + new_values = _checked_add_with_arr(self.asi8, inc, + arr_nans=self._isnan).view('i8') if self.hasnans: new_values[self._isnan] = tslib.iNaT return new_values.view('i8') @@ -699,7 +701,9 @@ def _add_delta_tdi(self, other): self_i8 = self.asi8 other_i8 = other.asi8 - new_values = self_i8 + other_i8 + new_values = _checked_add_with_arr(self_i8, other_i8, + arr_nans=self._isnan, + b_nans=other._isnan) if self.hasnans or other.hasnans: mask = (self._isnan) | (other._isnan) new_values[mask] = tslib.iNaT diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index f0d14014d65597..7aaf690f0f024f 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -1964,6 +1964,20 @@ def test_add_overflow(self): with tm.assertRaisesRegexp(OverflowError, msg): Timestamp('2000') + to_timedelta([106580], 'D') + # These should not overflow! + exp = TimedeltaIndex([pd.NaT]) + result = to_timedelta([pd.NaT]) - Timedelta('1 days') + tm.assert_index_equal(result, exp) + + exp = TimedeltaIndex(['4 days', pd.NaT]) + result = to_timedelta(['5 days', pd.NaT]) - Timedelta('1 days') + tm.assert_index_equal(result, exp) + + exp = TimedeltaIndex([pd.NaT, pd.NaT, '5 hours']) + result = (to_timedelta([pd.NaT, '5 days', '1 hours']) + + to_timedelta(['7 seconds', pd.NaT, '4 hours'])) + tm.assert_index_equal(result, exp) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False)