BUG: Prevent addition overflow with TimedeltaIndex

Expands checked-add array addition introduced in pandas-devgh-14237 to include all other addition cases (i.e. TimedeltaIndex and Timedelta). Follow-up to pandas-devgh-14453.
forking-repos · Dec 7, 2016 · 46909f0 · 46909f0
1 parent 2466ecb
commit 46909f0
Show file tree

Hide file tree

Showing 5 changed files with 87 additions and 8 deletions.
diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
@@ -24,6 +24,11 @@ def setup(self):
         self.arrneg = np.arange(-1000000, 0)
         self.arrmixed = np.array([1, -1]).repeat(500000)
 
+        self.arr_nan = np.random.choice([True, False], size=1000000)
+        self.arrpos_nan = np.random.choice([True, False], size=1000000)
+        self.arrneg_nan = np.random.choice([True, False], size=1000000)
+        self.arrmixed_nan = np.random.choice([True, False], size=1000000)
+
     def time_int_factorize(self):
         self.int.factorize()
 
@@ -57,6 +62,16 @@ def time_add_overflow_neg_arr(self):
     def time_add_overflow_mixed_arr(self):
         self.checked_add(self.arr, self.arrmixed)
 
+    def time_add_overflow_first_arg_nan(self):
+        self.checked_add(self.arr, self.arrmixed, arr_nans=self.arr_nan)
+
+    def time_add_overflow_second_arg_nan(self):
+        self.checked_add(self.arr, self.arrmixed, b_nans=self.arrmixed_arr_nan)
+
+    def time_add_overflow_both_arg_nan(self):
+        self.checked_add(self.arr, self.arrmixed, arr_nans=self.arr_nan,
+                         b_nans=self.arrmixed_arr_nan)
+
 
 class hashing(object):
     goal_time = 0.2

diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
@@ -812,15 +812,21 @@ def unique1d(values):
     return uniques
 
 
-def _checked_add_with_arr(arr, b):
+def _checked_add_with_arr(arr, b, arr_nans=None, b_nans=None):
     """
+    Perform array addition that checks for underflow and overflow.
+
     Performs the addition of an int64 array and an int64 integer (or array)
-    but checks that they do not result in overflow first.
+    but checks that they do not result in overflow first. For elements that
+    are indicated to be NaN, whether or not there is overflow for that element
+    is automatically ignored.
 
     Parameters
     ----------
     arr : array addend.
     b : array or scalar addend.
+    arr_nans : array indicating which elements are NaN
+    b_nans : array or scalar indicating which elements are NaN
 
     Returns
     -------
@@ -843,6 +849,17 @@ def _checked_add_with_arr(arr, b):
     else:
         b2 = np.broadcast_to(b, arr.shape)
 
+    # For elements that are NaN, regardless of their value, we should
+    # ignore whether they overflow or not when doing the checked add.
+    if arr_nans is not None and b_nans is not None:
+        not_nan = np.logical_not(arr_nans | b_nans)
+    elif arr_nans is not None:
+        not_nan = np.logical_not(arr_nans)
+    elif b_nans is not None:
+        not_nan = np.logical_not(b_nans)
+    else:
+        not_nan = np.array([True])
+
     # gh-14324: For each element in 'arr' and its corresponding element
     # in 'b2', we check the sign of the element in 'b2'. If it is positive,
     # we then check whether its sum with the element in 'arr' exceeds
@@ -854,12 +871,14 @@ def _checked_add_with_arr(arr, b):
     mask2 = b2 < 0
 
     if not mask1.any():
-        to_raise = (np.iinfo(np.int64).min - b2 > arr).any()
+        to_raise = ((np.iinfo(np.int64).min - b2 > arr) & not_nan).any()
     elif not mask2.any():
-        to_raise = (np.iinfo(np.int64).max - b2 < arr).any()
+        to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any()
     else:
-        to_raise = ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]).any() or
-                    (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]).any())
+        to_raise = (((np.iinfo(np.int64).max -
+                      b2[mask1] < arr[mask1]) & not_nan[mask1]).any() or
+                    ((np.iinfo(np.int64).min -
+                      b2[mask2] > arr[mask2]) & not_nan[mask2]).any())
 
     if to_raise:
         raise OverflowError("Overflow in int64 addition")

diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
@@ -1018,11 +1018,38 @@ def test_int64_add_overflow():
         nanops._checked_add_with_arr(np.array([n, n]), np.array([n, n]))
     with tm.assertRaisesRegexp(OverflowError, msg):
         nanops._checked_add_with_arr(np.array([m, n]), np.array([n, n]))
+    with tm.assertRaisesRegexp(OverflowError, msg):
+        nanops._checked_add_with_arr(np.array([m, m]), np.array([m, m]),
+                                     arr_nans=np.array([False, True]))
+    with tm.assertRaisesRegexp(OverflowError, msg):
+        nanops._checked_add_with_arr(np.array([m, m]), np.array([m, m]),
+                                     b_nans=np.array([False, True]))
+    with tm.assertRaisesRegexp(OverflowError, msg):
+        nanops._checked_add_with_arr(np.array([m, m]), np.array([m, m]),
+                                     arr_nans=np.array([False, True]),
+                                     b_nans=np.array([False, True]))
     with tm.assertRaisesRegexp(OverflowError, msg):
         with tm.assert_produces_warning(RuntimeWarning):
             nanops._checked_add_with_arr(np.array([m, m]),
                                          np.array([np.nan, m]))
 
+    # Check that the nan boolean arrays override whether or not
+    # the addition overflows. We don't check the result but just
+    # the fact that an OverflowError is not raised.
+    with tm.assertRaises(AssertionError):
+        with tm.assertRaisesRegexp(OverflowError, msg):
+            nanops._checked_add_with_arr(np.array([m, m]), np.array([m, m]),
+                                         arr_nans=np.array([True, True]))
+    with tm.assertRaises(AssertionError):
+        with tm.assertRaisesRegexp(OverflowError, msg):
+            nanops._checked_add_with_arr(np.array([m, m]), np.array([m, m]),
+                                         b_nans=np.array([True, True]))
+    with tm.assertRaises(AssertionError):
+        with tm.assertRaisesRegexp(OverflowError, msg):
+            nanops._checked_add_with_arr(np.array([m, m]), np.array([m, m]),
+                                         arr_nans=np.array([True, False]),
+                                         b_nans=np.array([False, True]))
+
 
 if __name__ == '__main__':
     import nose

diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py
@@ -17,6 +17,7 @@
 from pandas.types.missing import isnull
 from pandas.core import common as com, algorithms
 from pandas.core.common import AbstractMethodError
+from pandas.core.nanops import _checked_add_with_arr
 
 import pandas.formats.printing as printing
 import pandas.tslib as tslib
@@ -684,7 +685,8 @@ def _add_delta_td(self, other):
         # return the i8 result view
 
         inc = tslib._delta_to_nanoseconds(other)
-        new_values = (self.asi8 + inc).view('i8')
+        new_values = _checked_add_with_arr(self.asi8, inc,
+                                           arr_nans=self._isnan).view('i8')
         if self.hasnans:
             new_values[self._isnan] = tslib.iNaT
         return new_values.view('i8')
@@ -699,7 +701,9 @@ def _add_delta_tdi(self, other):
 
         self_i8 = self.asi8
         other_i8 = other.asi8
-        new_values = self_i8 + other_i8
+        new_values = _checked_add_with_arr(self_i8, other_i8,
+                                           arr_nans=self._isnan,
+                                           b_nans=other._isnan)
         if self.hasnans or other.hasnans:
             mask = (self._isnan) | (other._isnan)
             new_values[mask] = tslib.iNaT

diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py
@@ -1964,6 +1964,20 @@ def test_add_overflow(self):
         with tm.assertRaisesRegexp(OverflowError, msg):
             Timestamp('2000') + to_timedelta([106580], 'D')
 
+        # These should not overflow!
+        exp = TimedeltaIndex([pd.NaT])
+        result = to_timedelta([pd.NaT]) - Timedelta('1 days')
+        tm.assert_index_equal(result, exp)
+
+        exp = TimedeltaIndex(['4 days', pd.NaT])
+        result = to_timedelta(['5 days', pd.NaT]) - Timedelta('1 days')
+        tm.assert_index_equal(result, exp)
+
+        exp = TimedeltaIndex([pd.NaT, pd.NaT, '5 hours'])
+        result = (to_timedelta([pd.NaT, '5 days', '1 hours']) +
+                  to_timedelta(['7 seconds', pd.NaT, '4 hours']))
+        tm.assert_index_equal(result, exp)
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)