Skip to content

Commit

Permalink
BUG: Prevent addition overflow with TimedeltaIndex
Browse files Browse the repository at this point in the history
Expands checked-add array addition introduced in
pandas-devgh-14237 to include all other addition cases (i.e.
TimedeltaIndex and Timedelta). Follow-up to pandas-devgh-14453.

In addition, move checked add function to core/algorithms.
  • Loading branch information
gfyoung committed Dec 17, 2016
1 parent dd8cba2 commit a086db6
Show file tree
Hide file tree
Showing 9 changed files with 185 additions and 82 deletions.
15 changes: 14 additions & 1 deletion asv_bench/benchmarks/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,17 @@ def setup(self):
self.float = pd.Float64Index(np.random.randn(N).repeat(5))

# Convenience naming.
self.checked_add = pd.core.nanops._checked_add_with_arr
self.checked_add = pd.core.algorithms.checked_add_with_arr

self.arr = np.arange(1000000)
self.arrpos = np.arange(1000000)
self.arrneg = np.arange(-1000000, 0)
self.arrmixed = np.array([1, -1]).repeat(500000)
self.strings = tm.makeStringIndex(100000)

self.arr_nan = np.random.choice([True, False], size=1000000)
self.arrmixed_nan = np.random.choice([True, False], size=1000000)

# match
self.uniques = tm.makeStringIndex(1000).values
self.all = self.uniques.repeat(10)
Expand Down Expand Up @@ -69,6 +72,16 @@ def time_add_overflow_neg_arr(self):
def time_add_overflow_mixed_arr(self):
self.checked_add(self.arr, self.arrmixed)

def time_add_overflow_first_arg_nan(self):
self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan)

def time_add_overflow_second_arg_nan(self):
self.checked_add(self.arr, self.arrmixed, b_mask=self.arrmixed_nan)

def time_add_overflow_both_arg_nan(self):
self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan,
b_mask=self.arrmixed_nan)


class Hashing(object):
goal_time = 0.2
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ Performance Improvements
Bug Fixes
~~~~~~~~~

- Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`)
- Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. Now raises error now with ``astype()`` for Series and DataFrames (:issue:`14265`)


Expand Down
90 changes: 90 additions & 0 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
_ensure_float64,
_ensure_int64,
is_list_like)
from pandas.compat.numpy import _np_version_under1p10
from pandas.types.missing import isnull

import pandas.core.common as com
Expand Down Expand Up @@ -550,6 +551,95 @@ def rank(values, axis=0, method='average', na_option='keep',

return ranks


def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None):
"""
Perform array addition that checks for underflow and overflow.
Performs the addition of an int64 array and an int64 integer (or array)
but checks that they do not result in overflow first. For elements that
are indicated to be NaN, whether or not there is overflow for that element
is automatically ignored.
Parameters
----------
arr : array addend.
b : array or scalar addend.
arr_mask : boolean array or None
array indicating which elements to exclude from checking
b_mask : boolean array or boolean or None
array or scalar indicating which element(s) to exclude from checking
Returns
-------
sum : An array for elements x + b for each element x in arr if b is
a scalar or an array for elements x + y for each element pair
(x, y) in (arr, b).
Raises
------
OverflowError if any x + y exceeds the maximum or minimum int64 value.
"""
def _broadcast(arr_or_scalar, shape):
"""
Helper function to broadcast arrays / scalars to the desired shape.
"""
if _np_version_under1p10:
if lib.isscalar(arr_or_scalar):
out = np.empty(shape)
out.fill(arr_or_scalar)
else:
out = arr_or_scalar
else:
out = np.broadcast_to(arr_or_scalar, shape)
return out

# For performance reasons, we broadcast 'b' to the new array 'b2'
# so that it has the same size as 'arr'.
b2 = _broadcast(b, arr.shape)
if b_mask is not None:
# We do the same broadcasting for b_mask as well.
b2_mask = _broadcast(b_mask, arr.shape)
else:
b2_mask = None

# For elements that are NaN, regardless of their value, we should
# ignore whether they overflow or not when doing the checked add.
if arr_mask is not None and b2_mask is not None:
not_nan = np.logical_not(arr_mask | b2_mask)
elif arr_mask is not None:
not_nan = np.logical_not(arr_mask)
elif b_mask is not None:
not_nan = np.logical_not(b2_mask)
else:
not_nan = np.empty(arr.shape, dtype=bool)
not_nan.fill(True)

# gh-14324: For each element in 'arr' and its corresponding element
# in 'b2', we check the sign of the element in 'b2'. If it is positive,
# we then check whether its sum with the element in 'arr' exceeds
# np.iinfo(np.int64).max. If so, we have an overflow error. If it
# it is negative, we then check whether its sum with the element in
# 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow
# error as well.
mask1 = b2 > 0
mask2 = b2 < 0

if not mask1.any():
to_raise = ((np.iinfo(np.int64).min - b2 > arr) & not_nan).any()
elif not mask2.any():
to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any()
else:
to_raise = (((np.iinfo(np.int64).max -
b2[mask1] < arr[mask1]) & not_nan[mask1]).any() or
((np.iinfo(np.int64).min -
b2[mask2] > arr[mask2]) & not_nan[mask2]).any())

if to_raise:
raise OverflowError("Overflow in int64 addition")
return arr + b


_rank1d_functions = {
'float64': algos.rank_1d_float64,
'int64': algos.rank_1d_int64,
Expand Down
55 changes: 0 additions & 55 deletions pandas/core/nanops.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@

import pandas.hashtable as _hash
from pandas import compat, lib, algos, tslib
from pandas.compat.numpy import _np_version_under1p10
from pandas.types.common import (_ensure_int64, _ensure_object,
_ensure_float64, _get_dtype,
is_float, is_scalar,
Expand Down Expand Up @@ -810,57 +809,3 @@ def unique1d(values):
table = _hash.PyObjectHashTable(len(values))
uniques = table.unique(_ensure_object(values))
return uniques


def _checked_add_with_arr(arr, b):
"""
Performs the addition of an int64 array and an int64 integer (or array)
but checks that they do not result in overflow first.
Parameters
----------
arr : array addend.
b : array or scalar addend.
Returns
-------
sum : An array for elements x + b for each element x in arr if b is
a scalar or an array for elements x + y for each element pair
(x, y) in (arr, b).
Raises
------
OverflowError if any x + y exceeds the maximum or minimum int64 value.
"""
# For performance reasons, we broadcast 'b' to the new array 'b2'
# so that it has the same size as 'arr'.
if _np_version_under1p10:
if lib.isscalar(b):
b2 = np.empty(arr.shape)
b2.fill(b)
else:
b2 = b
else:
b2 = np.broadcast_to(b, arr.shape)

# gh-14324: For each element in 'arr' and its corresponding element
# in 'b2', we check the sign of the element in 'b2'. If it is positive,
# we then check whether its sum with the element in 'arr' exceeds
# np.iinfo(np.int64).max. If so, we have an overflow error. If it
# it is negative, we then check whether its sum with the element in
# 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow
# error as well.
mask1 = b2 > 0
mask2 = b2 < 0

if not mask1.any():
to_raise = (np.iinfo(np.int64).min - b2 > arr).any()
elif not mask2.any():
to_raise = (np.iinfo(np.int64).max - b2 < arr).any()
else:
to_raise = ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]).any() or
(np.iinfo(np.int64).min - b2[mask2] > arr[mask2]).any())

if to_raise:
raise OverflowError("Overflow in int64 addition")
return arr + b
49 changes: 49 additions & 0 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -1129,6 +1129,55 @@ def test_ensure_platform_int():
assert (result is arr)


def test_int64_add_overflow():
# see gh-14068
msg = "Overflow in int64 addition"
m = np.iinfo(np.int64).max
n = np.iinfo(np.int64).min

with tm.assertRaisesRegexp(OverflowError, msg):
algos.checked_add_with_arr(np.array([m, m]), m)
with tm.assertRaisesRegexp(OverflowError, msg):
algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]))
with tm.assertRaisesRegexp(OverflowError, msg):
algos.checked_add_with_arr(np.array([n, n]), n)
with tm.assertRaisesRegexp(OverflowError, msg):
algos.checked_add_with_arr(np.array([n, n]), np.array([n, n]))
with tm.assertRaisesRegexp(OverflowError, msg):
algos.checked_add_with_arr(np.array([m, n]), np.array([n, n]))
with tm.assertRaisesRegexp(OverflowError, msg):
algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
arr_mask=np.array([False, True]))
with tm.assertRaisesRegexp(OverflowError, msg):
algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
b_mask=np.array([False, True]))
with tm.assertRaisesRegexp(OverflowError, msg):
algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
arr_mask=np.array([False, True]),
b_mask=np.array([False, True]))
with tm.assertRaisesRegexp(OverflowError, msg):
with tm.assert_produces_warning(RuntimeWarning):
algos.checked_add_with_arr(np.array([m, m]),
np.array([np.nan, m]))

# Check that the nan boolean arrays override whether or not
# the addition overflows. We don't check the result but just
# the fact that an OverflowError is not raised.
with tm.assertRaises(AssertionError):
with tm.assertRaisesRegexp(OverflowError, msg):
algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
arr_mask=np.array([True, True]))
with tm.assertRaises(AssertionError):
with tm.assertRaisesRegexp(OverflowError, msg):
algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
b_mask=np.array([True, True]))
with tm.assertRaises(AssertionError):
with tm.assertRaisesRegexp(OverflowError, msg):
algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
arr_mask=np.array([True, False]),
b_mask=np.array([False, True]))


if __name__ == '__main__':
import nose
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
Expand Down
22 changes: 0 additions & 22 deletions pandas/tests/test_nanops.py
Original file line number Diff line number Diff line change
Expand Up @@ -1002,28 +1002,6 @@ def prng(self):
return np.random.RandomState(1234)


def test_int64_add_overflow():
# see gh-14068
msg = "Overflow in int64 addition"
m = np.iinfo(np.int64).max
n = np.iinfo(np.int64).min

with tm.assertRaisesRegexp(OverflowError, msg):
nanops._checked_add_with_arr(np.array([m, m]), m)
with tm.assertRaisesRegexp(OverflowError, msg):
nanops._checked_add_with_arr(np.array([m, m]), np.array([m, m]))
with tm.assertRaisesRegexp(OverflowError, msg):
nanops._checked_add_with_arr(np.array([n, n]), n)
with tm.assertRaisesRegexp(OverflowError, msg):
nanops._checked_add_with_arr(np.array([n, n]), np.array([n, n]))
with tm.assertRaisesRegexp(OverflowError, msg):
nanops._checked_add_with_arr(np.array([m, n]), np.array([n, n]))
with tm.assertRaisesRegexp(OverflowError, msg):
with tm.assert_produces_warning(RuntimeWarning):
nanops._checked_add_with_arr(np.array([m, m]),
np.array([np.nan, m]))


if __name__ == '__main__':
import nose
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s'
Expand Down
8 changes: 6 additions & 2 deletions pandas/tseries/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
ABCPeriodIndex, ABCIndexClass)
from pandas.types.missing import isnull
from pandas.core import common as com, algorithms
from pandas.core.algorithms import checked_add_with_arr
from pandas.core.common import AbstractMethodError

import pandas.formats.printing as printing
Expand Down Expand Up @@ -688,7 +689,8 @@ def _add_delta_td(self, other):
# return the i8 result view

inc = tslib._delta_to_nanoseconds(other)
new_values = (self.asi8 + inc).view('i8')
new_values = checked_add_with_arr(self.asi8, inc,
arr_mask=self._isnan).view('i8')
if self.hasnans:
new_values[self._isnan] = tslib.iNaT
return new_values.view('i8')
Expand All @@ -703,7 +705,9 @@ def _add_delta_tdi(self, other):

self_i8 = self.asi8
other_i8 = other.asi8
new_values = self_i8 + other_i8
new_values = checked_add_with_arr(self_i8, other_i8,
arr_mask=self._isnan,
b_mask=other._isnan)
if self.hasnans or other.hasnans:
mask = (self._isnan) | (other._isnan)
new_values[mask] = tslib.iNaT
Expand Down
4 changes: 2 additions & 2 deletions pandas/tseries/tdi.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@
import pandas.compat as compat
from pandas.compat import u
from pandas.tseries.frequencies import to_offset
from pandas.core.algorithms import checked_add_with_arr
from pandas.core.base import _shared_docs
from pandas.core.nanops import _checked_add_with_arr
from pandas.indexes.base import _index_shared_docs
import pandas.core.common as com
import pandas.types.concat as _concat
Expand Down Expand Up @@ -347,7 +347,7 @@ def _add_datelike(self, other):
else:
other = Timestamp(other)
i8 = self.asi8
result = _checked_add_with_arr(i8, other.value)
result = checked_add_with_arr(i8, other.value)
result = self._maybe_mask_results(result, fill_value=tslib.iNaT)
return DatetimeIndex(result, name=self.name, copy=False)

Expand Down
23 changes: 23 additions & 0 deletions pandas/tseries/tests/test_timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -1958,11 +1958,34 @@ def test_add_overflow(self):
with tm.assertRaisesRegexp(OverflowError, msg):
Timestamp('2000') + to_timedelta(106580, 'D')

_NaT = int(pd.NaT) + 1
msg = "Overflow in int64 addition"
with tm.assertRaisesRegexp(OverflowError, msg):
to_timedelta([106580], 'D') + Timestamp('2000')
with tm.assertRaisesRegexp(OverflowError, msg):
Timestamp('2000') + to_timedelta([106580], 'D')
with tm.assertRaisesRegexp(OverflowError, msg):
to_timedelta([_NaT]) - Timedelta('1 days')
with tm.assertRaisesRegexp(OverflowError, msg):
to_timedelta(['5 days', _NaT]) - Timedelta('1 days')
with tm.assertRaisesRegexp(OverflowError, msg):
(to_timedelta([_NaT, '5 days', '1 hours']) -
to_timedelta(['7 seconds', _NaT, '4 hours']))

# These should not overflow!
exp = TimedeltaIndex([pd.NaT])
result = to_timedelta([pd.NaT]) - Timedelta('1 days')
tm.assert_index_equal(result, exp)

exp = TimedeltaIndex(['4 days', pd.NaT])
result = to_timedelta(['5 days', pd.NaT]) - Timedelta('1 days')
tm.assert_index_equal(result, exp)

exp = TimedeltaIndex([pd.NaT, pd.NaT, '5 hours'])
result = (to_timedelta([pd.NaT, '5 days', '1 hours']) +
to_timedelta(['7 seconds', pd.NaT, '4 hours']))
tm.assert_index_equal(result, exp)


if __name__ == '__main__':
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
Expand Down

0 comments on commit a086db6

Please sign in to comment.