Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
BUG #16012 - fix isin for large object arrays
Browse files Browse the repository at this point in the history
Morgan243 committed Jul 18, 2017

Verified

This commit was signed with the committer’s verified signature.
Ana06 Ana María Martínez Gómez
1 parent 81f8ace commit 186607b
Showing 3 changed files with 14 additions and 1 deletion.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
@@ -204,3 +204,4 @@ Categorical
Other
^^^^^
- Bug in :func:`eval` where the ``inplace`` parameter was being incorrectly handled (:issue:`16732`)
- Bug in :func:`isin` on a large object-dtype Series with a large comparison array, where numpy's ``in1d`` was used even though it does not support object arrays in most conditions (:issue:`16012`)
5 changes: 4 additions & 1 deletion pandas/core/algorithms.py
Original file line number Diff line number Diff line change
@@ -402,7 +402,10 @@ def isin(comps, values):
# work-around for numpy < 1.8 and comparisons on py3
# faster for larger cases to use np.in1d
f = lambda x, y: htable.ismember_object(x, values)
if (_np_version_under1p8 and compat.PY3) or len(comps) > 1000000:
# GH16012
# Ensure np.in1d doesn't get object types or it *may* throw an exception
if ((_np_version_under1p8 and compat.PY3) or len(comps) > 1000000 and
not is_object_dtype(comps)):
f = lambda x, y: np.in1d(x, y)
elif is_integer_dtype(comps):
try:
9 changes: 9 additions & 0 deletions pandas/tests/series/test_analytics.py
Original file line number Diff line number Diff line change
@@ -1092,6 +1092,15 @@ def test_isin(self):
expected = Series([True, False, True, False, False, False, True, True])
assert_series_equal(result, expected)

# GH: 16012
# Reproducing this issue requires a Series with more than 1e6 elements, and
# the comparison array (in_list) must be large enough that numpy doesn't
# use its manual masking trick, which would avoid the issue altogether
s = Series(list('abcdefghijk' * 10 ** 5))
in_list = [-1, 'a', 'b', 'G', 'Y', 'Z', 'E', 'K', 'E', 'S', 'I', 'R', 'R']*6

assert s.isin(in_list).sum() == 200000

def test_isin_with_string_scalar(self):
# GH4763
s = Series(['A', 'B', 'C', 'a', 'B', 'B', 'A', 'C'])

0 comments on commit 186607b

Please sign in to comment.