Skip to content

Commit

Permalink
Merge pull request pandas-dev#5199 from TomAugspurger/isin_dfs
Browse files Browse the repository at this point in the history
ENH/API: Accept DataFrame for isin
  • Loading branch information
hayd committed Oct 17, 2013
2 parents 97f5878 + 38b8fca commit b538892
Show file tree
Hide file tree
Showing 3 changed files with 132 additions and 31 deletions.
5 changes: 3 additions & 2 deletions doc/source/v0.13.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -476,15 +476,16 @@ Enhancements
t = Timestamp('20130101 09:01:02')
t + pd.datetools.Nano(123)

- A new method, ``isin`` for DataFrames, plays nicely with boolean indexing. See :ref:`the docs<indexing.basics.indexing_isin>` for more.
- A new method, ``isin`` for DataFrames, which plays nicely with boolean indexing. The argument to ``isin``, what we're comparing the DataFrame to, can be a DataFrame, Series, dict, or array of values. See :ref:`the docs<indexing.basics.indexing_isin>` for more.

To get the rows where any of the conditions are met:

.. ipython:: python

dfi = DataFrame({'A': [1, 2, 3, 4], 'B': ['a', 'b', 'f', 'n']})
dfi
mask = dfi.isin({'A': [1, 2], 'B': ['e', 'f']})
other = DataFrame({'A': [1, 3, 3, 7], 'B': ['e', 'f', 'f', 'e']})
mask = dfi.isin(other)
mask
dfi[mask.any(1)]

Expand Down
66 changes: 51 additions & 15 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4239,35 +4239,71 @@ def to_period(self, freq=None, axis=0, copy=True):

return self._constructor(new_data)


def isin(self, values, iloc=False):
def isin(self, values):
"""
Return boolean DataFrame showing whether each element in the DataFrame is
contained in values.
Return boolean DataFrame showing whether each element in the
DataFrame is contained in values.
Parameters
----------
values : iterable or dictionary of columns to values
iloc : boolean, if passing a dict as values, describe columns using integer
locations (default is to use labels)
values : iterable, Series, DataFrame or dictionary
The result will only be true at a location if all the
labels match. If `values` is a Series, that's the index. If
`values` is a dictionary, the keys must be the column names,
which must match. If `values` is a DataFrame,
then both the index and column labels must match.
Returns
-------
DataFrame of booleans
Examples
--------
When ``values`` is a list:
>>> df = DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
>>> df.isin([1, 3, 12, 'a'])
A B
0 True True
1 False False
2 True False
When ``values`` is a dict:
>>> df = DataFrame({'A': [1, 2, 3], 'B': [1, 4, 7]})
>>> df.isin({'A': [1, 3], 'B': [4, 7, 12]})
A B
0 True False # Note that B didn't match the 1 here.
1 False True
2 True True
When ``values`` is a Series or DataFrame:
>>> df = DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
>>> other = DataFrame({'A': [1, 3, 3, 2], 'B': ['e', 'f', 'f', 'e']})
>>> df.isin(other)
A B
0 True False
1 False False # Column A in `other` has a 3, but not at index 1.
2 True True
"""
if isinstance(values, dict):
from collections import defaultdict
from pandas.tools.merge import concat
values = defaultdict(list, values)
if iloc:
return concat((self.iloc[:, [i]].isin(values[i])
for i, col in enumerate(self.columns)), axis=1)
else:
return concat((self.iloc[:, [i]].isin(values[col])
for i, col in enumerate(self.columns)), axis=1)


return concat((self.iloc[:, [i]].isin(values[col])
for i, col in enumerate(self.columns)), axis=1)
elif isinstance(values, Series):
if not values.index.is_unique:
raise ValueError("ValueError: cannot compute isin with"
" a duplicate axis.")
return self.eq(values.reindex_like(self), axis='index')
elif isinstance(values, DataFrame):
if not (values.columns.is_unique and values.index.is_unique):
raise ValueError("ValueError: cannot compute isin with"
" a duplicate axis.")
return self.eq(values.reindex_like(self))
else:
if not is_list_like(values):
raise TypeError("only list-like or dict-like objects are"
Expand Down
92 changes: 78 additions & 14 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -11431,20 +11431,6 @@ def test_isin_dict(self):
result = df.isin(d)
assert_frame_equal(result, expected)

# iloc
df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']})
d = {0: ['a']}
expected = DataFrame(False, df.index, df.columns)

# without using iloc
result = df.isin(d)
assert_frame_equal(result, expected)

# using iloc
result = df.isin(d, iloc=True)
expected.iloc[0, 0] = True
assert_frame_equal(result, expected)

def test_isin_with_string_scalar(self):
#GH4763
df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'],
Expand All @@ -11456,6 +11442,84 @@ def test_isin_with_string_scalar(self):
with tm.assertRaises(TypeError):
df.isin('aaa')

def test_isin_df(self):
df1 = DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]})
df2 = DataFrame({'A': [0, 2, 12, 4], 'B': [2, np.nan, 4, 5]})
expected = DataFrame(False, df1.index, df1.columns)
result = df1.isin(df2)
expected['A'].loc[[1, 3]] = True
expected['B'].loc[[0, 2]] = True
assert_frame_equal(result, expected)

# partial overlapping columns
df2.columns = ['A', 'C']
result = df1.isin(df2)
expected['B'] = False
assert_frame_equal(result, expected)

def test_isin_df_dupe_values(self):
df1 = DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]})
# just cols duped
df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]],
columns=['B', 'B'])
with tm.assertRaises(ValueError):
df1.isin(df2)

# just index duped
df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]],
columns=['A', 'B'], index=[0, 0, 1, 1])
with tm.assertRaises(ValueError):
df1.isin(df2)

# cols and index:
df2.columns = ['B', 'B']
with tm.assertRaises(ValueError):
df1.isin(df2)

def test_isin_dupe_self(self):
other = DataFrame({'A': [1, 0, 1, 0], 'B': [1, 1, 0, 0]})
df = DataFrame([[1, 1], [1, 0], [0, 0]], columns=['A','A'])
result = df.isin(other)
expected = DataFrame(False, index=df.index, columns=df.columns)
expected.loc[0] = True
expected.iloc[1, 1] = True
assert_frame_equal(result, expected)


def test_isin_against_series(self):
df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]},
index=['a', 'b', 'c', 'd'])
s = pd.Series([1, 3, 11, 4], index=['a', 'b', 'c', 'd'])
expected = DataFrame(False, index=df.index, columns=df.columns)
expected['A'].loc['a'] = True
expected.loc['d'] = True
result = df.isin(s)
assert_frame_equal(result, expected)

def test_isin_multiIndex(self):
idx = MultiIndex.from_tuples([(0, 'a', 'foo'), (0, 'a', 'bar'),
(0, 'b', 'bar'), (0, 'b', 'baz'),
(2, 'a', 'foo'), (2, 'a', 'bar'),
(2, 'c', 'bar'), (2, 'c', 'baz'),
(1, 'b', 'foo'), (1, 'b', 'bar'),
(1, 'c', 'bar'), (1, 'c', 'baz')])
df1 = DataFrame({'A': np.ones(12),
'B': np.zeros(12)}, index=idx)
df2 = DataFrame({'A': [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
'B': [1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1]})
# against regular index
expected = DataFrame(False, index=df1.index, columns=df1.columns)
result = df1.isin(df2)
assert_frame_equal(result, expected)

df2.index = idx
expected = df2.values.astype(np.bool)
expected[:, 1] = ~expected[:, 1]
expected = DataFrame(expected, columns=['A', 'B'], index=idx)

result = df1.isin(df2)
assert_frame_equal(result, expected)

def test_to_csv_date_format(self):
from pandas import to_datetime
pname = '__tmp_to_csv_date_format__'
Expand Down

0 comments on commit b538892

Please sign in to comment.