Merge pull request pandas-dev#5199 from TomAugspurger/isin_dfs

ENH/API: Accept DataFrame for isin
jennolsen84 · Oct 17, 2013 · b538892 · b538892
2 parents 97f5878 + 38b8fca
commit b538892
Show file tree

Hide file tree

Showing 3 changed files with 132 additions and 31 deletions.
diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt
@@ -476,15 +476,16 @@ Enhancements
      t = Timestamp('20130101 09:01:02')
      t + pd.datetools.Nano(123)
 
-- A new method, ``isin`` for DataFrames, plays nicely with boolean indexing. See :ref:`the docs<indexing.basics.indexing_isin>` for more.
+- DataFrames have a new method, ``isin``, which plays nicely with boolean indexing. The argument to ``isin`` (the object the DataFrame is compared against) can be a DataFrame, Series, dict, or array of values. See :ref:`the docs<indexing.basics.indexing_isin>` for more.
 
   To get the rows where any of the conditions are met:
 
   .. ipython:: python
 
      dfi = DataFrame({'A': [1, 2, 3, 4], 'B': ['a', 'b', 'f', 'n']})
      dfi
-     mask = dfi.isin({'A': [1, 2], 'B': ['e', 'f']})
+     other = DataFrame({'A': [1, 3, 3, 7], 'B': ['e', 'f', 'f', 'e']})
+     mask = dfi.isin(other)
      mask
      dfi[mask.any(1)]
 

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -4239,35 +4239,71 @@ def to_period(self, freq=None, axis=0, copy=True):
 
         return self._constructor(new_data)
 
-
-    def isin(self, values, iloc=False):
+    def isin(self, values):
         """
-        Return boolean DataFrame showing whether each element in the DataFrame is
-        contained in values.
+        Return boolean DataFrame showing whether each element in the
+        DataFrame is contained in values.
 
         Parameters
         ----------
-        values : iterable or dictionary of columns to values
-        iloc : boolean, if passing a dict as values, describe columns using integer
-                        locations (default is to use labels)
+        values : iterable, Series, DataFrame or dictionary
+            The result will only be true at a location if all the
+            labels match. If `values` is a Series, that's the index. If
+            `values` is a dictionary, the keys must be the column names,
+            which must match. If `values` is a DataFrame,
+            then both the index and column labels must match.
 
         Returns
         -------
 
         DataFrame of booleans
+
+        Examples
+        --------
+        When ``values`` is a list:
+
+        >>> df = DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
+        >>> df.isin([1, 3, 12, 'a'])
+               A      B
+        0   True   True
+        1  False  False
+        2   True  False
+
+        When ``values`` is a dict:
+
+        >>> df = DataFrame({'A': [1, 2, 3], 'B': [1, 4, 7]})
+        >>> df.isin({'A': [1, 3], 'B': [4, 7, 12]})
+               A      B
+        0   True  False  # Note that B didn't match the 1 here.
+        1  False   True
+        2   True   True
+
+        When ``values`` is a Series or DataFrame:
+
+        >>> df = DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
+        >>> other = DataFrame({'A': [1, 3, 3, 2], 'B': ['e', 'f', 'f', 'e']})
+        >>> df.isin(other)
+               A      B
+        0   True  False
+        1  False  False  # Column A in `other` has a 3, but not at index 1.
+        2   True   True
         """
         if isinstance(values, dict):
             from collections import defaultdict
             from pandas.tools.merge import concat
             values = defaultdict(list, values)
-            if iloc:
-                return concat((self.iloc[:, [i]].isin(values[i])
-                                 for i, col in enumerate(self.columns)), axis=1)
-            else:
-                return concat((self.iloc[:, [i]].isin(values[col])
-                                 for i, col in enumerate(self.columns)), axis=1)
-
-
+            return concat((self.iloc[:, [i]].isin(values[col])
+                           for i, col in enumerate(self.columns)), axis=1)
+        elif isinstance(values, Series):
+            if not values.index.is_unique:
+                raise ValueError("ValueError: cannot compute isin with"
+                                 " a duplicate axis.")
+            return self.eq(values.reindex_like(self), axis='index')
+        elif isinstance(values, DataFrame):
+            if not (values.columns.is_unique and values.index.is_unique):
+                raise ValueError("ValueError: cannot compute isin with"
+                                 " a duplicate axis.")
+            return self.eq(values.reindex_like(self))
         else:
             if not is_list_like(values):
                 raise TypeError("only list-like or dict-like objects are"

diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -11431,20 +11431,6 @@ def test_isin_dict(self):
         result = df.isin(d)
         assert_frame_equal(result, expected)
 
-        # iloc
-        df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']})
-        d = {0: ['a']}
-        expected = DataFrame(False, df.index, df.columns)
-
-        # without using iloc
-        result = df.isin(d)
-        assert_frame_equal(result, expected)
-
-        # using iloc
-        result = df.isin(d, iloc=True)
-        expected.iloc[0, 0] = True
-        assert_frame_equal(result, expected)
-
     def test_isin_with_string_scalar(self):
         #GH4763
         df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'],
@@ -11456,6 +11442,84 @@ def test_isin_with_string_scalar(self):
         with tm.assertRaises(TypeError):
             df.isin('aaa')
 
+    def test_isin_df(self):
+        # isin against a DataFrame is label-aligned: a cell is True only
+        # when the cell with the same index and column label in the other
+        # frame holds an equal value. NaN never matches (see row 1 of 'B').
+        df1 = DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]})
+        df2 = DataFrame({'A': [0, 2, 12, 4], 'B': [2, np.nan, 4, 5]})
+        expected = DataFrame(False, df1.index, df1.columns)
+        result = df1.isin(df2)
+        # Use one .loc[rows, col] assignment rather than the chained
+        # ``expected['A'].loc[...] = True``, which can write into a
+        # temporary copy and leave ``expected`` unchanged.
+        expected.loc[[1, 3], 'A'] = True
+        expected.loc[[0, 2], 'B'] = True
+        assert_frame_equal(result, expected)
+
+        # partial overlapping columns: 'B' no longer exists in df2, so
+        # nothing in df1['B'] can match.
+        df2.columns = ['A', 'C']
+        result = df1.isin(df2)
+        expected['B'] = False
+        assert_frame_equal(result, expected)
+
+    def test_isin_df_dupe_values(self):
+        # isin must refuse a DataFrame argument whose column labels or
+        # index labels are not unique.
+        rows = [[0, 2], [12, 4], [2, np.nan], [4, 5]]
+        df1 = DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]})
+
+        # duplicated column labels only
+        dupe_cols = DataFrame(rows, columns=['B', 'B'])
+        with tm.assertRaises(ValueError):
+            df1.isin(dupe_cols)
+
+        # duplicated index labels only
+        dupe_index = DataFrame(rows, columns=['A', 'B'],
+                               index=[0, 0, 1, 1])
+        with tm.assertRaises(ValueError):
+            df1.isin(dupe_index)
+
+        # both axes duplicated
+        dupe_both = DataFrame(rows, columns=['B', 'B'],
+                              index=[0, 0, 1, 1])
+        with tm.assertRaises(ValueError):
+            df1.isin(dupe_both)
+
+    def test_isin_dupe_self(self):
+        # Duplicate column labels on the calling frame are allowed: both
+        # 'A' columns are compared against the single 'A' column of the
+        # argument, row label by row label.
+        frame = DataFrame([[1, 1], [1, 0], [0, 0]], columns=['A','A'])
+        values = DataFrame({'A': [1, 0, 1, 0], 'B': [1, 1, 0, 0]})
+
+        expected = DataFrame(False, index=frame.index,
+                             columns=frame.columns)
+        expected.loc[0] = True
+        expected.iloc[1, 1] = True
+
+        result = frame.isin(values)
+        assert_frame_equal(result, expected)
+
+
+    def test_isin_against_series(self):
+        # A Series argument is aligned on the index: every cell in a row
+        # is compared with the Series value carrying that row's label.
+        df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]},
+                          index=['a', 'b', 'c', 'd'])
+        s = pd.Series([1, 3, 11, 4], index=['a', 'b', 'c', 'd'])
+        expected = DataFrame(False, index=df.index, columns=df.columns)
+        # Use one .loc[row, col] assignment rather than the chained
+        # ``expected['A'].loc['a'] = True``, which can write into a
+        # temporary copy and leave ``expected`` unchanged.
+        expected.loc['a', 'A'] = True
+        # Row 'd': both columns hold 4, equal to s['d'].
+        expected.loc['d'] = True
+        result = df.isin(s)
+        assert_frame_equal(result, expected)
+
+    def test_isin_multiIndex(self):
+        # Label alignment must also hold when the calling frame is indexed
+        # by a MultiIndex.
+        idx = MultiIndex.from_tuples([(0, 'a', 'foo'), (0, 'a', 'bar'),
+                                      (0, 'b', 'bar'), (0, 'b', 'baz'),
+                                      (2, 'a', 'foo'), (2, 'a', 'bar'),
+                                      (2, 'c', 'bar'), (2, 'c', 'baz'),
+                                      (1, 'b', 'foo'), (1, 'b', 'bar'),
+                                      (1, 'c', 'bar'), (1, 'c', 'baz')])
+        df1 = DataFrame({'A': np.ones(12),
+                         'B': np.zeros(12)}, index=idx)
+        df2 = DataFrame({'A': [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
+                         'B': [1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1]})
+        # against regular index: df2 still has its default integer index,
+        # so no row label matches and every cell is False.
+        expected = DataFrame(False, index=df1.index, columns=df1.columns)
+        result = df1.isin(df2)
+        assert_frame_equal(result, expected)
+
+        # With matching labels, 'A' (all ones in df1) matches where df2 is
+        # 1, and 'B' (all zeros in df1) matches where df2 is 0, hence the
+        # inversion of the second column.
+        df2.index = idx
+        # Builtin ``bool`` instead of the deprecated ``np.bool`` alias
+        # (removed in NumPy 1.24).
+        expected = df2.values.astype(bool)
+        expected[:, 1] = ~expected[:, 1]
+        expected = DataFrame(expected, columns=['A', 'B'], index=idx)
+
+        result = df1.isin(df2)
+        assert_frame_equal(result, expected)
+
     def test_to_csv_date_format(self):
         from pandas import to_datetime
         pname = '__tmp_to_csv_date_format__'