Bug in pd.merge() when merge/join with multiple categorical columns (pandas-dev#16786)

closes pandas-dev#16767 (cherry picked from commit 5e776fb)
TomAugspurger · Jul 6, 2017 · 022e0b7 · 022e0b7
1 parent fec510c
commit 022e0b7
Show file tree

Hide file tree

Showing 3 changed files with 31 additions and 6 deletions.
diff --git a/doc/source/whatsnew/v0.20.3.txt b/doc/source/whatsnew/v0.20.3.txt
@@ -55,8 +55,8 @@ Indexing
 I/O
 ^^^
 
-- Bug in :func:`read_csv`` in which files weren't opened as binary files by the C engine on Windows, causing EOF characters mid-field, which would fail (:issue:`16039`, :issue:`16559`, :issue:`16675`)
-- Bug in :func:`read_hdf`` in which reading a ``Series`` saved to an HDF file in 'fixed' format fails when an explicit ``mode='r'`` argument is supplied (:issue:`16583`)
+- Bug in :func:`read_csv` in which files weren't opened as binary files by the C engine on Windows, causing EOF characters mid-field, which would fail (:issue:`16039`, :issue:`16559`, :issue:`16675`)
+- Bug in :func:`read_hdf` in which reading a ``Series`` saved to an HDF file in 'fixed' format fails when an explicit ``mode='r'`` argument is supplied (:issue:`16583`)
 
 Plotting
 ^^^^^^^^
@@ -79,6 +79,7 @@ Reshaping
 ^^^^^^^^^
 
 - Bug in joining on a ``MultiIndex`` with a ``category`` dtype for a level (:issue:`16627`).
+- Bug in :func:`merge` when merging/joining with multiple categorical columns (:issue:`16767`)
 
 
 Numeric

diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -1384,13 +1384,14 @@ def _factorize_keys(lk, rk, sort=True):
         lk = lk.values
         rk = rk.values
 
-    # if we exactly match in categories, allow us to use codes
+    # if we exactly match in categories, allow us to factorize on codes
     if (is_categorical_dtype(lk) and
             is_categorical_dtype(rk) and
             lk.is_dtype_equal(rk)):
-        return lk.codes, rk.codes, len(lk.categories)
-
-    if is_int_or_datetime_dtype(lk) and is_int_or_datetime_dtype(rk):
+        klass = libhashtable.Int64Factorizer
+        lk = _ensure_int64(lk.codes)
+        rk = _ensure_int64(rk.codes)
+    elif is_int_or_datetime_dtype(lk) and is_int_or_datetime_dtype(rk):
         klass = libhashtable.Int64Factorizer
         lk = _ensure_int64(com._values_from_object(lk))
         rk = _ensure_int64(com._values_from_object(rk))

diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py
@@ -1356,6 +1356,29 @@ def test_dtype_on_merged_different(self, change, how, left, right):
                           index=['X', 'Y', 'Z'])
         assert_series_equal(result, expected)
 
+    def test_self_join_multiple_categories(self):
+        # GH 16767: merging on several categorical key columns at once
+        # every row is a distinct key combination, so each must match once
+        m = 5
+        df = pd.DataFrame({
+            'a': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'] * m,
+            'b': ['t', 'w', 'x', 'y', 'z'] * 2 * m,
+            'c': [letter
+                  for each in ['m', 'n', 'u', 'p', 'o']
+                  for letter in [each] * 2 * m],
+            'd': [letter
+                  for each in ['aa', 'bb', 'cc', 'dd', 'ee',
+                               'ff', 'gg', 'hh', 'ii', 'jj']
+                  for letter in [each] * m]})
+
+        # convert every column to the category dtype
+        df = df.apply(lambda x: x.astype('category'))
+
+        # merging the frame with itself on every column must reproduce it
+        result = pd.merge(df, df, on=list(df.columns))
+
+        assert_frame_equal(result, df)
+
 
 @pytest.fixture
 def left_df():