pandas-dev · jreback · Feb 8, 2018 · Feb 6, 2018 · Feb 7, 2018 · Feb 7, 2018
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -528,6 +528,32 @@ Documentation Changes
 Bug Fixes
 ~~~~~~~~~
 
+Categorical
+^^^^^^^^^^^
+
+.. warning::
+
+   A class of bugs was introduced in pandas 0.21 with ``CategoricalDtype`` that
+   affects the correctness of operations like ``merge``, ``concat``, and
+   indexing when comparing multiple unordered ``Categorical`` arrays that have
+   the same categories, but in a different order. We highly recommend upgrading
+   or manually aligning your categories before doing these operations.
+
+- Bug in ``Categorical.equals`` returning the wrong result when comparing two
+  unordered ``Categorical`` arrays with the same categories, but in a different
+  order (:issue:`16603`)
+- Bug in :func:`pandas.api.types.union_categoricals` returning the wrong result
+  for unordered categoricals with the same categories in a different order.
+  This affected :func:`pandas.concat` with Categorical data (:issue:`19096`).
+- Bug in :func:`pandas.merge` returning the wrong result when joining on an
+  unordered ``Categorical`` that had the same categories but in a different
+  order (:issue:`19551`)
+- Bug in :meth:`CategoricalIndex.get_indexer` returning the wrong result when
+  ``target`` was an unordered ``Categorical`` that had the same categories as
+  ``self`` but in a different order (:issue:`19551`)
+- Bug in :meth:`Index.astype` with a categorical dtype where the resultant index is not converted to a :class:`CategoricalIndex` for all types of index (:issue:`18630`)
+- Bug in :meth:`Series.astype` and ``Categorical.astype()`` where an existing categorical data does not get updated (:issue:`10696`, :issue:`18593`)
+- Bug in :class:`Index` constructor with ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (:issue:`19032`)
 
 Datetimelike
 ^^^^^^^^^^^^
@@ -671,20 +697,6 @@ Reshaping
 - Improved error message for :func:`DataFrame.merge` when there is no common merge key (:issue:`19427`)
 -
 
-
-Categorical
-^^^^^^^^^^^
-
--
-- Bug in :func:`pandas.api.types.union_categoricals` returning the wrong result
-  when all the categoricals had the same categories, but in a different order.
-  This affected :func:`pandas.concat` with Categorical data (:issue:`19096`).
-- Bug in ``Categorical.equals`` between two unordered categories with the same categories, but in a different order (:issue:`16603`)
-- Bug in :meth:`Index.astype` with a categorical dtype where the resultant index is not converted to a :class:`CategoricalIndex` for all types of index (:issue:`18630`)
-- Bug in :meth:`Series.astype` and ``Categorical.astype()`` where an existing categorical data does not get updated (:issue:`10696`, :issue:`18593`)
-- Bug in :class:`Index` constructor with ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (issue:`19032`)
--
-
 Other
 ^^^^^
 

diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
@@ -553,6 +553,8 @@ def _reindex_non_unique(self, target):
 
     @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs)
     def get_indexer(self, target, method=None, limit=None, tolerance=None):
+        from pandas.core.arrays.categorical import _recode_for_categories
+
         method = missing.clean_reindex_fill_method(method)
         target = ibase._ensure_index(target)
 
@@ -568,8 +570,13 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
 
         if (isinstance(target, CategoricalIndex) and
                 self.values.is_dtype_equal(target)):
-            # we have the same codes
-            codes = target.codes
+            if self.values.equals(target.values):
+                # we have the same codes
+                codes = target.codes
+            else:
+                codes = _recode_for_categories(target.codes,
+                                               target.categories,
+                                               self.values.categories)
         else:
             if isinstance(target, CategoricalIndex):
                 code_indexer = self.categories.get_indexer(target.categories)

diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -12,6 +12,7 @@
 
 from pandas import (Categorical, DataFrame,
                     Index, MultiIndex, Timedelta)
+from pandas.core.arrays.categorical import _recode_for_categories
 from pandas.core.frame import _merge_doc
 from pandas.core.dtypes.common import (
     is_datetime64tz_dtype,
@@ -1540,8 +1541,15 @@ def _factorize_keys(lk, rk, sort=True):
             is_categorical_dtype(rk) and
             lk.is_dtype_equal(rk)):
         klass = libhashtable.Int64Factorizer
+
+        if lk.categories.equals(rk.categories):
+            rk = rk.codes
+        else:
+            # Same categories in different orders -> recode
+            rk = _recode_for_categories(rk.codes, rk.categories, lk.categories)
+
         lk = _ensure_int64(lk.codes)
-        rk = _ensure_int64(rk.codes)
+        rk = _ensure_int64(rk)
     elif is_int_or_datetime_dtype(lk) and is_int_or_datetime_dtype(rk):
         klass = libhashtable.Int64Factorizer
         lk = _ensure_int64(com._values_from_object(lk))

diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py
@@ -432,6 +432,23 @@ def test_get_indexer_array(self):
         expected = np.array([0, 1], dtype='intp')
         tm.assert_numpy_array_equal(result, expected)
 
+    def test_get_indexer_same_categories_same_order(self):
+        ci = CategoricalIndex(['a', 'b'], categories=['a', 'b'])
+
+        result = ci.get_indexer(CategoricalIndex(['b', 'b'],
+                                                 categories=['a', 'b']))
+        expected = np.array([1, 1], dtype='intp')
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_get_indexer_same_categories_different_order(self):
+        # https://github.com/pandas-dev/pandas/issues/19551
+        ci = CategoricalIndex(['a', 'b'], categories=['a', 'b'])
+
+        result = ci.get_indexer(CategoricalIndex(['b', 'b'],
+                                                 categories=['b', 'a']))
+        expected = np.array([1, 1], dtype='intp')
+        tm.assert_numpy_array_equal(result, expected)
+
     def test_getitem_with_listlike(self):
         # GH 16115
         cats = Categorical([Timestamp('12-31-1999'),

diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
@@ -1643,6 +1643,25 @@ def test_merge_categorical(self):
         result = pd.merge(cleft, cright, how='left', left_on='b', right_on='c')
         tm.assert_frame_equal(result, expected)
 
+    def tests_merge_categorical_unordered_equal(self):
+        # GH-19551
+        df1 = DataFrame({
+            'Foo': Categorical(['A', 'B', 'C'], categories=['A', 'B', 'C']),
+            'Left': ['A0', 'B0', 'C0'],
+        })
+
+        df2 = DataFrame({
+            'Foo': Categorical(['C', 'B', 'A'], categories=['C', 'B', 'A']),
+            'Right': ['C1', 'B1', 'A1'],
+        })
+        result = pd.merge(df1, df2, on=['Foo'])
+        expected = DataFrame({
+            'Foo': pd.Categorical(['A', 'B', 'C']),
+            'Left': ['A0', 'B0', 'C0'],
+            'Right': ['A1', 'B1', 'C1'],
+        })
+        assert_frame_equal(result, expected)
+
     def test_other_columns(self, left, right):
         # non-merge columns should preserve if possible
         right = right.assign(Z=right.Z.astype('category'))