From c9e8564c73b08f1cf284a90aabe17b1852ec489d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 6 Feb 2018 11:33:56 -0600 Subject: [PATCH 1/4] BUG: Fixed merge on dtype equal categories --- doc/source/whatsnew/v0.23.0.txt | 3 +++ pandas/core/reshape/merge.py | 10 +++++++++- pandas/tests/reshape/merge/test_merge.py | 19 +++++++++++++++++++ 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index b5bf7ccbda0b6..032e3dfa842c9 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -679,6 +679,9 @@ Categorical - Bug in :func:`pandas.api.types.union_categoricals` returning the wrong result when all the categoricals had the same categories, but in a different order. This affected :func:`pandas.concat` with Categorical data (:issue:`19096`). +- Bug in :func:`pandas.merge` when joining on a ``Categorical`` column where + the categories were unordered and had the same values but in a different + order (:issue:`19551`) - Bug in ``Categorical.equals`` between two unordered categories with the same categories, but in a different order (:issue:`16603`) - Bug in :meth:`Index.astype` with a categorical dtype where the resultant index is not converted to a :class:`CategoricalIndex` for all types of index (:issue:`18630`) - Bug in :meth:`Series.astype` and ``Categorical.astype()`` where an existing categorical data does not get updated (:issue:`10696`, :issue:`18593`) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 9dbb327e3d956..4b99b0407cfcc 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -12,6 +12,7 @@ from pandas import (Categorical, DataFrame, Index, MultiIndex, Timedelta) +from pandas.core.arrays.categorical import _recode_for_categories from pandas.core.frame import _merge_doc from pandas.core.dtypes.common import ( is_datetime64tz_dtype, @@ -1540,8 +1541,15 @@ def _factorize_keys(lk, rk, sort=True): is_categorical_dtype(rk) and lk.is_dtype_equal(rk)): klass = libhashtable.Int64Factorizer + + if lk.categories.equals(rk.categories): + rk = rk.codes + else: + # Same categories in different orders -> recode + rk = _recode_for_categories(rk.codes, rk.categories, lk.categories) + lk = _ensure_int64(lk.codes) - rk = _ensure_int64(rk.codes) + rk = _ensure_int64(rk) elif is_int_or_datetime_dtype(lk) and is_int_or_datetime_dtype(rk): klass = libhashtable.Int64Factorizer lk = _ensure_int64(com._values_from_object(lk)) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 32f83ab972be5..101d34ebdb89f 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1643,6 +1643,25 @@ def test_merge_categorical(self): result = pd.merge(cleft, cright, how='left', left_on='b', right_on='c') tm.assert_frame_equal(result, expected) + def tests_merge_categorical_unordered_equal(self): + # GH-19551 + df1 = DataFrame({ + 'Foo': Categorical(['A', 'B', 'C'], categories=['A', 'B', 'C']), + 'Left': ['A0', 'B0', 'C0'], + }) + + df2 = DataFrame({ + 'Foo': Categorical(['C', 'B', 'A'], categories=['C', 'B', 'A']), + 'Right': ['C1', 'B1', 'A1'], + }) + result = pd.merge(df1, df2, on=['Foo']) + expected = DataFrame({ + 'Foo': pd.Categorical(['A', 'B', 'C']), + 'Left': ['A0', 'B0', 'C0'], + 'Right': ['A1', 'B1', 'C1'], + }) + assert_frame_equal(result, expected) + def test_other_columns(self, left, right): # non-merge columns should preserve if possible right = right.assign(Z=right.Z.astype('category')) From 5a665b0f8c44c0374890500781f46812df7815c1 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Feb 2018 09:47:22 -0600 Subject: [PATCH 2/4] One more case --- pandas/core/indexes/category.py | 11 +++++++++-- pandas/tests/indexing/test_categorical.py | 17 +++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 2c7be2b21f959..b36bc1df23247 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -553,6 +553,8 @@ def _reindex_non_unique(self, target): @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): + from pandas.core.arrays.categorical import _recode_for_categories + method = missing.clean_reindex_fill_method(method) target = ibase._ensure_index(target) @@ -568,8 +570,13 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): if (isinstance(target, CategoricalIndex) and self.values.is_dtype_equal(target)): - # we have the same codes - codes = target.codes + if self.values.equals(target.values): + # we have the same codes + codes = target.codes + else: + codes = _recode_for_categories(target.codes, + target.categories, + self.values.categories) else: if isinstance(target, CategoricalIndex): code_indexer = self.categories.get_indexer(target.categories) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index f2182687d047f..634ad0d8160ed 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -432,6 +432,23 @@ def test_get_indexer_array(self): expected = np.array([0, 1], dtype='intp') tm.assert_numpy_array_equal(result, expected) + def test_get_indexer_same_categories_same_order(self): + ci = CategoricalIndex(['a', 'b'], categories=['a', 'b']) + + result = ci.get_indexer(CategoricalIndex(['b', 'b'], + categories=['a', 'b'])) + expected = np.array([1, 1], dtype='intp') + tm.assert_numpy_array_equal(result, expected) + + def test_get_indexer_same_categories_different_order(self): + # https://github.com/pandas-dev/pandas/issues/19551 + ci = CategoricalIndex(['a', 'b'], categories=['a', 'b']) + + result = ci.get_indexer(CategoricalIndex(['b', 'b'], + categories=['b', 'a'])) + expected = np.array([1, 1], dtype='intp') + tm.assert_numpy_array_equal(result, expected) + def test_getitem_with_listlike(self): # GH 16115 cats = Categorical([Timestamp('12-31-1999'), From ffc2f8cdba2588791692019a804d377658452965 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Feb 2018 09:59:58 -0600 Subject: [PATCH 3/4] Updated docs --- doc/source/whatsnew/v0.23.0.txt | 43 ++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 032e3dfa842c9..89624357e93c5 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -528,6 +528,32 @@ Documentation Changes Bug Fixes ~~~~~~~~~ +Categorical +^^^^^^^^^^^ + +.. warning:: + + A class of bugs were introduced in 0.21 with ``CategoricalDtype`` that affect + the correctness of operations like ``merge``, ``concat``, and indexing when + comparing multiple unordered ``Categorical`` arrays that have the same categories + in a different order. We highly recommend upgrading or manually aligning your + categories. + +- Bug in ``Categorical.equals`` returning the wrong result when comparing two + unordered ``Categorical`` arrays with the same categories, but in a different + order (:issue:`16603`) +- Bug in :func:`pandas.api.types.union_categoricals` returning the wrong result + when for unordered categoricals with the categories in a different order. + This affected :func:`pandas.concat` with Categorical data (:issue:`19096`). +- Bug in :func:`pandas.merge` returning the wrong result when joining on an + unordered ``Categorical`` that had the same categories but in a different + order (:issue:`19551`) +- Bug in :meth:`CategoricalIndex.get_indexer` returning the wrong result when + ``target`` was an unordered ``Categorical`` that had the same categories as + ``self`` but in a different order (:issue:`19551`) +- Bug in :meth:`Index.astype` with a categorical dtype where the resultant index is not converted to a :class:`CategoricalIndex` for all types of index (:issue:`18630`) +- Bug in :meth:`Series.astype` and ``Categorical.astype()`` where an existing categorical data does not get updated (:issue:`10696`, :issue:`18593`) +- Bug in :class:`Index` constructor with ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (issue:`19032`) Datetimelike ^^^^^^^^^^^^ @@ -671,23 +697,6 @@ Reshaping - Improved error message for :func:`DataFrame.merge` when there is no common merge key (:issue:`19427`) - - -Categorical -^^^^^^^^^^^ - -- -- Bug in :func:`pandas.api.types.union_categoricals` returning the wrong result - when all the categoricals had the same categories, but in a different order. - This affected :func:`pandas.concat` with Categorical data (:issue:`19096`). -- Bug in :func:`pandas.merge` when joining on a ``Categorical`` column where - the categories were unordered and had the same values but in a different - order (:issue:`19551`) -- Bug in ``Categorical.equals`` between two unordered categories with the same categories, but in a different order (:issue:`16603`) -- Bug in :meth:`Index.astype` with a categorical dtype where the resultant index is not converted to a :class:`CategoricalIndex` for all types of index (:issue:`18630`) -- Bug in :meth:`Series.astype` and ``Categorical.astype()`` where an existing categorical data does not get updated (:issue:`10696`, :issue:`18593`) -- Bug in :class:`Index` constructor with ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (issue:`19032`) -- - Other ^^^^^ From 9eb7f28309673ef8ec549ce89a63bd964122289c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Feb 2018 10:01:50 -0600 Subject: [PATCH 4/4] Wording --- doc/source/whatsnew/v0.23.0.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 89624357e93c5..18a063ce3750b 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -533,11 +533,11 @@ Categorical .. warning:: - A class of bugs were introduced in 0.21 with ``CategoricalDtype`` that affect - the correctness of operations like ``merge``, ``concat``, and indexing when - comparing multiple unordered ``Categorical`` arrays that have the same categories - in a different order. We highly recommend upgrading or manually aligning your - categories. + A class of bugs were introduced in pandas 0.21 with ``CategoricalDtype`` that + affects the correctness of operations like ``merge``, ``concat``, and + indexing when comparing multiple unordered ``Categorical`` arrays that have + the same categories, but in a different order. We highly recommend upgrading + or manually aligning your categories before doing these operations. - Bug in ``Categorical.equals`` returning the wrong result when comparing two unordered ``Categorical`` arrays with the same categories, but in a different