BUG: Fix KeyError in merge on CategoricalIndex (pandas-dev#20777)

forking-repos · May 3, 2018 · 21f5fb1 · 21f5fb1
1 parent d3d3352
commit 21f5fb1
Show file tree

Hide file tree

Showing 4 changed files with 58 additions and 25 deletions.
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -1327,6 +1327,7 @@ Sparse
 Reshaping
 ^^^^^^^^^
 
+- Bug in :func:`DataFrame.merge` where referencing a ``CategoricalIndex`` by name in the ``on`` kwarg would raise a ``KeyError`` (:issue:`20777`)
 - Bug in :func:`DataFrame.stack` which fails trying to sort mixed type levels under Python 3 (:issue:`18310`)
 - Bug in :func:`DataFrame.unstack` which casts int to float if ``columns`` is a ``MultiIndex`` with unused levels (:issue:`17845`)
 - Bug in :func:`DataFrame.unstack` which raises an error if ``index`` is a ``MultiIndex`` with unused labels on the unstacked level (:issue:`18562`)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -1585,6 +1585,8 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
 
     if is_sparse(arr):
         arr = arr.get_values()
+    elif isinstance(arr, (ABCIndexClass, ABCSeries)):
+        arr = arr.values
 
     arr = np.asarray(arr)
 

diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -705,8 +705,7 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
                                 take_right = self.right[name]._values
 
             elif left_indexer is not None \
-                    and isinstance(self.left_join_keys[i], np.ndarray):
-
+                    and is_array_like(self.left_join_keys[i]):
                 take_left = self.left_join_keys[i]
                 take_right = self.right_join_keys[i]
 

diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
@@ -1,26 +1,27 @@
 # pylint: disable=E1103
 
-import pytest
-from datetime import datetime, date
-from numpy.random import randn
-from numpy import nan
-import numpy as np
 import random
 import re
+from collections import OrderedDict
+from datetime import date, datetime
+
+import numpy as np
+import pytest
+from numpy import nan
+from numpy.random import randn
 
 import pandas as pd
+import pandas.util.testing as tm
+from pandas import (Categorical, CategoricalIndex, DataFrame, DatetimeIndex,
+                    Float64Index, Index, Int64Index, MultiIndex, RangeIndex,
+                    Series, UInt64Index)
+from pandas.api.types import CategoricalDtype as CDT
 from pandas.compat import lrange, lzip
+from pandas.core.dtypes.common import is_categorical_dtype, is_object_dtype
+from pandas.core.dtypes.dtypes import CategoricalDtype
 from pandas.core.reshape.concat import concat
-from pandas.core.reshape.merge import merge, MergeError
+from pandas.core.reshape.merge import MergeError, merge
 from pandas.util.testing import assert_frame_equal, assert_series_equal
-from pandas.core.dtypes.dtypes import CategoricalDtype
-from pandas.core.dtypes.common import (
-    is_categorical_dtype,
-    is_object_dtype,
-)
-from pandas import DataFrame, Index, MultiIndex, Series, Categorical
-import pandas.util.testing as tm
-from pandas.api.types import CategoricalDtype as CDT
 
 N = 50
 NGROUPS = 8
@@ -813,7 +814,7 @@ def test_validation(self):
 
         # Dups on right
         right_w_dups = right.append(pd.DataFrame({'a': ['e'], 'c': ['moo']},
-                                    index=[4]))
+                                                 index=[4]))
         merge(left, right_w_dups, left_index=True, right_index=True,
               validate='one_to_many')
 
@@ -1388,17 +1389,24 @@ def test_merge_datetime_index(self, klass):
         if klass is not None:
             on_vector = klass(on_vector)
 
-        expected = DataFrame({"a": [1, 2, 3]})
-
-        if klass == np.asarray:
-            # The join key is added for ndarray.
-            expected["key_1"] = [2016, 2017, 2018]
+        expected = DataFrame(
+            OrderedDict([
+                ("a", [1, 2, 3]),
+                ("key_1", [2016, 2017, 2018]),
+            ])
+        )
 
         result = df.merge(df, on=["a", on_vector], how="inner")
         tm.assert_frame_equal(result, expected)
 
-        expected = DataFrame({"a_x": [1, 2, 3],
-                              "a_y": [1, 2, 3]})
+        expected = DataFrame(
+            OrderedDict([
+                ("key_0", [2016, 2017, 2018]),
+                ("a_x", [1, 2, 3]),
+                ("a_y", [1, 2, 3]),
+            ])
+        )
+
         result = df.merge(df, on=[df.index.year], how="inner")
         tm.assert_frame_equal(result, expected)
 
@@ -1427,7 +1435,7 @@ def test_different(self, right_vals):
         # We allow merging on object and categorical cols and cast
         # categorical cols to object
         if (is_categorical_dtype(right['A'].dtype) or
-           is_object_dtype(right['A'].dtype)):
+                is_object_dtype(right['A'].dtype)):
             result = pd.merge(left, right, on='A')
             assert is_object_dtype(result.A.dtype)
 
@@ -1826,3 +1834,26 @@ def test_merge_on_indexes(self, left_df, right_df, how, sort, expected):
                           how=how,
                           sort=sort)
         tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    'index', [
+        CategoricalIndex(['A', 'B'], categories=['A', 'B'], name='index_col'),
+        Float64Index([1.0, 2.0], name='index_col'),
+        Int64Index([1, 2], name='index_col'),
+        UInt64Index([1, 2], name='index_col'),
+        RangeIndex(start=0, stop=2, name='index_col'),
+        DatetimeIndex(["2018-01-01", "2018-01-02"], name='index_col'),
+    ], ids=lambda x: type(x).__name__)
+def test_merge_index_types(index):
+    # gh-20777: merging on an index name raised KeyError for CategoricalIndex;
+    # the merged frame must keep that index, identically for every index type
+    left = DataFrame({"left_data": [1, 2]}, index=index)
+    right = DataFrame({"right_data": [1.0, 2.0]}, index=index)
+
+    result = left.merge(right, on=['index_col'])
+
+    expected = DataFrame(
+        OrderedDict([('left_data', [1, 2]), ('right_data', [1.0, 2.0])]),
+        index=index)
+    assert_frame_equal(result, expected)