Skip to content

Commit

Permalink
BUG: Fix KeyError in merge on CategoricalIndex (pandas-dev#20777)
Browse files Browse the repository at this point in the history
  • Loading branch information
fjetter authored and jreback committed May 3, 2018
1 parent d3d3352 commit 21f5fb1
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 25 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1327,6 +1327,7 @@ Sparse
Reshaping
^^^^^^^^^

- Bug in :func:`DataFrame.merge` where referencing a ``CategoricalIndex`` by name in the ``on`` kwarg would raise a ``KeyError`` (:issue:`20777`)
- Bug in :func:`DataFrame.stack` which fails trying to sort mixed type levels under Python 3 (:issue:`18310`)
- Bug in :func:`DataFrame.unstack` which casts int to float if ``columns`` is a ``MultiIndex`` with unused levels (:issue:`17845`)
- Bug in :func:`DataFrame.unstack` which raises an error if ``index`` is a ``MultiIndex`` with unused labels on the unstacked level (:issue:`18562`)
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -1585,6 +1585,8 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,

if is_sparse(arr):
arr = arr.get_values()
elif isinstance(arr, (ABCIndexClass, ABCSeries)):
arr = arr.values

arr = np.asarray(arr)

Expand Down
3 changes: 1 addition & 2 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -705,8 +705,7 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
take_right = self.right[name]._values

elif left_indexer is not None \
and isinstance(self.left_join_keys[i], np.ndarray):

and is_array_like(self.left_join_keys[i]):
take_left = self.left_join_keys[i]
take_right = self.right_join_keys[i]

Expand Down
77 changes: 54 additions & 23 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,27 @@
# pylint: disable=E1103

import pytest
from datetime import datetime, date
from numpy.random import randn
from numpy import nan
import numpy as np
import random
import re
from collections import OrderedDict
from datetime import date, datetime

import numpy as np
import pytest
from numpy import nan
from numpy.random import randn

import pandas as pd
import pandas.util.testing as tm
from pandas import (Categorical, CategoricalIndex, DataFrame, DatetimeIndex,
Float64Index, Index, Int64Index, MultiIndex, RangeIndex,
Series, UInt64Index)
from pandas.api.types import CategoricalDtype as CDT
from pandas.compat import lrange, lzip
from pandas.core.dtypes.common import is_categorical_dtype, is_object_dtype
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.reshape.concat import concat
from pandas.core.reshape.merge import merge, MergeError
from pandas.core.reshape.merge import MergeError, merge
from pandas.util.testing import assert_frame_equal, assert_series_equal
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.common import (
is_categorical_dtype,
is_object_dtype,
)
from pandas import DataFrame, Index, MultiIndex, Series, Categorical
import pandas.util.testing as tm
from pandas.api.types import CategoricalDtype as CDT

N = 50
NGROUPS = 8
Expand Down Expand Up @@ -813,7 +814,7 @@ def test_validation(self):

# Dups on right
right_w_dups = right.append(pd.DataFrame({'a': ['e'], 'c': ['moo']},
index=[4]))
index=[4]))
merge(left, right_w_dups, left_index=True, right_index=True,
validate='one_to_many')

Expand Down Expand Up @@ -1388,17 +1389,24 @@ def test_merge_datetime_index(self, klass):
if klass is not None:
on_vector = klass(on_vector)

expected = DataFrame({"a": [1, 2, 3]})

if klass == np.asarray:
# The join key is added for ndarray.
expected["key_1"] = [2016, 2017, 2018]
expected = DataFrame(
OrderedDict([
("a", [1, 2, 3]),
("key_1", [2016, 2017, 2018]),
])
)

result = df.merge(df, on=["a", on_vector], how="inner")
tm.assert_frame_equal(result, expected)

expected = DataFrame({"a_x": [1, 2, 3],
"a_y": [1, 2, 3]})
expected = DataFrame(
OrderedDict([
("key_0", [2016, 2017, 2018]),
("a_x", [1, 2, 3]),
("a_y", [1, 2, 3]),
])
)

result = df.merge(df, on=[df.index.year], how="inner")
tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -1427,7 +1435,7 @@ def test_different(self, right_vals):
# We allow merging on object and categorical cols and cast
# categorical cols to object
if (is_categorical_dtype(right['A'].dtype) or
is_object_dtype(right['A'].dtype)):
is_object_dtype(right['A'].dtype)):
result = pd.merge(left, right, on='A')
assert is_object_dtype(result.A.dtype)

Expand Down Expand Up @@ -1826,3 +1834,26 @@ def test_merge_on_indexes(self, left_df, right_df, how, sort, expected):
how=how,
sort=sort)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    'index', [
        CategoricalIndex(['A', 'B'], categories=['A', 'B'], name='index_col'),
        Float64Index([1.0, 2.0], name='index_col'),
        Int64Index([1, 2], name='index_col'),
        UInt64Index([1, 2], name='index_col'),
        RangeIndex(start=0, stop=2, name='index_col'),
        DatetimeIndex(["2018-01-01", "2018-01-02"], name='index_col'),
    ], ids=lambda x: type(x).__name__)
def test_merge_index_types(index):
    """Merging on a named index gives the same result for every index type.

    Regression test for gh-20777: both frames share ``index`` and are merged
    by passing the index name through ``on``; the result is expected to hold
    both data columns, in order, on that same index.
    """
    lhs = DataFrame({"left_data": [1, 2]}, index=index)
    rhs = DataFrame({"right_data": [1.0, 2.0]}, index=index)

    # OrderedDict pins the column order of the expected frame.
    expected_columns = OrderedDict()
    expected_columns['left_data'] = [1, 2]
    expected_columns['right_data'] = [1.0, 2.0]
    expected = DataFrame(expected_columns, index=index)

    merged = lhs.merge(rhs, on=['index_col'])
    assert_frame_equal(merged, expected)

0 comments on commit 21f5fb1

Please sign in to comment.