Skip to content

Commit

Permalink
BUG: Fix merging non-indexes causing Index dtype promotion when key…
Browse files Browse the repository at this point in the history
…s are missing from left or right side. (GH28220)

Also closes GH24897, GH24212, and GH17257
  • Loading branch information
dworvos committed Nov 9, 2019
1 parent 6498bc1 commit ec70eee
Show file tree
Hide file tree
Showing 4 changed files with 203 additions and 1 deletion.
6 changes: 6 additions & 0 deletions doc/source/whatsnew/v0.25.4.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@

Reshaping
^^^^^^^^^

- Added a new ``index_na_value`` option that lets the user specify the NA value placed in the index for joins with missing keys (when not using ``left_index`` with ``how='right'``, or ``right_index`` with ``how='left'``), which would otherwise cause a dtype promotion (:issue:`28220`).

6 changes: 6 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,12 @@
.. versionadded:: 0.21.0
index_na_value : value, optional
If a join requires NA values to be placed in the index, use this value;
if omitted, the default NA for the dtype is used, which may involve a type promotion.
.. versionadded:: 0.25.2
Returns
-------
DataFrame
Expand Down
17 changes: 16 additions & 1 deletion pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@
from pandas.core.sorting import is_int64_overflow_possible


class DefaultNA:
    """
    Sentinel type marking that no explicit ``index_na_value`` was supplied.

    An instance of this class is used as the default for ``index_na_value``
    and is detected with ``isinstance``, so any real value (including
    ``None``) can still be passed as an explicit fill value.
    """

    def __repr__(self) -> str:
        # The default instance appears in function signatures and rendered
        # docs; a stable repr avoids showing a memory address there.
        return "DefaultNA()"


@Substitution("\nleft : DataFrame")
@Appender(_merge_doc, indents=0)
def merge(
Expand All @@ -63,6 +67,7 @@ def merge(
copy=True,
indicator=False,
validate=None,
index_na_value=DefaultNA(),
):
op = _MergeOperation(
left,
Expand All @@ -78,6 +83,7 @@ def merge(
copy=copy,
indicator=indicator,
validate=validate,
index_na_value=index_na_value,
)
return op.get_result()

Expand Down Expand Up @@ -555,6 +561,7 @@ def __init__(
copy=True,
indicator=False,
validate=None,
index_na_value=DefaultNA(),
):
left = validate_operand(left)
right = validate_operand(right)
Expand Down Expand Up @@ -623,6 +630,10 @@ def __init__(
if validate is not None:
self._validate(validate)

# if a join requires NA values to be placed in the index
# use this value or default NA which may involve a type promotion
self.index_na_value = index_na_value

def get_result(self):
if self.indicator:
self.left, self.right = self._indicator_pre_merge(self.left, self.right)
Expand Down Expand Up @@ -902,7 +913,11 @@ def _create_join_index(
# and fill_value because it throws a ValueError on integer indices
mask = indexer == -1
if np.any(mask):
fill_value = na_value_for_dtype(index.dtype, compat=False)
if isinstance(self.index_na_value, DefaultNA):
fill_value = na_value_for_dtype(index.dtype, compat=False)
else:
fill_value = self.index_na_value

index = index.append(Index([fill_value]))
return index.take(indexer)

Expand Down
175 changes: 175 additions & 0 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -2153,3 +2153,178 @@ def test_merge_multiindex_columns():
expected["id"] = ""

tm.assert_frame_equal(result, expected)


@pytest.fixture(
    params=[
        {"domain": pd.Index(["A", "B", "C"])},
        {"domain": CategoricalIndex(["A", "B", "C"])},
        {"domain": DatetimeIndex(["2001-01-01", "2002-02-02", "2003-03-03"])},
        {"domain": Float64Index([1, 2, 3])},
        {"domain": Int64Index([1, 2, 3])},
        {"domain": IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)])},
        {"domain": TimedeltaIndex(["1d", "2d", "3d"])},
    ]
)
def fix_GH_28220_(request):
    """
    Build the frames shared by the GH 28220 merge tests for one index type.

    The returned object exposes ``domain`` (the parametrized index), ``X``
    (indexed by the first two domain values), ``Y`` and ``Z`` (frames with a
    "name" column drawn from the domain) and ``E`` (an empty frame with
    "name" and "value" columns).
    """

    class Data:
        def __init__(self, domain):
            self.domain = domain
            # X is keyed by the index itself; Y/Z carry the keys as a column.
            self.X = pd.DataFrame({"count": [1, 2]}, index=domain.take([0, 1]))
            self.Y = pd.DataFrame({"name": domain.take([0, 2]), "value": [100, 200]})
            self.Z = pd.DataFrame(
                {"name": domain.take([0, 0, 2]), "value": [100, 200, 300]}
            )
            self.E = pd.DataFrame(columns=["name", "value"])

            # Building X must not change the index type under test.
            assert isinstance(self.X.index, type(self.domain))

    return Data(request.param["domain"])


@pytest.mark.parametrize(
    "how,expected",
    [
        ("left", ([0, -255], [0, 1, -255], [0, 1])),
        ("inner", ([0], [0, 1], [])),
        ("outer", ([0, -255, 1], [0, 1, -255, 2], [0, 1])),
    ],
)
def test_left_index_merge_with_missing_by_right_on(fix_GH_28220_, how, expected):
    # GH 28220
    # Merge the left index against the right "name" column, asking for -255
    # as the index fill value, and compare the resulting indexes.
    data = fix_GH_28220_
    exp_y, exp_z, exp_empty = [pd.Index(values) for values in expected]
    exp_empty = data.domain.take(exp_empty)

    def merge_with(right):
        return pd.merge(
            data.X,
            right,
            left_index=True,
            right_on=["name"],
            how=how,
            index_na_value=-255,
        )

    res_y = merge_with(data.Y)
    tm.assert_index_equal(res_y.index, exp_y)

    res_z = merge_with(data.Z)
    tm.assert_index_equal(res_z.index, exp_z)

    res_empty = merge_with(data.E)

    # special case when result is empty, dtype is object
    if res_empty.empty:
        exp_empty = pd.Index([], dtype=object, name=exp_empty.name)

    tm.assert_index_equal(res_empty.index, exp_empty)


@pytest.mark.parametrize(
    "how,expected",
    [
        ("right", ([0, -255], [0, 0, -255], [0, 1, 2])),
        ("inner", ([0], [0, 0], [])),
        ("outer", ([0, 1, -255], [0, 0, 1, -255], [0, 1])),
    ],
)
def test_left_on_merge_with_missing_by_right_index(fix_GH_28220_, how, expected):
    # GH 28220
    # Merge the left "index" column against the right index (set from
    # "name"), asking for -255 as the index fill value.
    data = fix_GH_28220_
    exp_y, exp_z, exp_empty = [pd.Index(values) for values in expected]

    def merge_with(right):
        return pd.merge(
            data.X.reset_index(),
            right.set_index("name"),
            left_on=["index"],
            right_index=True,
            how=how,
            index_na_value=-255,
        )

    res_y = merge_with(data.Y)
    tm.assert_index_equal(res_y.index, exp_y)

    res_z = merge_with(data.Z)
    tm.assert_index_equal(res_z.index, exp_z)

    res_empty = merge_with(data.E)

    # special case when result is empty, dtype is object
    if res_empty.empty:
        exp_empty = pd.Index([], dtype=object, name=exp_empty.name)

    tm.assert_index_equal(res_empty.index, exp_empty)


@pytest.mark.parametrize(
    "how,expected",
    [
        ("left", ([0, 1], [0, 1, 2], [0, 1])),
        ("right", ([0, 1], [0, 1, 2], [0, 2])),
        ("inner", ([0], [0, 1], [])),
        ("outer", ([0, 1, 2], [0, 1, 2, 3], [0, 1])),
    ],
)
def test_left_on_merge_with_missing_by_right_on(fix_GH_28220_, how, expected):
    # GH 28220
    # Column-on-column merge: no index_na_value is passed here, and the
    # result indexes are compared against the expected positions.
    data = fix_GH_28220_
    exp_y, exp_z, exp_empty = [pd.Index(values) for values in expected]

    def merge_with(right):
        return pd.merge(
            data.X.reset_index(),
            right,
            left_on=["index"],
            right_on=["name"],
            how=how,
        )

    res_y = merge_with(data.Y)
    tm.assert_index_equal(res_y.index, exp_y)

    res_z = merge_with(data.Z)
    tm.assert_index_equal(res_z.index, exp_z)

    res_empty = merge_with(data.E)

    # special case when result is empty, dtype is object
    if res_empty.empty:
        exp_empty = pd.Index([], dtype=object, name=exp_empty.name)

    tm.assert_index_equal(res_empty.index, exp_empty)

0 comments on commit ec70eee

Please sign in to comment.