Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Fix merging non-indexes causing Index dtype promotion when keys are missing #28436

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions doc/source/whatsnew/v0.25.4.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@

Reshaping
^^^^^^^^^

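- Added new option ``index_na_value`` that lets the user specify the NA value placed in the index when a join has missing keys, avoiding the dtype promotion caused by the default NA. It applies to joins other than those using ``left_index`` with ``how='right'`` or ``right_index`` with ``how='left'`` (:issue:`28220`).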

6 changes: 6 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,12 @@

.. versionadded:: 0.21.0

index_na_value : value, optional
If a join requires NA values to be placed in the index, use this value;
otherwise the default NA for the dtype is used, which may involve a type promotion.

.. versionadded:: 0.25.2

Returns
-------
DataFrame
Expand Down
17 changes: 16 additions & 1 deletion pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@
from pandas.core.sorting import is_int64_overflow_possible


class DefaultNA:
    """
    Sentinel type meaning "no ``index_na_value`` was supplied".

    An instance is used as the default for the ``index_na_value`` argument.
    When the value is still an instance of this class, the join index falls
    back to the default NA for the index dtype; any other value (including
    ``None``) is used as the fill value as given.
    """

    def __repr__(self):
        # Keep rendered signatures and help() output stable and readable
        # instead of showing a memory address.
        return "DefaultNA()"


@Substitution("\nleft : DataFrame")
@Appender(_merge_doc, indents=0)
def merge(
Expand All @@ -63,6 +67,7 @@ def merge(
copy=True,
indicator=False,
validate=None,
index_na_value=DefaultNA(),
):
op = _MergeOperation(
left,
Expand All @@ -78,6 +83,7 @@ def merge(
copy=copy,
indicator=indicator,
validate=validate,
index_na_value=index_na_value,
)
return op.get_result()

Expand Down Expand Up @@ -555,6 +561,7 @@ def __init__(
copy=True,
indicator=False,
validate=None,
index_na_value=DefaultNA(),
):
left = validate_operand(left)
right = validate_operand(right)
Expand Down Expand Up @@ -623,6 +630,10 @@ def __init__(
if validate is not None:
self._validate(validate)

# if a join requires NA values to be placed in the index
# use this value or default NA which may involve a type promotion
self.index_na_value = index_na_value

def get_result(self):
if self.indicator:
self.left, self.right = self._indicator_pre_merge(self.left, self.right)
Expand Down Expand Up @@ -902,7 +913,11 @@ def _create_join_index(
# and fill_value because it throws a ValueError on integer indices
mask = indexer == -1
if np.any(mask):
fill_value = na_value_for_dtype(index.dtype, compat=False)
if isinstance(self.index_na_value, DefaultNA):
fill_value = na_value_for_dtype(index.dtype, compat=False)
else:
fill_value = self.index_na_value

index = index.append(Index([fill_value]))
return index.take(indexer)

Expand Down
175 changes: 175 additions & 0 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -2153,3 +2153,178 @@ def test_merge_multiindex_columns():
expected["id"] = ""

tm.assert_frame_equal(result, expected)


@pytest.fixture(
    params=[
        {"domain": pd.Index(["A", "B", "C"])},
        {"domain": CategoricalIndex(["A", "B", "C"])},
        {"domain": DatetimeIndex(["2001-01-01", "2002-02-02", "2003-03-03"])},
        {"domain": Float64Index([1, 2, 3])},
        {"domain": Int64Index([1, 2, 3])},
        {"domain": IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)])},
        {"domain": TimedeltaIndex(["1d", "2d", "3d"])},
    ]
)
def fix_GH_28220_(request):
    """
    Provide the frames used by the GH 28220 merge tests for one index type.

    The returned object exposes:

    * ``domain`` - the three-element index supplied by the fixture param
    * ``X`` - frame with a ``count`` column, indexed by ``domain[[0, 1]]``
    * ``Y`` - frame whose ``name`` column holds ``domain[[0, 2]]``
    * ``Z`` - like ``Y`` but with the first key repeated (``domain[[0, 0, 2]]``)
    * ``E`` - empty frame with columns ``name`` and ``value``
    """
    domain = request.param["domain"]

    class Data:
        # Plain attribute holder; populated right below.
        pass

    data = Data()
    data.domain = domain
    data.X = pd.DataFrame({"count": [1, 2]}, index=domain.take([0, 1]))
    data.Y = pd.DataFrame({"name": domain.take([0, 2]), "value": [100, 200]})
    data.Z = pd.DataFrame(
        {"name": domain.take([0, 0, 2]), "value": [100, 200, 300]}
    )
    data.E = pd.DataFrame(columns=["name", "value"])

    # The left frame must keep the index type under test.
    assert isinstance(data.X.index, type(domain))

    return data


@pytest.mark.parametrize(
    "how,expected",
    [
        ("left", ([0, -255], [0, 1, -255], [0, 1])),
        ("inner", ([0], [0, 1], [])),
        ("outer", ([0, -255, 1], [0, 1, -255, 2], [0, 1])),
    ],
)
def test_left_index_merge_with_missing_by_right_on(fix_GH_28220_, how, expected):
    """
    Merge ``X`` (on its index) with ``Y``, ``Z`` and ``E`` (on ``name``),
    passing ``index_na_value=-255``, and compare each result index with the
    parametrized expectation.
    """
    # GH 28220
    data = fix_GH_28220_
    exp_y, exp_z, exp_e = (pd.Index(values) for values in expected)
    # The expectation for the empty right frame is given as positions
    # into the domain index.
    exp_e = data.domain.take(exp_e)

    def merge_on_left_index(right):
        return pd.merge(
            data.X,
            right,
            left_index=True,
            right_on=["name"],
            how=how,
            index_na_value=-255,
        )

    res_y = merge_on_left_index(data.Y)
    tm.assert_index_equal(res_y.index, exp_y)

    res_z = merge_on_left_index(data.Z)
    tm.assert_index_equal(res_z.index, exp_z)

    res_e = merge_on_left_index(data.E)

    # special case when result is empty, dtype is object
    if res_e.empty:
        exp_e = pd.Index([], dtype=object, name=exp_e.name)

    tm.assert_index_equal(res_e.index, exp_e)


@pytest.mark.parametrize(
    "how,expected",
    [
        ("right", ([0, -255], [0, 0, -255], [0, 1, 2])),
        ("inner", ([0], [0, 0], [])),
        ("outer", ([0, 1, -255], [0, 0, 1, -255], [0, 1])),
    ],
)
def test_left_on_merge_with_missing_by_right_index(fix_GH_28220_, how, expected):
    """
    Merge ``X`` (index reset, joined on its ``index`` column) with ``Y``,
    ``Z`` and ``E`` re-indexed by ``name``, passing ``index_na_value=-255``,
    and compare each result index with the parametrized expectation.
    """
    # GH 28220
    data = fix_GH_28220_
    exp_y, exp_z, exp_e = (pd.Index(values) for values in expected)

    def merge_on_right_index(right):
        return pd.merge(
            data.X.reset_index(),
            right.set_index("name"),
            left_on=["index"],
            right_index=True,
            how=how,
            index_na_value=-255,
        )

    res_y = merge_on_right_index(data.Y)
    tm.assert_index_equal(res_y.index, exp_y)

    res_z = merge_on_right_index(data.Z)
    tm.assert_index_equal(res_z.index, exp_z)

    res_e = merge_on_right_index(data.E)

    # special case when result is empty, dtype is object
    if res_e.empty:
        exp_e = pd.Index([], dtype=object, name=exp_e.name)

    tm.assert_index_equal(res_e.index, exp_e)


@pytest.mark.parametrize(
    "how,expected",
    [
        ("left", ([0, 1], [0, 1, 2], [0, 1])),
        ("right", ([0, 1], [0, 1, 2], [0, 2])),
        ("inner", ([0], [0, 1], [])),
        ("outer", ([0, 1, 2], [0, 1, 2, 3], [0, 1])),
    ],
)
def test_left_on_merge_with_missing_by_right_on(fix_GH_28220_, how, expected):
    """
    Merge ``X`` (index reset, joined on its ``index`` column) with ``Y``,
    ``Z`` and ``E`` on their ``name`` column, without ``index_na_value``,
    and compare each result index with the parametrized expectation.
    """
    # GH 28220
    data = fix_GH_28220_
    exp_y, exp_z, exp_e = (pd.Index(values) for values in expected)

    def merge_on_columns(right):
        return pd.merge(
            data.X.reset_index(),
            right,
            left_on=["index"],
            right_on=["name"],
            how=how,
        )

    res_y = merge_on_columns(data.Y)
    tm.assert_index_equal(res_y.index, exp_y)

    res_z = merge_on_columns(data.Z)
    tm.assert_index_equal(res_z.index, exp_z)

    res_e = merge_on_columns(data.E)

    # special case when result is empty, dtype is object
    if res_e.empty:
        exp_e = pd.Index([], dtype=object, name=exp_e.name)

    tm.assert_index_equal(res_e.index, exp_e)