Skip to content

Commit

Permalink
BUG: Fix merging on non-indexes causing Index dtype promotion when key…
Browse files Browse the repository at this point in the history
…s are missing from left or right side. (GH28220)

Also closes GH24897, GH24212, and GH17257
  • Loading branch information
dworvos committed Sep 28, 2019
1 parent f8a924b commit a0a95c2
Show file tree
Hide file tree
Showing 4 changed files with 185 additions and 2 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ Groupby/resample/rolling
Reshaping
^^^^^^^^^

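- Added a new ``index_na_value`` option to :func:`merge` that lets the user specify the value placed in the resulting index for missing keys, instead of the default NA for the dtype, which causes a dtype promotion. This applies to joins that do not use ``left_index`` when ``how='right'``, or do not use ``right_index`` when ``how='left'`` (:issue:`28220`).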
-
-
-
Expand Down
6 changes: 6 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,12 @@
.. versionadded:: 0.21.0
index_na_value: value, optional
If a join requires NA values to be placed in the index, use this value
instead of the default NA for the dtype, which may involve a type promotion.
.. versionadded:: 0.25.2
Returns
-------
DataFrame
Expand Down
15 changes: 14 additions & 1 deletion pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@
import pandas.core.sorting as sorting
from pandas.core.sorting import is_int64_overflow_possible

class DefaultNA:
    """
    Sentinel type meaning "no ``index_na_value`` was supplied".

    An instance of this class is used as the default of the
    ``index_na_value`` argument, and callers detect it with ``isinstance``.
    Using a dedicated type rather than ``None`` keeps every user-provided
    value, including ``None``, distinguishable from "argument not given".
    """

    def __repr__(self):
        # The instance appears as a default in function signatures; a fixed
        # repr avoids leaking a per-process memory address into help() output.
        return "DefaultNA()"

@Substitution("\nleft : DataFrame")
@Appender(_merge_doc, indents=0)
Expand All @@ -64,6 +66,7 @@ def merge(
copy=True,
indicator=False,
validate=None,
index_na_value=DefaultNA(),
):
op = _MergeOperation(
left,
Expand All @@ -79,6 +82,7 @@ def merge(
copy=copy,
indicator=indicator,
validate=validate,
index_na_value=index_na_value,
)
return op.get_result()

Expand Down Expand Up @@ -551,6 +555,7 @@ def __init__(
copy=True,
indicator=False,
validate=None,
index_na_value=DefaultNA(),
):
left = validate_operand(left)
right = validate_operand(right)
Expand Down Expand Up @@ -619,6 +624,10 @@ def __init__(
if validate is not None:
self._validate(validate)

# if a join requires NA values to be placed in the index
# use this value or default NA which may involve a type promotion
self.index_na_value = index_na_value

def get_result(self):
if self.indicator:
self.left, self.right = self._indicator_pre_merge(self.left, self.right)
Expand Down Expand Up @@ -898,7 +907,11 @@ def _create_join_index(
# and fill_value because it throws a ValueError on integer indices
mask = indexer == -1
if np.any(mask):
fill_value = na_value_for_dtype(index.dtype, compat=False)
if isinstance(self.index_na_value, DefaultNA):
fill_value = na_value_for_dtype(index.dtype, compat=False)
else:
fill_value = self.index_na_value

index = index.append(Index([fill_value]))
return index.take(indexer)

Expand Down
165 changes: 164 additions & 1 deletion pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,11 @@
from pandas.core.reshape.concat import concat
from pandas.core.reshape.merge import MergeError, merge
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal, assert_series_equal
from pandas.util.testing import (
assert_frame_equal,
assert_series_equal,
assert_index_equal,
)

N = 50
NGROUPS = 8
Expand Down Expand Up @@ -2094,3 +2098,162 @@ def test_merge_equal_cat_dtypes2():

# Categorical is unordered, so don't check ordering.
tm.assert_frame_equal(result, expected, check_categorical=False)

@pytest.fixture(
    params=[
        {"domain": pd.Index(["A", "B", "C"])},
        {"domain": CategoricalIndex(["A", "B", "C"])},
        {"domain": DatetimeIndex(["2001-01-01", "2002-02-02", "2003-03-03"])},
        {"domain": Float64Index([1, 2, 3])},
        {"domain": Int64Index([1, 2, 3])},
        {"domain": IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)])},
        {"domain": TimedeltaIndex(["1d", "2d", "3d"])},
        {
            "domain": PeriodIndex(
                ["2001-01-01", "2001-01-02", "2001-01-03"], freq="D"
            )
        },
    ]
)
def fix_GH_28220_(request):
    """
    Build the frames used by the GH 28220 merge tests for one index type.

    The returned object exposes:

    * ``domain`` -- the three-element index supplied by the fixture param
    * ``X`` -- frame indexed by the first two ``domain`` elements
    * ``Y`` -- frame whose ``name`` column holds ``domain`` elements 0 and 2
    * ``Z`` -- like ``Y`` but with element 0 repeated
    * ``E`` -- empty frame with ``name`` and ``value`` columns
    """

    class Data:
        def __init__(self):
            domain = request.param["domain"]
            self.domain = domain

            # left-hand frame: keyed by its index
            self.X = pd.DataFrame({"count": [1, 2]}, index=domain.take([0, 1]))

            # right-hand frames: keyed by the "name" column
            self.Y = pd.DataFrame(
                {"name": domain.take([0, 2]), "value": [100, 200]}
            )
            self.Z = pd.DataFrame(
                {"name": domain.take([0, 0, 2]), "value": [100, 200, 300]}
            )
            self.E = pd.DataFrame(columns=["name", "value"])

            # the index type must survive DataFrame construction
            assert isinstance(self.X.index, type(self.domain))

    return Data()

@pytest.mark.parametrize(
    "how,expected",
    [
        ("left", ([0, -255], [0, 1, -255], [0, 1])),
        ("inner", ([0], [0, 1], [])),
        ("outer", ([0, -255, 1], [0, 1, -255, 2], [0, 1])),
    ],
)
def test_left_index_merge_with_missing_by_right_on(fix_GH_28220_, how, expected):
    """
    GH 28220: merge with ``left_index=True`` and ``right_on=["name"]``.

    Each parametrized case lists the expected result index for merging ``X``
    with ``Y``, ``Z`` and the empty frame ``E``, using ``index_na_value=-255``.
    """
    data = fix_GH_28220_

    def merge_with(right):
        # identical merge arguments for every right-hand frame
        return pd.merge(
            data.X,
            right,
            left_index=True,
            right_on=["name"],
            how=how,
            index_na_value=-255,
        )

    e1, e2, e3 = (pd.Index(values) for values in expected)
    # the third expectation is given as positions into the domain
    e3 = data.domain.take(e3)

    r1 = merge_with(data.Y)
    assert_index_equal(r1.index, e1)

    r2 = merge_with(data.Z)
    assert_index_equal(r2.index, e2)

    r3 = merge_with(data.E)

    # special case when result is empty, dtype is object
    if r3.empty:
        e3 = pd.Index([], dtype=object, name=e3.name)

    assert_index_equal(r3.index, e3)


@pytest.mark.parametrize(
    "how,expected",
    [
        ("right", ([0, -255], [0, 0, -255], [0, 1, 2])),
        ("inner", ([0], [0, 0], [])),
        ("outer", ([0, 1, -255], [0, 0, 1, -255], [0, 1])),
    ],
)
def test_left_on_merge_with_missing_by_right_index(fix_GH_28220_, how, expected):
    """
    GH 28220: merge with ``left_on=["index"]`` and ``right_index=True``.

    The left frame is ``X`` with its index reset to a column; each right
    frame is re-indexed by ``name``. Each parametrized case lists the expected
    result index for ``Y``, ``Z`` and the empty frame ``E``, using
    ``index_na_value=-255``.
    """
    data = fix_GH_28220_

    def merge_with(right):
        # both operands are rebuilt on every call, as in a plain inline merge
        return pd.merge(
            data.X.reset_index(),
            right.set_index("name"),
            left_on=["index"],
            right_index=True,
            how=how,
            index_na_value=-255,
        )

    e1, e2, e3 = (pd.Index(values) for values in expected)

    r1 = merge_with(data.Y)
    assert_index_equal(r1.index, e1)

    r2 = merge_with(data.Z)
    assert_index_equal(r2.index, e2)

    r3 = merge_with(data.E)

    # special case when result is empty, dtype is object
    if r3.empty:
        e3 = pd.Index([], dtype=object, name=e3.name)

    assert_index_equal(r3.index, e3)


@pytest.mark.parametrize(
    "how,expected",
    [
        ("left", ([0, 1], [0, 1, 2], [0, 1])),
        ("right", ([0, 1], [0, 1, 2], [0, 2])),
        ("inner", ([0], [0, 1], [])),
        ("outer", ([0, 1, 2], [0, 1, 2, 3], [0, 1])),
    ],
)
def test_left_on_merge_with_missing_by_right_on(fix_GH_28220_, how, expected):
    """
    GH 28220: merge with ``left_on=["index"]`` and ``right_on=["name"]``.

    Neither side uses its index as the key and no ``index_na_value`` is
    passed. Each parametrized case lists the expected result index for
    merging against ``Y``, ``Z`` and the empty frame ``E``.
    """
    data = fix_GH_28220_

    def merge_with(right):
        # the left operand is rebuilt on every call, as in a plain inline merge
        return pd.merge(
            data.X.reset_index(),
            right,
            left_on=["index"],
            right_on=["name"],
            how=how,
        )

    e1, e2, e3 = (pd.Index(values) for values in expected)

    r1 = merge_with(data.Y)
    assert_index_equal(r1.index, e1)

    r2 = merge_with(data.Z)
    assert_index_equal(r2.index, e2)

    r3 = merge_with(data.E)

    # special case when result is empty, dtype is object
    if r3.empty:
        e3 = pd.Index([], dtype=object, name=e3.name)

    assert_index_equal(r3.index, e3)

0 comments on commit a0a95c2

Please sign in to comment.