Skip to content

Commit

Permalink
BUG: Fix merging non-indexes causes Index dtype promotion in when key…
Browse files Browse the repository at this point in the history
…s are missing from left or right side. (GH28220)

Also closes GH24897, GH24212, and GH17257
  • Loading branch information
dworvos committed Oct 20, 2019
1 parent e623f0f commit 730c97c
Show file tree
Hide file tree
Showing 4 changed files with 208 additions and 2 deletions.
5 changes: 5 additions & 0 deletions doc/source/whatsnew/v0.25.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ Groupby/resample/rolling
- Bug incorrectly raising an ``IndexError`` when passing a list of quantiles to :meth:`pandas.core.groupby.DataFrameGroupBy.quantile` (:issue:`28113`).
- Bug in :meth:`pandas.core.groupby.GroupBy.shift`, :meth:`pandas.core.groupby.GroupBy.bfill` and :meth:`pandas.core.groupby.GroupBy.ffill` where timezone information would be dropped (:issue:`19995`, :issue:`27992`)

Reshaping
^^^^^^^^^

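- Added the ``index_na_value`` option to :func:`merge`, which lets the user specify the value placed in the result index for keys that have no match, instead of the default NA value that causes dtype promotion of the index. This applies when ``how='right'`` is used without ``left_index``, or ``how='left'`` is used without ``right_index`` (:issue:`28220`).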

Other
^^^^^

Expand Down
6 changes: 6 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,12 @@
.. versionadded:: 0.21.0
index_na_value : value, optional
Value to place in the index when a join requires NA values there.
If not given, the default NA for the index dtype is used, which may involve a type promotion.
.. versionadded:: 0.25.2
Returns
-------
DataFrame
Expand Down
17 changes: 16 additions & 1 deletion pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@
from pandas.core.sorting import is_int64_overflow_possible


class DefaultNA:
    """
    Sentinel type marking that no explicit ``index_na_value`` was supplied.

    An instance of this class is used as the default for the
    ``index_na_value`` argument so that any real value (including ``None``
    or ``NaN``) can be passed explicitly by the caller.  Consumers detect
    the default with ``isinstance(value, DefaultNA)``.
    """

    def __repr__(self):
        # Keep signatures and rendered documentation readable instead of
        # showing "<... DefaultNA object at 0x...>".
        return "DefaultNA()"


@Substitution("\nleft : DataFrame")
@Appender(_merge_doc, indents=0)
def merge(
Expand All @@ -64,6 +68,7 @@ def merge(
copy=True,
indicator=False,
validate=None,
index_na_value=DefaultNA(),
):
op = _MergeOperation(
left,
Expand All @@ -79,6 +84,7 @@ def merge(
copy=copy,
indicator=indicator,
validate=validate,
index_na_value=index_na_value,
)
return op.get_result()

Expand Down Expand Up @@ -551,6 +557,7 @@ def __init__(
copy=True,
indicator=False,
validate=None,
index_na_value=DefaultNA(),
):
left = validate_operand(left)
right = validate_operand(right)
Expand Down Expand Up @@ -619,6 +626,10 @@ def __init__(
if validate is not None:
self._validate(validate)

# if a join requires NA values to be placed in the index
# use this value or default NA which may involve a type promotion
self.index_na_value = index_na_value

def get_result(self):
if self.indicator:
self.left, self.right = self._indicator_pre_merge(self.left, self.right)
Expand Down Expand Up @@ -898,7 +909,11 @@ def _create_join_index(
# and fill_value because it throws a ValueError on integer indices
mask = indexer == -1
if np.any(mask):
fill_value = na_value_for_dtype(index.dtype, compat=False)
if isinstance(self.index_na_value, DefaultNA):
fill_value = na_value_for_dtype(index.dtype, compat=False)
else:
fill_value = self.index_na_value

index = index.append(Index([fill_value]))
return index.take(indexer)

Expand Down
182 changes: 181 additions & 1 deletion pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,11 @@
from pandas.core.reshape.concat import concat
from pandas.core.reshape.merge import MergeError, merge
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal, assert_series_equal
from pandas.util.testing import (
assert_frame_equal,
assert_index_equal,
assert_series_equal,
)

N = 50
NGROUPS = 8
Expand Down Expand Up @@ -2131,3 +2135,179 @@ def test_merge_multiindex_columns():
expected["id"] = ""

tm.assert_frame_equal(result, expected)


@pytest.fixture(
    params=[
        {"domain": pd.Index(["A", "B", "C"])},
        {"domain": CategoricalIndex(["A", "B", "C"])},
        {"domain": DatetimeIndex(["2001-01-01", "2002-02-02", "2003-03-03"])},
        {"domain": Float64Index([1, 2, 3])},
        {"domain": Int64Index([1, 2, 3])},
        {"domain": IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)])},
        {"domain": TimedeltaIndex(["1d", "2d", "3d"])},
        {"domain": PeriodIndex(["2001-01-01", "2001-01-02", "2001-01-03"], freq="D")},
    ]
)
def fix_GH_28220_(request):
    """
    Provide the frames used by the GH 28220 merge tests for one index type.

    The returned object exposes:

    * ``domain`` : the three-element index taken from the fixture parameter
    * ``X`` : frame indexed by the first two ``domain`` elements
    * ``Y`` : frame whose "name" column holds ``domain`` elements 0 and 2
    * ``Z`` : frame whose "name" column holds ``domain`` elements 0, 0 and 2
    * ``E`` : empty frame with columns "name" and "value"
    """

    class _Frames:
        """Plain attribute container for the frames built below."""

    domain = request.param["domain"]

    frames = _Frames()
    frames.domain = domain
    frames.X = pd.DataFrame({"count": [1, 2]}, index=domain.take([0, 1]))
    frames.Y = pd.DataFrame({"name": domain.take([0, 2]), "value": [100, 200]})
    frames.Z = pd.DataFrame(
        {"name": domain.take([0, 0, 2]), "value": [100, 200, 300]}
    )
    frames.E = pd.DataFrame(columns=["name", "value"])

    # the left frame must keep the index type under test
    assert isinstance(frames.X.index, type(domain))

    return frames


@pytest.mark.parametrize(
    "how,expected",
    [
        ("left", ([0, -255], [0, 1, -255], [0, 1])),
        ("inner", ([0], [0, 1], [])),
        ("outer", ([0, -255, 1], [0, 1, -255, 2], [0, 1])),
    ],
)
def test_left_index_merge_with_missing_by_right_on(fix_GH_28220_, how, expected):
    """
    GH 28220: merge ``X`` on its index against the "name" column of ``Y``,
    ``Z`` and the empty frame ``E`` with ``index_na_value=-255`` and compare
    each result index with the parametrized expectation.
    """
    data = fix_GH_28220_

    exp_y, exp_z, exp_empty = [pd.Index(labels) for labels in expected]
    # the expectation for the empty right frame is given as positions
    # into the domain index
    exp_empty = data.domain.take(exp_empty)

    def merged_with(right):
        return pd.merge(
            data.X,
            right,
            left_index=True,
            right_on=["name"],
            how=how,
            index_na_value=-255,
        )

    res_y = merged_with(data.Y)
    assert_index_equal(res_y.index, exp_y)

    res_z = merged_with(data.Z)
    assert_index_equal(res_z.index, exp_z)

    res_empty = merged_with(data.E)
    if res_empty.empty:
        # special case when result is empty, dtype is object
        exp_empty = pd.Index([], dtype=object, name=exp_empty.name)

    assert_index_equal(res_empty.index, exp_empty)


@pytest.mark.parametrize(
    "how,expected",
    [
        ("right", ([0, -255], [0, 0, -255], [0, 1, 2])),
        ("inner", ([0], [0, 0], [])),
        ("outer", ([0, 1, -255], [0, 0, 1, -255], [0, 1])),
    ],
)
def test_left_on_merge_with_missing_by_right_index(fix_GH_28220_, how, expected):
    """
    GH 28220: merge ``X`` (index reset into an "index" column) against
    ``Y``, ``Z`` and the empty frame ``E`` indexed by "name", passing
    ``index_na_value=-255``, and compare each result index with the
    parametrized expectation.
    """
    data = fix_GH_28220_
    left = data.X.reset_index

    exp_y, exp_z, exp_empty = [pd.Index(labels) for labels in expected]

    def merged_with(right):
        return pd.merge(
            left(),
            right.set_index("name"),
            left_on=["index"],
            right_index=True,
            how=how,
            index_na_value=-255,
        )

    res_y = merged_with(data.Y)
    assert_index_equal(res_y.index, exp_y)

    res_z = merged_with(data.Z)
    assert_index_equal(res_z.index, exp_z)

    res_empty = merged_with(data.E)
    if res_empty.empty:
        # special case when result is empty, dtype is object
        exp_empty = pd.Index([], dtype=object, name=exp_empty.name)

    assert_index_equal(res_empty.index, exp_empty)


@pytest.mark.parametrize(
    "how,expected",
    [
        ("left", ([0, 1], [0, 1, 2], [0, 1])),
        ("right", ([0, 1], [0, 1, 2], [0, 2])),
        ("inner", ([0], [0, 1], [])),
        ("outer", ([0, 1, 2], [0, 1, 2, 3], [0, 1])),
    ],
)
def test_left_on_merge_with_missing_by_right_on(fix_GH_28220_, how, expected):
    """
    GH 28220: merge ``X`` (index reset into an "index" column) against the
    "name" column of ``Y``, ``Z`` and the empty frame ``E`` without using
    either index, and compare each result index with the parametrized
    expectation.
    """
    data = fix_GH_28220_
    left = data.X.reset_index

    exp_y, exp_z, exp_empty = [pd.Index(labels) for labels in expected]

    def merged_with(right):
        return pd.merge(
            left(),
            right,
            left_on=["index"],
            right_on=["name"],
            how=how,
        )

    res_y = merged_with(data.Y)
    assert_index_equal(res_y.index, exp_y)

    res_z = merged_with(data.Z)
    assert_index_equal(res_z.index, exp_z)

    res_empty = merged_with(data.E)
    if res_empty.empty:
        # special case when result is empty, dtype is object
        exp_empty = pd.Index([], dtype=object, name=exp_empty.name)

    assert_index_equal(res_empty.index, exp_empty)

0 comments on commit 730c97c

Please sign in to comment.