diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst index a99751f9bab9f..acd3960431714 100644 --- a/doc/source/whatsnew/v0.25.2.rst +++ b/doc/source/whatsnew/v0.25.2.rst @@ -34,6 +34,11 @@ Groupby/resample/rolling - Bug incorrectly raising an ``IndexError`` when passing a list of quantiles to :meth:`pandas.core.groupby.DataFrameGroupBy.quantile` (:issue:`28113`). - Bug in :meth:`pandas.core.groupby.GroupBy.shift`, :meth:`pandas.core.groupby.GroupBy.bfill` and :meth:`pandas.core.groupby.GroupBy.ffill` where timezone information would be dropped (:issue:`19995`, :issue:`27992`) +Reshaping +^^^^^^^^^ + +- Added the ``index_na_value`` option to :func:`merge`, which lets the user specify the value placed in the result index for keys that have no match, instead of the default NA value that may cause dtype promotion. This applies when joining with ``left_on`` and ``right_index=True`` with ``how='right'``, or with ``left_index=True`` and ``right_on`` with ``how='left'`` (:issue:`28220`). + Other ^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c90bf4ba7151f..9338a336a11b0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -231,6 +231,12 @@ .. versionadded:: 0.21.0 +index_na_value : value, optional + If a join requires NA values to be placed in the index, use this value + instead of the default NA for the dtype, which may involve a type promotion. + + .. 
versionadded:: 0.25.2 + Returns ------- DataFrame diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 7bfc8153da568..5ef56f645648e 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -48,6 +48,10 @@ from pandas.core.sorting import is_int64_overflow_possible +class DefaultNA: + pass + + @Substitution("\nleft : DataFrame") @Appender(_merge_doc, indents=0) def merge( @@ -64,6 +68,7 @@ def merge( copy=True, indicator=False, validate=None, + index_na_value=DefaultNA(), ): op = _MergeOperation( left, @@ -79,6 +84,7 @@ def merge( copy=copy, indicator=indicator, validate=validate, + index_na_value=index_na_value, ) return op.get_result() @@ -551,6 +557,7 @@ def __init__( copy=True, indicator=False, validate=None, + index_na_value=DefaultNA(), ): left = validate_operand(left) right = validate_operand(right) @@ -619,6 +626,10 @@ def __init__( if validate is not None: self._validate(validate) + # if a join requires NA values to be placed in the index + # use this value or default NA which may involve a type promotion + self.index_na_value = index_na_value + def get_result(self): if self.indicator: self.left, self.right = self._indicator_pre_merge(self.left, self.right) @@ -898,7 +909,11 @@ def _create_join_index( # and fill_value because it throws a ValueError on integer indices mask = indexer == -1 if np.any(mask): - fill_value = na_value_for_dtype(index.dtype, compat=False) + if isinstance(self.index_na_value, DefaultNA): + fill_value = na_value_for_dtype(index.dtype, compat=False) + else: + fill_value = self.index_na_value + index = index.append(Index([fill_value])) return index.take(indexer) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 08698133e360d..cec18addc1e01 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -29,7 +29,11 @@ from pandas.core.reshape.concat import concat from pandas.core.reshape.merge 
import MergeError, merge import pandas.util.testing as tm -from pandas.util.testing import assert_frame_equal, assert_series_equal +from pandas.util.testing import ( + assert_frame_equal, + assert_index_equal, + assert_series_equal, +) N = 50 NGROUPS = 8 @@ -2131,3 +2135,179 @@ def test_merge_multiindex_columns(): expected["id"] = "" tm.assert_frame_equal(result, expected) + + +@pytest.fixture( + params=[ + dict(domain=pd.Index(["A", "B", "C"])), + dict(domain=CategoricalIndex(["A", "B", "C"])), + dict(domain=DatetimeIndex(["2001-01-01", "2002-02-02", "2003-03-03"])), + dict(domain=Float64Index([1, 2, 3])), + dict(domain=Int64Index([1, 2, 3])), + dict(domain=IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)])), + dict(domain=TimedeltaIndex(["1d", "2d", "3d"])), + dict(domain=PeriodIndex(["2001-01-01", "2001-01-02", "2001-01-03"], freq="D")), + ] +) +def fix_GH_28220_(request): + class Data: + def __init__(self): + self.domain = request.param["domain"] + self.X = pd.DataFrame({"count": [1, 2]}, index=self.domain.take([0, 1])) + self.Y = pd.DataFrame( + {"name": self.domain.take([0, 2]), "value": [100, 200]} + ) + self.Z = pd.DataFrame( + {"name": self.domain.take([0, 0, 2]), "value": [100, 200, 300]} + ) + self.E = pd.DataFrame(columns=["name", "value"]) + + assert isinstance(self.X.index, type(self.domain)) + + return Data() + + +@pytest.mark.parametrize( + "how,expected", + [ + ("left", ([0, -255], [0, 1, -255], [0, 1])), + ("inner", ([0], [0, 1], [])), + ("outer", ([0, -255, 1], [0, 1, -255, 2], [0, 1])), + ], +) +def test_left_index_merge_with_missing_by_right_on(fix_GH_28220_, how, expected): + + # GH 28220 + (e1, e2, e3) = map(lambda x: pd.Index(x), expected) + e3 = fix_GH_28220_.domain.take(e3) + + r1 = pd.merge( + fix_GH_28220_.X, + fix_GH_28220_.Y, + left_index=True, + right_on=["name"], + how=how, + index_na_value=-255, + ) + assert_index_equal(r1.index, e1) + + r2 = pd.merge( + fix_GH_28220_.X, + fix_GH_28220_.Z, + left_index=True, + right_on=["name"], + 
how=how, + index_na_value=-255, + ) + assert_index_equal(r2.index, e2) + + r3 = pd.merge( + fix_GH_28220_.X, + fix_GH_28220_.E, + left_index=True, + right_on=["name"], + how=how, + index_na_value=-255, + ) + + # special case when result is empty, dtype is object + if r3.empty: + e3 = pd.Index([], dtype=object, name=e3.name) + + assert_index_equal(r3.index, e3) + + +@pytest.mark.parametrize( + "how,expected", + [ + ("right", ([0, -255], [0, 0, -255], [0, 1, 2])), + ("inner", ([0], [0, 0], [])), + ("outer", ([0, 1, -255], [0, 0, 1, -255], [0, 1])), + ], +) +def test_left_on_merge_with_missing_by_right_index(fix_GH_28220_, how, expected): + + # GH 28220 + (e1, e2, e3) = map(lambda x: pd.Index(x), expected) + + r1 = pd.merge( + fix_GH_28220_.X.reset_index(), + fix_GH_28220_.Y.set_index("name"), + left_on=["index"], + right_index=True, + how=how, + index_na_value=-255, + ) + assert_index_equal(r1.index, e1) + + r2 = pd.merge( + fix_GH_28220_.X.reset_index(), + fix_GH_28220_.Z.set_index("name"), + left_on=["index"], + right_index=True, + how=how, + index_na_value=-255, + ) + assert_index_equal(r2.index, e2) + + r3 = pd.merge( + fix_GH_28220_.X.reset_index(), + fix_GH_28220_.E.set_index("name"), + left_on=["index"], + right_index=True, + how=how, + index_na_value=-255, + ) + + # special case when result is empty, dtype is object + if r3.empty: + e3 = pd.Index([], dtype=object, name=e3.name) + + assert_index_equal(r3.index, e3) + + +@pytest.mark.parametrize( + "how,expected", + [ + ("left", ([0, 1], [0, 1, 2], [0, 1])), + ("right", ([0, 1], [0, 1, 2], [0, 2])), + ("inner", ([0], [0, 1], [])), + ("outer", ([0, 1, 2], [0, 1, 2, 3], [0, 1])), + ], +) +def test_left_on_merge_with_missing_by_right_on(fix_GH_28220_, how, expected): + + # GH 28220 + (e1, e2, e3) = map(lambda x: pd.Index(x), expected) + + r1 = pd.merge( + fix_GH_28220_.X.reset_index(), + fix_GH_28220_.Y, + left_on=["index"], + right_on=["name"], + how=how, + ) + assert_index_equal(r1.index, e1) + + r2 = pd.merge( 
+ fix_GH_28220_.X.reset_index(), + fix_GH_28220_.Z, + left_on=["index"], + right_on=["name"], + how=how, + ) + assert_index_equal(r2.index, e2) + + r3 = pd.merge( + fix_GH_28220_.X.reset_index(), + fix_GH_28220_.E, + left_on=["index"], + right_on=["name"], + how=how, + ) + + # special case when result is empty, dtype is object + if r3.empty: + e3 = pd.Index([], dtype=object, name=e3.name) + + assert_index_equal(r3.index, e3)