diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index cea0800962272..682839544187f 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -404,7 +404,7 @@ Reshaping ^^^^^^^^^ - Bug in :func:`pandas.merge` adds a string of ``None``, if ``None`` is assigned in suffixes instead of remain the column name as-is (:issue:`24782`). -- Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`) +- Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (missing index values are now assigned NA) (:issue:`24212`, :issue:`25009`) - :func:`to_records` now accepts dtypes to its ``column_dtypes`` parameter (:issue:`24895`) - Bug in :func:`concat` where order of ``OrderedDict`` (and ``dict`` in Python 3.6+) is not respected, when passed in as ``objs`` argument (:issue:`21510`) - Bug in :func:`pivot_table` where columns with ``NaN`` values are dropped even if ``dropna`` argument is ``False``, when the ``aggfunc`` argument contains a ``list`` (:issue:`22159`) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 2bebb66e10e64..e8c82feaed3df 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -803,22 +803,18 @@ def _create_join_index(self, index, other_index, indexer, ------- join_index """ - join_index = index.take(indexer) if (self.how in (how, 'outer') and not isinstance(other_index, MultiIndex)): # if final index requires values in other_index but not target # index, indexer may hold missing (-1) values, causing Index.take - # to take the final value in target index + # to take the final value in target index. So, we set the last + # element to be the desired fill value. We do not use allow_fill + # and fill_value because it throws a ValueError on integer indices mask = indexer == -1 if np.any(mask): - # if values missing (-1) from target index, - # take from other_index instead - join_list = join_index.to_numpy() - other_list = other_index.take(other_indexer).to_numpy() - join_list[mask] = other_list[mask] - join_index = Index(join_list, dtype=join_index.dtype, - name=join_index.name) - return join_index + fill_value = na_value_for_dtype(index.dtype, compat=False) + index = index.append(Index([fill_value])) + return index.take(indexer) def _get_merge_keys(self): """ diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index b4a58628faa4d..8bc68cc7f8fc2 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -15,7 +15,8 @@ import pandas as pd from pandas import ( Categorical, CategoricalIndex, DataFrame, DatetimeIndex, Float64Index, - Int64Index, MultiIndex, RangeIndex, Series, UInt64Index) + Int64Index, IntervalIndex, MultiIndex, PeriodIndex, RangeIndex, Series, + TimedeltaIndex, UInt64Index) from pandas.api.types import CategoricalDtype as CDT from pandas.core.reshape.concat import concat from pandas.core.reshape.merge import MergeError, merge @@ -1034,11 +1035,30 @@ def test_merge_two_empty_df_no_division_error(self): merge(a, a, on=('a', 'b')) @pytest.mark.parametrize('how', ['right', 'outer']) - def test_merge_on_index_with_more_values(self, how): + @pytest.mark.parametrize( + 'index,expected_index', + [(CategoricalIndex([1, 2, 4]), + CategoricalIndex([1, 2, 4, None, None, None])), + (DatetimeIndex(['2001-01-01', '2002-02-02', '2003-03-03']), + DatetimeIndex(['2001-01-01', '2002-02-02', '2003-03-03', + pd.NaT, pd.NaT, pd.NaT])), + (Float64Index([1, 2, 3]), + Float64Index([1, 2, 3, None, None, None])), + (Int64Index([1, 2, 3]), + Float64Index([1, 2, 3, None, None, None])), + (IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)]), + IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4), + np.nan, np.nan, np.nan])), + (PeriodIndex(['2001-01-01', '2001-01-02', '2001-01-03'], freq='D'), + PeriodIndex(['2001-01-01', '2001-01-02', '2001-01-03', + pd.NaT, pd.NaT, pd.NaT], freq='D')), + (TimedeltaIndex(['1d', '2d', '3d']), + TimedeltaIndex(['1d', '2d', '3d', pd.NaT, pd.NaT, pd.NaT]))]) + def test_merge_on_index_with_more_values(self, how, index, expected_index): # GH 24212 # pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that # -1 is interpreted as a missing value instead of the last element - df1 = pd.DataFrame({'a': [1, 2, 3], 'key': [0, 2, 2]}) + df1 = pd.DataFrame({'a': [1, 2, 3], 'key': [0, 2, 2]}, index=index) df2 = pd.DataFrame({'b': [1, 2, 3, 4, 5]}) result = df1.merge(df2, left_on='key', right_index=True, how=how) expected = pd.DataFrame([[1.0, 0, 1], @@ -1048,7 +1068,7 @@ def test_merge_on_index_with_more_values(self, how): [np.nan, 3, 4], [np.nan, 4, 5]], columns=['a', 'key', 'b']) - expected.set_index(Int64Index([0, 1, 2, 1, 3, 4]), inplace=True) + expected.set_index(expected_index, inplace=True) assert_frame_equal(result, expected) def test_merge_right_index_right(self): @@ -1062,11 +1082,27 @@ def test_merge_right_index_right(self): 'key': [0, 1, 1, 2], 'b': [1, 2, 2, 3]}, columns=['a', 'key', 'b'], - index=[0, 1, 2, 2]) + index=[0, 1, 2, np.nan]) result = left.merge(right, left_on='key', right_index=True, how='right') tm.assert_frame_equal(result, expected) + def test_merge_take_missing_values_from_index_of_other_dtype(self): + # GH 24212 + left = pd.DataFrame({'a': [1, 2, 3], + 'key': pd.Categorical(['a', 'a', 'b'], + categories=list('abc'))}) + right = pd.DataFrame({'b': [1, 2, 3]}, + index=pd.CategoricalIndex(['a', 'b', 'c'])) + result = left.merge(right, left_on='key', + right_index=True, how='right') + expected = pd.DataFrame({'a': [1, 2, 3, None], + 'key': pd.Categorical(['a', 'a', 'b', 'c']), + 'b': [1, 1, 2, 3]}, + index=[0, 1, 2, np.nan]) + expected = expected.reindex(columns=['a', 'key', 'b']) + tm.assert_frame_equal(result, expected) + def _check_merge(x, y): for how in ['inner', 'left', 'outer']: