Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: 27453 right merge order #31278

Merged
merged 15 commits into from
Mar 26, 2020
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,32 @@ key and type of :class:`Index`. These now consistently raise ``KeyError`` (:iss
...
KeyError: Timestamp('1970-01-01 00:00:00')

:meth:`DataFrame.merge` preserves the right frame's row order
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
:meth:`DataFrame.merge` now preserves the right frame's row order when executing a right merge (:issue:`27453`)

.. ipython:: python

left_df = pd.DataFrame({'animal': ['dog', 'pig'], 'max_speed': [40, 11]})
right_df = pd.DataFrame({'animal': ['quetzal', 'pig'], 'max_speed': [80, 11]})
left_df
right_df

*pandas 1.0.x*
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

change to prior behavior


.. code-block:: python

>>> left_df.merge(right_df, on=['animal', 'max_speed'], how="right")
animal max_speed
0 pig 11
1 quetzal 80

*pandas 1.1.0*
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

current behavior


.. ipython:: python

left_df.merge(right_df, on=['animal', 'max_speed'], how="right")

.. ---------------------------------------------------------------------------

.. _whatsnew_110.deprecations:
Expand Down
8 changes: 5 additions & 3 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1294,7 +1294,7 @@ def _get_join_indexers(

# get left & right join labels and num. of levels at each location
mapped = (
_factorize_keys(left_keys[n], right_keys[n], sort=sort)
_factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how)
for n in range(len(left_keys))
)
zipped = zip(*mapped)
Expand All @@ -1306,8 +1306,8 @@ def _get_join_indexers(
# factorize keys to a dense i8 space
# `count` is the num. of unique keys
# set(lkey) | set(rkey) == range(count)
lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)

lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how)
# preserve left frame order if how == 'left' and sort == False
kwargs = copy.copy(kwargs)
if how == "left":
Expand Down Expand Up @@ -1847,7 +1847,7 @@ def _right_outer_join(x, y, max_groups):
return left_indexer, right_indexer


def _factorize_keys(lk, rk, sort=True):
def _factorize_keys(lk, rk, sort=True, how="inner"):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a doc-string and typing here

# Some pre-processing for non-ndarray lk / rk
if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk):
lk = getattr(lk, "_values", lk)._data
Expand Down Expand Up @@ -1916,6 +1916,8 @@ def _factorize_keys(lk, rk, sort=True):
np.putmask(rlab, rmask, count)
count += 1

if how == "right":
return rlab, llab, count
return llab, rlab, count


Expand Down
54 changes: 46 additions & 8 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1286,17 +1286,17 @@ def test_merge_on_index_with_more_values(self, how, index, expected_index):
# GH 24212
# pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that
# -1 is interpreted as a missing value instead of the last element
df1 = pd.DataFrame({"a": [1, 2, 3], "key": [0, 2, 2]}, index=index)
df2 = pd.DataFrame({"b": [1, 2, 3, 4, 5]})
df1 = pd.DataFrame({"a": [0, 1, 2], "key": [0, 1, 2]}, index=index)
df2 = pd.DataFrame({"b": [0, 1, 2, 3, 4, 5]})
result = df1.merge(df2, left_on="key", right_index=True, how=how)
expected = pd.DataFrame(
[
[1.0, 0, 1],
[2.0, 2, 3],
[3.0, 2, 3],
[np.nan, 1, 2],
[np.nan, 3, 4],
[np.nan, 4, 5],
[0, 0, 0],
[1, 1, 1],
[2, 2, 2],
[np.nan, 3, 3],
[np.nan, 4, 4],
[np.nan, 5, 5],
],
columns=["a", "key", "b"],
)
Expand All @@ -1318,6 +1318,44 @@ def test_merge_right_index_right(self):
result = left.merge(right, left_on="key", right_index=True, how="right")
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("how", ["left", "right"])
def test_merge_preserves_row_order(self, how):
    """
    Check that a ``how="left"`` / ``how="right"`` merge keeps the row order
    of the left / right frame respectively.

    Three scenarios are exercised, and each one is asserted on its own so
    that a regression in any of them is reported:

    1. merge on two columns where the key column ``A`` has repeated values,
    2. merge on the index of both frames,
    3. merge on two columns with only one row in common.
    """
    # GH 27453

    # Scenario 1: column-on-column merge, repeated values in key column "A".
    a = [2, 5, 3, 5]
    df1 = pd.DataFrame({"A": a, "B": [8, 2, 4, 1]})
    df2 = pd.DataFrame({"A": a, "B": [7, 1, 3, 0]})

    result = df1.merge(df2[["A", "B"]], on=["A", "B"], how=how)
    expected = pd.DataFrame({"A": a})
    if how == "right":
        expected["B"] = df2["B"]
    else:
        expected["B"] = df1["B"]
    tm.assert_frame_equal(result, expected)

    # Scenario 2: index-on-index merge. The right frame's index is in a
    # different order than the left frame's.
    left_df = pd.DataFrame({"colors": ["blue", "red"]}, index=pd.Index([0, 1]))
    right_df = pd.DataFrame({"hats": ["small", "big"]}, index=pd.Index([1, 0]))
    result = left_df.merge(right_df, left_index=True, right_index=True, how=how)
    # The result carries the index labels of the preserved frame, so the
    # expected index is pinned explicitly instead of relying on the default
    # 0..n-1 index.
    if how == "right":
        expected = pd.DataFrame(
            {"colors": ["red", "blue"], "hats": ["small", "big"]},
            index=pd.Index([1, 0]),
        )
    else:
        expected = pd.DataFrame(
            {"colors": ["blue", "red"], "hats": ["big", "small"]},
            index=pd.Index([0, 1]),
        )
    # This comparison was previously missing: ``result`` / ``expected`` were
    # overwritten by the next scenario without ever being checked.
    tm.assert_frame_equal(result, expected)

    # Scenario 3: column-on-column merge, only ("pig", 11) is in both frames.
    left_df = pd.DataFrame({"animal": ["dog", "pig"], "max_speed": [40, 11]})
    right_df = pd.DataFrame({"animal": ["quetzal", "pig"], "max_speed": [80, 11]})
    result = left_df.merge(right_df, on=["animal", "max_speed"], how=how)
    if how == "right":
        expected = pd.DataFrame(
            {"animal": ["quetzal", "pig"], "max_speed": [80, 11]}
        )
    else:
        expected = pd.DataFrame({"animal": ["dog", "pig"], "max_speed": [40, 11]})
    tm.assert_frame_equal(result, expected)

def test_merge_take_missing_values_from_index_of_other_dtype(self):
# GH 24212
left = pd.DataFrame(
Expand Down