-
-
Notifications
You must be signed in to change notification settings - Fork 18.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
BUG: 27453 right merge order #31278
BUG: 27453 right merge order #31278
Changes from 7 commits
f81c4ee
25f7e03
a53cc22
2d77a5c
bab654e
714f5b4
ff236a8
d7e2db9
c83f46f
65c3226
0e2c529
0cc79a9
711d37c
511dade
aaf542e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -162,6 +162,32 @@ key and type of :class:`Index`. These now consistently raise ``KeyError`` (:iss | |
... | ||
KeyError: Timestamp('1970-01-01 00:00:00') | ||
|
||
:meth:`DataFrame.merge` preserves right frame's row order | ||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
:meth:`DataFrame.merge` now preserves the right frame's row order when executing a right merge (:issue:`27453`) | ||
|
||
.. ipython:: python | ||
|
||
left_df = pd.DataFrame({'animal': ['dog', 'pig'], 'max_speed': [40, 11]}) | ||
right_df = pd.DataFrame({'animal': ['quetzal', 'pig'], 'max_speed': [80, 11]}) | ||
left_df | ||
right_df | ||
|
||
*pandas 1.0.x* | ||
|
||
.. code-block:: python | ||
|
||
>>> left_df.merge(right_df, on=['animal', 'max_speed'], how="right") | ||
animal max_speed | ||
0 pig 11 | ||
1 quetzal 80 | ||
|
||
*pandas 1.1.0* | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. current behavior |
||
|
||
.. ipython:: python | ||
|
||
left_df.merge(right_df, on=['animal', 'max_speed'], how="right") | ||
|
||
.. --------------------------------------------------------------------------- | ||
|
||
.. _whatsnew_110.deprecations: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1294,7 +1294,7 @@ def _get_join_indexers( | |
|
||
# get left & right join labels and num. of levels at each location | ||
mapped = ( | ||
_factorize_keys(left_keys[n], right_keys[n], sort=sort) | ||
_factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how) | ||
for n in range(len(left_keys)) | ||
) | ||
zipped = zip(*mapped) | ||
|
@@ -1306,8 +1306,8 @@ def _get_join_indexers( | |
# factorize keys to a dense i8 space | ||
# `count` is the num. of unique keys | ||
# set(lkey) | set(rkey) == range(count) | ||
lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) | ||
|
||
lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how) | ||
# preserve left frame order if how == 'left' and sort == False | ||
kwargs = copy.copy(kwargs) | ||
if how == "left": | ||
|
@@ -1847,7 +1847,7 @@ def _right_outer_join(x, y, max_groups): | |
return left_indexer, right_indexer | ||
|
||
|
||
def _factorize_keys(lk, rk, sort=True): | ||
def _factorize_keys(lk, rk, sort=True, how="inner"): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. can you add a doc-string and typing here |
||
# Some pre-processing for non-ndarray lk / rk | ||
if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk): | ||
lk = getattr(lk, "_values", lk)._data | ||
|
@@ -1916,6 +1916,8 @@ def _factorize_keys(lk, rk, sort=True): | |
np.putmask(rlab, rmask, count) | ||
count += 1 | ||
|
||
if how == "right": | ||
return rlab, llab, count | ||
return llab, rlab, count | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1286,17 +1286,17 @@ def test_merge_on_index_with_more_values(self, how, index, expected_index): | |
# GH 24212 | ||
# pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that | ||
# -1 is interpreted as a missing value instead of the last element | ||
df1 = pd.DataFrame({"a": [1, 2, 3], "key": [0, 2, 2]}, index=index) | ||
df2 = pd.DataFrame({"b": [1, 2, 3, 4, 5]}) | ||
df1 = pd.DataFrame({"a": [0, 1, 2], "key": [0, 1, 2]}, index=index) | ||
df2 = pd.DataFrame({"b": [0, 1, 2, 3, 4, 5]}) | ||
result = df1.merge(df2, left_on="key", right_index=True, how=how) | ||
expected = pd.DataFrame( | ||
[ | ||
[1.0, 0, 1], | ||
[2.0, 2, 3], | ||
[3.0, 2, 3], | ||
[np.nan, 1, 2], | ||
[np.nan, 3, 4], | ||
[np.nan, 4, 5], | ||
[0, 0, 0], | ||
[1, 1, 1], | ||
[2, 2, 2], | ||
[np.nan, 3, 3], | ||
[np.nan, 4, 4], | ||
[np.nan, 5, 5], | ||
], | ||
columns=["a", "key", "b"], | ||
) | ||
|
@@ -1318,6 +1318,44 @@ def test_merge_right_index_right(self): | |
result = left.merge(right, left_on="key", right_index=True, how="right") | ||
tm.assert_frame_equal(result, expected) | ||
|
||
@pytest.mark.parametrize("how", ["left", "right"]) | ||
def test_merge_preserves_row_order(self, how): | ||
# GH 27453 | ||
a = [2, 5, 3, 5] | ||
df1 = pd.DataFrame({"A": a, "B": [8, 2, 4, 1]}) | ||
df2 = pd.DataFrame({"A": a, "B": [7, 1, 3, 0]}) | ||
|
||
result = df1.merge(df2[["A", "B"]], on=["A", "B"], how=how) | ||
expected = pd.DataFrame({"A": a}) | ||
if how == "right": | ||
expected["B"] = df2["B"] | ||
else: | ||
expected["B"] = df1["B"] | ||
tm.assert_frame_equal(result, expected) | ||
|
||
left_df = pd.DataFrame({"colors": ["blue", "red"]}, index=pd.Index([0, 1])) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. either reduce the examples here (they look like 3x of the same one), or parameterize over them. |
||
right_df = pd.DataFrame({"hats": ["small", "big"]}, index=pd.Index([1, 0])) | ||
result = left_df.merge(right_df, left_index=True, right_index=True, how=how) | ||
if how == "right": | ||
expected = pd.DataFrame( | ||
{"colors": ["red", "blue"], "hats": ["small", "big"]} | ||
) | ||
else: | ||
expected = pd.DataFrame( | ||
{"colors": ["blue", "red"], "hats": ["big", "small"]} | ||
) | ||
|
||
left_df = pd.DataFrame({"animal": ["dog", "pig"], "max_speed": [40, 11]}) | ||
right_df = pd.DataFrame({"animal": ["quetzal", "pig"], "max_speed": [80, 11]}) | ||
result = left_df.merge(right_df, on=["animal", "max_speed"], how=how) | ||
if how == "right": | ||
expected = pd.DataFrame( | ||
{"animal": ["quetzal", "pig"], "max_speed": [80, 11]} | ||
) | ||
else: | ||
expected = pd.DataFrame({"animal": ["dog", "pig"], "max_speed": [40, 11]}) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
def test_merge_take_missing_values_from_index_of_other_dtype(self): | ||
# GH 24212 | ||
left = pd.DataFrame( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
change to prior behavior