Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: 27453 right merge order #31278

Merged
merged 15 commits into from
Mar 26, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,32 @@ key and type of :class:`Index`. These now consistently raise ``KeyError`` (:iss
...
KeyError: Timestamp('1970-01-01 00:00:00')

:meth:`DataFrame.merge` preserves right frame's row order
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
:meth:`DataFrame.merge` now preserves the right frame's row order when executing a right merge (:issue:`27453`)

.. ipython:: python

left_df = pd.DataFrame({'animal': ['dog', 'pig'], 'max_speed': [40, 11]})
right_df = pd.DataFrame({'animal': ['quetzal', 'pig'], 'max_speed': [80, 11]})
left_df
right_df

*Previous behavior*:

.. code-block:: python

>>> left_df.merge(right_df, on=['animal', 'max_speed'], how="right")
animal max_speed
0 pig 11
1 quetzal 80

*New behavior*:

.. ipython:: python

left_df.merge(right_df, on=['animal', 'max_speed'], how="right")

.. ---------------------------------------------------------------------------

.. _whatsnew_110.api_breaking.assignment_to_multiple_columns:
Expand Down
70 changes: 64 additions & 6 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@
import datetime
from functools import partial
import string
from typing import TYPE_CHECKING, Optional, Tuple, Union
from typing import TYPE_CHECKING, Optional, Tuple, Union, cast
import warnings

import numpy as np

from pandas._libs import Timedelta, hashtable as libhashtable, lib
import pandas._libs.join as libjoin
from pandas._typing import FrameOrSeries
from pandas._typing import ArrayLike, FrameOrSeries
from pandas.errors import MergeError
from pandas.util._decorators import Appender, Substitution

Expand All @@ -24,6 +24,7 @@
is_array_like,
is_bool,
is_bool_dtype,
is_categorical,
is_categorical_dtype,
is_datetime64tz_dtype,
is_dtype_equal,
Expand Down Expand Up @@ -1271,7 +1272,7 @@ def _get_join_indexers(

# get left & right join labels and num. of levels at each location
mapped = (
_factorize_keys(left_keys[n], right_keys[n], sort=sort)
_factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how)
for n in range(len(left_keys))
)
zipped = zip(*mapped)
Expand All @@ -1283,8 +1284,8 @@ def _get_join_indexers(
# factorize keys to a dense i8 space
# `count` is the num. of unique keys
# set(lkey) | set(rkey) == range(count)
lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)

lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how)
# preserve left frame order if how == 'left' and sort == False
kwargs = copy.copy(kwargs)
if how == "left":
Expand Down Expand Up @@ -1822,7 +1823,59 @@ def _right_outer_join(x, y, max_groups):
return left_indexer, right_indexer


def _factorize_keys(lk, rk, sort=True):
def _factorize_keys(
lk: ArrayLike, rk: ArrayLike, sort: bool = True, how: str = "inner"
) -> Tuple[np.array, np.array, int]:
"""
Encode left and right keys as enumerated types.

This is used to get the join indexers to be used when merging DataFrames.

Parameters
----------
lk : array-like
Left key.
rk : array-like
Right key.
sort : bool, defaults to True
If True, the encoding is done such that the unique elements in the
keys are sorted.
how : {'left', 'right', 'outer', 'inner'}, default 'inner'
Type of merge.

Returns
-------
array
Left (resp. right if called with `how='right'`) labels, as enumerated type.
array
Right (resp. left if called with `how='right'`) labels, as enumerated type.
int
Number of unique elements in union of left and right labels.

See Also
--------
merge : Merge DataFrame or named Series objects
with a database-style join.
algorithms.factorize : Encode the object as an enumerated type
or categorical variable.

Examples
--------
>>> lk = np.array(["a", "c", "b"])
>>> rk = np.array(["a", "c"])

Here, the unique values are `'a', 'b', 'c'`. With the default
`sort=True`, the encoding will be `{0: 'a', 1: 'b', 2: 'c'}`:

>>> pd.core.reshape.merge._factorize_keys(lk, rk)
(array([0, 2, 1]), array([0, 2]), 3)

With `sort=False`, the encoding will correspond to the order
in which the unique elements first appear: `{0: 'a', 1: 'c', 2: 'b'}`:

>>> pd.core.reshape.merge._factorize_keys(lk, rk, sort=False)
(array([0, 1, 2]), array([0, 1]), 3)
"""
# Some pre-processing for non-ndarray lk / rk
lk = extract_array(lk, extract_numpy=True)
rk = extract_array(rk, extract_numpy=True)
Expand All @@ -1834,8 +1887,11 @@ def _factorize_keys(lk, rk, sort=True):
rk, _ = rk._values_for_factorize()

elif (
is_categorical_dtype(lk) and is_categorical_dtype(rk) and lk.is_dtype_equal(rk)
is_categorical_dtype(lk) and is_categorical_dtype(rk) and is_dtype_equal(lk, rk)
):
assert is_categorical(lk) and is_categorical(rk)
lk = cast(Categorical, lk)
rk = cast(Categorical, rk)
if lk.categories.equals(rk.categories):
# if we exactly match in categories, allow us to factorize on codes
rk = rk.codes
Expand Down Expand Up @@ -1892,6 +1948,8 @@ def _factorize_keys(lk, rk, sort=True):
np.putmask(rlab, rmask, count)
count += 1

if how == "right":
return rlab, llab, count
return llab, rlab, count


Expand Down
30 changes: 22 additions & 8 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1286,17 +1286,17 @@ def test_merge_on_index_with_more_values(self, how, index, expected_index):
# GH 24212
# pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that
# -1 is interpreted as a missing value instead of the last element
df1 = pd.DataFrame({"a": [1, 2, 3], "key": [0, 2, 2]}, index=index)
df2 = pd.DataFrame({"b": [1, 2, 3, 4, 5]})
df1 = pd.DataFrame({"a": [0, 1, 2], "key": [0, 1, 2]}, index=index)
df2 = pd.DataFrame({"b": [0, 1, 2, 3, 4, 5]})
result = df1.merge(df2, left_on="key", right_index=True, how=how)
expected = pd.DataFrame(
[
[1.0, 0, 1],
[2.0, 2, 3],
[3.0, 2, 3],
[np.nan, 1, 2],
[np.nan, 3, 4],
[np.nan, 4, 5],
[0, 0, 0],
[1, 1, 1],
[2, 2, 2],
[np.nan, 3, 3],
[np.nan, 4, 4],
[np.nan, 5, 5],
],
columns=["a", "key", "b"],
)
Expand All @@ -1318,6 +1318,20 @@ def test_merge_right_index_right(self):
result = left.merge(right, left_on="key", right_index=True, how="right")
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("how", ["left", "right"])
def test_merge_preserves_row_order(self, how):
    """
    Check that a one-sided merge keeps the row order of the frame named by ``how``.

    Both frames share the ``("pig", 11)`` row and each has one row the other
    lacks. Merging on both columns with ``how="left"`` must return the left
    frame's rows in their original order, and ``how="right"`` must return the
    right frame's rows in their original order.
    """
    # GH 27453
    frames = {
        "left": pd.DataFrame({"animal": ["dog", "pig"], "max_speed": [40, 11]}),
        "right": pd.DataFrame({"animal": ["quetzal", "pig"], "max_speed": [80, 11]}),
    }
    merged = frames["left"].merge(
        frames["right"], on=["animal", "max_speed"], how=how
    )

    # The expected frame is built from fresh literals, not from the inputs,
    # so the comparison is independent of the frames that were merged.
    if how == "right":
        animals, speeds = ["quetzal", "pig"], [80, 11]
    else:
        animals, speeds = ["dog", "pig"], [40, 11]
    expected = pd.DataFrame({"animal": animals, "max_speed": speeds})

    tm.assert_frame_equal(merged, expected)

def test_merge_take_missing_values_from_index_of_other_dtype(self):
# GH 24212
left = pd.DataFrame(
Expand Down