Skip to content

Commit

Permalink
BUG: 27453 right merge order (#31278)
Browse files Browse the repository at this point in the history
  • Loading branch information
MarcoGorelli authored Mar 26, 2020
1 parent 218cc30 commit 8a5f291
Show file tree
Hide file tree
Showing 3 changed files with 112 additions and 14 deletions.
26 changes: 26 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,32 @@ key and type of :class:`Index`. These now consistently raise ``KeyError`` (:iss
...
KeyError: Timestamp('1970-01-01 00:00:00')
:meth:`DataFrame.merge` preserves right frame's row order
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
:meth:`DataFrame.merge` now preserves the right frame's row order when performing a right merge (:issue:`27453`)

.. ipython:: python
left_df = pd.DataFrame({'animal': ['dog', 'pig'], 'max_speed': [40, 11]})
right_df = pd.DataFrame({'animal': ['quetzal', 'pig'], 'max_speed': [80, 11]})
left_df
right_df
*Previous behavior*:

.. code-block:: python
>>> left_df.merge(right_df, on=['animal', 'max_speed'], how="right")
animal max_speed
0 pig 11
1 quetzal 80
*New behavior*:

.. ipython:: python
left_df.merge(right_df, on=['animal', 'max_speed'], how="right")
.. ---------------------------------------------------------------------------
.. _whatsnew_110.api_breaking.assignment_to_multiple_columns:
Expand Down
70 changes: 64 additions & 6 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@
import datetime
from functools import partial
import string
from typing import TYPE_CHECKING, Optional, Tuple, Union
from typing import TYPE_CHECKING, Optional, Tuple, Union, cast
import warnings

import numpy as np

from pandas._libs import Timedelta, hashtable as libhashtable, lib
import pandas._libs.join as libjoin
from pandas._typing import FrameOrSeries
from pandas._typing import ArrayLike, FrameOrSeries
from pandas.errors import MergeError
from pandas.util._decorators import Appender, Substitution

Expand All @@ -24,6 +24,7 @@
is_array_like,
is_bool,
is_bool_dtype,
is_categorical,
is_categorical_dtype,
is_datetime64tz_dtype,
is_dtype_equal,
Expand Down Expand Up @@ -1271,7 +1272,7 @@ def _get_join_indexers(

# get left & right join labels and num. of levels at each location
mapped = (
_factorize_keys(left_keys[n], right_keys[n], sort=sort)
_factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how)
for n in range(len(left_keys))
)
zipped = zip(*mapped)
Expand All @@ -1283,8 +1284,8 @@ def _get_join_indexers(
# factorize keys to a dense i8 space
# `count` is the num. of unique keys
# set(lkey) | set(rkey) == range(count)
lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)

lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how)
# preserve left frame order if how == 'left' and sort == False
kwargs = copy.copy(kwargs)
if how == "left":
Expand Down Expand Up @@ -1822,7 +1823,59 @@ def _right_outer_join(x, y, max_groups):
return left_indexer, right_indexer


def _factorize_keys(lk, rk, sort=True):
def _factorize_keys(
lk: ArrayLike, rk: ArrayLike, sort: bool = True, how: str = "inner"
) -> Tuple[np.array, np.array, int]:
"""
Encode left and right keys as enumerated types.
This is used to get the join indexers to be used when merging DataFrames.
Parameters
----------
lk : array-like
Left key.
rk : array-like
Right key.
sort : bool, default True
If True, the encoding is done such that the unique elements in the
keys are sorted.
how : {'left', 'right', 'outer', 'inner'}, default 'inner'
Type of merge.
Returns
-------
array
Left (resp. right if called with `how='right'`) labels, as enumerated type.
array
Right (resp. left if called with `how='right'`) labels, as enumerated type.
int
Number of unique elements in union of left and right labels.
See Also
--------
merge : Merge DataFrame or named Series objects
with a database-style join.
algorithms.factorize : Encode the object as an enumerated type
or categorical variable.
Examples
--------
>>> lk = np.array(["a", "c", "b"])
>>> rk = np.array(["a", "c"])
Here, the unique values are `'a', 'b', 'c'`. With the default
`sort=True`, the encoding will be `{0: 'a', 1: 'b', 2: 'c'}`:
>>> pd.core.reshape.merge._factorize_keys(lk, rk)
(array([0, 2, 1]), array([0, 2]), 3)
With `sort=False`, the encoding will correspond to the order
in which the unique elements first appear: `{0: 'a', 1: 'c', 2: 'b'}`:
>>> pd.core.reshape.merge._factorize_keys(lk, rk, sort=False)
(array([0, 1, 2]), array([0, 1]), 3)
"""
# Some pre-processing for non-ndarray lk / rk
lk = extract_array(lk, extract_numpy=True)
rk = extract_array(rk, extract_numpy=True)
Expand All @@ -1834,8 +1887,11 @@ def _factorize_keys(lk, rk, sort=True):
rk, _ = rk._values_for_factorize()

elif (
is_categorical_dtype(lk) and is_categorical_dtype(rk) and lk.is_dtype_equal(rk)
is_categorical_dtype(lk) and is_categorical_dtype(rk) and is_dtype_equal(lk, rk)
):
assert is_categorical(lk) and is_categorical(rk)
lk = cast(Categorical, lk)
rk = cast(Categorical, rk)
if lk.categories.equals(rk.categories):
# if we exactly match in categories, allow us to factorize on codes
rk = rk.codes
Expand Down Expand Up @@ -1892,6 +1948,8 @@ def _factorize_keys(lk, rk, sort=True):
np.putmask(rlab, rmask, count)
count += 1

if how == "right":
return rlab, llab, count
return llab, rlab, count


Expand Down
30 changes: 22 additions & 8 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1286,17 +1286,17 @@ def test_merge_on_index_with_more_values(self, how, index, expected_index):
# GH 24212
# pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that
# -1 is interpreted as a missing value instead of the last element
df1 = pd.DataFrame({"a": [1, 2, 3], "key": [0, 2, 2]}, index=index)
df2 = pd.DataFrame({"b": [1, 2, 3, 4, 5]})
df1 = pd.DataFrame({"a": [0, 1, 2], "key": [0, 1, 2]}, index=index)
df2 = pd.DataFrame({"b": [0, 1, 2, 3, 4, 5]})
result = df1.merge(df2, left_on="key", right_index=True, how=how)
expected = pd.DataFrame(
[
[1.0, 0, 1],
[2.0, 2, 3],
[3.0, 2, 3],
[np.nan, 1, 2],
[np.nan, 3, 4],
[np.nan, 4, 5],
[0, 0, 0],
[1, 1, 1],
[2, 2, 2],
[np.nan, 3, 3],
[np.nan, 4, 4],
[np.nan, 5, 5],
],
columns=["a", "key", "b"],
)
Expand All @@ -1318,6 +1318,20 @@ def test_merge_right_index_right(self):
result = left.merge(right, left_on="key", right_index=True, how="right")
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("how", ["left", "right"])
def test_merge_preserves_row_order(self, how):
    # GH 27453
    # Both columns are used as join keys, so a one-sided merge must return
    # exactly the rows of the frame named by `how`, in that frame's order.
    left_data = {"animal": ["dog", "pig"], "max_speed": [40, 11]}
    right_data = {"animal": ["quetzal", "pig"], "max_speed": [80, 11]}

    result = pd.DataFrame(left_data).merge(
        pd.DataFrame(right_data), on=["animal", "max_speed"], how=how
    )

    expected = pd.DataFrame(right_data if how == "right" else left_data)
    tm.assert_frame_equal(result, expected)

def test_merge_take_missing_values_from_index_of_other_dtype(self):
# GH 24212
left = pd.DataFrame(
Expand Down

0 comments on commit 8a5f291

Please sign in to comment.