Skip to content

Commit

Permalink
Review
Browse files Browse the repository at this point in the history
  • Loading branch information
harisbal authored and harisbal committed Oct 8, 2018
1 parent 4d4acc5 commit 8b44f42
Show file tree
Hide file tree
Showing 4 changed files with 121 additions and 87 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ This is the same behavior as ``Series.values`` for categorical data. See
Joining with two multi-indexes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

As of Pandas 0.24.0 the :func:`Dataframe.join` can be used to join multi-indexed ``Dataframe`` instances on the overlaping index levels (:issue:`20356`)
As of Pandas 0.24.0 the :func:`Dataframe.join` can be used to join multi-indexed ``Dataframe`` instances on the overlaping index levels (:issue:`6360`)

See the :ref:`Merge, join, and concatenate
<merging.Join_with_two_multi_indexes>` documentation section.
Expand Down
35 changes: 19 additions & 16 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3912,7 +3912,7 @@ def join(self, other, how='left', level=None, return_indexers=False,

def _join_multi(self, other, how, return_indexers=True):
from .multi import MultiIndex
from pandas.core.reshape.merge import _complete_multilevel_join
from pandas.core.reshape.merge import _restore_dropped_levels_multijoin

# figure out join names
self_names = set(com._not_none(*self.names))
Expand All @@ -3928,27 +3928,30 @@ def _join_multi(self, other, how, return_indexers=True):

if self_is_mi and other_is_mi:

# Drop the non matching levels
ldrop_levels = list(set(self_names) - set(overlap))
rdrop_levels = list(set(other_names) - set(overlap))
# Drop the non-matching levels from left and right respectively
ldrop_names = list(set(self_names) - set(overlap))
rdrop_names = list(set(other_names) - set(overlap))

self_jnlevels = self.droplevel(ldrop_levels)
other_jnlevels = other.droplevel(rdrop_levels)

if not (self_jnlevels.is_unique and other_jnlevels.is_unique):
raise ValueError("Join on level between two MultiIndex objects"
"is ambiguous")

dropped_levels = ldrop_levels + rdrop_levels
self_jnlevels = self.droplevel(ldrop_names)
other_jnlevels = other.droplevel(rdrop_names)

# Join left and right
# Join on same leveled multi-index frames is supported
join_idx, lidx, ridx = self_jnlevels.join(other_jnlevels, how,
return_indexers=True)

levels, labels, names = _complete_multilevel_join(self, other, how,
dropped_levels,
join_idx,
lidx, ridx)
# Restore the dropped levels
# Returned index level order is
# common levels, ldrop_names, rdrop_names
dropped_names = ldrop_names + rdrop_names

levels, labels, names = (
_restore_dropped_levels_multijoin(self, other,
dropped_names,
join_idx,
lidx, ridx))

# Re-create the multi-index
multi_join_idx = MultiIndex(levels=levels, labels=labels,
names=names, verify_integrity=False)

Expand Down
42 changes: 24 additions & 18 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1141,12 +1141,12 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner',
return join_func(lkey, rkey, count, **kwargs)


def _complete_multilevel_join(left, right, how, dropped_levels,
join_idx, lidx, ridx):
def _restore_dropped_levels_multijoin(left, right, dropped_level_names,
join_idx, lidx, ridx):
"""
*this is an internal non-public method*
Returns the levels, labels and names of a multilevel to multilevel join
Returns the levels, labels and names of a multil-index to multi-index join.
Depending on the type of join, this method restores the appropriate
dropped levels of the joined multi-index. The method relies on lidx, ridx
which hold the index positions of left and right, where a join was feasible
Expand All @@ -1157,19 +1157,18 @@ def _complete_multilevel_join(left, right, how, dropped_levels,
left index
right : Index
right index
dropped_level_names : str array
list of non-common levels
join_idx : Index
the index of the join between the common levels of left and right
how : {'left', 'right', 'outer', 'inner'}
lidx : intp array
left indexer
right : intp array
right indexer
dropped_levels : str array
list of non-common levels
Returns
-------
levels : intp array
levels : intp ndarray
levels of combined multiindexes
labels : str array
labels of combined multiindexes
Expand All @@ -1178,12 +1177,20 @@ def _complete_multilevel_join(left, right, how, dropped_levels,
"""

# Convert to 1 level multi-index if not
if not isinstance(join_idx, MultiIndex):
levels = [join_idx.values]
labels = [list(range(0, len(join_idx)))]
names = [join_idx.name]
join_idx = MultiIndex(levels=levels, labels=labels,
names=names, verify_integrity=False)

join_levels = join_idx.levels
join_labels = join_idx.labels
join_names = join_idx.names

# lidx and ridx hold the indexes where the join occured
# for left and right respectively. If left (right) is None it means that
# for left and right respectively. If left (right) is None then
# the join occured on all indices of left (right)
if lidx is None:
lidx = range(0, len(left))
Expand All @@ -1192,27 +1199,26 @@ def _complete_multilevel_join(left, right, how, dropped_levels,
ridx = range(0, len(right))

# Iterate through the levels that must be restored
for dl in dropped_levels:
if dl in left.names:
for dropped_level_name in dropped_level_names:
if dropped_level_name in left.names:
idx = left
indexer = lidx
else:
idx = right
indexer = ridx

# The index of the level name to be restored
name_idx = idx.names.index(dl)
name_idx = idx.names.index(dropped_level_name)

restore_levels = idx.levels[name_idx].values
restore_labels = idx.labels[name_idx]

join_levels = join_levels.__add__([restore_levels])
join_names = join_names.__add__([dl])

# Inject -1 in the labels list where a join was not possible
# IOW indexer[i]=-1
labels = [restore_labels[i] if i != -1 else -1 for i in indexer]
join_labels = join_labels.__add__([labels])
labels = idx.labels[name_idx]
restore_labels = [labels[i] if i != -1 else -1 for i in indexer]

join_levels = join_levels.__add__([restore_levels])
join_labels = join_labels.__add__([restore_labels])
join_names = join_names.__add__([dropped_level_name])

return join_levels, join_labels, join_names

Expand Down
129 changes: 77 additions & 52 deletions pandas/tests/reshape/merge/test_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,47 +13,50 @@
from pandas.compat import lzip
from pandas.core.reshape.concat import concat
from pandas.core.reshape.merge import merge
from pandas.util.testing import assert_frame_equal


@pytest.fixture
def left():
# a little relevant example with NAs
key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
'qux', 'snap']
key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
'three', 'one']

data = np.random.randn(len(key1))
return DataFrame({'key1': key1, 'key2': key2, 'data': data})


@pytest.fixture
def right():
index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
['one', 'two', 'three']],
labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
[0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
names=['first', 'second'])

return DataFrame(np.random.randn(10, 3), index=index,
columns=['j_one', 'j_two', 'j_three'])


class TestMergeMulti(object):

def setup_method(self):
self.index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
['one', 'two', 'three']],
labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
[0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
names=['first', 'second'])
self.to_join = DataFrame(np.random.randn(10, 3), index=self.index,
columns=['j_one', 'j_two', 'j_three'])

# a little relevant example with NAs
key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
'qux', 'snap']
key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
'three', 'one']

data = np.random.randn(len(key1))
self.data = DataFrame({'key1': key1, 'key2': key2,
'data': data})

def test_merge_on_multikey(self):
joined = self.data.join(self.to_join, on=['key1', 'key2'])

join_key = Index(lzip(self.data['key1'], self.data['key2']))
indexer = self.to_join.index.get_indexer(join_key)
ex_values = self.to_join.values.take(indexer, axis=0)
def test_merge_on_multikey(self, left, right):
joined = left.join(right, on=['key1', 'key2'])

join_key = Index(lzip(left['key1'], left['key2']))
indexer = right.index.get_indexer(join_key)
ex_values = right.values.take(indexer, axis=0)
ex_values[indexer == -1] = np.nan
expected = self.data.join(DataFrame(ex_values,
columns=self.to_join.columns))
expected = left.join(DataFrame(ex_values, columns=right.columns))

# TODO: columns aren't in the same order yet
assert_frame_equal(joined, expected.loc[:, joined.columns])
tm.assert_frame_equal(joined, expected.loc[:, joined.columns])

left = self.data.join(self.to_join, on=['key1', 'key2'], sort=True)
left = left.join(right, on=['key1', 'key2'], sort=True)
right = expected.loc[:, joined.columns].sort_values(['key1', 'key2'],
kind='mergesort')
assert_frame_equal(left, right)
tm.assert_frame_equal(left, right)

def test_left_join_multi_index(self):
icols = ['1st', '2nd', '3rd']
Expand Down Expand Up @@ -119,18 +122,18 @@ def run_asserts(left, right):

run_asserts(left, right)

def test_merge_right_vs_left(self):
def test_merge_right_vs_left(self, left, right):
# compare left vs right merge with multikey
for sort in [False, True]:
merged1 = self.data.merge(self.to_join, left_on=['key1', 'key2'],
right_index=True, how='left', sort=sort)
merged1 = left.merge(right, left_on=['key1', 'key2'],
right_index=True, how='left', sort=sort)

merged2 = self.to_join.merge(self.data, right_on=['key1', 'key2'],
left_index=True, how='right',
sort=sort)
merged2 = right.merge(left, right_on=['key1', 'key2'],
left_index=True, how='right',
sort=sort)

merged2 = merged2.loc[:, merged1.columns]
assert_frame_equal(merged1, merged2)
tm.assert_frame_equal(merged1, merged2)

def test_compress_group_combinations(self):

Expand Down Expand Up @@ -393,15 +396,13 @@ def test_join_multi_levels(self):
'nl0000289965']))
.set_index(['household_id', 'asset_id'])
.reindex(columns=['male', 'wealth', 'name', 'share']))
assert_frame_equal(result, expected)

assert_frame_equal(result, expected)
tm.assert_frame_equal(result, expected)

# equivalency
result2 = (merge(household.reset_index(), portfolio.reset_index(),
on=['household_id'], how='inner')
.set_index(['household_id', 'asset_id']))
assert_frame_equal(result2, expected)
result = (merge(household.reset_index(), portfolio.reset_index(),
on=['household_id'], how='inner')
.set_index(['household_id', 'asset_id']))
tm.assert_frame_equal(result, expected)

result = household.join(portfolio, how='outer')
expected = (concat([
Expand All @@ -412,7 +413,7 @@ def test_join_multi_levels(self):
[(4, np.nan)],
names=['household_id', 'asset_id'])))
], axis=0, sort=True).reindex(columns=expected.columns))
assert_frame_equal(result, expected)
tm.assert_frame_equal(result, expected)

# invalid cases
household.index.name = 'foo'
Expand Down Expand Up @@ -471,7 +472,7 @@ def test_join_multi_levels2(self):
result = (merge(household.reset_index(), log_return.reset_index(),
on=['asset_id'], how='inner')
.set_index(['household_id', 'asset_id', 't']))
assert_frame_equal(result, expected)
tm.assert_frame_equal(result, expected)

expected = (
DataFrame(dict(
Expand All @@ -496,7 +497,7 @@ def test_join_multi_levels2(self):
on=['asset_id'], how='outer')
.set_index(['household_id', 'asset_id', 't']))

assert_frame_equal(result, expected)
tm.assert_frame_equal(result, expected)


@pytest.fixture
Expand Down Expand Up @@ -564,17 +565,17 @@ def test_join_multi_empty_frames(self, left_multi, right_multi, join_type,
result = left_multi.join(right_multi, how=join_type).sort_index()
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("klass", [None, np.asarray, Series, Index])
def test_merge_datetime_index(self, klass):
@pytest.mark.parametrize("box", [None, np.asarray, Series, Index])
def test_merge_datetime_index(self, box):
# see gh-19038
df = DataFrame([1, 2, 3],
["2016-01-01", "2017-01-01", "2018-01-01"],
columns=["a"])
df.index = pd.to_datetime(df.index)
on_vector = df.index.year

if klass is not None:
on_vector = klass(on_vector)
if box is not None:
on_vector = box(on_vector)

expected = DataFrame(
OrderedDict([
Expand All @@ -596,3 +597,27 @@ def test_merge_datetime_index(self, klass):

result = df.merge(df, on=[df.index.year], how="inner")
tm.assert_frame_equal(result, expected)

def test_single_common_level(self):
index_left = pd.MultiIndex.from_tuples([('K0', 'X0'), ('K0', 'X1'),
('K1', 'X2')],
names=['key', 'X'])

left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=index_left)

index_right = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'),
('K2', 'Y2'), ('K2', 'Y3')],
names=['key', 'Y'])

right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']},
index=index_right)

result = left.join(right)
expected = (pd.merge(left.reset_index(), right.reset_index(),
on=['key'], how='inner')
.set_index(['key', 'X', 'Y']))

tm.assert_frame_equal(result, expected)

0 comments on commit 8b44f42

Please sign in to comment.