Skip to content

Commit

Permalink
ENH: Multi-level merge on multi-indexes
Browse files Browse the repository at this point in the history
Allow for merging on multiple levels of multi-indexes
  • Loading branch information
harisbal committed Apr 26, 2017
1 parent d50b162 commit b0f0440
Show file tree
Hide file tree
Showing 2 changed files with 243 additions and 27 deletions.
83 changes: 68 additions & 15 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3028,27 +3028,84 @@ def join(self, other, how='left', level=None, return_indexers=False,

def _join_multi(self, other, how, return_indexers=True):
from .multi import MultiIndex
self_is_mi = isinstance(self, MultiIndex)
other_is_mi = isinstance(other, MultiIndex)

def _complete_join():
    """Append the non-overlapping levels to the joined index.

    Reads ``join_index``, ``not_overlap``, ``self_names``, ``lidx``,
    ``ridx``, ``self`` and ``other`` from the enclosing scope.

    Returns
    -------
    tuple
        ``(levels, labels, names)``: those of ``join_index`` followed by
        one entry per name in ``not_overlap``. Rows of ``join_index``
        with no match in the source index (indexer value ``-1``) get the
        label ``-1``.
    """
    new_lvls = join_index.levels
    new_lbls = join_index.labels
    new_nms = join_index.names

    for n in not_overlap:
        if n in self_names:
            source, indexer = self, lidx
        else:
            source, indexer = other, ridx

        # Locate the level in the index's full ``names``: the filtered
        # ``self_names`` / ``other_names`` lists omit unnamed levels, so
        # positions taken from them are shifted whenever a ``None`` name
        # precedes ``n``.
        pos = source.names.index(n)
        lvls = source.levels[pos].values
        lbls = source.labels[pos]

        if indexer is None:
            # ``join`` hands back ``None`` instead of an indexer when the
            # joined index is the source index itself, so the labels
            # carry over unchanged.
            restored = list(lbls)
        else:
            # Return the label on match else -1
            restored = [lbls[i] if i != -1 else -1 for i in indexer]

        new_lvls = new_lvls + [lvls]
        new_lbls = new_lbls + [restored]
        new_nms = new_nms + [n]

    return new_lvls, new_lbls, new_nms

# figure out join names
self_names = [n for n in self.names if n is not None]
other_names = [n for n in other.names if n is not None]
overlap = list(set(self_names) & set(other_names))

# Drop the non matching levels
ldrop_levels = [l for l in self_names if l not in overlap]
rdrop_levels = [l for l in other_names if l not in overlap]

self_is_mi = isinstance(self, MultiIndex)
other_is_mi = isinstance(other, MultiIndex)

# need at least 1 overlapping level name
if not len(overlap):
raise ValueError("cannot join with no level specified and no "
"overlapping names")
if len(overlap) > 1:
raise NotImplementedError("merging with more than one level "
"overlap on a multi-index is not "
"implemented")
jl = overlap[0]
raise ValueError("cannot join with no overlapping index names")

if self_is_mi and other_is_mi:
self_tmp = self.droplevel(ldrop_levels)
other_tmp = other.droplevel(rdrop_levels)

if not (other_tmp.is_unique and self_tmp.is_unique):
raise TypeError(" The index resulting from the overlapping "
"levels is not unique")

join_index, lidx, ridx = self_tmp.join(other_tmp, how=how,
return_indexers=True)

# Append to the returned Index the non-overlapping levels
not_overlap = ldrop_levels + rdrop_levels

if how == 'left':
join_index = self
elif how == 'right':
join_index = other
else:
join_index = join_index

if how == 'outer':
new_levels, new_labels, new_names = _complete_join()
else:
new_levels = join_index.levels
new_labels = join_index.labels
new_names = join_index.names

join_index = MultiIndex(levels=new_levels, labels=new_labels,
names=new_names, verify_integrity=False)

return join_index, lidx, ridx

# make the indices into mi's that match
if not (self_is_mi and other_is_mi):
else:
jl = overlap[0]

# make the indices into mi's that match
flip_order = False
if self_is_mi:
self, other = other, self
Expand All @@ -3065,10 +3122,6 @@ def _join_multi(self, other, how, return_indexers=True):
return result[0], result[2], result[1]
return result

# 2 multi-indexes
raise NotImplementedError("merging with both multi-indexes is not "
"implemented")

def _join_non_unique(self, other, how='left', return_indexers=False):
from pandas.core.reshape.merge import _get_join_indexers

Expand Down
187 changes: 175 additions & 12 deletions pandas/tests/reshape/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1136,14 +1136,14 @@ def test_join_multi_levels(self):

def f():
household.join(portfolio, how='inner')
pytest.raises(ValueError, f)
self.assertRaises(ValueError, f)

portfolio2 = portfolio.copy()
portfolio2.index.set_names(['household_id', 'foo'])

def f():
portfolio2.join(portfolio, how='inner')
pytest.raises(ValueError, f)
self.assertRaises(ValueError, f)

def test_join_multi_levels2(self):

Expand Down Expand Up @@ -1182,11 +1182,7 @@ def test_join_multi_levels2(self):
.set_index(["household_id", "asset_id", "t"])
.reindex(columns=['share', 'log_return']))

def f():
household.join(log_return, how='inner')
pytest.raises(NotImplementedError, f)

# this is the equivalency
# this is the equivalent operation expressed with merge
result = (merge(household.reset_index(), log_return.reset_index(),
on=['asset_id'], how='inner')
.set_index(['household_id', 'asset_id', 't']))
Expand All @@ -1195,7 +1191,7 @@ def f():
expected = (
DataFrame(dict(
household_id=[1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4],
asset_id=["nl0000301109", "nl0000289783", "gb00b03mlx29",
asset_id=["nl0000301109", "nl0000301109", "gb00b03mlx29",
"gb00b03mlx29", "gb00b03mlx29",
"gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29",
"lu0197800237", "lu0197800237",
Expand All @@ -1208,12 +1204,179 @@ def f():
.09604978, -.06524096, .03532373,
.03025441, .036997, None, None]
))
.set_index(["household_id", "asset_id", "t"]))
.set_index(["household_id", "asset_id", "t"])
.reindex(columns=['share', 'log_return']))

def f():
household.join(log_return, how='outer')
pytest.raises(NotImplementedError, f)
result = (merge(household.reset_index(), log_return.reset_index(),
on=['asset_id'], how='outer')
.set_index(['household_id', 'asset_id', 't']))

assert_frame_equal(result, expected)

def test_join_multi_levels3(self):
# Multi-index join tests
# Exercises DataFrame.join between frames whose MultiIndexes share the
# level names Origin, Destination and Period, while each side carries one
# level the other lacks (TripPurp on `matrix`, LinkType on `distances`).

# Left-hand fixture: Trips indexed by
# (Origin, Destination, Period, TripPurp)
matrix = (
pd.DataFrame(
dict(Origin=[1, 1, 2, 2, 3],
Destination=[1, 2, 1, 3, 1],
Period=['AM','PM','IP','AM','OP'],
TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'],
Trips=[1987, 3647, 2470, 4296, 4444]),
columns=['Origin', 'Destination', 'Period',
'TripPurp', 'Trips'])
.set_index(['Origin', 'Destination', 'Period', 'TripPurp']))

# Right-hand fixture: Distance indexed by
# (Origin, Destination, Period, LinkType)
distances = (
pd.DataFrame(
dict(Origin= [1, 1, 2, 2, 3, 3, 5],
Destination=[1, 2, 1, 2, 1, 2, 6],
Period=['AM','PM','IP','AM','OP','IP', 'AM'],
LinkType=['a', 'a', 'c', 'b', 'a', 'b', 'a'],
Distance=[100, 80, 90, 80, 75, 35, 55]),
columns=['Origin', 'Destination', 'Period',
'LinkType', 'Distance'])
.set_index(['Origin', 'Destination','Period', 'LinkType']))

# Self join: joining `matrix` with itself keeps the index unchanged and
# duplicates the Trips column under the '_joined' suffix.
expected = (
pd.DataFrame(
dict(Origin=[1, 1, 2, 2, 3],
Destination=[1, 2, 1, 3, 1],
Period=['AM','PM','IP','AM','OP'],
TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'],
Trips=[1987, 3647, 2470, 4296, 4444],
Trips_joined=[1987, 3647, 2470, 4296, 4444]),
columns=['Origin', 'Destination', 'Period',
'TripPurp', 'Trips', 'Trips_joined'])
.set_index(['Origin', 'Destination', 'Period', 'TripPurp']))

result = matrix.join(matrix, how='inner', rsuffix='_joined')
assert_frame_equal(result, expected)

# Left join: the result keeps `matrix`'s index (TripPurp, no LinkType);
# the (2, 3, 'AM') row has no counterpart in `distances`, so its
# Distance is NaN.
expected = (
pd.DataFrame(
dict(Origin= [1, 1, 2, 2, 3],
Destination=[1, 2, 1, 3, 1],
Period=['AM','PM','IP', 'AM', 'OP'],
TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'],
Trips=[1987, 3647, 2470, 4296, 4444],
Distance=[100, 80, 90, np.nan, 75]),
columns=['Origin', 'Destination', 'Period', 'TripPurp',
'Trips', 'Distance'])
.set_index(['Origin', 'Destination', 'Period', 'TripPurp']))

result = matrix.join(distances, how='left')
assert_frame_equal(result, expected)

# Right join: the result keeps `distances`' index (LinkType, no TripPurp);
# Trips is NaN for rows that have no counterpart in `matrix`.
expected = (
pd.DataFrame(
dict(Origin= [1, 1, 2, 2, 3, 3, 5],
Destination=[1, 2, 1, 2, 1, 2, 6],
Period=['AM','PM','IP','AM','OP','IP', 'AM'],
LinkType=['a', 'a', 'c', 'b', 'a', 'b', 'a'],
Trips=[1987, 3647, 2470, np.nan, 4444, np.nan, np.nan],
Distance=[100, 80, 90, 80, 75, 35, 55]),
columns=['Origin', 'Destination', 'Period',
'LinkType', 'Trips', 'Distance'])
.set_index(['Origin', 'Destination','Period', 'LinkType']))

result = matrix.join(distances, how='right')
assert_frame_equal(result, expected)

# Inner join: the expected index holds only the three shared levels and
# only the rows present on both sides.
expected = (
pd.DataFrame(
dict(Origin= [1, 1, 2, 3],
Destination=[1, 2, 1, 1],
Period=['AM','PM','IP', 'OP'],
Trips=[1987, 3647, 2470, 4444],
Distance=[100, 80, 90, 75]),
columns=['Origin', 'Destination', 'Period', 'Trips', 'Distance'])
.set_index(['Origin', 'Destination', 'Period']))

result = matrix.join(distances, how='inner')
assert_frame_equal(result, expected)

# Outer join: the expected index carries all five levels (shared levels
# first, then TripPurp and LinkType); a level value is NaN where the row
# exists on one side only.
expected = (
pd.DataFrame(
dict(Origin= [1, 1, 2, 2, 2, 3, 3, 5],
Destination=[1, 2, 1, 2, 3, 1, 2, 6],
Period=['AM','PM','IP', 'AM', 'AM', 'OP', 'IP', 'AM'],
TripPurp=['hbw', 'nhb', 'hbo', np.nan, 'nhb',
'hbw', np.nan, np.nan],
LinkType=['a', 'a', 'c', 'b', np.nan, 'a', 'b', 'a'],
Trips=[1987, 3647, 2470, np.nan, 4296, 4444, np.nan, np.nan],
Distance=[100, 80, 90, 80, np.nan, 75, 35, 55]),
columns=['Origin', 'Destination', 'Period', 'TripPurp', 'LinkType',
'Trips', 'Distance'])
.set_index(['Origin', 'Destination', 'Period', 'TripPurp', 'LinkType']))


result = matrix.join(distances, how='outer')
assert_frame_equal(result, expected)

# Non-unique resulting index: (1, 1, 'AM') appears twice on the shared
# levels of `distances2`, so the join is expected to raise TypeError.
distances2 = (
pd.DataFrame(
dict(Origin= [1, 1, 2],
Destination=[1, 1, 1],
Period=['AM','AM', 'PM'],
LinkType=['a', 'b', 'a'],
Distance=[100, 110, 120]),
columns=['Origin', 'Destination', 'Period',
'LinkType', 'Distance'])
.set_index(['Origin', 'Destination','Period', 'LinkType']))

def f():
matrix.join(distances2, how='left')
self.assertRaises(TypeError, f)

# No overlapping level names: none of `distances2`'s level names match
# `matrix`'s, so the join is expected to raise ValueError.
distances2 = (
pd.DataFrame(
dict(Orig= [1, 1, 2, 2, 3, 3, 5],
Dest=[1, 2, 1, 2, 1, 2, 6],
Per=['AM','PM','IP','AM','OP','IP', 'AM'],
LinkTyp=['a', 'a', 'c', 'b', 'a', 'b', 'a'],
Dist=[100, 80, 90, 80, 75, 35, 55]),
columns=['Orig', 'Dest', 'Per',
'LinkTyp', 'Dist'])
.set_index(['Orig', 'Dest','Per', 'LinkTyp']))

def f():
matrix.join(distances2, how='left')
self.assertRaises(ValueError, f)

# Empty level: every Period value in `distances2` is NaN, so the left
# join is expected to find no matches and leave Distance all-NaN.
distances2 = (
pd.DataFrame(
dict(Origin=[1, 1, 2, 2, 3, 3, 5],
Destination=[1, 2, 1, 2, 1, 2, 6],
Period=[np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan],
LinkType=['a', 'a', 'c', 'b', 'a', 'b', 'a'],
Distance=[100, 80, 90, 80, 75, 35, 55]),
columns=['Origin', 'Destination', 'Period',
'LinkType', 'Distance'])
.set_index(['Origin', 'Destination','Period', 'LinkType']))


expected = (
pd.DataFrame(
dict(Origin=[1, 1, 2, 2, 3],
Destination=[1, 2, 1, 3, 1],
Period=['AM','PM','IP','AM','OP'],
TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'],
Trips=[1987, 3647, 2470, 4296, 4444],
Distance=[np.nan, np.nan, np.nan, np.nan, np.nan]),
columns=['Origin', 'Destination', 'Period',
'TripPurp', 'Trips', 'Distance'])
.set_index(['Origin', 'Destination', 'Period', 'TripPurp']))

result = matrix.join(distances2, how='left')
assert_frame_equal(result, expected)

@pytest.fixture
def df():
Expand Down

0 comments on commit b0f0440

Please sign in to comment.