ENH: join two MultiIndexes on their overlapping level names

Review notes for this revision of the patch:

* _join_multi: non-overlapping levels are located through ``names`` rather
  than through the filtered ``self_names``/``other_names`` lists, which skip
  unnamed levels and therefore do not give level positions.
* _join_multi: the outer-join path no longer iterates an indexer that is
  None.  NOTE(review): this assumes ``join`` may hand back None for an
  indexer when that side needs no reindexing -- confirm.
* _join_multi: an inner/outer join that overlaps on a single level now
  raises NotImplementedError instead of failing on ``Index.levels``.
* tests: ``pytest.raises`` is kept (the earlier revision turned existing
  assertions back into ``self.assertRaises``); the new test follows the
  style of the surrounding file.
* The ``index`` lines still carry the original blob ids; regenerate the
  patch from a working tree before relying on them.

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 04458d684d7959..e45ccc7efeac12 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -3028,27 +3028,94 @@ def join(self, other, how='left', level=None, return_indexers=False,
 
     def _join_multi(self, other, how, return_indexers=True):
         from .multi import MultiIndex
-        self_is_mi = isinstance(self, MultiIndex)
-        other_is_mi = isinstance(other, MultiIndex)
 
+        def _complete_join():
+            # Extend the levels, labels and names of ``join_index`` with
+            # every non-overlapping level of ``self`` and ``other``.
+            new_lvls = join_index.levels
+            new_lbls = join_index.labels
+            new_nms = join_index.names
+
+            for n in not_overlap:
+                if n in self_names:
+                    idx = lidx
+                    # position among *all* levels, unnamed included
+                    pos = self.names.index(n)
+                    lvls = self.levels[pos].values
+                    lbls = self.labels[pos]
+                else:
+                    idx = ridx
+                    pos = other.names.index(n)
+                    lvls = other.levels[pos].values
+                    lbls = other.labels[pos]
+
+                new_lvls = new_lvls + [lvls]
+                new_nms = new_nms + [n]
+
+                # An indexer of None means join_index is already aligned
+                # with that side; else keep the label on match, -1 if not
+                if idx is not None:
+                    lbls = [lbls[i] if i != -1 else -1 for i in idx]
+                new_lbls = new_lbls + [lbls]
+
+            return new_lvls, new_lbls, new_nms
+
         # figure out join names
         self_names = [n for n in self.names if n is not None]
         other_names = [n for n in other.names if n is not None]
         overlap = list(set(self_names) & set(other_names))
 
+        # Drop the non matching levels
+        ldrop_levels = [n for n in self_names if n not in overlap]
+        rdrop_levels = [n for n in other_names if n not in overlap]
+
+        self_is_mi = isinstance(self, MultiIndex)
+        other_is_mi = isinstance(other, MultiIndex)
+
-        # need at least 1 in common, but not more than 1
+        # need at least 1 level name in common
         if not len(overlap):
-            raise ValueError("cannot join with no level specified and no "
-                             "overlapping names")
-        if len(overlap) > 1:
-            raise NotImplementedError("merging with more than one level "
-                                      "overlap on a multi-index is not "
-                                      "implemented")
-        jl = overlap[0]
+            raise ValueError("cannot join with no overlapping index names")
+
+        if self_is_mi and other_is_mi:
+            self_tmp = self.droplevel(ldrop_levels)
+            other_tmp = other.droplevel(rdrop_levels)
+
+            if not (other_tmp.is_unique and self_tmp.is_unique):
+                raise TypeError("The index resulting from the overlapping "
+                                "levels is not unique")
+
+            join_index, lidx, ridx = self_tmp.join(other_tmp, how=how,
+                                                   return_indexers=True)
+
+            # Append to the returned Index the non-overlapping levels
+            not_overlap = ldrop_levels + rdrop_levels
+
+            if how == 'left':
+                join_index = self
+            elif how == 'right':
+                join_index = other
+            elif not isinstance(join_index, MultiIndex):
+                # only one level overlaps: droplevel left a flat Index
+                raise NotImplementedError("inner/outer join on a single "
+                                          "overlapping level is not "
+                                          "implemented")
+
+            if how == 'outer':
+                new_levels, new_labels, new_names = _complete_join()
+            else:
+                new_levels = join_index.levels
+                new_labels = join_index.labels
+                new_names = join_index.names
+
+            join_index = MultiIndex(levels=new_levels, labels=new_labels,
+                                    names=new_names, verify_integrity=False)
+
+            return join_index, lidx, ridx
 
-        # make the indices into mi's that match
-        if not (self_is_mi and other_is_mi):
+        else:
+            jl = overlap[0]
+            # make the indices into mi's that match
 
             flip_order = False
             if self_is_mi:
                 self, other = other, self
@@ -3065,10 +3132,6 @@ def _join_multi(self, other, how, return_indexers=True):
                     return result[0], result[2], result[1]
             return result
 
-        # 2 multi-indexes
-        raise NotImplementedError("merging with both multi-indexes is not "
-                                  "implemented")
-
     def _join_non_unique(self, other, how='left', return_indexers=False):
         from pandas.core.reshape.merge import _get_join_indexers
 
diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py
index 73d0346546b979..ba9ea6a4d93577 100644
--- a/pandas/tests/reshape/test_merge.py
+++ b/pandas/tests/reshape/test_merge.py
@@ -1182,11 +1182,7 @@ def test_join_multi_levels2(self):
             .set_index(["household_id", "asset_id", "t"])
             .reindex(columns=['share', 'log_return']))
 
-        def f():
-            household.join(log_return, how='inner')
-        pytest.raises(NotImplementedError, f)
-
         # this is the equivalency
         result = (merge(household.reset_index(), log_return.reset_index(),
                         on=['asset_id'], how='inner')
                   .set_index(['household_id', 'asset_id', 't']))
@@ -1195,7 +1191,7 @@ def f():
         expected = (
             DataFrame(dict(
                 household_id=[1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4],
-                asset_id=["nl0000301109", "nl0000289783", "gb00b03mlx29",
+                asset_id=["nl0000301109", "nl0000301109", "gb00b03mlx29",
                           "gb00b03mlx29", "gb00b03mlx29",
                           "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29",
                           "lu0197800237", "lu0197800237",
@@ -1208,12 +1204,180 @@ def f():
                             .09604978, -.06524096, .03532373,
                             .03025441, .036997, None, None]
             ))
-            .set_index(["household_id", "asset_id", "t"]))
+            .set_index(["household_id", "asset_id", "t"])
+            .reindex(columns=['share', 'log_return']))
 
-        def f():
-            household.join(log_return, how='outer')
-        pytest.raises(NotImplementedError, f)
+        result = (merge(household.reset_index(), log_return.reset_index(),
+                        on=['asset_id'], how='outer')
+                  .set_index(['household_id', 'asset_id', 't']))
+        assert_frame_equal(result, expected)
+
+    def test_join_multi_levels3(self):
+        # Join two MultiIndexed frames on their overlapping level names
+        # Self join
+        matrix = (
+            DataFrame(
+                dict(Origin=[1, 1, 2, 2, 3],
+                     Destination=[1, 2, 1, 3, 1],
+                     Period=['AM', 'PM', 'IP', 'AM', 'OP'],
+                     TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'],
+                     Trips=[1987, 3647, 2470, 4296, 4444]),
+                columns=['Origin', 'Destination', 'Period',
+                         'TripPurp', 'Trips'])
+            .set_index(['Origin', 'Destination', 'Period', 'TripPurp']))
+
+        distances = (
+            DataFrame(
+                dict(Origin=[1, 1, 2, 2, 3, 3, 5],
+                     Destination=[1, 2, 1, 2, 1, 2, 6],
+                     Period=['AM', 'PM', 'IP', 'AM', 'OP', 'IP', 'AM'],
+                     LinkType=['a', 'a', 'c', 'b', 'a', 'b', 'a'],
+                     Distance=[100, 80, 90, 80, 75, 35, 55]),
+                columns=['Origin', 'Destination', 'Period',
+                         'LinkType', 'Distance'])
+            .set_index(['Origin', 'Destination', 'Period', 'LinkType']))
+
+        expected = (
+            DataFrame(
+                dict(Origin=[1, 1, 2, 2, 3],
+                     Destination=[1, 2, 1, 3, 1],
+                     Period=['AM', 'PM', 'IP', 'AM', 'OP'],
+                     TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'],
+                     Trips=[1987, 3647, 2470, 4296, 4444],
+                     Trips_joined=[1987, 3647, 2470, 4296, 4444]),
+                columns=['Origin', 'Destination', 'Period',
+                         'TripPurp', 'Trips', 'Trips_joined'])
+            .set_index(['Origin', 'Destination', 'Period', 'TripPurp']))
+
+        result = matrix.join(matrix, how='inner', rsuffix='_joined')
+        assert_frame_equal(result, expected)
+
+        # Left join
+        expected = (
+            DataFrame(
+                dict(Origin=[1, 1, 2, 2, 3],
+                     Destination=[1, 2, 1, 3, 1],
+                     Period=['AM', 'PM', 'IP', 'AM', 'OP'],
+                     TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'],
+                     Trips=[1987, 3647, 2470, 4296, 4444],
+                     Distance=[100, 80, 90, np.nan, 75]),
+                columns=['Origin', 'Destination', 'Period', 'TripPurp',
+                         'Trips', 'Distance'])
+            .set_index(['Origin', 'Destination', 'Period', 'TripPurp']))
+
+        result = matrix.join(distances, how='left')
+        assert_frame_equal(result, expected)
+
+        # Right join
+        expected = (
+            DataFrame(
+                dict(Origin=[1, 1, 2, 2, 3, 3, 5],
+                     Destination=[1, 2, 1, 2, 1, 2, 6],
+                     Period=['AM', 'PM', 'IP', 'AM', 'OP', 'IP', 'AM'],
+                     LinkType=['a', 'a', 'c', 'b', 'a', 'b', 'a'],
+                     Trips=[1987, 3647, 2470, np.nan, 4444, np.nan, np.nan],
+                     Distance=[100, 80, 90, 80, 75, 35, 55]),
+                columns=['Origin', 'Destination', 'Period',
+                         'LinkType', 'Trips', 'Distance'])
+            .set_index(['Origin', 'Destination', 'Period', 'LinkType']))
+
+        result = matrix.join(distances, how='right')
+        assert_frame_equal(result, expected)
+
+        # Inner join
+        expected = (
+            DataFrame(
+                dict(Origin=[1, 1, 2, 3],
+                     Destination=[1, 2, 1, 1],
+                     Period=['AM', 'PM', 'IP', 'OP'],
+                     Trips=[1987, 3647, 2470, 4444],
+                     Distance=[100, 80, 90, 75]),
+                columns=['Origin', 'Destination', 'Period',
+                         'Trips', 'Distance'])
+            .set_index(['Origin', 'Destination', 'Period']))
+
+        result = matrix.join(distances, how='inner')
+        assert_frame_equal(result, expected)
+
+        # Outer join
+        expected = (
+            DataFrame(
+                dict(Origin=[1, 1, 2, 2, 2, 3, 3, 5],
+                     Destination=[1, 2, 1, 2, 3, 1, 2, 6],
+                     Period=['AM', 'PM', 'IP', 'AM', 'AM', 'OP', 'IP', 'AM'],
+                     TripPurp=['hbw', 'nhb', 'hbo', np.nan, 'nhb',
+                               'hbw', np.nan, np.nan],
+                     LinkType=['a', 'a', 'c', 'b', np.nan, 'a', 'b', 'a'],
+                     Trips=[1987, 3647, 2470, np.nan, 4296, 4444,
+                            np.nan, np.nan],
+                     Distance=[100, 80, 90, 80, np.nan, 75, 35, 55]),
+                columns=['Origin', 'Destination', 'Period',
+                         'TripPurp', 'LinkType', 'Trips', 'Distance'])
+            .set_index(['Origin', 'Destination', 'Period',
+                        'TripPurp', 'LinkType']))
+
+        result = matrix.join(distances, how='outer')
+        assert_frame_equal(result, expected)
+
+        # Non-unique resulting index
+        distances2 = (
+            DataFrame(
+                dict(Origin=[1, 1, 2],
+                     Destination=[1, 1, 1],
+                     Period=['AM', 'AM', 'PM'],
+                     LinkType=['a', 'b', 'a'],
+                     Distance=[100, 110, 120]),
+                columns=['Origin', 'Destination', 'Period',
+                         'LinkType', 'Distance'])
+            .set_index(['Origin', 'Destination', 'Period', 'LinkType']))
+
+        def f():
+            matrix.join(distances2, how='left')
+        pytest.raises(TypeError, f)
+
+        # No overlapping level names
+        distances2 = (
+            DataFrame(
+                dict(Orig=[1, 1, 2, 2, 3, 3, 5],
+                     Dest=[1, 2, 1, 2, 1, 2, 6],
+                     Per=['AM', 'PM', 'IP', 'AM', 'OP', 'IP', 'AM'],
+                     LinkTyp=['a', 'a', 'c', 'b', 'a', 'b', 'a'],
+                     Dist=[100, 80, 90, 80, 75, 35, 55]),
+                columns=['Orig', 'Dest', 'Per',
+                         'LinkTyp', 'Dist'])
+            .set_index(['Orig', 'Dest', 'Per', 'LinkTyp']))
+
+        def f():
+            matrix.join(distances2, how='left')
+        pytest.raises(ValueError, f)
+
+        # Empty level
+        distances2 = (
+            DataFrame(
+                dict(Origin=[1, 1, 2, 2, 3, 3, 5],
+                     Destination=[1, 2, 1, 2, 1, 2, 6],
+                     Period=[np.nan] * 7,
+                     LinkType=['a', 'a', 'c', 'b', 'a', 'b', 'a'],
+                     Distance=[100, 80, 90, 80, 75, 35, 55]),
+                columns=['Origin', 'Destination', 'Period',
+                         'LinkType', 'Distance'])
+            .set_index(['Origin', 'Destination', 'Period', 'LinkType']))
+
+        expected = (
+            DataFrame(
+                dict(Origin=[1, 1, 2, 2, 3],
+                     Destination=[1, 2, 1, 3, 1],
+                     Period=['AM', 'PM', 'IP', 'AM', 'OP'],
+                     TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'],
+                     Trips=[1987, 3647, 2470, 4296, 4444],
+                     Distance=[np.nan, np.nan, np.nan, np.nan, np.nan]),
+                columns=['Origin', 'Destination', 'Period',
+                         'TripPurp', 'Trips', 'Distance'])
+            .set_index(['Origin', 'Destination', 'Period', 'TripPurp']))
+
+        result = matrix.join(distances2, how='left')
+        assert_frame_equal(result, expected)
 
 
 @pytest.fixture
 def df():