diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e45ccc7efeac12..6af9b2e9b89c86 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3033,7 +3033,7 @@ def _complete_join(): new_lvls = join_index.levels new_lbls = join_index.labels new_nms = join_index.names - + for n in not_overlap: if n in self_names: idx = lidx @@ -3043,16 +3043,16 @@ def _complete_join(): idx = ridx lvls = other.levels[other_names.index(n)].values lbls = other.labels[other_names.index(n)] - + new_lvls = new_lvls.__add__([lvls]) new_nms = new_nms.__add__([n]) - # Return the label on match else -1 - l = [lbls[i] if i!=-1 else -1 for i in idx] + # Return the label on match else -1 + l = [lbls[i] if i != -1 else -1 for i in idx] new_lbls = new_lbls.__add__([l]) - - return new_lvls, new_lbls, new_nms - + + return new_lvls, new_lbls, new_nms + # figure out join names self_names = [n for n in self.names if n is not None] other_names = [n for n in other.names if n is not None] @@ -3065,7 +3065,7 @@ def _complete_join(): self_is_mi = isinstance(self, MultiIndex) other_is_mi = isinstance(other, MultiIndex) - # need at least 1 in common, but not more than 1 + # need at least 1 in common if not len(overlap): raise ValueError("cannot join with no overlapping index names") @@ -3075,31 +3075,31 @@ def _complete_join(): if not (other_tmp.is_unique and self_tmp.is_unique): raise TypeError(" The index resulting from the overlapping " - "levels is not unique") - + " levels is not unique ") + join_index, lidx, ridx = self_tmp.join(other_tmp, how=how, return_indexers=True) - - # Append to the returned Index the non-overlapping levels + + # Append to the returned Index the non-overlapping levels not_overlap = ldrop_levels + rdrop_levels - + if how == 'left': join_index = self elif how == 'right': join_index = other else: join_index = join_index - + if how == 'outer': new_levels, new_labels, new_names = _complete_join() else: new_levels = join_index.levels new_labels = join_index.labels new_names = join_index.names - + join_index = MultiIndex(levels=new_levels, labels=new_labels, names=new_names, verify_integrity=False) - + return join_index, lidx, ridx else: diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index fedbbd9df6db27..e3fd342b7485f0 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -1208,8 +1208,8 @@ def test_join_multi_levels2(self): .reindex(columns=['share', 'log_return'])) result = (merge(household.reset_index(), log_return.reset_index(), - on=['asset_id'], how='outer') - .set_index(['household_id', 'asset_id', 't'])) + on=['asset_id'], how='outer') + .set_index(['household_id', 'asset_id', 't'])) assert_frame_equal(result, expected) @@ -1220,132 +1220,133 @@ def test_join_multi_levels3(self): pd.DataFrame( dict(Origin=[1, 1, 2, 2, 3], Destination=[1, 2, 1, 3, 1], - Period=['AM','PM','IP','AM','OP'], + Period=['AM', 'PM', 'IP', 'AM', 'OP'], TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'], Trips=[1987, 3647, 2470, 4296, 4444]), columns=['Origin', 'Destination', 'Period', 'TripPurp', 'Trips']) .set_index(['Origin', 'Destination', 'Period', 'TripPurp'])) - + distances = ( pd.DataFrame( - dict(Origin= [1, 1, 2, 2, 3, 3, 5], + dict(Origin=[1, 1, 2, 2, 3, 3, 5], Destination=[1, 2, 1, 2, 1, 2, 6], - Period=['AM','PM','IP','AM','OP','IP', 'AM'], + Period=['AM', 'PM', 'IP', 'AM', 'OP', 'IP', 'AM'], LinkType=['a', 'a', 'c', 'b', 'a', 'b', 'a'], Distance=[100, 80, 90, 80, 75, 35, 55]), - columns=['Origin', 'Destination', 'Period', + columns=['Origin', 'Destination', 'Period', 'LinkType', 'Distance']) - .set_index(['Origin', 'Destination','Period', 'LinkType'])) - + .set_index(['Origin', 'Destination', 'Period', 'LinkType'])) + expected = ( pd.DataFrame( dict(Origin=[1, 1, 2, 2, 3], Destination=[1, 2, 1, 3, 1], - Period=['AM','PM','IP','AM','OP'], + Period=['AM', 'PM', 'IP', 'AM', 'OP'], TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'], Trips=[1987, 3647, 2470, 4296, 4444], Trips_joined=[1987, 3647, 2470, 4296, 4444]), columns=['Origin', 'Destination', 'Period', 'TripPurp', 'Trips', 'Trips_joined']) .set_index(['Origin', 'Destination', 'Period', 'TripPurp'])) - - result = matrix.join(matrix, how='inner', rsuffix='_joined') + + result = matrix.join(matrix, how='inner', rsuffix='_joined') assert_frame_equal(result, expected) - - #Left join + + # Left join expected = ( pd.DataFrame( - dict(Origin= [1, 1, 2, 2, 3], + dict(Origin=[1, 1, 2, 2, 3], Destination=[1, 2, 1, 3, 1], - Period=['AM','PM','IP', 'AM', 'OP'], + Period=['AM', 'PM', 'IP', 'AM', 'OP'], TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'], Trips=[1987, 3647, 2470, 4296, 4444], Distance=[100, 80, 90, np.nan, 75]), - columns=['Origin', 'Destination', 'Period', 'TripPurp', + columns=['Origin', 'Destination', 'Period', 'TripPurp', 'Trips', 'Distance']) .set_index(['Origin', 'Destination', 'Period', 'TripPurp'])) - + result = matrix.join(distances, how='left') assert_frame_equal(result, expected) - - #Right join + + # Right join expected = ( pd.DataFrame( - dict(Origin= [1, 1, 2, 2, 3, 3, 5], + dict(Origin=[1, 1, 2, 2, 3, 3, 5], Destination=[1, 2, 1, 2, 1, 2, 6], - Period=['AM','PM','IP','AM','OP','IP', 'AM'], + Period=['AM', 'PM', 'IP', 'AM', 'OP', 'IP', 'AM'], LinkType=['a', 'a', 'c', 'b', 'a', 'b', 'a'], Trips=[1987, 3647, 2470, np.nan, 4444, np.nan, np.nan], Distance=[100, 80, 90, 80, 75, 35, 55]), - columns=['Origin', 'Destination', 'Period', + columns=['Origin', 'Destination', 'Period', 'LinkType', 'Trips', 'Distance']) - .set_index(['Origin', 'Destination','Period', 'LinkType'])) - + .set_index(['Origin', 'Destination', 'Period', 'LinkType'])) + result = matrix.join(distances, how='right') assert_frame_equal(result, expected) - - #Inner join + + # Inner join expected = ( pd.DataFrame( - dict(Origin= [1, 1, 2, 3], + dict(Origin=[1, 1, 2, 3], Destination=[1, 2, 1, 1], - Period=['AM','PM','IP', 'OP'], + Period=['AM', 'PM', 'IP', 'OP'], Trips=[1987, 3647, 2470, 4444], Distance=[100, 80, 90, 75]), - columns=['Origin', 'Destination', 'Period', 'Trips', 'Distance']) + columns=['Origin', 'Destination', 'Period', + 'Trips', 'Distance']) .set_index(['Origin', 'Destination', 'Period'])) - + result = matrix.join(distances, how='inner') assert_frame_equal(result, expected) - #Outer join + # Outer join expected = ( pd.DataFrame( - dict(Origin= [1, 1, 2, 2, 2, 3, 3, 5], + dict(Origin=[1, 1, 2, 2, 2, 3, 3, 5], Destination=[1, 2, 1, 2, 3, 1, 2, 6], - Period=['AM','PM','IP', 'AM', 'AM', 'OP', 'IP', 'AM'], + Period=['AM', 'PM', 'IP', 'AM', 'AM', 'OP', 'IP', 'AM'], TripPurp=['hbw', 'nhb', 'hbo', np.nan, 'nhb', 'hbw', np.nan, np.nan], LinkType=['a', 'a', 'c', 'b', np.nan, 'a', 'b', 'a'], - Trips=[1987, 3647, 2470, np.nan, 4296, 4444, np.nan, np.nan], + Trips=[1987, 3647, 2470, np.nan, + 4296, 4444, np.nan, np.nan], Distance=[100, 80, 90, 80, np.nan, 75, 35, 55]), - columns=['Origin', 'Destination', 'Period', 'TripPurp', 'LinkType', - 'Trips', 'Distance']) - .set_index(['Origin', 'Destination', 'Period', 'TripPurp', 'LinkType'])) - - + columns=['Origin', 'Destination', 'Period', 'TripPurp', + 'LinkType', 'Trips', 'Distance']) + .set_index(['Origin', 'Destination', 'Period', + 'TripPurp', 'LinkType'])) + result = matrix.join(distances, how='outer') assert_frame_equal(result, expected) - - #Non-unique resulting index + + # Non-unique resulting index distances2 = ( pd.DataFrame( - dict(Origin= [1, 1, 2], + dict(Origin=[1, 1, 2], Destination=[1, 1, 1], - Period=['AM','AM', 'PM'], + Period=['AM', 'AM', 'PM'], LinkType=['a', 'b', 'a'], Distance=[100, 110, 120]), - columns=['Origin', 'Destination', 'Period', + columns=['Origin', 'Destination', 'Period', 'LinkType', 'Distance']) - .set_index(['Origin', 'Destination','Period', 'LinkType'])) - + .set_index(['Origin', 'Destination', 'Period', 'LinkType'])) + def f(): matrix.join(distances2, how='left') pytest.raises(TypeError, f) - - #No-overlapping level names + + # No-overlapping level names distances2 = ( pd.DataFrame( - dict(Orig= [1, 1, 2, 2, 3, 3, 5], + dict(Orig=[1, 1, 2, 2, 3, 3, 5], Dest=[1, 2, 1, 2, 1, 2, 6], - Per=['AM','PM','IP','AM','OP','IP', 'AM'], + Per=['AM', 'PM', 'IP', 'AM', 'OP', 'IP', 'AM'], LinkTyp=['a', 'a', 'c', 'b', 'a', 'b', 'a'], Dist=[100, 80, 90, 80, 75, 35, 55]), - columns=['Orig', 'Dest', 'Per', - 'LinkTyp', 'Dist']) - .set_index(['Orig', 'Dest','Per', 'LinkTyp'])) - + columns=['Orig', 'Dest', 'Per', 'LinkTyp', 'Dist']) + .set_index(['Orig', 'Dest', 'Per', 'LinkTyp'])) + def f(): matrix.join(distances2, how='left') pytest.raises(ValueError, f) @@ -1355,29 +1356,29 @@ def f(): pd.DataFrame( dict(Origin=[1, 1, 2, 2, 3, 3, 5], Destination=[1, 2, 1, 2, 1, 2, 6], - Period=[np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan], + Period=[np.nan] * 7, LinkType=['a', 'a', 'c', 'b', 'a', 'b', 'a'], Distance=[100, 80, 90, 80, 75, 35, 55]), - columns=['Origin', 'Destination', 'Period', + columns=['Origin', 'Destination', 'Period', 'LinkType', 'Distance']) - .set_index(['Origin', 'Destination','Period', 'LinkType'])) - - + .set_index(['Origin', 'Destination', 'Period', 'LinkType'])) + expected = ( pd.DataFrame( dict(Origin=[1, 1, 2, 2, 3], Destination=[1, 2, 1, 3, 1], - Period=['AM','PM','IP','AM','OP'], + Period=['AM', 'PM', 'IP', 'AM', 'OP'], TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'], Trips=[1987, 3647, 2470, 4296, 4444], - Distance=[np.nan, np.nan, np.nan, np.nan, np.nan]), + Distance=[np.nan] * 5), columns=['Origin', 'Destination', 'Period', 'TripPurp', 'Trips', 'Distance']) .set_index(['Origin', 'Destination', 'Period', 'TripPurp'])) - + result = matrix.join(distances2, how='left') assert_frame_equal(result, expected) + @pytest.fixture def df(): return DataFrame(