From c7a1833c1dd5f89512422f8e0c83192cec2c6ad4 Mon Sep 17 00:00:00 2001 From: harisbal Date: Mon, 13 Nov 2017 23:30:03 +0000 Subject: [PATCH] Rebase --- pandas/core/indexes/base.py | 111 +++++++------- pandas/core/indexes/multi.py | 12 +- pandas/core/reshape/merge.py | 96 ++++++++++-- pandas/tests/reshape/test_merge.py | 192 +++++++++++++++++++++--- pandas/tests/reshape/test_merge_asof.py | 2 +- 5 files changed, 318 insertions(+), 95 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index eb96cbad70099a..8f2a11a86e1bfc 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2490,6 +2490,7 @@ def _get_unique_index(self, dropna=False): includes list, tuple, array, Series, and must be the same size as the index and its dtype must exactly match the index's type. + .. versionadded:: 0.17.0 .. versionadded:: 0.21.0 (list-like tolerance) Returns @@ -2639,6 +2640,7 @@ def _get_level_values(self, level): the same size as the index and its dtype must exactly match the index's type. + .. versionadded:: 0.17.0 .. 
versionadded:: 0.21.0 (list-like tolerance) Examples @@ -3180,46 +3182,68 @@ def join(self, other, how='left', level=None, return_indexers=False, def _join_multi(self, other, how, return_indexers=True): from .multi import MultiIndex - self_is_mi = isinstance(self, MultiIndex) - other_is_mi = isinstance(other, MultiIndex) + from pandas.core.reshape.merge import _complete_multilevel_join # figure out join names - self_names = _not_none(*self.names) - other_names = _not_none(*other.names) + self_names = list(_not_none(*self.names)) + other_names = list(_not_none(*other.names)) overlap = list(set(self_names) & set(other_names)) - # need at least 1 in common, but not more than 1 + # need at least 1 in common if not len(overlap): - raise ValueError("cannot join with no level specified and no " - "overlapping names") - if len(overlap) > 1: - raise NotImplementedError("merging with more than one level " - "overlap on a multi-index is not " - "implemented") - jl = overlap[0] + raise ValueError("cannot join with no overlapping index names") + + self_is_mi = isinstance(self, MultiIndex) + other_is_mi = isinstance(other, MultiIndex) + + # Drop the non matching levels + ldrop_levels = list(set(self_names) - set(overlap)) + rdrop_levels = list(set(other_names) - set(overlap)) + + if self_is_mi and other_is_mi: + self_jnlevels = self.droplevel(ldrop_levels) + other_jnlevels = other.droplevel(rdrop_levels) + + if not (self_jnlevels.is_unique and other_jnlevels.is_unique): + raise ValueError("Join on level between two MultiIndex objects" + "is ambiguous") + + dropped_levels = ldrop_levels + rdrop_levels + + join_idx, lidx, ridx = self_jnlevels.join(other_jnlevels, how, + return_indexers=True) + + levels, labels, names = _complete_multilevel_join(self, other, how, + dropped_levels, + join_idx, + lidx, ridx) + + multi_join_idx = MultiIndex(levels=levels, labels=labels, + names=names, verify_integrity=False) + + # Check for unused levels + multi_join_idx = 
multi_join_idx.remove_unused_levels() + + return multi_join_idx, lidx, ridx + + jl = list(overlap)[0] # make the indices into mi's that match - if not (self_is_mi and other_is_mi): - - flip_order = False - if self_is_mi: - self, other = other, self - flip_order = True - # flip if join method is right or left - how = {'right': 'left', 'left': 'right'}.get(how, how) - - level = other.names.index(jl) - result = self._join_level(other, level, how=how, - return_indexers=return_indexers) - - if flip_order: - if isinstance(result, tuple): - return result[0], result[2], result[1] - return result + flip_order = False + if self_is_mi: + self, other = other, self + flip_order = True + # flip if join method is right or left + how = {'right': 'left', 'left': 'right'}.get(how, how) - # 2 multi-indexes - raise NotImplementedError("merging with both multi-indexes is not " - "implemented") + level = other.names.index(jl) + result = self._join_level(other, level, how=how, + return_indexers=return_indexers) + + if flip_order: + if isinstance(result, tuple): + return result[0], result[2], result[1] + return result def _join_non_unique(self, other, how='left', return_indexers=False): from pandas.core.reshape.merge import _get_join_indexers @@ -3428,8 +3452,8 @@ def _get_string_slice(self, key, use_lhs=True, use_rhs=True): def slice_indexer(self, start=None, end=None, step=None, kind=None): """ - For an ordered or unique index, compute the slice indexer for input - labels and step. + For an ordered Index, compute the slice indexer for input labels and + step Parameters ---------- @@ -3442,28 +3466,11 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): Returns ------- - indexer : slice - - Raises - ------ - KeyError : If key does not exist, or key is not unique and index is - not ordered. + indexer : ndarray or slice Notes ----- This function assumes that the data is sorted, so use at your own peril - - Examples - --------- - This is a method on all index types. 
For example you can do: - - >>> idx = pd.Index(list('abcd')) - >>> idx.slice_indexer(start='b', end='c') - slice(1, 3) - - >>> idx = pd.MultiIndex.from_arrays([list('abcd'), list('efgh')]) - >>> idx.slice_indexer(start='b', end=('c', 'g')) - slice(1, 3) """ start_slice, end_slice = self.slice_locs(start, end, step=step, kind=kind) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index f4acb6862addbf..fbf4c8b82b4af5 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1345,9 +1345,10 @@ def remove_unused_levels(self): for lev, lab in zip(self.levels, self.labels): uniques = algos.unique(lab) - + # remove if NaN in index + uniques_no_nans = uniques[uniques != -1] # nothing unused - if len(uniques) == len(lev): + if len(uniques_no_nans) == len(lev): new_levels.append(lev) new_labels.append(lab) continue @@ -1356,11 +1357,12 @@ def remove_unused_levels(self): # labels get mapped from uniques to 0:len(uniques) label_mapping = np.zeros(len(lev)) - label_mapping[uniques] = np.arange(len(uniques)) - lab = label_mapping[lab] + label_mapping[uniques_no_nans] = np.arange(len(uniques_no_nans)) + # apply the mapping where lab != -1 + lab = np.where(lab != -1, label_mapping[lab], -1) # new levels are simple - lev = lev.take(uniques) + lev = lev.take(uniques_no_nans) new_levels.append(lev) new_labels.append(lab) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 412c00dc95ec00..987d98b2eb1169 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -126,7 +126,7 @@ def _groupby_and_merge(by, on, left, right, _merge_pieces, try: if k in merged: merged[k] = key - except KeyError: + except: pass pieces.append(merged) @@ -1066,6 +1066,82 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner', return join_func(lkey, rkey, count, **kwargs) +def _complete_multilevel_join(left, right, how, dropped_levels, + join_idx, lidx, ridx): + """ + *this is an internal non-public 
method* + + Returns the levels, labels and names of a multilevel to multilevel join + Depending on the type of join, this method restores the appropriate + dropped levels of the joined multi-index. The method relies on lidx, ridx + which hold the index positions of left and right, where a join was feasible + + Parameters + ---------- + left : Index + left index + right : Index + right index + join_idx : Index + the index of the join between the common levels of left and right + how : {'left', 'right', 'outer', 'inner'} + lidx : intp array + left indexer + ridx : intp array + right indexer + dropped_levels : str array + list of non-common levels + + Returns + ------- + levels : intp array + levels of combined multiindexes + labels : str array + labels of combined multiindexes + names : str array + names of combined multiindexes + + """ + + join_levels = join_idx.levels + join_labels = join_idx.labels + join_names = join_idx.names + + # lidx and ridx hold the indexes where the join occurred + # for left and right respectively. 
If left (right) is None it means that + # the join occured on all indices of left (right) + if lidx is None: + lidx = range(0, len(left)) + + if ridx is None: + ridx = range(0, len(right)) + + # Iterate through the levels that must be restored + for dl in dropped_levels: + if dl in left.names: + idx = left + indexer = lidx + else: + idx = right + indexer = ridx + + # The index of the level name to be restored + name_idx = idx.names.index(dl) + + restore_levels = idx.levels[name_idx].values + restore_labels = idx.labels[name_idx] + + join_levels = join_levels.__add__([restore_levels]) + join_names = join_names.__add__([dl]) + + # Inject -1 in the labels list where a join was not possible + # IOW indexer[i]=-1 + labels = [restore_labels[i] if i != -1 else -1 for i in indexer] + join_labels = join_labels.__add__([labels]) + + return join_levels, join_labels, join_names + + class _OrderedMerge(_MergeOperation): _merge_type = 'ordered_merge' @@ -1253,12 +1329,10 @@ def _get_merge_keys(self): join_names) = super(_AsOfMerge, self)._get_merge_keys() # validate index types are the same - for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)): + for lk, rk in zip(left_join_keys, right_join_keys): if not is_dtype_equal(lk.dtype, rk.dtype): - raise MergeError("incompatible merge keys [{i}] {lkdtype} and " - "{rkdtype}, must be the same type" - .format(i=i, lkdtype=lk.dtype, - rkdtype=rk.dtype)) + raise MergeError("incompatible merge keys, " + "must be the same type") # validate tolerance; must be a Timedelta if we have a DTI if self.tolerance is not None: @@ -1268,10 +1342,8 @@ def _get_merge_keys(self): else: lt = left_join_keys[-1] - msg = ("incompatible tolerance {tolerance}, must be compat " - "with type {lkdtype}".format( - tolerance=type(self.tolerance), - lkdtype=lt.dtype)) + msg = "incompatible tolerance, must be compat " \ + "with type {lt}".format(lt=type(lt)) if is_datetime64_dtype(lt) or is_datetime64tz_dtype(lt): if not isinstance(self.tolerance, 
Timedelta): @@ -1507,12 +1579,12 @@ def _sort_labels(uniques, left, right): # tuplesafe uniques = Index(uniques).values - llength = len(left) + l = len(left) labels = np.concatenate([left, right]) _, new_labels = sorting.safe_sort(uniques, labels, na_sentinel=-1) new_labels = _ensure_int64(new_labels) - new_left, new_right = new_labels[:llength], new_labels[llength:] + new_left, new_right = new_labels[:l], new_labels[l:] return new_left, new_right diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index 172667c9a0fb86..cbe6cccd605f72 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -1206,8 +1206,10 @@ def test_merge_na_keys(self): tm.assert_frame_equal(result, expected) - def test_join_multi_levels(self): +class TestMultiMulti(object): + + def test_join_multi_levels(self): # GH 3662 # merge multi-levels household = ( @@ -1242,9 +1244,9 @@ def test_join_multi_levels(self): 'Postbank BioTech Fonds'], share=[1.00, 0.40, 0.60, 0.15, 0.60, 0.25], household_id=[1, 2, 2, 3, 3, 3], - asset_id=['nl0000301109', 'nl0000289783', 'gb00b03mlx29', - 'gb00b03mlx29', 'lu0197800237', - 'nl0000289965'])) + asset_id=['nl0000301109', 'nl0000289783', + 'gb00b03mlx29', 'gb00b03mlx29', + 'lu0197800237', 'nl0000289965'])) .set_index(['household_id', 'asset_id']) .reindex(columns=['male', 'wealth', 'name', 'share'])) assert_frame_equal(result, expected) @@ -1283,7 +1285,6 @@ def f(): pytest.raises(ValueError, f) def test_join_multi_levels2(self): - # some more advanced merges # GH6360 household = ( @@ -1319,11 +1320,7 @@ def test_join_multi_levels2(self): .set_index(["household_id", "asset_id", "t"]) .reindex(columns=['share', 'log_return'])) - def f(): - household.join(log_return, how='inner') - pytest.raises(NotImplementedError, f) - - # this is the equivalency + # this is the equivalency result = (merge(household.reset_index(), log_return.reset_index(), on=['asset_id'], how='inner') .set_index(['household_id', 
'asset_id', 't'])) @@ -1332,7 +1329,7 @@ def f(): expected = ( DataFrame(dict( household_id=[1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4], - asset_id=["nl0000301109", "nl0000289783", "gb00b03mlx29", + asset_id=["nl0000301109", "nl0000301109", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "lu0197800237", "lu0197800237", @@ -1345,11 +1342,134 @@ def f(): .09604978, -.06524096, .03532373, .03025441, .036997, None, None] )) - .set_index(["household_id", "asset_id", "t"])) + .set_index(["household_id", "asset_id", "t"]) + .reindex(columns=['share', 'log_return'])) + + result = (merge(household.reset_index(), log_return.reset_index(), + on=['asset_id'], how='outer') + .set_index(['household_id', 'asset_id', 't'])) + + assert_frame_equal(result, expected) + + +@pytest.fixture +def left_multi(): + return ( + DataFrame( + dict(Origin=['A', 'A', 'B', 'B', 'C'], + Destination=['A', 'B', 'A', 'C', 'A'], + Period=['AM', 'PM', 'IP', 'AM', 'OP'], + TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'], + Trips=[1987, 3647, 2470, 4296, 4444]), + columns=['Origin', 'Destination', 'Period', + 'TripPurp', 'Trips']) + .set_index(['Origin', 'Destination', 'Period', 'TripPurp'])) + + +@pytest.fixture +def right_multi(): + return ( + DataFrame( + dict(Origin=['A', 'A', 'B', 'B', 'C', 'C', 'E'], + Destination=['A', 'B', 'A', 'B', 'A', 'B', 'F'], + Period=['AM', 'PM', 'IP', 'AM', 'OP', 'IP', 'AM'], + LinkType=['a', 'a', 'c', 'b', 'a', 'b', 'a'], + Distance=[100, 80, 90, 80, 75, 35, 55]), + columns=['Origin', 'Destination', 'Period', + 'LinkType', 'Distance']) + .set_index(['Origin', 'Destination', 'Period', 'LinkType'])) + + +@pytest.fixture +def on_cols(): + return ['Origin', 'Destination', 'Period'] + + +@pytest.fixture +def idx_cols(): + return ['Origin', 'Destination', 'Period', 'TripPurp', 'LinkType'] + + +class TestJoinMultiMulti(object): + + @pytest.mark.parametrize('how', ['left', 'right', 'inner', 'outer']) + def test_join_multi_multi(self, 
left_multi, right_multi, how, + on_cols, idx_cols): + # Multi-index join tests + expected = (pd.merge(left_multi.reset_index(), + right_multi.reset_index(), + how=how, on=on_cols).set_index(idx_cols) + .sort_index()) + + result = left_multi.join(right_multi, how=how).sort_index() + tm.assert_frame_equal(result, expected) + + + @pytest.mark.parametrize('how', ['left', 'right', 'inner', 'outer']) + def test_join_multi_multi_emptylevel(self, left_multi, right_multi, how, + on_cols, idx_cols): + # Join with empty level + num_lvls = len(right_multi.index.get_level_values('Period')) + # Set one level to None + right_multi.index.set_levels([np.nan] * num_lvls, level='Period', + inplace=True) + + expected = (pd.merge(left_multi.reset_index(), + right_multi.reset_index(), + how=how, on=on_cols).set_index(idx_cols) + .sort_index()) + + result = left_multi.join(right_multi, how=how).sort_index() + tm.assert_frame_equal(result, expected) + + + @pytest.mark.parametrize('how', ['left', 'right', 'inner', 'outer']) + def test_join_multi_empty_frames(self, left_multi, right_multi, how, + on_cols, idx_cols): + + left_multi = left_multi.drop(columns=left_multi.columns) + right_multi = right_multi.drop(columns=right_multi.columns) + + expected = (pd.merge(left_multi.reset_index(), + right_multi.reset_index(), + how=how, on=on_cols).set_index(idx_cols) + .sort_index()) + + result = left_multi.join(right_multi, how=how).sort_index() + tm.assert_frame_equal(result, expected) + + def test_join_multi_multi_nonunique(self, left_multi): + # Non-unique resulting index + right_multi = ( + DataFrame( + dict(Origin=[1, 1, 2], + Destination=[1, 1, 1], + Period=['AM', 'AM', 'PM'], + LinkType=['a', 'b', 'a'], + Distance=[100, 110, 120]), + columns=['Origin', 'Destination', 'Period', + 'LinkType', 'Distance']) + .set_index(['Origin', 'Destination', 'Period', 'LinkType'])) def f(): - household.join(log_return, how='outer') - pytest.raises(NotImplementedError, f) + left_multi.join(right_multi, 
how='left') + pytest.raises(ValueError, f) + + def test_join_multi_multi_nooverlap(self, left_multi): + # No-overlapping level names + right_multi = ( + DataFrame( + dict(Orig=[1, 1, 2, 2, 3, 3, 5], + Dest=[1, 2, 1, 2, 1, 2, 6], + Per=['AM', 'PM', 'IP', 'AM', 'OP', 'IP', 'AM'], + LinkTyp=['a', 'a', 'c', 'b', 'a', 'b', 'a'], + Dist=[100, 80, 90, 80, 75, 35, 55]), + columns=['Orig', 'Dest', 'Per', 'LinkTyp', 'Dist']) + .set_index(['Orig', 'Dest', 'Per', 'LinkTyp'])) + + def f(): + left_multi.join(right_multi, how='left') + pytest.raises(ValueError, f) @pytest.fixture @@ -1365,12 +1485,22 @@ def df(): class TestMergeDtypes(object): - def test_different(self, df): + @pytest.fixture + def df(self): + return DataFrame( + {'A': ['foo', 'bar'], + 'B': Series(['foo', 'bar']).astype('category'), + 'C': [1, 2], + 'D': [1.0, 2.0], + 'E': Series([1, 2], dtype='uint64'), + 'F': Series([1, 2], dtype='int32')}) + + def test_different(self): # we expect differences by kind # to be ok, while other differences should return object - - left = df + df = self.df() + left = self.df() for col in df.columns: right = DataFrame({'A': df[col]}) result = pd.merge(left, right, on='A') @@ -1418,6 +1548,15 @@ def left(): size=(10,))).astype(CDT(['foo', 'bar'])), 'Y': np.random.choice(['one', 'two', 'three'], size=(10,))}) + @pytest.fixture + def left(self): + np.random.seed(1234) + return DataFrame( + {'X': Series(np.random.choice( + ['foo', 'bar'], + size=(10,))).astype('category', categories=['foo', 'bar']), + 'Y': np.random.choice(['one', 'two', 'three'], size=(10,))}) + @pytest.fixture def right(): @@ -1426,10 +1565,8 @@ def right(): {'X': Series(['foo', 'bar']).astype(CDT(['foo', 'bar'])), 'Z': [1, 2]}) - -class TestMergeCategorical(object): - - def test_identical(self, left): + def test_identical(self): + left = self.left() # merging on the same, should preserve dtypes merged = pd.merge(left, left, on='X') result = merged.dtypes.sort_index() @@ -1439,9 +1576,11 @@ def test_identical(self, 
left): index=['X', 'Y_x', 'Y_y']) assert_series_equal(result, expected) - def test_basic(self, left, right): + def test_basic(self): # we have matching Categorical dtypes in X # so should preserve the merged column + left = self.left() + right = self.right() merged = pd.merge(left, right, on='X') result = merged.dtypes.sort_index() expected = Series([CategoricalDtype(), @@ -1450,8 +1589,10 @@ def test_basic(self, left, right): index=['X', 'Y', 'Z']) assert_series_equal(result, expected) - def test_other_columns(self, left, right): + def test_other_columns(self): # non-merge columns should preserve if possible + left = self.left() + right = self.right() right = right.assign(Z=right.Z.astype('category')) merged = pd.merge(left, right, on='X') @@ -1471,10 +1612,11 @@ def test_other_columns(self, left, right): lambda x: x.astype(CDT(['foo', 'bar', 'bah'])), lambda x: x.astype(CDT(ordered=True))]) @pytest.mark.parametrize('how', ['inner', 'outer', 'left', 'right']) - def test_dtype_on_merged_different(self, change, how, left, right): + def test_dtype_on_merged_different(self, change, how): # our merging columns, X now has 2 different dtypes # so we must be object as a result - + left = self.left() + right = self.right() X = change(right.X.astype('object')) right = right.assign(X=X) assert is_categorical_dtype(left.X.values) diff --git a/pandas/tests/reshape/test_merge_asof.py b/pandas/tests/reshape/test_merge_asof.py index 4b2680b9be592b..bb458a62bc8014 100644 --- a/pandas/tests/reshape/test_merge_asof.py +++ b/pandas/tests/reshape/test_merge_asof.py @@ -976,7 +976,7 @@ def test_on_float_by_int(self): def test_merge_datatype_error(self): """ Tests merge datatype mismatch error """ - msg = 'merge keys \[0\] object and int64, must be the same type' + msg = 'incompatible merge keys, must be the same type' left = pd.DataFrame({'left_val': [1, 5, 10], 'a': ['a', 'b', 'c']})