Skip to content

Commit

Permalink
Rebase
Browse files Browse the repository at this point in the history
  • Loading branch information
harisbal committed Nov 13, 2017
1 parent 7495e9a commit c7a1833
Show file tree
Hide file tree
Showing 5 changed files with 318 additions and 95 deletions.
111 changes: 59 additions & 52 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2490,6 +2490,7 @@ def _get_unique_index(self, dropna=False):
includes list, tuple, array, Series, and must be the same size as
the index and its dtype must exactly match the index's type.
.. versionadded:: 0.17.0
.. versionadded:: 0.21.0 (list-like tolerance)
Returns
Expand Down Expand Up @@ -2639,6 +2640,7 @@ def _get_level_values(self, level):
the same size as the index and its dtype must exactly match the
index's type.
.. versionadded:: 0.17.0
.. versionadded:: 0.21.0 (list-like tolerance)
Examples
Expand Down Expand Up @@ -3180,46 +3182,68 @@ def join(self, other, how='left', level=None, return_indexers=False,

def _join_multi(self, other, how, return_indexers=True):
from .multi import MultiIndex
self_is_mi = isinstance(self, MultiIndex)
other_is_mi = isinstance(other, MultiIndex)
from pandas.core.reshape.merge import _complete_multilevel_join

# figure out join names
self_names = _not_none(*self.names)
other_names = _not_none(*other.names)
self_names = list(_not_none(*self.names))
other_names = list(_not_none(*other.names))
overlap = list(set(self_names) & set(other_names))

# need at least 1 in common, but not more than 1
# need at least 1 in common
if not len(overlap):
raise ValueError("cannot join with no level specified and no "
"overlapping names")
if len(overlap) > 1:
raise NotImplementedError("merging with more than one level "
"overlap on a multi-index is not "
"implemented")
jl = overlap[0]
raise ValueError("cannot join with no overlapping index names")

self_is_mi = isinstance(self, MultiIndex)
other_is_mi = isinstance(other, MultiIndex)

# Drop the non matching levels
ldrop_levels = list(set(self_names) - set(overlap))
rdrop_levels = list(set(other_names) - set(overlap))

if self_is_mi and other_is_mi:
self_jnlevels = self.droplevel(ldrop_levels)
other_jnlevels = other.droplevel(rdrop_levels)

if not (self_jnlevels.is_unique and other_jnlevels.is_unique):
raise ValueError("Join on level between two MultiIndex objects "
"is ambiguous")

dropped_levels = ldrop_levels + rdrop_levels

join_idx, lidx, ridx = self_jnlevels.join(other_jnlevels, how,
return_indexers=True)

levels, labels, names = _complete_multilevel_join(self, other, how,
dropped_levels,
join_idx,
lidx, ridx)

multi_join_idx = MultiIndex(levels=levels, labels=labels,
names=names, verify_integrity=False)

# Check for unused levels
multi_join_idx = multi_join_idx.remove_unused_levels()

return multi_join_idx, lidx, ridx

jl = list(overlap)[0]

# make the indices into mi's that match
if not (self_is_mi and other_is_mi):

flip_order = False
if self_is_mi:
self, other = other, self
flip_order = True
# flip if join method is right or left
how = {'right': 'left', 'left': 'right'}.get(how, how)

level = other.names.index(jl)
result = self._join_level(other, level, how=how,
return_indexers=return_indexers)

if flip_order:
if isinstance(result, tuple):
return result[0], result[2], result[1]
return result
flip_order = False
if self_is_mi:
self, other = other, self
flip_order = True
# flip if join method is right or left
how = {'right': 'left', 'left': 'right'}.get(how, how)

# 2 multi-indexes
raise NotImplementedError("merging with both multi-indexes is not "
"implemented")
level = other.names.index(jl)
result = self._join_level(other, level, how=how,
return_indexers=return_indexers)

if flip_order:
if isinstance(result, tuple):
return result[0], result[2], result[1]
return result

def _join_non_unique(self, other, how='left', return_indexers=False):
from pandas.core.reshape.merge import _get_join_indexers
Expand Down Expand Up @@ -3428,8 +3452,8 @@ def _get_string_slice(self, key, use_lhs=True, use_rhs=True):

def slice_indexer(self, start=None, end=None, step=None, kind=None):
"""
For an ordered or unique index, compute the slice indexer for input
labels and step.
For an ordered Index, compute the slice indexer for input labels and
step
Parameters
----------
Expand All @@ -3442,28 +3466,11 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None):
Returns
-------
indexer : slice
Raises
------
KeyError : If key does not exist, or key is not unique and index is
not ordered.
indexer : ndarray or slice
Notes
-----
This function assumes that the data is sorted, so use at your own peril
Examples
---------
This is a method on all index types. For example you can do:
>>> idx = pd.Index(list('abcd'))
>>> idx.slice_indexer(start='b', end='c')
slice(1, 3)
>>> idx = pd.MultiIndex.from_arrays([list('abcd'), list('efgh')])
>>> idx.slice_indexer(start='b', end=('c', 'g'))
slice(1, 3)
"""
start_slice, end_slice = self.slice_locs(start, end, step=step,
kind=kind)
Expand Down
12 changes: 7 additions & 5 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1345,9 +1345,10 @@ def remove_unused_levels(self):
for lev, lab in zip(self.levels, self.labels):

uniques = algos.unique(lab)

# remove if NaN in index
uniques_no_nans = uniques[uniques != -1]
# nothing unused
if len(uniques) == len(lev):
if len(uniques_no_nans) == len(lev):
new_levels.append(lev)
new_labels.append(lab)
continue
Expand All @@ -1356,11 +1357,12 @@ def remove_unused_levels(self):

# labels get mapped from uniques to 0:len(uniques)
label_mapping = np.zeros(len(lev))
label_mapping[uniques] = np.arange(len(uniques))
lab = label_mapping[lab]
label_mapping[uniques_no_nans] = np.arange(len(uniques_no_nans))
# apply the mapping where lab != -1
lab = np.where(lab != -1, label_mapping[lab], -1)

# new levels are simple
lev = lev.take(uniques)
lev = lev.take(uniques_no_nans)

new_levels.append(lev)
new_labels.append(lab)
Expand Down
96 changes: 84 additions & 12 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def _groupby_and_merge(by, on, left, right, _merge_pieces,
try:
if k in merged:
merged[k] = key
except KeyError:
except:
pass

pieces.append(merged)
Expand Down Expand Up @@ -1066,6 +1066,82 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner',
return join_func(lkey, rkey, count, **kwargs)


def _complete_multilevel_join(left, right, how, dropped_levels,
join_idx, lidx, ridx):
"""
*this is an internal non-public method*
Returns the levels, labels and names of a multilevel to multilevel join
Depending on the type of join, this method restores the appropriate
dropped levels of the joined multi-index. The method relies on lidx, ridx
which hold the index positions of left and right, where a join was feasible
Parameters
----------
left : Index
left index
right : Index
right index
join_idx : Index
the index of the join between the common levels of left and right
how : {'left', 'right', 'outer', 'inner'}
lidx : intp array
left indexer
right : intp array
right indexer
dropped_levels : str array
list of non-common levels
Returns
-------
levels : intp array
levels of combined multiindexes
labels : str array
labels of combined multiindexes
names : str array
names of combined multiindexes
"""

join_levels = join_idx.levels
join_labels = join_idx.labels
join_names = join_idx.names

# lidx and ridx hold the indexes where the join occured
# for left and right respectively. If left (right) is None it means that
# the join occured on all indices of left (right)
if lidx is None:
lidx = range(0, len(left))

if ridx is None:
ridx = range(0, len(right))

# Iterate through the levels that must be restored
for dl in dropped_levels:
if dl in left.names:
idx = left
indexer = lidx
else:
idx = right
indexer = ridx

# The index of the level name to be restored
name_idx = idx.names.index(dl)

restore_levels = idx.levels[name_idx].values
restore_labels = idx.labels[name_idx]

join_levels = join_levels.__add__([restore_levels])
join_names = join_names.__add__([dl])

# Inject -1 in the labels list where a join was not possible
# IOW indexer[i]=-1
labels = [restore_labels[i] if i != -1 else -1 for i in indexer]
join_labels = join_labels.__add__([labels])

return join_levels, join_labels, join_names


class _OrderedMerge(_MergeOperation):
_merge_type = 'ordered_merge'

Expand Down Expand Up @@ -1253,12 +1329,10 @@ def _get_merge_keys(self):
join_names) = super(_AsOfMerge, self)._get_merge_keys()

# validate index types are the same
for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)):
for lk, rk in zip(left_join_keys, right_join_keys):
if not is_dtype_equal(lk.dtype, rk.dtype):
raise MergeError("incompatible merge keys [{i}] {lkdtype} and "
"{rkdtype}, must be the same type"
.format(i=i, lkdtype=lk.dtype,
rkdtype=rk.dtype))
raise MergeError("incompatible merge keys, "
"must be the same type")

# validate tolerance; must be a Timedelta if we have a DTI
if self.tolerance is not None:
Expand All @@ -1268,10 +1342,8 @@ def _get_merge_keys(self):
else:
lt = left_join_keys[-1]

msg = ("incompatible tolerance {tolerance}, must be compat "
"with type {lkdtype}".format(
tolerance=type(self.tolerance),
lkdtype=lt.dtype))
msg = "incompatible tolerance, must be compat " \
"with type {lt}".format(lt=type(lt))

if is_datetime64_dtype(lt) or is_datetime64tz_dtype(lt):
if not isinstance(self.tolerance, Timedelta):
Expand Down Expand Up @@ -1507,12 +1579,12 @@ def _sort_labels(uniques, left, right):
# tuplesafe
uniques = Index(uniques).values

llength = len(left)
l = len(left)
labels = np.concatenate([left, right])

_, new_labels = sorting.safe_sort(uniques, labels, na_sentinel=-1)
new_labels = _ensure_int64(new_labels)
new_left, new_right = new_labels[:llength], new_labels[llength:]
new_left, new_right = new_labels[:l], new_labels[l:]

return new_left, new_right

Expand Down
Loading

0 comments on commit c7a1833

Please sign in to comment.