Skip to content

Commit

Permalink
Rebase
Browse files Browse the repository at this point in the history
  • Loading branch information
harisbal committed Nov 13, 2017
1 parent 7495e9a commit c7a1833
Show file tree
Hide file tree
Showing 5 changed files with 318 additions and 95 deletions.
111 changes: 59 additions & 52 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2490,6 +2490,7 @@ def _get_unique_index(self, dropna=False):
includes list, tuple, array, Series, and must be the same size as
the index and its dtype must exactly match the index's type.
.. versionadded:: 0.17.0
.. versionadded:: 0.21.0 (list-like tolerance)
Returns
Expand Down Expand Up @@ -2639,6 +2640,7 @@ def _get_level_values(self, level):
the same size as the index and its dtype must exactly match the
index's type.
.. versionadded:: 0.17.0
.. versionadded:: 0.21.0 (list-like tolerance)
Examples
Expand Down Expand Up @@ -3180,46 +3182,68 @@ def join(self, other, how='left', level=None, return_indexers=False,

def _join_multi(self, other, how, return_indexers=True):
from .multi import MultiIndex
self_is_mi = isinstance(self, MultiIndex)
other_is_mi = isinstance(other, MultiIndex)
from pandas.core.reshape.merge import _complete_multilevel_join

# figure out join names
self_names = _not_none(*self.names)
other_names = _not_none(*other.names)
self_names = list(_not_none(*self.names))
other_names = list(_not_none(*other.names))
overlap = list(set(self_names) & set(other_names))

# need at least 1 in common, but not more than 1
# need at least 1 in common
if not len(overlap):
raise ValueError("cannot join with no level specified and no "
"overlapping names")
if len(overlap) > 1:
raise NotImplementedError("merging with more than one level "
"overlap on a multi-index is not "
"implemented")
jl = overlap[0]
raise ValueError("cannot join with no overlapping index names")

self_is_mi = isinstance(self, MultiIndex)
other_is_mi = isinstance(other, MultiIndex)

# Drop the non matching levels
ldrop_levels = list(set(self_names) - set(overlap))
rdrop_levels = list(set(other_names) - set(overlap))

if self_is_mi and other_is_mi:
self_jnlevels = self.droplevel(ldrop_levels)
other_jnlevels = other.droplevel(rdrop_levels)

if not (self_jnlevels.is_unique and other_jnlevels.is_unique):
raise ValueError("Join on level between two MultiIndex objects "
"is ambiguous")

dropped_levels = ldrop_levels + rdrop_levels

join_idx, lidx, ridx = self_jnlevels.join(other_jnlevels, how,
return_indexers=True)

levels, labels, names = _complete_multilevel_join(self, other, how,
dropped_levels,
join_idx,
lidx, ridx)

multi_join_idx = MultiIndex(levels=levels, labels=labels,
names=names, verify_integrity=False)

# Check for unused levels
multi_join_idx = multi_join_idx.remove_unused_levels()

return multi_join_idx, lidx, ridx

jl = list(overlap)[0]

# make the indices into mi's that match
if not (self_is_mi and other_is_mi):

flip_order = False
if self_is_mi:
self, other = other, self
flip_order = True
# flip if join method is right or left
how = {'right': 'left', 'left': 'right'}.get(how, how)

level = other.names.index(jl)
result = self._join_level(other, level, how=how,
return_indexers=return_indexers)

if flip_order:
if isinstance(result, tuple):
return result[0], result[2], result[1]
return result
flip_order = False
if self_is_mi:
self, other = other, self
flip_order = True
# flip if join method is right or left
how = {'right': 'left', 'left': 'right'}.get(how, how)

# 2 multi-indexes
raise NotImplementedError("merging with both multi-indexes is not "
"implemented")
level = other.names.index(jl)
result = self._join_level(other, level, how=how,
return_indexers=return_indexers)

if flip_order:
if isinstance(result, tuple):
return result[0], result[2], result[1]
return result

def _join_non_unique(self, other, how='left', return_indexers=False):
from pandas.core.reshape.merge import _get_join_indexers
Expand Down Expand Up @@ -3428,8 +3452,8 @@ def _get_string_slice(self, key, use_lhs=True, use_rhs=True):

def slice_indexer(self, start=None, end=None, step=None, kind=None):
"""
For an ordered or unique index, compute the slice indexer for input
labels and step.
For an ordered Index, compute the slice indexer for input labels and
step
Parameters
----------
Expand All @@ -3442,28 +3466,11 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None):
Returns
-------
indexer : slice
Raises
------
KeyError : If key does not exist, or key is not unique and index is
not ordered.
indexer : ndarray or slice
Notes
-----
This function assumes that the data is sorted, so use at your own peril
Examples
---------
This is a method on all index types. For example you can do:
>>> idx = pd.Index(list('abcd'))
>>> idx.slice_indexer(start='b', end='c')
slice(1, 3)
>>> idx = pd.MultiIndex.from_arrays([list('abcd'), list('efgh')])
>>> idx.slice_indexer(start='b', end=('c', 'g'))
slice(1, 3)
"""
start_slice, end_slice = self.slice_locs(start, end, step=step,
kind=kind)
Expand Down
12 changes: 7 additions & 5 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1345,9 +1345,10 @@ def remove_unused_levels(self):
for lev, lab in zip(self.levels, self.labels):

uniques = algos.unique(lab)

# remove if NaN in index
uniques_no_nans = uniques[uniques != -1]
# nothing unused
if len(uniques) == len(lev):
if len(uniques_no_nans) == len(lev):
new_levels.append(lev)
new_labels.append(lab)
continue
Expand All @@ -1356,11 +1357,12 @@ def remove_unused_levels(self):

# labels get mapped from uniques to 0:len(uniques)
label_mapping = np.zeros(len(lev))
label_mapping[uniques] = np.arange(len(uniques))
lab = label_mapping[lab]
label_mapping[uniques_no_nans] = np.arange(len(uniques_no_nans))
# apply the mapping where lab != -1
lab = np.where(lab != -1, label_mapping[lab], -1)

# new levels are simple
lev = lev.take(uniques)
lev = lev.take(uniques_no_nans)

new_levels.append(lev)
new_labels.append(lab)
Expand Down
96 changes: 84 additions & 12 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def _groupby_and_merge(by, on, left, right, _merge_pieces,
try:
if k in merged:
merged[k] = key
except KeyError:
except:
pass

pieces.append(merged)
Expand Down Expand Up @@ -1066,6 +1066,82 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner',
return join_func(lkey, rkey, count, **kwargs)


def _complete_multilevel_join(left, right, how, dropped_levels,
join_idx, lidx, ridx):
"""
*this is an internal non-public method*
Returns the levels, labels and names of a multilevel to multilevel join
Depending on the type of join, this method restores the appropriate
dropped levels of the joined multi-index. The method relies on lidx, ridx
which hold the index positions of left and right, where a join was feasible
Parameters
----------
left : Index
left index
right : Index
right index
join_idx : Index
the index of the join between the common levels of left and right
how : {'left', 'right', 'outer', 'inner'}
lidx : intp array
left indexer
right : intp array
right indexer
dropped_levels : str array
list of non-common levels
Returns
-------
levels : intp array
levels of combined multiindexes
labels : str array
labels of combined multiindexes
names : str array
names of combined multiindexes
"""

join_levels = join_idx.levels
join_labels = join_idx.labels
join_names = join_idx.names

# lidx and ridx hold the indexes where the join occured
# for left and right respectively. If left (right) is None it means that
# the join occured on all indices of left (right)
if lidx is None:
lidx = range(0, len(left))

if ridx is None:
ridx = range(0, len(right))

# Iterate through the levels that must be restored
for dl in dropped_levels:
if dl in left.names:
idx = left
indexer = lidx
else:
idx = right
indexer = ridx

# The index of the level name to be restored
name_idx = idx.names.index(dl)

restore_levels = idx.levels[name_idx].values
restore_labels = idx.labels[name_idx]

join_levels = join_levels.__add__([restore_levels])
join_names = join_names.__add__([dl])

# Inject -1 in the labels list where a join was not possible
# IOW indexer[i]=-1
labels = [restore_labels[i] if i != -1 else -1 for i in indexer]
join_labels = join_labels.__add__([labels])

return join_levels, join_labels, join_names


class _OrderedMerge(_MergeOperation):
_merge_type = 'ordered_merge'

Expand Down Expand Up @@ -1253,12 +1329,10 @@ def _get_merge_keys(self):
join_names) = super(_AsOfMerge, self)._get_merge_keys()

# validate index types are the same
for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)):
for lk, rk in zip(left_join_keys, right_join_keys):
if not is_dtype_equal(lk.dtype, rk.dtype):
raise MergeError("incompatible merge keys [{i}] {lkdtype} and "
"{rkdtype}, must be the same type"
.format(i=i, lkdtype=lk.dtype,
rkdtype=rk.dtype))
raise MergeError("incompatible merge keys, "
"must be the same type")

# validate tolerance; must be a Timedelta if we have a DTI
if self.tolerance is not None:
Expand All @@ -1268,10 +1342,8 @@ def _get_merge_keys(self):
else:
lt = left_join_keys[-1]

msg = ("incompatible tolerance {tolerance}, must be compat "
"with type {lkdtype}".format(
tolerance=type(self.tolerance),
lkdtype=lt.dtype))
msg = "incompatible tolerance, must be compat " \
"with type {lt}".format(lt=type(lt))

if is_datetime64_dtype(lt) or is_datetime64tz_dtype(lt):
if not isinstance(self.tolerance, Timedelta):
Expand Down Expand Up @@ -1507,12 +1579,12 @@ def _sort_labels(uniques, left, right):
# tuplesafe
uniques = Index(uniques).values

llength = len(left)
l = len(left)
labels = np.concatenate([left, right])

_, new_labels = sorting.safe_sort(uniques, labels, na_sentinel=-1)
new_labels = _ensure_int64(new_labels)
new_left, new_right = new_labels[:llength], new_labels[llength:]
new_left, new_right = new_labels[:l], new_labels[l:]

return new_left, new_right

Expand Down
Loading

0 comments on commit c7a1833

Please sign in to comment.