Skip to content

Commit

Permalink
ENH: Allow for join between two multi-index dataframe instances (#20356)
Browse files Browse the repository at this point in the history
  • Loading branch information
harisbal authored and TomAugspurger committed Nov 15, 2018
1 parent fb68731 commit 88cbce3
Show file tree
Hide file tree
Showing 5 changed files with 866 additions and 554 deletions.
41 changes: 41 additions & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,47 @@ array, but rather an ``ExtensionArray``:
This is the same behavior as ``Series.values`` for categorical data. See
:ref:`whatsnew_0240.api_breaking.interval_values` for more.

.. _whatsnew_0240.enhancements.join_with_two_multiindexes:

Joining with two multi-indexes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

:func:`DataFrame.merge` and :func:`DataFrame.join` can now be used to join multi-indexed ``DataFrame`` instances on the overlapping index levels (:issue:`6360`)

See the :ref:`Merge, join, and concatenate
<merging.Join_with_two_multi_indexes>` documentation section.

.. ipython:: python

   index_left = pd.MultiIndex.from_tuples([('K0', 'X0'), ('K0', 'X1'),
                                           ('K1', 'X2')],
                                          names=['key', 'X'])
   left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                        'B': ['B0', 'B1', 'B2']},
                       index=index_left)
   index_right = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'),
                                            ('K2', 'Y2'), ('K2', 'Y3')],
                                           names=['key', 'Y'])
   right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'],
                         'D': ['D0', 'D1', 'D2', 'D3']},
                        index=index_right)
   left.join(right)

For earlier versions this can be done using the following.

.. ipython:: python

   pd.merge(left.reset_index(), right.reset_index(),
            on=['key'], how='inner').set_index(['key', 'X', 'Y'])

.. _whatsnew_0240.enhancements.rename_axis:

Renaming names in a MultiIndex
Expand Down
98 changes: 62 additions & 36 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3168,8 +3168,8 @@ def get_value(self, series, key):
iloc = self.get_loc(key)
return s[iloc]
except KeyError:
if (len(self) > 0
and (self.holds_integer() or self.is_boolean())):
if (len(self) > 0 and
(self.holds_integer() or self.is_boolean())):
raise
elif is_integer(key):
return s[key]
Expand Down Expand Up @@ -3957,46 +3957,72 @@ def join(self, other, how='left', level=None, return_indexers=False,

def _join_multi(self, other, how, return_indexers=True):
    """
    Join ``self`` with ``other`` on the level names they have in common.

    Two situations are handled:

    * both indexes are MultiIndex: the non-shared levels are dropped, the
      remaining (shared) levels are joined, and the dropped levels are
      then re-attached to the result;
    * at most one index is a MultiIndex: the join is delegated to
      ``_join_level`` on the single shared level.

    Parameters
    ----------
    other : Index or MultiIndex
        The index to join with.
    how : str
        Join method, forwarded to the underlying join. When the operands
        have to be swapped, 'left' and 'right' are exchanged.
    return_indexers : bool, default True
        Forwarded to ``_join_level`` in the single-MultiIndex case. The
        multi-to-multi case always returns the indexers.

    Returns
    -------
    MultiIndex, indexer, indexer
        For the multi-to-multi case: the joined index followed by the
        indexers for ``self`` and ``other``.
    object
        Otherwise whatever ``_join_level`` returns; if that is a tuple
        and the operands were swapped, its second and third elements
        are swapped back.

    Raises
    ------
    ValueError
        If the two indexes have no level name in common.
    """
    from .multi import MultiIndex
    from pandas.core.reshape.merge import _restore_dropped_levels_multijoin

    # figure out join names; sets are used so that the shared and the
    # non-shared level names can be computed with set algebra below
    self_names = set(com._not_none(*self.names))
    other_names = set(com._not_none(*other.names))
    overlap = self_names & other_names

    # need at least 1 in common
    if not overlap:
        raise ValueError("cannot join with no overlapping index names")

    self_is_mi = isinstance(self, MultiIndex)
    other_is_mi = isinstance(other, MultiIndex)

    if self_is_mi and other_is_mi:

        # Drop the non-matching levels from left and right respectively.
        # NOTE: these lists come from set differences, so their order
        # follows set iteration order.
        ldrop_names = list(self_names - overlap)
        rdrop_names = list(other_names - overlap)

        self_jnlevels = self.droplevel(ldrop_names)
        other_jnlevels = other.droplevel(rdrop_names)

        # Join left and right
        # Join on same leveled multi-index frames is supported
        join_idx, lidx, ridx = self_jnlevels.join(other_jnlevels, how,
                                                  return_indexers=True)

        # Restore the dropped levels
        # Returned index level order is
        # common levels, ldrop_names, rdrop_names
        dropped_names = ldrop_names + rdrop_names

        levels, labels, names = (
            _restore_dropped_levels_multijoin(self, other,
                                              dropped_names,
                                              join_idx,
                                              lidx, ridx))

        # Re-create the multi-index
        multi_join_idx = MultiIndex(levels=levels, labels=labels,
                                    names=names, verify_integrity=False)

        multi_join_idx = multi_join_idx.remove_unused_levels()

        return multi_join_idx, lidx, ridx

    jl = list(overlap)[0]

    # Case where only one index is multi
    # make the indices into mi's that match
    flip_order = False
    if self_is_mi:
        # ``_join_level`` is called with the MultiIndex as ``other``,
        # so swap the operands when ``self`` is the MultiIndex
        self, other = other, self
        flip_order = True
        # flip if join method is right or left
        how = {'right': 'left', 'left': 'right'}.get(how, how)

    level = other.names.index(jl)
    result = self._join_level(other, level, how=how,
                              return_indexers=return_indexers)

    if flip_order:
        if isinstance(result, tuple):
            # undo the operand swap: exchange the two indexers
            return result[0], result[2], result[1]
    return result

def _join_non_unique(self, other, how='left', return_indexers=False):
from pandas.core.reshape.merge import _get_join_indexers
Expand Down
89 changes: 89 additions & 0 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1122,6 +1122,95 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner',
return join_func(lkey, rkey, count, **kwargs)


def _restore_dropped_levels_multijoin(left, right, dropped_level_names,
                                      join_index, lindexer, rindexer):
    """
    *this is an internal non-public method*

    Returns the levels, labels and names of a multi-index to multi-index join.
    Depending on the type of join, this method restores the appropriate
    dropped levels of the joined multi-index.
    The method relies on lindexer, rindexer which hold the index positions of
    left and right, where a join was feasible

    Parameters
    ----------
    left : MultiIndex
        left index
    right : MultiIndex
        right index
    dropped_level_names : str array
        list of non-common level names
    join_index : MultiIndex
        the index of the join between the
        common levels of left and right
    lindexer : intp array
        left indexer; ``None`` means every position of ``left`` is used
    rindexer : intp array
        right indexer; ``None`` means every position of ``right`` is used

    Returns
    -------
    levels : list of Index
        levels of combined multiindexes
    labels : list of intp arrays
        labels of combined multiindexes, one array per level
    names : str array
        names of combined multiindexes
    """

    def _convert_to_multiindex(index):
        # Wrap a flat index into a single-level MultiIndex so that it
        # exposes ``levels`` / ``labels`` / ``names`` like ``join_index``
        # does in the multi-level case.
        if isinstance(index, MultiIndex):
            return index
        else:
            return MultiIndex.from_arrays([index.values],
                                          names=[index.name])

    # For multi-multi joins with one overlapping level,
    # the returned index is of type Index
    # Assure that join_index is of type MultiIndex
    # so that dropped levels can be appended
    join_index = _convert_to_multiindex(join_index)

    join_levels = join_index.levels
    join_labels = join_index.labels
    join_names = join_index.names

    # lindexer and rindexer hold the indexes where the join occurred
    # for left and right respectively. If left/right is None then
    # the join occurred on all indices of left/right
    if lindexer is None:
        lindexer = range(left.size)

    if rindexer is None:
        rindexer = range(right.size)

    # Iterate through the levels that must be restored
    for dropped_level_name in dropped_level_names:
        # A dropped level belongs to exactly one side; pick that side's
        # index together with the matching indexer.
        if dropped_level_name in left.names:
            idx = left
            indexer = lindexer
        else:
            idx = right
            indexer = rindexer

        # The index of the level name to be restored
        name_idx = idx.names.index(dropped_level_name)

        restore_levels = idx.levels[name_idx]
        # Inject -1 in the labels list where a join was not possible
        # IOW indexer[i]=-1
        labels = idx.labels[name_idx]
        restore_labels = algos.take_nd(labels, indexer, fill_value=-1)

        # Append (rather than mutate) so the sequences taken from
        # ``join_index`` are left untouched.
        join_levels = join_levels + [restore_levels]
        join_labels = join_labels + [restore_labels]
        join_names = join_names + [dropped_level_name]

    return join_levels, join_labels, join_names


class _OrderedMerge(_MergeOperation):
_merge_type = 'ordered_merge'

Expand Down
Loading

0 comments on commit 88cbce3

Please sign in to comment.