diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py
index 1eb5a994492..276038146e1 100644
--- a/python/cudf/cudf/core/join/join.py
+++ b/python/cudf/cudf/core/join/join.py
@@ -208,7 +208,7 @@ def _compute_join_keys(self):
                 left_keys.extend(
                     [
                         _Indexer(name=on, index=True)
-                        for on in self.lhs.index.names
+                        for on in self.lhs.index._data.names
                     ]
                 )
             if self.left_on:
@@ -223,7 +223,7 @@ def _compute_join_keys(self):
                 right_keys.extend(
                     [
                         _Indexer(name=on, index=True)
-                        for on in self.rhs.index.names
+                        for on in self.rhs.index._data.names
                     ]
                 )
             if self.right_on:
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 079a6d902b6..418d24f41df 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -1524,9 +1524,17 @@ def from_pandas(cls, multiindex, nan_as_null=None):
         if not isinstance(multiindex, pd.MultiIndex):
             raise TypeError("not a pandas.MultiIndex")
 
+        # if `multiindex` has two or more levels that
+        # have the same name, then `multiindex.to_frame()`
+        # results in a DataFrame containing only one of those
+        # levels. Thus, set `names` to some tuple of unique values
+        # and then call `multiindex.to_frame(name=names)`,
+        # which preserves all levels of `multiindex`.
+        names = tuple(range(len(multiindex.names)))
+
         mi = cls(
             names=multiindex.names,
-            source_data=multiindex.to_frame(),
+            source_data=multiindex.to_frame(name=names),
             nan_as_null=nan_as_null,
         )
 
diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py
index c37939df7d3..b18cce60bfd 100644
--- a/python/cudf/cudf/tests/test_joining.py
+++ b/python/cudf/cudf/tests/test_joining.py
@@ -2112,3 +2112,23 @@ def test_string_join_values_nulls():
     got = got.sort_values(by=["a", "b", "c"]).reset_index(drop=True)
 
     assert_join_results_equal(expect, got, how="left")
+
+
+def test_join_on_index_with_duplicate_names():
+    # although index levels with duplicate names are poorly supported
+    # overall, we *should* be able to join on them:
+    lhs = pd.DataFrame({"a": [1, 2, 3]})
+    rhs = pd.DataFrame({"b": [1, 2, 3]})
+    lhs.index = pd.MultiIndex.from_tuples(
+        [(1, 1), (1, 2), (2, 1)], names=["x", "x"]
+    )
+    rhs.index = pd.MultiIndex.from_tuples(
+        [(1, 1), (1, 3), (2, 1)], names=["x", "x"]
+    )
+    expect = lhs.join(rhs, how="inner")
+
+    lhs = cudf.from_pandas(lhs)
+    rhs = cudf.from_pandas(rhs)
+    got = lhs.join(rhs, how="inner")
+
+    assert_join_results_equal(expect, got, how="inner")