Skip to content

Commit

Permalink
Deprecate index merging (#10689)
Browse files Browse the repository at this point in the history
This PR deprecates support for merging Index objects. pandas only supports merging DataFrames, so cudf should move in the same direction. The main internal implication of this change is that `BaseIndex.difference` and `BaseIndex._intersection` now convert their operands to `DataFrame` objects, merge those, and convert the result back to the appropriate index type. Since the intermediate objects are not modified and don't involve additional memory allocations, this change only adds a small amount of Python overhead to index merging (10–50 µs). Once the deprecated code is fully removed, we should be able to win this time back by simplifying the joining internals, which currently contain special-case logic for handling Series and Index objects.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Michael Wang (https://github.com/isVoid)

URL: #10689
  • Loading branch information
vyasr authored Apr 26, 2022
1 parent 41dfdc2 commit 75a675b
Showing 1 changed file with 27 additions and 2 deletions.
29 changes: 27 additions & 2 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import annotations

import pickle
import warnings
from functools import cached_property
from typing import Any, Set

Expand Down Expand Up @@ -707,7 +708,18 @@ def difference(self, other, sort=None):
if is_mixed_with_object_dtype(self, other):
difference = self.copy()
else:
difference = self.join(other, how="leftanti")
other = other.copy(deep=False)
other.names = self.names
difference = cudf.core.index._index_from_data(
cudf.DataFrame._from_data(self._data)
._merge(
cudf.DataFrame._from_data(other._data),
how="leftanti",
on=self.name,
)
._data
)

if self.dtype != other.dtype:
difference = difference.astype(self.dtype)

Expand Down Expand Up @@ -989,7 +1001,17 @@ def _union(self, other, sort=None):
return union_result

def _intersection(self, other, sort=None):
intersection_result = self.unique().join(other.unique(), how="inner")
other_unique = other.unique()
other_unique.names = self.names
intersection_result = cudf.core.index._index_from_data(
cudf.DataFrame._from_data(self.unique()._data)
._merge(
cudf.DataFrame._from_data(other_unique._data),
how="inner",
on=self.name,
)
._data
)

if sort is None and len(other):
return intersection_result.sort_values()
Expand Down Expand Up @@ -1141,6 +1163,9 @@ def join(
(1, 2)],
names=['a', 'b'])
"""
warnings.warn(
"Index.join is deprecated and will be removed", FutureWarning
)

if isinstance(self, cudf.MultiIndex) and isinstance(
other, cudf.MultiIndex
Expand Down

0 comments on commit 75a675b

Please sign in to comment.