From 75a675b70171823b88d3fcfc0f0c0fa94b349d25 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 26 Apr 2022 17:07:36 -0400 Subject: [PATCH] Deprecate index merging (#10689) This PR deprecates support for merging Index objects. pandas only supports merging of DataFrames, so we should move towards that as well. The main internal implication of this change is that `BaseIndex.union` and `BaseIndex.difference` now require an internal conversion to a `DataFrame` followed by a conversion of the result back to the appropriate index type. Since the intermediate objects are not modified and don't involve additional memory allocations, this change just adds a little bit of Python overhead to index merging (10-50 us). Once the deprecated code is fully removed, though, we should be able to make this time back by simplifying the internals of joining, which currently has logic for handling Series and Index objects internally. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Michael Wang (https://github.com/isVoid) URL: https://github.com/rapidsai/cudf/pull/10689 --- python/cudf/cudf/core/_base_index.py | 29 ++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 6fed6510484..8dbd71739b5 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -3,6 +3,7 @@ from __future__ import annotations import pickle +import warnings from functools import cached_property from typing import Any, Set @@ -707,7 +708,18 @@ def difference(self, other, sort=None): if is_mixed_with_object_dtype(self, other): difference = self.copy() else: - difference = self.join(other, how="leftanti") + other = other.copy(deep=False) + other.names = self.names + difference = cudf.core.index._index_from_data( + cudf.DataFrame._from_data(self._data) + ._merge( + cudf.DataFrame._from_data(other._data), + how="leftanti", + on=self.name, + ) + ._data + ) + if self.dtype != other.dtype: difference = difference.astype(self.dtype) @@ -989,7 +1001,17 @@ def _union(self, other, sort=None): return union_result def _intersection(self, other, sort=None): - intersection_result = self.unique().join(other.unique(), how="inner") + other_unique = other.unique() + other_unique.names = self.names + intersection_result = cudf.core.index._index_from_data( + cudf.DataFrame._from_data(self.unique()._data) + ._merge( + cudf.DataFrame._from_data(other_unique._data), + how="inner", + on=self.name, + ) + ._data + ) if sort is None and len(other): return intersection_result.sort_values() @@ -1141,6 +1163,9 @@ def join( (1, 2)], names=['a', 'b']) """ + warnings.warn( + "Index.join is deprecated and will be removed", FutureWarning + ) if isinstance(self, cudf.MultiIndex) and isinstance( other, cudf.MultiIndex