From fc4b3d3ecbf95ee9afdcd509554bbeb5367a3059 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 18 Jun 2024 09:02:05 -1000 Subject: [PATCH] Reduce deep copies in Index ops (#16054) 1. Changed `Index.rename(inplace=False)` to return a shallow copy, which matches pandas behavior. Let me know if there's a reason why we should deep copy here. 2. Made `RangeIndex.unique` return a shallow copy like pandas. 3. Made `Index.dropna` return a shallow copy when there are no NAs, like pandas. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16054 --- python/cudf/cudf/core/_base_index.py | 6 +++--- python/cudf/cudf/core/index.py | 5 +++-- python/cudf/cudf/tests/test_index.py | 25 +++++++++++++++++++++++-- 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index ad73cd57f7d..caf07b286cd 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1120,7 +1120,7 @@ def difference(self, other, sort=None): res_name = _get_result_name(self.name, other.name) if is_mixed_with_object_dtype(self, other) or len(other) == 0: - difference = self.copy().unique() + difference = self.unique() difference.name = res_name if sort is True: return difference.sort_values() @@ -1744,7 +1744,7 @@ def rename(self, name, inplace=False): self.name = name return None else: - out = self.copy(deep=True) + out = self.copy(deep=False) out.name = name return out @@ -2068,7 +2068,7 @@ def dropna(self, how="any"): raise ValueError(f"{how=} must be 'any' or 'all'") try: if not self.hasnans: - return self.copy() + return self.copy(deep=False) except NotImplementedError: pass # This is to be consistent with IndexedFrame.dropna to handle nans diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 1c5d05d2d87..71658695b80 
100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -528,7 +528,7 @@ def memory_usage(self, deep: bool = False) -> int: def unique(self) -> Self: # RangeIndex always has unique values - return self + return self.copy() @_cudf_nvtx_annotate def __mul__(self, other): @@ -3197,7 +3197,8 @@ def _get_nearest_indexer( ) right_indexer = _get_indexer_basic( index=index, - positions=positions.copy(deep=True), + # positions no longer used so don't copy + positions=positions, method="backfill", target_col=target_col, tolerance=tolerance, diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 3d6c71ebc1b..a59836df5ba 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -252,10 +252,10 @@ def test_index_rename_inplace(): pds = pd.Index([1, 2, 3], name="asdf") gds = Index(pds) - # inplace=False should yield a deep copy + # inplace=False should yield a shallow copy gds_renamed_deep = gds.rename("new_name", inplace=False) - assert gds_renamed_deep._values.data_ptr != gds._values.data_ptr + assert gds_renamed_deep._values.data_ptr == gds._values.data_ptr # inplace=True returns none expected_ptr = gds._values.data_ptr @@ -3214,6 +3214,27 @@ def test_rangeindex_dropna(): assert_eq(result, expected) +def test_rangeindex_unique_shallow_copy(): + ri_pandas = pd.RangeIndex(1) + result = ri_pandas.unique() + assert result is not ri_pandas + + ri_cudf = cudf.RangeIndex(1) + result = ri_cudf.unique() + assert result is not ri_cudf + assert_eq(result, ri_cudf) + + +def test_rename_shallow_copy(): + idx = pd.Index([1]) + result = idx.rename("a") + assert idx.to_numpy(copy=False) is result.to_numpy(copy=False) + + idx = cudf.Index([1]) + result = idx.rename("a") + assert idx._column is result._column + + @pytest.mark.parametrize("data", [range(2), [10, 11, 12]]) def test_index_contains_hashable(data): gidx = cudf.Index(data)