From c031381c1ee7c8d56ab80640cc83ac6285f9a78e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 17 Jun 2024 16:31:14 -0700 Subject: [PATCH 1/2] Use more shallow copies in indexing operations --- python/cudf/cudf/core/_base_index.py | 6 +++--- python/cudf/cudf/core/index.py | 2 +- python/cudf/cudf/tests/test_index.py | 21 +++++++++++++++++++++ 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index e71e45e410e..8d055ea68a1 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1122,7 +1122,7 @@ def difference(self, other, sort=None): res_name = _get_result_name(self.name, other.name) if is_mixed_with_object_dtype(self, other) or len(other) == 0: - difference = self.copy().unique() + difference = self.unique() difference.name = res_name if sort is True: return difference.sort_values() @@ -1746,7 +1746,7 @@ def rename(self, name, inplace=False): self.name = name return None else: - out = self.copy(deep=True) + out = self.copy(deep=False) out.name = name return out @@ -2070,7 +2070,7 @@ def dropna(self, how="any"): raise ValueError(f"{how=} must be 'any' or 'all'") try: if not self.hasnans: - return self.copy() + return self.copy(deep=False) except NotImplementedError: pass # This is to be consistent with IndexedFrame.dropna to handle nans diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index df21d392311..dca5c4557e0 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -526,7 +526,7 @@ def memory_usage(self, deep: bool = False) -> int: def unique(self) -> Self: # RangeIndex always has unique values - return self + return self.copy() @_cudf_nvtx_annotate def __mul__(self, other): diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 3d6c71ebc1b..5b3c434b9a4 100644 --- 
a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -3214,6 +3214,27 @@ def test_rangeindex_dropna(): assert_eq(result, expected) +def test_rangeindex_unique_shallow_copy(): + ri_pandas = pd.RangeIndex(1) + result = ri_pandas.unique() + assert result is not ri_pandas + + ri_cudf = cudf.RangeIndex(1) + result = ri_cudf.unique() + assert result is not ri_cudf + assert_eq(result, ri_cudf) + + +def test_rename_shallow_copy(): + idx = pd.Index([1]) + result = idx.rename("a") + assert idx.to_numpy(copy=False) is result.to_numpy(copy=False) + + idx = cudf.Index([1]) + result = idx.rename("a") + assert idx._column is result._column + + @pytest.mark.parametrize("data", [range(2), [10, 11, 12]]) def test_index_contains_hashable(data): gidx = cudf.Index(data) From 1f72292444e030e9835a94d17cd72eba263f2631 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 17 Jun 2024 17:00:53 -0700 Subject: [PATCH 2/2] Fix other test and avoid a copy in get_indexer --- python/cudf/cudf/core/index.py | 3 ++- python/cudf/cudf/tests/test_index.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index dca5c4557e0..9554b39b2f5 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -3206,7 +3206,8 @@ def _get_nearest_indexer( ) right_indexer = _get_indexer_basic( index=index, - positions=positions.copy(deep=True), + # positions no longer used so don't copy + positions=positions, method="backfill", target_col=target_col, tolerance=tolerance, diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 5b3c434b9a4..a59836df5ba 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -252,10 +252,10 @@ def test_index_rename_inplace(): pds = pd.Index([1, 2, 3], name="asdf") gds = Index(pds) - # inplace=False should yield a deep copy + # 
inplace=False should yield a shallow copy gds_renamed_deep = gds.rename("new_name", inplace=False) - assert gds_renamed_deep._values.data_ptr != gds._values.data_ptr + assert gds_renamed_deep._values.data_ptr == gds._values.data_ptr # inplace=True returns none expected_ptr = gds._values.data_ptr