Skip to content

Commit

Permalink
Reduce deep copies in Index ops (#16054)
Browse files Browse the repository at this point in the history
1. Changed `Index.rename(inplace=False)` to return a shallow copy, which matches pandas behavior. Let me know if there's a reason why we should deep copy here.
2. Made `RangeIndex.unique` return a shallow copy like pandas.
3. Made `Index.dropna` return a shallow copy, like pandas, when the index contains no NAs.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #16054
  • Loading branch information
mroeschke authored Jun 18, 2024
1 parent 231cb71 commit fc4b3d3
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 7 deletions.
6 changes: 3 additions & 3 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1120,7 +1120,7 @@ def difference(self, other, sort=None):
res_name = _get_result_name(self.name, other.name)

if is_mixed_with_object_dtype(self, other) or len(other) == 0:
difference = self.copy().unique()
difference = self.unique()
difference.name = res_name
if sort is True:
return difference.sort_values()
Expand Down Expand Up @@ -1744,7 +1744,7 @@ def rename(self, name, inplace=False):
self.name = name
return None
else:
out = self.copy(deep=True)
out = self.copy(deep=False)
out.name = name
return out

Expand Down Expand Up @@ -2068,7 +2068,7 @@ def dropna(self, how="any"):
raise ValueError(f"{how=} must be 'any' or 'all'")
try:
if not self.hasnans:
return self.copy()
return self.copy(deep=False)
except NotImplementedError:
pass
# This is to be consistent with IndexedFrame.dropna to handle nans
Expand Down
5 changes: 3 additions & 2 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,7 +528,7 @@ def memory_usage(self, deep: bool = False) -> int:

def unique(self) -> Self:
# RangeIndex always has unique values
return self
return self.copy()

@_cudf_nvtx_annotate
def __mul__(self, other):
Expand Down Expand Up @@ -3197,7 +3197,8 @@ def _get_nearest_indexer(
)
right_indexer = _get_indexer_basic(
index=index,
positions=positions.copy(deep=True),
# positions no longer used so don't copy
positions=positions,
method="backfill",
target_col=target_col,
tolerance=tolerance,
Expand Down
25 changes: 23 additions & 2 deletions python/cudf/cudf/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,10 +252,10 @@ def test_index_rename_inplace():
pds = pd.Index([1, 2, 3], name="asdf")
gds = Index(pds)

# inplace=False should yield a deep copy
# inplace=False should yield a shallow copy
gds_renamed_deep = gds.rename("new_name", inplace=False)

assert gds_renamed_deep._values.data_ptr != gds._values.data_ptr
assert gds_renamed_deep._values.data_ptr == gds._values.data_ptr

# inplace=True returns none
expected_ptr = gds._values.data_ptr
Expand Down Expand Up @@ -3214,6 +3214,27 @@ def test_rangeindex_dropna():
assert_eq(result, expected)


def test_rangeindex_unique_shallow_copy():
    """Check that ``RangeIndex.unique`` returns a new object, not the caller.

    The pandas half documents the reference behavior; the cudf half checks
    the same identity property and that the result still compares equal to
    the index it came from.
    """
    # Reference behavior: pandas hands back a distinct object.
    pd_index = pd.RangeIndex(1)
    pd_unique = pd_index.unique()
    assert pd_unique is not pd_index

    # cudf must likewise return a distinct object with the same contents.
    gpu_index = cudf.RangeIndex(1)
    gpu_unique = gpu_index.unique()
    assert gpu_unique is not gpu_index
    assert_eq(gpu_unique, gpu_index)


def test_rename_shallow_copy():
    """Check that ``rename`` reuses the underlying data instead of copying.

    For pandas, the arrays returned by ``to_numpy(copy=False)`` on the source
    and on the renamed index must be the same object. For cudf, the renamed
    index must hold the very same ``_column`` object as the source index.
    """
    # Reference behavior from pandas.
    pd_source = pd.Index([1])
    pd_renamed = pd_source.rename("a")
    assert pd_source.to_numpy(copy=False) is pd_renamed.to_numpy(copy=False)

    # cudf: identity of the backing column, not just equality of values.
    gpu_source = cudf.Index([1])
    gpu_renamed = gpu_source.rename("a")
    assert gpu_source._column is gpu_renamed._column


@pytest.mark.parametrize("data", [range(2), [10, 11, 12]])
def test_index_contains_hashable(data):
gidx = cudf.Index(data)
Expand Down

0 comments on commit fc4b3d3

Please sign in to comment.