Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce (shallow) copies in DataFrame ops #16060

Merged
merged 9 commits into from
Jun 26, 2024
4 changes: 3 additions & 1 deletion python/cudf/cudf/core/column_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,7 @@ def swaplevel(self, i=-2, j=-1):
new_keys[n][i], new_keys[n][j] = row[j], row[i]
new_dict.update({row: tuple(new_keys[n])})

# TODO: Change to deep=False when copy-on-write is default
new_data = {new_dict[k]: v.copy(deep=True) for k, v in self.items()}

# swap level_names for i and j
Expand Down Expand Up @@ -669,10 +670,11 @@ def rename_column(x):
raise ValueError("Duplicate column names are not allowed")

data = dict(zip(new_col_names, self.values()))
return self.__class__(
return type(self)(
data=data,
level_names=self.level_names,
multiindex=self.multiindex,
label_dtype=self.label_dtype,
verify=False,
)

Expand Down
48 changes: 27 additions & 21 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1121,8 +1121,6 @@ def _from_data(
@staticmethod
@_cudf_nvtx_annotate
def _align_input_series_indices(data, index):
data = data.copy()

input_series = [
Series(val)
for val in data.values()
Expand All @@ -1142,6 +1140,7 @@ def _align_input_series_indices(data, index):
)
index = aligned_input_series[0].index

data = data.copy()
for name, val in data.items():
if isinstance(val, (pd.Series, Series, dict)):
data[name] = aligned_input_series.pop(0)
Expand Down Expand Up @@ -2969,6 +2968,7 @@ def set_index(
idx = MultiIndex._from_data(dict(enumerate(data_to_add)))
idx.names = names

# TODO: Change to deep=False when copy-on-write is default
df = self if inplace else self.copy(deep=True)

if verify_integrity and not idx.is_unique:
Expand Down Expand Up @@ -3565,6 +3565,9 @@ def rename(
mapper if columns is None and axis in (1, "columns") else columns
)

result = self if inplace else self.copy(deep=copy)

out_index = None
if index:
if (
any(isinstance(item, str) for item in index.values())
Expand All @@ -3586,36 +3589,36 @@ def rename(
)
out_index._data[level] = column.as_column(level_values)
out_index._compute_levels_and_codes()
out = DataFrame(index=out_index)
else:
to_replace = list(index.keys())
vals = list(index.values())
is_all_na = vals.count(None) == len(vals)

try:
index_data = {
name: col.find_and_replace(to_replace, vals, is_all_na)
for name, col in self.index._data.items()
}
out_index = _index_from_data(
{
name: col.find_and_replace(
to_replace, vals, is_all_na
)
for name, col in self.index._data.items()
}
)
except OverflowError:
index_data = self.index._data.copy(deep=True)
pass

out = DataFrame(index=_index_from_data(index_data))
else:
out = DataFrame(index=self.index)
if out_index is not None:
result.index = out_index

if columns:
out._data = self._data.rename_levels(mapper=columns, level=level)
else:
out._data = self._data.copy(deep=copy)
result._data = result._data.rename_levels(
mapper=columns, level=level
)

if inplace:
self._data = out._data
else:
return out.copy(deep=copy)
return result

@_cudf_nvtx_annotate
def add_prefix(self, prefix):
# TODO: Change to deep=False when copy-on-write is default
out = self.copy(deep=True)
out.columns = [
prefix + col_name for col_name in list(self._data.keys())
Expand All @@ -3624,6 +3627,7 @@ def add_prefix(self, prefix):

@_cudf_nvtx_annotate
def add_suffix(self, suffix):
# TODO: Change to deep=False when copy-on-write is default
out = self.copy(deep=True)
out.columns = [
col_name + suffix for col_name in list(self._data.keys())
Expand Down Expand Up @@ -3956,7 +3960,8 @@ def swaplevel(self, i=-2, j=-1, axis=0):
weight 1.0 0.8
length 0.3 0.2
"""
result = self.copy()
# TODO: Change to deep=False when copy-on-write is default
result = self.copy(deep=True)

# To get axis number
axis = self._get_axis_from_axis_arg(axis)
Expand Down Expand Up @@ -4027,7 +4032,7 @@ def transpose(self):

# Set the old column names as the new index
result = self.__class__._from_data(
{i: col for i, col in enumerate(result_columns)},
ColumnAccessor(dict(enumerate(result_columns)), verify=False),
index=as_index(index),
)
# Set the old index as the new column names
Expand Down Expand Up @@ -5528,7 +5533,7 @@ def to_arrow(self, preserve_index=None):
b: [[4,5,6]]
"""

data = self.copy(deep=False)
data = self
index_descr = []
write_index = preserve_index is not False
keep_range_index = write_index and preserve_index is None
Expand Down Expand Up @@ -5556,6 +5561,7 @@ def to_arrow(self, preserve_index=None):
index_descr = (
index.names if index.name is not None else ("index",)
)
data = data.copy(deep=False)
for gen_name, col_name in zip(index_descr, index._data.names):
data._insert(
data.shape[1],
Expand Down
8 changes: 8 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10024,6 +10024,14 @@ def test_dataframe_rename_duplicate_column():
gdf.rename(columns={"a": "b"}, inplace=True)


def test_dataframe_rename_columns_keep_type():
    """Renaming a column label keeps the dtype of the column index.

    The frame's columns are set to an int8 index; after renaming label
    4 to 50 along the columns axis, the resulting column index is
    compared against an int8 pandas index holding the renamed labels.
    """
    frame = cudf.DataFrame([[1, 2, 3]])
    frame.columns = cudf.Index([4, 5, 6], dtype=np.int8)

    renamed = frame.rename({4: 50}, axis="columns")

    assert_eq(renamed.columns, pd.Index([50, 5, 6], dtype=np.int8))


@pytest_unmark_spilling
@pytest.mark.skipif(
PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
Expand Down
Loading