Skip to content

Commit

Permalink
Reduce/clean copy usage in Series, reshaping (#16080)
Browse files Browse the repository at this point in the history
* Clean up copy usages in `concat`
* Avoid always shallow copying in `unstack`
* Don't make an extra copy of pandas objects in the Series constructor

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #16080
  • Loading branch information
mroeschke authored Jun 26, 2024
1 parent 65b64f6 commit f1efa40
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 42 deletions.
62 changes: 21 additions & 41 deletions python/cudf/cudf/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,51 +300,31 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
obj = objs[0]
if ignore_index:
if axis == 1:
result = cudf.DataFrame._from_data(
data=obj._data.copy(deep=True),
index=obj.index.copy(deep=True),
)
# The DataFrame constructor for dict-like data (such as the
# ColumnAccessor given by obj._data here) will drop any columns
# in the data that are not in `columns`, so we have to rename
# after construction.
result.columns = pd.RangeIndex(len(obj._data.names))
else:
if isinstance(obj, cudf.Series):
result = cudf.Series._from_data(
data=obj._data.copy(deep=True),
index=cudf.RangeIndex(len(obj)),
)
elif isinstance(obj, pd.Series):
result = cudf.Series(
data=obj,
index=cudf.RangeIndex(len(obj)),
)
result = obj.to_frame()
else:
result = cudf.DataFrame._from_data(
data=obj._data.copy(deep=True),
index=cudf.RangeIndex(len(obj)),
)
result = obj.copy(deep=True)
result.columns = pd.RangeIndex(len(result._data))
else:
result = type(obj)._from_data(
data=obj._data.copy(deep=True),
index=cudf.RangeIndex(len(obj)),
)
elif axis == 0:
result = obj.copy(deep=True)
else:
if axis == 0:
result = obj.copy()
if isinstance(obj, cudf.Series):
result = obj.to_frame()
else:
data = obj._data.copy(deep=True)
if isinstance(obj, cudf.Series) and obj.name is None:
# If the Series has no name, pandas renames it to 0.
data[0] = data.pop(None)
result = cudf.DataFrame._from_data(
data, index=obj.index.copy(deep=True)
result = obj.copy(deep=True)
if keys is not None and isinstance(result, cudf.DataFrame):
k = keys[0]
result.columns = cudf.MultiIndex.from_tuples(
[
(k, *c) if isinstance(c, tuple) else (k, c)
for c in result._column_names
]
)
if keys is not None:
if isinstance(result, cudf.DataFrame):
k = keys[0]
result.columns = cudf.MultiIndex.from_tuples(
[
(k, *c) if isinstance(c, tuple) else (k, c)
for c in result._column_names
]
)

if isinstance(result, cudf.Series) and axis == 0:
# sort has no effect for series concatted along axis 0
Expand Down Expand Up @@ -1179,7 +1159,6 @@ def unstack(df, level, fill_value=None):
if pd.api.types.is_list_like(level):
if not level:
return df
df = df.copy(deep=False)
if not isinstance(df.index, cudf.MultiIndex):
dtype = df._columns[0].dtype
for col in df._columns:
Expand All @@ -1195,6 +1174,7 @@ def unstack(df, level, fill_value=None):
)
return res
else:
df = df.copy(deep=False)
columns = df.index._poplevels(level)
index = df.index
result = _pivot(df, index, columns)
Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -584,7 +584,7 @@ def __init__(
data = {}

if isinstance(data, (pd.Series, pd.Index, BaseIndex, Series)):
if copy:
if copy and not isinstance(data, (pd.Series, pd.Index)):
data = data.copy(deep=True)
name_from_data = data.name
column = as_column(data, nan_as_null=nan_as_null, dtype=dtype)
Expand Down Expand Up @@ -3434,13 +3434,15 @@ def rename(self, index=None, copy=True):
@_cudf_nvtx_annotate
def add_prefix(self, prefix):
    """Return a new Series whose index is ``prefix + index-as-strings``.

    The underlying data is deep-copied, and the new index is built by
    casting the current index to ``str`` and adding ``prefix`` on the
    left-hand side.

    Parameters
    ----------
    prefix
        Value added in front of the stringified index.

    Returns
    -------
    Series
        A new Series constructed via ``Series._from_data``.
    """
    # TODO: Change to deep=False when copy-on-write is default
    copied_data = self._data.copy(deep=True)
    prefixed_index = prefix + self.index.astype(str)
    return Series._from_data(data=copied_data, index=prefixed_index)

@_cudf_nvtx_annotate
def add_suffix(self, suffix):
    """Return a new Series whose index is ``index-as-strings + suffix``.

    The underlying data is deep-copied, and the new index is built by
    casting the current index to ``str`` and adding ``suffix`` on the
    right-hand side.

    Parameters
    ----------
    suffix
        Value added after the stringified index.

    Returns
    -------
    Series
        A new Series constructed via ``Series._from_data``.
    """
    # TODO: Change to deep=False when copy-on-write is default
    copied_data = self._data.copy(deep=True)
    suffixed_index = self.index.astype(str) + suffix
    return Series._from_data(data=copied_data, index=suffixed_index)
Expand Down

0 comments on commit f1efa40

Please sign in to comment.