From 3cacaeeeaf608e761b1a7b1104ad551b3e20aa5b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 17 Jun 2024 19:37:55 -0700 Subject: [PATCH 1/3] Reduce copies in dataframe --- python/cudf/cudf/core/column_accessor.py | 2 +- python/cudf/cudf/core/dataframe.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 1bf9a393566..b5adcbfecc3 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -472,7 +472,7 @@ def swaplevel(self, i=-2, j=-1): new_keys[n][i], new_keys[n][j] = row[j], row[i] new_dict.update({row: tuple(new_keys[n])}) - new_data = {new_dict[k]: v.copy(deep=True) for k, v in self.items()} + new_data = {new_dict[k]: v.copy(deep=False) for k, v in self.items()} # swap level_names for i and j new_names = list(self.level_names) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 065b13561ab..c7e7cd8fa9a 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1121,8 +1121,6 @@ def _from_data( @staticmethod @_cudf_nvtx_annotate def _align_input_series_indices(data, index): - data = data.copy() - input_series = [ Series(val) for val in data.values() @@ -1142,6 +1140,7 @@ def _align_input_series_indices(data, index): ) index = aligned_input_series[0].index + data = data.copy() for name, val in data.items(): if isinstance(val, (pd.Series, Series, dict)): data[name] = aligned_input_series.pop(0) @@ -3590,7 +3589,7 @@ def rename( @_cudf_nvtx_annotate def add_prefix(self, prefix): - out = self.copy(deep=True) + out = self.copy(deep=False) out.columns = [ prefix + col_name for col_name in list(self._data.keys()) ] @@ -3598,7 +3597,7 @@ def add_prefix(self, prefix): @_cudf_nvtx_annotate def add_suffix(self, suffix): - out = self.copy(deep=True) + out = self.copy(deep=False) out.columns = [ col_name + suffix for col_name in list(self._data.keys()) ] @@ -3930,7 +3929,7 @@ def swaplevel(self, i=-2, j=-1, axis=0): weight 1.0 0.8 length 0.3 0.2 """ - result = self.copy() + result = self.copy(deep=False) # To get axis number axis = self._get_axis_from_axis_arg(axis) @@ -4001,7 +4000,7 @@ def transpose(self): # Set the old column names as the new index result = self.__class__._from_data( - {i: col for i, col in enumerate(result_columns)}, + ColumnAccessor(dict(enumerate(result_columns)), verify=False), index=as_index(index), ) # Set the old index as the new column names @@ -5502,7 +5501,7 @@ def to_arrow(self, preserve_index=None): b: [[4,5,6]] """ - data = self.copy(deep=False) + data = self index_descr = [] write_index = preserve_index is not False keep_range_index = write_index and preserve_index is None @@ -5530,6 +5529,7 @@ def to_arrow(self, preserve_index=None): index_descr = ( index.names if index.name is not None else ("index",) ) + data = data.copy(deep=False) for gen_name, col_name in zip(index_descr, index._data.names): data._insert( data.shape[1], From b4c1d6823503e3a49f6491b00d226eccd322637c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 18 Jun 2024 14:54:08 -0700 Subject: [PATCH 2/3] Reduce more copies --- python/cudf/cudf/core/column_accessor.py | 3 +- python/cudf/cudf/core/dataframe.py | 36 +++++++++++++----------- python/cudf/cudf/tests/test_dataframe.py | 8 ++++++ 3 files changed, 29 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index b5adcbfecc3..0ff4a9afd93 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -669,10 +669,11 @@ def rename_column(x): raise ValueError("Duplicate column names are not allowed") data = dict(zip(new_col_names, self.values())) - return self.__class__( + return type(self)( data=data, level_names=self.level_names, multiindex=self.multiindex, + label_dtype=self.label_dtype, verify=False, ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 37267c8a1ea..e96ca9659e8 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2968,7 +2968,7 @@ def set_index( idx = MultiIndex._from_data(dict(enumerate(data_to_add))) idx.names = names - df = self if inplace else self.copy(deep=True) + df = self if inplace else self.copy(deep=False) if verify_integrity and not idx.is_unique: raise ValueError(f"Values in Index are not unique: {idx}") @@ -3538,6 +3538,9 @@ def rename( mapper if columns is None and axis in (1, "columns") else columns ) + result = self if inplace else self.copy(deep=copy) + + out_index = None if index: if ( any(isinstance(item, str) for item in index.values()) @@ -3559,33 +3562,32 @@ def rename( ) out_index._data[level] = column.as_column(level_values) out_index._compute_levels_and_codes() - out = DataFrame(index=out_index) else: to_replace = list(index.keys()) vals = list(index.values()) is_all_na = vals.count(None) == len(vals) try: - index_data = { - name: col.find_and_replace(to_replace, vals, is_all_na) - for name, col in self.index._data.items() - } + out_index = _index_from_data( + { + name: col.find_and_replace( + to_replace, vals, is_all_na + ) + for name, col in self.index._data.items() + } + ) except OverflowError: - index_data = self.index._data.copy(deep=True) + pass - out = DataFrame(index=_index_from_data(index_data)) - else: - out = DataFrame(index=self.index) + if out_index is not None: + result.index = out_index if columns: - out._data = self._data.rename_levels(mapper=columns, level=level) - else: - out._data = self._data.copy(deep=copy) + result._data = result._data.rename_levels( + mapper=columns, level=level + ) - if inplace: - self._data = out._data - else: - return out.copy(deep=copy) + return result @_cudf_nvtx_annotate def add_prefix(self, prefix): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 3661e13bd39..5bc61e90336 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10020,6 +10020,14 @@ def test_dataframe_rename_duplicate_column(): gdf.rename(columns={"a": "b"}, inplace=True) +def test_dataframe_rename_columns_keep_type(): + gdf = cudf.DataFrame([[1, 2, 3]]) + gdf.columns = cudf.Index([4, 5, 6], dtype=np.int8) + result = gdf.rename({4: 50}, axis="columns").columns + expected = pd.Index([50, 5, 6], dtype=np.int8) + assert_eq(result, expected) + + @pytest_unmark_spilling @pytest.mark.skipif( PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, From 146e5783cc3d619872331cb396f1d39ef982c866 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 25 Jun 2024 17:15:16 -0700 Subject: [PATCH 3/3] Revert some deep copy changes --- python/cudf/cudf/core/column_accessor.py | 3 ++- python/cudf/cudf/core/dataframe.py | 12 ++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 0ff4a9afd93..f30a557efb0 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -472,7 +472,8 @@ def swaplevel(self, i=-2, j=-1): new_keys[n][i], new_keys[n][j] = row[j], row[i] new_dict.update({row: tuple(new_keys[n])}) - new_data = {new_dict[k]: v.copy(deep=False) for k, v in self.items()} + # TODO: Change to deep=False when copy-on-write is default + new_data = {new_dict[k]: v.copy(deep=True) for k, v in self.items()} # swap level_names for i and j new_names = list(self.level_names) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a5a56ce4b8f..f7f5ef792d6 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2968,7 +2968,8 @@ def set_index( idx = MultiIndex._from_data(dict(enumerate(data_to_add))) idx.names = names - df = self if inplace else self.copy(deep=False) + # TODO: Change to deep=False when copy-on-write is default + df = self if inplace else self.copy(deep=True) if verify_integrity and not idx.is_unique: raise ValueError(f"Values in Index are not unique: {idx}") @@ -3617,7 +3618,8 @@ def rename( @_cudf_nvtx_annotate def add_prefix(self, prefix): - out = self.copy(deep=False) + # TODO: Change to deep=False when copy-on-write is default + out = self.copy(deep=True) out.columns = [ prefix + col_name for col_name in list(self._data.keys()) ] @@ -3625,7 +3627,8 @@ def add_prefix(self, prefix): @_cudf_nvtx_annotate def add_suffix(self, suffix): - out = self.copy(deep=False) + # TODO: Change to deep=False when copy-on-write is default + out = self.copy(deep=True) out.columns = [ col_name + suffix for col_name in list(self._data.keys()) ] @@ -3957,7 +3960,8 @@ def swaplevel(self, i=-2, j=-1, axis=0): weight 1.0 0.8 length 0.3 0.2 """ - result = self.copy(deep=False) + # TODO: Change to deep=False when copy-on-write is default + result = self.copy(deep=True) # To get axis number axis = self._get_axis_from_axis_arg(axis)