From cb45755da37ebeeec8ef3d7fd722679519a4ce20 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 4 Aug 2021 15:47:40 -0700 Subject: [PATCH 1/7] Replace _copy_construct with _from_data in _binaryop. --- python/cudf/cudf/core/dataframe.py | 4 ++-- python/cudf/cudf/core/frame.py | 31 +++++++++++++++--------------- python/cudf/cudf/core/index.py | 14 ++++++++------ python/cudf/cudf/core/series.py | 9 +++++++-- 4 files changed, 33 insertions(+), 25 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index bc068413efb..c2f5c54ec3d 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -10,7 +10,7 @@ import warnings from collections import defaultdict from collections.abc import Iterable, Sequence -from typing import Any, Mapping, Optional, TypeVar +from typing import Any, MutableMapping, Optional, TypeVar import cupy import numpy as np @@ -458,7 +458,7 @@ def _init_from_dict_like(self, data, index=None, columns=None): @classmethod def _from_data( cls, - data: Mapping, + data: MutableMapping, index: Optional[BaseIndex] = None, columns: Any = None, ) -> DataFrame: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 14b8ebe801f..5afc228baf5 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6,7 +6,15 @@ import functools import warnings from collections import abc -from typing import Any, Dict, Mapping, Optional, Tuple, TypeVar, Union +from typing import ( + Any, + Dict, + MutableMapping, + Optional, + Tuple, + TypeVar, + Union, +) import cupy import numpy as np @@ -65,7 +73,9 @@ def __init_subclass__(cls): @classmethod def _from_data( - cls, data: Mapping, index: Optional[cudf.core.index.BaseIndex] = None, + cls, + data: MutableMapping, + index: Optional[cudf.core.index.BaseIndex] = None, ): obj = cls.__new__(cls) libcudf.table.Table.__init__(obj, data, index) @@ -3615,7 +3625,7 @@ class SingleColumnFrame(Frame): @classmethod def _from_data( cls, - data: Mapping, + data: MutableMapping, index: Optional[cudf.core.index.BaseIndex] = None, name: Any = None, ): @@ -3899,16 +3909,6 @@ def factorize(self, na_sentinel=-1): """ return cudf.core.algorithms.factorize(self, na_sentinel=na_sentinel) - @property - def _copy_construct_defaults(self): - """A default dictionary of kwargs to be used for copy construction.""" - raise NotImplementedError - - def _copy_construct(self, **kwargs): - """Shallow copy this object by replacing certain ctor args. - """ - return self.__class__(**{**self._copy_construct_defaults, **kwargs}) - def _binaryop( self, other: T, @@ -3967,8 +3967,9 @@ def _binaryop( result_name: (self._column, other, reflect, fill_value) } - return self._copy_construct( - data=type(self)._colwise_binop(operands, fn)[result_name], + return self._from_data( + data=type(self)._colwise_binop(operands, fn), + index=self._index, name=result_name, ) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 97ee0948209..6f9eff3d719 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -4,7 +4,7 @@ import pickle from numbers import Number -from typing import Any, Dict, Optional, Tuple, Type, Union +from typing import Any, Dict, MutableMapping, Optional, Tuple, Type, Union import cupy import numpy as np @@ -1362,12 +1362,14 @@ def from_pandas(cls, index, nan_as_null=None): ind.name = index.name return ind - @property - def _copy_construct_defaults(self): - return {"data": self._column, "name": self.name} - @classmethod - def _from_data(cls, data, index=None): + def _from_data( + cls, + data: MutableMapping, + index: Optional[BaseIndex] = None, + name: Any = None, + ) -> BaseIndex: + assert index is None if not isinstance(data, cudf.core.column_accessor.ColumnAccessor): data = cudf.core.column_accessor.ColumnAccessor(data) if len(data) == 0: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 413fcefc2bc..0ff42979b93 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -7,7 +7,7 @@ from collections import abc as abc from numbers import Number from shutil import get_terminal_size -from typing import Any, Mapping, Optional +from typing import Any, MutableMapping, Optional from uuid import uuid4 import cupy @@ -269,7 +269,7 @@ def __init__( @classmethod def _from_data( cls, - data: Mapping, + data: MutableMapping, index: Optional[BaseIndex] = None, name: Any = None, ) -> Series: @@ -386,6 +386,11 @@ def deserialize(cls, header, frames): def _copy_construct_defaults(self): return {"data": self._column, "index": self._index, "name": self.name} + def _copy_construct(self, **kwargs): + """Shallow copy this object by replacing certain ctor args. + """ + return self.__class__(**{**self._copy_construct_defaults, **kwargs}) + def _get_columns_by_label(self, labels, downcast=False): """Return the column specified by `labels` From 2f9025efa3207fd627dc74abcc95330f24ac20cf Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 4 Aug 2021 15:49:18 -0700 Subject: [PATCH 2/7] Remove leftover _copy_construct method in Index. --- python/cudf/cudf/core/index.py | 33 --------------------------------- 1 file changed, 33 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 6f9eff3d719..d5f7d1277c8 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -724,39 +724,6 @@ def difference(self, other, sort=None): return difference - def _copy_construct(self, **kwargs): - # Need to override the parent behavior because pandas allows operations - # on unsigned types to return signed values, forcing us to choose the - # right index type here. - data = kwargs.get("data") - cls = self.__class__ - - if data is not None: - if self.dtype != data.dtype: - # TODO: This logic is largely copied from `as_index`. The two - # should be unified via a centralized type dispatching scheme. - if isinstance(data, NumericalColumn): - try: - cls = _dtype_to_index[data.dtype.type] - except KeyError: - cls = GenericIndex - elif isinstance(data, StringColumn): - cls = StringIndex - elif isinstance(data, DatetimeColumn): - cls = DatetimeIndex - elif isinstance(data, TimeDeltaColumn): - cls = TimedeltaIndex - elif isinstance(data, CategoricalColumn): - cls = CategoricalIndex - elif cls is RangeIndex: - # RangeIndex must convert to other numerical types for ops - try: - cls = _dtype_to_index[data.dtype.type] - except KeyError: - cls = GenericIndex - - return cls(**{**self._copy_construct_defaults, **kwargs}) - def sort_values(self, return_indexer=False, ascending=True, key=None): """ Return a sorted copy of the index, and optionally return the indices From 5e53cfaa0aaf1ed5e13332363a65e9f40e66d875 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 4 Aug 2021 16:32:56 -0700 Subject: [PATCH 3/7] Remove all uses of _copy_construct in Series. --- python/cudf/cudf/core/series.py | 44 +++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 0ff42979b93..58bf01d9a1e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -703,7 +703,7 @@ def reset_index(self, drop=False, inplace=False): if inplace is True: self._index = RangeIndex(len(self)) else: - return self._copy_construct(index=RangeIndex(len(self))) + return self._from_data(self._data, index=RangeIndex(len(self))) def set_index(self, index): """Returns a new Series with a different index. @@ -738,7 +738,7 @@ def set_index(self, index): dtype: int64 """ index = index if isinstance(index, BaseIndex) else as_index(index) - return self._copy_construct(index=index) + return self._from_data(self._data, index, self.name) def as_index(self): """Returns a new Series with a RangeIndex. @@ -850,8 +850,9 @@ def set_mask(self, mask, null_count=None): 4 5 dtype: int64 """ - col = self._column.set_mask(mask) - return self._copy_construct(data=col) + return self._from_data( + {self.name: self._column.set_mask(mask)}, self._index + ) def __sizeof__(self): return self._column.__sizeof__() + self._index.__sizeof__() @@ -1092,8 +1093,9 @@ def take(self, indices, keep_index=True): return self.iloc[indices] else: col_inds = as_column(indices) - data = self._column.take(col_inds, keep_index=False) - return self._copy_construct(data=data, index=None) + return self._from_data( + {self.name: self._column.take(col_inds, keep_index=False)} + ) def head(self, n=5): """ @@ -2722,8 +2724,9 @@ def nans_to_nulls(self): 4 10.0 dtype: float64 """ - result_col = self._column.nans_to_nulls() - return self._copy_construct(data=result_col) + return self._from_data( + {self.name: self._column.nans_to_nulls()}, self._index + ) def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): """ @@ -3100,9 +3103,11 @@ def astype(self, dtype, copy=False, errors="raise"): return self.copy(deep=copy) try: data = self._column.astype(dtype) + if copy: + data = data.copy(deep=True) - return self._copy_construct( - data=data.copy(deep=True) if copy else data, index=self.index + return self._from_data( + {self.name: data.copy(deep=True)}, index=self._index ) except Exception as e: @@ -3416,8 +3421,8 @@ def _sort(self, ascending=True, na_position="last"): col_keys, col_inds = self._column.sort_by_values( ascending=ascending, na_position=na_position ) - sr_keys = self._copy_construct(data=col_keys) - sr_inds = self._copy_construct(data=col_inds) + sr_keys = self._from_data({self.name: col_keys}, self._index) + sr_inds = self._from_data({self.name: col_inds}, self._index) return sr_keys, sr_inds def replace( @@ -3720,9 +3725,9 @@ def reverse(self): dtype: int64 """ rinds = column.arange((self._column.size - 1), -1, -1, dtype=np.int32) - col = self._column[rinds] - index = self.index._values[rinds] - return self._copy_construct(data=col, index=index) + return self._from_data( + {self.name: self._column[rinds]}, self.index._values[rinds] + ) def one_hot_encoding(self, cats, dtype="float64"): """Perform one-hot-encoding @@ -3876,7 +3881,9 @@ def _return_sentinel_series(): codes = codes.merge(value, on="value", how="left") codes = codes.sort_values("order")["code"].fillna(na_sentinel) - return codes._copy_construct(name=None, index=self.index) + codes.name = None + codes.index = self._index + return codes # UDF related @@ -3990,7 +3997,7 @@ def applymap(self, udf, out_dtype=None): """ if not callable(udf): raise ValueError("Input UDF must be a callable object.") - return self._copy_construct(data=self._unaryop(udf)) + return self._from_data({self.name: self._unaryop(udf)}, self._index) # # Stats @@ -5255,7 +5262,8 @@ def scale(self): vmin = self.min() vmax = self.max() scaled = (self - vmin) / (vmax - vmin) - return self._copy_construct(data=scaled) + scaled._index = self._index.copy(deep=False) + return scaled # Absolute def abs(self): From 71db785cc40fc21bedff96f92aeffade07755752 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 4 Aug 2021 16:45:44 -0700 Subject: [PATCH 4/7] Remove all but one remaining use of _copy_construct. --- python/cudf/cudf/core/_internals/where.py | 2 +- python/cudf/cudf/core/indexing.py | 2 ++ python/cudf/cudf/core/multiindex.py | 3 +-- python/cudf/cudf/core/window/rolling.py | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 87dc1d8e01f..176d91ad478 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -378,6 +378,6 @@ def where( if isinstance(frame, Index): result = Index(result, name=frame.name) else: - result = frame._copy_construct(data=result) + result = frame._from_data({frame.name: result}, frame._index) return frame._mimic_inplace(result, inplace=inplace) diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index a4a69a4e084..e9af742a1f5 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -98,6 +98,8 @@ def __getitem__(self, arg): or _is_null_host_scalar(data) ): return data + # return self._sr._from_data( + # {self._sr.name: data}, self._sr.index.take(arg)) index = self._sr.index.take(arg) return self._sr._copy_construct(data=data, index=index) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 26a893a4676..ea9963e1115 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -775,8 +775,7 @@ def _compute_levels_and_codes(self): for name in self._source_data.columns: code, cats = self._source_data[name].factorize() codes[name] = code.astype(np.int64) - cats.name = None - cats = cudf.Series(cats)._copy_construct(name=None) + cats = cudf.Series(cats, name=None) levels.append(cats) self._levels = levels diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index d9a2fd89165..1ed808cfb77 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -215,7 +215,7 @@ def _apply_agg_series(self, sr, agg_name): self.center, agg_name, ) - return sr._copy_construct(data=result_col) + return sr._from_data({sr.name: result_col}, sr._index) def _apply_agg_dataframe(self, df, agg_name): result_df = cudf.DataFrame({}) From 6727fc4a12df311ce1a3e079b94801dcff670da9 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 4 Aug 2021 17:25:42 -0700 Subject: [PATCH 5/7] Fix last copy_construct issue. --- python/cudf/cudf/core/indexing.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index e9af742a1f5..09cfc6e144a 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -98,10 +98,9 @@ def __getitem__(self, arg): or _is_null_host_scalar(data) ): return data - # return self._sr._from_data( - # {self._sr.name: data}, self._sr.index.take(arg)) - index = self._sr.index.take(arg) - return self._sr._copy_construct(data=data, index=index) + return self._sr._from_data( + {self._sr.name: data}, index=cudf.Index(self._sr.index.take(arg)) + ) def __setitem__(self, key, value): from cudf.core.column import column From aecfb6b0137aaeaa7d951db34229e44cef1e9c75 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 4 Aug 2021 17:26:10 -0700 Subject: [PATCH 6/7] Remove _copy_construct. --- python/cudf/cudf/core/series.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 58bf01d9a1e..2b032189370 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -382,15 +382,6 @@ def deserialize(cls, header, frames): return Series(column, index=index, name=name) - @property - def _copy_construct_defaults(self): - return {"data": self._column, "index": self._index, "name": self.name} - - def _copy_construct(self, **kwargs): - """Shallow copy this object by replacing certain ctor args. - """ - return self.__class__(**{**self._copy_construct_defaults, **kwargs}) - def _get_columns_by_label(self, labels, downcast=False): """Return the column specified by `labels` From a031e8a26ce64981bc15ea4c91b96bfeb5b205ee Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 10 Aug 2021 09:43:32 -0700 Subject: [PATCH 7/7] Inline copy logic. --- python/cudf/cudf/core/series.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 5618ea3176d..f10dca3e610 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3008,11 +3008,10 @@ def astype(self, dtype, copy=False, errors="raise"): return self.copy(deep=copy) try: data = self._column.astype(dtype) - if copy: - data = data.copy(deep=True) return self._from_data( - {self.name: data.copy(deep=True)}, index=self._index + {self.name: (data.copy(deep=True) if copy else data)}, + index=self._index, ) except Exception as e: