From 680217af0461a482c9eb5719319c19cd45495c4c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 4 Jun 2024 12:29:02 -0700 Subject: [PATCH 1/2] Clean Frame._dtype usage --- python/cudf/cudf/core/frame.py | 8 ++++---- python/cudf/cudf/core/groupby/groupby.py | 16 +++------------- python/cudf/cudf/core/indexed_frame.py | 6 +++--- python/cudf/cudf/io/csv.py | 5 ++--- python/cudf/cudf/io/json.py | 5 ++--- 5 files changed, 14 insertions(+), 26 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index d60c206ac24..f57ac389e68 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -79,12 +79,12 @@ def _num_rows(self) -> int: return self._data.nrows @property - def _column_names(self) -> Tuple[Any, ...]: # TODO: Tuple[str]? - return tuple(self._data.names) + def _column_names(self) -> Tuple[Any, ...]: + return self._data.names @property - def _columns(self) -> Tuple[Any, ...]: # TODO: Tuple[Column]? - return tuple(self._data.columns) + def _columns(self) -> Tuple[ColumnBase, ...]: + return self._data.columns @property def _dtypes(self): diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 3e7a1ee6026..755c9d04594 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -22,12 +22,7 @@ from cudf._lib.types import size_type_dtype from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType from cudf.api.extensions import no_default -from cudf.api.types import ( - is_bool_dtype, - is_float_dtype, - is_list_like, - is_numeric_dtype, -) +from cudf.api.types import is_bool_dtype, is_list_like, is_numeric_dtype from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable from cudf.core.column.column import ColumnBase, StructDtype, as_column @@ -335,12 +330,8 @@ def dtypes(self): FutureWarning, ) index = self.grouping.keys.unique().sort_values().to_pandas() - obj_dtypes = self.obj._dtypes return pd.DataFrame( - { - name: [obj_dtypes[name]] * len(index) - for name in self.obj._data.names - }, + {name: [dtype] * len(index) for name, dtype in self.obj._dtypes}, index=index, ) @@ -499,8 +490,7 @@ def rank( # treats NaNs the way we treat nulls. if cudf.get_option("mode.pandas_compatible"): if any( - is_float_dtype(typ) - for typ in self.grouping.values._dtypes.values() + col.dtype.kind == "f" for col in self.grouping.values._columns ): raise NotImplementedError( "NaNs are not supported in groupby.rank." diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index a31430e1571..3a54446a361 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -6319,9 +6319,9 @@ def __dask_tokenize__(self): type(self), str(self._dtypes), *[ - normalize_token(cat.categories) - for cat in self._dtypes.values() - if cat == "category" + normalize_token(col.dtype.categories) + for col in self._columns + if col.dtype == "category" ], normalize_token(self.index), normalize_token(self.hash_values().values_host), diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 3eeeac405b3..f07764e2ce4 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -132,10 +132,9 @@ def read_csv( # There exists some dtypes in the result columns that is inferred. # Find them and map them to the default dtypes. specified_dtypes = {} if dtype is None else dtype - df_dtypes = df._dtypes unspecified_dtypes = { - name: df_dtypes[name] - for name in df._column_names + name: dtype + for name, dtype in df._dtypes if name not in specified_dtypes } default_dtypes = {} diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index dd4a0d9eb07..fc3387d5117 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -147,10 +147,9 @@ def read_json( # There exists some dtypes in the result columns that is inferred. # Find them and map them to the default dtypes. specified_dtypes = {} if dtype is True else dtype - df_dtypes = df._dtypes unspecified_dtypes = { - name: df_dtypes[name] - for name in df._column_names + name: dtype + for name, dtype in df._dtypes if name not in specified_dtypes } default_dtypes = {} From 0c269b1a5fd3beda7c20beb6f0ced449f3e15966 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 4 Jun 2024 14:28:18 -0700 Subject: [PATCH 2/2] Make ._dtypes an iterator --- python/cudf/cudf/core/dataframe.py | 4 ++-- python/cudf/cudf/core/frame.py | 8 +++----- python/cudf/cudf/core/indexed_frame.py | 4 ++-- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index acfc2d781a7..98969eb9146 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1231,7 +1231,7 @@ def dtypes(self): string object dtype: object """ - return pd.Series(self._dtypes, dtype="object") + return pd.Series(dict(self._dtypes), dtype="object") @property def ndim(self) -> int: @@ -2836,7 +2836,7 @@ def reindex( return df._reindex( column_names=columns, - dtypes=self._dtypes, + dtypes=dict(self._dtypes), deep=copy, index=index, inplace=False, diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index f57ac389e68..f01781af03e 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -87,10 +87,8 @@ def _columns(self) -> Tuple[ColumnBase, ...]: return self._data.columns @property - def _dtypes(self): - return dict( - zip(self._data.names, (col.dtype for col in self._data.columns)) - ) + def _dtypes(self) -> abc.Iterator: + return zip(self._data.names, (col.dtype for col in self._data.columns)) @property def ndim(self) -> int: @@ -1980,7 +1978,7 @@ def __dask_tokenize__(self): return [ type(self), - str(self._dtypes), + str(dict(self._dtypes)), normalize_token(self.to_pandas()), ] diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 3a54446a361..1fd63c9aa2a 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -893,7 +893,7 @@ def replace( ) = _get_replacement_values_for_columns( to_replace=to_replace, value=value, - columns_dtype_map=self._dtypes, + columns_dtype_map=dict(self._dtypes), ) for name, col in self._data.items(): @@ -6317,7 +6317,7 @@ def __dask_tokenize__(self): return [ type(self), - str(self._dtypes), + str(dict(self._dtypes)), *[ normalize_token(col.dtype.categories) for col in self._columns