From 57aeeb78d85e169ac18b82f51d2b1cbd01b0608d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 5 Jun 2024 06:49:57 -1000 Subject: [PATCH] Make Frame._dtypes an iterator instead of a dict (#15920) A lot of the usages of `Frame._dtypes` didn't require the previous `dict` return type since that was just re-iterated over anyway. Also removed a redundant `tuple` call in `Frame._column_names` and `Frame._columns`. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15920 --- python/cudf/cudf/core/dataframe.py | 4 ++-- python/cudf/cudf/core/frame.py | 16 +++++++--------- python/cudf/cudf/core/groupby/groupby.py | 16 +++------------- python/cudf/cudf/core/indexed_frame.py | 10 +++++----- python/cudf/cudf/io/csv.py | 5 ++--- python/cudf/cudf/io/json.py | 5 ++--- 6 files changed, 21 insertions(+), 35 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index c8f1e872300..9307267b227 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1231,7 +1231,7 @@ def dtypes(self): string object dtype: object """ - return pd.Series(self._dtypes, dtype="object") + return pd.Series(dict(self._dtypes), dtype="object") @property def ndim(self) -> int: @@ -2834,7 +2834,7 @@ def reindex( return df._reindex( column_names=columns, - dtypes=self._dtypes, + dtypes=dict(self._dtypes), deep=copy, index=index, inplace=False, diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 7326696c994..af8886a44a6 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -79,18 +79,16 @@ def _num_rows(self) -> int: return self._data.nrows @property - def _column_names(self) -> Tuple[Any, ...]: # TODO: Tuple[str]? 
- return tuple(self._data.names) + def _column_names(self) -> Tuple[Any, ...]: + return self._data.names @property - def _columns(self) -> Tuple[Any, ...]: # TODO: Tuple[Column]? - return tuple(self._data.columns) + def _columns(self) -> Tuple[ColumnBase, ...]: + return self._data.columns @property - def _dtypes(self): - return dict( - zip(self._data.names, (col.dtype for col in self._data.columns)) - ) + def _dtypes(self) -> abc.Iterator: + return zip(self._data.names, (col.dtype for col in self._data.columns)) @property def ndim(self) -> int: @@ -1969,7 +1967,7 @@ def __dask_tokenize__(self): return [ type(self), - str(self._dtypes), + str(dict(self._dtypes)), normalize_token(self.to_pandas()), ] diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index ac8b381cbec..aa96051ea51 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -22,12 +22,7 @@ from cudf._lib.types import size_type_dtype from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType from cudf.api.extensions import no_default -from cudf.api.types import ( - is_bool_dtype, - is_float_dtype, - is_list_like, - is_numeric_dtype, -) +from cudf.api.types import is_bool_dtype, is_list_like, is_numeric_dtype from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable from cudf.core.column.column import ColumnBase, StructDtype, as_column @@ -335,12 +330,8 @@ def dtypes(self): FutureWarning, ) index = self.grouping.keys.unique().sort_values().to_pandas() - obj_dtypes = self.obj._dtypes return pd.DataFrame( - { - name: [obj_dtypes[name]] * len(index) - for name in self.obj._data.names - }, + {name: [dtype] * len(index) for name, dtype in self.obj._dtypes}, index=index, ) @@ -499,8 +490,7 @@ def rank( # treats NaNs the way we treat nulls. 
if cudf.get_option("mode.pandas_compatible"): if any( - is_float_dtype(typ) - for typ in self.grouping.values._dtypes.values() + col.dtype.kind == "f" for col in self.grouping.values._columns ): raise NotImplementedError( "NaNs are not supported in groupby.rank." diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 688b268d478..ecfcec15337 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -891,7 +891,7 @@ def replace( ) = _get_replacement_values_for_columns( to_replace=to_replace, value=value, - columns_dtype_map=self._dtypes, + columns_dtype_map=dict(self._dtypes), ) for name, col in self._data.items(): @@ -6313,11 +6313,11 @@ def __dask_tokenize__(self): return [ type(self), - str(self._dtypes), + str(dict(self._dtypes)), *[ - normalize_token(cat.categories) - for cat in self._dtypes.values() - if cat == "category" + normalize_token(col.dtype.categories) + for col in self._columns + if col.dtype == "category" ], normalize_token(self.index), normalize_token(self.hash_values().values_host), diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 3eeeac405b3..f07764e2ce4 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -132,10 +132,9 @@ def read_csv( # There exists some dtypes in the result columns that is inferred. # Find them and map them to the default dtypes. specified_dtypes = {} if dtype is None else dtype - df_dtypes = df._dtypes unspecified_dtypes = { - name: df_dtypes[name] - for name in df._column_names + name: dtype + for name, dtype in df._dtypes if name not in specified_dtypes } default_dtypes = {} diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index dd4a0d9eb07..fc3387d5117 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -147,10 +147,9 @@ def read_json( # There exists some dtypes in the result columns that is inferred. 
# Find them and map them to the default dtypes. specified_dtypes = {} if dtype is True else dtype - df_dtypes = df._dtypes unspecified_dtypes = { - name: df_dtypes[name] - for name in df._column_names + name: dtype + for name, dtype in df._dtypes if name not in specified_dtypes } default_dtypes = {}