From a8478db1799f10ac26a4b0918ab8666b043e348b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 15 Nov 2023 18:23:56 -0800 Subject: [PATCH 1/4] Add another one --- python/cudf/cudf/core/column/column.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index a5e99abd79e..48a742dca6b 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2812,7 +2812,7 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: # ColumnBase._concat so that all subclasses can override necessary # behavior. However, at the moment it's not clear what that API should look # like, so CategoricalColumn simply implements a minimal working API. - if all(is_categorical_dtype(o.dtype) for o in objs): + if all(isinstance(o.dtype, CategoricalDtype) for o in objs): return cudf.core.column.categorical.CategoricalColumn._concat( cast( MutableSequence[ From 9e55c4a56489f80d69dead251c598be793a76506 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 15 Nov 2023 18:47:06 -0800 Subject: [PATCH 2/4] Fix more instances --- python/cudf/cudf/core/_internals/where.py | 4 +-- python/cudf/cudf/core/column/categorical.py | 4 +-- python/cudf/cudf/core/column/interval.py | 6 ++--- python/cudf/cudf/core/dataframe.py | 28 ++++++++++++--------- python/cudf/cudf/core/index.py | 3 +-- python/cudf/cudf/core/indexed_frame.py | 15 ++++++----- python/cudf/cudf/core/reshape.py | 2 +- python/cudf/cudf/core/tools/numeric.py | 6 ++--- python/cudf/cudf/tests/test_dataframe.py | 2 +- python/cudf/cudf/tests/test_index.py | 2 +- 10 files changed, 37 insertions(+), 35 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index e9131fd7f33..f76802c8b7d 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -10,10 +10,10 @@ from cudf.api.types import ( _is_non_decimal_numeric_dtype, is_bool_dtype, - is_categorical_dtype, is_scalar, ) from cudf.core.column import ColumnBase +from cudf.core.dtypes import CategoricalDtype from cudf.utils.dtypes import ( _can_cast, _dtype_can_hold_element, @@ -46,7 +46,7 @@ def _check_and_cast_columns_with_other( ) -> Tuple[ColumnBase, Union[ScalarLike, ColumnBase]]: # Returns type-casted `source_col` & `other` based on `inplace`. source_dtype = source_col.dtype - if is_categorical_dtype(source_dtype): + if isinstance(source_dtype, CategoricalDtype): return _normalize_categorical(source_col, other) other_is_scalar = is_scalar(other) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index bab07624dfa..682abe6354d 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -17,7 +17,7 @@ from cudf import _lib as libcudf from cudf._lib.transform import bools_to_mask from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike -from cudf.api.types import is_categorical_dtype, is_interval_dtype +from cudf.api.types import is_interval_dtype from cudf.core.buffer import Buffer from cudf.core.column import column from cudf.core.column.methods import ColumnMethods @@ -99,7 +99,7 @@ class CategoricalAccessor(ColumnMethods): _column: CategoricalColumn def __init__(self, parent: SeriesOrSingleColumnIndex): - if not is_categorical_dtype(parent.dtype): + if not isinstance(parent.dtype, CategoricalDtype): raise AttributeError( "Can only use .cat accessor with a 'category' dtype" ) diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index d4855def832..bbf4b596154 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -5,9 +5,9 @@ import pyarrow as pa import cudf -from cudf.api.types import is_categorical_dtype, is_interval_dtype +from cudf.api.types import is_interval_dtype from cudf.core.column import StructColumn -from cudf.core.dtypes import IntervalDtype +from cudf.core.dtypes import CategoricalDtype, IntervalDtype class IntervalColumn(StructColumn): @@ -102,7 +102,7 @@ def copy(self, deep=True): def as_interval_column(self, dtype, **kwargs): if is_interval_dtype(dtype): - if is_categorical_dtype(self): + if isinstance(self.dtype, CategoricalDtype): new_struct = self._get_decategorized_column() return IntervalColumn.from_struct_column(new_struct) if is_interval_dtype(dtype): diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 16eead6ea81..330f7adb640 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -47,7 +47,6 @@ from cudf.api.types import ( _is_scalar_or_zero_d_array, is_bool_dtype, - is_categorical_dtype, is_datetime_dtype, is_dict_like, is_dtype_equal, @@ -319,7 +318,9 @@ def _getitem_tuple_arg(self, arg): as_column( tmp_arg[0], dtype=self._frame.index.dtype - if is_categorical_dtype(self._frame.index.dtype) + if isinstance( + self._frame.index.dtype, cudf.CategoricalDtype + ) else None, ), tmp_arg[1], @@ -1497,7 +1498,7 @@ def _get_numeric_data(self): columns = [ c for c, dt in self.dtypes.items() - if dt != object and not is_categorical_dtype(dt) + if dt != object and not isinstance(dt, cudf.CategoricalDtype) ] return self[columns] @@ -1740,8 +1741,8 @@ def _concat( out._index._data, indices[:first_data_column_position], ) - if not isinstance(out._index, MultiIndex) and is_categorical_dtype( - out._index._values.dtype + if not isinstance(out._index, MultiIndex) and isinstance( + out._index._values.dtype, cudf.CategoricalDtype ): out = out.set_index( cudf.core.index.as_index(out.index._values) @@ -3893,8 +3894,11 @@ def transpose(self): # No column from index is transposed with libcudf. source_columns = [*self._columns] source_dtype = source_columns[0].dtype - if is_categorical_dtype(source_dtype): - if any(not is_categorical_dtype(c.dtype) for c in source_columns): + if isinstance(source_dtype, cudf.CategoricalDtype): + if any( + not isinstance(c.dtype, cudf.CategoricalDtype) + for c in source_columns + ): raise ValueError("Columns must all have the same dtype") cats = list(c.categories for c in source_columns) cats = cudf.core.column.concat_columns(cats).unique() @@ -3908,7 +3912,7 @@ def transpose(self): result_columns = libcudf.transpose.transpose(source_columns) - if is_categorical_dtype(source_dtype): + if isinstance(source_dtype, cudf.CategoricalDtype): result_columns = [ codes._with_type_metadata( cudf.core.dtypes.CategoricalDtype(categories=cats) @@ -4610,8 +4614,8 @@ def apply_rows( """ for col in incols: current_col_dtype = self._data[col].dtype - if is_string_dtype(current_col_dtype) or is_categorical_dtype( - current_col_dtype + if is_string_dtype(current_col_dtype) or isinstance( + current_col_dtype, cudf.CategoricalDtype ): raise TypeError( "User defined functions are currently not " @@ -6438,7 +6442,7 @@ def select_dtypes(self, include=None, exclude=None): for dtype in self.dtypes: for i_dtype in include: # category handling - if is_categorical_dtype(i_dtype): + if isinstance(i_dtype, cudf.CategoricalDtype): include_subtypes.add(i_dtype) elif inspect.isclass(dtype.type): if issubclass(dtype.type, i_dtype): @@ -6449,7 +6453,7 @@ def select_dtypes(self, include=None, exclude=None): for dtype in self.dtypes: for e_dtype in exclude: # category handling - if is_categorical_dtype(e_dtype): + if isinstance(e_dtype, cudf.CategoricalDtype): exclude_subtypes.add(e_dtype) elif inspect.isclass(dtype.type): if issubclass(dtype.type, e_dtype): diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 9f0c66a5c74..055c108187e 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -32,7 +32,6 @@ from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, - is_categorical_dtype, is_dtype_equal, is_integer, is_interval_dtype, @@ -2972,7 +2971,7 @@ def __init__( if isinstance(data, CategoricalColumn): data = data elif isinstance(data, pd.Series) and ( - is_categorical_dtype(data.dtype) + isinstance(data.dtype, pd.CategoricalDtype) ): codes_data = column.as_column(data.cat.codes.values) data = column.build_categorical_column( diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 4211a8c24bf..a484c961110 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -42,7 +42,6 @@ from cudf.api.types import ( _is_non_decimal_numeric_dtype, is_bool_dtype, - is_categorical_dtype, is_decimal_dtype, is_dict_like, is_list_dtype, @@ -171,7 +170,7 @@ def _indices_from_labels(obj, labels): if not isinstance(labels, cudf.MultiIndex): labels = cudf.core.column.as_column(labels) - if is_categorical_dtype(obj.index): + if isinstance(obj.index.dtype, cudf.CategoricalDtype): labels = labels.astype("category") codes = labels.codes.astype(obj.index._values.codes.dtype) labels = cudf.core.column.build_categorical_column( @@ -5455,21 +5454,21 @@ def _is_same_dtype(lhs_dtype, rhs_dtype): if lhs_dtype == rhs_dtype: return True elif ( - is_categorical_dtype(lhs_dtype) - and is_categorical_dtype(rhs_dtype) + isinstance(lhs_dtype, cudf.CategoricalDtype) + and isinstance(rhs_dtype, cudf.CategoricalDtype) and lhs_dtype.categories.dtype == rhs_dtype.categories.dtype ): # OK if categories are not all the same return True elif ( - is_categorical_dtype(lhs_dtype) - and not is_categorical_dtype(rhs_dtype) + isinstance(lhs_dtype, cudf.CategoricalDtype) + and not isinstance(rhs_dtype, cudf.CategoricalDtype) and lhs_dtype.categories.dtype == rhs_dtype ): return True elif ( - is_categorical_dtype(rhs_dtype) - and not is_categorical_dtype(lhs_dtype) + isinstance(rhs_dtype, cudf.CategoricalDtype) + and not isinstance(lhs_dtype, cudf.CategoricalDtype) and rhs_dtype.categories.dtype == lhs_dtype ): return True diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 3c8489481d8..3cbe58ed39c 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -541,7 +541,7 @@ def melt( # Error for unimplemented support for datatype dtypes = [frame[col].dtype for col in id_vars + value_vars] - if any(cudf.api.types.is_categorical_dtype(t) for t in dtypes): + if any(isinstance(typ, cudf.CategoricalDtype) for typ in dtypes): raise NotImplementedError( "Categorical columns are not yet supported for function" ) diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 0273227010b..f24d8572655 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. import warnings @@ -10,7 +10,6 @@ from cudf._lib import strings as libstrings from cudf.api.types import ( _is_non_decimal_numeric_dtype, - is_categorical_dtype, is_datetime_dtype, is_list_dtype, is_string_dtype, @@ -18,6 +17,7 @@ is_timedelta_dtype, ) from cudf.core.column import as_column +from cudf.core.dtypes import CategoricalDtype from cudf.utils.dtypes import can_convert_to_column @@ -110,7 +110,7 @@ def to_numeric(arg, errors="raise", downcast=None): if is_datetime_dtype(dtype) or is_timedelta_dtype(dtype): col = col.as_numerical_column(cudf.dtype("int64")) - elif is_categorical_dtype(dtype): + elif isinstance(dtype, CategoricalDtype): cat_dtype = col.dtype.type if _is_non_decimal_numeric_dtype(cat_dtype): col = col.as_numerical_column(cat_dtype) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 5677f97408a..5007915d30a 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -6067,7 +6067,7 @@ def test_df_sr_mask_where(data, condition, other, error, inplace): expect_mask = ps_mask got_mask = gs_mask - if pd.api.types.is_categorical_dtype(expect_where): + if isinstance(expect_where.dtype, pd.CategoricalDtype): np.testing.assert_array_equal( expect_where.cat.codes, got_where.cat.codes.astype(expect_where.cat.codes.dtype) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 087b93f1a02..7ebfe70ceb7 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -682,7 +682,7 @@ def test_index_where(data, condition, other, error): gs_other = other if error is None: - if pd.api.types.is_categorical_dtype(ps): + if isinstance(ps.dtype, pd.CategoricalDtype): expect = ps.where(ps_condition, other=ps_other) got = gs.where(gs_condition, other=gs_other) np.testing.assert_array_equal( From d5df3f9834b3640cecaba657bd5116303be707c0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 16 Nov 2023 11:40:57 -0800 Subject: [PATCH 3/4] match string in select dtypes --- python/cudf/cudf/core/dataframe.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 330f7adb640..c4261b83ffe 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6442,7 +6442,8 @@ def select_dtypes(self, include=None, exclude=None): for dtype in self.dtypes: for i_dtype in include: # category handling - if isinstance(i_dtype, cudf.CategoricalDtype): + if i_dtype == "category": + # Matches cudf & pandas dtype objects include_subtypes.add(i_dtype) elif inspect.isclass(dtype.type): if issubclass(dtype.type, i_dtype): @@ -6453,7 +6454,8 @@ def select_dtypes(self, include=None, exclude=None): for dtype in self.dtypes: for e_dtype in exclude: # category handling - if isinstance(e_dtype, cudf.CategoricalDtype): + if i_dtype == "category": + # Matches cudf & pandas dtype objects exclude_subtypes.add(e_dtype) elif inspect.isclass(dtype.type): if issubclass(dtype.type, e_dtype): From 22f7bbaa52eee85fb6b82b2b078b7206bd012bfa Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 16 Nov 2023 15:01:12 -0800 Subject: [PATCH 4/4] Compare class --- python/cudf/cudf/core/dataframe.py | 4 ++-- python/cudf/cudf/tests/test_dataframe.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index c4261b83ffe..a804f22fa3d 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6442,7 +6442,7 @@ def select_dtypes(self, include=None, exclude=None): for dtype in self.dtypes: for i_dtype in include: # category handling - if i_dtype == "category": + if i_dtype == cudf.CategoricalDtype: # Matches cudf & pandas dtype objects include_subtypes.add(i_dtype) elif inspect.isclass(dtype.type): @@ -6454,7 +6454,7 @@ def select_dtypes(self, include=None, exclude=None): for dtype in self.dtypes: for e_dtype in exclude: # category handling - if i_dtype == "category": + if e_dtype == cudf.CategoricalDtype: # Matches cudf & pandas dtype objects exclude_subtypes.add(e_dtype) elif inspect.isclass(dtype.type): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 5007915d30a..4976ee62e71 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -6067,7 +6067,9 @@ def test_df_sr_mask_where(data, condition, other, error, inplace): expect_mask = ps_mask got_mask = gs_mask - if isinstance(expect_where.dtype, pd.CategoricalDtype): + if isinstance(expect_where, pd.Series) and isinstance( + expect_where.dtype, pd.CategoricalDtype + ): np.testing.assert_array_equal( expect_where.cat.codes, got_where.cat.codes.astype(expect_where.cat.codes.dtype)