diff --git a/python/cudf/cudf/_fuzz_testing/csv.py b/python/cudf/cudf/_fuzz_testing/csv.py index 5b49143fd5a..67211a1c4bf 100644 --- a/python/cudf/cudf/_fuzz_testing/csv.py +++ b/python/cudf/cudf/_fuzz_testing/csv.py @@ -99,7 +99,7 @@ def set_rand_params(self, params): if dtype_val is not None: dtype_val = { col_name: "category" - if cudf.utils.dtypes._is_categorical_dtype(dtype) + if isinstance(dtype, cudf.CategoricalDtype) else pandas_dtypes_to_np_dtypes[dtype] for col_name, dtype in dtype_val.items() } diff --git a/python/cudf/cudf/_fuzz_testing/json.py b/python/cudf/cudf/_fuzz_testing/json.py index bffd508b2ef..e987529c8ba 100644 --- a/python/cudf/cudf/_fuzz_testing/json.py +++ b/python/cudf/cudf/_fuzz_testing/json.py @@ -27,7 +27,7 @@ def _get_dtype_param_value(dtype_val): if dtype_val is not None and isinstance(dtype_val, abc.Mapping): processed_dtypes = {} for col_name, dtype in dtype_val.items(): - if cudf.utils.dtypes._is_categorical_dtype(dtype): + if isinstance(dtype, cudf.CategoricalDtype): processed_dtypes[col_name] = "category" else: processed_dtypes[col_name] = str( diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 0f0bc3ce81a..0df27cab5c6 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -434,7 +434,7 @@ def read_csv( if dtype is not None: if isinstance(dtype, abc.Mapping): for k, v in dtype.items(): - if cudf.api.types._is_categorical_dtype(v): + if isinstance(cudf.dtype(v), cudf.CategoricalDtype): df._data[str(k)] = df._data[str(k)].astype(v) elif ( cudf.api.types.is_scalar(dtype) or @@ -442,11 +442,11 @@ def read_csv( np.dtype, pd.api.extensions.ExtensionDtype, type )) ): - if cudf.api.types._is_categorical_dtype(dtype): + if isinstance(cudf.dtype(dtype), cudf.CategoricalDtype): df = df.astype(dtype) elif isinstance(dtype, abc.Collection): for index, col_dtype in enumerate(dtype): - if cudf.api.types._is_categorical_dtype(col_dtype): + if isinstance(cudf.dtype(col_dtype), cudf.CategoricalDtype): 
col_name = df._data.names[index] df._data[col_name] = df._data[col_name].astype(col_dtype) @@ -554,7 +554,7 @@ cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *: # TODO: Remove this work-around Dictionary types # in libcudf are fully mapped to categorical columns: # https://github.com/rapidsai/cudf/issues/3960 - if cudf.api.types._is_categorical_dtype(dtype): + if isinstance(cudf.dtype(dtype), cudf.CategoricalDtype): if isinstance(dtype, str): dtype = "str" else: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index ad56cabb48e..fe484a98166 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -52,7 +52,6 @@ from cudf._lib.types import size_type_dtype from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.api.types import ( - _is_categorical_dtype, _is_datetime64tz_dtype, _is_interval_dtype, _is_non_decimal_numeric_dtype, @@ -968,7 +967,8 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase: col = self if self.dtype == dtype: return col - if _is_categorical_dtype(dtype): + dtype = cudf.dtype(dtype) + if isinstance(dtype, cudf.CategoricalDtype): return col.as_categorical_column(dtype) if ( @@ -987,7 +987,7 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase: dtype = pandas_dtypes_to_np_dtypes.get(dtype, dtype) if _is_non_decimal_numeric_dtype(dtype): return col.as_numerical_column(dtype) - elif _is_categorical_dtype(dtype): + elif isinstance(dtype, cudf.CategoricalDtype): return col.as_categorical_column(dtype) elif cudf.dtype(dtype).type in { np.str_, @@ -1396,7 +1396,7 @@ def column_empty_like( if ( hasattr(column, "dtype") - and _is_categorical_dtype(column.dtype) + and isinstance(column.dtype, cudf.CategoricalDtype) and dtype == column.dtype ): catcolumn = cast("cudf.core.column.CategoricalColumn", column) @@ -2263,7 +2263,10 @@ def as_column( np_type = None try: if dtype is not None: - if 
_is_categorical_dtype(dtype) or _is_interval_dtype(dtype): + dtype = cudf.dtype(dtype) + if isinstance( + dtype, (cudf.CategoricalDtype, cudf.IntervalDtype) + ): raise TypeError if _is_datetime64tz_dtype(dtype): raise NotImplementedError( @@ -2407,7 +2410,7 @@ def as_column( except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError) as e: if isinstance(e, MixedTypeError): raise TypeError(str(e)) - if _is_categorical_dtype(dtype): + if isinstance(cudf.dtype(dtype), cudf.CategoricalDtype): sr = pd.Series(arbitrary, dtype="category") data = as_column(sr, nan_as_null=nan_as_null, dtype=dtype) elif np_type == np.str_: diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 17d6d42618a..61a2c126e54 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -1008,7 +1008,7 @@ def _is_categorical_dtype(obj): pd.Series, ), ): - return _is_categorical_dtype(obj.dtype) + return isinstance(cudf.dtype(obj.dtype), cudf.CategoricalDtype) if hasattr(obj, "type"): if obj.type is pd_CategoricalDtypeType: return True diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index fc253c5c197..dffbbe92fc1 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -8,11 +8,7 @@ import cudf from cudf._lib.unary import is_nan -from cudf.api.types import ( - _is_categorical_dtype, - is_numeric_dtype, - is_string_dtype, -) +from cudf.api.types import is_numeric_dtype, is_string_dtype from cudf.core.missing import NA, NaT @@ -86,7 +82,7 @@ def _check_types( if ( exact and not isinstance(left, cudf.MultiIndex) - and _is_categorical_dtype(left) + and isinstance(left.dtype, cudf.CategoricalDtype) ): if left.dtype != right.dtype: raise_assert_detail( @@ -144,8 +140,8 @@ def assert_column_equal( """ if check_dtype is True: if ( - _is_categorical_dtype(left) - and _is_categorical_dtype(right) + isinstance(left.dtype, cudf.CategoricalDtype) + and isinstance(right.dtype, 
cudf.CategoricalDtype) and not check_categorical ): pass @@ -173,7 +169,9 @@ def assert_column_equal( return if check_exact and check_categorical: - if _is_categorical_dtype(left) and _is_categorical_dtype(right): + if isinstance(left.dtype, cudf.CategoricalDtype) and isinstance( + right.dtype, cudf.CategoricalDtype + ): left_cat = left.categories right_cat = right.categories @@ -207,8 +205,8 @@ def assert_column_equal( if ( not check_dtype - and _is_categorical_dtype(left) - and _is_categorical_dtype(right) + and isinstance(left.dtype, cudf.CategoricalDtype) + and isinstance(right.dtype, cudf.CategoricalDtype) ): left = left.astype(left.categories.dtype) right = right.astype(right.categories.dtype) @@ -258,7 +256,9 @@ def assert_column_equal( raise e else: columns_equal = False - if _is_categorical_dtype(left) and _is_categorical_dtype(right): + if isinstance(left.dtype, cudf.CategoricalDtype) and isinstance( + right.dtype, cudf.CategoricalDtype + ): left = left.astype(left.categories.dtype) right = right.astype(right.categories.dtype) if not columns_equal: diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 8e8555b2005..8dfa550b159 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -81,7 +81,7 @@ def test_column_offset_and_size(pandas_input, offset, size): children=col.base_children, ) - if cudf.api.types._is_categorical_dtype(col.dtype): + if isinstance(col.dtype, cudf.CategoricalDtype): assert col.size == col.codes.size assert col.size == (col.codes.data.size / col.codes.dtype.itemsize) elif cudf.api.types.is_string_dtype(col.dtype): @@ -120,7 +120,7 @@ def column_slicing_test(col, offset, size, cast_to_float=False): else: pd_series = series.to_pandas() - if cudf.api.types._is_categorical_dtype(col.dtype): + if isinstance(col.dtype, cudf.CategoricalDtype): # The cudf.Series is constructed from an already sliced column, whereas # the pandas.Series is constructed from the 
unsliced series and then # sliced, so the indexes should be different and we must ignore it. diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 4b0e46bf286..65ea6f87358 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -9,7 +9,6 @@ import pytest import cudf as gd -from cudf.api.types import _is_categorical_dtype from cudf.core._compat import PANDAS_GE_200 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype from cudf.testing._utils import ( @@ -600,8 +599,8 @@ def test_concat_empty_dataframes(df, other, ignore_index): actual = gd.concat(other_gd, ignore_index=ignore_index) if expected.shape != df.shape: for key, col in actual[actual.columns].items(): - if _is_categorical_dtype(col.dtype): - if not _is_categorical_dtype(expected[key].dtype): + if isinstance(col.dtype, gd.CategoricalDtype): + if not isinstance(expected[key].dtype, gd.CategoricalDtype): # TODO: Pandas bug: # https://github.com/pandas-dev/pandas/issues/42840 expected[key] = expected[key].fillna("-1").astype("str") @@ -1203,8 +1202,10 @@ def test_concat_join_empty_dataframes( if expected.shape != df.shape: if axis == 0: for key, col in actual[actual.columns].items(): - if _is_categorical_dtype(col.dtype): - if not _is_categorical_dtype(expected[key].dtype): + if isinstance(col.dtype, gd.CategoricalDtype): + if not isinstance( + expected[key].dtype, gd.CategoricalDtype + ): # TODO: Pandas bug: # https://github.com/pandas-dev/pandas/issues/42840 expected[key] = ( @@ -1323,7 +1324,7 @@ def test_concat_join_empty_dataframes_axis_1( if expected.shape != df.shape: if axis == 0: for key, col in actual[actual.columns].items(): - if _is_categorical_dtype(col.dtype): + if isinstance(col.dtype, gd.CategoricalDtype): expected[key] = expected[key].fillna("-1") actual[key] = col.astype("str").fillna("-1") # if not expected.empty: diff --git a/python/cudf/cudf/utils/dtypes.py 
b/python/cudf/cudf/utils/dtypes.py index 8fa4a230e2c..3181a747f2a 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -171,8 +171,8 @@ def cudf_dtype_from_pydata_dtype(dtype): """Given a numpy or pandas dtype, converts it into the equivalent cuDF Python dtype. """ - - if cudf.api.types._is_categorical_dtype(dtype): + dtype = cudf.dtype(dtype) + if isinstance(dtype, cudf.CategoricalDtype): return cudf.core.dtypes.CategoricalDtype elif cudf.api.types.is_decimal32_dtype(dtype): return cudf.core.dtypes.Decimal32Dtype @@ -416,9 +416,9 @@ def get_min_float_dtype(col): def is_mixed_with_object_dtype(lhs, rhs): - if cudf.api.types._is_categorical_dtype(lhs.dtype): + if isinstance(lhs.dtype, cudf.CategoricalDtype): return is_mixed_with_object_dtype(lhs.dtype.categories, rhs) - elif cudf.api.types._is_categorical_dtype(rhs.dtype): + elif isinstance(rhs.dtype, cudf.CategoricalDtype): return is_mixed_with_object_dtype(lhs, rhs.dtype.categories) return (lhs.dtype == "object" and rhs.dtype != "object") or ( @@ -518,10 +518,10 @@ def find_common_type(dtypes): # Early exit for categoricals since they're not hashable and therefore # can't be put in a set. 
- if any(cudf.api.types._is_categorical_dtype(dtype) for dtype in dtypes): + if any(isinstance(dtype, cudf.CategoricalDtype) for dtype in dtypes): if all( ( - cudf.api.types._is_categorical_dtype(dtype) + isinstance(dtype, cudf.CategoricalDtype) and (not dtype.ordered if hasattr(dtype, "ordered") else True) ) for dtype in dtypes diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 026ab1d304a..f48cf3c2d88 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -333,7 +333,7 @@ def percentile_cudf(a, q, interpolation="linear"): if isinstance(q, Iterator): q = list(q) - if cudf.api.types._is_categorical_dtype(a.dtype): + if isinstance(a.dtype, cudf.CategoricalDtype): result = cp.percentile(a.cat.codes, q, interpolation=interpolation) return ( diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index d01ada92e33..d9f0f3375b7 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -17,7 +17,6 @@ from dask.utils import M import cudf as gd -from cudf.api.types import _is_categorical_dtype from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate _SHUFFLE_SUPPORT = ("tasks", "p2p") # "disk" not supported @@ -230,7 +229,7 @@ def quantile_divisions(df, by, npartitions): if ( len(columns) == 1 and df[columns[0]].dtype != "object" - and not _is_categorical_dtype(df[columns[0]].dtype) + and not isinstance(df[columns[0]].dtype, gd.CategoricalDtype) ): dtype = df[columns[0]].dtype divisions = divisions[columns[0]].astype("int64")