From 460b41edadc90a43b02b1f1e7dc23190cc14d0b4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 10 Apr 2024 05:47:58 -1000 Subject: [PATCH] Use less _is_categorical_dtype (#15148) Rehash of https://github.com/rapidsai/cudf/pull/14942 Authors: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15148 --- python/cudf/cudf/_fuzz_testing/csv.py | 2 +- python/cudf/cudf/_fuzz_testing/json.py | 2 +- python/cudf/cudf/_lib/csv.pyx | 15 +++--- python/cudf/cudf/core/column/column.py | 7 +-- python/cudf/cudf/core/dtypes.py | 10 +++- python/cudf/cudf/testing/testing.py | 24 +++++----- python/cudf/cudf/tests/test_column.py | 4 +- python/cudf/cudf/tests/test_concat.py | 66 ++++++++------------------ python/cudf/cudf/tests/test_csv.py | 22 +++++++-- python/cudf/cudf/utils/dtypes.py | 4 +- 10 files changed, 78 insertions(+), 78 deletions(-) diff --git a/python/cudf/cudf/_fuzz_testing/csv.py b/python/cudf/cudf/_fuzz_testing/csv.py index 5b49143fd5a..67211a1c4bf 100644 --- a/python/cudf/cudf/_fuzz_testing/csv.py +++ b/python/cudf/cudf/_fuzz_testing/csv.py @@ -99,7 +99,7 @@ def set_rand_params(self, params): if dtype_val is not None: dtype_val = { col_name: "category" - if cudf.utils.dtypes._is_categorical_dtype(dtype) + if isinstance(dtype, cudf.CategoricalDtype) else pandas_dtypes_to_np_dtypes[dtype] for col_name, dtype in dtype_val.items() } diff --git a/python/cudf/cudf/_fuzz_testing/json.py b/python/cudf/cudf/_fuzz_testing/json.py index bffd508b2ef..e987529c8ba 100644 --- a/python/cudf/cudf/_fuzz_testing/json.py +++ b/python/cudf/cudf/_fuzz_testing/json.py @@ -27,7 +27,7 @@ def _get_dtype_param_value(dtype_val): if dtype_val is not None and isinstance(dtype_val, abc.Mapping): processed_dtypes = {} for col_name, dtype in dtype_val.items(): - if cudf.utils.dtypes._is_categorical_dtype(dtype): + if isinstance(dtype, cudf.CategoricalDtype): processed_dtypes[col_name] = "category" else: processed_dtypes[col_name] = str( diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 0f0bc3ce81a..b2e4d442bd2 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -434,7 +434,7 @@ def read_csv( if dtype is not None: if isinstance(dtype, abc.Mapping): for k, v in dtype.items(): - if cudf.api.types._is_categorical_dtype(v): + if isinstance(cudf.dtype(v), cudf.CategoricalDtype): df._data[str(k)] = df._data[str(k)].astype(v) elif ( cudf.api.types.is_scalar(dtype) or @@ -442,11 +442,11 @@ def read_csv( np.dtype, pd.api.extensions.ExtensionDtype, type )) ): - if cudf.api.types._is_categorical_dtype(dtype): + if isinstance(cudf.dtype(dtype), cudf.CategoricalDtype): df = df.astype(dtype) elif isinstance(dtype, abc.Collection): for index, col_dtype in enumerate(dtype): - if cudf.api.types._is_categorical_dtype(col_dtype): + if isinstance(cudf.dtype(col_dtype), cudf.CategoricalDtype): col_name = df._data.names[index] df._data[col_name] = df._data[col_name].astype(col_dtype) @@ -554,11 +554,10 @@ cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *: # TODO: Remove this work-around Dictionary types # in libcudf are fully mapped to categorical columns: # https://github.com/rapidsai/cudf/issues/3960 - if cudf.api.types._is_categorical_dtype(dtype): - if isinstance(dtype, str): - dtype = "str" - else: - dtype = dtype.categories.dtype + if isinstance(dtype, cudf.CategoricalDtype): + dtype = dtype.categories.dtype + elif dtype == "category": + dtype = "str" if isinstance(dtype, str): if str(dtype) == "date32": diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 67f44ad2f48..c8a6493ddda 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -52,7 +52,6 @@ from cudf._lib.types import size_type_dtype from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.api.types import ( - _is_categorical_dtype, _is_non_decimal_numeric_dtype, _is_pandas_nullable_extension_dtype, infer_dtype, @@ -1381,7 +1380,7 @@ def column_empty_like( if ( hasattr(column, "dtype") - and _is_categorical_dtype(column.dtype) + and isinstance(column.dtype, cudf.CategoricalDtype) and dtype == column.dtype ): catcolumn = cast("cudf.core.column.CategoricalColumn", column) @@ -2008,7 +2007,9 @@ def as_column( length = 1 elif length < 0: raise ValueError(f"{length=} must be >=0.") - if isinstance(arbitrary, pd.Interval) or _is_categorical_dtype(dtype): + if isinstance( + arbitrary, pd.Interval + ) or cudf.api.types._is_categorical_dtype(dtype): # No cudf.Scalar support yet return as_column( pd.Series([arbitrary] * length), diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 73617763221..9bb1995b836 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -51,6 +51,11 @@ def dtype(arbitrary): raise TypeError(f"Unsupported type {np_dtype}") return np_dtype + if isinstance(arbitrary, str) and arbitrary in {"hex", "hex32", "hex64"}: + # read_csv only accepts "hex" + # e.g. test_csv_reader_hexadecimals, test_csv_reader_hexadecimal_overflow + return arbitrary + # use `pandas_dtype` to try and interpret # `arbitrary` as a Pandas extension type. # Return the corresponding NumPy/cuDF type. @@ -999,7 +1004,10 @@ def _is_categorical_dtype(obj): pd.Series, ), ): - return _is_categorical_dtype(obj.dtype) + try: + return isinstance(cudf.dtype(obj.dtype), cudf.CategoricalDtype) + except TypeError: + return False if hasattr(obj, "type"): if obj.type is pd.CategoricalDtype.type: return True diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index fc253c5c197..dffbbe92fc1 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -8,11 +8,7 @@ import cudf from cudf._lib.unary import is_nan -from cudf.api.types import ( - _is_categorical_dtype, - is_numeric_dtype, - is_string_dtype, -) +from cudf.api.types import is_numeric_dtype, is_string_dtype from cudf.core.missing import NA, NaT @@ -86,7 +82,7 @@ def _check_types( if ( exact and not isinstance(left, cudf.MultiIndex) - and _is_categorical_dtype(left) + and isinstance(left.dtype, cudf.CategoricalDtype) ): if left.dtype != right.dtype: raise_assert_detail( @@ -144,8 +140,8 @@ def assert_column_equal( """ if check_dtype is True: if ( - _is_categorical_dtype(left) - and _is_categorical_dtype(right) + isinstance(left.dtype, cudf.CategoricalDtype) + and isinstance(right.dtype, cudf.CategoricalDtype) and not check_categorical ): pass @@ -173,7 +169,9 @@ def assert_column_equal( return if check_exact and check_categorical: - if _is_categorical_dtype(left) and _is_categorical_dtype(right): + if isinstance(left.dtype, cudf.CategoricalDtype) and isinstance( + right.dtype, cudf.CategoricalDtype + ): left_cat = left.categories right_cat = right.categories @@ -207,8 +205,8 @@ def assert_column_equal( if ( not check_dtype - and _is_categorical_dtype(left) - and _is_categorical_dtype(right) + and isinstance(left.dtype, cudf.CategoricalDtype) + and isinstance(right.dtype, cudf.CategoricalDtype) ): left = left.astype(left.categories.dtype) right = right.astype(right.categories.dtype) @@ -258,7 +256,9 @@ def assert_column_equal( raise e else: columns_equal = False - if _is_categorical_dtype(left) and _is_categorical_dtype(right): + if isinstance(left.dtype, cudf.CategoricalDtype) and isinstance( + right.dtype, cudf.CategoricalDtype + ): left = left.astype(left.categories.dtype) right = right.astype(right.categories.dtype) if not columns_equal: diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 2f70f955fa9..dace8009041 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -81,7 +81,7 @@ def test_column_offset_and_size(pandas_input, offset, size): children=col.base_children, ) - if cudf.api.types._is_categorical_dtype(col.dtype): + if isinstance(col.dtype, cudf.CategoricalDtype): assert col.size == col.codes.size assert col.size == (col.codes.data.size / col.codes.dtype.itemsize) elif cudf.api.types.is_string_dtype(col.dtype): @@ -120,7 +120,7 @@ def column_slicing_test(col, offset, size, cast_to_float=False): else: pd_series = series.to_pandas() - if cudf.api.types._is_categorical_dtype(col.dtype): + if isinstance(col.dtype, cudf.CategoricalDtype): # The cudf.Series is constructed from an already sliced column, whereas # the pandas.Series is constructed from the unsliced series and then # sliced, so the indexes should be different and we must ignore it. diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 3d638da924b..87b3beb5589 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -9,7 +9,6 @@ import pytest import cudf -from cudf.api.types import _is_categorical_dtype from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype from cudf.testing._utils import ( assert_eq, @@ -609,8 +608,8 @@ def test_concat_empty_dataframes(df, other, ignore_index): actual = cudf.concat(other_gd, ignore_index=ignore_index) if expected.shape != df.shape: for key, col in actual[actual.columns].items(): - if _is_categorical_dtype(col.dtype): - if not _is_categorical_dtype(expected[key].dtype): + if isinstance(col.dtype, cudf.CategoricalDtype): + if not isinstance(expected[key].dtype, pd.CategoricalDtype): # TODO: Pandas bug: # https://github.com/pandas-dev/pandas/issues/42840 expected[key] = expected[key].fillna("-1").astype("str") @@ -1195,10 +1194,10 @@ def test_concat_join_series(ignore_index, sort, join, axis): @pytest.mark.parametrize("ignore_index", [True, False]) @pytest.mark.parametrize("sort", [True, False]) @pytest.mark.parametrize("join", ["inner", "outer"]) -@pytest.mark.parametrize("axis", [0]) def test_concat_join_empty_dataframes( - df, other, ignore_index, axis, join, sort + request, df, other, ignore_index, join, sort ): + axis = 0 other_pd = [df] + other gdf = cudf.from_pandas(df) other_gd = [gdf] + [cudf.from_pandas(o) for o in other] @@ -1209,50 +1208,27 @@ def test_concat_join_empty_dataframes( actual = cudf.concat( other_gd, ignore_index=ignore_index, axis=axis, join=join, sort=sort ) - if expected.shape != df.shape: - if axis == 0: - for key, col in actual[actual.columns].items(): - if _is_categorical_dtype(col.dtype): - if not _is_categorical_dtype(expected[key].dtype): - # TODO: Pandas bug: - # https://github.com/pandas-dev/pandas/issues/42840 - expected[key] = ( - expected[key].fillna("-1").astype("str") - ) - else: - expected[key] = ( - expected[key] - .cat.add_categories(["-1"]) - .fillna("-1") - .astype("str") - ) - actual[key] = col.astype("str").fillna("-1") - else: - expected[key] = expected[key].fillna(-1) - actual[key] = col.fillna(-1) - - assert_eq( - expected.fillna(-1), - actual.fillna(-1), - check_dtype=False, - check_index_type=False - if len(expected) == 0 or actual.empty - else True, - check_column_type=False, - ) - else: - # no need to fill in if axis=1 - assert_eq( - expected, - actual, - check_index_type=False, - check_column_type=False, + if ( + join == "outer" + and any( + isinstance(dtype, pd.CategoricalDtype) + for dtype in df.dtypes.tolist() + ) + and any( + isinstance(dtype, pd.CategoricalDtype) + for other_df in other + for dtype in other_df.dtypes.tolist() + ) + ): + request.applymarker( + pytest.mark.xfail( + reason="https://github.com/pandas-dev/pandas/issues/42840" ) + ) assert_eq( expected, actual, check_dtype=False, - check_index_type=False, check_column_type=False, ) @@ -1332,7 +1308,7 @@ def test_concat_join_empty_dataframes_axis_1( if expected.shape != df.shape: if axis == 0: for key, col in actual[actual.columns].items(): - if _is_categorical_dtype(col.dtype): + if isinstance(expected[key].dtype, pd.CategoricalDtype): expected[key] = expected[key].fillna("-1") actual[key] = col.astype("str").fillna("-1") # if not expected.empty: diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 2d728fb94ba..5009a7f2628 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -272,14 +272,30 @@ def test_csv_reader_mixed_data_delimiter_sep( gdf1 = read_csv( str(fname), names=["1", "2", "3", "4", "5", "6", "7"], - dtype=["int64", "date", "float64", "int64", "category", "str", "bool"], + dtype=[ + "int64", + "datetime64[ns]", + "float64", + "int64", + "category", + "str", + "bool", + ], dayfirst=True, **cudf_arg, ) gdf2 = read_csv( str(fname), names=["1", "2", "3", "4", "5", "6", "7"], - dtype=["int64", "date", "float64", "int64", "category", "str", "bool"], + dtype=[ + "int64", + "datetime64[ns]", + "float64", + "int64", + "category", + "str", + "bool", + ], dayfirst=True, **pandas_arg, ) @@ -368,7 +384,7 @@ def test_csv_reader_skiprows_skipfooter(tmpdir, pd_mixed_dataframe): out = read_csv( str(fname), names=["1", "2", "3"], - dtype=["int64", "date", "float64"], + dtype=["int64", "datetime64[ns]", "float64"], skiprows=1, skipfooter=1, dayfirst=True, diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 8521239413e..a33b5ca139c 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -392,9 +392,9 @@ def get_min_float_dtype(col): def is_mixed_with_object_dtype(lhs, rhs): - if cudf.api.types._is_categorical_dtype(lhs.dtype): + if isinstance(lhs.dtype, cudf.CategoricalDtype): return is_mixed_with_object_dtype(lhs.dtype.categories, rhs) - elif cudf.api.types._is_categorical_dtype(rhs.dtype): + elif isinstance(rhs.dtype, cudf.CategoricalDtype): return is_mixed_with_object_dtype(lhs, rhs.dtype.categories) return (lhs.dtype == "object" and rhs.dtype != "object") or (