Skip to content

Commit

Permalink
Use less _is_categorical_dtype (#15148)
Browse files Browse the repository at this point in the history
Rehash of #14942

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #15148
  • Loading branch information
mroeschke authored Apr 10, 2024
1 parent e727814 commit 460b41e
Show file tree
Hide file tree
Showing 10 changed files with 78 additions and 78 deletions.
2 changes: 1 addition & 1 deletion python/cudf/cudf/_fuzz_testing/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def set_rand_params(self, params):
if dtype_val is not None:
dtype_val = {
col_name: "category"
if cudf.utils.dtypes._is_categorical_dtype(dtype)
if isinstance(dtype, cudf.CategoricalDtype)
else pandas_dtypes_to_np_dtypes[dtype]
for col_name, dtype in dtype_val.items()
}
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_fuzz_testing/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def _get_dtype_param_value(dtype_val):
if dtype_val is not None and isinstance(dtype_val, abc.Mapping):
processed_dtypes = {}
for col_name, dtype in dtype_val.items():
if cudf.utils.dtypes._is_categorical_dtype(dtype):
if isinstance(dtype, cudf.CategoricalDtype):
processed_dtypes[col_name] = "category"
else:
processed_dtypes[col_name] = str(
Expand Down
15 changes: 7 additions & 8 deletions python/cudf/cudf/_lib/csv.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -434,19 +434,19 @@ def read_csv(
if dtype is not None:
if isinstance(dtype, abc.Mapping):
for k, v in dtype.items():
if cudf.api.types._is_categorical_dtype(v):
if isinstance(cudf.dtype(v), cudf.CategoricalDtype):
df._data[str(k)] = df._data[str(k)].astype(v)
elif (
cudf.api.types.is_scalar(dtype) or
isinstance(dtype, (
np.dtype, pd.api.extensions.ExtensionDtype, type
))
):
if cudf.api.types._is_categorical_dtype(dtype):
if isinstance(cudf.dtype(dtype), cudf.CategoricalDtype):
df = df.astype(dtype)
elif isinstance(dtype, abc.Collection):
for index, col_dtype in enumerate(dtype):
if cudf.api.types._is_categorical_dtype(col_dtype):
if isinstance(cudf.dtype(col_dtype), cudf.CategoricalDtype):
col_name = df._data.names[index]
df._data[col_name] = df._data[col_name].astype(col_dtype)

Expand Down Expand Up @@ -554,11 +554,10 @@ cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *:
# TODO: Remove this work-around Dictionary types
# in libcudf are fully mapped to categorical columns:
# https://github.com/rapidsai/cudf/issues/3960
if cudf.api.types._is_categorical_dtype(dtype):
if isinstance(dtype, str):
dtype = "str"
else:
dtype = dtype.categories.dtype
if isinstance(dtype, cudf.CategoricalDtype):
dtype = dtype.categories.dtype
elif dtype == "category":
dtype = "str"

if isinstance(dtype, str):
if str(dtype) == "date32":
Expand Down
7 changes: 4 additions & 3 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@
from cudf._lib.types import size_type_dtype
from cudf._typing import ColumnLike, Dtype, ScalarLike
from cudf.api.types import (
_is_categorical_dtype,
_is_non_decimal_numeric_dtype,
_is_pandas_nullable_extension_dtype,
infer_dtype,
Expand Down Expand Up @@ -1381,7 +1380,7 @@ def column_empty_like(

if (
hasattr(column, "dtype")
and _is_categorical_dtype(column.dtype)
and isinstance(column.dtype, cudf.CategoricalDtype)
and dtype == column.dtype
):
catcolumn = cast("cudf.core.column.CategoricalColumn", column)
Expand Down Expand Up @@ -2008,7 +2007,9 @@ def as_column(
length = 1
elif length < 0:
raise ValueError(f"{length=} must be >=0.")
if isinstance(arbitrary, pd.Interval) or _is_categorical_dtype(dtype):
if isinstance(
arbitrary, pd.Interval
) or cudf.api.types._is_categorical_dtype(dtype):
# No cudf.Scalar support yet
return as_column(
pd.Series([arbitrary] * length),
Expand Down
10 changes: 9 additions & 1 deletion python/cudf/cudf/core/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@ def dtype(arbitrary):
raise TypeError(f"Unsupported type {np_dtype}")
return np_dtype

if isinstance(arbitrary, str) and arbitrary in {"hex", "hex32", "hex64"}:
# read_csv only accepts "hex"
# e.g. test_csv_reader_hexadecimals, test_csv_reader_hexadecimal_overflow
return arbitrary

# use `pandas_dtype` to try and interpret
# `arbitrary` as a Pandas extension type.
# Return the corresponding NumPy/cuDF type.
Expand Down Expand Up @@ -999,7 +1004,10 @@ def _is_categorical_dtype(obj):
pd.Series,
),
):
return _is_categorical_dtype(obj.dtype)
try:
return isinstance(cudf.dtype(obj.dtype), cudf.CategoricalDtype)
except TypeError:
return False
if hasattr(obj, "type"):
if obj.type is pd.CategoricalDtype.type:
return True
Expand Down
24 changes: 12 additions & 12 deletions python/cudf/cudf/testing/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,7 @@

import cudf
from cudf._lib.unary import is_nan
from cudf.api.types import (
_is_categorical_dtype,
is_numeric_dtype,
is_string_dtype,
)
from cudf.api.types import is_numeric_dtype, is_string_dtype
from cudf.core.missing import NA, NaT


Expand Down Expand Up @@ -86,7 +82,7 @@ def _check_types(
if (
exact
and not isinstance(left, cudf.MultiIndex)
and _is_categorical_dtype(left)
and isinstance(left.dtype, cudf.CategoricalDtype)
):
if left.dtype != right.dtype:
raise_assert_detail(
Expand Down Expand Up @@ -144,8 +140,8 @@ def assert_column_equal(
"""
if check_dtype is True:
if (
_is_categorical_dtype(left)
and _is_categorical_dtype(right)
isinstance(left.dtype, cudf.CategoricalDtype)
and isinstance(right.dtype, cudf.CategoricalDtype)
and not check_categorical
):
pass
Expand Down Expand Up @@ -173,7 +169,9 @@ def assert_column_equal(
return

if check_exact and check_categorical:
if _is_categorical_dtype(left) and _is_categorical_dtype(right):
if isinstance(left.dtype, cudf.CategoricalDtype) and isinstance(
right.dtype, cudf.CategoricalDtype
):
left_cat = left.categories
right_cat = right.categories

Expand Down Expand Up @@ -207,8 +205,8 @@ def assert_column_equal(

if (
not check_dtype
and _is_categorical_dtype(left)
and _is_categorical_dtype(right)
and isinstance(left.dtype, cudf.CategoricalDtype)
and isinstance(right.dtype, cudf.CategoricalDtype)
):
left = left.astype(left.categories.dtype)
right = right.astype(right.categories.dtype)
Expand Down Expand Up @@ -258,7 +256,9 @@ def assert_column_equal(
raise e
else:
columns_equal = False
if _is_categorical_dtype(left) and _is_categorical_dtype(right):
if isinstance(left.dtype, cudf.CategoricalDtype) and isinstance(
right.dtype, cudf.CategoricalDtype
):
left = left.astype(left.categories.dtype)
right = right.astype(right.categories.dtype)
if not columns_equal:
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/tests/test_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def test_column_offset_and_size(pandas_input, offset, size):
children=col.base_children,
)

if cudf.api.types._is_categorical_dtype(col.dtype):
if isinstance(col.dtype, cudf.CategoricalDtype):
assert col.size == col.codes.size
assert col.size == (col.codes.data.size / col.codes.dtype.itemsize)
elif cudf.api.types.is_string_dtype(col.dtype):
Expand Down Expand Up @@ -120,7 +120,7 @@ def column_slicing_test(col, offset, size, cast_to_float=False):
else:
pd_series = series.to_pandas()

if cudf.api.types._is_categorical_dtype(col.dtype):
if isinstance(col.dtype, cudf.CategoricalDtype):
# The cudf.Series is constructed from an already sliced column, whereas
# the pandas.Series is constructed from the unsliced series and then
# sliced, so the indexes should be different and we must ignore it.
Expand Down
66 changes: 21 additions & 45 deletions python/cudf/cudf/tests/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import pytest

import cudf
from cudf.api.types import _is_categorical_dtype
from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype
from cudf.testing._utils import (
assert_eq,
Expand Down Expand Up @@ -609,8 +608,8 @@ def test_concat_empty_dataframes(df, other, ignore_index):
actual = cudf.concat(other_gd, ignore_index=ignore_index)
if expected.shape != df.shape:
for key, col in actual[actual.columns].items():
if _is_categorical_dtype(col.dtype):
if not _is_categorical_dtype(expected[key].dtype):
if isinstance(col.dtype, cudf.CategoricalDtype):
if not isinstance(expected[key].dtype, pd.CategoricalDtype):
# TODO: Pandas bug:
# https://github.com/pandas-dev/pandas/issues/42840
expected[key] = expected[key].fillna("-1").astype("str")
Expand Down Expand Up @@ -1195,10 +1194,10 @@ def test_concat_join_series(ignore_index, sort, join, axis):
@pytest.mark.parametrize("ignore_index", [True, False])
@pytest.mark.parametrize("sort", [True, False])
@pytest.mark.parametrize("join", ["inner", "outer"])
@pytest.mark.parametrize("axis", [0])
def test_concat_join_empty_dataframes(
df, other, ignore_index, axis, join, sort
request, df, other, ignore_index, join, sort
):
axis = 0
other_pd = [df] + other
gdf = cudf.from_pandas(df)
other_gd = [gdf] + [cudf.from_pandas(o) for o in other]
Expand All @@ -1209,50 +1208,27 @@ def test_concat_join_empty_dataframes(
actual = cudf.concat(
other_gd, ignore_index=ignore_index, axis=axis, join=join, sort=sort
)
if expected.shape != df.shape:
if axis == 0:
for key, col in actual[actual.columns].items():
if _is_categorical_dtype(col.dtype):
if not _is_categorical_dtype(expected[key].dtype):
# TODO: Pandas bug:
# https://github.com/pandas-dev/pandas/issues/42840
expected[key] = (
expected[key].fillna("-1").astype("str")
)
else:
expected[key] = (
expected[key]
.cat.add_categories(["-1"])
.fillna("-1")
.astype("str")
)
actual[key] = col.astype("str").fillna("-1")
else:
expected[key] = expected[key].fillna(-1)
actual[key] = col.fillna(-1)

assert_eq(
expected.fillna(-1),
actual.fillna(-1),
check_dtype=False,
check_index_type=False
if len(expected) == 0 or actual.empty
else True,
check_column_type=False,
)
else:
# no need to fill in if axis=1
assert_eq(
expected,
actual,
check_index_type=False,
check_column_type=False,
if (
join == "outer"
and any(
isinstance(dtype, pd.CategoricalDtype)
for dtype in df.dtypes.tolist()
)
and any(
isinstance(dtype, pd.CategoricalDtype)
for other_df in other
for dtype in other_df.dtypes.tolist()
)
):
request.applymarker(
pytest.mark.xfail(
reason="https://github.com/pandas-dev/pandas/issues/42840"
)
)
assert_eq(
expected,
actual,
check_dtype=False,
check_index_type=False,
check_column_type=False,
)

Expand Down Expand Up @@ -1332,7 +1308,7 @@ def test_concat_join_empty_dataframes_axis_1(
if expected.shape != df.shape:
if axis == 0:
for key, col in actual[actual.columns].items():
if _is_categorical_dtype(col.dtype):
if isinstance(expected[key].dtype, pd.CategoricalDtype):
expected[key] = expected[key].fillna("-1")
actual[key] = col.astype("str").fillna("-1")
# if not expected.empty:
Expand Down
22 changes: 19 additions & 3 deletions python/cudf/cudf/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,14 +272,30 @@ def test_csv_reader_mixed_data_delimiter_sep(
gdf1 = read_csv(
str(fname),
names=["1", "2", "3", "4", "5", "6", "7"],
dtype=["int64", "date", "float64", "int64", "category", "str", "bool"],
dtype=[
"int64",
"datetime64[ns]",
"float64",
"int64",
"category",
"str",
"bool",
],
dayfirst=True,
**cudf_arg,
)
gdf2 = read_csv(
str(fname),
names=["1", "2", "3", "4", "5", "6", "7"],
dtype=["int64", "date", "float64", "int64", "category", "str", "bool"],
dtype=[
"int64",
"datetime64[ns]",
"float64",
"int64",
"category",
"str",
"bool",
],
dayfirst=True,
**pandas_arg,
)
Expand Down Expand Up @@ -368,7 +384,7 @@ def test_csv_reader_skiprows_skipfooter(tmpdir, pd_mixed_dataframe):
out = read_csv(
str(fname),
names=["1", "2", "3"],
dtype=["int64", "date", "float64"],
dtype=["int64", "datetime64[ns]", "float64"],
skiprows=1,
skipfooter=1,
dayfirst=True,
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/utils/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,9 +392,9 @@ def get_min_float_dtype(col):


def is_mixed_with_object_dtype(lhs, rhs):
if cudf.api.types._is_categorical_dtype(lhs.dtype):
if isinstance(lhs.dtype, cudf.CategoricalDtype):
return is_mixed_with_object_dtype(lhs.dtype.categories, rhs)
elif cudf.api.types._is_categorical_dtype(rhs.dtype):
elif isinstance(rhs.dtype, cudf.CategoricalDtype):
return is_mixed_with_object_dtype(lhs, rhs.dtype.categories)

return (lhs.dtype == "object" and rhs.dtype != "object") or (
Expand Down

0 comments on commit 460b41e

Please sign in to comment.