Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use less _is_categorical_dtype #15148

Merged
merged 28 commits into from
Apr 10, 2024
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
ea299a0
Use less _is_categorical_dtype
mroeschke Feb 26, 2024
03e1f81
Check pandas CategoricalDtype instead
mroeschke Feb 27, 2024
d61f58e
Merge remote-tracking branch 'upstream/branch-24.04' into rm/_is_cate…
mroeschke Feb 27, 2024
ebcb877
Address test failures
mroeschke Feb 27, 2024
0459e91
Merge remote-tracking branch 'upstream/branch-24.04' into rm/_is_cate…
mroeschke Feb 28, 2024
0c6e844
Merge remote-tracking branch 'upstream/branch-24.04' into rm/_is_cate…
mroeschke Feb 29, 2024
50ead71
Merge branch 'branch-24.04' into rm/_is_categorical_dtype
vyasr Feb 29, 2024
a9543e2
Merge remote-tracking branch 'upstream/branch-24.04' into rm/_is_cate…
mroeschke Mar 5, 2024
357ccbe
Merge branch 'rm/_is_categorical_dtype' of https://github.com/mroesch…
mroeschke Mar 5, 2024
04d8d02
Merge remote-tracking branch 'upstream/branch-24.04' into rm/_is_cate…
mroeschke Mar 6, 2024
bd7c126
Add carvout for hex, return False on TypeError
mroeschke Mar 6, 2024
5b5e642
Merge remote-tracking branch 'upstream/branch-24.04' into rm/_is_cate…
mroeschke Mar 6, 2024
8c95ee7
Make stricter
mroeschke Mar 6, 2024
51fabd0
Merge remote-tracking branch 'upstream/branch-24.04' into rm/_is_cate…
mroeschke Mar 11, 2024
189a5a1
Merge remote-tracking branch 'upstream/branch-24.04' into rm/_is_cate…
mroeschke Mar 11, 2024
77a97e0
Remove import
mroeschke Mar 11, 2024
0b405bf
Merge remote-tracking branch 'upstream/branch-24.04' into rm/_is_cate…
mroeschke Mar 12, 2024
ed69b49
Merge remote-tracking branch 'upstream/branch-24.04' into rm/_is_cate…
mroeschke Mar 13, 2024
922bc6c
Merge remote-tracking branch 'upstream/branch-24.04' into rm/_is_cate…
mroeschke Mar 13, 2024
52b8c43
Merge remote-tracking branch 'upstream/branch-24.04' into rm/_is_cate…
mroeschke Mar 15, 2024
872a071
Merge remote-tracking branch 'upstream/branch-24.06' into rm/_is_cate…
mroeschke Mar 18, 2024
13811c8
Update python/cudf/cudf/_lib/csv.pyx
mroeschke Apr 8, 2024
5a497d6
Merge branch 'branch-24.06' into rm/_is_categorical_dtype
mroeschke Apr 8, 2024
fa33dcb
Style
mroeschke Apr 8, 2024
75a52c8
Merge remote-tracking branch 'upstream/branch-24.06' into rm/_is_cate…
mroeschke Apr 8, 2024
4216eb9
Line length
mroeschke Apr 8, 2024
e05c916
Merge remote-tracking branch 'upstream/branch-24.06' into rm/_is_cate…
mroeschke Apr 9, 2024
9ceb385
Fix condition
mroeschke Apr 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion python/cudf/cudf/_fuzz_testing/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def set_rand_params(self, params):
if dtype_val is not None:
dtype_val = {
col_name: "category"
if cudf.utils.dtypes._is_categorical_dtype(dtype)
if isinstance(dtype, cudf.CategoricalDtype)
else pandas_dtypes_to_np_dtypes[dtype]
for col_name, dtype in dtype_val.items()
}
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_fuzz_testing/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def _get_dtype_param_value(dtype_val):
if dtype_val is not None and isinstance(dtype_val, abc.Mapping):
processed_dtypes = {}
for col_name, dtype in dtype_val.items():
if cudf.utils.dtypes._is_categorical_dtype(dtype):
if isinstance(dtype, cudf.CategoricalDtype):
processed_dtypes[col_name] = "category"
else:
processed_dtypes[col_name] = str(
Expand Down
8 changes: 4 additions & 4 deletions python/cudf/cudf/_lib/csv.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -434,19 +434,19 @@ def read_csv(
if dtype is not None:
if isinstance(dtype, abc.Mapping):
for k, v in dtype.items():
if cudf.api.types._is_categorical_dtype(v):
if isinstance(cudf.dtype(v), cudf.CategoricalDtype):
df._data[str(k)] = df._data[str(k)].astype(v)
elif (
cudf.api.types.is_scalar(dtype) or
isinstance(dtype, (
np.dtype, pd.api.extensions.ExtensionDtype, type
))
):
if cudf.api.types._is_categorical_dtype(dtype):
if isinstance(cudf.dtype(dtype), cudf.CategoricalDtype):
df = df.astype(dtype)
elif isinstance(dtype, abc.Collection):
for index, col_dtype in enumerate(dtype):
if cudf.api.types._is_categorical_dtype(col_dtype):
if isinstance(cudf.dtype(col_dtype), cudf.CategoricalDtype):
col_name = df._data.names[index]
df._data[col_name] = df._data[col_name].astype(col_dtype)

Expand Down Expand Up @@ -554,7 +554,7 @@ cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *:
# TODO: Remove this work-around Dictionary types
# in libcudf are fully mapped to categorical columns:
# https://github.com/rapidsai/cudf/issues/3960
Comment on lines 554 to 556
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This linked issue has been resolved. Is there an action to be taken here (i.e. removal of the workaround)?

Copy link
Contributor Author

@mroeschke mroeschke Feb 27, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It appears although the issue is closed, the latest PR that addressed this function notes that it's still awaiting libcudf handling of category type (#12571 (comment)) so I think this is still needed

if cudf.api.types._is_categorical_dtype(dtype):
if isinstance(dtype, cudf.CategoricalDtype) or dtype == "category":
mroeschke marked this conversation as resolved.
Show resolved Hide resolved
if isinstance(dtype, str):
dtype = "str"
else:
Expand Down
3 changes: 1 addition & 2 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@
from cudf._lib.types import size_type_dtype
from cudf._typing import ColumnLike, Dtype, ScalarLike
from cudf.api.types import (
_is_categorical_dtype,
_is_non_decimal_numeric_dtype,
_is_pandas_nullable_extension_dtype,
infer_dtype,
Expand Down Expand Up @@ -1387,7 +1386,7 @@ def column_empty_like(

if (
hasattr(column, "dtype")
and _is_categorical_dtype(column.dtype)
and isinstance(column.dtype, cudf.CategoricalDtype)
and dtype == column.dtype
):
catcolumn = cast("cudf.core.column.CategoricalColumn", column)
Expand Down
10 changes: 9 additions & 1 deletion python/cudf/cudf/core/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@ def dtype(arbitrary):
raise TypeError(f"Unsupported type {np_dtype}")
return np_dtype

if isinstance(arbitrary, str) and arbitrary in {"hex", "hex32", "hex64"}:
# read_csv only accepts "hex"
# e.g. test_csv_reader_hexadecimals, test_csv_reader_hexadecimal_overflow
return arbitrary

# use `pandas_dtype` to try and interpret
# `arbitrary` as a Pandas extension type.
# Return the corresponding NumPy/cuDF type.
Expand Down Expand Up @@ -1003,7 +1008,10 @@ def _is_categorical_dtype(obj):
pd.Series,
),
):
return _is_categorical_dtype(obj.dtype)
try:
return isinstance(cudf.dtype(obj.dtype), cudf.CategoricalDtype)
except TypeError:
return False
if hasattr(obj, "type"):
if obj.type is pd.CategoricalDtype.type:
return True
Expand Down
24 changes: 12 additions & 12 deletions python/cudf/cudf/testing/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,7 @@

import cudf
from cudf._lib.unary import is_nan
from cudf.api.types import (
_is_categorical_dtype,
is_numeric_dtype,
is_string_dtype,
)
from cudf.api.types import is_numeric_dtype, is_string_dtype
from cudf.core.missing import NA, NaT


Expand Down Expand Up @@ -86,7 +82,7 @@ def _check_types(
if (
exact
and not isinstance(left, cudf.MultiIndex)
and _is_categorical_dtype(left)
and isinstance(left.dtype, cudf.CategoricalDtype)
):
if left.dtype != right.dtype:
raise_assert_detail(
Expand Down Expand Up @@ -144,8 +140,8 @@ def assert_column_equal(
"""
if check_dtype is True:
if (
_is_categorical_dtype(left)
and _is_categorical_dtype(right)
isinstance(left.dtype, cudf.CategoricalDtype)
and isinstance(right.dtype, cudf.CategoricalDtype)
and not check_categorical
):
pass
Expand Down Expand Up @@ -173,7 +169,9 @@ def assert_column_equal(
return

if check_exact and check_categorical:
if _is_categorical_dtype(left) and _is_categorical_dtype(right):
if isinstance(left.dtype, cudf.CategoricalDtype) and isinstance(
right.dtype, cudf.CategoricalDtype
):
left_cat = left.categories
right_cat = right.categories

Expand Down Expand Up @@ -207,8 +205,8 @@ def assert_column_equal(

if (
not check_dtype
and _is_categorical_dtype(left)
and _is_categorical_dtype(right)
and isinstance(left.dtype, cudf.CategoricalDtype)
and isinstance(right.dtype, cudf.CategoricalDtype)
):
left = left.astype(left.categories.dtype)
right = right.astype(right.categories.dtype)
Expand Down Expand Up @@ -258,7 +256,9 @@ def assert_column_equal(
raise e
else:
columns_equal = False
if _is_categorical_dtype(left) and _is_categorical_dtype(right):
if isinstance(left.dtype, cudf.CategoricalDtype) and isinstance(
right.dtype, cudf.CategoricalDtype
):
left = left.astype(left.categories.dtype)
right = right.astype(right.categories.dtype)
if not columns_equal:
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/tests/test_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def test_column_offset_and_size(pandas_input, offset, size):
children=col.base_children,
)

if cudf.api.types._is_categorical_dtype(col.dtype):
if isinstance(col.dtype, cudf.CategoricalDtype):
assert col.size == col.codes.size
assert col.size == (col.codes.data.size / col.codes.dtype.itemsize)
elif cudf.api.types.is_string_dtype(col.dtype):
Expand Down Expand Up @@ -120,7 +120,7 @@ def column_slicing_test(col, offset, size, cast_to_float=False):
else:
pd_series = series.to_pandas()

if cudf.api.types._is_categorical_dtype(col.dtype):
if isinstance(col.dtype, cudf.CategoricalDtype):
# The cudf.Series is constructed from an already sliced column, whereas
# the pandas.Series is constructed from the unsliced series and then
# sliced, so the indexes should be different and we must ignore it.
Expand Down
66 changes: 21 additions & 45 deletions python/cudf/cudf/tests/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import pytest

import cudf
from cudf.api.types import _is_categorical_dtype
from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype
from cudf.testing._utils import (
assert_eq,
Expand Down Expand Up @@ -609,8 +608,8 @@ def test_concat_empty_dataframes(df, other, ignore_index):
actual = cudf.concat(other_gd, ignore_index=ignore_index)
if expected.shape != df.shape:
for key, col in actual[actual.columns].items():
if _is_categorical_dtype(col.dtype):
if not _is_categorical_dtype(expected[key].dtype):
if isinstance(col.dtype, cudf.CategoricalDtype):
if not isinstance(expected[key].dtype, pd.CategoricalDtype):
# TODO: Pandas bug:
# https://github.com/pandas-dev/pandas/issues/42840
expected[key] = expected[key].fillna("-1").astype("str")
Expand Down Expand Up @@ -1195,10 +1194,10 @@ def test_concat_join_series(ignore_index, sort, join, axis):
@pytest.mark.parametrize("ignore_index", [True, False])
@pytest.mark.parametrize("sort", [True, False])
@pytest.mark.parametrize("join", ["inner", "outer"])
@pytest.mark.parametrize("axis", [0])
def test_concat_join_empty_dataframes(
df, other, ignore_index, axis, join, sort
request, df, other, ignore_index, join, sort
):
axis = 0
other_pd = [df] + other
gdf = cudf.from_pandas(df)
other_gd = [gdf] + [cudf.from_pandas(o) for o in other]
Expand All @@ -1209,50 +1208,27 @@ def test_concat_join_empty_dataframes(
actual = cudf.concat(
other_gd, ignore_index=ignore_index, axis=axis, join=join, sort=sort
)
if expected.shape != df.shape:
if axis == 0:
for key, col in actual[actual.columns].items():
if _is_categorical_dtype(col.dtype):
if not _is_categorical_dtype(expected[key].dtype):
# TODO: Pandas bug:
# https://github.com/pandas-dev/pandas/issues/42840
expected[key] = (
expected[key].fillna("-1").astype("str")
)
else:
expected[key] = (
expected[key]
.cat.add_categories(["-1"])
.fillna("-1")
.astype("str")
)
actual[key] = col.astype("str").fillna("-1")
else:
expected[key] = expected[key].fillna(-1)
actual[key] = col.fillna(-1)

assert_eq(
expected.fillna(-1),
actual.fillna(-1),
check_dtype=False,
check_index_type=False
if len(expected) == 0 or actual.empty
else True,
check_column_type=False,
)
else:
# no need to fill in if axis=1
assert_eq(
expected,
actual,
check_index_type=False,
check_column_type=False,
if (
join == "outer"
and any(
isinstance(dtype, pd.CategoricalDtype)
for dtype in df.dtypes.tolist()
)
and any(
isinstance(dtype, pd.CategoricalDtype)
for other_df in other
for dtype in other_df.dtypes.tolist()
)
):
request.applymarker(
pytest.mark.xfail(
reason="https://github.com/pandas-dev/pandas/issues/42840"
)
)
assert_eq(
expected,
actual,
check_dtype=False,
check_index_type=False,
check_column_type=False,
)

Expand Down Expand Up @@ -1332,7 +1308,7 @@ def test_concat_join_empty_dataframes_axis_1(
if expected.shape != df.shape:
if axis == 0:
for key, col in actual[actual.columns].items():
if _is_categorical_dtype(col.dtype):
if isinstance(expected[key].dtype, pd.CategoricalDtype):
expected[key] = expected[key].fillna("-1")
actual[key] = col.astype("str").fillna("-1")
# if not expected.empty:
Expand Down
22 changes: 19 additions & 3 deletions python/cudf/cudf/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,14 +272,30 @@ def test_csv_reader_mixed_data_delimiter_sep(
gdf1 = read_csv(
str(fname),
names=["1", "2", "3", "4", "5", "6", "7"],
dtype=["int64", "date", "float64", "int64", "category", "str", "bool"],
dtype=[
"int64",
"datetime64[ns]",
"float64",
"int64",
"category",
"str",
"bool",
],
dayfirst=True,
**cudf_arg,
)
gdf2 = read_csv(
str(fname),
names=["1", "2", "3", "4", "5", "6", "7"],
dtype=["int64", "date", "float64", "int64", "category", "str", "bool"],
dtype=[
"int64",
"datetime64[ns]",
"float64",
"int64",
"category",
"str",
"bool",
],
dayfirst=True,
**pandas_arg,
)
Expand Down Expand Up @@ -368,7 +384,7 @@ def test_csv_reader_skiprows_skipfooter(tmpdir, pd_mixed_dataframe):
out = read_csv(
str(fname),
names=["1", "2", "3"],
dtype=["int64", "date", "float64"],
dtype=["int64", "datetime64[ns]", "float64"],
skiprows=1,
skipfooter=1,
dayfirst=True,
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/utils/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,9 +392,9 @@ def get_min_float_dtype(col):


def is_mixed_with_object_dtype(lhs, rhs):
if cudf.api.types._is_categorical_dtype(lhs.dtype):
if isinstance(lhs.dtype, cudf.CategoricalDtype):
return is_mixed_with_object_dtype(lhs.dtype.categories, rhs)
elif cudf.api.types._is_categorical_dtype(rhs.dtype):
elif isinstance(rhs.dtype, cudf.CategoricalDtype):
return is_mixed_with_object_dtype(lhs, rhs.dtype.categories)

return (lhs.dtype == "object" and rhs.dtype != "object") or (
Expand Down
Loading