Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Add support for category dtypes in CSV reader #12571

Merged
merged 8 commits into from
Jan 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 31 additions & 10 deletions python/cudf/cudf/_lib/csv.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2023, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.map cimport map
Expand Down Expand Up @@ -49,6 +49,8 @@ from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table

from pyarrow.lib import NativeFile

from cudf.api.types import is_hashable

ctypedef int32_t underlying_type_t_compression


Expand Down Expand Up @@ -248,7 +250,7 @@ cdef csv_reader_options make_csv_reader_options(
if isinstance(dtype, abc.Mapping):
for k, v in dtype.items():
col_type = v
if v in CSV_HEX_TYPE_MAP:
if is_hashable(v) and v in CSV_HEX_TYPE_MAP:
col_type = CSV_HEX_TYPE_MAP[v]
c_hex_col_names.push_back(str(k).encode())

Expand All @@ -264,7 +266,7 @@ cdef csv_reader_options make_csv_reader_options(
))
):
c_dtypes_list.reserve(1)
if dtype in CSV_HEX_TYPE_MAP:
if is_hashable(dtype) and dtype in CSV_HEX_TYPE_MAP:
dtype = CSV_HEX_TYPE_MAP[dtype]
c_hex_col_indexes.push_back(0)

Expand All @@ -276,7 +278,7 @@ cdef csv_reader_options make_csv_reader_options(
elif isinstance(dtype, abc.Collection):
c_dtypes_list.reserve(len(dtype))
for index, col_dtype in enumerate(dtype):
if col_dtype in CSV_HEX_TYPE_MAP:
if is_hashable(col_dtype) and col_dtype in CSV_HEX_TYPE_MAP:
col_dtype = CSV_HEX_TYPE_MAP[col_dtype]
c_hex_col_indexes.push_back(index)

Expand Down Expand Up @@ -429,6 +431,25 @@ def read_csv(
column_names=meta_names
))

if dtype is not None:
if isinstance(dtype, abc.Mapping):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible for this to be a generic Mapping, or is it guaranteed to be a dict? If you only care about dicts and not dict-like objects, then avoid the abstract class check in favor of isinstance(dtype, dict). It can be considerably faster.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose this might come from unsanitized user input with random dict-like classes as the dtype...? Maybe this is the best we can do (here and in the other thread below).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Our checks aren't any tighter (e.g. checking only for dict) because in pandas dtype is a heavily overloaded parameter; for example, they support defaultdict too: https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html

for k, v in dtype.items():
if cudf.api.types.is_categorical_dtype(v):
df._data[str(k)] = df._data[str(k)].astype(v)
elif (
cudf.api.types.is_scalar(dtype) or
isinstance(dtype, (
np.dtype, pd.core.dtypes.dtypes.ExtensionDtype, type
))
):
if cudf.api.types.is_categorical_dtype(dtype):
df = df.astype(dtype)
elif isinstance(dtype, abc.Collection):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can dtypes be this broad? Is there a tighter constraint like "list or tuple" that excludes the cases handled above? Just trying to avoid the abstract type check as above.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Answered above

for index, col_dtype in enumerate(dtype):
if cudf.api.types.is_categorical_dtype(col_dtype):
col_name = df._data.names[index]
df._data[col_name] = df._data[col_name].astype(col_dtype)

if names is not None and isinstance(names[0], (int)):
df.columns = [int(x) for x in df._data]

Expand Down Expand Up @@ -517,14 +538,14 @@ def write_csv(


cdef data_type _get_cudf_data_type_from_dtype(object dtype) except +:
# TODO: Remove this Error message once the
# following issue is fixed:
# TODO: Remove this work-around once Dictionary types
# in libcudf are fully mapped to categorical columns:
# https://github.com/rapidsai/cudf/issues/3960
if cudf.api.types.is_categorical_dtype(dtype):
raise NotImplementedError(
"CategoricalDtype as dtype is not yet "
"supported in CSV reader"
)
if isinstance(dtype, str):
dtype = "str"
else:
dtype = dtype.categories.dtype

if isinstance(dtype, str):
if str(dtype) == "date32":
Expand Down
55 changes: 25 additions & 30 deletions python/cudf/cudf/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,7 @@ def make_numpy_mixed_dataframe():
)
df["Float"] = np.array([9.001, 8.343, 6, 2.781])
df["Integer2"] = np.array([2345, 106, 2088, 789277])
# Category is not yet supported from libcudf
# df["Category"] = np.array(["M", "F", "F", "F"])
df["Category"] = np.array(["M", "F", "F", "F"])
df["String"] = np.array(["Alpha", "Beta", "Gamma", "Delta"])
df["Boolean"] = np.array([True, False, True, False])
return df
Expand Down Expand Up @@ -270,34 +269,22 @@ def test_csv_reader_mixed_data_delimiter_sep(

gdf1 = read_csv(
str(fname),
# Category is not yet supported from libcudf
# names=["1", "2", "3", "4", "5", "6", "7"],
# dtype=[
# "int64", "date", "float64", "int64", "category", "str", "bool"
# ],
names=["1", "2", "3", "4", "5", "6"],
dtype=["int64", "date", "float64", "uint64", "str", "bool"],
names=["1", "2", "3", "4", "5", "6", "7"],
dtype=["int64", "date", "float64", "int64", "category", "str", "bool"],
dayfirst=True,
**cudf_arg,
)
gdf2 = read_csv(
str(fname),
# Category is not yet supported from libcudf
# names=["1", "2", "3", "4", "5", "6", "7"],
# dtype=[
# "int64", "date", "float64", "int64", "category", "str", "bool"
# ],
names=["1", "2", "3", "4", "5", "6"],
dtype=["int64", "date", "float64", "uint64", "str", "bool"],
names=["1", "2", "3", "4", "5", "6", "7"],
dtype=["int64", "date", "float64", "int64", "category", "str", "bool"],
dayfirst=True,
**pandas_arg,
)

pdf = pd.read_csv(
fname,
# Category is not yet supported from libcudf
# names=["1", "2", "3", "4", "5", "6", "7"],
names=["1", "2", "3", "4", "5", "6"],
names=["1", "2", "3", "4", "5", "6", "7"],
parse_dates=[1],
dayfirst=True,
**pandas_arg,
Expand Down Expand Up @@ -2046,19 +2033,27 @@ def test_csv_writer_category(df):
assert expected == actual


def test_csv_reader_category_error():
# TODO: Remove this test once following
# issue is fixed: https://github.com/rapidsai/cudf/issues/3960
df = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
@pytest.mark.parametrize(
"dtype",
[
"category",
{"a": "category", "b": "str"},
{"b": "category"},
{"a": "category"},
{"a": pd.CategoricalDtype([1, 2])},
{"b": pd.CategoricalDtype([1, 2, 3])},
{"b": pd.CategoricalDtype(["b", "a"]), "a": "str"},
pd.CategoricalDtype(["a", "b"]),
],
)
def test_csv_reader_category(dtype):
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
df = cudf.DataFrame({"a": [1, 2, 3, None], "b": ["a", "b", None, "c"]})
csv_buf = df.to_csv()

with pytest.raises(
NotImplementedError,
match=re.escape(
"CategoricalDtype as dtype is not yet " "supported in CSV reader"
),
):
cudf.read_csv(StringIO(csv_buf), dtype="category")
actual = cudf.read_csv(StringIO(csv_buf), dtype=dtype)
expected = pd.read_csv(StringIO(csv_buf), dtype=dtype)

assert_eq(expected, actual, check_dtype=True)


def test_csv_writer_datetime_sep():
Expand Down