Skip to content

Commit

Permalink
Add support for category dtypes in CSV reader (#12571)
Browse files Browse the repository at this point in the history
Fixes: #11977, #3960

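This PR adds support for `category` dtypes in the `dtype` parameter of the CSV reader. It does so through a workaround: a column requested as categorical is first read as a string column (or as the dtype of the provided categories) and is then cast to the requested categorical dtype after the read. This workaround can be removed once `libcudf` natively supports mapping dictionary types to categorical columns.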

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Bradley Dice (https://github.com/bdice)

URL: #12571
  • Loading branch information
galipremsagar authored Jan 21, 2023
1 parent 90d60cb commit 11f90d1
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 40 deletions.
41 changes: 31 additions & 10 deletions python/cudf/cudf/_lib/csv.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2023, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.map cimport map
Expand Down Expand Up @@ -49,6 +49,8 @@ from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table

from pyarrow.lib import NativeFile

from cudf.api.types import is_hashable

ctypedef int32_t underlying_type_t_compression


Expand Down Expand Up @@ -248,7 +250,7 @@ cdef csv_reader_options make_csv_reader_options(
if isinstance(dtype, abc.Mapping):
for k, v in dtype.items():
col_type = v
if v in CSV_HEX_TYPE_MAP:
if is_hashable(v) and v in CSV_HEX_TYPE_MAP:
col_type = CSV_HEX_TYPE_MAP[v]
c_hex_col_names.push_back(str(k).encode())

Expand All @@ -264,7 +266,7 @@ cdef csv_reader_options make_csv_reader_options(
))
):
c_dtypes_list.reserve(1)
if dtype in CSV_HEX_TYPE_MAP:
if is_hashable(dtype) and dtype in CSV_HEX_TYPE_MAP:
dtype = CSV_HEX_TYPE_MAP[dtype]
c_hex_col_indexes.push_back(0)

Expand All @@ -276,7 +278,7 @@ cdef csv_reader_options make_csv_reader_options(
elif isinstance(dtype, abc.Collection):
c_dtypes_list.reserve(len(dtype))
for index, col_dtype in enumerate(dtype):
if col_dtype in CSV_HEX_TYPE_MAP:
if is_hashable(col_dtype) and col_dtype in CSV_HEX_TYPE_MAP:
col_dtype = CSV_HEX_TYPE_MAP[col_dtype]
c_hex_col_indexes.push_back(index)

Expand Down Expand Up @@ -429,6 +431,25 @@ def read_csv(
column_names=meta_names
))

if dtype is not None:
if isinstance(dtype, abc.Mapping):
for k, v in dtype.items():
if cudf.api.types.is_categorical_dtype(v):
df._data[str(k)] = df._data[str(k)].astype(v)
elif (
cudf.api.types.is_scalar(dtype) or
isinstance(dtype, (
np.dtype, pd.core.dtypes.dtypes.ExtensionDtype, type
))
):
if cudf.api.types.is_categorical_dtype(dtype):
df = df.astype(dtype)
elif isinstance(dtype, abc.Collection):
for index, col_dtype in enumerate(dtype):
if cudf.api.types.is_categorical_dtype(col_dtype):
col_name = df._data.names[index]
df._data[col_name] = df._data[col_name].astype(col_dtype)

if names is not None and isinstance(names[0], (int)):
df.columns = [int(x) for x in df._data]

Expand Down Expand Up @@ -517,14 +538,14 @@ def write_csv(


cdef data_type _get_cudf_data_type_from_dtype(object dtype) except +:
# TODO: Remove this Error message once the
# following issue is fixed:
# TODO: Remove this work-around once Dictionary types
# in libcudf are fully mapped to categorical columns:
# https://github.com/rapidsai/cudf/issues/3960
if cudf.api.types.is_categorical_dtype(dtype):
raise NotImplementedError(
"CategoricalDtype as dtype is not yet "
"supported in CSV reader"
)
if isinstance(dtype, str):
dtype = "str"
else:
dtype = dtype.categories.dtype

if isinstance(dtype, str):
if str(dtype) == "date32":
Expand Down
55 changes: 25 additions & 30 deletions python/cudf/cudf/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,7 @@ def make_numpy_mixed_dataframe():
)
df["Float"] = np.array([9.001, 8.343, 6, 2.781])
df["Integer2"] = np.array([2345, 106, 2088, 789277])
# Category is not yet supported from libcudf
# df["Category"] = np.array(["M", "F", "F", "F"])
df["Category"] = np.array(["M", "F", "F", "F"])
df["String"] = np.array(["Alpha", "Beta", "Gamma", "Delta"])
df["Boolean"] = np.array([True, False, True, False])
return df
Expand Down Expand Up @@ -270,34 +269,22 @@ def test_csv_reader_mixed_data_delimiter_sep(

gdf1 = read_csv(
str(fname),
# Category is not yet supported from libcudf
# names=["1", "2", "3", "4", "5", "6", "7"],
# dtype=[
# "int64", "date", "float64", "int64", "category", "str", "bool"
# ],
names=["1", "2", "3", "4", "5", "6"],
dtype=["int64", "date", "float64", "uint64", "str", "bool"],
names=["1", "2", "3", "4", "5", "6", "7"],
dtype=["int64", "date", "float64", "int64", "category", "str", "bool"],
dayfirst=True,
**cudf_arg,
)
gdf2 = read_csv(
str(fname),
# Category is not yet supported from libcudf
# names=["1", "2", "3", "4", "5", "6", "7"],
# dtype=[
# "int64", "date", "float64", "int64", "category", "str", "bool"
# ],
names=["1", "2", "3", "4", "5", "6"],
dtype=["int64", "date", "float64", "uint64", "str", "bool"],
names=["1", "2", "3", "4", "5", "6", "7"],
dtype=["int64", "date", "float64", "int64", "category", "str", "bool"],
dayfirst=True,
**pandas_arg,
)

pdf = pd.read_csv(
fname,
# Category is not yet supported from libcudf
# names=["1", "2", "3", "4", "5", "6", "7"],
names=["1", "2", "3", "4", "5", "6"],
names=["1", "2", "3", "4", "5", "6", "7"],
parse_dates=[1],
dayfirst=True,
**pandas_arg,
Expand Down Expand Up @@ -2046,19 +2033,27 @@ def test_csv_writer_category(df):
assert expected == actual


def test_csv_reader_category_error():
# TODO: Remove this test once following
# issue is fixed: https://github.com/rapidsai/cudf/issues/3960
df = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
@pytest.mark.parametrize(
"dtype",
[
"category",
{"a": "category", "b": "str"},
{"b": "category"},
{"a": "category"},
{"a": pd.CategoricalDtype([1, 2])},
{"b": pd.CategoricalDtype([1, 2, 3])},
{"b": pd.CategoricalDtype(["b", "a"]), "a": "str"},
pd.CategoricalDtype(["a", "b"]),
],
)
def test_csv_reader_category(dtype):
df = cudf.DataFrame({"a": [1, 2, 3, None], "b": ["a", "b", None, "c"]})
csv_buf = df.to_csv()

with pytest.raises(
NotImplementedError,
match=re.escape(
"CategoricalDtype as dtype is not yet " "supported in CSV reader"
),
):
cudf.read_csv(StringIO(csv_buf), dtype="category")
actual = cudf.read_csv(StringIO(csv_buf), dtype=dtype)
expected = pd.read_csv(StringIO(csv_buf), dtype=dtype)

assert_eq(expected, actual, check_dtype=True)


def test_csv_writer_datetime_sep():
Expand Down

0 comments on commit 11f90d1

Please sign in to comment.