Add support for category dtypes in CSV reader (#12571)

Fixes: #11977, #3960. This PR enables support for `category` dtypes in the `dtype` parameter of the CSV reader. It contains a workaround that enables reading columns as categorical dtypes; we can remove this workaround once `libcudf` has native support for mapping dictionary types to categorical columns. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Bradley Dice (https://github.com/bdice) URL: #12571
rapidsai · Jan 21, 2023 · 11f90d1 · 11f90d1
1 parent 90d60cb
commit 11f90d1
Show file tree

Hide file tree

Showing 2 changed files with 56 additions and 40 deletions.
diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
 from libcpp cimport bool
 from libcpp.map cimport map
@@ -49,6 +49,8 @@ from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
 
 from pyarrow.lib import NativeFile
 
+from cudf.api.types import is_hashable
+
 ctypedef int32_t underlying_type_t_compression
 
 
@@ -248,7 +250,7 @@ cdef csv_reader_options make_csv_reader_options(
         if isinstance(dtype, abc.Mapping):
             for k, v in dtype.items():
                 col_type = v
-                if v in CSV_HEX_TYPE_MAP:
+                if is_hashable(v) and v in CSV_HEX_TYPE_MAP:
                     col_type = CSV_HEX_TYPE_MAP[v]
                     c_hex_col_names.push_back(str(k).encode())
 
@@ -264,7 +266,7 @@ cdef csv_reader_options make_csv_reader_options(
             ))
         ):
             c_dtypes_list.reserve(1)
-            if dtype in CSV_HEX_TYPE_MAP:
+            if is_hashable(dtype) and dtype in CSV_HEX_TYPE_MAP:
                 dtype = CSV_HEX_TYPE_MAP[dtype]
                 c_hex_col_indexes.push_back(0)
 
@@ -276,7 +278,7 @@ cdef csv_reader_options make_csv_reader_options(
         elif isinstance(dtype, abc.Collection):
             c_dtypes_list.reserve(len(dtype))
             for index, col_dtype in enumerate(dtype):
-                if col_dtype in CSV_HEX_TYPE_MAP:
+                if is_hashable(col_dtype) and col_dtype in CSV_HEX_TYPE_MAP:
                     col_dtype = CSV_HEX_TYPE_MAP[col_dtype]
                     c_hex_col_indexes.push_back(index)
 
@@ -429,6 +431,25 @@ def read_csv(
         column_names=meta_names
     ))
 
+    if dtype is not None:
+        if isinstance(dtype, abc.Mapping):
+            for k, v in dtype.items():
+                if cudf.api.types.is_categorical_dtype(v):
+                    df._data[str(k)] = df._data[str(k)].astype(v)
+        elif (
+            cudf.api.types.is_scalar(dtype) or
+            isinstance(dtype, (
+                np.dtype, pd.core.dtypes.dtypes.ExtensionDtype, type
+            ))
+        ):
+            if cudf.api.types.is_categorical_dtype(dtype):
+                df = df.astype(dtype)
+        elif isinstance(dtype, abc.Collection):
+            for index, col_dtype in enumerate(dtype):
+                if cudf.api.types.is_categorical_dtype(col_dtype):
+                    col_name = df._data.names[index]
+                    df._data[col_name] = df._data[col_name].astype(col_dtype)
+
     if names is not None and isinstance(names[0], (int)):
         df.columns = [int(x) for x in df._data]
 
@@ -517,14 +538,14 @@ def write_csv(
 
 
 cdef data_type _get_cudf_data_type_from_dtype(object dtype) except +:
-    # TODO: Remove this Error message once the
-    # following issue is fixed:
+    # TODO: Remove this work-around once Dictionary types
+    # in libcudf are fully mapped to categorical columns:
     # https://github.com/rapidsai/cudf/issues/3960
     if cudf.api.types.is_categorical_dtype(dtype):
-        raise NotImplementedError(
-            "CategoricalDtype as dtype is not yet "
-            "supported in CSV reader"
-        )
+        if isinstance(dtype, str):
+            dtype = "str"
+        else:
+            dtype = dtype.categories.dtype
 
     if isinstance(dtype, str):
         if str(dtype) == "date32":

diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
@@ -75,8 +75,7 @@ def make_numpy_mixed_dataframe():
     )
     df["Float"] = np.array([9.001, 8.343, 6, 2.781])
     df["Integer2"] = np.array([2345, 106, 2088, 789277])
-    # Category is not yet supported from libcudf
-    # df["Category"] = np.array(["M", "F", "F", "F"])
+    df["Category"] = np.array(["M", "F", "F", "F"])
     df["String"] = np.array(["Alpha", "Beta", "Gamma", "Delta"])
     df["Boolean"] = np.array([True, False, True, False])
     return df
@@ -270,34 +269,22 @@ def test_csv_reader_mixed_data_delimiter_sep(
 
     gdf1 = read_csv(
         str(fname),
-        # Category is not yet supported from libcudf
-        # names=["1", "2", "3", "4", "5", "6", "7"],
-        # dtype=[
-        #    "int64", "date", "float64", "int64", "category", "str", "bool"
-        # ],
-        names=["1", "2", "3", "4", "5", "6"],
-        dtype=["int64", "date", "float64", "uint64", "str", "bool"],
+        names=["1", "2", "3", "4", "5", "6", "7"],
+        dtype=["int64", "date", "float64", "int64", "category", "str", "bool"],
         dayfirst=True,
         **cudf_arg,
     )
     gdf2 = read_csv(
         str(fname),
-        # Category is not yet supported from libcudf
-        # names=["1", "2", "3", "4", "5", "6", "7"],
-        # dtype=[
-        #    "int64", "date", "float64", "int64", "category", "str", "bool"
-        # ],
-        names=["1", "2", "3", "4", "5", "6"],
-        dtype=["int64", "date", "float64", "uint64", "str", "bool"],
+        names=["1", "2", "3", "4", "5", "6", "7"],
+        dtype=["int64", "date", "float64", "int64", "category", "str", "bool"],
         dayfirst=True,
         **pandas_arg,
     )
 
     pdf = pd.read_csv(
         fname,
-        # Category is not yet supported from libcudf
-        # names=["1", "2", "3", "4", "5", "6", "7"],
-        names=["1", "2", "3", "4", "5", "6"],
+        names=["1", "2", "3", "4", "5", "6", "7"],
         parse_dates=[1],
         dayfirst=True,
         **pandas_arg,
@@ -2046,19 +2033,27 @@ def test_csv_writer_category(df):
     assert expected == actual
 
 
-def test_csv_reader_category_error():
-    # TODO: Remove this test once following
-    # issue is fixed: https://github.com/rapidsai/cudf/issues/3960
-    df = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "category",
+        {"a": "category", "b": "str"},
+        {"b": "category"},
+        {"a": "category"},
+        {"a": pd.CategoricalDtype([1, 2])},
+        {"b": pd.CategoricalDtype([1, 2, 3])},
+        {"b": pd.CategoricalDtype(["b", "a"]), "a": "str"},
+        pd.CategoricalDtype(["a", "b"]),
+    ],
+)
+def test_csv_reader_category(dtype):
+    df = cudf.DataFrame({"a": [1, 2, 3, None], "b": ["a", "b", None, "c"]})
     csv_buf = df.to_csv()
 
-    with pytest.raises(
-        NotImplementedError,
-        match=re.escape(
-            "CategoricalDtype as dtype is not yet " "supported in CSV reader"
-        ),
-    ):
-        cudf.read_csv(StringIO(csv_buf), dtype="category")
+    actual = cudf.read_csv(StringIO(csv_buf), dtype=dtype)
+    expected = pd.read_csv(StringIO(csv_buf), dtype=dtype)
+
+    assert_eq(expected, actual, check_dtype=True)
 
 
 def test_csv_writer_datetime_sep():