From 8c804d4d8c746a66187b234168c4034dd522f47d Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 18 Jan 2023 11:17:40 -0800
Subject: [PATCH 1/4] enable category dtype in read_csv

---
 python/cudf/cudf/_lib/csv.pyx      | 41 ++++++++++++++++++++++--------
 python/cudf/cudf/tests/test_csv.py | 28 ++++++++++++--------
 2 files changed, 49 insertions(+), 20 deletions(-)

diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
index 64c62be3e00..1f66ed0ee83 100644
--- a/python/cudf/cudf/_lib/csv.pyx
+++ b/python/cudf/cudf/_lib/csv.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
 from libcpp cimport bool
 from libcpp.map cimport map
@@ -49,6 +49,8 @@ from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
 
 from pyarrow.lib import NativeFile
 
+from cudf.api.types import is_hashable
+
 ctypedef int32_t underlying_type_t_compression
 
 
@@ -248,7 +250,7 @@ cdef csv_reader_options make_csv_reader_options(
         if isinstance(dtype, abc.Mapping):
             for k, v in dtype.items():
                 col_type = v
-                if v in CSV_HEX_TYPE_MAP:
+                if is_hashable(v) and v in CSV_HEX_TYPE_MAP:
                     col_type = CSV_HEX_TYPE_MAP[v]
                     c_hex_col_names.push_back(str(k).encode())
 
@@ -264,7 +266,7 @@ cdef csv_reader_options make_csv_reader_options(
             ))
         ):
             c_dtypes_list.reserve(1)
-            if dtype in CSV_HEX_TYPE_MAP:
+            if is_hashable(dtype) and dtype in CSV_HEX_TYPE_MAP:
                 dtype = CSV_HEX_TYPE_MAP[dtype]
                 c_hex_col_indexes.push_back(0)
 
@@ -276,7 +278,7 @@ cdef csv_reader_options make_csv_reader_options(
         elif isinstance(dtype, abc.Collection):
             c_dtypes_list.reserve(len(dtype))
             for index, col_dtype in enumerate(dtype):
-                if col_dtype in CSV_HEX_TYPE_MAP:
+                if is_hashable(col_dtype) and col_dtype in CSV_HEX_TYPE_MAP:
                     col_dtype = CSV_HEX_TYPE_MAP[col_dtype]
                     c_hex_col_indexes.push_back(index)
 
@@ -429,6 +431,25 @@ def read_csv(
         column_names=meta_names
     ))
 
+    if dtype is not None:
+        if isinstance(dtype, abc.Mapping):
+            for k, v in dtype.items():
+                if cudf.api.types.is_categorical_dtype(v):
+                    df._data[str(k)] = df._data[str(k)].astype(v)
+        elif (
+            cudf.api.types.is_scalar(dtype) or
+            isinstance(dtype, (
+                np.dtype, pd.core.dtypes.dtypes.ExtensionDtype, type
+            ))
+        ):
+            if cudf.api.types.is_categorical_dtype(dtype):
+                df = df.astype(dtype)
+        elif isinstance(dtype, abc.Collection):
+            for index, col_dtype in enumerate(dtype):
+                if cudf.api.types.is_categorical_dtype(col_dtype):
+                    col_name = df._data.names[index]
+                    df._data[col_name] = df._data[col_name].astype(col_dtype)
+
     if names is not None and isinstance(names[0], (int)):
         df.columns = [int(x) for x in df._data]
 
@@ -517,14 +538,14 @@ def write_csv(
 
 
 cdef data_type _get_cudf_data_type_from_dtype(object dtype) except +:
-    # TODO: Remove this Error message once the
-    # following issue is fixed:
+    # TODO: Remove this work-around once Dictionary types
+    # in libcudf are fully mapped to categorical columns:
     # https://github.com/rapidsai/cudf/issues/3960
     if cudf.api.types.is_categorical_dtype(dtype):
-        raise NotImplementedError(
-            "CategoricalDtype as dtype is not yet "
-            "supported in CSV reader"
-        )
+        if isinstance(dtype, str):
+            dtype = "str"
+        else:
+            dtype = dtype.categories.dtype
 
     if isinstance(dtype, str):
         if str(dtype) == "date32":
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 249c4e26a86..5cde147375c 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -2046,19 +2046,27 @@ def test_csv_writer_category(df):
     assert expected == actual
 
 
-def test_csv_reader_category_error():
-    # TODO: Remove this test once following
-    # issue is fixed: https://github.com/rapidsai/cudf/issues/3960
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "category",
+        {"a": "category", "b": "str"},
+        {"b": "category"},
+        {"a": "category"},
+        {"a": pd.CategoricalDtype([1, 2])},
+        {"b": pd.CategoricalDtype([1, 2, 3])},
+        {"b": pd.CategoricalDtype(["b", "a"]), "a": "str"},
+        pd.CategoricalDtype(["a", "b"]),
+    ],
+)
+def test_csv_reader_category(dtype):
     df = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
     csv_buf = df.to_csv()
 
-    with pytest.raises(
-        NotImplementedError,
-        match=re.escape(
-            "CategoricalDtype as dtype is not yet " "supported in CSV reader"
-        ),
-    ):
-        cudf.read_csv(StringIO(csv_buf), dtype="category")
+    actual = cudf.read_csv(StringIO(csv_buf), dtype=dtype)
+    expected = pd.read_csv(StringIO(csv_buf), dtype=dtype)
+
+    assert_eq(expected, actual)
 
 
 def test_csv_writer_datetime_sep():

From 3647c1503988b787b00adc52a0d1c8d00f4457af Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 18 Jan 2023 11:44:30 -0800
Subject: [PATCH 2/4] enable category columns in mixed-data CSV reader tests

---
 python/cudf/cudf/tests/test_csv.py | 25 ++++++-------------------
 1 file changed, 6 insertions(+), 19 deletions(-)

diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 5cde147375c..2badf6482d6 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -75,8 +75,7 @@ def make_numpy_mixed_dataframe():
     )
     df["Float"] = np.array([9.001, 8.343, 6, 2.781])
     df["Integer2"] = np.array([2345, 106, 2088, 789277])
-    # Category is not yet supported from libcudf
-    # df["Category"] = np.array(["M", "F", "F", "F"])
+    df["Category"] = np.array(["M", "F", "F", "F"])
     df["String"] = np.array(["Alpha", "Beta", "Gamma", "Delta"])
     df["Boolean"] = np.array([True, False, True, False])
     return df
@@ -270,34 +269,22 @@ def test_csv_reader_mixed_data_delimiter_sep(
 
     gdf1 = read_csv(
         str(fname),
-        # Category is not yet supported from libcudf
-        # names=["1", "2", "3", "4", "5", "6", "7"],
-        # dtype=[
-        #    "int64", "date", "float64", "int64", "category", "str", "bool"
-        # ],
-        names=["1", "2", "3", "4", "5", "6"],
-        dtype=["int64", "date", "float64", "uint64", "str", "bool"],
+        names=["1", "2", "3", "4", "5", "6", "7"],
+        dtype=["int64", "date", "float64", "int64", "category", "str", "bool"],
         dayfirst=True,
         **cudf_arg,
     )
     gdf2 = read_csv(
         str(fname),
-        # Category is not yet supported from libcudf
-        # names=["1", "2", "3", "4", "5", "6", "7"],
-        # dtype=[
-        #    "int64", "date", "float64", "int64", "category", "str", "bool"
-        # ],
-        names=["1", "2", "3", "4", "5", "6"],
-        dtype=["int64", "date", "float64", "uint64", "str", "bool"],
+        names=["1", "2", "3", "4", "5", "6", "7"],
+        dtype=["int64", "date", "float64", "int64", "category", "str", "bool"],
         dayfirst=True,
         **pandas_arg,
     )
 
     pdf = pd.read_csv(
         fname,
-        # Category is not yet supported from libcudf
-        # names=["1", "2", "3", "4", "5", "6", "7"],
-        names=["1", "2", "3", "4", "5", "6"],
+        names=["1", "2", "3", "4", "5", "6", "7"],
         parse_dates=[1],
         dayfirst=True,
         **pandas_arg,

From 157326f287d7f16978b45dd4e9c959619a1cd636 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 18 Jan 2023 18:36:42 -0800
Subject: [PATCH 3/4] add nulls to category reader test data

---
 python/cudf/cudf/tests/test_csv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 2badf6482d6..1fab39e2b23 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -2047,7 +2047,7 @@ def test_csv_writer_category(df):
     ],
 )
 def test_csv_reader_category(dtype):
-    df = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+    df = cudf.DataFrame({"a": [1, 2, 3, None], "b": ["a", "b", None, "c"]})
     csv_buf = df.to_csv()
 
     actual = cudf.read_csv(StringIO(csv_buf), dtype=dtype)

From ba6b16181ea9f41f1b45d0510fe1b71ba2904031 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 18 Jan 2023 18:37:25 -0800
Subject: [PATCH 4/4] check dtype in category reader test

---
 python/cudf/cudf/tests/test_csv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 1fab39e2b23..6066cd3b03e 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -2053,7 +2053,7 @@ def test_csv_reader_category(dtype):
     actual = cudf.read_csv(StringIO(csv_buf), dtype=dtype)
     expected = pd.read_csv(StringIO(csv_buf), dtype=dtype)
 
-    assert_eq(expected, actual)
+    assert_eq(expected, actual, check_dtype=True)
 
 
 def test_csv_writer_datetime_sep():