rapidsai · galipremsagar · Nov 23, 2020 · Nov 23, 2020 · Nov 23, 2020 · Nov 23, 2020
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -97,6 +97,7 @@
 - PR #6780 Move `cudf::cast` tests to separate test file
 - PR #6789 Rename `unary_op` to `unary_operator`
 - PR #6770 Support building decimal columns with Table.TestBuilder
+- PR #6829 Enable workaround to write categorical columns in csv
 - PR #6819 Use CMake 3.19 for RMM when building cuDF jar
 - PR #6833 Use settings.xml if existing for internal build
 

@@ -487,6 +487,15 @@ cpdef write_csv(
 
 
 def _get_cudf_compatible_str_from_dtype(dtype):
+    # TODO: Remove this error once the
+    # following issue is fixed:
+    # https://github.com/rapidsai/cudf/issues/3960
+    if cudf.utils.dtypes.is_categorical_dtype(dtype):
+        raise NotImplementedError(
+            "CategoricalDtype as dtype is not yet "
+            "supported in CSV reader"
+        )
+
     if (
         str(dtype) in cudf.utils.dtypes.ALL_TYPES or
         str(dtype) in {

@@ -4,6 +4,7 @@
 
 from nvtx import annotate
 
+import cudf
 from cudf import _lib as libcudf
 from cudf.utils import ioutils
 from cudf.utils.dtypes import is_scalar
@@ -58,6 +59,14 @@ def read_csv(
     if na_values is not None and is_scalar(na_values):
         na_values = [na_values]
 
+    if keep_default_na is False:
+        # TODO: Remove this error once the following issue is fixed:
+        # https://github.com/rapidsai/cudf/issues/6680
+        raise NotImplementedError(
+            "keep_default_na=False is currently not supported, please refer "
+            "to: https://github.com/rapidsai/cudf/issues/6680"
+        )
+
     return libcudf.csv.read_csv(
         filepath_or_buffer,
         lineterminator=lineterminator,
@@ -127,6 +136,34 @@ def to_csv(
                 "Dataframe doesn't have the labels provided in columns"
             )
 
+    if sep == "-":
+        # TODO: Remove this error once the following issue is fixed:
+        # https://github.com/rapidsai/cudf/issues/6699
+        if any(
+            isinstance(col, cudf.core.column.DatetimeColumn)
+            for col in df._data.columns
+        ):
+            raise ValueError(
+                "sep cannot be '-' when writing a datetime64 dtype to csv, "
+                "refer to: https://github.com/rapidsai/cudf/issues/6699"
+            )
+
+    # TODO: Need to typecast categorical columns to the underlying
+    # categories dtype to write the actual data to csv. Remove this
+    # workaround once the following issue is fixed:
+    # https://github.com/rapidsai/cudf/issues/6661
+    if any(
+        isinstance(col, cudf.core.column.CategoricalColumn)
+        for col in df._data.columns
+    ) or isinstance(df.index, cudf.CategoricalIndex):
+        df = df.copy(deep=False)
+        for col_name, col in df._data.items():
+            if isinstance(col, cudf.core.column.CategoricalColumn):
+                df._data[col_name] = col.astype(col.cat().categories.dtype)
+
+        if isinstance(df.index, cudf.CategoricalIndex):
+            df.index = df.index.astype(df.index.categories.dtype)
+
     rows_per_chunk = chunksize if chunksize else len(df)
 
     if ioutils.is_fsspec_open_file(path_or_buf):

@@ -3,6 +3,7 @@
 import csv
 import gzip
 import os
+import re
 import shutil
 from collections import OrderedDict
 from io import BytesIO, StringIO
@@ -1841,3 +1842,90 @@ def test_csv_reader_datetime_dtypes(dtype):
     actual = cudf.read_csv(StringIO(buf), dtype=dtype)
 
     assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "df",
+    [
+        cudf.DataFrame(
+            {
+                "a": cudf.Series([1, 2, 3, 1, 2], dtype="category"),
+                "b": cudf.Series(["a", "c", "a", "b", "a"], dtype="category"),
+            }
+        ),
+        cudf.DataFrame(
+            {
+                "a": cudf.Series([1.1, 2, 3, 1.1, 2], dtype="category"),
+                "b": cudf.Series(
+                    [None, "c", None, "b", "a"], dtype="category"
+                ),
+            }
+        ),
+        cudf.DataFrame(
+            {
+                "b": cudf.Series(
+                    [1.1, 2, 3, 1.1, 2],
+                    dtype="category",
+                    index=cudf.CategoricalIndex(
+                        ["abc", "def", "ghi", "jkl", "xyz"]
+                    ),
+                )
+            }
+        ),
+    ],
+)
+def test_csv_writer_category(df):
+    pdf = df.to_pandas()
+
+    expected = pdf.to_csv()
+    actual = df.to_csv()
+
+    assert expected == actual
+
+
+def test_csv_reader_category_error():
+    # TODO: Remove this test once the following
+    # issue is fixed: https://github.com/rapidsai/cudf/issues/3960
+    df = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+    csv_buf = df.to_csv()
+
+    with pytest.raises(
+        NotImplementedError,
+        match=re.escape(
+            "CategoricalDtype as dtype is not yet " "supported in CSV reader"
+        ),
+    ):
+        cudf.read_csv(StringIO(csv_buf), dtype="category")
+
+
+def test_csv_reader_keep_default_na_error():
+    # TODO: Remove this test once the following
+    # issue is fixed: https://github.com/rapidsai/cudf/issues/6680
+    df = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+    csv_buf = df.to_csv()
+
+    with pytest.raises(
+        NotImplementedError,
+        match=re.escape(
+            "keep_default_na=False is currently not supported, please refer "
+            "to: https://github.com/rapidsai/cudf/issues/6680"
+        ),
+    ):
+        cudf.read_csv(StringIO(csv_buf), keep_default_na=False)
+
+
+def test_csv_writer_datetime_sep_error():
+    # TODO: Remove this test once the following
+    # issue is fixed: https://github.com/rapidsai/cudf/issues/6699
+    df = cudf.DataFrame(
+        {"a": cudf.Series([22343, 2323423, 234324234], dtype="datetime64[ns]")}
+    )
+
+    with pytest.raises(
+        ValueError,
+        match=re.escape(
+            "sep cannot be '-' when writing a datetime64 dtype to csv, "
+            "refer to: https://github.com/rapidsai/cudf/issues/6699"
+        ),
+    ):
+        df.to_csv(sep="-")