diff --git a/CHANGELOG.md b/CHANGELOG.md index 59070fed325..5f7cc3a421f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -97,6 +97,7 @@ - PR #6780 Move `cudf::cast` tests to separate test file - PR #6789 Rename `unary_op` to `unary_operator` - PR #6770 Support building decimal columns with Table.TestBuilder +- PR #6829 Enable workaround to write categorical columns in csv - PR #6819 Use CMake 3.19 for RMM when building cuDF jar - PR #6833 Use settings.xml if existing for internal build diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 7dc05f9961a..f3cde6d449a 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -487,6 +487,15 @@ cpdef write_csv( def _get_cudf_compatible_str_from_dtype(dtype): + # TODO: Remove this Error message once the + # following issue is fixed: + # https://github.com/rapidsai/cudf/issues/3960 + if cudf.utils.dtypes.is_categorical_dtype(dtype): + raise NotImplementedError( + "CategoricalDtype as dtype is not yet " + "supported in CSV reader" + ) + if ( str(dtype) in cudf.utils.dtypes.ALL_TYPES or str(dtype) in { diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index c4bcaa575f5..53273b42997 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -4,6 +4,7 @@ from nvtx import annotate +import cudf from cudf import _lib as libcudf from cudf.utils import ioutils from cudf.utils.dtypes import is_scalar @@ -58,6 +59,14 @@ def read_csv( if na_values is not None and is_scalar(na_values): na_values = [na_values] + if keep_default_na is False: + # TODO: Remove this error once the following issue is fixed: + # https://github.com/rapidsai/cudf/issues/6680 + raise NotImplementedError( + "keep_default_na=False is currently not supported, please refer " + "to: https://github.com/rapidsai/cudf/issues/6680" + ) + return libcudf.csv.read_csv( filepath_or_buffer, lineterminator=lineterminator, @@ -127,6 +136,34 @@ def to_csv( "Dataframe doesn't have the labels 
provided in columns" ) + if sep == "-": + # TODO: Remove this error once following issue is fixed: + # https://github.com/rapidsai/cudf/issues/6699 + if any( + isinstance(col, cudf.core.column.DatetimeColumn) + for col in df._data.columns + ): + raise ValueError( + "sep cannot be '-' when writing a datetime64 dtype to csv, " + "refer to: https://github.com/rapidsai/cudf/issues/6699" + ) + + # TODO: Need to typecast categorical columns to the underlying + # categories dtype to write the actual data to csv. Remove this + # workaround once following issue is fixed: + # https://github.com/rapidsai/cudf/issues/6661 + if any( + isinstance(col, cudf.core.column.CategoricalColumn) + for col in df._data.columns + ) or isinstance(df.index, cudf.CategoricalIndex): + df = df.copy(deep=False) + for col_name, col in df._data.items(): + if isinstance(col, cudf.core.column.CategoricalColumn): + df._data[col_name] = col.astype(col.cat().categories.dtype) + + if isinstance(df.index, cudf.CategoricalIndex): + df.index = df.index.astype(df.index.categories.dtype) + rows_per_chunk = chunksize if chunksize else len(df) if ioutils.is_fsspec_open_file(path_or_buf): diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index d1d79a2292d..488fc014c64 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -3,6 +3,7 @@ import csv import gzip import os +import re import shutil from collections import OrderedDict from io import BytesIO, StringIO @@ -1841,3 +1842,90 @@ def test_csv_reader_datetime_dtypes(dtype): actual = cudf.read_csv(StringIO(buf), dtype=dtype) assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "df", + [ + cudf.DataFrame( + { + "a": cudf.Series([1, 2, 3, 1, 2], dtype="category"), + "b": cudf.Series(["a", "c", "a", "b", "a"], dtype="category"), + } + ), + cudf.DataFrame( + { + "a": cudf.Series([1.1, 2, 3, 1.1, 2], dtype="category"), + "b": cudf.Series( + [None, "c", None, "b", "a"], dtype="category" + 
), + } + ), + cudf.DataFrame( + { + "b": cudf.Series( + [1.1, 2, 3, 1.1, 2], + dtype="category", + index=cudf.CategoricalIndex( + ["abc", "def", "ghi", "jkl", "xyz"] + ), + ) + } + ), + ], +) +def test_csv_writer_category(df): + pdf = df.to_pandas() + + expected = pdf.to_csv() + actual = df.to_csv() + + assert expected == actual + + +def test_csv_reader_category_error(): + # TODO: Remove this test once the following + # issue is fixed: https://github.com/rapidsai/cudf/issues/3960 + df = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + csv_buf = df.to_csv() + + with pytest.raises( + NotImplementedError, + match=re.escape( + "CategoricalDtype as dtype is not yet " "supported in CSV reader" + ), + ): + cudf.read_csv(StringIO(csv_buf), dtype="category") + + +def test_csv_reader_keep_default_na_error(): + # TODO: Remove this test once the following + # issue is fixed: https://github.com/rapidsai/cudf/issues/6680 + df = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + csv_buf = df.to_csv() + + with pytest.raises( + NotImplementedError, + match=re.escape( + "keep_default_na=False is currently not supported, please refer " + "to: https://github.com/rapidsai/cudf/issues/6680" + ), + ): + cudf.read_csv(StringIO(csv_buf), keep_default_na=False) + + +def test_csv_writer_datetime_sep_error(): + # TODO: Remove this test once the following + # issue is fixed: https://github.com/rapidsai/cudf/issues/6699 + df = cudf.DataFrame( + {"a": cudf.Series([22343, 2323423, 234324234], dtype="datetime64[ns]")} + ) + + with pytest.raises( + ValueError, + match=re.escape( + "sep cannot be '-' when writing a datetime64 dtype to csv, " + "refer to: https://github.com/rapidsai/cudf/issues/6699" + ), + ): + df.to_csv(sep="-")