Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Enable workaround to write categorical columns in csv #6829

Merged
merged 5 commits into from
Nov 23, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@
- PR #6780 Move `cudf::cast` tests to separate test file
- PR #6789 Rename `unary_op` to `unary_operator`
- PR #6770 Support building decimal columns with Table.TestBuilder
- PR #6829 Enable workaround to write categorical columns in csv
- PR #6819 Use CMake 3.19 for RMM when building cuDF jar
- PR #6833 Use settings.xml if existing for internal build

Expand Down
9 changes: 9 additions & 0 deletions python/cudf/cudf/_lib/csv.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -487,6 +487,15 @@ cpdef write_csv(


def _get_cudf_compatible_str_from_dtype(dtype):
# TODO: Remove this Error message once the
# following issue is fixed:
# https://github.com/rapidsai/cudf/issues/3960
if cudf.utils.dtypes.is_categorical_dtype(dtype):
raise NotImplementedError(
"CategoricalDtype as dtype is not yet "
"supported in CSV reader"
)

if (
str(dtype) in cudf.utils.dtypes.ALL_TYPES or
str(dtype) in {
Expand Down
37 changes: 37 additions & 0 deletions python/cudf/cudf/io/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from nvtx import annotate

import cudf
from cudf import _lib as libcudf
from cudf.utils import ioutils
from cudf.utils.dtypes import is_scalar
Expand Down Expand Up @@ -58,6 +59,14 @@ def read_csv(
if na_values is not None and is_scalar(na_values):
na_values = [na_values]

if keep_default_na is False:
# TODO: Remove this error once the following issue is fixed:
# https://github.com/rapidsai/cudf/issues/6680
raise NotImplementedError(
"keep_default_na=False is currently not supported, please refer "
"to: https://github.com/rapidsai/cudf/issues/6680"
)

vuule marked this conversation as resolved.
Show resolved Hide resolved
return libcudf.csv.read_csv(
filepath_or_buffer,
lineterminator=lineterminator,
Expand Down Expand Up @@ -127,6 +136,34 @@ def to_csv(
"Dataframe doesn't have the labels provided in columns"
)

if sep == "-":
# TODO: Remove this error once following issue is fixed:
# https://github.com/rapidsai/cudf/issues/6699
if any(
isinstance(col, cudf.core.column.DatetimeColumn)
for col in df._data.columns
):
raise ValueError(
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
"sep cannot be '-' when writing a datetime64 dtype to csv, "
"refer to: https://github.com/rapidsai/cudf/issues/6699"
)

# TODO: Need to typecast categorical columns to the underlying
# categories dtype to write the actual data to csv. Remove this
# workaround once following issue is fixed:
# https://github.com/rapidsai/cudf/issues/6661
if any(
isinstance(col, cudf.core.column.CategoricalColumn)
for col in df._data.columns
) or isinstance(df.index, cudf.CategoricalIndex):
df = df.copy(deep=False)
for col_name, col in df._data.items():
if isinstance(col, cudf.core.column.CategoricalColumn):
df._data[col_name] = col.astype(col.cat().categories.dtype)

if isinstance(df.index, cudf.CategoricalIndex):
df.index = df.index.astype(df.index.categories.dtype)

rows_per_chunk = chunksize if chunksize else len(df)

if ioutils.is_fsspec_open_file(path_or_buf):
Expand Down
88 changes: 88 additions & 0 deletions python/cudf/cudf/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import csv
import gzip
import os
import re
import shutil
from collections import OrderedDict
from io import BytesIO, StringIO
Expand Down Expand Up @@ -1841,3 +1842,90 @@ def test_csv_reader_datetime_dtypes(dtype):
actual = cudf.read_csv(StringIO(buf), dtype=dtype)

assert_eq(expected, actual)


@pytest.mark.parametrize(
    "df",
    [
        # Numeric and string categorical columns.
        cudf.DataFrame(
            {
                "a": cudf.Series([1, 2, 3, 1, 2], dtype="category"),
                "b": cudf.Series(["a", "c", "a", "b", "a"], dtype="category"),
            }
        ),
        # Categorical columns where the string column contains None entries.
        cudf.DataFrame(
            {
                "a": cudf.Series([1.1, 2, 3, 1.1, 2], dtype="category"),
                "b": cudf.Series(
                    [None, "c", None, "b", "a"], dtype="category"
                ),
            }
        ),
        # Categorical column whose index is a CategoricalIndex.
        cudf.DataFrame(
            {
                "b": cudf.Series(
                    [1.1, 2, 3, 1.1, 2],
                    dtype="category",
                    index=cudf.CategoricalIndex(
                        ["abc", "def", "ghi", "jkl", "xyz"]
                    ),
                )
            }
        ),
    ],
)
def test_csv_writer_category(df):
    """Check that cudf's CSV text for categorical data equals pandas'."""
    # Convert to pandas first and serialise it, then serialise the cudf
    # frame; both calls return the CSV as a string because no path is given.
    pandas_csv = df.to_pandas().to_csv()
    cudf_csv = df.to_csv()

    assert pandas_csv == cudf_csv


def test_csv_reader_category_error():
    """Reading a CSV with dtype="category" must raise NotImplementedError."""
    # TODO: Remove this test once the following issue is fixed:
    # https://github.com/rapidsai/cudf/issues/3960
    message = (
        "CategoricalDtype as dtype is not yet "
        "supported in CSV reader"
    )
    source = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
    csv_text = source.to_csv()

    # re.escape makes pytest match the message literally, not as a regex.
    with pytest.raises(NotImplementedError, match=re.escape(message)):
        cudf.read_csv(StringIO(csv_text), dtype="category")


def test_csv_reader_keep_default_na_error():
    """Reading a CSV with keep_default_na=False must raise
    NotImplementedError."""
    # TODO: Remove this test once the following issue is fixed:
    # https://github.com/rapidsai/cudf/issues/6680
    message = (
        "keep_default_na=False is currently not supported, please refer "
        "to: https://github.com/rapidsai/cudf/issues/6680"
    )
    source = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
    csv_text = source.to_csv()

    # re.escape makes pytest match the message literally, not as a regex.
    with pytest.raises(NotImplementedError, match=re.escape(message)):
        cudf.read_csv(StringIO(csv_text), keep_default_na=False)


def test_csv_writer_datetime_sep_error():
    """Writing a datetime64 column with sep="-" must raise ValueError."""
    # TODO: Remove this test once the following issue is fixed:
    # https://github.com/rapidsai/cudf/issues/6699
    message = (
        "sep cannot be '-' when writing a datetime64 dtype to csv, "
        "refer to: https://github.com/rapidsai/cudf/issues/6699"
    )
    timestamps = cudf.Series(
        [22343, 2323423, 234324234], dtype="datetime64[ns]"
    )
    frame = cudf.DataFrame({"a": timestamps})

    # re.escape makes pytest match the message literally, not as a regex.
    with pytest.raises(ValueError, match=re.escape(message)):
        frame.to_csv(sep="-")