From 8c804d4d8c746a66187b234168c4034dd522f47d Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 18 Jan 2023 11:17:40 -0800 Subject: [PATCH 1/4] enable category dtype in read_csv --- python/cudf/cudf/_lib/csv.pyx | 41 ++++++++++++++++++++++-------- python/cudf/cudf/tests/test_csv.py | 28 ++++++++++++-------- 2 files changed, 49 insertions(+), 20 deletions(-) diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 64c62be3e00..1f66ed0ee83 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.map cimport map @@ -49,6 +49,8 @@ from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table from pyarrow.lib import NativeFile +from cudf.api.types import is_hashable + ctypedef int32_t underlying_type_t_compression @@ -248,7 +250,7 @@ cdef csv_reader_options make_csv_reader_options( if isinstance(dtype, abc.Mapping): for k, v in dtype.items(): col_type = v - if v in CSV_HEX_TYPE_MAP: + if is_hashable(v) and v in CSV_HEX_TYPE_MAP: col_type = CSV_HEX_TYPE_MAP[v] c_hex_col_names.push_back(str(k).encode()) @@ -264,7 +266,7 @@ cdef csv_reader_options make_csv_reader_options( )) ): c_dtypes_list.reserve(1) - if dtype in CSV_HEX_TYPE_MAP: + if is_hashable(dtype) and dtype in CSV_HEX_TYPE_MAP: dtype = CSV_HEX_TYPE_MAP[dtype] c_hex_col_indexes.push_back(0) @@ -276,7 +278,7 @@ cdef csv_reader_options make_csv_reader_options( elif isinstance(dtype, abc.Collection): c_dtypes_list.reserve(len(dtype)) for index, col_dtype in enumerate(dtype): - if col_dtype in CSV_HEX_TYPE_MAP: + if is_hashable(col_dtype) and col_dtype in CSV_HEX_TYPE_MAP: col_dtype = CSV_HEX_TYPE_MAP[col_dtype] c_hex_col_indexes.push_back(index) @@ -429,6 +431,25 @@ def read_csv( column_names=meta_names )) + if dtype is not None: + if isinstance(dtype, abc.Mapping): + for k, v in dtype.items(): + if cudf.api.types.is_categorical_dtype(v): + df._data[str(k)] = df._data[str(k)].astype(v) + elif ( + cudf.api.types.is_scalar(dtype) or + isinstance(dtype, ( + np.dtype, pd.core.dtypes.dtypes.ExtensionDtype, type + )) + ): + if cudf.api.types.is_categorical_dtype(dtype): + df = df.astype(dtype) + elif isinstance(dtype, abc.Collection): + for index, col_dtype in enumerate(dtype): + if cudf.api.types.is_categorical_dtype(col_dtype): + col_name = df._data.names[index] + df._data[col_name] = df._data[col_name].astype(col_dtype) + if names is not None and isinstance(names[0], (int)): df.columns = [int(x) for x in df._data] @@ -517,14 +538,14 @@ def write_csv( cdef data_type _get_cudf_data_type_from_dtype(object dtype) except +: - # TODO: Remove this Error message once the - # following issue is fixed: + # TODO: Remove this work-around Dictionary types + # in libcudf are fully mapped to categorical columns: # https://github.com/rapidsai/cudf/issues/3960 if cudf.api.types.is_categorical_dtype(dtype): - raise NotImplementedError( - "CategoricalDtype as dtype is not yet " - "supported in CSV reader" - ) + if isinstance(dtype, str): + dtype = "str" + else: + dtype = dtype.categories.dtype if isinstance(dtype, str): if str(dtype) == "date32": diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 249c4e26a86..5cde147375c 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -2046,19 +2046,27 @@ def test_csv_writer_category(df): assert expected == actual -def test_csv_reader_category_error(): - # TODO: Remove this test once following - # issue is fixed: https://github.com/rapidsai/cudf/issues/3960 +@pytest.mark.parametrize( + "dtype", + [ + "category", + {"a": "category", "b": "str"}, + {"b": "category"}, + {"a": "category"}, + {"a": pd.CategoricalDtype([1, 2])}, + {"b": pd.CategoricalDtype([1, 2, 3])}, + {"b": pd.CategoricalDtype(["b", "a"]), "a": "str"}, + pd.CategoricalDtype(["a", "b"]), + ], +) +def test_csv_reader_category(dtype): df = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) csv_buf = df.to_csv() - with pytest.raises( - NotImplementedError, - match=re.escape( - "CategoricalDtype as dtype is not yet " "supported in CSV reader" - ), - ): - cudf.read_csv(StringIO(csv_buf), dtype="category") + actual = cudf.read_csv(StringIO(csv_buf), dtype=dtype) + expected = pd.read_csv(StringIO(csv_buf), dtype=dtype) + + assert_eq(expected, actual) def test_csv_writer_datetime_sep(): From 3647c1503988b787b00adc52a0d1c8d00f4457af Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 18 Jan 2023 11:44:30 -0800 Subject: [PATCH 2/4] enable more tests --- python/cudf/cudf/tests/test_csv.py | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 5cde147375c..2badf6482d6 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -75,8 +75,7 @@ def make_numpy_mixed_dataframe(): ) df["Float"] = np.array([9.001, 8.343, 6, 2.781]) df["Integer2"] = np.array([2345, 106, 2088, 789277]) - # Category is not yet supported from libcudf - # df["Category"] = np.array(["M", "F", "F", "F"]) + df["Category"] = np.array(["M", "F", "F", "F"]) df["String"] = np.array(["Alpha", "Beta", "Gamma", "Delta"]) df["Boolean"] = np.array([True, False, True, False]) return df @@ -270,34 +269,22 @@ def test_csv_reader_mixed_data_delimiter_sep( gdf1 = read_csv( str(fname), - # Category is not yet supported from libcudf - # names=["1", "2", "3", "4", "5", "6", "7"], - # dtype=[ - # "int64", "date", "float64", "int64", "category", "str", "bool" - # ], - names=["1", "2", "3", "4", "5", "6"], - dtype=["int64", "date", "float64", "uint64", "str", "bool"], + names=["1", "2", "3", "4", "5", "6", "7"], + dtype=["int64", "date", "float64", "int64", "category", "str", "bool"], dayfirst=True, **cudf_arg, ) gdf2 = read_csv( str(fname), - # Category is not yet supported from libcudf - # names=["1", "2", "3", "4", "5", "6", "7"], - # dtype=[ - # "int64", "date", "float64", "int64", "category", "str", "bool" - # ], - names=["1", "2", "3", "4", "5", "6"], - dtype=["int64", "date", "float64", "uint64", "str", "bool"], + names=["1", "2", "3", "4", "5", "6", "7"], + dtype=["int64", "date", "float64", "int64", "category", "str", "bool"], dayfirst=True, **pandas_arg, ) pdf = pd.read_csv( fname, - # Category is not yet supported from libcudf - # names=["1", "2", "3", "4", "5", "6", "7"], - names=["1", "2", "3", "4", "5", "6"], + names=["1", "2", "3", "4", "5", "6", "7"], parse_dates=[1], dayfirst=True, **pandas_arg, From 157326f287d7f16978b45dd4e9c959619a1cd636 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 18 Jan 2023 18:36:42 -0800 Subject: [PATCH 3/4] add nulls --- python/cudf/cudf/tests/test_csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 2badf6482d6..1fab39e2b23 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -2047,7 +2047,7 @@ def test_csv_writer_category(df): ], ) def test_csv_reader_category(dtype): - df = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + df = cudf.DataFrame({"a": [1, 2, 3, None], "b": ["a", "b", None, "c"]}) csv_buf = df.to_csv() actual = cudf.read_csv(StringIO(csv_buf), dtype=dtype) From ba6b16181ea9f41f1b45d0510fe1b71ba2904031 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 18 Jan 2023 18:37:25 -0800 Subject: [PATCH 4/4] update --- python/cudf/cudf/tests/test_csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 1fab39e2b23..6066cd3b03e 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -2053,7 +2053,7 @@ def test_csv_reader_category(dtype): actual = cudf.read_csv(StringIO(csv_buf), dtype=dtype) expected = pd.read_csv(StringIO(csv_buf), dtype=dtype) - assert_eq(expected, actual) + assert_eq(expected, actual, check_dtype=True) def test_csv_writer_datetime_sep():