From 9f0c30b022fe085f5388c98e849f67a2a313afa5 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Wed, 29 May 2024 23:56:21 +0000
Subject: [PATCH 1/2] Add a test

---
 python/cudf/cudf/tests/test_dataframe.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 8b18e53d320..d76d5eb8065 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -1984,6 +1984,18 @@ def test_from_arrow(nelem, data_type):
     np.testing.assert_array_equal(s.to_pandas(), gs.to_numpy())
 
 
+def test_from_arrow_chunked_categories():
+    # Verify that categories are properly deduplicated across chunked arrays.
+    indices = pa.array([0, 1, 0, 1, 2, 0, None, 2])
+    dictionary = pa.array(["foo", "bar", "baz"])
+    dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
+    chunked_array = pa.chunked_array([dict_array, dict_array])
+    table = pa.table({"a": chunked_array})
+    df = cudf.DataFrame.from_arrow(table)
+    final_dictionary = df["a"].dtype.categories.to_arrow().to_pylist()
+    assert sorted(final_dictionary) == sorted(dictionary.to_pylist())
+
+
 @pytest.mark.parametrize("nelem", [0, 2, 3, 100, 1000])
 @pytest.mark.parametrize("data_type", dtypes)
 def test_to_arrow(nelem, data_type):

From 185914949422c53132e5970bb3cf0a98f38bd642 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Thu, 30 May 2024 00:10:21 +0000
Subject: [PATCH 2/2] Minimal fix for categoricals

---
 python/cudf/cudf/core/frame.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 7b561906afb..d60c206ac24 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -897,6 +897,13 @@ def from_arrow(cls, data: pa.Table) -> Self:
         # so handling indices and dictionary as two different columns.
         # This needs be removed once we have hooked libcudf dictionary32
         # with categorical.
+        if any(
+            isinstance(x.type, pa.DictionaryType)
+            and isinstance(x, pa.ChunkedArray)
+            for x in data
+        ):
+            data = data.combine_chunks()
+
         dict_indices = {}
         dict_dictionaries = {}
         dict_ordered = {}
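
A short standalone sketch of why the fix works, for reviewers unfamiliar with this corner of the Arrow API: pa.Table.combine_chunks() concatenates each column's chunks into a single chunk, and for dictionary columns that concatenation unifies the per-chunk dictionaries, so from_arrow only ever sees one dictionary per column. The printed values below are what I expect from pyarrow's dictionary unification, not output captured from this PR's CI.

import pyarrow as pa

# Build the same chunked dictionary column as the new test: two chunks,
# each carrying its own copy of the dictionary ["foo", "bar", "baz"].
indices = pa.array([0, 1, 0, 1, 2, 0, None, 2])
dictionary = pa.array(["foo", "bar", "baz"])
dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
table = pa.table({"a": pa.chunked_array([dict_array, dict_array])})

print(table.column("a").num_chunks)      # 2 -> one dictionary per chunk

# combine_chunks() concatenates the chunks; dictionary unification during
# concatenation leaves a single chunk with a single deduplicated dictionary.
combined = table.combine_chunks()
print(combined.column("a").num_chunks)               # 1
print(combined.column("a").chunk(0).dictionary)      # ["foo", "bar", "baz"]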