From 9f0c30b022fe085f5388c98e849f67a2a313afa5 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Wed, 29 May 2024 23:56:21 +0000
Subject: [PATCH 1/2] Add a test

---
 python/cudf/cudf/tests/test_dataframe.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 8b18e53d320..d76d5eb8065 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -1984,6 +1984,18 @@ def test_from_arrow(nelem, data_type):
     np.testing.assert_array_equal(s.to_pandas(), gs.to_numpy())
 
 
+def test_from_arrow_chunked_categories():
+    # Verify that categories are properly deduplicated across chunked arrays.
+    indices = pa.array([0, 1, 0, 1, 2, 0, None, 2])
+    dictionary = pa.array(["foo", "bar", "baz"])
+    dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
+    chunked_array = pa.chunked_array([dict_array, dict_array])
+    table = pa.table({"a": chunked_array})
+    df = cudf.DataFrame.from_arrow(table)
+    final_dictionary = df["a"].dtype.categories.to_arrow().to_pylist()
+    assert sorted(final_dictionary) == sorted(dictionary.to_pylist())
+
+
 @pytest.mark.parametrize("nelem", [0, 2, 3, 100, 1000])
 @pytest.mark.parametrize("data_type", dtypes)
 def test_to_arrow(nelem, data_type):

From 185914949422c53132e5970bb3cf0a98f38bd642 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Thu, 30 May 2024 00:10:21 +0000
Subject: [PATCH 2/2] Minimal fix for categoricals

---
 python/cudf/cudf/core/frame.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 7b561906afb..d60c206ac24 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -897,6 +897,13 @@ def from_arrow(cls, data: pa.Table) -> Self:
         # so handling indices and dictionary as two different columns.
         # This needs be removed once we have hooked libcudf dictionary32
         # with categorical.
+        if any(
+            isinstance(x.type, pa.DictionaryType)
+            and isinstance(x, pa.ChunkedArray)
+            for x in data
+        ):
+            data = data.combine_chunks()
+
         dict_indices = {}
         dict_dictionaries = {}
         dict_ordered = {}
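
A short standalone sketch of why the fix works, for reviewers unfamiliar with this corner of the Arrow API: pa.Table.combine_chunks() concatenates each column's chunks into a single chunk, and for dictionary columns that concatenation unifies the per-chunk dictionaries, so from_arrow only ever sees one dictionary per column. The printed values below are what I expect from pyarrow's dictionary unification, not output captured from this PR's CI.

import pyarrow as pa

# Build the same chunked dictionary column as the new test: two chunks,
# each carrying its own copy of the dictionary ["foo", "bar", "baz"].
indices = pa.array([0, 1, 0, 1, 2, 0, None, 2])
dictionary = pa.array(["foo", "bar", "baz"])
dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
table = pa.table({"a": pa.chunked_array([dict_array, dict_array])})

print(table.column("a").num_chunks)      # 2 -> one dictionary per chunk

# combine_chunks() concatenates the chunks; dictionary unification during
# concatenation leaves a single chunk with a single deduplicated dictionary.
combined = table.combine_chunks()
print(combined.column("a").num_chunks)               # 1
print(combined.column("a").chunk(0).dictionary)      # ["foo", "bar", "baz"]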