From 9678c900a484818b489b723e2568e7b7c0d0b090 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 6 Mar 2024 12:54:09 -1000
Subject: [PATCH] Avoid factorization in MultiIndex.to_pandas (#15150)

This also uncovered a bug in `DataFrame.rename` where the underlying `MultiIndex` `ColumnAccessor` was not being replaced

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15150
---
 python/cudf/cudf/core/dataframe.py       |  6 +++++-
 python/cudf/cudf/core/multiindex.py      | 15 +++++++++++----
 python/cudf/cudf/tests/test_dataframe.py | 16 ++++------------
 python/cudf/cudf/tests/test_dropna.py    | 11 +----------
 4 files changed, 21 insertions(+), 27 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 1dc79127f60..6a4fe346eb1 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -3583,12 +3583,16 @@ def rename(
                 )
 
             if level is not None and isinstance(self.index, MultiIndex):
+                level = self.index._get_level_label(level)
                 out_index = self.index.copy(deep=copy)
-                out_index.get_level_values(level).to_frame().replace(
+                level_values = out_index.get_level_values(level)
+                level_values.to_frame().replace(
                     to_replace=list(index.keys()),
                     value=list(index.values()),
                     inplace=True,
                 )
+                out_index._data[level] = column.as_column(level_values)
+                out_index._compute_levels_and_codes()
                 out = DataFrame(index=out_index)
             else:
                 to_replace = list(index.keys())
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 315a21020a2..019daacddba 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -1577,10 +1577,17 @@ def droplevel(self, level=-1):
     def to_pandas(
         self, *, nullable: bool = False, arrow_type: bool = False
     ) -> pd.MultiIndex:
-        result = self.to_frame(
-            index=False, name=list(range(self.nlevels))
-        ).to_pandas(nullable=nullable, arrow_type=arrow_type)
-        return pd.MultiIndex.from_frame(result, names=self.names)
+        # cudf uses np.iinfo(size_type_dtype).min as missing code
+        # pandas uses -1 as missing code
+        pd_codes = self._codes_frame.replace(np.iinfo(size_type_dtype).min, -1)
+        return pd.MultiIndex(
+            levels=[
+                level.to_pandas(nullable=nullable, arrow_type=arrow_type)
+                for level in self.levels
+            ],
+            codes=[col.values_host for col in pd_codes._columns],
+            names=self.names,
+        )
 
     @classmethod
     @_cudf_nvtx_annotate
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 444a4c60055..e6cf3988d23 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -9332,18 +9332,10 @@ def test_dataframe_setitem_cupy_array():
     assert_eq(pdf, gdf)
 
 
-@pytest.mark.parametrize(
-    "data", [{"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}]
-)
-@pytest.mark.parametrize(
-    "index",
-    [{0: 123, 1: 4, 2: 6}],
-)
-@pytest.mark.parametrize(
-    "level",
-    ["x", 0],
-)
-def test_rename_for_level_MultiIndex_dataframe(data, index, level):
+@pytest.mark.parametrize("level", ["x", 0])
+def test_rename_for_level_MultiIndex_dataframe(level):
+    data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}
+    index = {0: 123, 1: 4, 2: 6}
     pdf = pd.DataFrame(
         data,
         index=pd.MultiIndex.from_tuples([(0, 1, 2), (1, 2, 3), (2, 3, 4)]),
diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py
index f1acd7b4320..c3c8ed922f0 100644
--- a/python/cudf/cudf/tests/test_dropna.py
+++ b/python/cudf/cudf/tests/test_dropna.py
@@ -252,21 +252,12 @@ def test_dropna_index(data, dtype):
 
 @pytest.mark.parametrize("data", [[[1, None, 2], [None, None, 2]]])
 @pytest.mark.parametrize("how", ["all", "any"])
-def test_dropna_multiindex(data, how, request):
+def test_dropna_multiindex(data, how):
     pi = pd.MultiIndex.from_arrays(data)
     gi = cudf.from_pandas(pi)
 
     expect = pi.dropna(how)
     got = gi.dropna(how)
-
-    if how == "all" and "data0" in request.node.callspec.id:
-        request.applymarker(
-            pytest.mark.xfail(
-                reason="pandas NA value np.nan results in float type. "
-                "cuDF correctly retains int type "
-                "(https://github.com/pandas-dev/pandas/issues/44792)"
-            )
-        )
     assert_eq(expect, got)