From 637334b9fbac1cc58887190d6a3d4cd0d07ce05e Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 15 May 2024 18:50:57 -0700
Subject: [PATCH] Fix id_vars and value_vars not accepting string scalars

---
 python/cudf/cudf/core/reshape.py       | 27 +++++++++++++-------------
 python/cudf/cudf/tests/test_reshape.py | 27 ++++++++++++++++++--------
 2 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index 26d91bed173..0b44ab58f30 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -2,10 +2,8 @@
 
 import itertools
 import warnings
-from collections import abc
 from typing import Dict, Optional
 
-import cupy
 import numpy as np
 import pandas as pd
 
@@ -590,7 +588,7 @@ def melt(
 
     # id_vars
     if id_vars is not None:
-        if not isinstance(id_vars, abc.Sequence):
+        if cudf.api.types.is_scalar(id_vars):
             id_vars = [id_vars]
         id_vars = list(id_vars)
         missing = set(id_vars) - set(frame._column_names)
@@ -604,7 +602,7 @@ def melt(
 
     # value_vars
     if value_vars is not None:
-        if not isinstance(value_vars, abc.Sequence):
+        if cudf.api.types.is_scalar(value_vars):
             value_vars = [value_vars]
         value_vars = list(value_vars)
         missing = set(value_vars) - set(frame._column_names)
@@ -658,21 +656,22 @@ def _tile(A, reps):
     # Step 2: add variable
     nval = len(value_vars)
     dtype = min_unsigned_type(nval)
-    temp = cudf.Series(cupy.repeat(cupy.arange(nval, dtype=dtype), N))
 
     if not var_name:
         var_name = "variable"
 
-    mdata[var_name] = cudf.Series(
-        cudf.core.column.build_categorical_column(
-            categories=value_vars,
-            codes=temp._column,
-            mask=temp._column.base_mask,
-            size=temp._column.size,
-            offset=temp._column.offset,
-            ordered=False,
+    if not value_vars:
+        # TODO: Use frame._data.label_dtype when it's more consistently set
+        var_data = cudf.Series(
+            value_vars, dtype=frame._data.to_pandas_index().dtype
         )
-    )
+    else:
+        var_data = (
+            cudf.Series(value_vars)
+            .take(np.repeat(np.arange(nval, dtype=dtype), N))
+            .reset_index(drop=True)
+        )
+    mdata[var_name] = var_data
 
     # Step 3: add values
     mdata[value_name] = cudf.Series._concat(
diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py
index d618669755d..daa1e70808f 100644
--- a/python/cudf/cudf/tests/test_reshape.py
+++ b/python/cudf/cudf/tests/test_reshape.py
@@ -8,7 +8,6 @@
 import pytest
 
 import cudf
-from cudf import melt as cudf_melt
 from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
 from cudf.core.buffer.spill_manager import get_global_manager
 from cudf.testing._utils import (
@@ -71,15 +70,10 @@ def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype):
 
     gdf = cudf.from_pandas(pdf)
 
-    got = cudf_melt(frame=gdf, id_vars=id_vars, value_vars=value_vars)
+    got = cudf.melt(frame=gdf, id_vars=id_vars, value_vars=value_vars)
     got_from_melt_method = gdf.melt(id_vars=id_vars, value_vars=value_vars)
 
     expect = pd.melt(frame=pdf, id_vars=id_vars, value_vars=value_vars)
-    # pandas' melt makes the 'variable' column of 'object' type (string)
-    # cuDF's melt makes it Categorical because it doesn't support strings
-    expect["variable"] = expect["variable"].astype(
-        got["variable"].dtype.to_pandas()
-    )
 
     assert_eq(expect, got)
 
@@ -98,11 +92,28 @@ def test_melt_many_columns():
     grid_df_d = cudf.melt(
         df_d, id_vars=["id"], var_name="d", value_name="sales"
     )
-    grid_df_d["d"] = grid_df_d["d"].astype("str")
+    grid_df_d["d"] = grid_df_d["d"]
 
     assert_eq(grid_df, grid_df_d)
 
 
+def test_melt_str_scalar_id_var():
+    data = {"index": [1, 2], "id": [1, 2], "d0": [10, 20], "d1": [30, 40]}
+    result = cudf.melt(
+        cudf.DataFrame(data),
+        id_vars="index",  # bare string rather than a list of column names
+        var_name="column",
+        value_name="value",
+    )
+    expected = pd.melt(  # pandas output for the same call is the reference
+        pd.DataFrame(data),
+        id_vars="index",
+        var_name="column",
+        value_name="value",
+    )
+    assert_eq(result, expected)
+
+
 @pytest.mark.parametrize("num_cols", [1, 2, 10])
 @pytest.mark.parametrize("num_rows", [1, 2, 1000])
 @pytest.mark.parametrize(