From 637334b9fbac1cc58887190d6a3d4cd0d07ce05e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 15 May 2024 18:50:57 -0700 Subject: [PATCH] Fix id_vars and value_vars not accepting string scalars --- python/cudf/cudf/core/reshape.py | 27 +++++++++++++------------- python/cudf/cudf/tests/test_reshape.py | 27 ++++++++++++++++++-------- 2 files changed, 32 insertions(+), 22 deletions(-) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 26d91bed173..0b44ab58f30 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -2,10 +2,8 @@ import itertools import warnings -from collections import abc from typing import Dict, Optional -import cupy import numpy as np import pandas as pd @@ -590,7 +588,7 @@ def melt( # id_vars if id_vars is not None: - if not isinstance(id_vars, abc.Sequence): + if cudf.api.types.is_scalar(id_vars): id_vars = [id_vars] id_vars = list(id_vars) missing = set(id_vars) - set(frame._column_names) @@ -604,7 +602,7 @@ def melt( # value_vars if value_vars is not None: - if not isinstance(value_vars, abc.Sequence): + if cudf.api.types.is_scalar(value_vars): value_vars = [value_vars] value_vars = list(value_vars) missing = set(value_vars) - set(frame._column_names) @@ -658,21 +656,22 @@ def _tile(A, reps): # Step 2: add variable nval = len(value_vars) dtype = min_unsigned_type(nval) - temp = cudf.Series(cupy.repeat(cupy.arange(nval, dtype=dtype), N)) if not var_name: var_name = "variable" - mdata[var_name] = cudf.Series( - cudf.core.column.build_categorical_column( - categories=value_vars, - codes=temp._column, - mask=temp._column.base_mask, - size=temp._column.size, - offset=temp._column.offset, - ordered=False, + if not value_vars: + # TODO: Use frame._data.label_dtype when it's more consistently set + var_data = cudf.Series( + value_vars, dtype=frame._data.to_pandas_index().dtype ) - ) + else: + var_data = ( + cudf.Series(value_vars) + .take(np.repeat(np.arange(nval, dtype=dtype), N)) + .reset_index(drop=True) + ) + mdata[var_name] = var_data # Step 3: add values mdata[value_name] = cudf.Series._concat( diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index d618669755d..daa1e70808f 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -8,7 +8,6 @@ import pytest import cudf -from cudf import melt as cudf_melt from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.core.buffer.spill_manager import get_global_manager from cudf.testing._utils import ( @@ -71,15 +70,10 @@ def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype): gdf = cudf.from_pandas(pdf) - got = cudf_melt(frame=gdf, id_vars=id_vars, value_vars=value_vars) + got = cudf.melt(frame=gdf, id_vars=id_vars, value_vars=value_vars) got_from_melt_method = gdf.melt(id_vars=id_vars, value_vars=value_vars) expect = pd.melt(frame=pdf, id_vars=id_vars, value_vars=value_vars) - # pandas' melt makes the 'variable' column of 'object' type (string) - # cuDF's melt makes it Categorical because it doesn't support strings - expect["variable"] = expect["variable"].astype( - got["variable"].dtype.to_pandas() - ) assert_eq(expect, got) @@ -98,11 +92,28 @@ def test_melt_many_columns(): grid_df_d = cudf.melt( df_d, id_vars=["id"], var_name="d", value_name="sales" ) - grid_df_d["d"] = grid_df_d["d"].astype("str") + grid_df_d["d"] = grid_df_d["d"] assert_eq(grid_df, grid_df_d) +def test_melt_str_scalar_id_var(): + data = {"index": [1, 2], "id": [1, 2], "d0": [10, 20], "d1": [30, 40]} + result = cudf.melt( + cudf.DataFrame(data), + id_vars="index", + var_name="column", + value_name="value", + ) + expected = pd.melt( + pd.DataFrame(data), + id_vars="index", + var_name="column", + value_name="value", + ) + assert_eq(result, expected) + + @pytest.mark.parametrize("num_cols", [1, 2, 10]) @pytest.mark.parametrize("num_rows", [1, 2, 1000]) @pytest.mark.parametrize(