diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index a0a8279b213..d8eb6134042 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -163,7 +163,7 @@ def one_hot_encode(Column input_column, Column categories): move(c_result.second), owner=owner, column_names=[ - x if x is not None else 'null' for x in pylist_categories + x if x is not None else '' for x in pylist_categories ] ) return encodings diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py index 6d5bfde7740..baaca0b806f 100644 --- a/python/cudf/cudf/tests/test_onehot.py +++ b/python/cudf/cudf/tests/test_onehot.py @@ -7,7 +7,6 @@ import pytest import cudf -from cudf import DataFrame from cudf.testing._utils import assert_eq pytestmark = pytest.mark.spilling @@ -24,7 +23,7 @@ ], ) def test_get_dummies(data, index): - gdf = DataFrame({"x": data}, index=index) + gdf = cudf.DataFrame({"x": data}, index=index) pdf = pd.DataFrame({"x": data}, index=index) encoded_expected = pd.get_dummies(pdf, prefix="test") @@ -66,16 +65,16 @@ def test_onehot_get_dummies_multicol(n_cols): @pytest.mark.parametrize("dummy_na", [True, False]) def test_onehost_get_dummies_dummy_na(nan_as_null, dummy_na): pdf = pd.DataFrame({"a": [0, 1, np.nan]}) - df = DataFrame.from_pandas(pdf, nan_as_null=nan_as_null) + df = cudf.DataFrame.from_pandas(pdf, nan_as_null=nan_as_null) expected = pd.get_dummies(pdf, dummy_na=dummy_na, columns=["a"]) with pytest.warns(FutureWarning): - got = cudf.get_dummies(df, dummy_na=dummy_na, columns=["a"]) + actual = cudf.get_dummies(df, dummy_na=dummy_na, columns=["a"]) if dummy_na and nan_as_null: - got = got.rename(columns={"a_null": "a_nan"})[expected.columns] + actual = actual.rename(columns={"a_": "a_nan"})[expected.columns] - assert_eq(expected, got) + assert_eq(expected, actual) @pytest.mark.parametrize( @@ -104,7 +103,7 @@ def test_get_dummies_prefix_sep(prefix, prefix_sep): "third": ["ji", "ji", "ji"], } - gdf = DataFrame(data) + gdf = cudf.DataFrame(data) pdf = pd.DataFrame(data) encoded_expected = pd.get_dummies( @@ -122,15 +121,11 @@ def test_get_dummies_with_nan(): df = cudf.DataFrame( {"a": cudf.Series([1, 2, np.nan, None], nan_as_null=False)} ) - expected = cudf.DataFrame( - { - "a_1.0": [1, 0, 0, 0], - "a_2.0": [0, 1, 0, 0], - "a_nan": [0, 0, 1, 0], - "a_null": [0, 0, 0, 1], - }, - dtype="uint8", + + expected = pd.get_dummies( + df.to_pandas(nullable=True), dummy_na=True, columns=["a"] ) + with pytest.warns(FutureWarning): actual = cudf.get_dummies(df, dummy_na=True, columns=["a"]) @@ -166,16 +161,11 @@ def test_get_dummies_array_like(data, prefix_sep, prefix, dtype): def test_get_dummies_array_like_with_nan(): ser = cudf.Series([0.1, 2, 3, None, np.nan], nan_as_null=False) - expected = cudf.DataFrame( - { - "a_0.1": [1, 0, 0, 0, 0], - "a_2.0": [0, 1, 0, 0, 0], - "a_3.0": [0, 0, 1, 0, 0], - "a_nan": [0, 0, 0, 0, 1], - "a_null": [0, 0, 0, 1, 0], - }, - dtype="uint8", + + expected = pd.get_dummies( + ser.to_pandas(nullable=True), dummy_na=True, prefix="a", prefix_sep="_" ) + with pytest.warns(FutureWarning): actual = cudf.get_dummies( ser, dummy_na=True, prefix="a", prefix_sep="_"