From d3dbce403b4220ab90fc10330eaa45eb5770ea23 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 27 Feb 2024 17:30:28 +0000 Subject: [PATCH 1/2] Fix List.to_pandas() --- python/cudf/cudf/core/column/lists.py | 18 ++++++++++++++++++ python/cudf/cudf/tests/test_list.py | 4 +++- .../dask_cudf/dask_cudf/tests/test_groupby.py | 6 +----- 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index b2205af34e8..e9548dd1dd8 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -6,6 +6,7 @@ from typing import List, Optional, Sequence, Tuple, Union import numpy as np +import pandas as pd import pyarrow as pa from typing_extensions import Self @@ -288,6 +289,23 @@ def _transform_leaves(self, func, *args, **kwargs) -> Self: ) return lc + def to_pandas( + self, + *, + index: Optional[pd.Index] = None, + nullable: bool = False, + ) -> pd.Series: + # Can't rely on Column.to_pandas implementation for lists. + # Need to perform `to_pylist` to preserve list types. + if nullable: + raise NotImplementedError(f"{nullable=} is not implemented.") + + pd_series = pd.Series(self.to_arrow().to_pylist()) + + if index is not None: + pd_series.index = index + return pd_series + class ListMethods(ColumnMethods): """ diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 7ae7ae34b97..f04cb8a91a4 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import functools import operator @@ -41,6 +41,8 @@ def test_create_list_series(data): expect = pd.Series(data) got = cudf.Series(data) assert_eq(expect, got) + assert isinstance(got[0], type(expect[0])) + assert isinstance(got.to_pandas()[0], type(expect[0])) @pytest.mark.parametrize( diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index c8cc6e65fa5..30251b88dea 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -702,13 +702,9 @@ def test_is_supported(arg, supported): def test_groupby_unique_lists(): df = pd.DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": [10, 10, 10, 7, 8, 9]}) - ddf = dd.from_pandas(df, 2) gdf = cudf.from_pandas(df) gddf = dask_cudf.from_cudf(gdf, 2) - dd.assert_eq( - ddf.groupby("a").b.unique().compute(), - gddf.groupby("a").b.unique().compute(), - ) + dd.assert_eq( gdf.groupby("a").b.unique(), gddf.groupby("a").b.unique().compute(), From 82817ff0b5c32eed63c895ebd0ca35f49d3dea08 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 27 Feb 2024 18:03:57 -0600 Subject: [PATCH 2/2] Apply suggestions from code review Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- python/cudf/cudf/core/column/lists.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index e9548dd1dd8..d1bf0b74d3c 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -300,7 +300,7 @@ def to_pandas( if nullable: raise NotImplementedError(f"{nullable=} is not implemented.") - pd_series = pd.Series(self.to_arrow().to_pylist()) + pd_series = pd.Series(self.to_arrow().to_pylist(), dtype="object") if index is not None: pd_series.index = index