Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make Column.to_pandas return Index instead of Series #15833

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -789,12 +789,11 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]:
def to_pandas(
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
arrow_type: bool = False,
) -> pd.Series:
) -> pd.Index:
if nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")
return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
elif arrow_type:
raise NotImplementedError(f"{arrow_type=} is not implemented.")

Expand Down Expand Up @@ -828,7 +827,7 @@ def to_pandas(
data = pd.Categorical.from_codes(
codes, categories=cats.to_pandas(), ordered=col.ordered
)
return pd.Series(data, index=index)
return pd.Index(data)

def to_arrow(self) -> pa.Array:
"""Convert to PyArrow Array."""
Expand Down
13 changes: 3 additions & 10 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,10 +190,9 @@ def __repr__(self):
def to_pandas(
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
arrow_type: bool = False,
) -> pd.Series:
) -> pd.Index:
"""Convert object to pandas type.

The default implementation falls back to PyArrow for the conversion.
Expand All @@ -208,15 +207,9 @@ def to_pandas(
raise NotImplementedError(f"{nullable=} is not implemented.")
pa_array = self.to_arrow()
if arrow_type:
return pd.Series(
pd.arrays.ArrowExtensionArray(pa_array), index=index
)
return pd.Index(pd.arrays.ArrowExtensionArray(pa_array))
else:
pd_series = pa_array.to_pandas()

if index is not None:
pd_series.index = index
return pd_series
return pd.Index(pa_array.to_pandas())

@property
def values_host(self) -> "np.ndarray":
Expand Down
20 changes: 4 additions & 16 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -840,27 +840,15 @@ def __init__(
def to_pandas(
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
arrow_type: bool = False,
) -> pd.Series:
if arrow_type and nullable:
raise ValueError(
f"{arrow_type=} and {nullable=} cannot both be set."
)
elif nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")
elif arrow_type:
return pd.Series(
pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index
)
) -> pd.Index:
if arrow_type or nullable:
return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
else:
series = self._local_time.to_pandas().dt.tz_localize(
return self._local_time.to_pandas().tz_localize(
self.dtype.tz, ambiguous="NaT", nonexistent="NaT"
)
if index is not None:
series.index = index
return series

def to_arrow(self):
return pa.compute.assume_timezone(
Expand Down
15 changes: 3 additions & 12 deletions python/cudf/cudf/core/column/interval.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
# Copyright (c) 2018-2024, NVIDIA CORPORATION.
from typing import Optional

import pandas as pd
import pyarrow as pa

Expand Down Expand Up @@ -109,28 +107,21 @@ def as_interval_column(self, dtype):
def to_pandas(
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
arrow_type: bool = False,
) -> pd.Series:
) -> pd.Index:
# Note: This does not handle null values in the interval column.
# However, this exact sequence (calling __from_arrow__ on the output of
# self.to_arrow) is currently the best known way to convert interval
# types into pandas (trying to convert the underlying numerical columns
# directly is problematic), so we're stuck with this for now.
if arrow_type and nullable:
raise ValueError(
f"{arrow_type=} and {nullable=} cannot both be set."
)
if nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")
return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
elif arrow_type:
raise NotImplementedError(f"{arrow_type=} is not implemented.")

pd_type = self.dtype.to_pandas()
return pd.Series(
pd_type.__from_arrow__(self.to_arrow()), index=index, dtype=pd_type
)
return pd.Index(pd_type.__from_arrow__(self.to_arrow()), dtype=pd_type)

def element_indexing(self, index: int):
result = super().element_indexing(index)
Expand Down
20 changes: 4 additions & 16 deletions python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,25 +292,13 @@ def _transform_leaves(self, func, *args, **kwargs) -> Self:
def to_pandas(
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
arrow_type: bool = False,
) -> pd.Series:
# Can't rely on Column.to_pandas implementation for lists.
# Need to perform `to_pylist` to preserve list types.
if arrow_type and nullable:
raise ValueError(
f"{arrow_type=} and {nullable=} cannot both be set."
)
if nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")
pa_array = self.to_arrow()
if arrow_type:
return pd.Series(
pd.arrays.ArrowExtensionArray(pa_array), index=index
)
) -> pd.Index:
if arrow_type or nullable:
return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
else:
return pd.Series(pa_array.tolist(), dtype="object", index=index)
return pd.Index(self.to_arrow().tolist(), dtype="object")


class ListMethods(ColumnMethods):
Expand Down
17 changes: 6 additions & 11 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,18 +674,13 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:
def to_pandas(
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
arrow_type: bool = False,
) -> pd.Series:
) -> pd.Index:
if arrow_type and nullable:
raise ValueError(
f"{arrow_type=} and {nullable=} cannot both be set."
)
return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
elif arrow_type:
return pd.Series(
pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index
)
return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
elif (
nullable
and (
Expand All @@ -697,11 +692,11 @@ def to_pandas(
):
arrow_array = self.to_arrow()
pandas_array = pandas_nullable_dtype.__from_arrow__(arrow_array) # type: ignore[attr-defined]
return pd.Series(pandas_array, copy=False, index=index)
return pd.Index(pandas_array, copy=False)
elif self.dtype.kind in set("iuf") and not self.has_nulls():
return pd.Series(self.values_host, copy=False, index=index)
return pd.Index(self.values_host, copy=False)
else:
return super().to_pandas(index=index, nullable=nullable)
return super().to_pandas(nullable=nullable, arrow_type=arrow_type)

def _reduction_result_dtype(self, reduction_op: str) -> Dtype:
col_dtype = self.dtype
Expand Down
17 changes: 4 additions & 13 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -5783,23 +5783,14 @@ def values(self) -> cupy.ndarray:
def to_pandas(
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
arrow_type: bool = False,
) -> pd.Series:
if arrow_type and nullable:
raise ValueError(
f"{arrow_type=} and {nullable=} cannot both be set."
)
if arrow_type:
return pd.Series(
pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index
)
elif nullable:
) -> pd.Index:
if nullable and not arrow_type:
pandas_array = pd.StringDtype().__from_arrow__(self.to_arrow())
return pd.Series(pandas_array, copy=False, index=index)
return pd.Index(pandas_array, copy=False)
else:
return super().to_pandas(index=index, nullable=nullable)
return super().to_pandas(nullable=nullable, arrow_type=arrow_type)

def can_cast_safely(self, to_dtype: Dtype) -> bool:
to_dtype = cudf.api.types.dtype(to_dtype)
Expand Down
19 changes: 4 additions & 15 deletions python/cudf/cudf/core/column/struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from __future__ import annotations

from functools import cached_property
from typing import Optional

import pandas as pd
import pyarrow as pa
Expand Down Expand Up @@ -60,25 +59,15 @@ def to_arrow(self):
def to_pandas(
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
arrow_type: bool = False,
) -> pd.Series:
) -> pd.Index:
# We cannot go via Arrow's `to_pandas` because of the following issue:
# https://issues.apache.org/jira/browse/ARROW-12680
if arrow_type and nullable:
raise ValueError(
f"{arrow_type=} and {nullable=} cannot both be set."
)
elif nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")
pa_array = self.to_arrow()
if arrow_type:
return pd.Series(
pd.arrays.ArrowExtensionArray(pa_array), index=index
)
if arrow_type or nullable:
return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
else:
return pd.Series(pa_array.tolist(), dtype="object", index=index)
return pd.Index(self.to_arrow().tolist(), dtype="object")

@cached_property
def memory_usage(self):
Expand Down
4 changes: 1 addition & 3 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5321,9 +5321,7 @@ def to_pandas(
"""
out_index = self.index.to_pandas()
out_data = {
i: col.to_pandas(
index=out_index, nullable=nullable, arrow_type=arrow_type
)
i: col.to_pandas(nullable=nullable, arrow_type=arrow_type)
for i, col in enumerate(self._data.columns)
}

Expand Down
45 changes: 8 additions & 37 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1562,10 +1562,11 @@ def any(self):
def to_pandas(
self, *, nullable: bool = False, arrow_type: bool = False
) -> pd.Index:
return pd.Index(
self._values.to_pandas(nullable=nullable, arrow_type=arrow_type),
name=self.name,
result = self._column.to_pandas(
nullable=nullable, arrow_type=arrow_type
)
result.name = self.name
return result

def append(self, other):
if is_list_like(other):
Expand Down Expand Up @@ -2185,23 +2186,10 @@ def isocalendar(self):
def to_pandas(
self, *, nullable: bool = False, arrow_type: bool = False
) -> pd.DatetimeIndex:
if arrow_type and nullable:
raise ValueError(
f"{arrow_type=} and {nullable=} cannot both be set."
)
elif nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")

result = self._values.to_pandas(arrow_type=arrow_type)
if arrow_type:
return pd.Index(result, name=self.name)
else:
freq = (
self._freq._maybe_as_fast_pandas_offset()
if self._freq is not None
else None
)
return pd.DatetimeIndex(result, name=self.name, freq=freq)
result = super().to_pandas(nullable=nullable, arrow_type=arrow_type)
if not arrow_type and self._freq is not None:
result.freq = self._freq._maybe_as_fast_pandas_offset()
return result

@_cudf_nvtx_annotate
def _get_dt_field(self, field):
Expand Down Expand Up @@ -2521,23 +2509,6 @@ def __getitem__(self, index):
return pd.Timedelta(value)
return value

@_cudf_nvtx_annotate
def to_pandas(
self, *, nullable: bool = False, arrow_type: bool = False
) -> pd.TimedeltaIndex:
if arrow_type and nullable:
raise ValueError(
f"{arrow_type=} and {nullable=} cannot both be set."
)
elif nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")

result = self._values.to_pandas(arrow_type=arrow_type)
if arrow_type:
return pd.Index(result, name=self.name)
else:
return pd.TimedeltaIndex(result, name=self.name)

@property # type: ignore
@_cudf_nvtx_annotate
def days(self):
Expand Down
8 changes: 4 additions & 4 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2022,11 +2022,11 @@ def to_pandas(
index = self.index.to_pandas()
else:
index = None # type: ignore[assignment]
s = self._column.to_pandas(
index=index, nullable=nullable, arrow_type=arrow_type
return pd.Series(
self._column.to_pandas(nullable=nullable, arrow_type=arrow_type),
index=index,
name=self.name,
)
s.name = self.name
return s

@property # type: ignore
@_cudf_nvtx_annotate
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/tests/test_cuda_array_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,12 +175,12 @@ def test_column_from_ephemeral_cupy_try_lose_reference():
a = cudf.Series(cupy.asarray([1, 2, 3]))._column
a = cudf.core.column.as_column(a)
b = cupy.asarray([1, 1, 1]) # noqa: F841
assert_eq(pd.Series([1, 2, 3]), a.to_pandas())
assert_eq(pd.Index([1, 2, 3]), a.to_pandas())

a = cudf.Series(cupy.asarray([1, 2, 3]))._column
a.name = "b"
b = cupy.asarray([1, 1, 1]) # noqa: F841
assert_eq(pd.Series([1, 2, 3]), a.to_pandas())
assert_eq(pd.Index([1, 2, 3]), a.to_pandas())


@pytest.mark.xfail(
Expand Down
Loading