REF: Remove **kwargs from to_pandas, raise if nullable is not implemented #14438

Merged: 14 commits, Nov 27, 2023
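
In short: after this change `to_pandas` takes `nullable` (and `index`, where applicable) as a keyword-only argument, and column types with no nullable pandas equivalent raise `NotImplementedError` instead of silently ignoring the request. A minimal sketch of the new behavior (the values are illustrative, not taken from the PR):

```python
import cudf

# String columns do support a nullable conversion: pandas StringDtype.
s = cudf.Series(["a", "b", None])
print(s.to_pandas(nullable=True).dtype)  # string

# Categorical columns have no nullable pandas dtype; the request now raises
# instead of being silently dropped.
c = cudf.Series(["x", "y"], dtype="category")
try:
    c.to_pandas(nullable=True)
except NotImplementedError as err:
    print(err)  # nullable=True is not implemented.

# nullable is keyword-only now, so s.to_pandas(True) is a TypeError.
```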
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/_base_index.py
@@ -850,7 +850,7 @@ def notna(self):
"""
raise NotImplementedError

def to_pandas(self, nullable=False):
def to_pandas(self, *, nullable: bool = False):
"""
Convert to a Pandas Index.

5 changes: 4 additions & 1 deletion python/cudf/cudf/core/column/categorical.py
@@ -974,8 +974,11 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]:
)

def to_pandas(
self, index: Optional[pd.Index] = None, **kwargs
self, *, index: Optional[pd.Index] = None, nullable: bool = False
) -> pd.Series:
if nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")

if self.categories.dtype.kind == "f":
new_mask = bools_to_mask(self.notnull())
col = column.build_categorical_column(
13 changes: 8 additions & 5 deletions python/cudf/cudf/core/column/column.py
@@ -203,17 +203,20 @@ def __repr__(self):
)

def to_pandas(
self, index: Optional[pd.Index] = None, **kwargs
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
) -> pd.Series:
"""Convert object to pandas type.

The default implementation falls back to PyArrow for the conversion.
"""
# This default implementation does not handle nulls in any meaningful
# way, but must consume the parameter to avoid passing it to PyArrow
# (which does not recognize it).
kwargs.pop("nullable", None)
pd_series = self.to_arrow().to_pandas(**kwargs)
# way
if nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")
pd_series = self.to_arrow().to_pandas()

if index is not None:
pd_series.index = index
17 changes: 12 additions & 5 deletions python/cudf/cudf/core/column/datetime.py
@@ -308,10 +308,12 @@ def day_of_year(self) -> ColumnBase:

def to_pandas(
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
**kwargs,
) -> "cudf.Series":
) -> pd.Series:
if nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")
# `copy=True` workaround until following issue is fixed:
# https://issues.apache.org/jira/browse/ARROW-9772

@@ -686,13 +688,18 @@ def __init__(

def to_pandas(
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
**kwargs,
) -> "cudf.Series":
return self._local_time.to_pandas().dt.tz_localize(
) -> pd.Series:
if nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")
series = self._local_time.to_pandas().dt.tz_localize(
self.dtype.tz, ambiguous="NaT", nonexistent="NaT"
)
if index is not None:
series.index = index
return series

def to_arrow(self):
return pa.compute.assume_timezone(
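
The tz-aware path previously returned the pandas Series without attaching the requested index; the two added lines fix that. A small sketch of the corrected behavior, mirroring the test added in test_datetimelike.py below (illustrative values):

```python
import cudf

dti = cudf.DatetimeIndex(["2008-05-12", "2008-12-12"]).tz_localize("UTC")
ser = cudf.Series(dti, index=["a", "b"])

# Previously the "a"/"b" labels were dropped for tz-aware data; now the
# returned pandas Series carries them.
print(ser.to_pandas(index=True).index.tolist())  # ['a', 'b']
```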
4 changes: 3 additions & 1 deletion python/cudf/cudf/core/column/interval.py
@@ -126,13 +126,15 @@ def as_interval_column(self, dtype, **kwargs):
raise ValueError("dtype must be IntervalDtype")

def to_pandas(
self, index: Optional[pd.Index] = None, **kwargs
self, *, index: Optional[pd.Index] = None, nullable: bool = False
) -> pd.Series:
# Note: This does not handle null values in the interval column.
# However, this exact sequence (calling __from_arrow__ on the output of
# self.to_arrow) is currently the best known way to convert interval
# types into pandas (trying to convert the underlying numerical columns
# directly is problematic), so we're stuck with this for now.
if nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")
return pd.Series(
self.dtype.to_pandas().__from_arrow__(self.to_arrow()), index=index
)
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/column/numerical.py
@@ -679,9 +679,9 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:

def to_pandas(
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
**kwargs,
) -> pd.Series:
if nullable and self.dtype in np_dtypes_to_pandas_dtypes:
pandas_nullable_dtype = np_dtypes_to_pandas_dtypes[self.dtype]
@@ -691,7 +691,7 @@ def to_pandas(
elif str(self.dtype) in NUMERIC_TYPES and not self.has_nulls():
pd_series = pd.Series(self.values_host, copy=False)
else:
pd_series = self.to_arrow().to_pandas(**kwargs)
pd_series = self.to_arrow().to_pandas()

if index is not None:
pd_series.index = index
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/column/string.py
@@ -5756,15 +5756,15 @@ def values(self) -> cupy.ndarray:

def to_pandas(
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
**kwargs,
) -> pd.Series:
if nullable:
pandas_array = pd.StringDtype().__from_arrow__(self.to_arrow())
pd_series = pd.Series(pandas_array, copy=False)
else:
pd_series = self.to_arrow().to_pandas(**kwargs)
pd_series = self.to_arrow().to_pandas()

if index is not None:
pd_series.index = index
10 changes: 4 additions & 6 deletions python/cudf/cudf/core/column/struct.py
@@ -59,16 +59,14 @@ def to_arrow(self):
)

def to_pandas(
self, index: Optional[pd.Index] = None, **kwargs
self, *, index: Optional[pd.Index] = None, nullable: bool = False
) -> pd.Series:
# We cannot go via Arrow's `to_pandas` because of the following issue:
# https://issues.apache.org/jira/browse/ARROW-12680
if nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")

pd_series = pd.Series(self.to_arrow().tolist(), dtype="object")

if index is not None:
pd_series.index = index
return pd_series
return pd.Series(self.to_arrow().tolist(), dtype="object", index=index)

@cached_property
def memory_usage(self):
5 changes: 4 additions & 1 deletion python/cudf/cudf/core/column/timedelta.py
@@ -144,14 +144,17 @@ def to_arrow(self) -> pa.Array:
)

def to_pandas(
self, index=None, nullable: bool = False, **kwargs
self, *, index: Optional[pd.Index] = None, nullable: bool = False
) -> pd.Series:
# `copy=True` workaround until following issue is fixed:
# https://issues.apache.org/jira/browse/ARROW-9772

# Pandas only supports `timedelta64[ns]` dtype
# and conversion to this type is necessary to make
# arrow to pandas conversion happen for large values.
if nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")

return pd.Series(
self.astype("timedelta64[ns]").to_arrow(),
copy=True,
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/dataframe.py
@@ -5128,7 +5128,7 @@ def describe(
)

@_cudf_nvtx_annotate
def to_pandas(self, nullable=False, **kwargs):
def to_pandas(self, *, nullable: bool = False) -> pd.DataFrame:
"""
Convert to a Pandas DataFrame.

17 changes: 11 additions & 6 deletions python/cudf/cudf/core/index.py
@@ -501,7 +501,9 @@ def dtype(self):
return _maybe_convert_to_default_type(dtype)

@_cudf_nvtx_annotate
def to_pandas(self, nullable=False):
def to_pandas(self, *, nullable: bool = False) -> pd.RangeIndex:
if nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")
return pd.RangeIndex(
start=self._start,
stop=self._stop,
@@ -1567,7 +1569,7 @@ def _clean_nulls_from_index(self):
def any(self):
return self._values.any()

def to_pandas(self, nullable=False):
def to_pandas(self, *, nullable: bool = False) -> pd.Index:
return pd.Index(
self._values.to_pandas(nullable=nullable), name=self.name
)
@@ -2508,7 +2510,9 @@ def isocalendar(self):
return cudf.core.tools.datetimes._to_iso_calendar(self)

@_cudf_nvtx_annotate
def to_pandas(self, nullable=False):
def to_pandas(self, *, nullable: bool = False) -> pd.DatetimeIndex:
if nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")
# TODO: no need to convert to nanos with Pandas 2.x
if isinstance(self.dtype, pd.DatetimeTZDtype):
nanos = self._values.astype(
@@ -2832,11 +2836,12 @@ def __getitem__(self, index):
return value

@_cudf_nvtx_annotate
def to_pandas(self, nullable=False):
def to_pandas(self, *, nullable: bool = False) -> pd.TimedeltaIndex:
if nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")
return pd.TimedeltaIndex(
self._values.to_pandas(),
name=self.name,
unit=self._values.time_unit,
)

@property # type: ignore
@@ -3300,7 +3305,7 @@ def __init__(self, values, copy=False, **kwargs):
super().__init__(values, **kwargs)

@_cudf_nvtx_annotate
def to_pandas(self, nullable=False):
def to_pandas(self, *, nullable: bool = False) -> pd.Index:
return pd.Index(
self.to_numpy(na_value=None),
name=self.name,
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/multiindex.py
@@ -1592,7 +1592,7 @@ def droplevel(self, level=-1):
return mi

@_cudf_nvtx_annotate
def to_pandas(self, nullable=False, **kwargs):
def to_pandas(self, *, nullable: bool = False) -> pd.MultiIndex:
result = self.to_frame(
index=False, name=list(range(self.nlevels))
).to_pandas(nullable=nullable)
6 changes: 5 additions & 1 deletion python/cudf/cudf/core/series.py
@@ -2064,7 +2064,9 @@ def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs):
return super().any(axis, skipna, level, **kwargs)

@_cudf_nvtx_annotate
def to_pandas(self, index=True, nullable=False, **kwargs):
def to_pandas(
self, *, index: bool = True, nullable: bool = False
) -> pd.Series:
"""
Convert to a Pandas Series.

@@ -2126,6 +2128,8 @@ def to_pandas(self, index=True, nullable=False, **kwargs):
"""
if index is True:
index = self.index.to_pandas()
else:
index = None # type: ignore[assignment]
s = self._column.to_pandas(index=index, nullable=nullable)
s.name = self.name
return s
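
With the new `else` branch, `index=False` passes `index=None` down to the column-level `to_pandas`, so the result keeps a default `RangeIndex` rather than the cuDF index. A quick sketch (illustrative values):

```python
import cudf

ser = cudf.Series([10, 20, 30], index=["x", "y", "z"])
print(ser.to_pandas(index=True).index.tolist())  # ['x', 'y', 'z']
print(ser.to_pandas(index=False).index)          # RangeIndex(start=0, stop=3, step=1)
```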
16 changes: 15 additions & 1 deletion python/cudf/cudf/io/json.py
@@ -179,6 +179,13 @@ def read_json(
return df


def maybe_return_nullable_pd_obj(cudf_obj):
try:
return cudf_obj.to_pandas(nullable=True)
except NotImplementedError:
return cudf_obj.to_pandas(nullable=False)


@ioutils.doc_to_json()
def to_json(
cudf_val,
@@ -228,7 +235,14 @@ def to_json(
return path_or_buf.read()
elif engine == "pandas":
warnings.warn("Using CPU via Pandas to write JSON dataset")
pd_value = cudf_val.to_pandas(nullable=True)
if isinstance(cudf_val, cudf.DataFrame):
pd_data = {
col: maybe_return_nullable_pd_obj(series)
for col, series in cudf_val.items()
}
pd_value = pd.DataFrame(pd_data)
else:
pd_value = maybe_return_nullable_pd_obj(cudf_val)
Comment on lines +238 to +245
Contributor: This is just handling that nullable=True was previously silently ignored for to_pandas calls that didn't support it, and now it is loud?

Contributor (author): Correct!

return pd.io.json.to_json(
path_or_buf,
pd_value,
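
The new `maybe_return_nullable_pd_obj` helper tries the nullable conversion and falls back to the non-nullable path for column types that cannot support it. A sketch of the same per-column pattern in isolation (assuming the helper remains importable from `cudf.io.json`; it is an internal detail, and the frame below is illustrative):

```python
import cudf
from cudf.io.json import maybe_return_nullable_pd_obj

df = cudf.DataFrame(
    {
        "a": [1, None, 3],  # converts to a nullable extension dtype (e.g. Int64)
        "b": cudf.Series(["x", "y", "z"], dtype="category"),  # falls back, stays category
    }
)
pd_data = {col: maybe_return_nullable_pd_obj(series) for col, series in df.items()}
print({col: series.dtype for col, series in pd_data.items()})
```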
8 changes: 6 additions & 2 deletions python/cudf/cudf/testing/testing.py
@@ -271,8 +271,12 @@ def assert_column_equal(
left = left.astype(left.categories.dtype)
right = right.astype(right.categories.dtype)
if not columns_equal:
ldata = str([val for val in left.to_pandas(nullable=True)])
rdata = str([val for val in right.to_pandas(nullable=True)])
try:
ldata = str([val for val in left.to_pandas(nullable=True)])
rdata = str([val for val in right.to_pandas(nullable=True)])
except NotImplementedError:
ldata = str([val for val in left.to_pandas(nullable=False)])
rdata = str([val for val in right.to_pandas(nullable=False)])
try:
diff = 0
for i in range(left.size):
12 changes: 10 additions & 2 deletions python/cudf/cudf/tests/dataframe/test_conversion.py
@@ -1,5 +1,6 @@
# Copyright (c) 2023, NVIDIA CORPORATION.
import pandas as pd
import pytest

import cudf
from cudf.testing._utils import assert_eq
@@ -26,13 +27,20 @@ def test_convert_dtypes():
"category",
"datetime64[ns]",
]
nullable_columns = list("abcdef")
non_nullable_columns = list(set(data.keys()).difference(nullable_columns))

df = pd.DataFrame(
{
k: pd.Series(v, dtype=d)
for k, v, d in zip(data.keys(), data.values(), dtypes)
}
)
gdf = cudf.DataFrame.from_pandas(df)
expect = df.convert_dtypes()
got = gdf.convert_dtypes().to_pandas(nullable=True)
expect = df[nullable_columns].convert_dtypes()
got = gdf[nullable_columns].convert_dtypes().to_pandas(nullable=True)
assert_eq(expect, got)

with pytest.raises(NotImplementedError):
# category and datetime64[ns] are not nullable
gdf[non_nullable_columns].convert_dtypes().to_pandas(nullable=True)
3 changes: 2 additions & 1 deletion python/cudf/cudf/tests/series/test_conversion.py
@@ -26,7 +26,8 @@ def test_convert_dtypes(data, dtype):

# because we don't have distinct nullable types, we check that we
# get the same result if we convert to nullable pandas types:
got = gs.convert_dtypes().to_pandas(nullable=True)
nullable = dtype not in ("category", "datetime64[ns]")
got = gs.convert_dtypes().to_pandas(nullable=nullable)
assert_eq(expect, got)


13 changes: 13 additions & 0 deletions python/cudf/cudf/tests/series/test_datetimelike.py
@@ -180,6 +180,19 @@ def test_convert_edge_cases(data, original_timezone, target_timezone):
assert_eq(expect, got)


def test_to_pandas_index_true_timezone():
data = [
"2008-05-12",
"2008-12-12",
"2009-05-12",
]
dti = cudf.DatetimeIndex(data).tz_localize("UTC")
ser = cudf.Series(dti, index=list("abc"))
result = ser.to_pandas(index=True)
expected = pd.Series(pd.to_datetime(data, utc=True), index=list("abc"))
assert_eq(result, expected)


def test_tz_aware_attributes_local():
data = [
"2008-05-12 13:50:00",
4 changes: 2 additions & 2 deletions python/cudf/cudf/tests/test_categorical.py
@@ -227,7 +227,7 @@ def test_df_cat_set_index():
df["b"] = np.arange(len(df))
got = df.set_index("a")

pddf = df.to_pandas(nullable_pd_dtype=False)
pddf = df.to_pandas()
expect = pddf.set_index("a")

assert_eq(got, expect)
@@ -239,7 +239,7 @@ def test_df_cat_sort_index():
df["b"] = np.arange(len(df))

got = df.set_index("a").sort_index()
expect = df.to_pandas(nullable_pd_dtype=False).set_index("a").sort_index()
expect = df.to_pandas().set_index("a").sort_index()

assert_eq(got, expect)
