From 17ef8cb6a3a88ce1c2383409d2233454f24d951c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 16 Nov 2023 16:07:11 -0800 Subject: [PATCH 1/9] REF: Remove **kwargs from to_pandas, raise if nullable is not implemented --- python/cudf/cudf/core/_base_index.py | 2 +- python/cudf/cudf/core/column/categorical.py | 5 +++- python/cudf/cudf/core/column/column.py | 12 ++++++---- python/cudf/cudf/core/column/datetime.py | 12 ++++++---- python/cudf/cudf/core/column/interval.py | 4 +++- python/cudf/cudf/core/column/numerical.py | 3 +-- python/cudf/cudf/core/column/string.py | 3 +-- python/cudf/cudf/core/column/struct.py | 10 ++++---- python/cudf/cudf/core/column/timedelta.py | 7 +++--- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/index.py | 17 +++++++++----- python/cudf/cudf/core/multiindex.py | 6 ++--- python/cudf/cudf/core/series.py | 4 +++- python/cudf/cudf/tests/test_index.py | 26 +++++++++++++++++---- 14 files changed, 71 insertions(+), 42 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 3616ec1b542..943f363feca 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -850,7 +850,7 @@ def notna(self): """ raise NotImplementedError - def to_pandas(self, nullable=False): + def to_pandas(self, nullable: bool = False): """ Convert to a Pandas Index. diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index bab07624dfa..6ab079412c0 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -974,8 +974,11 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]: ) def to_pandas( - self, index: Optional[pd.Index] = None, **kwargs + self, index: Optional[pd.Index] = None, nullable: bool = False ) -> pd.Series: + if nullable: + raise NotImplementedError(f"{nullable=} is not implemented.") + if self.categories.dtype.kind == "f": new_mask = bools_to_mask(self.notnull()) col = column.build_categorical_column( diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index b4f65693d85..3fc0484211e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -203,17 +203,19 @@ def __repr__(self): ) def to_pandas( - self, index: Optional[pd.Index] = None, **kwargs + self, + index: Optional[pd.Index] = None, + nullable: bool = False, ) -> pd.Series: """Convert object to pandas type. The default implementation falls back to PyArrow for the conversion. """ # This default implementation does not handle nulls in any meaningful - # way, but must consume the parameter to avoid passing it to PyArrow - # (which does not recognize it). - kwargs.pop("nullable", None) - pd_series = self.to_arrow().to_pandas(**kwargs) + # way + if nullable: + raise NotImplementedError(f"{nullable=} is not implemented.") + pd_series = self.to_arrow().to_pandas() if index is not None: pd_series.index = index diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index b03b3c905a4..c473b845861 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -306,8 +306,9 @@ def to_pandas( self, index: Optional[pd.Index] = None, nullable: bool = False, - **kwargs, - ) -> "cudf.Series": + ) -> pd.Series: + if nullable: + raise NotImplementedError(f"{nullable=} is not implemented.") # `copy=True` workaround until following issue is fixed: # https://issues.apache.org/jira/browse/ARROW-9772 @@ -684,8 +685,11 @@ def to_pandas( self, index: Optional[pd.Index] = None, nullable: bool = False, - **kwargs, - ) -> "cudf.Series": + ) -> pd.Series: + if index is not None: + raise NotImplementedError(f"{index=} is not implemented.") + if nullable: + raise NotImplementedError(f"{nullable=} is not implemented.") return self._local_time.to_pandas().dt.tz_localize( self.dtype.tz, ambiguous="NaT", nonexistent="NaT" ) diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index d4855def832..afae4c8528e 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -126,13 +126,15 @@ def as_interval_column(self, dtype, **kwargs): raise ValueError("dtype must be IntervalDtype") def to_pandas( - self, index: Optional[pd.Index] = None, **kwargs + self, index: Optional[pd.Index] = None, nullable: bool = False ) -> pd.Series: # Note: This does not handle null values in the interval column. # However, this exact sequence (calling __from_arrow__ on the output of # self.to_arrow) is currently the best known way to convert interval # types into pandas (trying to convert the underlying numerical columns # directly is problematic), so we're stuck with this for now. + if nullable: + raise NotImplementedError(f"{nullable=} is not implemented.") return pd.Series( self.dtype.to_pandas().__from_arrow__(self.to_arrow()), index=index ) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 4ea49c8c7c0..151b5d5de8d 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -681,7 +681,6 @@ def to_pandas( self, index: Optional[pd.Index] = None, nullable: bool = False, - **kwargs, ) -> pd.Series: if nullable and self.dtype in np_dtypes_to_pandas_dtypes: pandas_nullable_dtype = np_dtypes_to_pandas_dtypes[self.dtype] @@ -691,7 +690,7 @@ def to_pandas( elif str(self.dtype) in NUMERIC_TYPES and not self.has_nulls(): pd_series = pd.Series(self.values_host, copy=False) else: - pd_series = self.to_arrow().to_pandas(**kwargs) + pd_series = self.to_arrow().to_pandas() if index is not None: pd_series.index = index diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index eb86f555432..5820dec72b1 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5758,13 +5758,12 @@ def to_pandas( self, index: Optional[pd.Index] = None, nullable: bool = False, - **kwargs, ) -> pd.Series: if nullable: pandas_array = pd.StringDtype().__from_arrow__(self.to_arrow()) pd_series = pd.Series(pandas_array, copy=False) else: - pd_series = self.to_arrow().to_pandas(**kwargs) + pd_series = self.to_arrow().to_pandas() if index is not None: pd_series.index = index diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 0bb21f4c25a..3083d1ac729 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -59,16 +59,14 @@ def to_arrow(self): ) def to_pandas( - self, index: Optional[pd.Index] = None, **kwargs + self, index: Optional[pd.Index] = None, nullable: bool = False ) -> pd.Series: # We cannot go via Arrow's `to_pandas` because of the following issue: # https://issues.apache.org/jira/browse/ARROW-12680 + if nullable: + raise NotImplementedError(f"{nullable=} is not implemented.") - pd_series = pd.Series(self.to_arrow().tolist(), dtype="object") - - if index is not None: - pd_series.index = index - return pd_series + return pd.Series(self.to_arrow().tolist(), dtype="object", index=index) @cached_property def memory_usage(self): diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 13bb97b9a89..1e8ab43afaf 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -143,15 +143,16 @@ def to_arrow(self) -> pa.Array: null_count=self.null_count, ) - def to_pandas( - self, index=None, nullable: bool = False, **kwargs - ) -> pd.Series: + def to_pandas(self, index=None, nullable: bool = False) -> pd.Series: # `copy=True` workaround until following issue is fixed: # https://issues.apache.org/jira/browse/ARROW-9772 # Pandas only supports `timedelta64[ns]` dtype # and conversion to this type is necessary to make # arrow to pandas conversion happen for large values. + if nullable: + raise NotImplementedError(f"{nullable=} is not implemented.") + return pd.Series( self.astype("timedelta64[ns]").to_arrow(), copy=True, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 16eead6ea81..21eb5151088 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5111,7 +5111,7 @@ def describe( ) @_cudf_nvtx_annotate - def to_pandas(self, nullable=False, **kwargs): + def to_pandas(self, nullable: bool = False) -> pd.DataFrame: """ Convert to a Pandas DataFrame. diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 9f0c66a5c74..57812335d35 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -503,7 +503,9 @@ def dtype(self): return _maybe_convert_to_default_type(dtype) @_cudf_nvtx_annotate - def to_pandas(self, nullable=False): + def to_pandas(self, nullable: bool = False) -> pd.RangeIndex: + if nullable: + raise NotImplementedError(f"{nullable=} is not implemented.") return pd.RangeIndex( start=self._start, stop=self._stop, @@ -1569,7 +1571,7 @@ def _clean_nulls_from_index(self): def any(self): return self._values.any() - def to_pandas(self, nullable=False): + def to_pandas(self, nullable: bool = False) -> pd.Index: return pd.Index( self._values.to_pandas(nullable=nullable), name=self.name ) @@ -2510,7 +2512,9 @@ def isocalendar(self): return cudf.core.tools.datetimes._to_iso_calendar(self) @_cudf_nvtx_annotate - def to_pandas(self, nullable=False): + def to_pandas(self, nullable: bool = False) -> pd.DatetimeIndex: + if nullable: + raise NotImplementedError(f"{nullable=} is not implemented.") # TODO: no need to convert to nanos with Pandas 2.x if isinstance(self.dtype, pd.DatetimeTZDtype): nanos = self._values.astype( @@ -2834,11 +2838,12 @@ def __getitem__(self, index): return value @_cudf_nvtx_annotate - def to_pandas(self, nullable=False): + def to_pandas(self, nullable: bool = False) -> pd.TimedeltaIndex: + if nullable: + raise NotImplementedError(f"{nullable=} is not implemented.") return pd.TimedeltaIndex( self._values.to_pandas(), name=self.name, - unit=self._values.time_unit, ) @property # type: ignore @@ -3302,7 +3307,7 @@ def __init__(self, values, copy=False, **kwargs): super().__init__(values, **kwargs) @_cudf_nvtx_annotate - def to_pandas(self, nullable=False): + def to_pandas(self, nullable: bool = False) -> pd.Index: return pd.Index( self.to_numpy(na_value=None), name=self.name, diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index d0c8a513686..4f71c030cc3 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1592,10 +1592,8 @@ def droplevel(self, level=-1): return mi @_cudf_nvtx_annotate - def to_pandas(self, nullable=False, **kwargs): - result = self.to_frame( - index=False, name=list(range(self.nlevels)) - ).to_pandas(nullable=nullable) + def to_pandas(self, nullable: bool = False) -> pd.MultiIndex: + result = self.to_frame(index=False).to_pandas(nullable=nullable) return pd.MultiIndex.from_frame(result, names=self.names) @classmethod diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 04a7ed3abf7..746edafbe5e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2064,7 +2064,9 @@ def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): return super().any(axis, skipna, level, **kwargs) @_cudf_nvtx_annotate - def to_pandas(self, index=True, nullable=False, **kwargs): + def to_pandas( + self, index: bool = True, nullable: bool = False + ) -> pd.Series: """ Convert to a Pandas Series. diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 087b93f1a02..c393522c28b 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2750,11 +2750,14 @@ def test_index_error_list_index(): ) def test_index_hasnans(data): gs = cudf.Index(data, nan_as_null=False) - ps = gs.to_pandas(nullable=True) - - # Check type to avoid mixing Python bool and NumPy bool - assert isinstance(gs.hasnans, bool) - assert gs.hasnans == ps.hasnans + if isinstance(gs, cudf.RangeIndex): + with pytest.raises(NotImplementedError): + gs.to_pandas(nullable=True) + else: + ps = gs.to_pandas(nullable=True) + # Check type to avoid mixing Python bool and NumPy bool + assert isinstance(gs.hasnans, bool) + assert gs.hasnans == ps.hasnans @pytest.mark.parametrize( @@ -2949,3 +2952,16 @@ def test_index_getitem_from_int(idx): def test_index_getitem_from_nonint_raises(idx): with pytest.raises(ValueError): cudf.Index([1, 2])[idx] + + +@pytest.mark.parametrize( + "idx", + [ + cudf.RangeIndex(1), + cudf.DatetimeIndex(np.array([1, 2], dtype="datetime64[ns]")), + cudf.TimedeltaIndex(np.array([1, 2], dtype="timedelta64[ns]")), + ], +) +def test_index_to_pandas_nullable_notimplemented(idx): + with pytest.raises(NotImplementedError): + idx.to_pandas(nullable=True) From 292c3d120394bfb02f4c7e8a31cd7526f6b46740 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 16 Nov 2023 18:30:28 -0800 Subject: [PATCH 2/9] Ignore index --- python/cudf/cudf/core/column/datetime.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index c473b845861..ff9fe3cc17d 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -686,8 +686,6 @@ def to_pandas( index: Optional[pd.Index] = None, nullable: bool = False, ) -> pd.Series: - if index is not None: - raise NotImplementedError(f"{index=} is not implemented.") if nullable: raise NotImplementedError(f"{nullable=} is not implemented.") return self._local_time.to_pandas().dt.tz_localize( From 864777ae054b3e5618396a404f4b957c1de180a8 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 17 Nov 2023 14:28:32 -0800 Subject: [PATCH 3/9] Make nullable keyword only --- python/cudf/cudf/core/_base_index.py | 2 +- python/cudf/cudf/core/column/timedelta.py | 3 +-- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/index.py | 10 +++++----- python/cudf/cudf/core/multiindex.py | 2 +- python/cudf/cudf/core/series.py | 6 +++--- python/cudf/cudf/testing/testing.py | 8 ++++++-- python/cudf/cudf/tests/dataframe/test_conversion.py | 10 ++++++++-- python/cudf/cudf/tests/test_categorical.py | 4 ++-- python/cudf/cudf/tests/test_dataframe.py | 6 +++--- python/cudf/cudf/tests/test_parquet.py | 8 +++++++- python/cudf/cudf/tests/test_udf_masked_ops.py | 6 +++--- 12 files changed, 41 insertions(+), 26 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 943f363feca..8387ef96dfa 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -850,7 +850,7 @@ def notna(self): """ raise NotImplementedError - def to_pandas(self, nullable: bool = False): + def to_pandas(self, *, nullable: bool = False): """ Convert to a Pandas Index. diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 1e8ab43afaf..ddbc7309133 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -143,7 +143,7 @@ def to_arrow(self) -> pa.Array: null_count=self.null_count, ) - def to_pandas(self, index=None, nullable: bool = False) -> pd.Series: + def to_pandas(self, *, nullable: bool = False) -> pd.Series: # `copy=True` workaround until following issue is fixed: # https://issues.apache.org/jira/browse/ARROW-9772 @@ -157,7 +157,6 @@ def to_pandas(self, index=None, nullable: bool = False) -> pd.Series: self.astype("timedelta64[ns]").to_arrow(), copy=True, dtype=self.dtype, - index=index, ) def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 21eb5151088..703d278d003 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5111,7 +5111,7 @@ def describe( ) @_cudf_nvtx_annotate - def to_pandas(self, nullable: bool = False) -> pd.DataFrame: + def to_pandas(self, *, nullable: bool = False) -> pd.DataFrame: """ Convert to a Pandas DataFrame. diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 57812335d35..6777e485061 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -503,7 +503,7 @@ def dtype(self): return _maybe_convert_to_default_type(dtype) @_cudf_nvtx_annotate - def to_pandas(self, nullable: bool = False) -> pd.RangeIndex: + def to_pandas(self, *, nullable: bool = False) -> pd.RangeIndex: if nullable: raise NotImplementedError(f"{nullable=} is not implemented.") return pd.RangeIndex( @@ -1571,7 +1571,7 @@ def _clean_nulls_from_index(self): def any(self): return self._values.any() - def to_pandas(self, nullable: bool = False) -> pd.Index: + def to_pandas(self, *, nullable: bool = False) -> pd.Index: return pd.Index( self._values.to_pandas(nullable=nullable), name=self.name ) @@ -2512,7 +2512,7 @@ def isocalendar(self): return cudf.core.tools.datetimes._to_iso_calendar(self) @_cudf_nvtx_annotate - def to_pandas(self, nullable: bool = False) -> pd.DatetimeIndex: + def to_pandas(self, *, nullable: bool = False) -> pd.DatetimeIndex: if nullable: raise NotImplementedError(f"{nullable=} is not implemented.") # TODO: no need to convert to nanos with Pandas 2.x @@ -2838,7 +2838,7 @@ def __getitem__(self, index): return value @_cudf_nvtx_annotate - def to_pandas(self, nullable: bool = False) -> pd.TimedeltaIndex: + def to_pandas(self, *, nullable: bool = False) -> pd.TimedeltaIndex: if nullable: raise NotImplementedError(f"{nullable=} is not implemented.") return pd.TimedeltaIndex( @@ -3307,7 +3307,7 @@ def __init__(self, values, copy=False, **kwargs): super().__init__(values, **kwargs) @_cudf_nvtx_annotate - def to_pandas(self, nullable: bool = False) -> pd.Index: + def to_pandas(self, *, nullable: bool = False) -> pd.Index: return pd.Index( self.to_numpy(na_value=None), name=self.name, diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 4f71c030cc3..a2a1a9b3d58 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1592,7 +1592,7 @@ def droplevel(self, level=-1): return mi @_cudf_nvtx_annotate - def to_pandas(self, nullable: bool = False) -> pd.MultiIndex: + def to_pandas(self, *, nullable: bool = False) -> pd.MultiIndex: result = self.to_frame(index=False).to_pandas(nullable=nullable) return pd.MultiIndex.from_frame(result, names=self.names) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 746edafbe5e..9b5e5f2e01e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2126,10 +2126,10 @@ def to_pandas( 3 30.0 dtype: float64 """ - if index is True: - index = self.index.to_pandas() - s = self._column.to_pandas(index=index, nullable=nullable) + s = self._column.to_pandas(nullable=nullable) s.name = self.name + if index is True: + s.index = self.index.to_pandas() return s @property # type: ignore diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index a9c54ddcaa1..a6262e99481 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -271,8 +271,12 @@ def assert_column_equal( left = left.astype(left.categories.dtype) right = right.astype(right.categories.dtype) if not columns_equal: - ldata = str([val for val in left.to_pandas(nullable=True)]) - rdata = str([val for val in right.to_pandas(nullable=True)]) + try: + ldata = str([val for val in left.to_pandas(nullable=True)]) + rdata = str([val for val in right.to_pandas(nullable=True)]) + except NotImplementedError: + ldata = str([val for val in left.to_pandas(nullable=False)]) + rdata = str([val for val in right.to_pandas(nullable=False)]) try: diff = 0 for i in range(left.size): diff --git a/python/cudf/cudf/tests/dataframe/test_conversion.py b/python/cudf/cudf/tests/dataframe/test_conversion.py index 3673ea827f9..2d3014f048b 100644 --- a/python/cudf/cudf/tests/dataframe/test_conversion.py +++ b/python/cudf/cudf/tests/dataframe/test_conversion.py @@ -1,5 +1,6 @@ # Copyright (c) 2023, NVIDIA CORPORATION. import pandas as pd +import pytest import cudf from cudf.testing._utils import assert_eq @@ -26,6 +27,7 @@ def test_convert_dtypes(): "category", "datetime64[ns]", ] + nullable_columns = list("abcdef") df = pd.DataFrame( { k: pd.Series(v, dtype=d) @@ -33,6 +35,10 @@ def test_convert_dtypes(): } ) gdf = cudf.DataFrame.from_pandas(df) - expect = df.convert_dtypes() - got = gdf.convert_dtypes().to_pandas(nullable=True) + expect = df[nullable_columns].convert_dtypes() + got = gdf[nullable_columns].convert_dtypes().to_pandas(nullable=True) assert_eq(expect, got) + + with pytest.raises(NotImplementedError): + # category and datetime64[ns] are not nullable + gdf[nullable_columns].convert_dtypes().to_pandas(nullable=True) diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index afbc0dc6c17..49eeff01bee 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -227,7 +227,7 @@ def test_df_cat_set_index(): df["b"] = np.arange(len(df)) got = df.set_index("a") - pddf = df.to_pandas(nullable_pd_dtype=False) + pddf = df.to_pandas() expect = pddf.set_index("a") assert_eq(got, expect) @@ -239,7 +239,7 @@ def test_df_cat_sort_index(): df["b"] = np.arange(len(df)) got = df.set_index("a").sort_index() - expect = df.to_pandas(nullable_pd_dtype=False).set_index("a").sort_index() + expect = df.to_pandas().set_index("a").sort_index() assert_eq(got, expect) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 5677f97408a..3ae991fa9aa 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2163,9 +2163,9 @@ def test_dataframe_transpose(nulls, num_cols, num_rows, dtype): got_property = gdf.T expect = pdf.transpose() - - assert_eq(expect, got_function.to_pandas(nullable=True)) - assert_eq(expect, got_property.to_pandas(nullable=True)) + nullable = dtype not in DATETIME_TYPES + assert_eq(expect, got_function.to_pandas(nullable=nullable)) + assert_eq(expect, got_property.to_pandas(nullable=nullable)) @pytest.mark.parametrize("num_cols", [1, 2, 10]) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index af4d0294293..cd6c3de61aa 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2593,7 +2593,8 @@ def test_parquet_writer_decimal(decimal_type, data): gdf.to_parquet(buff) got = pd.read_parquet(buff, use_nullable_dtypes=True) - assert_eq(gdf.to_pandas(nullable=True), got) + assert_eq(gdf["val"].to_pandas(nullable=True), got["val"]) + assert_eq(gdf["dec_val"].to_pandas(), got["dec_val"]) def test_parquet_writer_column_validation(): @@ -2627,6 +2628,11 @@ def test_parquet_writer_nulls_pandas_read(tmpdir, pdf): got = pd.read_parquet(fname) nullable = num_rows > 0 + if nullable: + gdf = gdf.drop(columns="col_datetime64[ms]") + gdf = gdf.drop(columns="col_datetime64[us]") + got = got.drop(columns="col_datetime64[ms]") + got = got.drop(columns="col_datetime64[us]") assert_eq(gdf.to_pandas(nullable=nullable), got) diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index 85531f8fae8..ad0c961a749 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -64,9 +64,9 @@ def substr(request): return request.param -def run_masked_udf_test(func, data, args=(), **kwargs): +def run_masked_udf_test(func, data, args=(), nullable=True, **kwargs): gdf = data - pdf = data.to_pandas(nullable=True) + pdf = data.to_pandas(nullable=nullable) expect = pdf.apply(func, args=args, axis=1) obtain = gdf.apply(func, args=args, axis=1) @@ -184,7 +184,7 @@ def func(row): ) gdf["a"] = gdf["a"].astype(dtype_l) gdf["b"] = gdf["b"].astype(dtype_r) - run_masked_udf_test(func, gdf, check_dtype=False) + run_masked_udf_test(func, gdf, nullable=False, check_dtype=False) @pytest.mark.parametrize("op", comparison_ops) From c1fb1010efae85c4309761749aec0a990078c67c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 17 Nov 2023 14:34:33 -0800 Subject: [PATCH 4/9] Fix typing --- python/cudf/cudf/core/column/column.py | 1 + python/cudf/cudf/core/column/timedelta.py | 5 ++++- python/cudf/cudf/core/series.py | 8 +++++--- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 3fc0484211e..508e1c0c7bc 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -204,6 +204,7 @@ def __repr__(self): def to_pandas( self, + *, index: Optional[pd.Index] = None, nullable: bool = False, ) -> pd.Series: diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index ddbc7309133..572b3b894dc 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -143,7 +143,9 @@ def to_arrow(self) -> pa.Array: null_count=self.null_count, ) - def to_pandas(self, *, nullable: bool = False) -> pd.Series: + def to_pandas( + self, *, index: Optional[pd.Index] = None, nullable: bool = False + ) -> pd.Series: # `copy=True` workaround until following issue is fixed: # https://issues.apache.org/jira/browse/ARROW-9772 @@ -157,6 +159,7 @@ def to_pandas(self, *, nullable: bool = False) -> pd.Series: self.astype("timedelta64[ns]").to_arrow(), copy=True, dtype=self.dtype, + index=index, ) def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 9b5e5f2e01e..96758084894 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2126,10 +2126,12 @@ def to_pandas( 3 30.0 dtype: float64 """ - s = self._column.to_pandas(nullable=nullable) - s.name = self.name if index is True: - s.index = self.index.to_pandas() + index = self.index.to_pandas() + else: + index = None # type: ignore[assignment] + s = self._column.to_pandas(index=index, nullable=nullable) + s.name = self.name return s @property # type: ignore From e4707e85bcb05d90adf91583d9a66c620185a6ba Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 17 Nov 2023 14:36:26 -0800 Subject: [PATCH 5/9] Make column signatures keyword only --- python/cudf/cudf/core/column/categorical.py | 2 +- python/cudf/cudf/core/column/datetime.py | 2 ++ python/cudf/cudf/core/column/interval.py | 2 +- python/cudf/cudf/core/column/numerical.py | 1 + python/cudf/cudf/core/column/string.py | 1 + python/cudf/cudf/core/column/struct.py | 2 +- 6 files changed, 7 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 6ab079412c0..689cb02ed86 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -974,7 +974,7 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]: ) def to_pandas( - self, index: Optional[pd.Index] = None, nullable: bool = False + self, *, index: Optional[pd.Index] = None, nullable: bool = False ) -> pd.Series: if nullable: raise NotImplementedError(f"{nullable=} is not implemented.") diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index ff9fe3cc17d..c04501a40bb 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -304,6 +304,7 @@ def day_of_year(self) -> ColumnBase: def to_pandas( self, + *, index: Optional[pd.Index] = None, nullable: bool = False, ) -> pd.Series: @@ -683,6 +684,7 @@ def __init__( def to_pandas( self, + *, index: Optional[pd.Index] = None, nullable: bool = False, ) -> pd.Series: diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index afae4c8528e..b550e272b2c 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -126,7 +126,7 @@ def as_interval_column(self, dtype, **kwargs): raise ValueError("dtype must be IntervalDtype") def to_pandas( - self, index: Optional[pd.Index] = None, nullable: bool = False + self, *, index: Optional[pd.Index] = None, nullable: bool = False ) -> pd.Series: # Note: This does not handle null values in the interval column. # However, this exact sequence (calling __from_arrow__ on the output of diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 151b5d5de8d..cdecf44cc8f 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -679,6 +679,7 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: def to_pandas( self, + *, index: Optional[pd.Index] = None, nullable: bool = False, ) -> pd.Series: diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 5820dec72b1..a5e9bbf81f2 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5756,6 +5756,7 @@ def values(self) -> cupy.ndarray: def to_pandas( self, + *, index: Optional[pd.Index] = None, nullable: bool = False, ) -> pd.Series: diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 3083d1ac729..35fb18cc5c7 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -59,7 +59,7 @@ def to_arrow(self): ) def to_pandas( - self, index: Optional[pd.Index] = None, nullable: bool = False + self, *, index: Optional[pd.Index] = None, nullable: bool = False ) -> pd.Series: # We cannot go via Arrow's `to_pandas` because of the following issue: # https://issues.apache.org/jira/browse/ARROW-12680 From 15acccdd5bd05f69240acb087f8456d0061fbc53 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 17 Nov 2023 14:44:18 -0800 Subject: [PATCH 6/9] Fix datetimetz not respecting index --- python/cudf/cudf/core/column/datetime.py | 5 ++++- python/cudf/cudf/core/series.py | 2 +- python/cudf/cudf/tests/series/test_datetimelike.py | 13 +++++++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index c04501a40bb..335fafe499c 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -690,9 +690,12 @@ def to_pandas( ) -> pd.Series: if nullable: raise NotImplementedError(f"{nullable=} is not implemented.") - return self._local_time.to_pandas().dt.tz_localize( + series = self._local_time.to_pandas().dt.tz_localize( self.dtype.tz, ambiguous="NaT", nonexistent="NaT" ) + if index is not None: + series.index = index + return series def to_arrow(self): return pa.compute.assume_timezone( diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 96758084894..b3782aae24a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2065,7 +2065,7 @@ def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): @_cudf_nvtx_annotate def to_pandas( - self, index: bool = True, nullable: bool = False + self, *, index: bool = True, nullable: bool = False ) -> pd.Series: """ Convert to a Pandas Series. diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py index 85da985940f..df68eaca399 100644 --- a/python/cudf/cudf/tests/series/test_datetimelike.py +++ b/python/cudf/cudf/tests/series/test_datetimelike.py @@ -180,6 +180,19 @@ def test_convert_edge_cases(data, original_timezone, target_timezone): assert_eq(expect, got) +def test_to_pandas_index_true_timezone(): + data = [ + "2008-05-12", + "2008-12-12", + "2009-05-12", + ] + dti = cudf.DatetimeIndex(data).tz_localize("UTC") + ser = cudf.Series(dti, index=list("abc")) + result = ser.to_pandas(index=True) + expected = pd.Series(pd.to_datetime(data, utc=True), index=list("abc")) + assert_eq(result, expected) + + def test_tz_aware_attributes_local(): data = [ "2008-05-12 13:50:00", From 88cc13c0fc241fb1adb24c648ee003a37fc1a9ae Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 20 Nov 2023 15:07:42 -0800 Subject: [PATCH 7/9] Fix multiindex typo, test cases --- python/cudf/cudf/core/multiindex.py | 4 +++- python/cudf/cudf/io/json.py | 16 +++++++++++++++- .../cudf/cudf/tests/dataframe/test_conversion.py | 4 +++- python/cudf/cudf/tests/series/test_conversion.py | 3 ++- python/cudf/cudf/tests/test_parquet.py | 2 +- 5 files changed, 24 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index a2a1a9b3d58..820f6e7769c 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1593,7 +1593,9 @@ def droplevel(self, level=-1): @_cudf_nvtx_annotate def to_pandas(self, *, nullable: bool = False) -> pd.MultiIndex: - result = self.to_frame(index=False).to_pandas(nullable=nullable) + result = self.to_frame( + index=False, name=list(range(self.nlevels)) + ).to_pandas(nullable=nullable) return pd.MultiIndex.from_frame(result, names=self.names) @classmethod diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index efac24aee17..ae2f0203642 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -179,6 +179,13 @@ def read_json( return df +def maybe_return_nullable_pd_obj(cudf_obj): + try: + return cudf_obj.to_pandas(nullable=True) + except NotImplementedError: + return cudf_obj.to_pandas(nullable=False) + + @ioutils.doc_to_json() def to_json( cudf_val, @@ -228,7 +235,14 @@ def to_json( return path_or_buf.read() elif engine == "pandas": warnings.warn("Using CPU via Pandas to write JSON dataset") - pd_value = cudf_val.to_pandas(nullable=True) + if isinstance(cudf_val, cudf.DataFrame): + pd_data = { + col: maybe_return_nullable_pd_obj(series) + for col, series in cudf_val.items() + } + pd_value = pd.DataFrame(pd_data) + else: + pd_value = maybe_return_nullable_pd_obj(cudf_val) return pd.io.json.to_json( path_or_buf, pd_value, diff --git a/python/cudf/cudf/tests/dataframe/test_conversion.py b/python/cudf/cudf/tests/dataframe/test_conversion.py index 2d3014f048b..fa7e5ec1d4c 100644 --- a/python/cudf/cudf/tests/dataframe/test_conversion.py +++ b/python/cudf/cudf/tests/dataframe/test_conversion.py @@ -28,6 +28,8 @@ def test_convert_dtypes(): "datetime64[ns]", ] nullable_columns = list("abcdef") + non_nullable_columns = list(set(data.keys()).difference(nullable_columns)) + df = pd.DataFrame( { k: pd.Series(v, dtype=d) @@ -41,4 +43,4 @@ def test_convert_dtypes(): with pytest.raises(NotImplementedError): # category and datetime64[ns] are not nullable - gdf[nullable_columns].convert_dtypes().to_pandas(nullable=True) + gdf[non_nullable_columns].convert_dtypes().to_pandas(nullable=True) diff --git a/python/cudf/cudf/tests/series/test_conversion.py b/python/cudf/cudf/tests/series/test_conversion.py index 08124a9a98e..43ac35e41a6 100644 --- a/python/cudf/cudf/tests/series/test_conversion.py +++ b/python/cudf/cudf/tests/series/test_conversion.py @@ -26,7 +26,8 @@ def test_convert_dtypes(data, dtype): # because we don't have distinct nullable types, we check that we # get the same result if we convert to nullable pandas types: - got = gs.convert_dtypes().to_pandas(nullable=True) + nullable = dtype not in ("category", "datetime64[ns]") + got = gs.convert_dtypes().to_pandas(nullable=nullable) assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index cd6c3de61aa..b3a06dbd742 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2677,7 +2677,7 @@ def test_parquet_reader_one_level_list(datadir): fname = datadir / "one_level_list.parquet" expect = pd.read_parquet(fname) - got = cudf.read_parquet(fname).to_pandas(nullable=True) + got = cudf.read_parquet(fname) assert_eq(expect, got) From 8629ab34b0b604bdc3399e919b284b2c716a0100 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 21 Nov 2023 12:12:33 -0800 Subject: [PATCH 8/9] test raising behavior instead of silent --- python/cudf/cudf/tests/test_list.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 8149188ba93..7ae7ae34b97 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -90,10 +90,8 @@ def test_leaves(data): def test_list_to_pandas_nullable_true(): df = cudf.DataFrame({"a": cudf.Series([[1, 2, 3]])}) - actual = df.to_pandas(nullable=True) - expected = pd.DataFrame({"a": pd.Series([[1, 2, 3]])}) - - assert_eq(actual, expected) + with pytest.raises(NotImplementedError): + df.to_pandas(nullable=True) def test_listdtype_hash(): From f0d9466d0c4d054337ff88eb663b241631ef815b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 22 Nov 2023 16:39:01 -0800 Subject: [PATCH 9/9] Change nullable_pd_dtype to nullable --- python/dask_cudf/dask_cudf/core.py | 6 ++---- python/dask_cudf/dask_cudf/tests/test_accessor.py | 4 ++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 17650c9b70d..ecdc566037d 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -56,10 +56,8 @@ def __repr__(self): @_dask_cudf_nvtx_annotate def to_dask_dataframe(self, **kwargs): """Create a dask.dataframe object from a dask_cudf object""" - nullable_pd_dtype = kwargs.get("nullable_pd_dtype", False) - return self.map_partitions( - M.to_pandas, nullable_pd_dtype=nullable_pd_dtype - ) + nullable = kwargs.get("nullable", False) + return self.map_partitions(M.to_pandas, nullable=nullable) concat = dd.concat diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index bea0cbb48ae..7ed5d797822 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -259,13 +259,13 @@ def test_categorical_categories(): {"a": ["a", "b", "c", "d", "e", "e", "a", "d"], "b": range(8)} ) df["a"] = df["a"].astype("category") - pdf = df.to_pandas(nullable_pd_dtype=False) + pdf = df.to_pandas(nullable=False) ddf = dgd.from_cudf(df, 2) dpdf = dd.from_pandas(pdf, 2) dd.assert_eq( - ddf.a.cat.categories.to_series().to_pandas(nullable_pd_dtype=False), + ddf.a.cat.categories.to_series().to_pandas(nullable=False), dpdf.a.cat.categories.to_series(), check_index=False, )