REF: Remove **kwargs from to_pandas, raise if nullable is not implemented #14438

Merged: 14 commits, Nov 27, 2023
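
In short: after this change `to_pandas` takes `nullable` (and `index`, where applicable) as a keyword-only argument, and column types with no nullable pandas equivalent raise `NotImplementedError` instead of silently ignoring the request. A minimal sketch of the new behavior (the values are illustrative, not taken from the PR):

```python
import cudf

# String columns do support a nullable conversion: pandas StringDtype.
s = cudf.Series(["a", "b", None])
print(s.to_pandas(nullable=True).dtype)  # string

# Categorical columns have no nullable pandas dtype; the request now raises
# instead of being silently dropped.
c = cudf.Series(["x", "y"], dtype="category")
try:
    c.to_pandas(nullable=True)
except NotImplementedError as err:
    print(err)  # nullable=True is not implemented.

# nullable is keyword-only now, so s.to_pandas(True) is a TypeError.
```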
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/_base_index.py
@@ -850,7 +850,7 @@ def notna(self):
"""
raise NotImplementedError

def to_pandas(self, nullable=False):
def to_pandas(self, *, nullable: bool = False):
"""
Convert to a Pandas Index.

5 changes: 4 additions & 1 deletion python/cudf/cudf/core/column/categorical.py
@@ -974,8 +974,11 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]:
)

def to_pandas(
self, index: Optional[pd.Index] = None, **kwargs
self, *, index: Optional[pd.Index] = None, nullable: bool = False
) -> pd.Series:
if nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")

if self.categories.dtype.kind == "f":
new_mask = bools_to_mask(self.notnull())
col = column.build_categorical_column(
13 changes: 8 additions & 5 deletions python/cudf/cudf/core/column/column.py
@@ -203,17 +203,20 @@ def __repr__(self):
)

def to_pandas(
self, index: Optional[pd.Index] = None, **kwargs
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
) -> pd.Series:
"""Convert object to pandas type.

The default implementation falls back to PyArrow for the conversion.
"""
# This default implementation does not handle nulls in any meaningful
# way, but must consume the parameter to avoid passing it to PyArrow
# (which does not recognize it).
kwargs.pop("nullable", None)
pd_series = self.to_arrow().to_pandas(**kwargs)
# way
if nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")
pd_series = self.to_arrow().to_pandas()

if index is not None:
pd_series.index = index
17 changes: 12 additions & 5 deletions python/cudf/cudf/core/column/datetime.py
@@ -308,10 +308,12 @@ def day_of_year(self) -> ColumnBase:

def to_pandas(
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
**kwargs,
) -> "cudf.Series":
) -> pd.Series:
if nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")
# `copy=True` workaround until following issue is fixed:
# https://issues.apache.org/jira/browse/ARROW-9772

@@ -686,13 +688,18 @@ def __init__(

def to_pandas(
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
**kwargs,
) -> "cudf.Series":
return self._local_time.to_pandas().dt.tz_localize(
) -> pd.Series:
if nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")
series = self._local_time.to_pandas().dt.tz_localize(
self.dtype.tz, ambiguous="NaT", nonexistent="NaT"
)
if index is not None:
series.index = index
return series

def to_arrow(self):
return pa.compute.assume_timezone(
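
The tz-aware path previously returned the pandas Series without attaching the requested index; the two added lines fix that. A small sketch of the corrected behavior, mirroring the test added in test_datetimelike.py below (illustrative values):

```python
import cudf

dti = cudf.DatetimeIndex(["2008-05-12", "2008-12-12"]).tz_localize("UTC")
ser = cudf.Series(dti, index=["a", "b"])

# Previously the "a"/"b" labels were dropped for tz-aware data; now the
# returned pandas Series carries them.
print(ser.to_pandas(index=True).index.tolist())  # ['a', 'b']
```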
4 changes: 3 additions & 1 deletion python/cudf/cudf/core/column/interval.py
@@ -126,13 +126,15 @@ def as_interval_column(self, dtype, **kwargs):
raise ValueError("dtype must be IntervalDtype")

def to_pandas(
self, index: Optional[pd.Index] = None, **kwargs
self, *, index: Optional[pd.Index] = None, nullable: bool = False
) -> pd.Series:
# Note: This does not handle null values in the interval column.
# However, this exact sequence (calling __from_arrow__ on the output of
# self.to_arrow) is currently the best known way to convert interval
# types into pandas (trying to convert the underlying numerical columns
# directly is problematic), so we're stuck with this for now.
if nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")
return pd.Series(
self.dtype.to_pandas().__from_arrow__(self.to_arrow()), index=index
)
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/column/numerical.py
@@ -679,9 +679,9 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:

def to_pandas(
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
**kwargs,
) -> pd.Series:
if nullable and self.dtype in np_dtypes_to_pandas_dtypes:
pandas_nullable_dtype = np_dtypes_to_pandas_dtypes[self.dtype]
@@ -691,7 +691,7 @@ def to_pandas(
elif str(self.dtype) in NUMERIC_TYPES and not self.has_nulls():
pd_series = pd.Series(self.values_host, copy=False)
else:
pd_series = self.to_arrow().to_pandas(**kwargs)
pd_series = self.to_arrow().to_pandas()

if index is not None:
pd_series.index = index
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/column/string.py
@@ -5756,15 +5756,15 @@ def values(self) -> cupy.ndarray:

def to_pandas(
self,
*,
index: Optional[pd.Index] = None,
nullable: bool = False,
**kwargs,
) -> pd.Series:
if nullable:
pandas_array = pd.StringDtype().__from_arrow__(self.to_arrow())
pd_series = pd.Series(pandas_array, copy=False)
else:
pd_series = self.to_arrow().to_pandas(**kwargs)
pd_series = self.to_arrow().to_pandas()

if index is not None:
pd_series.index = index
10 changes: 4 additions & 6 deletions python/cudf/cudf/core/column/struct.py
@@ -59,16 +59,14 @@ def to_arrow(self):
)

def to_pandas(
self, index: Optional[pd.Index] = None, **kwargs
self, *, index: Optional[pd.Index] = None, nullable: bool = False
) -> pd.Series:
# We cannot go via Arrow's `to_pandas` because of the following issue:
# https://issues.apache.org/jira/browse/ARROW-12680
if nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")

pd_series = pd.Series(self.to_arrow().tolist(), dtype="object")

if index is not None:
pd_series.index = index
return pd_series
return pd.Series(self.to_arrow().tolist(), dtype="object", index=index)

@cached_property
def memory_usage(self):
5 changes: 4 additions & 1 deletion python/cudf/cudf/core/column/timedelta.py
@@ -144,14 +144,17 @@ def to_arrow(self) -> pa.Array:
)

def to_pandas(
self, index=None, nullable: bool = False, **kwargs
self, *, index: Optional[pd.Index] = None, nullable: bool = False
) -> pd.Series:
# `copy=True` workaround until following issue is fixed:
# https://issues.apache.org/jira/browse/ARROW-9772

# Pandas only supports `timedelta64[ns]` dtype
# and conversion to this type is necessary to make
# arrow to pandas conversion happen for large values.
if nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")

return pd.Series(
self.astype("timedelta64[ns]").to_arrow(),
copy=True,
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/dataframe.py
@@ -5128,7 +5128,7 @@ def describe(
)

@_cudf_nvtx_annotate
def to_pandas(self, nullable=False, **kwargs):
def to_pandas(self, *, nullable: bool = False) -> pd.DataFrame:
"""
Convert to a Pandas DataFrame.

17 changes: 11 additions & 6 deletions python/cudf/cudf/core/index.py
@@ -501,7 +501,9 @@ def dtype(self):
return _maybe_convert_to_default_type(dtype)

@_cudf_nvtx_annotate
def to_pandas(self, nullable=False):
def to_pandas(self, *, nullable: bool = False) -> pd.RangeIndex:
if nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")
return pd.RangeIndex(
start=self._start,
stop=self._stop,
@@ -1567,7 +1569,7 @@ def _clean_nulls_from_index(self):
def any(self):
return self._values.any()

def to_pandas(self, nullable=False):
def to_pandas(self, *, nullable: bool = False) -> pd.Index:
return pd.Index(
self._values.to_pandas(nullable=nullable), name=self.name
)
@@ -2508,7 +2510,9 @@ def isocalendar(self):
return cudf.core.tools.datetimes._to_iso_calendar(self)

@_cudf_nvtx_annotate
def to_pandas(self, nullable=False):
def to_pandas(self, *, nullable: bool = False) -> pd.DatetimeIndex:
if nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")
# TODO: no need to convert to nanos with Pandas 2.x
if isinstance(self.dtype, pd.DatetimeTZDtype):
nanos = self._values.astype(
@@ -2832,11 +2836,12 @@ def __getitem__(self, index):
return value

@_cudf_nvtx_annotate
def to_pandas(self, nullable=False):
def to_pandas(self, *, nullable: bool = False) -> pd.TimedeltaIndex:
if nullable:
raise NotImplementedError(f"{nullable=} is not implemented.")
return pd.TimedeltaIndex(
self._values.to_pandas(),
name=self.name,
unit=self._values.time_unit,
)

@property # type: ignore
@@ -3300,7 +3305,7 @@ def __init__(self, values, copy=False, **kwargs):
super().__init__(values, **kwargs)

@_cudf_nvtx_annotate
def to_pandas(self, nullable=False):
def to_pandas(self, *, nullable: bool = False) -> pd.Index:
return pd.Index(
self.to_numpy(na_value=None),
name=self.name,
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/multiindex.py
@@ -1592,7 +1592,7 @@ def droplevel(self, level=-1):
return mi

@_cudf_nvtx_annotate
def to_pandas(self, nullable=False, **kwargs):
def to_pandas(self, *, nullable: bool = False) -> pd.MultiIndex:
result = self.to_frame(
index=False, name=list(range(self.nlevels))
).to_pandas(nullable=nullable)
6 changes: 5 additions & 1 deletion python/cudf/cudf/core/series.py
@@ -2064,7 +2064,9 @@ def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs):
return super().any(axis, skipna, level, **kwargs)

@_cudf_nvtx_annotate
def to_pandas(self, index=True, nullable=False, **kwargs):
def to_pandas(
self, *, index: bool = True, nullable: bool = False
) -> pd.Series:
"""
Convert to a Pandas Series.

@@ -2126,6 +2128,8 @@ def to_pandas(self, index=True, nullable=False, **kwargs):
"""
if index is True:
index = self.index.to_pandas()
else:
index = None # type: ignore[assignment]
s = self._column.to_pandas(index=index, nullable=nullable)
s.name = self.name
return s
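
With the new `else` branch, `index=False` passes `index=None` down to the column-level `to_pandas`, so the result keeps a default `RangeIndex` rather than the cuDF index. A quick sketch (illustrative values):

```python
import cudf

ser = cudf.Series([10, 20, 30], index=["x", "y", "z"])
print(ser.to_pandas(index=True).index.tolist())  # ['x', 'y', 'z']
print(ser.to_pandas(index=False).index)          # RangeIndex(start=0, stop=3, step=1)
```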
16 changes: 15 additions & 1 deletion python/cudf/cudf/io/json.py
@@ -179,6 +179,13 @@ def read_json(
return df


def maybe_return_nullable_pd_obj(cudf_obj):
try:
return cudf_obj.to_pandas(nullable=True)
except NotImplementedError:
return cudf_obj.to_pandas(nullable=False)


@ioutils.doc_to_json()
def to_json(
cudf_val,
@@ -228,7 +235,14 @@ def to_json(
return path_or_buf.read()
elif engine == "pandas":
warnings.warn("Using CPU via Pandas to write JSON dataset")
pd_value = cudf_val.to_pandas(nullable=True)
if isinstance(cudf_val, cudf.DataFrame):
pd_data = {
col: maybe_return_nullable_pd_obj(series)
for col, series in cudf_val.items()
}
pd_value = pd.DataFrame(pd_data)
else:
pd_value = maybe_return_nullable_pd_obj(cudf_val)
Comment on lines +238 to +245
Contributor: This is just handling that nullable=True was previously silently ignored for to_pandas calls that didn't support it, and now it is loud?

Contributor (author): Correct!

return pd.io.json.to_json(
path_or_buf,
pd_value,
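
The new `maybe_return_nullable_pd_obj` helper tries the nullable conversion and falls back to the non-nullable path for column types that cannot support it. A sketch of the same per-column pattern in isolation (assuming the helper remains importable from `cudf.io.json`; it is an internal detail, and the frame below is illustrative):

```python
import cudf
from cudf.io.json import maybe_return_nullable_pd_obj

df = cudf.DataFrame(
    {
        "a": [1, None, 3],  # converts to a nullable extension dtype (e.g. Int64)
        "b": cudf.Series(["x", "y", "z"], dtype="category"),  # falls back, stays category
    }
)
pd_data = {col: maybe_return_nullable_pd_obj(series) for col, series in df.items()}
print({col: series.dtype for col, series in pd_data.items()})
```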
8 changes: 6 additions & 2 deletions python/cudf/cudf/testing/testing.py
@@ -271,8 +271,12 @@ def assert_column_equal(
left = left.astype(left.categories.dtype)
right = right.astype(right.categories.dtype)
if not columns_equal:
ldata = str([val for val in left.to_pandas(nullable=True)])
rdata = str([val for val in right.to_pandas(nullable=True)])
try:
ldata = str([val for val in left.to_pandas(nullable=True)])
rdata = str([val for val in right.to_pandas(nullable=True)])
except NotImplementedError:
ldata = str([val for val in left.to_pandas(nullable=False)])
rdata = str([val for val in right.to_pandas(nullable=False)])
try:
diff = 0
for i in range(left.size):
12 changes: 10 additions & 2 deletions python/cudf/cudf/tests/dataframe/test_conversion.py
@@ -1,5 +1,6 @@
# Copyright (c) 2023, NVIDIA CORPORATION.
import pandas as pd
import pytest

import cudf
from cudf.testing._utils import assert_eq
@@ -26,13 +27,20 @@ def test_convert_dtypes():
"category",
"datetime64[ns]",
]
nullable_columns = list("abcdef")
non_nullable_columns = list(set(data.keys()).difference(nullable_columns))

df = pd.DataFrame(
{
k: pd.Series(v, dtype=d)
for k, v, d in zip(data.keys(), data.values(), dtypes)
}
)
gdf = cudf.DataFrame.from_pandas(df)
expect = df.convert_dtypes()
got = gdf.convert_dtypes().to_pandas(nullable=True)
expect = df[nullable_columns].convert_dtypes()
got = gdf[nullable_columns].convert_dtypes().to_pandas(nullable=True)
assert_eq(expect, got)

with pytest.raises(NotImplementedError):
# category and datetime64[ns] are not nullable
gdf[non_nullable_columns].convert_dtypes().to_pandas(nullable=True)
3 changes: 2 additions & 1 deletion python/cudf/cudf/tests/series/test_conversion.py
@@ -26,7 +26,8 @@ def test_convert_dtypes(data, dtype):

# because we don't have distinct nullable types, we check that we
# get the same result if we convert to nullable pandas types:
got = gs.convert_dtypes().to_pandas(nullable=True)
nullable = dtype not in ("category", "datetime64[ns]")
got = gs.convert_dtypes().to_pandas(nullable=nullable)
assert_eq(expect, got)


13 changes: 13 additions & 0 deletions python/cudf/cudf/tests/series/test_datetimelike.py
@@ -180,6 +180,19 @@ def test_convert_edge_cases(data, original_timezone, target_timezone):
assert_eq(expect, got)


def test_to_pandas_index_true_timezone():
data = [
"2008-05-12",
"2008-12-12",
"2009-05-12",
]
dti = cudf.DatetimeIndex(data).tz_localize("UTC")
ser = cudf.Series(dti, index=list("abc"))
result = ser.to_pandas(index=True)
expected = pd.Series(pd.to_datetime(data, utc=True), index=list("abc"))
assert_eq(result, expected)


def test_tz_aware_attributes_local():
data = [
"2008-05-12 13:50:00",
4 changes: 2 additions & 2 deletions python/cudf/cudf/tests/test_categorical.py
@@ -227,7 +227,7 @@ def test_df_cat_set_index():
df["b"] = np.arange(len(df))
got = df.set_index("a")

pddf = df.to_pandas(nullable_pd_dtype=False)
pddf = df.to_pandas()
expect = pddf.set_index("a")

assert_eq(got, expect)
@@ -239,7 +239,7 @@ def test_df_cat_sort_index():
df["b"] = np.arange(len(df))

got = df.set_index("a").sort_index()
expect = df.to_pandas(nullable_pd_dtype=False).set_index("a").sort_index()
expect = df.to_pandas().set_index("a").sort_index()

assert_eq(got, expect)
