rapidsai · rapids-bot · Apr 28, 2021 · Apr 26, 2021 · Apr 26, 2021 · Apr 27, 2021
@@ -17,6 +17,7 @@
 
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 from numba import cuda
 
 import cudf
@@ -1099,6 +1100,24 @@ def to_pandas(
         )
         return pd.Series(data, index=index)
 
+    def to_arrow(self) -> pa.Array:
+        """Convert this column to a ``pyarrow.DictionaryArray``.
+
+        The codes supply the indices, the categories supply the dictionary,
+        and the column's ``ordered`` flag is passed through.
+        """
+        # arrow doesn't support unsigned codes, so cast them to the type
+        # ``min_signed_type`` gives for the largest code (int8 when empty).
+        if self.codes.size > 0:
+            index_dtype = min_signed_type(self.codes.max())
+        else:
+            index_dtype = np.int8
+        return pa.DictionaryArray.from_arrays(
+            self.codes.astype(index_dtype).to_arrow(),
+            self.categories.to_arrow(),
+            ordered=self.ordered,
+        )
+
     @property
     def values_host(self) -> np.ndarray:
         """

@@ -12,7 +12,6 @@
     Callable,
     Dict,
     List,
-    Mapping,
     Optional,
     Sequence,
     Tuple,
@@ -44,9 +43,7 @@
 from cudf.core.dtypes import CategoricalDtype, IntervalDtype
 from cudf.utils import ioutils, utils
 from cudf.utils.dtypes import (
-    NUMERIC_TYPES,
     check_cast_unsupported_dtype,
-    cudf_dtypes_to_pandas_dtypes,
     get_time_unit,
     is_categorical_dtype,
     is_decimal_dtype,
@@ -56,7 +53,6 @@
     is_scalar,
     is_string_dtype,
     is_struct_dtype,
-    min_signed_type,
     min_unsigned_type,
     np_to_pa_dtype,
 )
@@ -119,25 +115,19 @@ def __repr__(self):
     def to_pandas(
         self, index: ColumnLike = None, nullable: bool = False, **kwargs
     ) -> "pd.Series":
-        if nullable and self.dtype in cudf_dtypes_to_pandas_dtypes:
-            pandas_nullable_dtype = cudf_dtypes_to_pandas_dtypes[self.dtype]
-            arrow_array = self.to_arrow()
-            pandas_array = pandas_nullable_dtype.__from_arrow__(arrow_array)
-            pd_series = pd.Series(pandas_array, copy=False)
-        elif str(self.dtype) in NUMERIC_TYPES and self.null_count == 0:
-            pd_series = pd.Series(cupy.asnumpy(self.values), copy=False)
-        elif is_interval_dtype(self.dtype):
-            pd_series = pd.Series(
-                pd.IntervalDtype().__from_arrow__(self.to_arrow())
-            )
-        else:
-            pd_series = self.to_arrow().to_pandas(**kwargs)
+        """Convert object to pandas type.
+
+        The default implementation falls back to PyArrow for the conversion.
+
+        Parameters
+        ----------
+        index : ColumnLike, optional
+            If not None, assigned as the index of the resulting Series.
+        nullable : bool, default False
+            Not used by this default implementation.
+        **kwargs
+            Forwarded to the ``to_pandas`` call on the PyArrow array.
+
+        Returns
+        -------
+        pd.Series
+        """
+        pd_series = self.to_arrow().to_pandas(**kwargs)
 
         if index is not None:
             pd_series.index = index
         return pd_series
 
     def __iter__(self):
+        """Hand this column to ``cudf.utils.utils.raise_iteration_error``."""
+        # TODO: Why don't we just implement this method in terms of one of the
+        # proposed alternatives (to_arrow, to_pandas, or values_host)?
         cudf.utils.utils.raise_iteration_error(obj=self)
 
     @property
@@ -333,46 +323,14 @@ def to_arrow(self) -> pa.Array:
           4
         ]
         """
-        if isinstance(self, cudf.core.column.CategoricalColumn):
-            # arrow doesn't support unsigned codes
-            signed_type = (
-                min_signed_type(self.codes.max())
-                if self.codes.size > 0
-                else np.int8
-            )
-            codes = self.codes.astype(signed_type)
-            categories = self.categories
-
-            out_indices = codes.to_arrow()
-            out_dictionary = categories.to_arrow()
-
-            return pa.DictionaryArray.from_arrays(
-                out_indices, out_dictionary, ordered=self.ordered,
-            )
-
-        if isinstance(self, cudf.core.column.StringColumn) and (
-            self.null_count == len(self)
-        ):
-            return pa.NullArray.from_buffers(
-                pa.null(), len(self), [pa.py_buffer((b""))]
-            )
-
-        result = libcudf.interop.to_arrow(
+        return libcudf.interop.to_arrow(
             libcudf.table.Table(
                 cudf.core.column_accessor.ColumnAccessor({"None": self})
             ),
             [["None"]],
             keep_index=False,
         )["None"].chunk(0)
 
-        if isinstance(self.dtype, cudf.Decimal64Dtype):
-            result = result.view(
-                pa.decimal128(
-                    scale=result.type.scale, precision=self.dtype.precision
-                )
-            )
-        return result
-
     @classmethod
     def from_arrow(cls, array: pa.Array) -> ColumnBase:
         """
@@ -838,7 +796,7 @@ def find_last_value(self, value: ScalarLike, closest: bool = False) -> int:
         return indices[-1]
 
     def append(self, other: ColumnBase) -> ColumnBase:
-        return ColumnBase._concat([self, as_column(other)])
+        """Return ``_concat`` of ``self`` and ``as_column(other)``.
+
+        ``_concat`` is looked up on ``self.__class__`` rather than being
+        hard-coded to ``ColumnBase``.
+        """
+        return self.__class__._concat([self, as_column(other)])
 
     def quantile(
         self,
@@ -890,9 +848,6 @@ def isin(self, values: Sequence) -> ColumnBase:
         result: Column
             Column of booleans indicating if each element is in values.
         """
-        lhs = self
-        rhs = None
-
         try:
             lhs, rhs = self._process_values_for_isin(values)
             res = lhs._isin_earlystop(rhs)
@@ -1168,31 +1123,11 @@ def argsort(
         return sorted_indices
 
     @property
-    def __cuda_array_interface__(self) -> Mapping[builtins.str, Any]:
-        output = {
-            "shape": (len(self),),
-            "strides": (self.dtype.itemsize,),
-            "typestr": self.dtype.str,
-            "data": (self.data_ptr, False),
-            "version": 1,
-        }
-
-        if self.nullable and self.has_nulls:
-
-            # Create a simple Python object that exposes the
-            # `__cuda_array_interface__` attribute here since we need to modify
-            # some of the attributes from the numba device array
-            mask = SimpleNamespace(
-                __cuda_array_interface__={
-                    "shape": (len(self),),
-                    "typestr": "<t1",
-                    "data": (self.mask_ptr, True),
-                    "version": 1,
-                }
-            )
-            output["mask"] = mask
-
-        return output
+    def __cuda_array_interface__(self):
+        """Always raise ``NotImplementedError`` naming ``self.dtype``."""
+        raise NotImplementedError(
+            f"dtype {self.dtype} is not yet supported via "
+            "`__cuda_array_interface__`"
+        )
 
     def __add__(self, other):
         return self.binary_operator("add", other)
@@ -1289,10 +1224,18 @@ def deserialize(cls, header: dict, frames: list) -> ColumnBase:
             mask = Buffer.deserialize(header["mask"], [frames[1]])
         return build_column(data=data, dtype=dtype, mask=mask)
 
+    def unary_operator(self, unaryop: builtins.str):
+        """Reject ``unaryop``: always raises ``TypeError`` naming the dtype."""
+        message = f"Operation {unaryop} not supported for dtype {self.dtype}."
+        raise TypeError(message)
+
     def binary_operator(
         self, op: builtins.str, other: BinaryOperand, reflect: bool = False
     ) -> ColumnBase:
-        raise NotImplementedError
+        """Default binary operation: always raises ``TypeError``.
+
+        Parameters
+        ----------
+        op : str
+            Name of the operation, used in the error message.
+        other : BinaryOperand
+            The other operand; its ``dtype`` is reported when it has one,
+            otherwise its type name.
+        reflect : bool, default False
+            Not used by this default implementation.
+
+        Raises
+        ------
+        TypeError
+            Always.
+        """
+        # ``other`` may be an operand with no ``dtype`` attribute (the
+        # arithmetic dunders forward it unchanged); fall back to its type
+        # name so the intended TypeError is raised rather than an
+        # AttributeError from building the message.
+        other_dtype = getattr(other, "dtype", type(other).__name__)
+        raise TypeError(
+            f"Operation {op} not supported between dtypes {self.dtype} and "
+            f"{other_dtype}."
+        )
 
     def min(self, skipna: bool = None, dtype: Dtype = None):
         result_col = self._process_for_reduction(skipna=skipna)

@@ -139,15 +139,12 @@ def to_pandas(
         # https://issues.apache.org/jira/browse/ARROW-9772
 
         # Pandas supports only `datetime64[ns]`, hence the cast.
-        pd_series = pd.Series(
-            self.astype("datetime64[ns]").to_array("NAT"), copy=False
+        return pd.Series(
+            self.astype("datetime64[ns]").to_array("NAT"),
+            copy=False,
+            index=index,
         )
 
-        if index is not None:
-            pd_series.index = index
-
-        return pd_series
-
     def get_dt_field(self, field: str) -> ColumnBase:
         return libcudf.datetime.extract_datetime_component(self, field)
 

@@ -1,6 +1,9 @@
 # Copyright (c) 2018-2021, NVIDIA CORPORATION.
+import pandas as pd
 import pyarrow as pa
+
 import cudf
+from cudf._typing import ColumnLike
 from cudf.core.column import StructColumn
 from cudf.core.dtypes import IntervalDtype
 from cudf.utils.dtypes import is_interval_dtype
@@ -110,3 +113,10 @@ def as_interval_column(self, dtype, **kwargs):
             )
         else:
             raise ValueError("dtype must be IntervalDtype")
+
+    def to_pandas(
+        self, index: ColumnLike = None, nullable: bool = False, **kwargs
+    ) -> "pd.Series":
+        """Build a pandas Series from this column's Arrow representation.
+
+        The Arrow array is converted with ``pd.IntervalDtype().__from_arrow__``
+        and ``index`` is handed to the ``pd.Series`` constructor; ``nullable``
+        and ``**kwargs`` are accepted but not used here.
+        """
+        intervals = pd.IntervalDtype().__from_arrow__(self.to_arrow())
+        return pd.Series(intervals, index=index)
@@ -2,9 +2,12 @@
 
 from __future__ import annotations
 
+import builtins
 from numbers import Number
-from typing import Any, Callable, Sequence, Tuple, Union, cast
+from types import SimpleNamespace
+from typing import Any, Callable, Mapping, Sequence, Tuple, Union, cast
 
+import cupy
 import numpy as np
 import pandas as pd
 from numba import cuda, njit
@@ -27,6 +30,8 @@
 from cudf.core.dtypes import Decimal64Dtype
 from cudf.utils import cudautils, utils
 from cudf.utils.dtypes import (
+    NUMERIC_TYPES,
+    cudf_dtypes_to_pandas_dtypes,
     min_column_type,
     min_signed_type,
     numeric_normalize_types,
@@ -86,6 +91,33 @@ def __contains__(self, item: ScalarLike) -> bool:
             self, column.as_column([item], dtype=self.dtype)
         ).any()
 
+    @property
+    def __cuda_array_interface__(self) -> Mapping[builtins.str, Any]:
+        """Describe this column's data buffer as a CUDA array interface dict.
+
+        When the column is nullable and has nulls, a ``"mask"`` entry is
+        added whose value exposes its own ``__cuda_array_interface__``
+        pointing at ``self.mask_ptr``.
+        """
+        n_rows = len(self)
+        interface = {
+            "shape": (n_rows,),
+            "strides": (self.dtype.itemsize,),
+            "typestr": self.dtype.str,
+            "data": (self.data_ptr, False),
+            "version": 1,
+        }
+        if not self.nullable or not self.has_nulls:
+            return interface
+
+        # Wrap the mask description in a simple Python object that exposes
+        # the `__cuda_array_interface__` attribute, since we need to modify
+        # some of the attributes from the numba device array.
+        mask_interface = {
+            "shape": (n_rows,),
+            "typestr": "<t1",
+            "data": (self.mask_ptr, True),
+            "version": 1,
+        }
+        interface["mask"] = SimpleNamespace(
+            __cuda_array_interface__=mask_interface
+        )
+        return interface
+
     def unary_operator(self, unaryop: str) -> ColumnBase:
         return _numeric_column_unaryop(self, op=unaryop)
 
@@ -407,7 +439,7 @@ def round(self, decimals: int = 0) -> NumericalColumn:
     def applymap(
         self, udf: Callable[[ScalarLike], ScalarLike], out_dtype: Dtype = None
     ) -> ColumnBase:
-        """Apply an element-wise function to transform the values in the Column.
+        """Apply an elementwise function to transform the values in the Column.
 
         Parameters
         ----------
@@ -711,6 +743,23 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool:
 
         return False
 
+    def to_pandas(
+        self, index: ColumnLike = None, nullable: bool = False, **kwargs
+    ) -> "pd.Series":
+        """Convert this column to a ``pd.Series``.
+
+        Three paths are tried in order: a pandas nullable array built with
+        ``__from_arrow__`` when ``nullable`` is set and the dtype has an
+        entry in ``cudf_dtypes_to_pandas_dtypes``; ``cupy.asnumpy`` of
+        ``self.values`` for NUMERIC_TYPES dtypes with no nulls; otherwise
+        PyArrow's ``to_pandas``, which receives ``**kwargs``.
+        ``index``, if not None, replaces the index of the result.
+        """
+        if nullable and self.dtype in cudf_dtypes_to_pandas_dtypes:
+            nullable_dtype = cudf_dtypes_to_pandas_dtypes[self.dtype]
+            result = pd.Series(
+                nullable_dtype.__from_arrow__(self.to_arrow()), copy=False
+            )
+        elif str(self.dtype) in NUMERIC_TYPES and self.null_count == 0:
+            host_values = cupy.asnumpy(self.values)
+            result = pd.Series(host_values, copy=False)
+        else:
+            result = self.to_arrow().to_pandas(**kwargs)
+
+        if index is not None:
+            result.index = index
+        return result
+
 
 @annotate("BINARY_OP", color="orange", domain="cudf_python")
 def _numeric_column_binop(