From 6d7d144482c22b9b11ecb8228e14f41a0e3383c6 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 26 Apr 2021 16:12:39 -0700 Subject: [PATCH 01/12] Proper ducktyping for to_pandas. --- python/cudf/cudf/core/column/column.py | 20 +++++--------------- python/cudf/cudf/core/column/interval.py | 10 ++++++++++ python/cudf/cudf/core/column/numerical.py | 20 ++++++++++++++++++++ 3 files changed, 35 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 2d438d37b3e..2b10d3ad630 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -44,9 +44,7 @@ from cudf.core.dtypes import CategoricalDtype, IntervalDtype from cudf.utils import ioutils, utils from cudf.utils.dtypes import ( - NUMERIC_TYPES, check_cast_unsupported_dtype, - cudf_dtypes_to_pandas_dtypes, get_time_unit, is_categorical_dtype, is_decimal_dtype, @@ -119,19 +117,11 @@ def __repr__(self): def to_pandas( self, index: ColumnLike = None, nullable: bool = False, **kwargs ) -> "pd.Series": - if nullable and self.dtype in cudf_dtypes_to_pandas_dtypes: - pandas_nullable_dtype = cudf_dtypes_to_pandas_dtypes[self.dtype] - arrow_array = self.to_arrow() - pandas_array = pandas_nullable_dtype.__from_arrow__(arrow_array) - pd_series = pd.Series(pandas_array, copy=False) - elif str(self.dtype) in NUMERIC_TYPES and self.null_count == 0: - pd_series = pd.Series(cupy.asnumpy(self.values), copy=False) - elif is_interval_dtype(self.dtype): - pd_series = pd.Series( - pd.IntervalDtype().__from_arrow__(self.to_arrow()) - ) - else: - pd_series = self.to_arrow().to_pandas(**kwargs) + """Convert object to pandas type. + + The default implementation falls back to PyArrow for the conversion. 
+ """ + pd_series = self.to_arrow().to_pandas(**kwargs) if index is not None: pd_series.index = index diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index d8bea6b1658..7436a69e14a 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -1,6 +1,9 @@ # Copyright (c) 2018-2021, NVIDIA CORPORATION. +import pandas as pd import pyarrow as pa + import cudf +from cudf._typing import ColumnLike from cudf.core.column import StructColumn from cudf.core.dtypes import IntervalDtype from cudf.utils.dtypes import is_interval_dtype @@ -110,3 +113,10 @@ def as_interval_column(self, dtype, **kwargs): ) else: raise ValueError("dtype must be IntervalDtype") + + def to_pandas( + self, index: ColumnLike = None, nullable: bool = False, **kwargs + ) -> "pd.Series": + return pd.Series( + pd.IntervalDtype().__from_arrow__(self.to_arrow()), index=index + ) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 70b4569b180..0ae302da9e1 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -5,6 +5,7 @@ from numbers import Number from typing import Any, Callable, Sequence, Tuple, Union, cast +import cupy import numpy as np import pandas as pd from numba import cuda, njit @@ -27,6 +28,8 @@ from cudf.core.dtypes import Decimal64Dtype from cudf.utils import cudautils, utils from cudf.utils.dtypes import ( + NUMERIC_TYPES, + cudf_dtypes_to_pandas_dtypes, min_column_type, min_signed_type, numeric_normalize_types, @@ -711,6 +714,23 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: return False + def to_pandas( + self, index: ColumnLike = None, nullable: bool = False, **kwargs + ) -> "pd.Series": + if nullable and self.dtype in cudf_dtypes_to_pandas_dtypes: + pandas_nullable_dtype = cudf_dtypes_to_pandas_dtypes[self.dtype] + arrow_array = self.to_arrow() + pandas_array = 
pandas_nullable_dtype.__from_arrow__(arrow_array) + pd_series = pd.Series(pandas_array, copy=False) + elif str(self.dtype) in NUMERIC_TYPES and self.null_count == 0: + pd_series = pd.Series(cupy.asnumpy(self.values), copy=False) + else: + pd_series = self.to_arrow().to_pandas(**kwargs) + + if index is not None: + pd_series.index = index + return pd_series + @annotate("BINARY_OP", color="orange", domain="cudf_python") def _numeric_column_binop( From 2dc087253f009411b2ae62a9707789d1cf5ade92 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 26 Apr 2021 16:23:28 -0700 Subject: [PATCH 02/12] Some minor improvements. --- python/cudf/cudf/core/column/column.py | 4 +++- python/cudf/cudf/core/column/numerical.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 2b10d3ad630..382688580e7 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -128,6 +128,8 @@ def to_pandas( return pd_series def __iter__(self): + # TODO: Why don't we just implement this method in terms of one of the + # proposed alternatives (to_arrow, to_pandas, or values_host)? 
cudf.utils.utils.raise_iteration_error(obj=self) @property @@ -828,7 +830,7 @@ def find_last_value(self, value: ScalarLike, closest: bool = False) -> int: return indices[-1] def append(self, other: ColumnBase) -> ColumnBase: - return ColumnBase._concat([self, as_column(other)]) + return self.__class__._concat([self, as_column(other)]) def quantile( self, diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 0ae302da9e1..0f717159558 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -410,7 +410,7 @@ def round(self, decimals: int = 0) -> NumericalColumn: def applymap( self, udf: Callable[[ScalarLike], ScalarLike], out_dtype: Dtype = None ) -> ColumnBase: - """Apply an element-wise function to transform the values in the Column. + """Apply an elementwise function to transform the values in the Column. Parameters ---------- From ab555650772f76649a338528406478ccd08f6aee Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 26 Apr 2021 17:08:36 -0700 Subject: [PATCH 03/12] Add to_pandas implementation for str. 
--- python/cudf/cudf/core/column/string.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index de2df9b50d7..f15a3fd105b 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4945,6 +4945,19 @@ def __arrow_array__(self, type=None): "consider using .to_arrow()" ) + def to_pandas( + self, index: ColumnLike = None, nullable: bool = False, **kwargs + ) -> "pd.Series": + if nullable: + pandas_array = pd.StringDtype().__from_arrow__(self.to_arrow()) + pd_series = pd.Series(pandas_array, copy=False) + else: + pd_series = self.to_arrow().to_pandas(**kwargs) + + if index is not None: + pd_series.index = index + return pd_series + def serialize(self) -> Tuple[dict, list]: header = {"null_count": self.null_count} # type: Dict[Any, Any] header["type-serialized"] = pickle.dumps(type(self)) From 4538ecb09100d0901513a9f23bbd0f5d705dac1c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 26 Apr 2021 17:16:36 -0700 Subject: [PATCH 04/12] Proper ducktyping for to_arrow. 
--- python/cudf/cudf/core/column/categorical.py | 19 +++++++++++++ python/cudf/cudf/core/column/column.py | 31 --------------------- python/cudf/cudf/core/column/string.py | 30 ++++++++++++++++++++ 3 files changed, 49 insertions(+), 31 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index bb1bf3c5d5c..d55bd245cb7 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -17,6 +17,7 @@ import numpy as np import pandas as pd +import pyarrow as pa from numba import cuda import cudf @@ -1099,6 +1100,24 @@ def to_pandas( ) return pd.Series(data, index=index) + def to_arrow(self) -> pa.Array: + """Convert to PyArrow Array.""" + # arrow doesn't support unsigned codes + signed_type = ( + min_signed_type(self.codes.max()) + if self.codes.size > 0 + else np.int8 + ) + codes = self.codes.astype(signed_type) + categories = self.categories + + out_indices = codes.to_arrow() + out_dictionary = categories.to_arrow() + + return pa.DictionaryArray.from_arrays( + out_indices, out_dictionary, ordered=self.ordered, + ) + @property def values_host(self) -> np.ndarray: """ diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 382688580e7..d14f621cf76 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -54,7 +54,6 @@ is_scalar, is_string_dtype, is_struct_dtype, - min_signed_type, min_unsigned_type, np_to_pa_dtype, ) @@ -325,30 +324,6 @@ def to_arrow(self) -> pa.Array: 4 ] """ - if isinstance(self, cudf.core.column.CategoricalColumn): - # arrow doesn't support unsigned codes - signed_type = ( - min_signed_type(self.codes.max()) - if self.codes.size > 0 - else np.int8 - ) - codes = self.codes.astype(signed_type) - categories = self.categories - - out_indices = codes.to_arrow() - out_dictionary = categories.to_arrow() - - return pa.DictionaryArray.from_arrays( - out_indices, 
out_dictionary, ordered=self.ordered, - ) - - if isinstance(self, cudf.core.column.StringColumn) and ( - self.null_count == len(self) - ): - return pa.NullArray.from_buffers( - pa.null(), len(self), [pa.py_buffer((b""))] - ) - result = libcudf.interop.to_arrow( libcudf.table.Table( cudf.core.column_accessor.ColumnAccessor({"None": self}) @@ -357,12 +332,6 @@ def to_arrow(self) -> pa.Array: keep_index=False, )["None"].chunk(0) - if isinstance(self.dtype, cudf.Decimal64Dtype): - result = result.view( - pa.decimal128( - scale=result.type.scale, precision=self.dtype.precision - ) - ) return result @classmethod diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index f15a3fd105b..af0f8df04bd 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -10,6 +10,7 @@ import cupy import numpy as np import pandas as pd +import pyarrow as pa from numba import cuda from nvtx import annotate @@ -4766,6 +4767,35 @@ def base_size(self) -> int: def data_array_view(self) -> cuda.devicearray.DeviceNDArray: raise ValueError("Cannot get an array view of a StringColumn") + def to_arrow(self) -> pa.Array: + """Convert to PyArrow Array + + Examples + -------- + >>> import cudf + >>> col = cudf.core.column.as_column([1, 2, 3, 4]) + >>> col.to_arrow() + + [ + 1, + 2, + 3, + 4 + ] + """ + if self.null_count == len(self): + return pa.NullArray.from_buffers( + pa.null(), len(self), [pa.py_buffer((b""))] + ) + else: + return libcudf.interop.to_arrow( + libcudf.table.Table( + cudf.core.column_accessor.ColumnAccessor({"None": self}) + ), + [["None"]], + keep_index=False, + )["None"].chunk(0) + def sum( self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 ): From e846a331c832c9d0de621c0ba995034dd7b22fe6 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 26 Apr 2021 17:34:46 -0700 Subject: [PATCH 05/12] Minor fix. 
--- python/cudf/cudf/core/column/column.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index d14f621cf76..40351b6efd9 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -851,9 +851,6 @@ def isin(self, values: Sequence) -> ColumnBase: result: Column Column of booleans indicating if each element is in values. """ - lhs = self - rhs = None - try: lhs, rhs = self._process_values_for_isin(values) res = lhs._isin_earlystop(rhs) From 71ca4f41421889b96be0b80bc59530fae442142a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 26 Apr 2021 20:24:49 -0700 Subject: [PATCH 06/12] Remove redundant functions from string column, moving the ones that should be generic into ColumnBase. --- python/cudf/cudf/core/column/column.py | 41 ++++++++--------------- python/cudf/cudf/core/column/numerical.py | 31 ++++++++++++++++- python/cudf/cudf/core/column/string.py | 30 ----------------- 3 files changed, 44 insertions(+), 58 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 40351b6efd9..901617d6d0e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -12,7 +12,6 @@ Callable, Dict, List, - Mapping, Optional, Sequence, Tuple, @@ -1126,31 +1125,11 @@ def argsort( return sorted_indices @property - def __cuda_array_interface__(self) -> Mapping[builtins.str, Any]: - output = { - "shape": (len(self),), - "strides": (self.dtype.itemsize,), - "typestr": self.dtype.str, - "data": (self.data_ptr, False), - "version": 1, - } - - if self.nullable and self.has_nulls: - - # Create a simple Python object that exposes the - # `__cuda_array_interface__` attribute here since we need to modify - # some of the attributes from the numba device array - mask = SimpleNamespace( - __cuda_array_interface__={ - "shape": (len(self),), - "typestr": " ColumnBase: mask = 
Buffer.deserialize(header["mask"], [frames[1]]) return build_column(data=data, dtype=dtype, mask=mask) + def unary_operator(self, unaryop: builtins.str): + raise TypeError( + f"Operation {unaryop} not supported for dtype {self.dtype}." + ) + def binary_operator( self, op: builtins.str, other: BinaryOperand, reflect: bool = False ) -> ColumnBase: - raise NotImplementedError + raise TypeError( + f"Operation {op} not supported between dtypes {self.dtype} and " + f"{other.dtype}." + ) def min(self, skipna: bool = None, dtype: Dtype = None): result_col = self._process_for_reduction(skipna=skipna) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 0f717159558..0c815db0b49 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -2,8 +2,10 @@ from __future__ import annotations +import builtins from numbers import Number -from typing import Any, Callable, Sequence, Tuple, Union, cast +from types import SimpleNamespace +from typing import Any, Callable, Mapping, Sequence, Tuple, Union, cast import cupy import numpy as np @@ -89,6 +91,33 @@ def __contains__(self, item: ScalarLike) -> bool: self, column.as_column([item], dtype=self.dtype) ).any() + @property + def __cuda_array_interface__(self) -> Mapping[builtins.str, Any]: + output = { + "shape": (len(self),), + "strides": (self.dtype.itemsize,), + "typestr": self.dtype.str, + "data": (self.data_ptr, False), + "version": 1, + } + + if self.nullable and self.has_nulls: + + # Create a simple Python object that exposes the + # `__cuda_array_interface__` attribute here since we need to modify + # some of the attributes from the numba device array + mask = SimpleNamespace( + __cuda_array_interface__={ + "shape": (len(self),), + "typestr": " ColumnBase: return _numeric_column_unaryop(self, op=unaryop) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index af0f8df04bd..60b159c8350 
100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4829,15 +4829,6 @@ def __contains__(self, item: ScalarLike) -> bool: def str(self, parent: ParentType = None) -> StringMethods: return StringMethods(self, parent=parent) - def unary_operator(self, unaryop: builtins.str): - raise TypeError( - f"Series of dtype `str` cannot perform the operation: " - f"{unaryop}" - ) - - def __len__(self) -> int: - return self.size - @property def _nbytes(self) -> int: if self.size == 0: @@ -4960,21 +4951,6 @@ def to_array(self, fillna: bool = None) -> np.ndarray: return self.to_arrow().to_pandas().values - def __array__(self, dtype=None): - raise TypeError( - "Implicit conversion to a host NumPy array via __array__ is not " - "allowed, Conversion to GPU array in strings is not yet " - "supported.\nTo explicitly construct a host array, " - "consider using .to_array()" - ) - - def __arrow_array__(self, type=None): - raise TypeError( - "Implicit conversion to a host PyArrow Array via __arrow_array__ " - "is not allowed, To explicitly construct a PyArrow Array, " - "consider using .to_arrow()" - ) - def to_pandas( self, index: ColumnLike = None, nullable: bool = False, **kwargs ) -> "pd.Series": @@ -5161,12 +5137,6 @@ def binary_operator( def is_unique(self) -> bool: return len(self.unique()) == len(self) - @property - def __cuda_array_interface__(self): - raise NotImplementedError( - "Strings are not yet supported via `__cuda_array_interface__`" - ) - @copy_docstring(column.ColumnBase.view) def view(self, dtype) -> "cudf.core.column.ColumnBase": if self.null_count > 0: From 0329ad54cbbabf83cc319f0f0bef79d6b1a5a7ce Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 27 Apr 2021 09:09:17 -0700 Subject: [PATCH 07/12] Remove redundant method. 
--- python/cudf/cudf/core/column/string.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 60b159c8350..78ff8c65272 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5133,10 +5133,6 @@ def binary_operator( f"{op} operator not supported between {type(self)} and {type(rhs)}" ) - @property - def is_unique(self) -> bool: - return len(self.unique()) == len(self) - @copy_docstring(column.ColumnBase.view) def view(self, dtype) -> "cudf.core.column.ColumnBase": if self.null_count > 0: From 51e6bea5ab6963f3059623c0b45bb0992834857f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 27 Apr 2021 09:35:05 -0700 Subject: [PATCH 08/12] Fixed a few more redundancies. --- python/cudf/cudf/core/column/column.py | 4 +--- python/cudf/cudf/core/column/datetime.py | 11 ++++------- python/cudf/cudf/core/column/string.py | 8 +------- 3 files changed, 6 insertions(+), 17 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 901617d6d0e..5226893d524 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -323,7 +323,7 @@ def to_arrow(self) -> pa.Array: 4 ] """ - result = libcudf.interop.to_arrow( + return libcudf.interop.to_arrow( libcudf.table.Table( cudf.core.column_accessor.ColumnAccessor({"None": self}) ), @@ -331,8 +331,6 @@ def to_arrow(self) -> pa.Array: keep_index=False, )["None"].chunk(0) - return result - @classmethod def from_arrow(cls, array: pa.Array) -> ColumnBase: """ diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 66141fec610..b9d42c031c7 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -139,15 +139,12 @@ def to_pandas( # https://issues.apache.org/jira/browse/ARROW-9772 # Pandas supports only `datetime64[ns]`, hence 
the cast. - pd_series = pd.Series( - self.astype("datetime64[ns]").to_array("NAT"), copy=False + return pd.Series( + self.astype("datetime64[ns]").to_array("NAT"), + copy=False, + index=index, ) - if index is not None: - pd_series.index = index - - return pd_series - def get_dt_field(self, field: str) -> ColumnBase: return libcudf.datetime.extract_datetime_component(self, field) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 78ff8c65272..ea919866e34 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4788,13 +4788,7 @@ def to_arrow(self) -> pa.Array: pa.null(), len(self), [pa.py_buffer((b""))] ) else: - return libcudf.interop.to_arrow( - libcudf.table.Table( - cudf.core.column_accessor.ColumnAccessor({"None": self}) - ), - [["None"]], - keep_index=False, - )["None"].chunk(0) + return super().to_arrow() def sum( self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 From 0c2c7a1dc4a556722a3bd28c9188f4fdc84d229f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 27 Apr 2021 12:46:12 -0700 Subject: [PATCH 09/12] Add missing __cuda_array_interface__ for datetime objects. 
--- python/cudf/cudf/core/column/datetime.py | 31 +++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index b9d42c031c7..d86a54e6970 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -2,10 +2,12 @@ from __future__ import annotations +import builtins import datetime as dt import re from numbers import Number -from typing import Any, Sequence, Union, cast +from types import SimpleNamespace +from typing import Any, Mapping, Sequence, Union, cast import numpy as np import pandas as pd @@ -199,6 +201,33 @@ def as_numerical(self) -> "cudf.core.column.NumericalColumn": ), ) + @property + def __cuda_array_interface__(self) -> Mapping[builtins.str, Any]: + output = { + "shape": (len(self),), + "strides": (self.dtype.itemsize,), + "typestr": self.dtype.str, + "data": (self.data_ptr, False), + "version": 1, + } + + if self.nullable and self.has_nulls: + + # Create a simple Python object that exposes the + # `__cuda_array_interface__` attribute here since we need to modify + # some of the attributes from the numba device array + mask = SimpleNamespace( + __cuda_array_interface__={ + "shape": (len(self),), + "typestr": " DatetimeColumn: dtype = np.dtype(dtype) if dtype == self.dtype: From f575e88356aaaae49a3256c414e8df72175166f2 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 27 Apr 2021 16:03:10 -0700 Subject: [PATCH 10/12] Address PR comments. 
--- python/cudf/cudf/core/column/categorical.py | 5 +---- python/cudf/cudf/core/column/column.py | 10 +++++----- python/cudf/cudf/core/column/datetime.py | 2 +- python/cudf/cudf/core/column/interval.py | 10 ++++++---- python/cudf/cudf/core/column/numerical.py | 4 ++-- 5 files changed, 15 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index d55bd245cb7..3cd1a599ddc 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1076,10 +1076,7 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]: " if you need this functionality." ) - def to_pandas( - self, index: ColumnLike = None, nullable: bool = False, **kwargs - ) -> pd.Series: - + def to_pandas(self, index: pd.Index = None, **kwargs) -> pd.Series: if self.categories.dtype.kind == "f": new_mask = bools_to_mask(self.notnull()) col = column.build_categorical_column( diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 5226893d524..65fcc6791d8 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -112,13 +112,15 @@ def __repr__(self): f"dtype: {self.dtype}" ) - def to_pandas( - self, index: ColumnLike = None, nullable: bool = False, **kwargs - ) -> "pd.Series": + def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series": """Convert object to pandas type. The default implementation falls back to PyArrow for the conversion. """ + # This default implementation does not handle nulls in any meaningful + # way, but must consume the parameter to avoid passing it to PyArrow + # (which does not recognize it). 
+ kwargs.pop("nullable", None) pd_series = self.to_arrow().to_pandas(**kwargs) if index is not None: @@ -126,8 +128,6 @@ def to_pandas( return pd_series def __iter__(self): - # TODO: Why don't we just implement this method in terms of one of the - # proposed alternatives (to_arrow, to_pandas, or values_host)? cudf.utils.utils.raise_iteration_error(obj=self) @property diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index d86a54e6970..14c82b5ff45 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -135,7 +135,7 @@ def weekday(self) -> ColumnBase: return self.get_dt_field("weekday") def to_pandas( - self, index: "cudf.Index" = None, nullable: bool = False, **kwargs + self, index: pd.Index = None, nullable: bool = False, **kwargs ) -> "cudf.Series": # Workaround until following issue is fixed: # https://issues.apache.org/jira/browse/ARROW-9772 diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index 7436a69e14a..24541c57044 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -3,7 +3,6 @@ import pyarrow as pa import cudf -from cudf._typing import ColumnLike from cudf.core.column import StructColumn from cudf.core.dtypes import IntervalDtype from cudf.utils.dtypes import is_interval_dtype @@ -114,9 +113,12 @@ def as_interval_column(self, dtype, **kwargs): else: raise ValueError("dtype must be IntervalDtype") - def to_pandas( - self, index: ColumnLike = None, nullable: bool = False, **kwargs - ) -> "pd.Series": + def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series": + # Note: This does not handle null values in the interval column. 
+ # However, this exact sequence (calling __from_arrow__ on the output of + # self.to_arrow) is currently the best known way to convert interval + # types into pandas (trying to convert the underlying numerical columns + # directly is problematic), so we're stuck with this for now. return pd.Series( pd.IntervalDtype().__from_arrow__(self.to_arrow()), index=index ) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 0c815db0b49..d710129900a 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -744,14 +744,14 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: return False def to_pandas( - self, index: ColumnLike = None, nullable: bool = False, **kwargs + self, index: pd.Index = None, nullable: bool = False, **kwargs ) -> "pd.Series": if nullable and self.dtype in cudf_dtypes_to_pandas_dtypes: pandas_nullable_dtype = cudf_dtypes_to_pandas_dtypes[self.dtype] arrow_array = self.to_arrow() pandas_array = pandas_nullable_dtype.__from_arrow__(arrow_array) pd_series = pd.Series(pandas_array, copy=False) - elif str(self.dtype) in NUMERIC_TYPES and self.null_count == 0: + elif str(self.dtype) in NUMERIC_TYPES and not self.has_nulls: pd_series = pd.Series(cupy.asnumpy(self.values), copy=False) else: pd_series = self.to_arrow().to_pandas(**kwargs) From 6583e59bd69783d3e97f071b50fcc494511fafb3 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 27 Apr 2021 17:00:12 -0700 Subject: [PATCH 11/12] Explicitly prohibit conversion of columns to host arrays. 
--- python/cudf/cudf/core/column/column.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 65fcc6791d8..a3a6813908a 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1122,6 +1122,13 @@ def argsort( ) return sorted_indices + def __array__(self, dtype=None): + raise TypeError( + "Implicit conversion to a host NumPy array via __array__ is not " + "allowed. To explicitly construct a host array, consider using " + ".to_array()" + ) + @property def __cuda_array_interface__(self): raise NotImplementedError( From 381d15f8a951ab9fbbdb68bd37416cb91de76da4 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 28 Apr 2021 09:20:30 -0700 Subject: [PATCH 12/12] Explicitly prohibit conversion of columns to arrow (host) arrays. --- python/cudf/cudf/core/column/column.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index a3a6813908a..bd67376642f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1122,6 +1122,13 @@ def argsort( ) return sorted_indices + def __arrow_array__(self, type=None): + raise TypeError( + "Implicit conversion to a host PyArrow Array via __arrow_array__ " + "is not allowed, To explicitly construct a PyArrow Array, " + "consider using .to_arrow()" + ) + def __array__(self, dtype=None): raise TypeError( "Implicit conversion to a host NumPy array via __array__ is not "