diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index bb1bf3c5d5c..3cd1a599ddc 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -17,6 +17,7 @@ import numpy as np import pandas as pd +import pyarrow as pa from numba import cuda import cudf @@ -1075,10 +1076,7 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]: " if you need this functionality." ) - def to_pandas( - self, index: ColumnLike = None, nullable: bool = False, **kwargs - ) -> pd.Series: - + def to_pandas(self, index: pd.Index = None, **kwargs) -> pd.Series: if self.categories.dtype.kind == "f": new_mask = bools_to_mask(self.notnull()) col = column.build_categorical_column( @@ -1099,6 +1097,24 @@ def to_pandas( ) return pd.Series(data, index=index) + def to_arrow(self) -> pa.Array: + """Convert to PyArrow Array.""" + # arrow doesn't support unsigned codes + signed_type = ( + min_signed_type(self.codes.max()) + if self.codes.size > 0 + else np.int8 + ) + codes = self.codes.astype(signed_type) + categories = self.categories + + out_indices = codes.to_arrow() + out_dictionary = categories.to_arrow() + + return pa.DictionaryArray.from_arrays( + out_indices, out_dictionary, ordered=self.ordered, + ) + @property def values_host(self) -> np.ndarray: """ diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 2d438d37b3e..bd67376642f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -12,7 +12,6 @@ Callable, Dict, List, - Mapping, Optional, Sequence, Tuple, @@ -44,9 +43,7 @@ from cudf.core.dtypes import CategoricalDtype, IntervalDtype from cudf.utils import ioutils, utils from cudf.utils.dtypes import ( - NUMERIC_TYPES, check_cast_unsupported_dtype, - cudf_dtypes_to_pandas_dtypes, get_time_unit, is_categorical_dtype, is_decimal_dtype, @@ -56,7 +53,6 @@ is_scalar, is_string_dtype, is_struct_dtype, - min_signed_type, min_unsigned_type, np_to_pa_dtype, ) @@ -116,22 +112,16 @@ def __repr__(self): f"dtype: {self.dtype}" ) - def to_pandas( - self, index: ColumnLike = None, nullable: bool = False, **kwargs - ) -> "pd.Series": - if nullable and self.dtype in cudf_dtypes_to_pandas_dtypes: - pandas_nullable_dtype = cudf_dtypes_to_pandas_dtypes[self.dtype] - arrow_array = self.to_arrow() - pandas_array = pandas_nullable_dtype.__from_arrow__(arrow_array) - pd_series = pd.Series(pandas_array, copy=False) - elif str(self.dtype) in NUMERIC_TYPES and self.null_count == 0: - pd_series = pd.Series(cupy.asnumpy(self.values), copy=False) - elif is_interval_dtype(self.dtype): - pd_series = pd.Series( - pd.IntervalDtype().__from_arrow__(self.to_arrow()) - ) - else: - pd_series = self.to_arrow().to_pandas(**kwargs) + def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series": + """Convert object to pandas type. + + The default implementation falls back to PyArrow for the conversion. + """ + # This default implementation does not handle nulls in any meaningful + # way, but must consume the parameter to avoid passing it to PyArrow + # (which does not recognize it). + kwargs.pop("nullable", None) + pd_series = self.to_arrow().to_pandas(**kwargs) if index is not None: pd_series.index = index @@ -333,31 +323,7 @@ def to_arrow(self) -> pa.Array: 4 ] """ - if isinstance(self, cudf.core.column.CategoricalColumn): - # arrow doesn't support unsigned codes - signed_type = ( - min_signed_type(self.codes.max()) - if self.codes.size > 0 - else np.int8 - ) - codes = self.codes.astype(signed_type) - categories = self.categories - - out_indices = codes.to_arrow() - out_dictionary = categories.to_arrow() - - return pa.DictionaryArray.from_arrays( - out_indices, out_dictionary, ordered=self.ordered, - ) - - if isinstance(self, cudf.core.column.StringColumn) and ( - self.null_count == len(self) - ): - return pa.NullArray.from_buffers( - pa.null(), len(self), [pa.py_buffer((b""))] - ) - - result = libcudf.interop.to_arrow( + return libcudf.interop.to_arrow( libcudf.table.Table( cudf.core.column_accessor.ColumnAccessor({"None": self}) ), @@ -365,14 +331,6 @@ def to_arrow(self) -> pa.Array: keep_index=False, )["None"].chunk(0) - if isinstance(self.dtype, cudf.Decimal64Dtype): - result = result.view( - pa.decimal128( - scale=result.type.scale, precision=self.dtype.precision - ) - ) - return result - @classmethod def from_arrow(cls, array: pa.Array) -> ColumnBase: """ @@ -838,7 +796,7 @@ def find_last_value(self, value: ScalarLike, closest: bool = False) -> int: return indices[-1] def append(self, other: ColumnBase) -> ColumnBase: - return ColumnBase._concat([self, as_column(other)]) + return self.__class__._concat([self, as_column(other)]) def quantile( self, @@ -890,9 +848,6 @@ def isin(self, values: Sequence) -> ColumnBase: result: Column Column of booleans indicating if each element is in values. """ - lhs = self - rhs = None - try: lhs, rhs = self._process_values_for_isin(values) res = lhs._isin_earlystop(rhs) @@ -1167,32 +1122,26 @@ def argsort( ) return sorted_indices - @property - def __cuda_array_interface__(self) -> Mapping[builtins.str, Any]: - output = { - "shape": (len(self),), - "strides": (self.dtype.itemsize,), - "typestr": self.dtype.str, - "data": (self.data_ptr, False), - "version": 1, - } - - if self.nullable and self.has_nulls: - - # Create a simple Python object that exposes the - # `__cuda_array_interface__` attribute here since we need to modify - # some of the attributes from the numba device array - mask = SimpleNamespace( - __cuda_array_interface__={ - "shape": (len(self),), - "typestr": " ColumnBase: mask = Buffer.deserialize(header["mask"], [frames[1]]) return build_column(data=data, dtype=dtype, mask=mask) + def unary_operator(self, unaryop: builtins.str): + raise TypeError( + f"Operation {unaryop} not supported for dtype {self.dtype}." + ) + def binary_operator( self, op: builtins.str, other: BinaryOperand, reflect: bool = False ) -> ColumnBase: - raise NotImplementedError + raise TypeError( + f"Operation {op} not supported between dtypes {self.dtype} and " + f"{other.dtype}." + ) def min(self, skipna: bool = None, dtype: Dtype = None): result_col = self._process_for_reduction(skipna=skipna) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 66141fec610..14c82b5ff45 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -2,10 +2,12 @@ from __future__ import annotations +import builtins import datetime as dt import re from numbers import Number -from typing import Any, Sequence, Union, cast +from types import SimpleNamespace +from typing import Any, Mapping, Sequence, Union, cast import numpy as np import pandas as pd @@ -133,21 +135,18 @@ def weekday(self) -> ColumnBase: return self.get_dt_field("weekday") def to_pandas( - self, index: "cudf.Index" = None, nullable: bool = False, **kwargs + self, index: pd.Index = None, nullable: bool = False, **kwargs ) -> "cudf.Series": # Workaround until following issue is fixed: # https://issues.apache.org/jira/browse/ARROW-9772 # Pandas supports only `datetime64[ns]`, hence the cast. - pd_series = pd.Series( - self.astype("datetime64[ns]").to_array("NAT"), copy=False + return pd.Series( + self.astype("datetime64[ns]").to_array("NAT"), + copy=False, + index=index, ) - if index is not None: - pd_series.index = index - - return pd_series - def get_dt_field(self, field: str) -> ColumnBase: return libcudf.datetime.extract_datetime_component(self, field) @@ -202,6 +201,33 @@ def as_numerical(self) -> "cudf.core.column.NumericalColumn": ), ) + @property + def __cuda_array_interface__(self) -> Mapping[builtins.str, Any]: + output = { + "shape": (len(self),), + "strides": (self.dtype.itemsize,), + "typestr": self.dtype.str, + "data": (self.data_ptr, False), + "version": 1, + } + + if self.nullable and self.has_nulls: + + # Create a simple Python object that exposes the + # `__cuda_array_interface__` attribute here since we need to modify + # some of the attributes from the numba device array + mask = SimpleNamespace( + __cuda_array_interface__={ + "shape": (len(self),), + "typestr": " DatetimeColumn: dtype = np.dtype(dtype) if dtype == self.dtype: diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index d8bea6b1658..24541c57044 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -1,5 +1,7 @@ # Copyright (c) 2018-2021, NVIDIA CORPORATION. +import pandas as pd import pyarrow as pa + import cudf from cudf.core.column import StructColumn from cudf.core.dtypes import IntervalDtype @@ -110,3 +112,13 @@ def as_interval_column(self, dtype, **kwargs): ) else: raise ValueError("dtype must be IntervalDtype") + + def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series": + # Note: This does not handle null values in the interval column. + # However, this exact sequence (calling __from_arrow__ on the output of + # self.to_arrow) is currently the best known way to convert interval + # types into pandas (trying to convert the underlying numerical columns + # directly is problematic), so we're stuck with this for now. + return pd.Series( + pd.IntervalDtype().__from_arrow__(self.to_arrow()), index=index + ) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 70b4569b180..d710129900a 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -2,9 +2,12 @@ from __future__ import annotations +import builtins from numbers import Number -from typing import Any, Callable, Sequence, Tuple, Union, cast +from types import SimpleNamespace +from typing import Any, Callable, Mapping, Sequence, Tuple, Union, cast +import cupy import numpy as np import pandas as pd from numba import cuda, njit @@ -27,6 +30,8 @@ from cudf.core.dtypes import Decimal64Dtype from cudf.utils import cudautils, utils from cudf.utils.dtypes import ( + NUMERIC_TYPES, + cudf_dtypes_to_pandas_dtypes, min_column_type, min_signed_type, numeric_normalize_types, @@ -86,6 +91,33 @@ def __contains__(self, item: ScalarLike) -> bool: self, column.as_column([item], dtype=self.dtype) ).any() + @property + def __cuda_array_interface__(self) -> Mapping[builtins.str, Any]: + output = { + "shape": (len(self),), + "strides": (self.dtype.itemsize,), + "typestr": self.dtype.str, + "data": (self.data_ptr, False), + "version": 1, + } + + if self.nullable and self.has_nulls: + + # Create a simple Python object that exposes the + # `__cuda_array_interface__` attribute here since we need to modify + # some of the attributes from the numba device array + mask = SimpleNamespace( + __cuda_array_interface__={ + "shape": (len(self),), + "typestr": " ColumnBase: return _numeric_column_unaryop(self, op=unaryop) @@ -407,7 +439,7 @@ def round(self, decimals: int = 0) -> NumericalColumn: def applymap( self, udf: Callable[[ScalarLike], ScalarLike], out_dtype: Dtype = None ) -> ColumnBase: - """Apply an element-wise function to transform the values in the Column. + """Apply an elementwise function to transform the values in the Column. Parameters ---------- @@ -711,6 +743,23 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: return False + def to_pandas( + self, index: pd.Index = None, nullable: bool = False, **kwargs + ) -> "pd.Series": + if nullable and self.dtype in cudf_dtypes_to_pandas_dtypes: + pandas_nullable_dtype = cudf_dtypes_to_pandas_dtypes[self.dtype] + arrow_array = self.to_arrow() + pandas_array = pandas_nullable_dtype.__from_arrow__(arrow_array) + pd_series = pd.Series(pandas_array, copy=False) + elif str(self.dtype) in NUMERIC_TYPES and not self.has_nulls: + pd_series = pd.Series(cupy.asnumpy(self.values), copy=False) + else: + pd_series = self.to_arrow().to_pandas(**kwargs) + + if index is not None: + pd_series.index = index + return pd_series + @annotate("BINARY_OP", color="orange", domain="cudf_python") def _numeric_column_binop( diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index de2df9b50d7..ea919866e34 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -10,6 +10,7 @@ import cupy import numpy as np import pandas as pd +import pyarrow as pa from numba import cuda from nvtx import annotate @@ -4766,6 +4767,29 @@ def base_size(self) -> int: def data_array_view(self) -> cuda.devicearray.DeviceNDArray: raise ValueError("Cannot get an array view of a StringColumn") + def to_arrow(self) -> pa.Array: + """Convert to PyArrow Array + + Examples + -------- + >>> import cudf + >>> col = cudf.core.column.as_column([1, 2, 3, 4]) + >>> col.to_arrow() + + [ + 1, + 2, + 3, + 4 + ] + """ + if self.null_count == len(self): + return pa.NullArray.from_buffers( + pa.null(), len(self), [pa.py_buffer((b""))] + ) + else: + return super().to_arrow() + def sum( self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 ): @@ -4799,15 +4823,6 @@ def __contains__(self, item: ScalarLike) -> bool: def str(self, parent: ParentType = None) -> StringMethods: return StringMethods(self, parent=parent) - def unary_operator(self, unaryop: builtins.str): - raise TypeError( - f"Series of dtype `str` cannot perform the operation: " - f"{unaryop}" - ) - - def __len__(self) -> int: - return self.size - @property def _nbytes(self) -> int: if self.size == 0: @@ -4930,20 +4945,18 @@ def to_array(self, fillna: bool = None) -> np.ndarray: return self.to_arrow().to_pandas().values - def __array__(self, dtype=None): - raise TypeError( - "Implicit conversion to a host NumPy array via __array__ is not " - "allowed, Conversion to GPU array in strings is not yet " - "supported.\nTo explicitly construct a host array, " - "consider using .to_array()" - ) + def to_pandas( + self, index: ColumnLike = None, nullable: bool = False, **kwargs + ) -> "pd.Series": + if nullable: + pandas_array = pd.StringDtype().__from_arrow__(self.to_arrow()) + pd_series = pd.Series(pandas_array, copy=False) + else: + pd_series = self.to_arrow().to_pandas(**kwargs) - def __arrow_array__(self, type=None): - raise TypeError( - "Implicit conversion to a host PyArrow Array via __arrow_array__ " - "is not allowed, To explicitly construct a PyArrow Array, " - "consider using .to_arrow()" - ) + if index is not None: + pd_series.index = index + return pd_series def serialize(self) -> Tuple[dict, list]: header = {"null_count": self.null_count} # type: Dict[Any, Any] @@ -5114,16 +5127,6 @@ def binary_operator( f"{op} operator not supported between {type(self)} and {type(rhs)}" ) - @property - def is_unique(self) -> bool: - return len(self.unique()) == len(self) - - @property - def __cuda_array_interface__(self): - raise NotImplementedError( - "Strings are not yet supported via `__cuda_array_interface__`" - ) - @copy_docstring(column.ColumnBase.view) def view(self, dtype) -> "cudf.core.column.ColumnBase": if self.null_count > 0: