Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactoring column logic Part 1 #8081

Merged
merged 12 commits into from
Apr 28, 2021
19 changes: 19 additions & 0 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

import numpy as np
import pandas as pd
import pyarrow as pa
from numba import cuda

import cudf
Expand Down Expand Up @@ -1099,6 +1100,24 @@ def to_pandas(
)
return pd.Series(data, index=index)

def to_arrow(self) -> pa.Array:
    """Convert this categorical column to a PyArrow ``DictionaryArray``.

    The column's codes are passed as the indices and its categories as the
    dictionary; the column's ``ordered`` flag is forwarded unchanged.
    """
    # Arrow doesn't support unsigned codes, so pick a signed type for them.
    # An empty codes column has no maximum to size the type from; use int8.
    if self.codes.size > 0:
        index_dtype = min_signed_type(self.codes.max())
    else:
        index_dtype = np.int8

    signed_codes = self.codes.astype(index_dtype)
    categories = self.categories

    return pa.DictionaryArray.from_arrays(
        signed_codes.to_arrow(), categories.to_arrow(), ordered=self.ordered,
    )

@property
def values_host(self) -> np.ndarray:
"""
Expand Down
103 changes: 23 additions & 80 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
Callable,
Dict,
List,
Mapping,
Optional,
Sequence,
Tuple,
Expand Down Expand Up @@ -44,9 +43,7 @@
from cudf.core.dtypes import CategoricalDtype, IntervalDtype
from cudf.utils import ioutils, utils
from cudf.utils.dtypes import (
NUMERIC_TYPES,
check_cast_unsupported_dtype,
cudf_dtypes_to_pandas_dtypes,
get_time_unit,
is_categorical_dtype,
is_decimal_dtype,
Expand All @@ -56,7 +53,6 @@
is_scalar,
is_string_dtype,
is_struct_dtype,
min_signed_type,
min_unsigned_type,
np_to_pa_dtype,
)
Expand Down Expand Up @@ -119,25 +115,19 @@ def __repr__(self):
def to_pandas(
self, index: ColumnLike = None, nullable: bool = False, **kwargs
) -> "pd.Series":
if nullable and self.dtype in cudf_dtypes_to_pandas_dtypes:
pandas_nullable_dtype = cudf_dtypes_to_pandas_dtypes[self.dtype]
arrow_array = self.to_arrow()
pandas_array = pandas_nullable_dtype.__from_arrow__(arrow_array)
pd_series = pd.Series(pandas_array, copy=False)
elif str(self.dtype) in NUMERIC_TYPES and self.null_count == 0:
pd_series = pd.Series(cupy.asnumpy(self.values), copy=False)
elif is_interval_dtype(self.dtype):
pd_series = pd.Series(
pd.IntervalDtype().__from_arrow__(self.to_arrow())
)
else:
pd_series = self.to_arrow().to_pandas(**kwargs)
"""Convert object to pandas type.

The default implementation falls back to PyArrow for the conversion.
"""
pd_series = self.to_arrow().to_pandas(**kwargs)
vyasr marked this conversation as resolved.
Show resolved Hide resolved

if index is not None:
pd_series.index = index
return pd_series

def __iter__(self):
    """Delegate to ``cudf.utils.utils.raise_iteration_error``.

    NOTE(review): judging by the helper's name, this rejects iteration by
    raising rather than yielding elements -- confirm against the helper.
    """
    # TODO: Why don't we just implement this method in terms of one of the
    # proposed alternatives (to_arrow, to_pandas, or values_host)?
    cudf.utils.utils.raise_iteration_error(obj=self)

@property
Expand Down Expand Up @@ -333,46 +323,14 @@ def to_arrow(self) -> pa.Array:
4
]
"""
if isinstance(self, cudf.core.column.CategoricalColumn):
# arrow doesn't support unsigned codes
signed_type = (
min_signed_type(self.codes.max())
if self.codes.size > 0
else np.int8
)
codes = self.codes.astype(signed_type)
categories = self.categories

out_indices = codes.to_arrow()
out_dictionary = categories.to_arrow()

return pa.DictionaryArray.from_arrays(
out_indices, out_dictionary, ordered=self.ordered,
)

if isinstance(self, cudf.core.column.StringColumn) and (
self.null_count == len(self)
):
return pa.NullArray.from_buffers(
pa.null(), len(self), [pa.py_buffer((b""))]
)

result = libcudf.interop.to_arrow(
return libcudf.interop.to_arrow(
libcudf.table.Table(
cudf.core.column_accessor.ColumnAccessor({"None": self})
),
[["None"]],
keep_index=False,
)["None"].chunk(0)

if isinstance(self.dtype, cudf.Decimal64Dtype):
result = result.view(
pa.decimal128(
scale=result.type.scale, precision=self.dtype.precision
)
)
return result

@classmethod
def from_arrow(cls, array: pa.Array) -> ColumnBase:
"""
Expand Down Expand Up @@ -838,7 +796,7 @@ def find_last_value(self, value: ScalarLike, closest: bool = False) -> int:
return indices[-1]

def append(self, other: ColumnBase) -> ColumnBase:
return ColumnBase._concat([self, as_column(other)])
return self.__class__._concat([self, as_column(other)])

def quantile(
self,
Expand Down Expand Up @@ -890,9 +848,6 @@ def isin(self, values: Sequence) -> ColumnBase:
result: Column
Column of booleans indicating if each element is in values.
"""
lhs = self
rhs = None

try:
lhs, rhs = self._process_values_for_isin(values)
res = lhs._isin_earlystop(rhs)
Expand Down Expand Up @@ -1168,31 +1123,11 @@ def argsort(
return sorted_indices

@property
def __cuda_array_interface__(self) -> Mapping[builtins.str, Any]:
output = {
"shape": (len(self),),
"strides": (self.dtype.itemsize,),
"typestr": self.dtype.str,
"data": (self.data_ptr, False),
"version": 1,
}

if self.nullable and self.has_nulls:

# Create a simple Python object that exposes the
# `__cuda_array_interface__` attribute here since we need to modify
# some of the attributes from the numba device array
mask = SimpleNamespace(
__cuda_array_interface__={
"shape": (len(self),),
"typestr": "<t1",
"data": (self.mask_ptr, True),
"version": 1,
}
)
output["mask"] = mask

return output
def __cuda_array_interface__(self):
raise NotImplementedError(
f"dtype {self.dtype} is not yet supported via "
"`__cuda_array_interface__`"
)

def __add__(self, other):
return self.binary_operator("add", other)
Expand Down Expand Up @@ -1289,10 +1224,18 @@ def deserialize(cls, header: dict, frames: list) -> ColumnBase:
mask = Buffer.deserialize(header["mask"], [frames[1]])
return build_column(data=data, dtype=dtype, mask=mask)

def unary_operator(self, unaryop: builtins.str):
    """Reject a unary operation.

    Parameters
    ----------
    unaryop : str
        Name of the requested operation; it is only used in the error
        message.

    Raises
    ------
    TypeError
        Always, naming the operation and this column's dtype.
    """
    message = f"Operation {unaryop} not supported for dtype {self.dtype}."
    raise TypeError(message)

def binary_operator(
self, op: builtins.str, other: BinaryOperand, reflect: bool = False
) -> ColumnBase:
raise NotImplementedError
raise TypeError(
f"Operation {op} not supported between dtypes {self.dtype} and "
f"{other.dtype}."
)

def min(self, skipna: bool = None, dtype: Dtype = None):
result_col = self._process_for_reduction(skipna=skipna)
Expand Down
11 changes: 4 additions & 7 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,15 +139,12 @@ def to_pandas(
# https://issues.apache.org/jira/browse/ARROW-9772

# Pandas supports only `datetime64[ns]`, hence the cast.
pd_series = pd.Series(
self.astype("datetime64[ns]").to_array("NAT"), copy=False
return pd.Series(
self.astype("datetime64[ns]").to_array("NAT"),
copy=False,
index=index,
)

if index is not None:
pd_series.index = index

return pd_series

def get_dt_field(self, field: str) -> ColumnBase:
return libcudf.datetime.extract_datetime_component(self, field)

Expand Down
10 changes: 10 additions & 0 deletions python/cudf/cudf/core/column/interval.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# Copyright (c) 2018-2021, NVIDIA CORPORATION.
import pandas as pd
import pyarrow as pa

import cudf
from cudf._typing import ColumnLike
from cudf.core.column import StructColumn
from cudf.core.dtypes import IntervalDtype
from cudf.utils.dtypes import is_interval_dtype
Expand Down Expand Up @@ -110,3 +113,10 @@ def as_interval_column(self, dtype, **kwargs):
)
else:
raise ValueError("dtype must be IntervalDtype")

def to_pandas(
    self, index: ColumnLike = None, nullable: bool = False, **kwargs
) -> "pd.Series":
    """Convert this interval column to a pandas Series.

    Parameters
    ----------
    index : ColumnLike, optional
        Passed to the ``pd.Series`` constructor as the index.
    nullable : bool, default False
        Not used by this implementation.
    **kwargs
        Not used by this implementation.

    Returns
    -------
    pd.Series
        Series wrapping the array that ``pd.IntervalDtype().__from_arrow__``
        builds from this column's Arrow representation.
    """
    interval_values = pd.IntervalDtype().__from_arrow__(self.to_arrow())
    return pd.Series(interval_values, index=index)
kkraus14 marked this conversation as resolved.
Show resolved Hide resolved
53 changes: 51 additions & 2 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@

from __future__ import annotations

import builtins
from numbers import Number
from typing import Any, Callable, Sequence, Tuple, Union, cast
from types import SimpleNamespace
from typing import Any, Callable, Mapping, Sequence, Tuple, Union, cast

import cupy
import numpy as np
import pandas as pd
from numba import cuda, njit
Expand All @@ -27,6 +30,8 @@
from cudf.core.dtypes import Decimal64Dtype
from cudf.utils import cudautils, utils
from cudf.utils.dtypes import (
NUMERIC_TYPES,
cudf_dtypes_to_pandas_dtypes,
min_column_type,
min_signed_type,
numeric_normalize_types,
Expand Down Expand Up @@ -86,6 +91,33 @@ def __contains__(self, item: ScalarLike) -> bool:
self, column.as_column([item], dtype=self.dtype)
).any()

@property
def __cuda_array_interface__(self) -> Mapping[builtins.str, Any]:
    """Describe this column through a ``__cuda_array_interface__`` mapping.

    Returns
    -------
    Mapping[str, Any]
        The ``shape``, ``strides``, ``typestr``, ``data`` and ``version``
        entries for the data pointer, plus a ``mask`` entry when the column
        is nullable and has nulls.
    """
    shape = (len(self),)
    interface = {
        "shape": shape,
        "strides": (self.dtype.itemsize,),
        "typestr": self.dtype.str,
        "data": (self.data_ptr, False),
        "version": 1,
    }

    if not (self.nullable and self.has_nulls):
        return interface

    # Create a simple Python object that exposes the
    # `__cuda_array_interface__` attribute here since we need to modify
    # some of the attributes from the numba device array
    mask_interface = {
        "shape": shape,
        "typestr": "<t1",
        "data": (self.mask_ptr, True),
        "version": 1,
    }
    interface["mask"] = SimpleNamespace(
        __cuda_array_interface__=mask_interface
    )
    return interface

def unary_operator(self, unaryop: str) -> ColumnBase:
    """Apply the named unary operation to this column.

    The work is delegated to ``_numeric_column_unaryop`` with this column
    and ``unaryop`` passed as ``op``; its result is returned as-is.
    """
    result = _numeric_column_unaryop(self, op=unaryop)
    return result

Expand Down Expand Up @@ -407,7 +439,7 @@ def round(self, decimals: int = 0) -> NumericalColumn:
def applymap(
self, udf: Callable[[ScalarLike], ScalarLike], out_dtype: Dtype = None
) -> ColumnBase:
"""Apply an element-wise function to transform the values in the Column.
"""Apply an elementwise function to transform the values in the Column.

Parameters
----------
Expand Down Expand Up @@ -711,6 +743,23 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool:

return False

def to_pandas(
    self, index: ColumnLike = None, nullable: bool = False, **kwargs
) -> "pd.Series":
    """Convert this numerical column to a pandas Series.

    Parameters
    ----------
    index : ColumnLike, optional
        If not None, assigned as the index of the resulting Series.
    nullable : bool, default False
        If True and this column's dtype has an entry in
        ``cudf_dtypes_to_pandas_dtypes``, the Series is built from that
        pandas dtype's ``__from_arrow__``.
    **kwargs
        Forwarded to the Arrow array's ``to_pandas`` on the fallback path
        only.

    Returns
    -------
    pd.Series
    """
    if nullable and self.dtype in cudf_dtypes_to_pandas_dtypes:
        # Build the mapped pandas dtype's array from the Arrow
        # representation of this column.
        nullable_dtype = cudf_dtypes_to_pandas_dtypes[self.dtype]
        pandas_values = nullable_dtype.__from_arrow__(self.to_arrow())
        result = pd.Series(pandas_values, copy=False)
    elif str(self.dtype) in NUMERIC_TYPES and self.null_count == 0:
        # No nulls in a numeric dtype: convert the values directly with
        # cupy instead of going through Arrow.
        host_values = cupy.asnumpy(self.values)
        result = pd.Series(host_values, copy=False)
    else:
        result = self.to_arrow().to_pandas(**kwargs)

    if index is None:
        return result
    result.index = index
    return result


@annotate("BINARY_OP", color="orange", domain="cudf_python")
def _numeric_column_binop(
Expand Down
Loading