Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cudf.dtype function #8949

Merged
merged 27 commits into from
Aug 13, 2021
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
60c7c87
Replace cudf.dtype -> np.dtype
shwina Aug 4, 2021
5e50f52
First stab at cudf.dtype
shwina Aug 4, 2021
367b743
Handle datetimes/timedeltas in cudf.dtype
shwina Aug 4, 2021
d04a5f1
Fix test
shwina Aug 4, 2021
85351e9
Handle disallowed numpy types
shwina Aug 5, 2021
3c9dd97
Merge branch 'branch-21.10' of https://github.com/rapidsai/cudf into …
shwina Aug 5, 2021
67cca8a
Update python/cudf/cudf/tests/test_dtypes.py
shwina Aug 5, 2021
a10eae0
Some fixes
shwina Aug 6, 2021
89ac918
Remaining failures
shwina Aug 9, 2021
acda2ee
Merge branch 'cudf-dtype-function' of github.com:shwina/cudf into cud…
shwina Aug 9, 2021
64a3290
Style
shwina Aug 9, 2021
a62ab32
Update python/cudf/cudf/api/types.py
shwina Aug 9, 2021
f79e59f
cudf.dtype -> np.dtype
shwina Aug 10, 2021
9dceb80
Merge branch 'cudf-dtype-function' of github.com:shwina/cudf into cud…
shwina Aug 10, 2021
d0bef49
Merge branch 'branch-21.10' of https://github.com/rapidsai/cudf into …
shwina Aug 10, 2021
3eba47c
Progress
shwina Aug 11, 2021
048629c
More fix
shwina Aug 11, 2021
40736c4
Early returns
shwina Aug 11, 2021
550c7ba
More tests
shwina Aug 11, 2021
1cfa67c
Merge branch 'branch-21.10' of https://github.com/rapidsai/cudf into …
shwina Aug 11, 2021
72d6304
Resolve circular import issues
shwina Aug 11, 2021
c8925f5
Unused import
shwina Aug 12, 2021
26df99a
Space
shwina Aug 12, 2021
fec34d9
Add interval tests
shwina Aug 12, 2021
5fc19a9
:(
shwina Aug 12, 2021
11156f5
Merge branch 'branch-21.10' of https://github.com/rapidsai/cudf into …
shwina Aug 12, 2021
2a684be
Merge branch 'branch-21.10' of https://github.com/rapidsai/cudf into …
shwina Aug 13, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions python/cudf/cudf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
register_index_accessor,
register_series_accessor,
)
from cudf.api.types import dtype
from cudf.core import (
NA,
BaseIndex,
Expand Down
49 changes: 49 additions & 0 deletions python/cudf/cudf/api/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,55 @@
)


def dtype(arbitrary):
"""
Return the cuDF-supported dtype corresponding to `arbitrary`.

Inputs
------
shwina marked this conversation as resolved.
Show resolved Hide resolved
arbitrary: dtype or scalar-like

Returns
-------
dtype: the cuDF-supported dtype that best matches `arbitrary`
"""
# first, try interpreting arbitrary as a NumPy dtype that we support:
try:
np_dtype = np.dtype(arbitrary)
if np_dtype.name == "float16":
np_dtype = np.dtype("float32")
vyasr marked this conversation as resolved.
Show resolved Hide resolved
elif np_dtype.kind in ("OU"):
np_dtype = np.dtype("object")
vyasr marked this conversation as resolved.
Show resolved Hide resolved
except TypeError:
pass
else:
if np_dtype.kind not in "biufUOMm":
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if np_dtype.kind not in "biufUOMm":
if np_dtype not in cudf._lib.types.np_to_cudf_types:

To make this maintainable, should we just look up our np<->libcudf type map here? This way, any new dtype support added will automatically be supported here by cudf.dtype.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree -- but it would be nicer if the source of truth was in a more obviously named constant. For example, something like: cudf._lib.types.SUPPORTED_NUMPY_TYPES.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 to have a cudf._lib.types.SUPPORTED_NUMPY_TYPES

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's a slight problem: <M8 is an acceptable return type here, but it's not a SUPPORTED_NUMPY_TYPE (supported types are <M8[unit]).

raise TypeError(f"Unsupported type {np_dtype}")
return np_dtype

# next, check if `arbitrary` is one of our extension types:
if isinstance(arbitrary, cudf.core.dtypes._BaseDtype):
return arbitrary

# use `pandas_dtype` to try and interpret
# `arbitrary` as a Pandas extension type.
# Return the corresponding NumPy/cuDF type.
pd_dtype = pd.api.types.pandas_dtype(arbitrary)
try:
return pd_dtype.numpy_dtype
vyasr marked this conversation as resolved.
Show resolved Hide resolved
except AttributeError:
if isinstance(pd_dtype, pd.CategoricalDtype):
return cudf.CategoricalDtype.from_pandas(pd_dtype)
elif isinstance(pd_dtype, pd.StringDtype):
return np.dtype("object")
elif isinstance(pd_dtype, pd.IntervalDtype):
return cudf.IntervalDtype.from_pandas(pd_dtype)
else:
raise TypeError(
f"Cannot interpret {arbitrary} as a valid cuDF dtype"
)


def is_numeric_dtype(obj):
"""Check whether the provided array or dtype is of a numeric dtype.

Expand Down
20 changes: 9 additions & 11 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@
is_scalar,
is_string_dtype,
is_struct_dtype,
pandas_dtype,
)
from cudf.core.abc import Serializable
from cudf.core.buffer import Buffer
Expand Down Expand Up @@ -432,7 +431,7 @@ def view(self, dtype: Dtype) -> ColumnBase:

"""

dtype = np.dtype(dtype)
dtype = cudf.dtype(dtype)

if dtype.kind in ("o", "u", "s"):
raise TypeError(
Expand Down Expand Up @@ -889,7 +888,7 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase:
return self.as_numerical_column(dtype, **kwargs)
elif is_categorical_dtype(dtype):
return self.as_categorical_column(dtype, **kwargs)
elif pandas_dtype(dtype).type in {
elif cudf.dtype(dtype).type in {
np.str_,
np.object_,
str,
Expand Down Expand Up @@ -1299,7 +1298,7 @@ def column_empty(
) -> ColumnBase:
"""Allocate a new column like the given row_count and dtype.
"""
dtype = pandas_dtype(dtype)
dtype = cudf.dtype(dtype)
children = () # type: Tuple[ColumnBase, ...]

if is_struct_dtype(dtype):
Expand Down Expand Up @@ -1364,7 +1363,7 @@ def build_column(
offset : int, optional
children : tuple, optional
"""
dtype = pandas_dtype(dtype)
dtype = cudf.dtype(dtype)

if _is_non_decimal_numeric_dtype(dtype):
assert data is not None
Expand Down Expand Up @@ -1769,9 +1768,9 @@ def as_column(
col = ColumnBase.from_arrow(arbitrary)
if isinstance(arbitrary, pa.NullArray):
if type(dtype) == str and dtype == "empty":
new_dtype = pandas_dtype(arbitrary.type.to_pandas_dtype())
new_dtype = np.dtype(arbitrary.type.to_pandas_dtype())
else:
new_dtype = pandas_dtype(dtype)
new_dtype = np.dtype(dtype)
col = col.astype(new_dtype)

return col
Expand Down Expand Up @@ -1865,7 +1864,7 @@ def as_column(
arbitrary = np.ascontiguousarray(arbitrary)

if dtype is not None:
arbitrary = arbitrary.astype(dtype)
arbitrary = arbitrary.astype(np.dtype(dtype))

if arb_dtype.kind == "M":

Expand Down Expand Up @@ -2034,7 +2033,6 @@ def as_column(
return cudf.core.column.Decimal32Column.from_arrow(
data
)
dtype = pd.api.types.pandas_dtype(dtype)
np_type = np.dtype(dtype).type
if np_type == np.bool_:
pa_type = pa.bool_()
Expand Down Expand Up @@ -2088,7 +2086,7 @@ def _construct_array(
Construct a CuPy or NumPy array from `arbitrary`
"""
try:
dtype = dtype if dtype is None else np.dtype(dtype)
dtype = dtype if dtype is None else cudf.dtype(dtype)
arbitrary = cupy.asarray(arbitrary, dtype=dtype)
except (TypeError, ValueError):
native_dtype = dtype
Expand Down Expand Up @@ -2280,7 +2278,7 @@ def full(size: int, fill_value: ScalarLike, dtype: Dtype = None) -> ColumnBase:
def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
"""Concatenate a sequence of columns."""
if len(objs) == 0:
dtype = pandas_dtype(None)
dtype = cudf.dtype(None)
return column_empty(0, dtype=dtype, masked=True)

# If all columns are `NumericalColumn` with different dtypes,
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def __init__(
mask : Buffer; optional
The validity mask
"""
dtype = np.dtype(dtype)
dtype = cudf.dtype(dtype)
if data.size % dtype.itemsize:
raise ValueError("Buffer size must be divisible by element size")
if size is None:
Expand Down Expand Up @@ -236,7 +236,7 @@ def __cuda_array_interface__(self) -> Mapping[builtins.str, Any]:
return output

def as_datetime_column(self, dtype: Dtype, **kwargs) -> DatetimeColumn:
dtype = np.dtype(dtype)
dtype = cudf.dtype(dtype)
if dtype == self.dtype:
return self
return libcudf.unary.cast(self, dtype=dtype)
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def __init__(
The dtype associated with the data Buffer
mask : Buffer, optional
"""
dtype = np.dtype(dtype)
dtype = cudf.dtype(dtype)
if data.size % dtype.itemsize:
raise ValueError("Buffer size must be divisible by element size")
if size is None:
Expand Down Expand Up @@ -253,7 +253,7 @@ def as_decimal_column(
return libcudf.unary.cast(self, dtype)

def as_numerical_column(self, dtype: Dtype, **kwargs) -> NumericalColumn:
dtype = np.dtype(dtype)
dtype = cudf.dtype(dtype)
if dtype == self.dtype:
return self
return libcudf.unary.cast(self, dtype)
Expand Down Expand Up @@ -608,7 +608,7 @@ def _safe_cast_to_int(col: ColumnBase, dtype: DtypeObj) -> ColumnBase:
else:
raise TypeError(
f"Cannot safely cast non-equivalent "
f"{col.dtype.type.__name__} to {np.dtype(dtype).type.__name__}"
f"{col.dtype.type.__name__} to {cudf.dtype(dtype).type.__name__}"
)


Expand Down
8 changes: 4 additions & 4 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -5062,7 +5062,7 @@ def __contains__(self, item: ScalarLike) -> bool:
def as_numerical_column(
self, dtype: Dtype, **kwargs
) -> "cudf.core.column.NumericalColumn":
out_dtype = np.dtype(dtype)
out_dtype = cudf.dtype(dtype)

if out_dtype.kind in {"i", "u"}:
if not libstrings.is_integer(self).all():
Expand Down Expand Up @@ -5104,7 +5104,7 @@ def _as_datetime_or_timedelta_column(self, dtype, format):
def as_datetime_column(
self, dtype: Dtype, **kwargs
) -> "cudf.core.column.DatetimeColumn":
out_dtype = np.dtype(dtype)
out_dtype = cudf.dtype(dtype)

# infer on host from the first not na element
# or return all null column if all values
Expand All @@ -5128,7 +5128,7 @@ def as_datetime_column(
def as_timedelta_column(
self, dtype: Dtype, **kwargs
) -> "cudf.core.column.TimeDeltaColumn":
out_dtype = np.dtype(dtype)
out_dtype = cudf.dtype(dtype)
format = "%D days %H:%M:%S"
return self._as_datetime_or_timedelta_column(out_dtype, format)

Expand Down Expand Up @@ -5387,7 +5387,7 @@ def view(self, dtype) -> "cudf.core.column.ColumnBase":
raise ValueError(
"Can not produce a view of a string column with nulls"
)
dtype = np.dtype(dtype)
dtype = cudf.dtype(dtype)
str_byte_offset = self.base_children[0].element_indexing(self.offset)
str_end_byte_offset = self.base_children[0].element_indexing(
self.offset + self.size
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/column/timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def __init__(
The number of null values.
If None, it is calculated automatically.
"""
dtype = np.dtype(dtype)
dtype = cudf.dtype(dtype)
if data.size % dtype.itemsize:
raise ValueError("Buffer size must be divisible by element size")
if size is None:
Expand Down Expand Up @@ -353,7 +353,7 @@ def as_string_column(
)

def as_timedelta_column(self, dtype: Dtype, **kwargs) -> TimeDeltaColumn:
dtype = np.dtype(dtype)
dtype = cudf.dtype(dtype)
if dtype == self.dtype:
return self
return libcudf.unary.cast(self, dtype=dtype)
Expand Down
6 changes: 6 additions & 0 deletions python/cudf/cudf/core/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,6 +559,12 @@ def to_arrow(self):
pa.from_numpy_dtype(self.subtype), self.closed
)

# Alternate constructor: build an instance of this dtype class from a pandas
# IntervalDtype. Only the pandas dtype's `subtype` is forwarded to the
# constructor; `closed` is not passed along yet (the TODO below ties that
# to a Pandas upgrade).
@classmethod
def from_pandas(cls, pd_dtype: pd.IntervalDtype) -> "IntervalDtype":
return cls(
subtype=pd_dtype.subtype
) # TODO: needs `closed` when we upgrade Pandas


def is_categorical_dtype(obj):
"""Check whether an array-like or dtype is of the Categorical dtype.
Expand Down
3 changes: 2 additions & 1 deletion python/cudf/cudf/core/scalar.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pyarrow as pa
from pandas._libs.missing import NAType as pd_NAType

import cudf
from cudf._lib.scalar import DeviceScalar, _is_null_host_scalar
from cudf.core.column.column import ColumnBase
from cudf.core.dtypes import Decimal64Dtype, ListDtype, StructDtype
Expand Down Expand Up @@ -171,7 +172,7 @@ def _preprocess_host_value(self, value, dtype):
dtype = value.dtype

if not isinstance(dtype, Decimal64Dtype):
dtype = np.dtype(dtype)
dtype = cudf.dtype(dtype)

if not valid:
value = NA
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3764,7 +3764,7 @@ def one_hot_encoding(self, cats, dtype="float64"):
cats = cats.to_pandas()
else:
cats = pd.Series(cats, dtype="object")
dtype = np.dtype(dtype)
dtype = cudf.dtype(dtype)

def encode(cat):
if cat is None:
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/testing/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ def _get_args_kwars_for_assert_exceptions(func_args_and_kwargs):


def gen_rand(dtype, size, **kwargs):
dtype = np.dtype(dtype)
dtype = cudf.dtype(dtype)
if dtype.kind == "f":
res = np.random.random(size=size).astype(dtype)
if kwargs.get("positive_only", False):
Expand Down Expand Up @@ -284,7 +284,7 @@ def gen_rand(dtype, size, **kwargs):
return pd.to_datetime(
np.random.randint(low=low, high=high, size=size), unit=time_unit
)
elif dtype.kind == "U":
elif dtype.kind in ("O", "U"):
return pd.util.testing.rands_array(10, size)
raise NotImplementedError(f"dtype.kind={dtype.kind}")

Expand Down
Loading