rapidsai · rapids-bot · Aug 13, 2021 · Aug 4, 2021 · Aug 4, 2021 · Aug 4, 2021
@@ -15,6 +15,7 @@
     register_index_accessor,
     register_series_accessor,
 )
+from cudf.api.types import dtype
 from cudf.core import (
     NA,
     BaseIndex,

@@ -27,6 +27,55 @@
 )
 
 
+def dtype(arbitrary):
+    """
+    Return the cuDF-supported dtype corresponding to `arbitrary`.
+
+    Parameters
+    ----------
+    arbitrary: dtype or scalar-like
+
+    Returns
+    -------
+    dtype: the cuDF-supported dtype that best matches `arbitrary`
+
+    Raises
+    ------
+    TypeError
+        If `arbitrary` resolves to a NumPy dtype that is not a key of
+        ``cudf._lib.types.np_to_cudf_types``, or if it resolves to a
+        Pandas dtype without a ``numpy_dtype`` that is not a
+        categorical, string or interval dtype.
+    """
+    # first, try interpreting arbitrary as a NumPy dtype that we support:
+    try:
+        np_dtype = np.dtype(arbitrary)
+        if np_dtype.name == "float16":
+            # float16 is widened to float32
+            np_dtype = np.dtype("float32")
+        elif np_dtype.kind in "OU":
+            # object and unicode dtypes are both mapped to object
+            np_dtype = np.dtype("object")
+    except TypeError:
+        # not a NumPy dtype; fall through to the extension-type checks
+        pass
+    else:
+        # NOTE(review): membership in the NumPy -> libcudf mapping replaces
+        # the earlier `kind not in "biufUOMm"` test, as suggested in review
+        if np_dtype not in cudf._lib.types.np_to_cudf_types:
+            raise TypeError(f"Unsupported type {np_dtype}")
+        return np_dtype
+
+    #  next, check if `arbitrary` is one of our extension types:
+    if isinstance(arbitrary, cudf.core.dtypes._BaseDtype):
+        return arbitrary
+
+    # use `pandas_dtype` to try and interpret
+    # `arbitrary` as a Pandas extension type.
+    #  Return the corresponding NumPy/cuDF type.
+    pd_dtype = pd.api.types.pandas_dtype(arbitrary)
+    try:
+        return pd_dtype.numpy_dtype
+    except AttributeError:
+        # no NumPy equivalent: translate the Pandas extension dtype instead
+        if isinstance(pd_dtype, pd.CategoricalDtype):
+            return cudf.CategoricalDtype.from_pandas(pd_dtype)
+        elif isinstance(pd_dtype, pd.StringDtype):
+            return np.dtype("object")
+        elif isinstance(pd_dtype, pd.IntervalDtype):
+            return cudf.IntervalDtype.from_pandas(pd_dtype)
+        else:
+            raise TypeError(
+                f"Cannot interpret {arbitrary} as a valid cuDF dtype"
+            )
+
+
 def is_numeric_dtype(obj):
     """Check whether the provided array or dtype is of a numeric dtype.
 

@@ -53,7 +53,6 @@
     is_scalar,
     is_string_dtype,
     is_struct_dtype,
-    pandas_dtype,
 )
 from cudf.core.abc import Serializable
 from cudf.core.buffer import Buffer
@@ -432,7 +431,7 @@ def view(self, dtype: Dtype) -> ColumnBase:
 
         """
 
-        dtype = np.dtype(dtype)
+        dtype = cudf.dtype(dtype)
 
         if dtype.kind in ("o", "u", "s"):
             raise TypeError(
@@ -889,7 +888,7 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase:
             return self.as_numerical_column(dtype, **kwargs)
         elif is_categorical_dtype(dtype):
             return self.as_categorical_column(dtype, **kwargs)
-        elif pandas_dtype(dtype).type in {
+        elif cudf.dtype(dtype).type in {
             np.str_,
             np.object_,
             str,
@@ -1299,7 +1298,7 @@ def column_empty(
 ) -> ColumnBase:
     """Allocate a new column like the given row_count and dtype.
     """
-    dtype = pandas_dtype(dtype)
+    dtype = cudf.dtype(dtype)
     children = ()  # type: Tuple[ColumnBase, ...]
 
     if is_struct_dtype(dtype):
@@ -1364,7 +1363,7 @@ def build_column(
     offset : int, optional
     children : tuple, optional
     """
-    dtype = pandas_dtype(dtype)
+    dtype = cudf.dtype(dtype)
 
     if _is_non_decimal_numeric_dtype(dtype):
         assert data is not None
@@ -1769,9 +1768,9 @@ def as_column(
         col = ColumnBase.from_arrow(arbitrary)
         if isinstance(arbitrary, pa.NullArray):
             if type(dtype) == str and dtype == "empty":
-                new_dtype = pandas_dtype(arbitrary.type.to_pandas_dtype())
+                new_dtype = np.dtype(arbitrary.type.to_pandas_dtype())
             else:
-                new_dtype = pandas_dtype(dtype)
+                new_dtype = np.dtype(dtype)
             col = col.astype(new_dtype)
 
         return col
@@ -1865,7 +1864,7 @@ def as_column(
             arbitrary = np.ascontiguousarray(arbitrary)
 
         if dtype is not None:
-            arbitrary = arbitrary.astype(dtype)
+            arbitrary = arbitrary.astype(np.dtype(dtype))
 
         if arb_dtype.kind == "M":
 
@@ -2034,7 +2033,6 @@ def as_column(
                         return cudf.core.column.Decimal32Column.from_arrow(
                             data
                         )
-                    dtype = pd.api.types.pandas_dtype(dtype)
                     np_type = np.dtype(dtype).type
                     if np_type == np.bool_:
                         pa_type = pa.bool_()
@@ -2088,7 +2086,7 @@ def _construct_array(
     Construct a CuPy or NumPy array from `arbitrary`
     """
     try:
-        dtype = dtype if dtype is None else np.dtype(dtype)
+        dtype = dtype if dtype is None else cudf.dtype(dtype)
         arbitrary = cupy.asarray(arbitrary, dtype=dtype)
     except (TypeError, ValueError):
         native_dtype = dtype
@@ -2280,7 +2278,7 @@ def full(size: int, fill_value: ScalarLike, dtype: Dtype = None) -> ColumnBase:
 def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
     """Concatenate a sequence of columns."""
     if len(objs) == 0:
-        dtype = pandas_dtype(None)
+        dtype = cudf.dtype(None)
         return column_empty(0, dtype=dtype, masked=True)
 
     # If all columns are `NumericalColumn` with different dtypes,

@@ -71,7 +71,7 @@ def __init__(
         mask : Buffer; optional
             The validity mask
         """
-        dtype = np.dtype(dtype)
+        dtype = cudf.dtype(dtype)
         if data.size % dtype.itemsize:
             raise ValueError("Buffer size must be divisible by element size")
         if size is None:
@@ -236,7 +236,7 @@ def __cuda_array_interface__(self) -> Mapping[builtins.str, Any]:
         return output
 
     def as_datetime_column(self, dtype: Dtype, **kwargs) -> DatetimeColumn:
-        dtype = np.dtype(dtype)
+        dtype = cudf.dtype(dtype)
         if dtype == self.dtype:
             return self
         return libcudf.unary.cast(self, dtype=dtype)

@@ -53,7 +53,7 @@ def __init__(
             The dtype associated with the data Buffer
         mask : Buffer, optional
         """
-        dtype = np.dtype(dtype)
+        dtype = cudf.dtype(dtype)
         if data.size % dtype.itemsize:
             raise ValueError("Buffer size must be divisible by element size")
         if size is None:
@@ -253,7 +253,7 @@ def as_decimal_column(
         return libcudf.unary.cast(self, dtype)
 
     def as_numerical_column(self, dtype: Dtype, **kwargs) -> NumericalColumn:
-        dtype = np.dtype(dtype)
+        dtype = cudf.dtype(dtype)
         if dtype == self.dtype:
             return self
         return libcudf.unary.cast(self, dtype)
@@ -608,7 +608,7 @@ def _safe_cast_to_int(col: ColumnBase, dtype: DtypeObj) -> ColumnBase:
     else:
         raise TypeError(
             f"Cannot safely cast non-equivalent "
-            f"{col.dtype.type.__name__} to {np.dtype(dtype).type.__name__}"
+            f"{col.dtype.type.__name__} to {cudf.dtype(dtype).type.__name__}"
         )
 
 

@@ -5062,7 +5062,7 @@ def __contains__(self, item: ScalarLike) -> bool:
     def as_numerical_column(
         self, dtype: Dtype, **kwargs
     ) -> "cudf.core.column.NumericalColumn":
-        out_dtype = np.dtype(dtype)
+        out_dtype = cudf.dtype(dtype)
 
         if out_dtype.kind in {"i", "u"}:
             if not libstrings.is_integer(self).all():
@@ -5104,7 +5104,7 @@ def _as_datetime_or_timedelta_column(self, dtype, format):
     def as_datetime_column(
         self, dtype: Dtype, **kwargs
     ) -> "cudf.core.column.DatetimeColumn":
-        out_dtype = np.dtype(dtype)
+        out_dtype = cudf.dtype(dtype)
 
         # infer on host from the first not na element
         # or return all null column if all values
@@ -5128,7 +5128,7 @@ def as_datetime_column(
     def as_timedelta_column(
         self, dtype: Dtype, **kwargs
     ) -> "cudf.core.column.TimeDeltaColumn":
-        out_dtype = np.dtype(dtype)
+        out_dtype = cudf.dtype(dtype)
         format = "%D days %H:%M:%S"
         return self._as_datetime_or_timedelta_column(out_dtype, format)
 
@@ -5387,7 +5387,7 @@ def view(self, dtype) -> "cudf.core.column.ColumnBase":
             raise ValueError(
                 "Can not produce a view of a string column with nulls"
             )
-        dtype = np.dtype(dtype)
+        dtype = cudf.dtype(dtype)
         str_byte_offset = self.base_children[0].element_indexing(self.offset)
         str_end_byte_offset = self.base_children[0].element_indexing(
             self.offset + self.size

@@ -60,7 +60,7 @@ def __init__(
             The number of null values.
             If None, it is calculated automatically.
         """
-        dtype = np.dtype(dtype)
+        dtype = cudf.dtype(dtype)
         if data.size % dtype.itemsize:
             raise ValueError("Buffer size must be divisible by element size")
         if size is None:
@@ -353,7 +353,7 @@ def as_string_column(
             )
 
     def as_timedelta_column(self, dtype: Dtype, **kwargs) -> TimeDeltaColumn:
-        dtype = np.dtype(dtype)
+        dtype = cudf.dtype(dtype)
         if dtype == self.dtype:
             return self
         return libcudf.unary.cast(self, dtype=dtype)

@@ -559,6 +559,12 @@ def to_arrow(self):
             pa.from_numpy_dtype(self.subtype), self.closed
         )
 
+    @classmethod
+    def from_pandas(cls, pd_dtype: pd.IntervalDtype) -> "IntervalDtype":
+        """Build an instance of this class from a Pandas interval dtype.
+
+        Only the ``subtype`` of `pd_dtype` is forwarded to the constructor.
+        """
+        # TODO: needs `closed` when we upgrade Pandas
+        interval_subtype = pd_dtype.subtype
+        return cls(subtype=interval_subtype)
+
 
 def is_categorical_dtype(obj):
     """Check whether an array-like or dtype is of the Categorical dtype.

@@ -5,6 +5,7 @@
 import pyarrow as pa
 from pandas._libs.missing import NAType as pd_NAType
 
+import cudf
 from cudf._lib.scalar import DeviceScalar, _is_null_host_scalar
 from cudf.core.column.column import ColumnBase
 from cudf.core.dtypes import Decimal64Dtype, ListDtype, StructDtype
@@ -171,7 +172,7 @@ def _preprocess_host_value(self, value, dtype):
                 dtype = value.dtype
 
         if not isinstance(dtype, Decimal64Dtype):
-            dtype = np.dtype(dtype)
+            dtype = cudf.dtype(dtype)
 
         if not valid:
             value = NA

@@ -3764,7 +3764,7 @@ def one_hot_encoding(self, cats, dtype="float64"):
             cats = cats.to_pandas()
         else:
             cats = pd.Series(cats, dtype="object")
-        dtype = np.dtype(dtype)
+        dtype = cudf.dtype(dtype)
 
         def encode(cat):
             if cat is None:

@@ -245,7 +245,7 @@ def _get_args_kwars_for_assert_exceptions(func_args_and_kwargs):
 
 
 def gen_rand(dtype, size, **kwargs):
-    dtype = np.dtype(dtype)
+    dtype = cudf.dtype(dtype)
     if dtype.kind == "f":
         res = np.random.random(size=size).astype(dtype)
         if kwargs.get("positive_only", False):
@@ -284,7 +284,7 @@ def gen_rand(dtype, size, **kwargs):
         return pd.to_datetime(
             np.random.randint(low=low, high=high, size=size), unit=time_unit
         )
-    elif dtype.kind == "U":
+    elif dtype.kind in ("O", "U"):
         return pd.util.testing.rands_array(10, size)
     raise NotImplementedError(f"dtype.kind={dtype.kind}")