rapidsai · rapids-bot · Aug 13, 2021 · Aug 4, 2021 · Aug 4, 2021 · Aug 4, 2021
@@ -15,6 +15,7 @@
     register_index_accessor,
     register_series_accessor,
 )
+from cudf.api.types import dtype
 from cudf.core import (
     NA,
     BaseIndex,

@@ -27,6 +27,65 @@
 )
 
 
+def dtype(arbitrary):
+    """Return the dtype cuDF uses to represent `arbitrary`.
+
+    NumPy-compatible inputs are normalized: float16 becomes float32,
+    object and unicode dtypes become object, and unit-less
+    datetime64/timedelta64 become nanosecond resolution.
+
+    Parameters
+    ----------
+    arbitrary : dtype-like
+        A NumPy dtype, a cuDF or pandas extension dtype, or any
+        object that ``numpy.dtype`` or
+        ``pandas.api.types.pandas_dtype`` can interpret.
+
+    Returns
+    -------
+    numpy.dtype or cuDF extension dtype
+
+    Raises
+    ------
+    TypeError
+        Propagated from ``pandas_dtype`` when `arbitrary` cannot be
+        interpreted as a dtype.
+    """
+    try:
+        np_dtype = np.dtype(arbitrary)
+    except TypeError:
+        # not NumPy-compatible; try the extension dtypes below
+        pass
+    else:
+        if np_dtype.name == "float16":
+            return np.dtype("float32")
+        if np_dtype.kind in ("O", "U"):
+            # match on kind rather than name so that sized unicode
+            # dtypes such as "<U5" (named "str160") are covered too
+            return np.dtype("object")
+        if (
+            np_dtype.kind in ("m", "M")
+            and np.datetime_data(np_dtype)[0] == "generic"
+        ):
+            # no unit given: default to nanoseconds
+            return np.dtype(f"<{np_dtype.kind}8[ns]")
+        return np_dtype
+
+    if isinstance(arbitrary, cudf.core.dtypes._BaseDtype):
+        return arbitrary
+    if isinstance(arbitrary, pd.CategoricalDtype):
+        return cudf.CategoricalDtype.from_pandas(arbitrary)
+    if isinstance(arbitrary, pd.IntervalDtype):
+        return cudf.IntervalDtype.from_pandas(arbitrary)
+
+    pd_dtype = pd.api.types.pandas_dtype(arbitrary)
+    try:
+        return pd_dtype.numpy_dtype
+    except AttributeError:
+        # pandas dtype without a NumPy counterpart: use object
+        return np.dtype("object")
+
+
 def is_numeric_dtype(obj):
     """Check whether the provided array or dtype is of a numeric dtype.
 

@@ -432,7 +432,7 @@ def view(self, dtype: Dtype) -> ColumnBase:
 
         """
 
-        dtype = np.dtype(dtype)
+        dtype = cudf.dtype(dtype)
 
         if dtype.kind in ("o", "u", "s"):
             raise TypeError(
@@ -2078,11 +2078,11 @@ def as_column(
                             data
                         )
                     dtype = pd.api.types.pandas_dtype(dtype)
-                    np_type = np.dtype(dtype).type
+                    np_type = cudf.dtype(dtype).type
                     if np_type == np.bool_:
                         pa_type = pa.bool_()
                     else:
-                        pa_type = np_to_pa_dtype(np.dtype(dtype))
+                        pa_type = np_to_pa_dtype(cudf.dtype(dtype))
                 data = as_column(
                     pa.array(
                         arbitrary,
@@ -2131,7 +2131,7 @@ def _construct_array(
     Construct a CuPy or NumPy array from `arbitrary`
     """
     try:
-        dtype = dtype if dtype is None else np.dtype(dtype)
+        dtype = dtype if dtype is None else cudf.dtype(dtype)
         arbitrary = cupy.asarray(arbitrary, dtype=dtype)
     except (TypeError, ValueError):
         native_dtype = dtype

@@ -71,7 +71,7 @@ def __init__(
         mask : Buffer; optional
             The validity mask
         """
-        dtype = np.dtype(dtype)
+        dtype = cudf.dtype(dtype)
         if data.size % dtype.itemsize:
             raise ValueError("Buffer size must be divisible by element size")
         if size is None:
@@ -236,7 +236,7 @@ def __cuda_array_interface__(self) -> Mapping[builtins.str, Any]:
         return output
 
     def as_datetime_column(self, dtype: Dtype, **kwargs) -> DatetimeColumn:
-        dtype = np.dtype(dtype)
+        dtype = cudf.dtype(dtype)
         if dtype == self.dtype:
             return self
         return libcudf.unary.cast(self, dtype=dtype)

@@ -53,7 +53,7 @@ def __init__(
             The dtype associated with the data Buffer
         mask : Buffer, optional
         """
-        dtype = np.dtype(dtype)
+        dtype = cudf.dtype(dtype)
         if data.size % dtype.itemsize:
             raise ValueError("Buffer size must be divisible by element size")
         if size is None:
@@ -253,7 +253,7 @@ def as_decimal_column(
         return libcudf.unary.cast(self, dtype)
 
     def as_numerical_column(self, dtype: Dtype, **kwargs) -> NumericalColumn:
-        dtype = np.dtype(dtype)
+        dtype = cudf.dtype(dtype)
         if dtype == self.dtype:
             return self
         return libcudf.unary.cast(self, dtype)
@@ -608,7 +608,7 @@ def _safe_cast_to_int(col: ColumnBase, dtype: DtypeObj) -> ColumnBase:
     else:
         raise TypeError(
             f"Cannot safely cast non-equivalent "
-            f"{col.dtype.type.__name__} to {np.dtype(dtype).type.__name__}"
+            f"{col.dtype.type.__name__} to {cudf.dtype(dtype).type.__name__}"
         )
 
 

@@ -5054,7 +5054,7 @@ def __contains__(self, item: ScalarLike) -> bool:
     def as_numerical_column(
         self, dtype: Dtype, **kwargs
     ) -> "cudf.core.column.NumericalColumn":
-        out_dtype = np.dtype(dtype)
+        out_dtype = cudf.dtype(dtype)
 
         if out_dtype.kind in {"i", "u"}:
             if not libstrings.is_integer(self).all():
@@ -5096,7 +5096,7 @@ def _as_datetime_or_timedelta_column(self, dtype, format):
     def as_datetime_column(
         self, dtype: Dtype, **kwargs
     ) -> "cudf.core.column.DatetimeColumn":
-        out_dtype = np.dtype(dtype)
+        out_dtype = cudf.dtype(dtype)
 
         # infer on host from the first not na element
         # or return all null column if all values
@@ -5120,7 +5120,7 @@ def as_datetime_column(
     def as_timedelta_column(
         self, dtype: Dtype, **kwargs
     ) -> "cudf.core.column.TimeDeltaColumn":
-        out_dtype = np.dtype(dtype)
+        out_dtype = cudf.dtype(dtype)
         format = "%D days %H:%M:%S"
         return self._as_datetime_or_timedelta_column(out_dtype, format)
 
@@ -5379,7 +5379,7 @@ def view(self, dtype) -> "cudf.core.column.ColumnBase":
             raise ValueError(
                 "Can not produce a view of a string column with nulls"
             )
-        dtype = np.dtype(dtype)
+        dtype = cudf.dtype(dtype)
         str_byte_offset = self.base_children[0].element_indexing(self.offset)
         str_end_byte_offset = self.base_children[0].element_indexing(
             self.offset + self.size

@@ -60,7 +60,7 @@ def __init__(
             The number of null values.
             If None, it is calculated automatically.
         """
-        dtype = np.dtype(dtype)
+        dtype = cudf.dtype(dtype)
         if data.size % dtype.itemsize:
             raise ValueError("Buffer size must be divisible by element size")
         if size is None:
@@ -353,7 +353,7 @@ def as_string_column(
             )
 
     def as_timedelta_column(self, dtype: Dtype, **kwargs) -> TimeDeltaColumn:
-        dtype = np.dtype(dtype)
+        dtype = cudf.dtype(dtype)
         if dtype == self.dtype:
             return self
         return libcudf.unary.cast(self, dtype=dtype)

@@ -559,6 +559,16 @@ def to_arrow(self):
             pa.from_numpy_dtype(self.subtype), self.closed
         )
 
+    @classmethod
+    def from_pandas(cls, pd_dtype: pd.IntervalDtype) -> "IntervalDtype":
+        """Build an ``IntervalDtype`` from its pandas counterpart.
+
+        Only ``subtype`` is carried over from `pd_dtype`.
+        """
+        # TODO: needs `closed` when we upgrade Pandas
+        subtype = pd_dtype.subtype
+        return cls(subtype=subtype)
+
 
 def is_categorical_dtype(obj):
     """Check whether an array-like or dtype is of the Categorical dtype.

@@ -5,6 +5,7 @@
 import pyarrow as pa
 from pandas._libs.missing import NAType as pd_NAType
 
+import cudf
 from cudf._lib.scalar import DeviceScalar, _is_null_host_scalar
 from cudf.core.column.column import ColumnBase
 from cudf.core.dtypes import Decimal64Dtype, ListDtype, StructDtype
@@ -171,7 +172,7 @@ def _preprocess_host_value(self, value, dtype):
                 dtype = value.dtype
 
         if not isinstance(dtype, Decimal64Dtype):
-            dtype = np.dtype(dtype)
+            dtype = cudf.dtype(dtype)
 
         if not valid:
             value = NA

@@ -3774,7 +3774,7 @@ def one_hot_encoding(self, cats, dtype="float64"):
             cats = cats.to_pandas()
         else:
             cats = pd.Series(cats, dtype="object")
-        dtype = np.dtype(dtype)
+        dtype = cudf.dtype(dtype)
 
         def encode(cat):
             if cat is None:

@@ -245,7 +245,7 @@ def _get_args_kwars_for_assert_exceptions(func_args_and_kwargs):
 
 
 def gen_rand(dtype, size, **kwargs):
-    dtype = np.dtype(dtype)
+    dtype = cudf.dtype(dtype)
     if dtype.kind == "f":
         res = np.random.random(size=size).astype(dtype)
         if kwargs.get("positive_only", False):

@@ -380,7 +380,7 @@ def rand_dataframe(
                 )
             )
         else:
-            dtype = np.dtype(dtype)
+            dtype = cudf.dtype(dtype)
             if dtype.kind in ("i", "u"):
                 column_params.append(
                     ColumnParameters(
@@ -428,7 +428,7 @@ def rand_dataframe(
                             dtype=dtype, size=cardinality
                         ),
                         is_sorted=False,
-                        dtype=np.dtype(dtype),
+                        dtype=cudf.dtype(dtype),
                     )
                 )
             elif dtype.kind == "m":
@@ -440,7 +440,7 @@ def rand_dataframe(
                             dtype=dtype, size=cardinality
                         ),
                         is_sorted=False,
-                        dtype=np.dtype(dtype),
+                        dtype=cudf.dtype(dtype),
                     )
                 )
             elif dtype.kind == "b":
@@ -450,7 +450,7 @@ def rand_dataframe(
                         null_frequency=null_frequency,
                         generator=boolean_generator(cardinality),
                         is_sorted=False,
-                        dtype=np.dtype(dtype),
+                        dtype=cudf.dtype(dtype),
                     )
                 )
             else:
@@ -538,7 +538,7 @@ def get_values_for_nested_data(dtype, lists_max_length):
     Returns list of values based on dtype.
     """
     cardinality = np.random.randint(0, lists_max_length)
-    dtype = np.dtype(dtype)
+    dtype = cudf.dtype(dtype)
     if dtype.kind in ("i", "u"):
         values = int_generator(dtype=dtype, size=cardinality)()
     elif dtype.kind == "f":

@@ -931,7 +931,7 @@ def test_ufunc_ops(lhs, rhs, ops):
 def dtype_scalar(val, dtype):
     if dtype == "str":
         return str(val)
-    dtype = np.dtype(dtype)
+    dtype = cudf.dtype(dtype)
     if dtype.type in {np.datetime64, np.timedelta64}:
         res, _ = np.datetime_data(dtype)
         return dtype.type(val, res)
@@ -1695,13 +1695,15 @@ def test_binops_with_lhs_numpy_scalar(frame, dtype):
     )
 
     if dtype == "datetime64[s]":
-        val = np.dtype(dtype).type(4, "s")
+        val = cudf.dtype(dtype).type(4, "s")
     elif dtype == "timedelta64[s]":
-        val = np.dtype(dtype).type(4, "s")
+        val = cudf.dtype(dtype).type(4, "s")
     elif dtype == "category":
         val = np.int64(4)
+    elif dtype == "str":
+        val = str(4)
     else:
-        val = np.dtype(dtype).type(4)
+        val = cudf.dtype(dtype).type(4)
 
     expected = val == data.to_pandas()
     got = val == data
@@ -2793,11 +2795,11 @@ def test_column_null_scalar_comparison(dtype, null_scalar, cmpop):
     # a new series where all the elements are <NA>.
 
     if isinstance(null_scalar, np.datetime64):
-        if np.dtype(dtype).kind not in "mM":
+        if cudf.dtype(dtype).kind not in "mM":
             pytest.skip()
         null_scalar = null_scalar.astype(dtype)
 
-    dtype = np.dtype(dtype)
+    dtype = cudf.dtype(dtype)
 
     data = [1, 2, 3, 4, 5]
     sr = cudf.Series(data, dtype=dtype)

@@ -799,7 +799,7 @@ def test_categorical_setitem_with_nan():
 @pytest.mark.parametrize("dtype", list(NUMERIC_TYPES) + ["object"])
 @pytest.mark.parametrize("input_obj", [[1, cudf.NA, 3]])
 def test_series_construction_with_nulls(input_obj, dtype):
-    dtype = np.dtype(dtype)
+    dtype = cudf.dtype(dtype)
     input_obj = [
         dtype.type(v) if v is not cudf.NA else cudf.NA for v in input_obj
     ]

@@ -4,6 +4,7 @@
 import pandas as pd
 import pytest
 
+import cudf
 from cudf import Series
 from cudf.core.index import RangeIndex, as_index
 from cudf.testing._utils import (
@@ -82,7 +83,7 @@ def test_rangeindex_contains():
 
 @pytest.mark.parametrize("dtype", NUMERIC_TYPES)
 def test_lists_contains(dtype):
-    dtype = np.dtype(dtype)
+    dtype = cudf.dtype(dtype)
     inner_data = np.array([1, 2, 3], dtype=dtype)
 
     data = Series([inner_data])
@@ -96,7 +97,7 @@ def test_lists_contains(dtype):
 
 @pytest.mark.parametrize("dtype", DATETIME_TYPES + TIMEDELTA_TYPES)
 def test_lists_contains_datetime(dtype):
-    dtype = np.dtype(dtype)
+    dtype = cudf.dtype(dtype)
     inner_data = np.array([1, 2, 3])
 
     unit, _ = np.datetime_data(dtype)

@@ -257,3 +257,43 @@ def test_lists_of_structs_dtype(data):
 
     assert_column_array_dtype_equal(got._column, expected)
     assert expected.equals(got._column.to_arrow())
+
+
+@pytest.mark.parametrize(
+    ("in_dtype", "expect"),
+    [
+        # NumPy dtype instances and scalar types
+        (np.dtype("int8"), np.dtype("int8")),
+        (np.int8, np.dtype("int8")),
+        (np.float16, np.dtype("float32")),
+        # pandas extension dtypes
+        (pd.Int8Dtype(), np.dtype("int8")),
+        (pd.StringDtype(), np.dtype("object")),
+        # dtype strings and Python builtin types
+        ("int8", np.dtype("int8")),
+        ("boolean", np.dtype("bool")),
+        (int, np.dtype("int64")),
+        (float, np.dtype("float64")),
+        # cuDF extension dtypes are preserved
+        (cudf.ListDtype("int64"), cudf.ListDtype("int64")),
+        # inputs that are mapped to a different dtype
+        ("float16", np.dtype("float32")),
+        (np.dtype("U"), np.dtype("object")),
+        # timedelta64: unit-less maps to ns, explicit units are kept
+        ("timedelta64", np.dtype("<m8[ns]")),
+        ("timedelta64[ns]", np.dtype("<m8[ns]")),
+        ("timedelta64[ms]", np.dtype("<m8[ms]")),
+        ("timedelta64[D]", np.dtype("<m8[D]")),
+        ("<m8[s]", np.dtype("<m8[s]")),
+        # datetime64: unit-less maps to ns, explicit units are kept
+        ("datetime64", np.dtype("<M8[ns]")),
+        ("datetime64[ns]", np.dtype("<M8[ns]")),
+        ("datetime64[ms]", np.dtype("<M8[ms]")),
+        ("datetime64[D]", np.dtype("<M8[D]")),
+        ("<M8[s]", np.dtype("<M8[s]")),
+    ],
+)
+def test_dtype(in_dtype, expect):
+    """Check that ``cudf.dtype`` maps `in_dtype` to `expect`."""
+    got = cudf.dtype(in_dtype)
+    assert_eq(got, expect)
@@ -810,7 +810,7 @@ def test_join_datetimes_index(dtype):
     pdf = pdf_lhs.join(pdf_rhs, sort=True)
     gdf = gdf_lhs.join(gdf_rhs, sort=True)
 
-    assert gdf["d"].dtype == np.dtype(dtype)
+    assert gdf["d"].dtype == cudf.dtype(dtype)
 
     assert_join_results_equal(pdf, gdf, how="inner")