From 1c83eacc21fb0c0910d2c26861bb7dda3bbb6462 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 28 Jul 2020 06:47:01 -0700
Subject: [PATCH 01/80] initial dtype work

---
 python/cudf/cudf/core/dtypes.py | 172 ++++++++++++++++++++++++++++++++
 python/cudf/cudf/core/series.py |   4 +
 2 files changed, 176 insertions(+)

diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 8d313b19707..09eef0b1790 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -9,6 +9,178 @@
 
 import cudf
 
+pa_to_pd_dtypes = {
+    pa.uint8(): pd.UInt8Dtype(),
+    pa.uint16(): pd.UInt16Dtype(),
+    pa.uint32(): pd.UInt32Dtype(),
+    pa.uint64(): pd.UInt64Dtype(),
+    pa.int8(): pd.Int8Dtype(),
+    pa.int16(): pd.Int16Dtype(),
+    pa.int32(): pd.Int32Dtype(),
+    pa.int64(): pd.Int64Dtype(),
+    pa.bool_(): pd.BooleanDtype(),
+    pa.string(): pd.StringDtype(),
+    pa.float32(): np.float32(),
+    pa.float64(): np.float64(),
+    pa.timestamp('ns'): np.dtype('datetime64[ns]'),
+    pa.timestamp('us'): np.dtype('datetime64[us]'),
+    pa.timestamp('ms'): np.dtype('datetime64[ms]'),
+    pa.timestamp('s'): np.dtype('datetime64[s]'),
+}
+
+pa_to_np_dtypes = {
+    pa.uint8(): np.dtype('uint8'),
+    pa.uint16(): np.dtype('uint16'),
+    pa.uint32(): np.dtype('uint32'),
+    pa.uint64(): np.dtype('uint64'),
+    pa.int8(): np.dtype('int8'),
+    pa.int16(): np.dtype('int16'),
+    pa.int32(): np.dtype('int32'),
+    pa.int64(): np.dtype('int64'),
+    pa.bool_(): np.dtype('bool'),
+    pa.string(): np.dtype('object'),
+    pa.float32(): np.dtype('float32'),
+    pa.float64(): np.dtype('float64'),
+    pa.timestamp('ns'): np.dtype('datetime64[ns]'),
+    pa.timestamp('us'): np.dtype('datetime64[us]'),
+    pa.timestamp('ms'): np.dtype('datetime64[ms]'),
+    pa.timestamp('s'): np.dtype('datetime64[s]'),
+}
+
+class Dtype(ExtensionDtype):
+    def __init__(self, arg):
+        cudf_dtype = make_dtype_from_obj(arg)
+        cudf_dtype.__init__(self)
+
+    def __eq__(self, other):
+        if isinstance(other, self.to_pandas.__class__) or other is self.to_pandas.__class__:
+            return True
+        if self.to_numpy == other:
+            return True
+        raise NotImplementedError
+
+    @property
+    def to_numpy(self):
+        return pa_to_np_dtypes[self.pa_type]
+
+    @property
+    def to_pandas(self):
+        return pa_to_pd_dtypes[self.pa_type]
+
+    @property
+    def type(self):
+        return self.pandas_dtype().type
+
+class UInt8Dtype(Dtype):
+    def __init__(self):
+        self.pa_type = pa.uint8()
+        
+class UInt16Dtype(Dtype):
+    def __init__(self):
+        self.pa_type = pa.uint16()
+
+class UInt32Dtype(Dtype):
+    def __init__(self):
+        self.pa_type = pa.uint32()
+
+class UInt64Dtype(Dtype):
+    def __init__(self):
+        self.pa_type = pa.uint64()
+
+class Int8Dtype(Dtype):
+    def __init__(self):
+        self.pa_type = pa.int8()
+
+class Int16Dtype(Dtype):
+    def __init__(self):
+        self.pa_type = pa.int16()
+
+class Int32Dtype(Dtype):
+    def __init__(self):
+        self.pa_type = pa.int32()
+
+class Int64Dtype(Dtype):
+    def __init__(self):
+        self.pa_type = pa.int64()
+
+class Float32Dtype(Dtype):
+    def __init__(self):
+        self.pa_type = pa.float32()
+
+class Float64Dtype(Dtype):
+    def __init__(self):
+        self.pa_type = pa.float64()
+
+class BooleanDtype(Dtype):
+    def __init__(self):
+        self.pa_type = pa.bool()
+
+class Datetime64NSDtype(Dtype):
+    def __init__(self):
+        self.pa_type = pa.timestamp('ns')
+
+class Datetime64USDtype(Dtype):
+    def __init__(self):
+        self.pa_type = pa.timestamp('us')
+
+class Datetime64MSDtype(Dtype):
+    def __init__(self):
+        self.pa_type = pa.timestamp('ms')
+
+class Datetime64SDtype(Dtype):
+    def __init__(self):
+        self.pa_type = pa.timestamp('s')
+
+class StringDtype(Dtype):
+    def __init__(self):
+        self.pa_type = pa.string()
+
+def make_dtype_from_string(obj):
+    if obj in {'str', 'string', 'object'}:
+        return StringDtype
+    elif 'datetime' in obj:
+        if obj == 'datetime64[ns]':
+            return Datetime64NSDtype
+        elif obj == 'datetime64[us]':
+            return Datetime64USDtype
+        elif obj == 'datetime64[ms]':
+            return Datetime64MSDtype
+        elif obj == 'datetime64[s]':
+            return Datetime64SDtype
+    elif 'int' in obj or 'Int' in obj:
+        if obj in {'int', 'Int', 'int64', 'Int64'}:
+            return Int64Dtype
+        elif obj in {'int32', 'Int32'}:
+            return Int32Dtype
+        elif obj in {'int16', 'Int16'}:
+            return Int16Dtype
+        elif obj in {'int8', 'Int8'}:
+            return Int8Dtype
+        elif obj in {'uint64', 'UInt64'}:
+            return UInt64Dtype
+        elif obj in {'uint32', 'UInt32'}:
+            return UInt32Dtype
+        elif obj in {'uint16', 'UInt16'}:
+            return UInt16Dtype
+        elif obj in {'uint8', 'Uint8'}:
+            return UInt8Dtype
+    elif 'float' in obj:
+        if obj in {'float64', 'Float64'}:
+            return Float64Dtype
+        elif obj in {'float32', 'Float32'}:
+            return Float32Dtype
+    elif 'bool' in obj:
+        return BooleanDtype
+
+def make_dtype_from_numpy(obj):
+    np_to_pd_types = {v: k for k, v in pd_to_np_dtypes.items()}
+    result = np_to_pd_types.get(obj)
+
+def make_dtype_from_obj(obj):
+    if isinstance(obj, np.dtype):
+        return make_dtype_from_numpy(obj)
+    elif isinstance(obj, str):
+        return make_dtype_from_string(obj)
 
 class CategoricalDtype(ExtensionDtype):
     def __init__(self, categories=None, ordered=None):
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 5e01e110c28..277772331b9 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -10,6 +10,8 @@
 from pandas._config import get_option
 from pandas.api.types import is_dict_like
 
+from cudf.core.dtypes import Dtype
+
 import cudf
 from cudf import _lib as libcudf
 from cudf._lib.nvtx import annotate
@@ -143,6 +145,8 @@ def __init__(
             ``null`` values.
             If ``False``, leaves ``np.nan`` values as is.
         """
+        if dtype:
+            dtype = Dtype(dtype)
         if isinstance(data, pd.Series):
             if name is None:
                 name = data.name

From 33bd96c83675f02c06a3bb22be59356006af2be4 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 28 Jul 2020 13:43:16 -0700
Subject: [PATCH 02/80] begin to plumb dtype

---
 python/cudf/cudf/core/column/column.py | 18 +++++-------------
 python/cudf/cudf/core/dtypes.py        |  7 +++++++
 python/cudf/cudf/core/series.py        |  3 +--
 3 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 4b68e4a1159..b73a88b7ee2 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1639,23 +1639,15 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
                 memoryview(arbitrary), dtype=dtype, nan_as_null=nan_as_null
             )
         except TypeError:
-            pa_type = None
-            np_type = None
             try:
                 if dtype is not None:
                     dtype = pd.api.types.pandas_dtype(dtype)
-                    if is_categorical_dtype(dtype):
+                    if dtype.is_categorical_dtype:
                         raise TypeError
-                    else:
-                        np_type = np.dtype(dtype).type
-                        if np_type == np.bool_:
-                            pa_type = pa.bool_()
-                        else:
-                            pa_type = np_to_pa_dtype(np.dtype(dtype))
                 data = as_column(
                     pa.array(
                         arbitrary,
-                        type=pa_type,
+                        type=dtype.pa_type,
                         from_pandas=True
                         if nan_as_null is None
                         else nan_as_null,
@@ -1664,14 +1656,14 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
                     nan_as_null=nan_as_null,
                 )
             except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
-                if is_categorical_dtype(dtype):
+                if dtype.is_categorical_dtype:
                     sr = pd.Series(arbitrary, dtype="category")
                     data = as_column(sr, nan_as_null=nan_as_null, dtype=dtype)
-                elif np_type == np.str_:
+                elif dtype.to_numpy == np.str_:
                     sr = pd.Series(arbitrary, dtype="str")
                     data = as_column(sr, nan_as_null=nan_as_null)
                 else:
-                    native_dtype = dtype
+                    native_dtype = dtype.to_numpy
                     if dtype is None and pd.api.types.infer_dtype(
                         arbitrary
                     ) in ("mixed", "mixed-integer"):
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 09eef0b1790..fbaa760c0b5 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -45,9 +45,13 @@
     pa.timestamp('us'): np.dtype('datetime64[us]'),
     pa.timestamp('ms'): np.dtype('datetime64[ms]'),
     pa.timestamp('s'): np.dtype('datetime64[s]'),
+    None: None
 }
 
 class Dtype(ExtensionDtype):
+
+    is_categorical_dtype = False
+    pa_type = None
     def __init__(self, arg):
         cudf_dtype = make_dtype_from_obj(arg)
         cudf_dtype.__init__(self)
@@ -183,6 +187,9 @@ def make_dtype_from_obj(obj):
         return make_dtype_from_string(obj)
 
 class CategoricalDtype(ExtensionDtype):
+
+    is_categorical_dtype = True
+
     def __init__(self, categories=None, ordered=None):
         """
         dtype similar to pd.CategoricalDtype with the categories
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 277772331b9..f4a026996ff 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -145,8 +145,7 @@ def __init__(
             ``null`` values.
             If ``False``, leaves ``np.nan`` values as is.
         """
-        if dtype:
-            dtype = Dtype(dtype)
+        dtype = Dtype(dtype)
         if isinstance(data, pd.Series):
             if name is None:
                 name = data.name

From baf138c99789ce82684a26286625f6b9fadbf924 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 29 Jul 2020 06:42:19 -0700
Subject: [PATCH 03/80] migrate dtypes to cudf main __init__

---
 python/cudf/cudf/__init__.py | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
index d953f517e4a..38b31e5e7b7 100644
--- a/python/cudf/cudf/__init__.py
+++ b/python/cudf/cudf/__init__.py
@@ -31,7 +31,27 @@
     from_pandas,
     merge,
 )
-from cudf.core.dtypes import CategoricalDtype
+from cudf.core.dtypes import (
+    Dtype,
+    CategoricalDtype, 
+    Int8Dtype,
+    Int16Dtype, 
+    Int32Dtype, 
+    Int64Dtype, 
+    UInt8Dtype, 
+    UInt16Dtype,
+    UInt32Dtype, 
+    UInt64Dtype, 
+    StringDtype,
+    Float32Dtype,
+    Float64Dtype, 
+    BooleanDtype,
+    Datetime64NSDtype,
+    Datetime64USDtype, 
+    Datetime64MSDtype,
+    Datetime64SDtype
+)
+
 from cudf.core.groupby import Grouper
 from cudf.core.ops import (
     arccos,

From bdb87fa9c887e8b43af86ac53a52ee81683ffd1a Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 29 Jul 2020 11:02:44 -0700
Subject: [PATCH 04/80] numerical column plumbing

---
 python/cudf/cudf/core/column/column.py    | 25 ++++++++---------------
 python/cudf/cudf/core/column/numerical.py |  4 ++--
 2 files changed, 11 insertions(+), 18 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index b73a88b7ee2..01284c11062 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -878,11 +878,11 @@ def distinct_count(self, method="sort", dropna=True):
         return cpp_distinct_count(self, ignore_nulls=dropna)
 
     def astype(self, dtype, **kwargs):
-        if is_categorical_dtype(dtype):
+        if dtype.is_categorical:
             return self.as_categorical_column(dtype, **kwargs)
-        elif np.issubdtype(dtype, np.datetime64):
+        elif dtype.is_datetime:
             return self.as_datetime_column(dtype, **kwargs)
-        elif pd.api.types.pandas_dtype(dtype).type in (np.str_, np.object_):
+        elif dtype.is_string:
             return self.as_string_column(dtype, **kwargs)
         else:
             return self.as_numerical_column(dtype, **kwargs)
@@ -1447,7 +1447,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
             )
             data = cudf.core.column.NumericalColumn(
                 data=padata,
-                dtype=np.dtype(arbitrary.type.to_pandas_dtype()),
+                dtype=dtype,
                 mask=pamask,
                 size=pa_size,
                 offset=pa_offset,
@@ -1642,19 +1642,12 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
             try:
                 if dtype is not None:
                     dtype = pd.api.types.pandas_dtype(dtype)
-                    if dtype.is_categorical_dtype:
+                    if dtype.is_categorical:
                         raise TypeError
-                data = as_column(
-                    pa.array(
-                        arbitrary,
-                        type=dtype.pa_type,
-                        from_pandas=True
-                        if nan_as_null is None
-                        else nan_as_null,
-                    ),
-                    dtype=dtype,
-                    nan_as_null=nan_as_null,
-                )
+
+                pa_data = pa.array(arbitrary, type=dtype.pa_type, from_pandas=True if nan_as_null is None else nan_as_null)
+                data = as_column(pa_data, dtype=cudf.Dtype(pa_data.type), nan_as_null=nan_as_null)
+
             except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                 if dtype.is_categorical_dtype:
                     sr = pd.Series(arbitrary, dtype="category")
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 27281111993..02494d7617c 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -32,7 +32,7 @@ def __init__(
             The dtype associated with the data Buffer
         mask : Buffer, optional
         """
-        dtype = np.dtype(dtype)
+        dtype = cudf.Dtype(dtype)
         if data.size % dtype.itemsize:
             raise ValueError("Buffer size must be divisible by element size")
         if size is None:
@@ -139,7 +139,7 @@ def as_string_column(self, dtype, **kwargs):
 
         if len(self) > 0:
             return string._numeric_to_str_typecast_functions[
-                np.dtype(self.dtype)
+                dtype.to_numpy
             ](self, **kwargs)
         else:
             return as_column([], dtype="object")

From 4a3fe713e9d1ee58aa4ceb6c34c1c8694a027400 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 29 Jul 2020 11:03:36 -0700
Subject: [PATCH 05/80] update dtype classes, mappings

---
 python/cudf/cudf/core/dtypes.py | 110 +++++++++++++++++++++++++++-----
 1 file changed, 93 insertions(+), 17 deletions(-)

diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index fbaa760c0b5..4fad10aaaee 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -49,10 +49,17 @@
 }
 
 class Dtype(ExtensionDtype):
+    is_integer = False
+    is_string = False
+    is_boolean = False
+    is_categorical = False
+    is_datetime = False
+    is_list = False
+    is_float = False
 
-    is_categorical_dtype = False
     pa_type = None
     def __init__(self, arg):
+
         cudf_dtype = make_dtype_from_obj(arg)
         cudf_dtype.__init__(self)
 
@@ -71,71 +78,93 @@ def to_numpy(self):
     def to_pandas(self):
         return pa_to_pd_dtypes[self.pa_type]
 
+    @property
+    def itemsize(self):
+        return self.to_numpy.itemsize
+
     @property
     def type(self):
         return self.pandas_dtype().type
 
-class UInt8Dtype(Dtype):
+    def __repr__(self):
+        return self.pa_type.__repr__()
+
+    def __hash__(self):
+        return hash(self.__repr__())
+
+class IntDtype(Dtype):
+    is_integer = True
+
+class UInt8Dtype(IntDtype):
     def __init__(self):
         self.pa_type = pa.uint8()
         
-class UInt16Dtype(Dtype):
+class UInt16Dtype(IntDtype):
     def __init__(self):
         self.pa_type = pa.uint16()
 
-class UInt32Dtype(Dtype):
+class UInt32Dtype(IntDtype):
     def __init__(self):
         self.pa_type = pa.uint32()
 
-class UInt64Dtype(Dtype):
+class UInt64Dtype(IntDtype):
     def __init__(self):
         self.pa_type = pa.uint64()
 
-class Int8Dtype(Dtype):
+class Int8Dtype(IntDtype):
     def __init__(self):
         self.pa_type = pa.int8()
 
-class Int16Dtype(Dtype):
+class Int16Dtype(IntDtype):
     def __init__(self):
         self.pa_type = pa.int16()
 
-class Int32Dtype(Dtype):
+class Int32Dtype(IntDtype):
     def __init__(self):
         self.pa_type = pa.int32()
 
-class Int64Dtype(Dtype):
+class Int64Dtype(IntDtype):
     def __init__(self):
         self.pa_type = pa.int64()
 
-class Float32Dtype(Dtype):
+
+class FloatDtype(Dtype):
+    is_float = True
+
+class Float32Dtype(FloatDtype):
     def __init__(self):
         self.pa_type = pa.float32()
 
-class Float64Dtype(Dtype):
+class Float64Dtype(FloatDtype):
     def __init__(self):
         self.pa_type = pa.float64()
 
 class BooleanDtype(Dtype):
+    is_boolean = True
     def __init__(self):
-        self.pa_type = pa.bool()
+        self.pa_type = pa.bool_()
 
-class Datetime64NSDtype(Dtype):
+class DatetimeDtype(Dtype):
+    is_datetime = True
+
+class Datetime64NSDtype(DatetimeDtype):
     def __init__(self):
         self.pa_type = pa.timestamp('ns')
 
-class Datetime64USDtype(Dtype):
+class Datetime64USDtype(DatetimeDtype):
     def __init__(self):
         self.pa_type = pa.timestamp('us')
 
-class Datetime64MSDtype(Dtype):
+class Datetime64MSDtype(DatetimeDtype):
     def __init__(self):
         self.pa_type = pa.timestamp('ms')
 
-class Datetime64SDtype(Dtype):
+class Datetime64SDtype(DatetimeDtype):
     def __init__(self):
         self.pa_type = pa.timestamp('s')
 
 class StringDtype(Dtype):
+    is_string = True
     def __init__(self):
         self.pa_type = pa.string()
 
@@ -176,13 +205,20 @@ def make_dtype_from_string(obj):
     elif 'bool' in obj:
         return BooleanDtype
 
+
+
 def make_dtype_from_numpy(obj):
     np_to_pd_types = {v: k for k, v in pd_to_np_dtypes.items()}
     result = np_to_pd_types.get(obj)
+    return result
 
 def make_dtype_from_obj(obj):
+    if isinstance(obj, Dtype):
+        return np_to_cudf_dtypes[obj.to_numpy]
     if isinstance(obj, np.dtype):
-        return make_dtype_from_numpy(obj)
+        return np_to_cudf_dtypes[obj]
+    elif isinstance(obj, pa.lib.DataType):
+        return pa_to_cudf_dtypes[obj]
     elif isinstance(obj, str):
         return make_dtype_from_string(obj)
 
@@ -346,3 +382,43 @@ def __repr__(self):
             return f"ListDtype({self.element_type.__repr__()})"
         else:
             return f"ListDtype({self.element_type})"
+
+
+pa_to_cudf_dtypes = {
+    pa.uint8(): UInt8Dtype,
+    pa.uint16(): UInt16Dtype,
+    pa.uint32(): UInt32Dtype,
+    pa.uint64(): UInt64Dtype,
+    pa.int8(): Int8Dtype,
+    pa.int16(): Int16Dtype,
+    pa.int32(): Int32Dtype,
+    pa.int64(): Int64Dtype,
+    pa.bool_(): BooleanDtype,
+    pa.string(): StringDtype,
+    pa.float32(): Float32Dtype,
+    pa.float64(): Float64Dtype,
+    pa.timestamp('ns'): Datetime64NSDtype,
+    pa.timestamp('us'): Datetime64USDtype,
+    pa.timestamp('ms'): Datetime64MSDtype,
+    pa.timestamp('s'): Datetime64SDtype,
+    None: Dtype
+}
+
+np_to_cudf_dtypes = {
+    np.dtype('int8'): Int8Dtype,
+    np.dtype('int16'): Int16Dtype,
+    np.dtype('int32'): Int32Dtype,
+    np.dtype('int64'): Int64Dtype,
+    np.dtype('uint8'): UInt8Dtype,
+    np.dtype('uint16'): UInt16Dtype,
+    np.dtype('uint32'): UInt32Dtype,
+    np.dtype('uint64'): UInt64Dtype,
+    np.dtype('bool'): BooleanDtype,
+    np.dtype('object'): StringDtype,
+    np.dtype('float32'): Float32Dtype,
+    np.dtype('float64'): Float64Dtype,
+    np.dtype('datetime64[ns]'): Datetime64NSDtype,
+    np.dtype('datetime64[us]'): Datetime64USDtype,
+    np.dtype('datetime64[ms]'): Datetime64MSDtype,
+    np.dtype('datetime64[s]'): Datetime64SDtype,
+}

From 1cf2c3ef58e805eaa55850520cbb5ed96849025a Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 29 Jul 2020 11:04:00 -0700
Subject: [PATCH 06/80] start to plumb stringcolumn

---
 python/cudf/cudf/core/column/string.py | 31 +++++++++++++-------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 987c23c8139..ceaf6b4ff3e 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -134,25 +134,26 @@
     is_scalar,
     is_string_dtype,
 )
+from cudf.core.dtypes import Dtype
 
 _str_to_numeric_typecast_functions = {
-    np.dtype("int8"): str_cast.stoi8,
-    np.dtype("int16"): str_cast.stoi16,
-    np.dtype("int32"): str_cast.stoi,
-    np.dtype("int64"): str_cast.stol,
-    np.dtype("uint8"): str_cast.stoui8,
-    np.dtype("uint16"): str_cast.stoui16,
-    np.dtype("uint32"): str_cast.stoui,
-    np.dtype("uint64"): str_cast.stoul,
-    np.dtype("float32"): str_cast.stof,
-    np.dtype("float64"): str_cast.stod,
-    np.dtype("bool"): str_cast.to_booleans,
+    Dtype("int8"): str_cast.stoi8,
+    Dtype("int16"): str_cast.stoi16,
+    Dtype("int32"): str_cast.stoi,
+    Dtype("int64"): str_cast.stol,
+    Dtype("uint8"): str_cast.stoui8,
+    Dtype("uint16"): str_cast.stoui16,
+    Dtype("uint32"): str_cast.stoui,
+    Dtype("uint64"): str_cast.stoul,
+    Dtype("float32"): str_cast.stof,
+    Dtype("float64"): str_cast.stod,
+    Dtype("bool"): str_cast.to_booleans,
     # TODO: support Date32 UNIX days
     # np.dtype("datetime64[D]"): str_cast.timestamp2int,
-    np.dtype("datetime64[s]"): str_cast.timestamp2int,
-    np.dtype("datetime64[ms]"): str_cast.timestamp2int,
-    np.dtype("datetime64[us]"): str_cast.timestamp2int,
-    np.dtype("datetime64[ns]"): str_cast.timestamp2int,
+    Dtype("datetime64[s]"): str_cast.timestamp2int,
+    Dtype("datetime64[ms]"): str_cast.timestamp2int,
+    Dtype("datetime64[us]"): str_cast.timestamp2int,
+    Dtype("datetime64[ns]"): str_cast.timestamp2int,
 }
 
 _numeric_to_str_typecast_functions = {

From dbc4970057ca03a7de370f9a5882a9707e064e36 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 30 Jul 2020 07:22:36 -0700
Subject: [PATCH 07/80] inherit from basic cython class

---
 python/cudf/cudf/_lib/column.pyx | 10 +++-------
 python/cudf/cudf/_lib/types.pxd  |  4 ++++
 python/cudf/cudf/_lib/types.pyx  | 13 +++++++++++++
 python/cudf/cudf/core/dtypes.py  |  4 ++--
 4 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
index d061d7065de..5b48d92f1b6 100644
--- a/python/cudf/cudf/_lib/column.pyx
+++ b/python/cudf/cudf/_lib/column.pyx
@@ -27,6 +27,7 @@ from cudf._lib.move cimport move
 from cudf._lib.cpp.column.column cimport column, column_contents
 from cudf._lib.cpp.column.column_view cimport column_view
 cimport cudf._lib.cpp.types as libcudf_types
+from cudf._lib.types cimport _Dtype
 
 
 cdef class Column:
@@ -352,13 +353,8 @@ cdef class Column:
             col = self.base_children[0]
         else:
             col = self
-        data_dtype = col.dtype
-        cdef libcudf_types.type_id tid = <libcudf_types.type_id> (
-            <underlying_type_t_type_id> (
-                np_to_cudf_types[np.dtype(data_dtype)]
-            )
-        )
-        cdef libcudf_types.data_type dtype = libcudf_types.data_type(tid)
+        cdef _Dtype data_dtype = col.dtype
+        cdef libcudf_types.data_type dtype = data_dtype.get_libcudf_type()
         cdef libcudf_types.size_type offset = self.offset
         cdef vector[column_view] children
         cdef void* data
diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd
index 923fbe0aa7c..f6c0c39174a 100644
--- a/python/cudf/cudf/_lib/types.pxd
+++ b/python/cudf/cudf/_lib/types.pxd
@@ -2,6 +2,7 @@
 
 from libc.stdint cimport int32_t
 from libcpp cimport bool
+from cudf._lib.cpp.types cimport data_type
 
 ctypedef bool underlying_type_t_order
 ctypedef bool underlying_type_t_null_order
@@ -9,3 +10,6 @@ ctypedef bool underlying_type_t_sorted
 ctypedef int32_t underlying_type_t_interpolation
 ctypedef int32_t underlying_type_t_type_id
 ctypedef bool underlying_type_t_null_policy
+
+cdef class _Dtype:
+    cdef data_type get_libcudf_type(self)
diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx
index f5351d12b03..fc8d4fada22 100644
--- a/python/cudf/cudf/_lib/types.pyx
+++ b/python/cudf/cudf/_lib/types.pyx
@@ -11,6 +11,7 @@ from cudf._lib.types cimport (
     underlying_type_t_interpolation
 )
 cimport cudf._lib.cpp.types as libcudf_types
+from cudf._lib.cpp.types cimport data_type
 
 
 class TypeId(IntEnum):
@@ -119,3 +120,15 @@ class NullOrder(IntEnum):
 class NullHandling(IntEnum):
     INCLUDE = <underlying_type_t_null_policy> libcudf_types.null_policy.INCLUDE
     EXCLUDE = <underlying_type_t_null_policy> libcudf_types.null_policy.EXCLUDE
+
+
+cdef class _Dtype:
+    cdef data_type get_libcudf_type(self):
+        np_dtype = self.to_numpy
+        cdef libcudf_types.type_id tid = <libcudf_types.type_id> (
+                <underlying_type_t_type_id> (
+                    np_to_cudf_types[np_dtype]
+                )
+            )
+        cdef data_type libcudf_type = libcudf_types.data_type(tid)
+        return libcudf_type
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 4fad10aaaee..b610f29b4b7 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -6,7 +6,7 @@
 import pandas as pd
 import pyarrow as pa
 from pandas.api.extensions import ExtensionDtype
-
+from cudf._lib.types import _Dtype
 import cudf
 
 pa_to_pd_dtypes = {
@@ -48,7 +48,7 @@
     None: None
 }
 
-class Dtype(ExtensionDtype):
+class Dtype(ExtensionDtype, _Dtype):
     is_integer = False
     is_string = False
     is_boolean = False

From ba42bd8140d8390b53bd56253f42b316bc98d28f Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 30 Jul 2020 08:11:32 -0700
Subject: [PATCH 08/80] plumb numerical column __repr__, default_na_value

---
 python/cudf/cudf/_lib/column.pyx          |  2 +-
 python/cudf/cudf/core/column/column.py    | 12 +++++-----
 python/cudf/cudf/core/column/numerical.py |  4 ++--
 python/cudf/cudf/core/dtypes.py           | 29 +++++++++++++++++++++--
 4 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
index 5b48d92f1b6..7cf3549ed1c 100644
--- a/python/cudf/cudf/_lib/column.pyx
+++ b/python/cudf/cudf/_lib/column.pyx
@@ -349,7 +349,7 @@ cdef class Column:
         return self._view(c_null_count)
 
     cdef column_view _view(self, libcudf_types.size_type null_count) except *:
-        if is_categorical_dtype(self.dtype):
+        if self.dtype.is_categorical:
             col = self.base_children[0]
         else:
             col = self
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 01284c11062..baa8a329847 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -81,10 +81,10 @@ def data_array_view(self):
         """
         View the data as a device array object
         """
-        if self.dtype == "object":
+        if self.dtype.is_string:
             raise ValueError("Cannot get an array view of a StringColumn")
 
-        if is_categorical_dtype(self.dtype):
+        if self.dtype.is_categorical:
             return self.codes.data_array_view
         else:
             dtype = self.dtype
@@ -95,7 +95,7 @@ def data_array_view(self):
         result = cuda.devicearray.DeviceNDArray(
             shape=(result.nbytes // dtype.itemsize,),
             strides=(dtype.itemsize,),
-            dtype=dtype,
+            dtype=dtype.to_numpy,
             gpu_data=result.gpu_data,
         )
         return result
@@ -1320,11 +1320,11 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
 
         if isinstance(col, cudf.core.column.CategoricalColumn):
             return col
-        elif np.issubdtype(col.dtype, np.floating):
+        elif col.dtype.is_float:
             if nan_as_null or (mask is None and nan_as_null is None):
                 mask = libcudf.transform.nans_to_nulls(col.fillna(np.nan))
                 col = col.set_mask(mask)
-        elif np.issubdtype(col.dtype, np.datetime64):
+        elif col.dtype.is_datetime:
             if nan_as_null or (mask is None and nan_as_null is None):
                 col = utils.time_col_replace_nulls(col)
         return col
@@ -1602,7 +1602,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
                 arb_dtype = np.dtype("O")
             else:
                 arb_dtype = check_cast_unsupported_dtype(arbitrary.dtype)
-                if arb_dtype != arbitrary.dtype.numpy_dtype:
+                if arb_dtype != arbitrary.dtype.to_numpy:
                     arbitrary = arbitrary.astype(arb_dtype)
         if arb_dtype.kind in ("O", "U"):
             data = as_column(pa.Array.from_pandas(arbitrary), dtype=arb_dtype)
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 02494d7617c..8d45af88626 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -266,9 +266,9 @@ def default_na_value(self):
         if dkind == "f":
             return self.dtype.type(np.nan)
         elif dkind == "i":
-            return np.iinfo(self.dtype).min
+            return np.iinfo(self.dtype.to_numpy).min
         elif dkind == "u":
-            return np.iinfo(self.dtype).max
+            return np.iinfo(self.dtype.to_numpy).max
         elif dkind == "b":
             return self.dtype.type(False)
         else:
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index b610f29b4b7..fe4710a2de9 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -84,7 +84,15 @@ def itemsize(self):
 
     @property
     def type(self):
-        return self.pandas_dtype().type
+        return self.to_pandas.type
+
+    @property
+    def kind(self):
+        return self.to_pandas.kind
+
+    @property
+    def name(self):
+        return self._name
 
     def __repr__(self):
         return self.pa_type.__repr__()
@@ -92,41 +100,50 @@ def __repr__(self):
     def __hash__(self):
         return hash(self.__repr__())
 
+    
+
 class IntDtype(Dtype):
     is_integer = True
 
 class UInt8Dtype(IntDtype):
     def __init__(self):
         self.pa_type = pa.uint8()
+        self._name = "UInt8"
         
 class UInt16Dtype(IntDtype):
     def __init__(self):
         self.pa_type = pa.uint16()
+        self._name = "UInt16"
 
 class UInt32Dtype(IntDtype):
     def __init__(self):
         self.pa_type = pa.uint32()
+        self._name = "UInt32"
 
 class UInt64Dtype(IntDtype):
     def __init__(self):
         self.pa_type = pa.uint64()
+        self._name = "UInt64"
 
 class Int8Dtype(IntDtype):
     def __init__(self):
         self.pa_type = pa.int8()
+        self._name = "Int8"
 
 class Int16Dtype(IntDtype):
     def __init__(self):
         self.pa_type = pa.int16()
+        self._name = "Int16"
 
 class Int32Dtype(IntDtype):
     def __init__(self):
         self.pa_type = pa.int32()
+        self._name = "Int32"
 
 class Int64Dtype(IntDtype):
     def __init__(self):
         self.pa_type = pa.int64()
-
+        self._name = "Int64"
 
 class FloatDtype(Dtype):
     is_float = True
@@ -134,15 +151,18 @@ class FloatDtype(Dtype):
 class Float32Dtype(FloatDtype):
     def __init__(self):
         self.pa_type = pa.float32()
+        self._name = "Float32"
 
 class Float64Dtype(FloatDtype):
     def __init__(self):
         self.pa_type = pa.float64()
+        self._name = "Float64"
 
 class BooleanDtype(Dtype):
     is_boolean = True
     def __init__(self):
         self.pa_type = pa.bool_()
+        self._name = "Boolean"
 
 class DatetimeDtype(Dtype):
     is_datetime = True
@@ -150,23 +170,28 @@ class DatetimeDtype(Dtype):
 class Datetime64NSDtype(DatetimeDtype):
     def __init__(self):
         self.pa_type = pa.timestamp('ns')
+        self._name = "Datetime64NS"
 
 class Datetime64USDtype(DatetimeDtype):
     def __init__(self):
         self.pa_type = pa.timestamp('us')
+        self._name = "Datetime64US"
 
 class Datetime64MSDtype(DatetimeDtype):
     def __init__(self):
         self.pa_type = pa.timestamp('ms')
+        self._name = "Datetime64MS"
 
 class Datetime64SDtype(DatetimeDtype):
     def __init__(self):
         self.pa_type = pa.timestamp('s')
+        self._name = "Datetime64S"
 
 class StringDtype(Dtype):
     is_string = True
     def __init__(self):
         self.pa_type = pa.string()
+        self._name = "String"
 
 def make_dtype_from_string(obj):
     if obj in {'str', 'string', 'object'}:

From 60272e2a8578ab02b4c7221b84652d717bf14881 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 30 Jul 2020 15:04:35 -0700
Subject: [PATCH 09/80] plumb some parts of unary

---
 python/cudf/cudf/_lib/unary.pyx | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/python/cudf/cudf/_lib/unary.pyx b/python/cudf/cudf/_lib/unary.pyx
index 2511556656e..11f7524f934 100644
--- a/python/cudf/cudf/_lib/unary.pyx
+++ b/python/cudf/cudf/_lib/unary.pyx
@@ -6,12 +6,15 @@ from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 import numpy as np
 
+
+from cudf.core.dtypes import Float64Dtype
 from cudf._lib.column cimport Column
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport (
     column_view, mutable_column_view
 )
 from cudf._lib.types import np_to_cudf_types
+from cudf._lib.types cimport _Dtype
 from cudf._lib.cpp.types cimport (
     size_type,
     data_type,
@@ -90,16 +93,11 @@ def is_valid(Column input):
     return Column.from_unique_ptr(move(c_result))
 
 
-def cast(Column input, object dtype=np.float64):
+def cast(Column input, object dtype=Float64Dtype()):
     cdef column_view c_input = input.view()
-    cdef type_id tid = (
-        <type_id> (
-            <underlying_type_t_type_id> (
-                np_to_cudf_types[np.dtype(dtype)]
-            )
-        )
-    )
-    cdef data_type c_dtype = data_type(tid)
+    cdef _Dtype data_dtype = dtype
+
+    cdef data_type c_dtype = data_dtype.get_libcudf_type()
     cdef unique_ptr[column] c_result
 
     with nogil:

From c03be406e5cfea22cc7847c53b378096eac35415 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 30 Jul 2020 15:05:40 -0700
Subject: [PATCH 10/80] make a factory and fix bugs

---
 python/cudf/cudf/core/column/column.py    |  16 ++--
 python/cudf/cudf/core/column/numerical.py |  11 ++-
 python/cudf/cudf/core/column/string.py    |  64 ++++++-------
 python/cudf/cudf/core/dtypes.py           | 106 +++++++++++-----------
 python/cudf/cudf/core/series.py           |   4 +-
 5 files changed, 103 insertions(+), 98 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index baa8a329847..2365e904881 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -37,7 +37,7 @@
     np_to_pa_dtype,
 )
 from cudf.utils.utils import buffers_from_pyarrow, mask_dtype
-
+from cudf.core.dtypes import make_dtype_from_obj
 
 class ColumnBase(Column, Serializable):
     def __init__(
@@ -878,6 +878,7 @@ def distinct_count(self, method="sort", dropna=True):
         return cpp_distinct_count(self, ignore_nulls=dropna)
 
     def astype(self, dtype, **kwargs):
+        dtype = make_dtype_from_obj(dtype)
         if dtype.is_categorical:
             return self.as_categorical_column(dtype, **kwargs)
         elif dtype.is_datetime:
@@ -1263,6 +1264,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
     * pyarrow array
     * pandas.Categorical objects
     """
+
     if isinstance(arbitrary, ColumnBase):
         if dtype is not None:
             return arbitrary.astype(dtype)
@@ -1552,7 +1554,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
             arbitrary = np.ascontiguousarray(arbitrary)
 
         if dtype is not None:
-            arbitrary = arbitrary.astype(dtype)
+            arbitrary = arbitrary.astype(dtype.to_numpy)
 
         if arb_dtype.kind == "M":
 
@@ -1575,8 +1577,10 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
                 data=buffer, mask=mask, dtype=arbitrary.dtype
             )
         elif arb_dtype.kind in ("O", "U"):
+
+            pa_data = pa.Array.from_pandas(arbitrary)
             data = as_column(
-                pa.Array.from_pandas(arbitrary), dtype=arbitrary.dtype
+                pa_data, dtype=make_dtype_from_obj(pa_data.type)
             )
             # There is no cast operation available for pa.Array from int to
             # str, Hence instead of handling in pa.Array block, we
@@ -1645,11 +1649,11 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
                     if dtype.is_categorical:
                         raise TypeError
 
-                pa_data = pa.array(arbitrary, type=dtype.pa_type, from_pandas=True if nan_as_null is None else nan_as_null)
-                data = as_column(pa_data, dtype=cudf.Dtype(pa_data.type), nan_as_null=nan_as_null)
+                pa_data = pa.array(arbitrary, type=dtype.pa_type if dtype is not None else None, from_pandas=True if nan_as_null is None else nan_as_null)
+                data = as_column(pa_data, dtype=make_dtype_from_obj(pa_data.type), nan_as_null=nan_as_null)
 
             except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
-                if dtype.is_categorical_dtype:
+                if dtype.is_categorical:
                     sr = pd.Series(arbitrary, dtype="category")
                     data = as_column(sr, nan_as_null=nan_as_null, dtype=dtype)
                 elif dtype.to_numpy == np.str_:
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 8d45af88626..958b22136dd 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -18,7 +18,7 @@
     np_to_pa_dtype,
     numeric_normalize_types,
 )
-
+from cudf.core.dtypes import make_dtype_from_obj
 
 class NumericalColumn(column.ColumnBase):
     def __init__(
@@ -32,7 +32,7 @@ def __init__(
             The dtype associated with the data Buffer
         mask : Buffer, optional
         """
-        dtype = cudf.Dtype(dtype)
+        dtype = make_dtype_from_obj(dtype)
         if data.size % dtype.itemsize:
             raise ValueError("Buffer size must be divisible by element size")
         if size is None:
@@ -139,7 +139,7 @@ def as_string_column(self, dtype, **kwargs):
 
         if len(self) > 0:
             return string._numeric_to_str_typecast_functions[
-                dtype.to_numpy
+                self.dtype
             ](self, **kwargs)
         else:
             return as_column([], dtype="object")
@@ -156,9 +156,12 @@ def as_datetime_column(self, dtype, **kwargs):
         )
 
     def as_numerical_column(self, dtype, **kwargs):
-        dtype = np.dtype(dtype)
+        # dtype = np.dtype(dtype)
+        # expect a cudf dtype always here
         if dtype == self.dtype:
             return self
+        import pdb
+        pdb.set_trace()
         return libcudf.unary.cast(self, dtype)
 
     def to_pandas(self, index=None, nullable_pd_dtype=False):
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index ceaf6b4ff3e..2b8e030d49e 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -134,46 +134,46 @@
     is_scalar,
     is_string_dtype,
 )
-from cudf.core.dtypes import Dtype
+from cudf.core.dtypes import make_dtype_from_obj
 
 _str_to_numeric_typecast_functions = {
-    Dtype("int8"): str_cast.stoi8,
-    Dtype("int16"): str_cast.stoi16,
-    Dtype("int32"): str_cast.stoi,
-    Dtype("int64"): str_cast.stol,
-    Dtype("uint8"): str_cast.stoui8,
-    Dtype("uint16"): str_cast.stoui16,
-    Dtype("uint32"): str_cast.stoui,
-    Dtype("uint64"): str_cast.stoul,
-    Dtype("float32"): str_cast.stof,
-    Dtype("float64"): str_cast.stod,
-    Dtype("bool"): str_cast.to_booleans,
+    make_dtype_from_obj("int8"): str_cast.stoi8,
+    make_dtype_from_obj("int16"): str_cast.stoi16,
+    make_dtype_from_obj("int32"): str_cast.stoi,
+    make_dtype_from_obj("int64"): str_cast.stol,
+    make_dtype_from_obj("uint8"): str_cast.stoui8,
+    make_dtype_from_obj("uint16"): str_cast.stoui16,
+    make_dtype_from_obj("uint32"): str_cast.stoui,
+    make_dtype_from_obj("uint64"): str_cast.stoul,
+    make_dtype_from_obj("float32"): str_cast.stof,
+    make_dtype_from_obj("float64"): str_cast.stod,
+    make_dtype_from_obj("bool"): str_cast.to_booleans,
     # TODO: support Date32 UNIX days
     # np.dtype("datetime64[D]"): str_cast.timestamp2int,
-    Dtype("datetime64[s]"): str_cast.timestamp2int,
-    Dtype("datetime64[ms]"): str_cast.timestamp2int,
-    Dtype("datetime64[us]"): str_cast.timestamp2int,
-    Dtype("datetime64[ns]"): str_cast.timestamp2int,
+    make_dtype_from_obj("datetime64[s]"): str_cast.timestamp2int,
+    make_dtype_from_obj("datetime64[ms]"): str_cast.timestamp2int,
+    make_dtype_from_obj("datetime64[us]"): str_cast.timestamp2int,
+    make_dtype_from_obj("datetime64[ns]"): str_cast.timestamp2int,
 }
 
 _numeric_to_str_typecast_functions = {
-    np.dtype("int8"): str_cast.i8tos,
-    np.dtype("int16"): str_cast.i16tos,
-    np.dtype("int32"): str_cast.itos,
-    np.dtype("int64"): str_cast.ltos,
-    np.dtype("uint8"): str_cast.ui8tos,
-    np.dtype("uint16"): str_cast.ui16tos,
-    np.dtype("uint32"): str_cast.uitos,
-    np.dtype("uint64"): str_cast.ultos,
-    np.dtype("float32"): str_cast.ftos,
-    np.dtype("float64"): str_cast.dtos,
-    np.dtype("bool"): str_cast.from_booleans,
+    make_dtype_from_obj(np.dtype("int8")): str_cast.i8tos,
+    make_dtype_from_obj(np.dtype("int16")): str_cast.i16tos,
+    make_dtype_from_obj(np.dtype("int32")): str_cast.itos,
+    make_dtype_from_obj(np.dtype("int64")): str_cast.ltos,
+    make_dtype_from_obj(np.dtype("uint8")): str_cast.ui8tos,
+    make_dtype_from_obj(np.dtype("uint16")): str_cast.ui16tos,
+    make_dtype_from_obj(np.dtype("uint32")): str_cast.uitos,
+    make_dtype_from_obj(np.dtype("uint64")): str_cast.ultos,
+    make_dtype_from_obj(np.dtype("float32")): str_cast.ftos,
+    make_dtype_from_obj(np.dtype("float64")): str_cast.dtos,
+    make_dtype_from_obj(np.dtype("bool")): str_cast.from_booleans,
     # TODO: support Date32 UNIX days
     # np.dtype("datetime64[D]"): str_cast.int2timestamp,
-    np.dtype("datetime64[s]"): str_cast.int2timestamp,
-    np.dtype("datetime64[ms]"): str_cast.int2timestamp,
-    np.dtype("datetime64[us]"): str_cast.int2timestamp,
-    np.dtype("datetime64[ns]"): str_cast.int2timestamp,
+    make_dtype_from_obj(np.dtype("datetime64[s]")): str_cast.int2timestamp,
+    make_dtype_from_obj(np.dtype("datetime64[ms]")): str_cast.int2timestamp,
+    make_dtype_from_obj(np.dtype("datetime64[us]")): str_cast.int2timestamp,
+    make_dtype_from_obj(np.dtype("datetime64[ns]")): str_cast.int2timestamp,
 }
 
 
@@ -4138,7 +4138,7 @@ def __init__(
             Two non-null columns containing the string data and offsets
             respectively
         """
-        dtype = np.dtype("object")
+        dtype = cudf.StringDtype()
 
         if size is None:
             for child in children:
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index fe4710a2de9..46d07bfcea8 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -56,14 +56,10 @@ class Dtype(ExtensionDtype, _Dtype):
     is_datetime = False
     is_list = False
     is_float = False
-
     pa_type = None
-    def __init__(self, arg):
-
-        cudf_dtype = make_dtype_from_obj(arg)
-        cudf_dtype.__init__(self)
-
     def __eq__(self, other):
+        if isinstance(other, self.__class__):
+            return True
         if isinstance(other, self.to_pandas.__class__) or other is self.to_pandas.__class__:
             return True
         if self.to_numpy == other:
@@ -195,40 +191,40 @@ def __init__(self):
 
 def make_dtype_from_string(obj):
     if obj in {'str', 'string', 'object'}:
-        return StringDtype
+        return StringDtype()
     elif 'datetime' in obj:
         if obj == 'datetime64[ns]':
-            return Datetime64NSDtype
+            return Datetime64NSDtype()
         elif obj == 'datetime64[us]':
-            return Datetime64USDtype
+            return Datetime64USDtype()
         elif obj == 'datetime64[ms]':
-            return Datetime64MSDtype
+            return Datetime64MSDtype()
         elif obj == 'datetime64[s]':
-            return Datetime64SDtype
+            return Datetime64SDtype()
     elif 'int' in obj or 'Int' in obj:
         if obj in {'int', 'Int', 'int64', 'Int64'}:
-            return Int64Dtype
+            return Int64Dtype()
         elif obj in {'int32', 'Int32'}:
-            return Int32Dtype
+            return Int32Dtype()
         elif obj in {'int16', 'Int16'}:
-            return Int16Dtype
+            return Int16Dtype()
         elif obj in {'int8', 'Int8'}:
-            return Int8Dtype
+            return Int8Dtype()
         elif obj in {'uint64', 'UInt64'}:
-            return UInt64Dtype
+            return UInt64Dtype()
         elif obj in {'uint32', 'UInt32'}:
-            return UInt32Dtype
+            return UInt32Dtype()
         elif obj in {'uint16', 'UInt16'}:
-            return UInt16Dtype
+            return UInt16Dtype()
         elif obj in {'uint8', 'Uint8'}:
-            return UInt8Dtype
+            return UInt8Dtype()
     elif 'float' in obj:
         if obj in {'float64', 'Float64'}:
-            return Float64Dtype
+            return Float64Dtype()
         elif obj in {'float32', 'Float32'}:
-            return Float32Dtype
+            return Float32Dtype()
     elif 'bool' in obj:
-        return BooleanDtype
+        return BooleanDtype()
 
 
 
@@ -246,6 +242,8 @@ def make_dtype_from_obj(obj):
         return pa_to_cudf_dtypes[obj]
     elif isinstance(obj, str):
         return make_dtype_from_string(obj)
+    else:
+        raise TypeError
 
 class CategoricalDtype(ExtensionDtype):
 
@@ -410,40 +408,40 @@ def __repr__(self):
 
 
 pa_to_cudf_dtypes = {
-    pa.uint8(): UInt8Dtype,
-    pa.uint16(): UInt16Dtype,
-    pa.uint32(): UInt32Dtype,
-    pa.uint64(): UInt64Dtype,
-    pa.int8(): Int8Dtype,
-    pa.int16(): Int16Dtype,
-    pa.int32(): Int32Dtype,
-    pa.int64(): Int64Dtype,
-    pa.bool_(): BooleanDtype,
-    pa.string(): StringDtype,
-    pa.float32(): Float32Dtype,
-    pa.float64(): Float64Dtype,
-    pa.timestamp('ns'): Datetime64NSDtype,
-    pa.timestamp('us'): Datetime64USDtype,
-    pa.timestamp('ms'): Datetime64MSDtype,
-    pa.timestamp('s'): Datetime64SDtype,
+    pa.uint8(): UInt8Dtype(),
+    pa.uint16(): UInt16Dtype(),
+    pa.uint32(): UInt32Dtype(),
+    pa.uint64(): UInt64Dtype(),
+    pa.int8(): Int8Dtype(),
+    pa.int16(): Int16Dtype(),
+    pa.int32(): Int32Dtype(),
+    pa.int64(): Int64Dtype(),
+    pa.bool_(): BooleanDtype(),
+    pa.string(): StringDtype(),
+    pa.float32(): Float32Dtype(),
+    pa.float64(): Float64Dtype(),
+    pa.timestamp('ns'): Datetime64NSDtype(),
+    pa.timestamp('us'): Datetime64USDtype(),
+    pa.timestamp('ms'): Datetime64MSDtype(),
+    pa.timestamp('s'): Datetime64SDtype(),
     None: Dtype
 }
 
 np_to_cudf_dtypes = {
-    np.dtype('int8'): Int8Dtype,
-    np.dtype('int16'): Int16Dtype,
-    np.dtype('int32'): Int32Dtype,
-    np.dtype('int64'): Int64Dtype,
-    np.dtype('uint8'): UInt8Dtype,
-    np.dtype('uint16'): UInt16Dtype,
-    np.dtype('uint32'): UInt32Dtype,
-    np.dtype('uint64'): UInt64Dtype,
-    np.dtype('bool'): BooleanDtype,
-    np.dtype('object'): StringDtype,
-    np.dtype('float32'): Float32Dtype,
-    np.dtype('float64'): Float64Dtype,
-    np.dtype('datetime64[ns]'): Datetime64NSDtype,
-    np.dtype('datetime64[us]'): Datetime64USDtype,
-    np.dtype('datetime64[ms]'): Datetime64MSDtype,
-    np.dtype('datetime64[s]'): Datetime64SDtype,
+    np.dtype('int8'): Int8Dtype(),
+    np.dtype('int16'): Int16Dtype(),
+    np.dtype('int32'): Int32Dtype(),
+    np.dtype('int64'): Int64Dtype(),
+    np.dtype('uint8'): UInt8Dtype(),
+    np.dtype('uint16'): UInt16Dtype(),
+    np.dtype('uint32'): UInt32Dtype(),
+    np.dtype('uint64'): UInt64Dtype(),
+    np.dtype('bool'): BooleanDtype(),
+    np.dtype('object'): StringDtype(),
+    np.dtype('float32'): Float32Dtype(),
+    np.dtype('float64'): Float64Dtype(),
+    np.dtype('datetime64[ns]'): Datetime64NSDtype(),
+    np.dtype('datetime64[us]'): Datetime64USDtype(),
+    np.dtype('datetime64[ms]'): Datetime64MSDtype(),
+    np.dtype('datetime64[s]'): Datetime64SDtype(),
 }
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index f4a026996ff..4eda63c345c 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -145,7 +145,8 @@ def __init__(
             ``null`` values.
             If ``False``, leaves ``np.nan`` values as is.
         """
-        dtype = Dtype(dtype)
+        from cudf.core.dtypes import make_dtype_from_obj
+        dtype = make_dtype_from_obj(dtype) if dtype is not None else None
         if isinstance(data, pd.Series):
             if name is None:
                 name = data.name
@@ -187,7 +188,6 @@ def __init__(
                 )
             else:
                 data = {}
-
         if not isinstance(data, column.ColumnBase):
             data = column.as_column(data, nan_as_null=nan_as_null, dtype=dtype)
 

From 7f6cb360cd7de8db6504afac443b63ca09772028 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 31 Jul 2020 08:56:15 -0700
Subject: [PATCH 11/80] more progress on columns, dtype object

---
 python/cudf/cudf/_lib/binaryop.pyx        | 11 ++-----
 python/cudf/cudf/core/column/column.py    | 13 +++++----
 python/cudf/cudf/core/column/datetime.py  |  7 +++--
 python/cudf/cudf/core/column/numerical.py | 33 +++++++++++----------
 python/cudf/cudf/core/dtypes.py           | 35 +++++++++++++++++++++--
 python/cudf/cudf/tests/test_column.py     |  4 +--
 6 files changed, 64 insertions(+), 39 deletions(-)

diff --git a/python/cudf/cudf/_lib/binaryop.pyx b/python/cudf/cudf/_lib/binaryop.pyx
index 7823ad20c5a..4323e1f4b79 100644
--- a/python/cudf/cudf/_lib/binaryop.pyx
+++ b/python/cudf/cudf/_lib/binaryop.pyx
@@ -27,6 +27,7 @@ from cudf.utils.dtypes import is_string_dtype
 
 from cudf._lib.cpp.binaryop cimport binary_operator
 cimport cudf._lib.cpp.binaryop as cpp_binaryop
+from cudf._lib.types cimport _Dtype
 
 
 class BinaryOperation(IntEnum):
@@ -170,19 +171,13 @@ def binaryop(lhs, rhs, op, dtype):
     """
     Dispatches a binary op call to the appropriate libcudf function:
     """
+    cdef _Dtype py_dtype = dtype
     op = BinaryOperation[op.upper()]
     cdef binary_operator c_op = <binary_operator> (
         <underlying_type_t_binary_operator> op
     )
-    cdef type_id tid = (
-        <type_id> (
-            <underlying_type_t_type_id> (
-                np_to_cudf_types[np.dtype(dtype)]
-            )
-        )
-    )
 
-    cdef data_type c_dtype = data_type(tid)
+    cdef data_type c_dtype = py_dtype.get_libcudf_type()
 
     if isinstance(lhs, Scalar) or np.isscalar(lhs) or lhs is None:
 
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 2365e904881..ae0c939f74c 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1129,9 +1129,9 @@ def build_column(
     offset : int, optional
     children : tuple, optional
     """
-    dtype = pd.api.types.pandas_dtype(dtype)
+    dtype = make_dtype_from_obj(dtype)
 
-    if is_categorical_dtype(dtype):
+    if dtype.is_categorical:
         if not len(children) == 1:
             raise ValueError(
                 "Must specify exactly one child column for CategoricalColumn"
@@ -1146,7 +1146,7 @@ def build_column(
             null_count=null_count,
             children=children,
         )
-    elif dtype.type is np.datetime64:
+    elif dtype.is_datetime:
         return cudf.core.column.DatetimeColumn(
             data=data,
             dtype=dtype,
@@ -1155,7 +1155,7 @@ def build_column(
             offset=offset,
             null_count=null_count,
         )
-    elif dtype.type in (np.object_, np.str_):
+    elif dtype.is_string:
         return cudf.core.column.StringColumn(
             mask=mask,
             size=size,
@@ -1265,6 +1265,8 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
     * pandas.Categorical objects
     """
 
+    dtype = make_dtype_from_obj(dtype) if dtype is not None else None
+
     if isinstance(arbitrary, ColumnBase):
         if dtype is not None:
             return arbitrary.astype(dtype)
@@ -1449,7 +1451,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
             )
             data = cudf.core.column.NumericalColumn(
                 data=padata,
-                dtype=dtype,
+                dtype=make_dtype_from_obj(arbitrary.type),
                 mask=pamask,
                 size=pa_size,
                 offset=pa_offset,
@@ -1648,7 +1650,6 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
                     dtype = pd.api.types.pandas_dtype(dtype)
                     if dtype.is_categorical:
                         raise TypeError
-
                 pa_data = pa.array(arbitrary, type=dtype.pa_type if dtype is not None else None, from_pandas=True if nan_as_null is None else nan_as_null)
                 data = as_column(pa_data, dtype=make_dtype_from_obj(pa_data.type), nan_as_null=nan_as_null)
 
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 939123f2474..271351c3890 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -11,6 +11,7 @@
 from cudf.core.column import column
 from cudf.utils import utils
 from cudf.utils.dtypes import is_scalar, np_to_pa_dtype
+from cudf.core.dtypes import make_dtype_from_obj
 
 # nanoseconds per time_unit
 _numpy_to_pandas_conversion = {
@@ -45,7 +46,7 @@ def __init__(
         mask : Buffer; optional
             The validity mask
         """
-        dtype = np.dtype(dtype)
+        dtype = make_dtype_from_obj(dtype)
         if data.size % dtype.itemsize:
             raise ValueError("Buffer size must be divisible by element size")
         if size is None:
@@ -60,7 +61,7 @@ def __init__(
             null_count=null_count,
         )
         assert self.dtype.type is np.datetime64
-        self._time_unit, _ = np.datetime_data(self.dtype)
+        self._time_unit, _ = np.datetime_data(self.dtype.to_numpy)
 
     def __contains__(self, item):
         # Handles improper item types
@@ -164,7 +165,7 @@ def as_string_column(self, dtype, **kwargs):
 
     def to_pandas(self, index=None, nullable_pd_dtype=False):
         return pd.Series(
-            self.to_array(fillna="pandas").astype(self.dtype), index=index
+            self.to_array(fillna="pandas").astype(self.dtype.to_pandas), index=index
         )
 
     def to_arrow(self):
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 958b22136dd..459461d94e5 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -70,26 +70,27 @@ def unary_operator(self, unaryop):
 
     def binary_operator(self, binop, rhs, reflect=False):
         int_dtypes = [
-            np.dtype("int8"),
-            np.dtype("int16"),
-            np.dtype("int32"),
-            np.dtype("int64"),
-            np.dtype("uint8"),
-            np.dtype("uint16"),
-            np.dtype("uint32"),
-            np.dtype("uint64"),
+            cudf.Int8Dtype(),
+            cudf.Int16Dtype(),
+            cudf.Int32Dtype(),
+            cudf.Int64Dtype(),
+            cudf.UInt8Dtype(),
+            cudf.UInt16Dtype(),
+            cudf.UInt32Dtype(),
+            cudf.UInt64Dtype(),
         ]
         tmp = rhs
         if reflect:
             tmp = self
         if isinstance(rhs, (NumericalColumn, Scalar)) or np.isscalar(rhs):
-            out_dtype = np.result_type(self.dtype, rhs.dtype)
+            out_dtype = np.result_type(make_dtype_from_obj(self.dtype).to_numpy, make_dtype_from_obj(rhs.dtype).to_numpy)
+            out_dtype = make_dtype_from_obj(out_dtype)
             if binop in ["mod", "floordiv"]:
                 if (tmp.dtype in int_dtypes) and (
                     (np.isscalar(tmp) and (0 == tmp))
                     or ((isinstance(tmp, NumericalColumn)) and (0.0 in tmp))
                 ):
-                    out_dtype = np.dtype("float_")
+                    out_dtype = cudf.Float64Dtype()
         elif rhs is None:
             out_dtype = self.dtype
         else:
@@ -160,8 +161,6 @@ def as_numerical_column(self, dtype, **kwargs):
         # expect a cudf dtype always here
         if dtype == self.dtype:
             return self
-        import pdb
-        pdb.set_trace()
         return libcudf.unary.cast(self, dtype)
 
     def to_pandas(self, index=None, nullable_pd_dtype=False):
@@ -198,14 +197,13 @@ def to_arrow(self):
         if self.nullable:
             mask = pa.py_buffer(self.mask_array_view.copy_to_host())
         data = pa.py_buffer(self.data_array_view.copy_to_host())
-        pa_dtype = np_to_pa_dtype(self.dtype)
         out = pa.Array.from_buffers(
-            type=pa_dtype,
+            type=self.dtype.pa_type,
             length=len(self),
             buffers=[mask, data],
             null_count=self.null_count,
         )
-        if self.dtype == np.bool:
+        if self.dtype.is_boolean:
             return out.cast(pa.bool_())
         else:
             return out
@@ -312,7 +310,8 @@ def fillna(self, fill_value):
         """
         if np.isscalar(fill_value):
             # castsafely to the same dtype as self
-            fill_value_casted = self.dtype.type(fill_value)
+            # TODO - produce a libcudf scalar directly
+            fill_value_casted = self.dtype.to_numpy.type(fill_value)
             if not np.isnan(fill_value) and (fill_value_casted != fill_value):
                 raise TypeError(
                     "Cannot safely cast non-equivalent {} to {}".format(
@@ -455,7 +454,7 @@ def _numeric_column_binop(lhs, rhs, op, out_dtype, reflect=False):
 
     if is_op_comparison:
-        out_dtype = "bool"
+        out_dtype = make_dtype_from_obj("bool")  # binaryop() is typed to take a cudf dtype, not a str
 
     out = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype)
 
     if is_op_comparison:
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 46d07bfcea8..2d8c22307f5 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -60,8 +60,11 @@ class Dtype(ExtensionDtype, _Dtype):
     def __eq__(self, other):
         if isinstance(other, self.__class__):
             return True
+        if isinstance(other, Dtype) and not isinstance(other, self.__class__):
+            return False
         if isinstance(other, self.to_pandas.__class__) or other is self.to_pandas.__class__:
             return True
+        
         if self.to_numpy == other:
             return True
         raise NotImplementedError
@@ -80,11 +83,16 @@ def itemsize(self):
 
     @property
     def type(self):
-        return self.to_pandas.type
+        if self.is_float or self.is_datetime:
+            return self.to_numpy.type  # scalar class, not the one-letter kind code
+        return self.to_pandas.type  # all other dtypes take the pandas dtype's scalar class
 
     @property
     def kind(self):
-        return self.to_pandas.kind
+        if self.is_float:
+            return 'f'
+        else:
+            return self.to_pandas.kind
 
     @property
     def name(self):
@@ -242,8 +250,16 @@ def make_dtype_from_obj(obj):
         return pa_to_cudf_dtypes[obj]
     elif isinstance(obj, str):
         return make_dtype_from_string(obj)
+    elif obj in pd_to_cudf_dtypes.keys():
+        return pd_to_cudf_dtypes[obj]
     else:
-        raise TypeError
+        try:
+            is_np_scalar = issubclass(obj, np.generic)
+        except TypeError:  # issubclass() raises when obj is not a class
+            is_np_scalar = False
+        if is_np_scalar and np.dtype(obj) in np_to_cudf_dtypes:
+            return np_to_cudf_dtypes[np.dtype(obj)]
+        raise TypeError('cant transform this object to a cudf dtype. ')
 
 class CategoricalDtype(ExtensionDtype):
 
@@ -445,3 +461,16 @@ def __repr__(self):
     np.dtype('datetime64[ms]'): Datetime64MSDtype(),
     np.dtype('datetime64[s]'): Datetime64SDtype(),
 }
+
+pd_to_cudf_dtypes = {
+    pd.Int8Dtype(): Int8Dtype(),
+    pd.Int16Dtype(): Int16Dtype(),
+    pd.Int32Dtype(): Int32Dtype(),
+    pd.Int64Dtype(): Int64Dtype(),
+    pd.UInt8Dtype(): UInt8Dtype(),
+    pd.UInt16Dtype(): UInt16Dtype(),
+    pd.UInt32Dtype(): UInt32Dtype(),
+    pd.UInt64Dtype(): UInt64Dtype(),
+    pd.BooleanDtype(): BooleanDtype(),
+    pd.StringDtype(): StringDtype()
+}
diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py
index 60165b51fc4..87b643de9a7 100644
--- a/python/cudf/cudf/tests/test_column.py
+++ b/python/cudf/cudf/tests/test_column.py
@@ -44,10 +44,10 @@ def test_column_offset_and_size(pandas_input, offset, size):
         children=col.base_children,
     )
 
-    if cudf.utils.dtypes.is_categorical_dtype(col.dtype):
+    if col.dtype.is_categorical:
         assert col.size == col.codes.size
         assert col.size == (col.codes.data.size / col.codes.dtype.itemsize)
-    elif pd.api.types.is_string_dtype(col.dtype):
+    elif col.dtype.is_string:
         assert col.size == (col.children[0].size - 1)
         assert col.size == (
             (col.children[0].data.size / col.children[0].dtype.itemsize) - 1

From a81c3681f3ae343b8aa01817ad19ce031eb7ebfd Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 31 Jul 2020 11:40:37 -0700
Subject: [PATCH 12/80] forgot string O

---
 python/cudf/cudf/core/dtypes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 2d8c22307f5..ad0bf4c2a37 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -198,7 +198,7 @@ def __init__(self):
         self._name = "String"
 
 def make_dtype_from_string(obj):
-    if obj in {'str', 'string', 'object'}:
+    if obj in {'str', 'string', 'object', 'O'}:  # 'O' is numpy's code for object
         return StringDtype()
     elif 'datetime' in obj:
         if obj == 'datetime64[ns]':

From 572c39f950a32f4f188d9eff205ecae10df096b1 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 31 Jul 2020 13:27:41 -0700
Subject: [PATCH 13/80] Use Dtype flags in Column._concat; make CategoricalDtype a Dtype

---
 python/cudf/cudf/core/column/column.py |  14 +-
 python/cudf/cudf/core/dtypes.py        | 193 +++++++++++++++----------
 python/cudf/cudf/tests/test_column.py  |   2 +-
 3 files changed, 121 insertions(+), 88 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index ae0c939f74c..84cb50afecf 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -187,7 +187,7 @@ def _concat(cls, objs, dtype=None):
 
         if len(objs) == 0:
             dtype = pd.api.types.pandas_dtype(dtype)
-            if is_categorical_dtype(dtype):
+            if dtype.is_categorical:
                 dtype = CategoricalDtype()
             return column_empty(0, dtype=dtype, masked=True)
 
@@ -200,18 +200,18 @@ def _concat(cls, objs, dtype=None):
                 [
                     o
                     for o in not_null_cols
-                    if not is_numerical_dtype(o.dtype)
-                    or np.issubdtype(o.dtype, np.datetime64)
+                    if not o.dtype.is_numeric
+                    or o.dtype.is_datetime
                 ]
             )
             == 0
         ):
-            col_dtypes = [o.dtype for o in not_null_cols]
+            np_col_dtypes = [o.dtype.to_numpy for o in not_null_cols]
             # Use NumPy to find a common dtype
-            common_dtype = np.find_common_type(col_dtypes, [])
+            np_common_dtype = np.find_common_type(np_col_dtypes, [])
             # Cast all columns to the common dtype
             for i in range(len(objs)):
-                objs[i] = objs[i].astype(common_dtype)
+                objs[i] = objs[i].astype(make_dtype_from_obj(np_common_dtype))
 
         # Find the first non-null column:
         head = objs[0]
@@ -232,7 +232,7 @@ def _concat(cls, objs, dtype=None):
                     raise ValueError("All columns must be the same type")
 
         cats = None
-        is_categorical = all(is_categorical_dtype(o.dtype) for o in objs)
+        is_categorical = all(o.dtype.is_categorical for o in objs)
 
         # Combine CategoricalColumn categories
         if is_categorical:
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index ad0bf4c2a37..cf2614c8017 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -22,32 +22,33 @@
     pa.string(): pd.StringDtype(),
     pa.float32(): np.float32(),
     pa.float64(): np.float64(),
-    pa.timestamp('ns'): np.dtype('datetime64[ns]'),
-    pa.timestamp('us'): np.dtype('datetime64[us]'),
-    pa.timestamp('ms'): np.dtype('datetime64[ms]'),
-    pa.timestamp('s'): np.dtype('datetime64[s]'),
+    pa.timestamp("ns"): np.dtype("datetime64[ns]"),
+    pa.timestamp("us"): np.dtype("datetime64[us]"),
+    pa.timestamp("ms"): np.dtype("datetime64[ms]"),
+    pa.timestamp("s"): np.dtype("datetime64[s]"),
 }
 
 pa_to_np_dtypes = {
-    pa.uint8(): np.dtype('uint8'),
-    pa.uint16(): np.dtype('uint16'),
-    pa.uint32(): np.dtype('uint32'),
-    pa.uint64(): np.dtype('uint64'),
-    pa.int8(): np.dtype('int8'),
-    pa.int16(): np.dtype('int16'),
-    pa.int32(): np.dtype('int32'),
-    pa.int64(): np.dtype('int64'),
-    pa.bool_(): np.dtype('bool'),
-    pa.string(): np.dtype('object'),
-    pa.float32(): np.dtype('float32'),
-    pa.float64(): np.dtype('float64'),
-    pa.timestamp('ns'): np.dtype('datetime64[ns]'),
-    pa.timestamp('us'): np.dtype('datetime64[us]'),
-    pa.timestamp('ms'): np.dtype('datetime64[ms]'),
-    pa.timestamp('s'): np.dtype('datetime64[s]'),
-    None: None
+    pa.uint8(): np.dtype("uint8"),
+    pa.uint16(): np.dtype("uint16"),
+    pa.uint32(): np.dtype("uint32"),
+    pa.uint64(): np.dtype("uint64"),
+    pa.int8(): np.dtype("int8"),
+    pa.int16(): np.dtype("int16"),
+    pa.int32(): np.dtype("int32"),
+    pa.int64(): np.dtype("int64"),
+    pa.bool_(): np.dtype("bool"),
+    pa.string(): np.dtype("object"),
+    pa.float32(): np.dtype("float32"),
+    pa.float64(): np.dtype("float64"),
+    pa.timestamp("ns"): np.dtype("datetime64[ns]"),
+    pa.timestamp("us"): np.dtype("datetime64[us]"),
+    pa.timestamp("ms"): np.dtype("datetime64[ms]"),
+    pa.timestamp("s"): np.dtype("datetime64[s]"),
+    None: None,
 }
 
+
 class Dtype(ExtensionDtype, _Dtype):
     is_integer = False
     is_string = False
@@ -56,18 +57,25 @@ class Dtype(ExtensionDtype, _Dtype):
     is_datetime = False
     is_list = False
     is_float = False
+    is_numeric = False
     pa_type = None
+
     def __eq__(self, other):
         if isinstance(other, self.__class__):
             return True
         if isinstance(other, Dtype) and not isinstance(other, self.__class__):
             return False
-        if isinstance(other, self.to_pandas.__class__) or other is self.to_pandas.__class__:
+        if (
+            isinstance(other, self.to_pandas.__class__)
+            or other is self.to_pandas.__class__
+        ):
             return True
-        
+
         if self.to_numpy == other:
             return True
-        raise NotImplementedError
+        if isinstance(other, str) and str(self.to_numpy) == other:
+            return True
+        return False
 
     @property
     def to_numpy(self):
@@ -85,12 +93,13 @@ def itemsize(self):
     def type(self):
         if self.is_float or self.is_datetime:
             return self.to_numpy.kind
-        else: self.to_pandas.type
+        else:
+            return self.to_pandas.type
 
     @property
     def kind(self):
         if self.is_float:
-            return 'f'
+            return "f"
         else:
             return self.to_pandas.kind
 
@@ -104,147 +113,169 @@ def __repr__(self):
     def __hash__(self):
         return hash(self.__repr__())
 
-    
 
 class IntDtype(Dtype):
     is_integer = True
+    is_numeric = True
+
 
 class UInt8Dtype(IntDtype):
     def __init__(self):
         self.pa_type = pa.uint8()
         self._name = "UInt8"
-        
+
+
 class UInt16Dtype(IntDtype):
     def __init__(self):
         self.pa_type = pa.uint16()
         self._name = "UInt16"
 
+
 class UInt32Dtype(IntDtype):
     def __init__(self):
         self.pa_type = pa.uint32()
         self._name = "UInt32"
 
+
 class UInt64Dtype(IntDtype):
     def __init__(self):
         self.pa_type = pa.uint64()
         self._name = "UInt64"
 
+
 class Int8Dtype(IntDtype):
     def __init__(self):
         self.pa_type = pa.int8()
         self._name = "Int8"
 
+
 class Int16Dtype(IntDtype):
     def __init__(self):
         self.pa_type = pa.int16()
         self._name = "Int16"
 
+
 class Int32Dtype(IntDtype):
     def __init__(self):
         self.pa_type = pa.int32()
         self._name = "Int32"
 
+
 class Int64Dtype(IntDtype):
     def __init__(self):
         self.pa_type = pa.int64()
         self._name = "Int64"
 
+
 class FloatDtype(Dtype):
     is_float = True
+    is_numeric = True
+
 
 class Float32Dtype(FloatDtype):
     def __init__(self):
         self.pa_type = pa.float32()
         self._name = "Float32"
 
+
 class Float64Dtype(FloatDtype):
     def __init__(self):
         self.pa_type = pa.float64()
         self._name = "Float64"
 
+
 class BooleanDtype(Dtype):
     is_boolean = True
+
     def __init__(self):
         self.pa_type = pa.bool_()
         self._name = "Boolean"
 
+
 class DatetimeDtype(Dtype):
     is_datetime = True
 
+
 class Datetime64NSDtype(DatetimeDtype):
     def __init__(self):
-        self.pa_type = pa.timestamp('ns')
+        self.pa_type = pa.timestamp("ns")
         self._name = "Datetime64NS"
 
+
 class Datetime64USDtype(DatetimeDtype):
     def __init__(self):
-        self.pa_type = pa.timestamp('us')
+        self.pa_type = pa.timestamp("us")
         self._name = "Datetime64US"
 
+
 class Datetime64MSDtype(DatetimeDtype):
     def __init__(self):
-        self.pa_type = pa.timestamp('ms')
+        self.pa_type = pa.timestamp("ms")
         self._name = "Datetime64MS"
 
+
 class Datetime64SDtype(DatetimeDtype):
     def __init__(self):
-        self.pa_type = pa.timestamp('s')
+        self.pa_type = pa.timestamp("s")
         self._name = "Datetime64S"
 
+
 class StringDtype(Dtype):
     is_string = True
+
     def __init__(self):
         self.pa_type = pa.string()
         self._name = "String"
 
+
 def make_dtype_from_string(obj):
-    import pdb
-    pdb.set_trace()
-    if obj in {'str', 'string', 'object', 'O'}:
+    if obj in {"str", "string", "object", "O"}:
         return StringDtype()
-    elif 'datetime' in obj:
-        if obj == 'datetime64[ns]':
+    elif "datetime" in obj:
+        if obj == "datetime64[ns]":
             return Datetime64NSDtype()
-        elif obj == 'datetime64[us]':
+        elif obj == "datetime64[us]":
             return Datetime64USDtype()
-        elif obj == 'datetime64[ms]':
+        elif obj == "datetime64[ms]":
             return Datetime64MSDtype()
-        elif obj == 'datetime64[s]':
+        elif obj == "datetime64[s]":
             return Datetime64SDtype()
-    elif 'int' in obj or 'Int' in obj:
-        if obj in {'int', 'Int', 'int64', 'Int64'}:
+    elif "int" in obj or "Int" in obj:
+        if obj in {"int", "Int", "int64", "Int64"}:
             return Int64Dtype()
-        elif obj in {'int32', 'Int32'}:
+        elif obj in {"int32", "Int32"}:
             return Int32Dtype()
-        elif obj in {'int16', 'Int16'}:
+        elif obj in {"int16", "Int16"}:
             return Int16Dtype()
-        elif obj in {'int8', 'Int8'}:
+        elif obj in {"int8", "Int8"}:
             return Int8Dtype()
-        elif obj in {'uint64', 'UInt64'}:
+        elif obj in {"uint64", "UInt64"}:
             return UInt64Dtype()
-        elif obj in {'uint32', 'UInt32'}:
+        elif obj in {"uint32", "UInt32"}:
             return UInt32Dtype()
-        elif obj in {'uint16', 'UInt16'}:
+        elif obj in {"uint16", "UInt16"}:
             return UInt16Dtype()
-        elif obj in {'uint8', 'Uint8'}:
+        elif obj in {"uint8", "Uint8"}:
             return UInt8Dtype()
-    elif 'float' in obj:
-        if obj in {'float64', 'Float64'}:
+    elif "float" in obj:
+        if obj in {"float64", "Float64"}:
             return Float64Dtype()
-        elif obj in {'float32', 'Float32'}:
+        elif obj in {"float32", "Float32"}:
             return Float32Dtype()
-    elif 'bool' in obj:
+    elif "bool" in obj:
         return BooleanDtype()
 
 
-
 def make_dtype_from_numpy(obj):
     np_to_pd_types = {v: k for k, v in pd_to_np_dtypes.items()}
     result = np_to_pd_types.get(obj)
     return result
 
+
 def make_dtype_from_obj(obj):
-    if isinstance(obj, Dtype):
+    if isinstance(obj, CategoricalDtype):
+        return obj
+    elif isinstance(obj, Dtype):
         return np_to_cudf_dtypes[obj.to_numpy]
     if isinstance(obj, np.dtype):
         return np_to_cudf_dtypes[obj]
@@ -260,12 +291,14 @@ def make_dtype_from_obj(obj):
                 return np_to_cudf_dtypes[np.dtype(obj)]
         except:
             import pdb
+
             pdb.set_trace()
-            raise TypeError('cant transform this object to a cudf dtype. ')
+            raise TypeError("cant transform this object to a cudf dtype. ")
+
 
-class CategoricalDtype(ExtensionDtype):
+class CategoricalDtype(Dtype):
 
-    is_categorical_dtype = True
+    is_categorical = True
 
     def __init__(self, categories=None, ordered=None):
         """
@@ -438,30 +471,30 @@ def __repr__(self):
     pa.string(): StringDtype(),
     pa.float32(): Float32Dtype(),
     pa.float64(): Float64Dtype(),
-    pa.timestamp('ns'): Datetime64NSDtype(),
-    pa.timestamp('us'): Datetime64USDtype(),
-    pa.timestamp('ms'): Datetime64MSDtype(),
-    pa.timestamp('s'): Datetime64SDtype(),
-    None: Dtype
+    pa.timestamp("ns"): Datetime64NSDtype(),
+    pa.timestamp("us"): Datetime64USDtype(),
+    pa.timestamp("ms"): Datetime64MSDtype(),
+    pa.timestamp("s"): Datetime64SDtype(),
+    None: Dtype,
 }
 
 np_to_cudf_dtypes = {
-    np.dtype('int8'): Int8Dtype(),
-    np.dtype('int16'): Int16Dtype(),
-    np.dtype('int32'): Int32Dtype(),
-    np.dtype('int64'): Int64Dtype(),
-    np.dtype('uint8'): UInt8Dtype(),
-    np.dtype('uint16'): UInt16Dtype(),
-    np.dtype('uint32'): UInt32Dtype(),
-    np.dtype('uint64'): UInt64Dtype(),
-    np.dtype('bool'): BooleanDtype(),
-    np.dtype('object'): StringDtype(),
-    np.dtype('float32'): Float32Dtype(),
-    np.dtype('float64'): Float64Dtype(),
-    np.dtype('datetime64[ns]'): Datetime64NSDtype(),
-    np.dtype('datetime64[us]'): Datetime64USDtype(),
-    np.dtype('datetime64[ms]'): Datetime64MSDtype(),
-    np.dtype('datetime64[s]'): Datetime64SDtype(),
+    np.dtype("int8"): Int8Dtype(),
+    np.dtype("int16"): Int16Dtype(),
+    np.dtype("int32"): Int32Dtype(),
+    np.dtype("int64"): Int64Dtype(),
+    np.dtype("uint8"): UInt8Dtype(),
+    np.dtype("uint16"): UInt16Dtype(),
+    np.dtype("uint32"): UInt32Dtype(),
+    np.dtype("uint64"): UInt64Dtype(),
+    np.dtype("bool"): BooleanDtype(),
+    np.dtype("object"): StringDtype(),
+    np.dtype("float32"): Float32Dtype(),
+    np.dtype("float64"): Float64Dtype(),
+    np.dtype("datetime64[ns]"): Datetime64NSDtype(),
+    np.dtype("datetime64[us]"): Datetime64USDtype(),
+    np.dtype("datetime64[ms]"): Datetime64MSDtype(),
+    np.dtype("datetime64[s]"): Datetime64SDtype(),
 }
 
 pd_to_cudf_dtypes = {
@@ -474,5 +507,5 @@ def __repr__(self):
     pd.UInt32Dtype(): UInt32Dtype(),
     pd.UInt64Dtype(): UInt64Dtype(),
     pd.BooleanDtype(): BooleanDtype(),
-    pd.StringDtype(): StringDtype()
+    pd.StringDtype(): StringDtype(),
 }
diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py
index 87b643de9a7..7cb1509cb7f 100644
--- a/python/cudf/cudf/tests/test_column.py
+++ b/python/cudf/cudf/tests/test_column.py
@@ -66,7 +66,7 @@ def test_column_offset_and_size(pandas_input, offset, size):
 
     slicer = slice(offset, size)
     expect = pandas_input.iloc[slicer].reset_index(drop=True)
-
+    print(got)
     assert_eq(expect, got)
 
 

From 4f6f316b670cc65c62b359234bd819dafbf6d42f Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 3 Aug 2020 12:45:34 -0700
Subject: [PATCH 14/80] Rename Dtype to Generic and add dtype class hierarchy; column tests pass

---
 python/cudf/cudf/__init__.py              |  9 ++-
 python/cudf/cudf/_lib/column.pyx          |  2 +-
 python/cudf/cudf/_lib/types.pxd           |  2 +-
 python/cudf/cudf/_lib/types.pyx           |  2 +-
 python/cudf/cudf/core/column/column.py    | 60 +++++++-------
 python/cudf/cudf/core/column/datetime.py  |  8 +-
 python/cudf/cudf/core/column/numerical.py |  8 +-
 python/cudf/cudf/core/column/string.py    | 63 +++++++--------
 python/cudf/cudf/core/dtypes.py           | 97 +++++++++++------------
 python/cudf/cudf/core/series.py           |  5 +-
 python/cudf/cudf/tests/test_column.py     |  4 +-
 python/cudf/cudf/utils/utils.py           |  2 +-
 12 files changed, 130 insertions(+), 132 deletions(-)

diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
index 38b31e5e7b7..8d16edafa64 100644
--- a/python/cudf/cudf/__init__.py
+++ b/python/cudf/cudf/__init__.py
@@ -32,7 +32,12 @@
     merge,
 )
 from cudf.core.dtypes import (
-    Dtype,
+    dtype,
+    Generic,
+    Datetime,
+    Floating,
+    Number,
+    Flexible,
     CategoricalDtype, 
     Int8Dtype,
     Int16Dtype, 
@@ -49,7 +54,7 @@
     Datetime64NSDtype,
     Datetime64USDtype, 
     Datetime64MSDtype,
-    Datetime64SDtype
+    Datetime64SDtype,
 )
 
 from cudf.core.groupby import Grouper
diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
index 7cf3549ed1c..5b48d92f1b6 100644
--- a/python/cudf/cudf/_lib/column.pyx
+++ b/python/cudf/cudf/_lib/column.pyx
@@ -349,7 +349,7 @@ cdef class Column:
         return self._view(c_null_count)
 
     cdef column_view _view(self, libcudf_types.size_type null_count) except *:
-        if self.dtype.is_categorical:
+        if is_categorical_dtype(self.dtype):
             col = self.base_children[0]
         else:
             col = self
diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd
index f6c0c39174a..54b5d8cfae4 100644
--- a/python/cudf/cudf/_lib/types.pxd
+++ b/python/cudf/cudf/_lib/types.pxd
@@ -12,4 +12,4 @@ ctypedef int32_t underlying_type_t_type_id
 ctypedef bool underlying_type_t_null_policy
 
 cdef class _Dtype:
-    cdef data_type get_libcudf_type(self)
+    cdef data_type get_libcudf_type(self) except *
diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx
index fc8d4fada22..53bae8d7389 100644
--- a/python/cudf/cudf/_lib/types.pyx
+++ b/python/cudf/cudf/_lib/types.pyx
@@ -123,7 +123,7 @@ class NullHandling(IntEnum):
 
 
 cdef class _Dtype:
-    cdef data_type get_libcudf_type(self):
+    cdef data_type get_libcudf_type(self) except *:
         np_dtype = self.to_numpy
         cdef libcudf_types.type_id tid = <libcudf_types.type_id> (
                 <underlying_type_t_type_id> (
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 84cb50afecf..f35687bb642 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -37,7 +37,6 @@
     np_to_pa_dtype,
 )
 from cudf.utils.utils import buffers_from_pyarrow, mask_dtype
-from cudf.core.dtypes import make_dtype_from_obj
 
 class ColumnBase(Column, Serializable):
     def __init__(
@@ -81,10 +80,10 @@ def data_array_view(self):
         """
         View the data as a device array object
         """
-        if self.dtype.is_string:
+        if isinstance(self.dtype, cudf.StringDtype):
             raise ValueError("Cannot get an array view of a StringColumn")
 
-        if self.dtype.is_categorical:
+        if is_categorical_dtype(self.dtype):
             return self.codes.data_array_view
         else:
             dtype = self.dtype
@@ -187,7 +186,7 @@ def _concat(cls, objs, dtype=None):
 
         if len(objs) == 0:
             dtype = pd.api.types.pandas_dtype(dtype)
-            if dtype.is_categorical:
+            if is_categorical_dtype(dtype):
                 dtype = CategoricalDtype()
             return column_empty(0, dtype=dtype, masked=True)
 
@@ -200,8 +199,7 @@ def _concat(cls, objs, dtype=None):
                 [
                     o
                     for o in not_null_cols
-                    if not o.dtype.is_numeric
-                    or o.dtype.is_datetime
+                    if not isinstance(o.dtype, (cudf.Number, cudf.Datetime))
                 ]
             )
             == 0
@@ -211,7 +209,7 @@ def _concat(cls, objs, dtype=None):
             np_common_dtype = np.find_common_type(np_col_dtypes, [])
             # Cast all columns to the common dtype
             for i in range(len(objs)):
-                objs[i] = objs[i].astype(make_dtype_from_obj(np_common_dtype))
+                objs[i] = objs[i].astype(cudf.dtype(np_common_dtype))
 
         # Find the first non-null column:
         head = objs[0]
@@ -232,7 +230,7 @@ def _concat(cls, objs, dtype=None):
                     raise ValueError("All columns must be the same type")
 
         cats = None
-        is_categorical = all(o.dtype.is_categorical for o in objs)
+        is_categorical = all(is_categorical_dtype(o.dtype) for o in objs)
 
         # Combine CategoricalColumn categories
         if is_categorical:
@@ -878,12 +876,12 @@ def distinct_count(self, method="sort", dropna=True):
         return cpp_distinct_count(self, ignore_nulls=dropna)
 
     def astype(self, dtype, **kwargs):
-        dtype = make_dtype_from_obj(dtype)
-        if dtype.is_categorical:
+        dtype = cudf.dtype(dtype)
+        if is_categorical_dtype(dtype):
             return self.as_categorical_column(dtype, **kwargs)
-        elif dtype.is_datetime:
+        elif isinstance(dtype, cudf.Datetime):
             return self.as_datetime_column(dtype, **kwargs)
-        elif dtype.is_string:
+        elif isinstance(dtype, cudf.StringDtype):
             return self.as_string_column(dtype, **kwargs)
         else:
             return self.as_numerical_column(dtype, **kwargs)
@@ -1129,9 +1127,9 @@ def build_column(
     offset : int, optional
     children : tuple, optional
     """
-    dtype = make_dtype_from_obj(dtype)
+    dtype = cudf.dtype(dtype)
 
-    if dtype.is_categorical:
+    if is_categorical_dtype(dtype):
         if not len(children) == 1:
             raise ValueError(
                 "Must specify exactly one child column for CategoricalColumn"
@@ -1146,7 +1144,7 @@ def build_column(
             null_count=null_count,
             children=children,
         )
-    elif dtype.is_datetime:
+    elif isinstance(dtype, cudf.Datetime):
         return cudf.core.column.DatetimeColumn(
             data=data,
             dtype=dtype,
@@ -1155,7 +1153,7 @@ def build_column(
             offset=offset,
             null_count=null_count,
         )
-    elif dtype.is_string:
+    elif isinstance(dtype, cudf.StringDtype):
         return cudf.core.column.StringColumn(
             mask=mask,
             size=size,
@@ -1265,7 +1263,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
     * pandas.Categorical objects
     """
 
-    dtype = make_dtype_from_obj(dtype) if dtype is not None else None
+    dtype = cudf.dtype(dtype) if dtype is not None else None
 
     if isinstance(arbitrary, ColumnBase):
         if dtype is not None:
@@ -1324,11 +1322,11 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
 
         if isinstance(col, cudf.core.column.CategoricalColumn):
             return col
-        elif col.dtype.is_float:
+        elif isinstance(col.dtype, cudf.Floating):
             if nan_as_null or (mask is None and nan_as_null is None):
                 mask = libcudf.transform.nans_to_nulls(col.fillna(np.nan))
                 col = col.set_mask(mask)
-        elif col.dtype.is_datetime:
+        elif isinstance(col.dtype, cudf.Datetime):
             if nan_as_null or (mask is None and nan_as_null is None):
                 col = utils.time_col_replace_nulls(col)
         return col
@@ -1451,7 +1449,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
             )
             data = cudf.core.column.NumericalColumn(
                 data=padata,
-                dtype=make_dtype_from_obj(arbitrary.type),
+                dtype=cudf.dtype(arbitrary.type),
                 mask=pamask,
                 size=pa_size,
                 offset=pa_offset,
@@ -1517,15 +1515,15 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
         ):
             arbitrary = None
             if dtype is None:
-                dtype = np.dtype("float64")
+                dtype = cudf.Float64Dtype()
 
         data = as_column(
             utils.scalar_broadcast_to(arbitrary, length, dtype=dtype)
         )
         if not nan_as_null:
-            if np.issubdtype(data.dtype, np.floating):
+            if isinstance(data.dtype, cudf.Floating):
                 data = data.fillna(np.nan)
-            elif np.issubdtype(data.dtype, np.datetime64):
+            elif isinstance(data.dtype, cudf.Datetime):
                 data = data.fillna(np.datetime64("NaT"))
 
     elif hasattr(arbitrary, "__array_interface__"):
@@ -1582,7 +1580,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
 
             pa_data = pa.Array.from_pandas(arbitrary)
             data = as_column(
-                pa_data, dtype=make_dtype_from_obj(pa_data.type)
+                pa_data, dtype=cudf.dtype(pa_data.type)
             )
             # There is no cast operation available for pa.Array from int to
             # str, Hence instead of handling in pa.Array block, we
@@ -1633,9 +1631,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
         data = as_column(data, dtype=cudf_dtype)
 
         mask = arbitrary._mask
-        mask = bools_to_mask(
-            as_column(mask).binary_operator("eq", np.bool_(False))
-        )
+        mask = bools_to_mask(as_column(mask).unary_operator("not"))
 
         data = data.set_mask(mask)
 
@@ -1648,20 +1644,20 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
             try:
                 if dtype is not None:
                     dtype = pd.api.types.pandas_dtype(dtype)
-                    if dtype.is_categorical:
+                    if is_categorical_dtype(dtype):
                         raise TypeError
                 pa_data = pa.array(arbitrary, type=dtype.pa_type if dtype is not None else None, from_pandas=True if nan_as_null is None else nan_as_null)
-                data = as_column(pa_data, dtype=make_dtype_from_obj(pa_data.type), nan_as_null=nan_as_null)
+                data = as_column(pa_data, dtype=cudf.dtype(pa_data.type), nan_as_null=nan_as_null)
 
             except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
-                if dtype.is_categorical:
+                if is_categorical_dtype(dtype):
                     sr = pd.Series(arbitrary, dtype="category")
                     data = as_column(sr, nan_as_null=nan_as_null, dtype=dtype)
-                elif dtype.to_numpy == np.str_:
+                elif isinstance(cudf.dtype(dtype), cudf.StringDtype):
                     sr = pd.Series(arbitrary, dtype="str")
                     data = as_column(sr, nan_as_null=nan_as_null)
                 else:
-                    native_dtype = dtype.to_numpy
+                    native_dtype = dtype.to_numpy if dtype is not None else None
                     if dtype is None and pd.api.types.infer_dtype(
                         arbitrary
                     ) in ("mixed", "mixed-integer"):
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 271351c3890..3e3c5c5f077 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -5,13 +5,13 @@
 import pandas as pd
 import pyarrow as pa
 
+import cudf.core.dtypes as cudf_dtypes
 from cudf import _lib as libcudf
 from cudf._lib.nvtx import annotate
 from cudf.core.buffer import Buffer
 from cudf.core.column import column
 from cudf.utils import utils
 from cudf.utils.dtypes import is_scalar, np_to_pa_dtype
-from cudf.core.dtypes import make_dtype_from_obj
 
 # nanoseconds per time_unit
 _numpy_to_pandas_conversion = {
@@ -46,7 +46,7 @@ def __init__(
         mask : Buffer; optional
             The validity mask
         """
-        dtype = make_dtype_from_obj(dtype)
+        dtype = cudf_dtypes.dtype(dtype)
         if data.size % dtype.itemsize:
             raise ValueError("Buffer size must be divisible by element size")
         if size is None:
@@ -60,7 +60,7 @@ def __init__(
             offset=offset,
             null_count=null_count,
         )
-        assert self.dtype.type is np.datetime64
+        assert isinstance(self.dtype, cudf_dtypes.Datetime)
         self._time_unit, _ = np.datetime_data(self.dtype.to_numpy)
 
     def __contains__(self, item):
@@ -158,7 +158,7 @@ def as_string_column(self, dtype, **kwargs):
             kwargs["format"] = fmt
         if len(self) > 0:
             return string._numeric_to_str_typecast_functions[
-                np.dtype(self.dtype)
+                self.dtype
             ](self, **kwargs)
         else:
             return column.column_empty(0, dtype="object", masked=False)
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 459461d94e5..82fa5e2a4e2 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -18,7 +18,6 @@
     np_to_pa_dtype,
     numeric_normalize_types,
 )
-from cudf.core.dtypes import make_dtype_from_obj
 
 class NumericalColumn(column.ColumnBase):
     def __init__(
@@ -32,7 +31,7 @@ def __init__(
             The dtype associated with the data Buffer
         mask : Buffer, optional
         """
-        dtype = make_dtype_from_obj(dtype)
+        dtype = cudf.dtype(dtype)
         if data.size % dtype.itemsize:
             raise ValueError("Buffer size must be divisible by element size")
         if size is None:
@@ -83,8 +82,8 @@ def binary_operator(self, binop, rhs, reflect=False):
         if reflect:
             tmp = self
         if isinstance(rhs, (NumericalColumn, Scalar)) or np.isscalar(rhs):
-            out_dtype = np.result_type(make_dtype_from_obj(self.dtype).to_numpy, make_dtype_from_obj(rhs.dtype).to_numpy)
-            out_dtype = make_dtype_from_obj(out_dtype)
+            out_dtype = np.result_type(cudf.dtype(self.dtype).to_numpy, cudf.dtype(rhs.dtype).to_numpy)
+            out_dtype = cudf.dtype(out_dtype)
             if binop in ["mod", "floordiv"]:
                 if (tmp.dtype in int_dtypes) and (
                     (np.isscalar(tmp) and (0 == tmp))
@@ -454,7 +453,6 @@ def _numeric_column_binop(lhs, rhs, op, out_dtype, reflect=False):
 
     if is_op_comparison:
         out_dtype = "bool"
-        
     out = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype)
 
     if is_op_comparison:
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 2b8e030d49e..4056e0dc7d5 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -134,46 +134,47 @@
     is_scalar,
     is_string_dtype,
 )
-from cudf.core.dtypes import make_dtype_from_obj
+
+from cudf.core.dtypes import dtype
 
 _str_to_numeric_typecast_functions = {
-    make_dtype_from_obj("int8"): str_cast.stoi8,
-    make_dtype_from_obj("int16"): str_cast.stoi16,
-    make_dtype_from_obj("int32"): str_cast.stoi,
-    make_dtype_from_obj("int64"): str_cast.stol,
-    make_dtype_from_obj("uint8"): str_cast.stoui8,
-    make_dtype_from_obj("uint16"): str_cast.stoui16,
-    make_dtype_from_obj("uint32"): str_cast.stoui,
-    make_dtype_from_obj("uint64"): str_cast.stoul,
-    make_dtype_from_obj("float32"): str_cast.stof,
-    make_dtype_from_obj("float64"): str_cast.stod,
-    make_dtype_from_obj("bool"): str_cast.to_booleans,
+    dtype("int8"): str_cast.stoi8,
+    dtype("int16"): str_cast.stoi16,
+    dtype("int32"): str_cast.stoi,
+    dtype("int64"): str_cast.stol,
+    dtype("uint8"): str_cast.stoui8,
+    dtype("uint16"): str_cast.stoui16,
+    dtype("uint32"): str_cast.stoui,
+    dtype("uint64"): str_cast.stoul,
+    dtype("float32"): str_cast.stof,
+    dtype("float64"): str_cast.stod,
+    dtype("bool"): str_cast.to_booleans,
     # TODO: support Date32 UNIX days
     # np.dtype("datetime64[D]"): str_cast.timestamp2int,
-    make_dtype_from_obj("datetime64[s]"): str_cast.timestamp2int,
-    make_dtype_from_obj("datetime64[ms]"): str_cast.timestamp2int,
-    make_dtype_from_obj("datetime64[us]"): str_cast.timestamp2int,
-    make_dtype_from_obj("datetime64[ns]"): str_cast.timestamp2int,
+    dtype("datetime64[s]"): str_cast.timestamp2int,
+    dtype("datetime64[ms]"): str_cast.timestamp2int,
+    dtype("datetime64[us]"): str_cast.timestamp2int,
+    dtype("datetime64[ns]"): str_cast.timestamp2int,
 }
 
 _numeric_to_str_typecast_functions = {
-    make_dtype_from_obj(np.dtype("int8")): str_cast.i8tos,
-    make_dtype_from_obj(np.dtype("int16")): str_cast.i16tos,
-    make_dtype_from_obj(np.dtype("int32")): str_cast.itos,
-    make_dtype_from_obj(np.dtype("int64")): str_cast.ltos,
-    make_dtype_from_obj(np.dtype("uint8")): str_cast.ui8tos,
-    make_dtype_from_obj(np.dtype("uint16")): str_cast.ui16tos,
-    make_dtype_from_obj(np.dtype("uint32")): str_cast.uitos,
-    make_dtype_from_obj(np.dtype("uint64")): str_cast.ultos,
-    make_dtype_from_obj(np.dtype("float32")): str_cast.ftos,
-    make_dtype_from_obj(np.dtype("float64")): str_cast.dtos,
-    make_dtype_from_obj(np.dtype("bool")): str_cast.from_booleans,
+    dtype(np.dtype("int8")): str_cast.i8tos,
+    dtype(np.dtype("int16")): str_cast.i16tos,
+    dtype(np.dtype("int32")): str_cast.itos,
+    dtype(np.dtype("int64")): str_cast.ltos,
+    dtype(np.dtype("uint8")): str_cast.ui8tos,
+    dtype(np.dtype("uint16")): str_cast.ui16tos,
+    dtype(np.dtype("uint32")): str_cast.uitos,
+    dtype(np.dtype("uint64")): str_cast.ultos,
+    dtype(np.dtype("float32")): str_cast.ftos,
+    dtype(np.dtype("float64")): str_cast.dtos,
+    dtype(np.dtype("bool")): str_cast.from_booleans,
     # TODO: support Date32 UNIX days
     # np.dtype("datetime64[D]"): str_cast.int2timestamp,
-    make_dtype_from_obj(np.dtype("datetime64[s]")): str_cast.int2timestamp,
-    make_dtype_from_obj(np.dtype("datetime64[ms]")): str_cast.int2timestamp,
-    make_dtype_from_obj(np.dtype("datetime64[us]")): str_cast.int2timestamp,
-    make_dtype_from_obj(np.dtype("datetime64[ns]")): str_cast.int2timestamp,
+    dtype(np.dtype("datetime64[s]")): str_cast.int2timestamp,
+    dtype(np.dtype("datetime64[ms]")): str_cast.int2timestamp,
+    dtype(np.dtype("datetime64[us]")): str_cast.int2timestamp,
+    dtype(np.dtype("datetime64[ns]")): str_cast.int2timestamp,
 }
 
 
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index cf2614c8017..88bb9406144 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -49,21 +49,13 @@
 }
 
 
-class Dtype(ExtensionDtype, _Dtype):
-    is_integer = False
-    is_string = False
-    is_boolean = False
-    is_categorical = False
-    is_datetime = False
-    is_list = False
-    is_float = False
-    is_numeric = False
+class Generic(ExtensionDtype, _Dtype):
     pa_type = None
 
     def __eq__(self, other):
         if isinstance(other, self.__class__):
             return True
-        if isinstance(other, Dtype) and not isinstance(other, self.__class__):
+        if isinstance(other, Generic) and not isinstance(other, self.__class__):
             return False
         if (
             isinstance(other, self.to_pandas.__class__)
@@ -91,14 +83,14 @@ def itemsize(self):
 
     @property
     def type(self):
-        if self.is_float or self.is_datetime:
-            return self.to_numpy.kind
+        if isinstance(self, (Floating, Datetime)):
+            return self.to_numpy.type
         else:
             return self.to_pandas.type
 
     @property
     def kind(self):
-        if self.is_float:
+        if isinstance(self, Floating):
             return "f"
         else:
             return self.to_pandas.kind
@@ -113,78 +105,87 @@ def __repr__(self):
     def __hash__(self):
         return hash(self.__repr__())
 
+class Number(Generic):
+    pass
 
-class IntDtype(Dtype):
-    is_integer = True
-    is_numeric = True
+class Integer(Number):
+    pass
 
+class SignedInteger(Integer):
+    pass
 
-class UInt8Dtype(IntDtype):
+class UnsignedInteger(Integer):
+    pass
+
+class Inexact(Number):
+    pass
+
+class Floating(Inexact):
+    pass
+
+class Flexible(Generic):
+    pass
+
+class UInt8Dtype(UnsignedInteger):
     def __init__(self):
         self.pa_type = pa.uint8()
         self._name = "UInt8"
 
 
-class UInt16Dtype(IntDtype):
+class UInt16Dtype(UnsignedInteger):
     def __init__(self):
         self.pa_type = pa.uint16()
         self._name = "UInt16"
 
 
-class UInt32Dtype(IntDtype):
+class UInt32Dtype(UnsignedInteger):
     def __init__(self):
         self.pa_type = pa.uint32()
         self._name = "UInt32"
 
 
-class UInt64Dtype(IntDtype):
+class UInt64Dtype(UnsignedInteger):
     def __init__(self):
         self.pa_type = pa.uint64()
         self._name = "UInt64"
 
 
-class Int8Dtype(IntDtype):
+class Int8Dtype(SignedInteger):
     def __init__(self):
         self.pa_type = pa.int8()
         self._name = "Int8"
 
 
-class Int16Dtype(IntDtype):
+class Int16Dtype(SignedInteger):
     def __init__(self):
         self.pa_type = pa.int16()
         self._name = "Int16"
 
 
-class Int32Dtype(IntDtype):
+class Int32Dtype(SignedInteger):
     def __init__(self):
         self.pa_type = pa.int32()
         self._name = "Int32"
 
 
-class Int64Dtype(IntDtype):
+class Int64Dtype(SignedInteger):
     def __init__(self):
         self.pa_type = pa.int64()
         self._name = "Int64"
 
-
-class FloatDtype(Dtype):
-    is_float = True
-    is_numeric = True
-
-
-class Float32Dtype(FloatDtype):
+class Float32Dtype(Floating):
     def __init__(self):
         self.pa_type = pa.float32()
         self._name = "Float32"
 
 
-class Float64Dtype(FloatDtype):
+class Float64Dtype(Floating):
     def __init__(self):
         self.pa_type = pa.float64()
         self._name = "Float64"
 
 
-class BooleanDtype(Dtype):
+class BooleanDtype(Generic):
     is_boolean = True
 
     def __init__(self):
@@ -192,35 +193,34 @@ def __init__(self):
         self._name = "Boolean"
 
 
-class DatetimeDtype(Dtype):
-    is_datetime = True
-
+class Datetime(Generic):
+    pass
 
-class Datetime64NSDtype(DatetimeDtype):
+class Datetime64NSDtype(Datetime):
     def __init__(self):
         self.pa_type = pa.timestamp("ns")
         self._name = "Datetime64NS"
 
 
-class Datetime64USDtype(DatetimeDtype):
+class Datetime64USDtype(Datetime):
     def __init__(self):
         self.pa_type = pa.timestamp("us")
         self._name = "Datetime64US"
 
 
-class Datetime64MSDtype(DatetimeDtype):
+class Datetime64MSDtype(Datetime):
     def __init__(self):
         self.pa_type = pa.timestamp("ms")
         self._name = "Datetime64MS"
 
 
-class Datetime64SDtype(DatetimeDtype):
+class Datetime64SDtype(Datetime):
     def __init__(self):
         self.pa_type = pa.timestamp("s")
         self._name = "Datetime64S"
 
 
-class StringDtype(Dtype):
+class StringDtype(Flexible):
     is_string = True
 
     def __init__(self):
@@ -272,10 +272,12 @@ def make_dtype_from_numpy(obj):
     return result
 
 
-def make_dtype_from_obj(obj):
+def dtype(obj):
+    if obj is None:
+        return None
     if isinstance(obj, CategoricalDtype):
         return obj
-    elif isinstance(obj, Dtype):
+    elif isinstance(obj, Generic):
         return np_to_cudf_dtypes[obj.to_numpy]
     if isinstance(obj, np.dtype):
         return np_to_cudf_dtypes[obj]
@@ -291,14 +293,11 @@ def make_dtype_from_obj(obj):
                 return np_to_cudf_dtypes[np.dtype(obj)]
         except:
             import pdb
-
             pdb.set_trace()
-            raise TypeError("cant transform this object to a cudf dtype. ")
-
+    
 
-class CategoricalDtype(Dtype):
 
-    is_categorical = True
+class CategoricalDtype(Generic):
 
     def __init__(self, categories=None, ordered=None):
         """
@@ -475,7 +474,7 @@ def __repr__(self):
     pa.timestamp("us"): Datetime64USDtype(),
     pa.timestamp("ms"): Datetime64MSDtype(),
     pa.timestamp("s"): Datetime64SDtype(),
-    None: Dtype,
+    pa.null(): None
 }
 
 np_to_cudf_dtypes = {
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 4eda63c345c..914d8d0ec1a 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -10,7 +10,7 @@
 from pandas._config import get_option
 from pandas.api.types import is_dict_like
 
-from cudf.core.dtypes import Dtype
+from cudf.core.dtypes import dtype as cudf_dtype
 
 import cudf
 from cudf import _lib as libcudf
@@ -145,8 +145,7 @@ def __init__(
             ``null`` values.
             If ``False``, leaves ``np.nan`` values as is.
         """
-        from cudf.core.dtypes import make_dtype_from_obj
-        dtype = make_dtype_from_obj(dtype) if dtype is not None else None
+        dtype = cudf_dtype(dtype)
         if isinstance(data, pd.Series):
             if name is None:
                 name = data.name
diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py
index 7cb1509cb7f..35b4806f127 100644
--- a/python/cudf/cudf/tests/test_column.py
+++ b/python/cudf/cudf/tests/test_column.py
@@ -44,10 +44,10 @@ def test_column_offset_and_size(pandas_input, offset, size):
         children=col.base_children,
     )
 
-    if col.dtype.is_categorical:
+    if cudf.utils.dtypes.is_categorical_dtype(col.dtype):
         assert col.size == col.codes.size
         assert col.size == (col.codes.data.size / col.codes.dtype.itemsize)
-    elif col.dtype.is_string:
+    elif isinstance(col.dtype, cudf.StringDtype):
         assert col.size == (col.children[0].size - 1)
         assert col.size == (
             (col.children[0].data.size / col.children[0].dtype.itemsize) - 1
diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py
index 6b8a199550b..30c7528b329 100644
--- a/python/cudf/cudf/utils/utils.py
+++ b/python/cudf/cudf/utils/utils.py
@@ -417,7 +417,7 @@ def time_col_replace_nulls(input_col):
         input_col,
         column.as_column(
             Buffer(
-                np.array([np.datetime64("NaT")], dtype=input_col.dtype).view(
+                np.array([np.datetime64("NaT")], dtype=input_col.dtype.to_numpy).view(
                     "|u1"
                 )
             ),

From ee6ece5a73c664e604c06cc1abbc3f997d553ab4 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 3 Aug 2020 13:44:48 -0700
Subject: [PATCH 15/80] working up through test_array_func

---
 python/cudf/cudf/_lib/reduce.pyx       | 18 ++++++------------
 python/cudf/cudf/_lib/transform.pyx    |  2 +-
 python/cudf/cudf/core/column/column.py |  2 +-
 python/cudf/cudf/core/dtypes.py        |  2 ++
 4 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx
index 9657693582e..39aad31f570 100644
--- a/python/cudf/cudf/_lib/reduce.pyx
+++ b/python/cudf/cudf/_lib/reduce.pyx
@@ -8,11 +8,12 @@ from cudf._lib.cpp.column.column cimport column
 from cudf._lib.scalar cimport Scalar
 from cudf._lib.column cimport Column
 from cudf._lib.types import np_to_cudf_types
-from cudf._lib.types cimport underlying_type_t_type_id
+from cudf._lib.types cimport underlying_type_t_type_id, _Dtype
 from cudf._lib.move cimport move
 from cudf._lib.aggregation cimport make_aggregation, aggregation
 from libcpp.memory cimport unique_ptr
 import numpy as np
+from cudf.core.dtypes import dtype as cudf_dtype
 
 
 def reduce(reduction_op, Column incol, dtype=None, **kwargs):
@@ -32,23 +33,16 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs):
 
     col_dtype = incol.dtype
     if reduction_op in ['sum', 'sum_of_squares', 'product']:
-        col_dtype = np.find_common_type([col_dtype], [np.uint64])
-    col_dtype = col_dtype if dtype is None else dtype
+        col_dtype = np.find_common_type([col_dtype.to_numpy], [np.uint64])
+    col_dtype = cudf_dtype(col_dtype)
 
     cdef column_view c_incol_view = incol.view()
     cdef unique_ptr[scalar] c_result
     cdef unique_ptr[aggregation] c_agg = move(make_aggregation(
         reduction_op, kwargs
     ))
-    cdef type_id tid = (
-        <type_id> (
-            <underlying_type_t_type_id> (
-                np_to_cudf_types[np.dtype(col_dtype)]
-            )
-        )
-    )
-
-    cdef data_type c_out_dtype = data_type(tid)
+    cdef _Dtype data_dtype = col_dtype
+    cdef data_type c_out_dtype = data_dtype.get_libcudf_type()
 
     # check empty case
     if len(incol) <= incol.null_count:
diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
index c58669d6910..e80b7d9252f 100644
--- a/python/cudf/cudf/_lib/transform.pyx
+++ b/python/cudf/cudf/_lib/transform.pyx
@@ -75,7 +75,7 @@ def transform(Column input, op):
     cdef type_id c_tid
     cdef data_type c_dtype
 
-    nb_type = numpy_support.from_dtype(input.dtype)
+    nb_type = numpy_support.from_dtype(input.dtype.to_numpy)
     nb_signature = (nb_type,)
     compiled_op = cudautils.compile_udf(op, nb_signature)
     c_str = compiled_op[0].encode('UTF-8')
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index f35687bb642..d2340abdc83 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1606,7 +1606,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
                 arb_dtype = np.dtype("O")
             else:
                 arb_dtype = check_cast_unsupported_dtype(arbitrary.dtype)
-                if arb_dtype != arbitrary.dtype.to_numpy:
+                if cudf.dtype(arb_dtype) != cudf.dtype(arbitrary.dtype):
                     arbitrary = arbitrary.astype(arb_dtype)
         if arb_dtype.kind in ("O", "U"):
             data = as_column(pa.Array.from_pandas(arbitrary), dtype=arb_dtype)
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 88bb9406144..830c305f05a 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -287,6 +287,8 @@ def dtype(obj):
         return make_dtype_from_string(obj)
     elif obj in pd_to_cudf_dtypes.keys():
         return pd_to_cudf_dtypes[obj]
+    elif isinstance(obj, pd.core.arrays.numpy_.PandasDtype):
+        return make_dtype_from_string(obj.name)
     else:
         try:
             if issubclass(obj, np.generic):

From 62c5e17c48fb4feb2940e51920a94f5675792e52 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 4 Aug 2020 07:12:36 -0700
Subject: [PATCH 16/80] more tests pass

---
 python/cudf/cudf/core/column/numerical.py | 4 ++--
 python/cudf/cudf/core/series.py           | 1 +
 python/cudf/cudf/tests/test_avro.py       | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 82fa5e2a4e2..3f2fcc05240 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -85,7 +85,7 @@ def binary_operator(self, binop, rhs, reflect=False):
             out_dtype = np.result_type(cudf.dtype(self.dtype).to_numpy, cudf.dtype(rhs.dtype).to_numpy)
             out_dtype = cudf.dtype(out_dtype)
             if binop in ["mod", "floordiv"]:
-                if (tmp.dtype in int_dtypes) and (
+                if (cudf.dtype(tmp.dtype) in int_dtypes) and (
                     (np.isscalar(tmp) and (0 == tmp))
                     or ((isinstance(tmp, NumericalColumn)) and (0.0 in tmp))
                 ):
@@ -107,7 +107,7 @@ def normalize_binop_value(self, other):
             return other
         other_dtype = np.min_scalar_type(other)
         if other_dtype.kind in {"b", "i", "u", "f"}:
-            other_dtype = np.promote_types(self.dtype, other_dtype)
+            other_dtype = np.promote_types(self.dtype.to_numpy, other_dtype)
             if other_dtype == np.dtype("float16"):
                 other = np.dtype("float32").type(other)
                 other_dtype = other.dtype
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 914d8d0ec1a..a1ae149bb22 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -1388,6 +1388,7 @@ def __rtruediv__(self, other):
     __div__ = __truediv__
 
     def _bitwise_binop(self, other, op):
+        if isinstance(self.dtype, (cudf.BooleanDtype, cudf.Integer)
         if (
             np.issubdtype(self.dtype, np.bool_)
             or np.issubdtype(self.dtype, np.integer)
diff --git a/python/cudf/cudf/tests/test_avro.py b/python/cudf/cudf/tests/test_avro.py
index 4a8a8d1bbdb..059f5343e0b 100644
--- a/python/cudf/cudf/tests/test_avro.py
+++ b/python/cudf/cudf/tests/test_avro.py
@@ -65,7 +65,7 @@ def test_avro_reader_basic(datadir, inputfile, columns, engine):
     # FASTAVRO produces int64 columns from avro int32 dtype, so convert
     # it back to int32 here
     for col in expect.columns:
-        expect[col] = expect[col].astype(got[col].dtype)
+        expect[col] = expect[col].astype(got[col].dtype.to_numpy)
 
     # fastavro appears to return columns in reverse order
     # (actual order may depend on pandas/python version)

From ef5b9cb2f2e6c948b85d63d9b2995a7ee79336f3 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 21 Aug 2020 07:43:35 -0700
Subject: [PATCH 17/80] handle list dtype in _Dtype

---
 python/cudf/cudf/_lib/column.pyx | 20 ++------------------
 python/cudf/cudf/_lib/types.pyx  | 20 ++++++++++++++------
 2 files changed, 16 insertions(+), 24 deletions(-)

diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
index 4b3a18b4a97..812c42c8c51 100644
--- a/python/cudf/cudf/_lib/column.pyx
+++ b/python/cudf/cudf/_lib/column.pyx
@@ -364,25 +364,9 @@ cdef class Column:
             col = self.base_children[0]
         else:
             col = self
-<<<<<<< HEAD
-        cdef _Dtype data_dtype = col.dtype
-        cdef libcudf_types.data_type dtype = data_dtype.get_libcudf_type()
-=======
 
-        data_dtype = col.dtype
-        cdef libcudf_types.type_id tid
-
-        if not is_list_dtype(self.dtype):
-            tid = <libcudf_types.type_id> (
-                <underlying_type_t_type_id> (
-                    np_to_cudf_types[np.dtype(data_dtype)]
-                )
-            )
-        else:
-            tid = libcudf_types.type_id.LIST
-
-        cdef libcudf_types.data_type dtype = libcudf_types.data_type(tid)
->>>>>>> branch-0.15
+        cdef _Dtype pydtype = self.dtype
+        cdef libcudf_types.data_type dtype = pydtype.get_libcudf_type()
         cdef libcudf_types.size_type offset = self.offset
         cdef vector[column_view] children
         cdef void* data
diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx
index 6f7739324f7..457bf16eeab 100644
--- a/python/cudf/cudf/_lib/types.pyx
+++ b/python/cudf/cudf/_lib/types.pyx
@@ -164,13 +164,21 @@ class NullHandling(IntEnum):
 
 cdef class _Dtype:
     cdef data_type get_libcudf_type(self) except *:
-        np_dtype = self.to_numpy
-        cdef libcudf_types.type_id tid = <libcudf_types.type_id> (
-                <underlying_type_t_type_id> (
-                    np_to_cudf_types[np_dtype]
+
+        cdef libcudf_types.type_id tid
+        cdef data_type libcudf_type 
+
+        if not isinstance(self, ListDtype):
+            np_dtype = self.to_numpy
+            tid = <libcudf_types.type_id> (
+                    <underlying_type_t_type_id> (
+                        np_to_cudf_types[np_dtype]
+                    )
                 )
-            )
-        cdef data_type libcudf_type = libcudf_types.data_type(tid)
+        else:
+            tid = libcudf_types.type_id.LIST
+        
+        libcudf_type = libcudf_types.data_type(tid)
         return libcudf_type
 
 

From 93207553d516a5d332fbb8d55e239b425c30c2a0 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 21 Aug 2020 07:43:44 -0700
Subject: [PATCH 18/80] fix series syntax error

---
 python/cudf/cudf/core/series.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 61b9fb5a26d..e5c8ecfed29 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -1381,7 +1381,6 @@ def __rtruediv__(self, other):
     __div__ = __truediv__
 
     def _bitwise_binop(self, other, op):
-        if isinstance(self.dtype, (cudf.BooleanDtype, cudf.Integer)
         if (
             np.issubdtype(self.dtype, np.bool_)
             or np.issubdtype(self.dtype, np.integer)

From dac2940270933cc25d312851ff65a36ecf27a7ab Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 21 Aug 2020 08:26:54 -0700
Subject: [PATCH 19/80] add timedelta dtypes

---
 python/cudf/cudf/__init__.py    |  4 ++++
 python/cudf/cudf/core/dtypes.py | 37 +++++++++++++++++++++++++++++----
 2 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
index af28742cfee..04dcc286968 100644
--- a/python/cudf/cudf/__init__.py
+++ b/python/cudf/cudf/__init__.py
@@ -56,6 +56,10 @@
     Datetime64USDtype, 
     Datetime64MSDtype,
     Datetime64SDtype,
+    Timedelta64NSDtype,
+    Timedelta64USDtype,
+    Timedelta64MSDtype,
+    Timedelta64SDtype
 )
 
 from cudf.core.groupby import Grouper
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 4dc138059ee..dc77e30d882 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -126,6 +126,12 @@ class Floating(Inexact):
 class Flexible(Generic):
     pass
 
+class Datetime(Generic):
+    pass
+
+class Timedelta(Generic):
+    pass
+
 class UInt8Dtype(UnsignedInteger):
     def __init__(self):
         self.pa_type = pa.uint8()
@@ -192,10 +198,6 @@ def __init__(self):
         self.pa_type = pa.bool_()
         self._name = "Boolean"
 
-
-class Datetime(Generic):
-    pass
-
 class Datetime64NSDtype(Datetime):
     def __init__(self):
         self.pa_type = pa.timestamp("ns")
@@ -219,6 +221,25 @@ def __init__(self):
         self.pa_type = pa.timestamp("s")
         self._name = "Datetime64S"
 
+class Timedelta64NSDtype(Timedelta):
+    def __init__(self):
+        self.pa_type = pa.duration('ns')
+        self._name = "Timedelta64NS"
+
+class Timedelta64USDtype(Timedelta):
+    def __init__(self):
+        self.pa_type = pa.duration('us')
+        self._name = "Timedelta64US"
+
+class Timedelta64MSDtype(Timedelta):
+    def __init__(self):
+        self.pa_type = pa.duration('ms')
+        self._name = "Timedelta64MS"
+
+class Timedelta64SDtype(Timedelta):
+    def __init__(self):
+        self.pa_type = pa.duration('s')
+        self._name = "Timedelta64S"
 
 class StringDtype(Flexible):
     is_string = True
@@ -475,6 +496,10 @@ def __repr__(self):
     pa.timestamp("us"): Datetime64USDtype(),
     pa.timestamp("ms"): Datetime64MSDtype(),
     pa.timestamp("s"): Datetime64SDtype(),
+    pa.duration("ns"): Timedelta64NSDtype(),
+    pa.duration("us"): Timedelta64USDtype(),
+    pa.duration("ms"): Timedelta64MSDtype(),
+    pa.duration("s"): Timedelta64SDtype(),
     pa.null(): None
 }
 
@@ -495,6 +520,10 @@ def __repr__(self):
     np.dtype("datetime64[us]"): Datetime64USDtype(),
     np.dtype("datetime64[ms]"): Datetime64MSDtype(),
     np.dtype("datetime64[s]"): Datetime64SDtype(),
+    np.dtype("timedelta64[ns]"): Timedelta64NSDtype(),
+    np.dtype("timedelta64[us]"): Timedelta64USDtype(),
+    np.dtype("timedelta64[ms]"): Timedelta64MSDtype(),
+    np.dtype("timedelta64[s]"): Timedelta64SDtype(),
 }
 
 pd_to_cudf_dtypes = {

From 6eee9eb122b02b7af2daade5f7604038950518f2 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 21 Aug 2020 08:27:13 -0700
Subject: [PATCH 20/80] fix some numericalcolumn bugs

---
 python/cudf/cudf/core/column/numerical.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 7d06a9a01f8..2464e691ce7 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -194,7 +194,7 @@ def to_arrow(self):
             buffers=[mask, data],
             null_count=self.null_count,
         )
-        if self.dtype.is_boolean:
+        if isinstance(self.dtype, cudf.core.dtypes.BooleanDtype):
             return out.cast(pa.bool_())
         else:
             return out

From 1ace46016eb601015afffaabe78700b99eb1f206 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 21 Aug 2020 08:27:26 -0700
Subject: [PATCH 21/80] fix index type mapping dicts

---
 python/cudf/cudf/core/index.py | 41 +++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 69a5b0680a5..2663ad8e22d 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -33,6 +33,7 @@
     numeric_normalize_types,
 )
 from cudf.utils.utils import cached_property
+from cudf.core.dtypes import dtype
 
 
 def _to_frame(this_index, index=True, name=None):
@@ -2561,29 +2562,29 @@ def as_index(arbitrary, **kwargs):
 
 
 _dtype_to_index = {
-    np.int8: Int8Index,
-    np.int16: Int16Index,
-    np.int32: Int32Index,
-    np.int64: Int64Index,
-    np.uint8: UInt8Index,
-    np.uint16: UInt16Index,
-    np.uint32: UInt32Index,
-    np.uint64: UInt64Index,
-    np.float32: Float32Index,
-    np.float64: Float64Index,
+    dtype(np.int8): Int8Index,
+    dtype(np.int16): Int16Index,
+    dtype(np.int32): Int32Index,
+    dtype(np.int64): Int64Index,
+    dtype(np.uint8): UInt8Index,
+    dtype(np.uint16): UInt16Index,
+    dtype(np.uint32): UInt32Index,
+    dtype(np.uint64): UInt64Index,
+    dtype(np.float32): Float32Index,
+    dtype(np.float64): Float64Index,
 }
 
 _index_to_dtype = {
-    Int8Index: np.int8,
-    Int16Index: np.int16,
-    Int32Index: np.int32,
-    Int64Index: np.int64,
-    UInt8Index: np.uint8,
-    UInt16Index: np.uint16,
-    UInt32Index: np.uint32,
-    UInt64Index: np.uint64,
-    Float32Index: np.float32,
-    Float64Index: np.float64,
+    Int8Index: dtype(np.int8),
+    Int16Index: dtype(np.int16),
+    Int32Index: dtype(np.int32),
+    Int64Index: dtype(np.int64),
+    UInt8Index: dtype(np.uint8),
+    UInt16Index: dtype(np.uint16),
+    UInt32Index: dtype(np.uint32),
+    UInt64Index: dtype(np.uint64),
+    Float32Index: dtype(np.float32),
+    Float64Index: dtype(np.float64),
 }
 
 

From df6426bbbe41ba307926551b28dfbde0eb60444e Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 24 Aug 2020 07:58:22 -0700
Subject: [PATCH 22/80] pass all binop tests

---
 python/cudf/cudf/__init__.py              |  3 ++
 python/cudf/cudf/_lib/scalar.pyx          | 10 ++---
 python/cudf/cudf/core/column/column.py    |  6 +--
 python/cudf/cudf/core/column/datetime.py  | 24 ++++++------
 python/cudf/cudf/core/column/numerical.py |  4 +-
 python/cudf/cudf/core/column/string.py    |  2 +-
 python/cudf/cudf/core/dtypes.py           | 13 +++++--
 python/cudf/cudf/core/series.py           | 45 +++++++++--------------
 8 files changed, 54 insertions(+), 53 deletions(-)

diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
index 04dcc286968..56892f6787f 100644
--- a/python/cudf/cudf/__init__.py
+++ b/python/cudf/cudf/__init__.py
@@ -38,7 +38,10 @@
     Datetime,
     Floating,
     Number,
+    Integer,
     Flexible,
+    Datetime,
+    Timedelta,
     CategoricalDtype, 
     Int8Dtype,
     Int16Dtype, 
diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx
index 7757c5a8ad6..ba2ecef1cd5 100644
--- a/python/cudf/cudf/_lib/scalar.pyx
+++ b/python/cudf/cudf/_lib/scalar.pyx
@@ -73,17 +73,17 @@ cdef class Scalar:
             else:
                 dtype = value.dtype
 
-        dtype = np.dtype(dtype)
+        dtype = cudf.dtype(dtype)
 
-        if pd.api.types.is_string_dtype(dtype):
+        if isinstance(dtype, cudf.StringDtype):
             _set_string_from_np_string(self.c_value, value, valid)
-        elif pd.api.types.is_numeric_dtype(dtype):
+        elif isinstance(dtype, (cudf.Number, cudf.BooleanDtype)):
             _set_numeric_from_np_scalar(self.c_value, value, dtype, valid)
-        elif pd.api.types.is_datetime64_dtype(dtype):
+        elif isinstance(dtype, cudf.Datetime):
             _set_datetime64_from_np_scalar(
                 self.c_value, value, dtype, valid
             )
-        elif pd.api.types.is_timedelta64_dtype(dtype):
+        elif isinstance(dtype, cudf.Timedelta):
             _set_timedelta64_from_np_scalar(
                 self.c_value, value, dtype, valid
             )
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 6c741d01580..c89de55148c 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -887,9 +887,9 @@ def astype(self, dtype, **kwargs):
                     "Casting list columns not currently supported"
                 )
             return self
-        elif np.issubdtype(dtype, np.datetime64):
+        elif isinstance(dtype, cudf.Datetime):
             return self.as_datetime_column(dtype, **kwargs)
-        elif np.issubdtype(dtype, np.timedelta64):
+        elif isinstance(dtype, cudf.Timedelta):
             return self.as_timedelta_column(dtype, **kwargs)
         else:
             return self.as_numerical_column(dtype, **kwargs)
@@ -968,7 +968,7 @@ def __cuda_array_interface__(self):
         output = {
             "shape": (len(self),),
             "strides": (self.dtype.itemsize,),
-            "typestr": self.dtype.str,
+            "typestr": self.dtype.to_numpy.str,
             "data": (self.data_ptr, False),
             "version": 1,
         }
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 75c48273d42..807f0803e7f 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -66,11 +66,9 @@ def __init__(
         if not (self.dtype.type is np.datetime64):
             raise TypeError(f"{self.dtype} is not a supported datetime type")
 
-        self._time_unit, _ = np.datetime_data(self.dtype)
-
     def __contains__(self, item):
         try:
-            item = np.datetime64(item, self._time_unit)
+            item = np.datetime64(item, self.dtype._time_unit)
         except ValueError:
             # If item cannot be converted to datetime type
             # np.datetime64 raises ValueError, hence `item`
@@ -80,7 +78,7 @@ def __contains__(self, item):
 
     @property
     def time_unit(self):
-        return self._time_unit
+        return self.dtype._time_unit
 
     @property
     def year(self):
@@ -127,7 +125,7 @@ def normalize_binop_value(self, other):
             if np.isnat(other):
                 return as_scalar(val=None, dtype=self.dtype)
 
-            other = other.astype(self.dtype)
+            other = other.astype(self.dtype.to_numpy)
             return as_scalar(other)
         elif isinstance(other, np.timedelta64):
             other_time_unit = cudf.utils.dtypes.get_time_unit(other)
@@ -200,25 +198,29 @@ def default_na_value(self):
 
     def binary_operator(self, op, rhs, reflect=False):
         lhs, rhs = self, rhs
+
+        lhs_dtype = cudf.dtype(lhs.dtype)
+        rhs_dtype = cudf.dtype(rhs.dtype)
+
         if op in ("eq", "ne", "lt", "gt", "le", "ge"):
-            out_dtype = np.bool
-        elif op == "add" and pd.api.types.is_timedelta64_dtype(rhs.dtype):
+            out_dtype = cudf.BooleanDtype()
+        elif op == "add" and isinstance(rhs_dtype, cudf.Timedelta):
             out_dtype = cudf.core.column.timedelta._timedelta_binary_op_add(
                 rhs, lhs
             )
-        elif op == "sub" and pd.api.types.is_timedelta64_dtype(rhs.dtype):
+        elif op == "sub" and isinstance(rhs_dtype, cudf.Timedelta):
             out_dtype = cudf.core.column.timedelta._timedelta_binary_op_sub(
                 rhs if reflect else lhs, lhs if reflect else rhs
             )
-        elif op == "sub" and pd.api.types.is_datetime64_dtype(rhs.dtype):
+        elif op == "sub" and isinstance(rhs.dtype, cudf.Datetime):
             units = ["s", "ms", "us", "ns"]
             lhs_time_unit = cudf.utils.dtypes.get_time_unit(lhs)
             lhs_unit = units.index(lhs_time_unit)
             rhs_time_unit = cudf.utils.dtypes.get_time_unit(rhs)
             rhs_unit = units.index(rhs_time_unit)
-            out_dtype = np.dtype(
+            out_dtype = cudf.dtype(np.dtype(
                 f"timedelta64[{units[max(lhs_unit, rhs_unit)]}]"
-            )
+            ))
         else:
             raise TypeError(
                 f"Series of dtype {self.dtype} cannot perform "
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 2464e691ce7..8cb8ecaf752 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -189,7 +189,7 @@ def to_arrow(self):
             mask = pa.py_buffer(self.mask_array_view.copy_to_host())
         data = pa.py_buffer(self.data_array_view.copy_to_host())
         out = pa.Array.from_buffers(
-            type=self.dtype.pa_type,
+            type=self.dtype.pa_type if not isinstance(self.dtype, cudf.core.dtypes.BooleanDtype) else pa.int8(),
             length=len(self),
             buffers=[mask, data],
             null_count=self.null_count,
@@ -437,7 +437,7 @@ def _numeric_column_binop(lhs, rhs, op, out_dtype, reflect=False):
     is_op_comparison = op in ["lt", "gt", "le", "ge", "eq", "ne"]
 
     if is_op_comparison:
-        out_dtype = "bool"
+        out_dtype = cudf.BooleanDtype()
     out = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype)
 
     if is_op_comparison:
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 46d947e82f8..e47e42c1f13 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -4772,7 +4772,7 @@ def binary_operator(self, op, rhs, reflect=False):
         if isinstance(rhs, StringColumn) and op == "add":
             return lhs.str().cat(others=rhs)
         elif op in ("eq", "ne", "gt", "lt", "ge", "le"):
-            return _string_column_binop(self, rhs, op=op, out_dtype="bool")
+            return _string_column_binop(self, rhs, op=op, out_dtype=cudf.BooleanDtype())
         else:
             msg = "{!r} operator not supported between {} and {}"
             raise TypeError(msg.format(op, type(self), type(rhs)))
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index dc77e30d882..791dc74be54 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -126,7 +126,7 @@ class Floating(Inexact):
 class Flexible(Generic):
     pass
 
-class Datetime(Generic):
+class Datetime(Generic):    
     pass
 
 class Timedelta(Generic):
@@ -202,24 +202,28 @@ class Datetime64NSDtype(Datetime):
     def __init__(self):
         self.pa_type = pa.timestamp("ns")
         self._name = "Datetime64NS"
+        self._time_unit = "ns"
 
 
 class Datetime64USDtype(Datetime):
     def __init__(self):
         self.pa_type = pa.timestamp("us")
         self._name = "Datetime64US"
+        self._time_unit = "us"
 
 
 class Datetime64MSDtype(Datetime):
     def __init__(self):
         self.pa_type = pa.timestamp("ms")
         self._name = "Datetime64MS"
+        self._time_unit = "ms"
 
 
 class Datetime64SDtype(Datetime):
     def __init__(self):
         self.pa_type = pa.timestamp("s")
         self._name = "Datetime64S"
+        self._time_unit = "s"
 
 class Timedelta64NSDtype(Timedelta):
     def __init__(self):
@@ -278,7 +282,7 @@ def make_dtype_from_string(obj):
             return UInt16Dtype()
         elif obj in {"uint8", "Uint8"}:
             return UInt8Dtype()
-    elif "float" in obj:
+    elif "float" in obj or "Float" in obj:
         if obj in {"float64", "Float64"}:
             return Float64Dtype()
         elif obj in {"float32", "Float32"}:
@@ -299,7 +303,9 @@ def dtype(obj):
     if isinstance(obj, CategoricalDtype):
         return obj
     elif isinstance(obj, Generic):
-        return np_to_cudf_dtypes[obj.to_numpy]
+        return obj
+    elif issubclass(obj.__class__, Generic):
+        return obj()
     if isinstance(obj, np.dtype):
         return np_to_cudf_dtypes[obj]
     elif isinstance(obj, pa.lib.DataType):
@@ -513,6 +519,7 @@ def __repr__(self):
     np.dtype("uint32"): UInt32Dtype(),
     np.dtype("uint64"): UInt64Dtype(),
     np.dtype("bool"): BooleanDtype(),
+    np.dtype("U"): StringDtype(),
     np.dtype("object"): StringDtype(),
     np.dtype("float32"): Float32Dtype(),
     np.dtype("float64"): Float64Dtype(),
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index e5c8ecfed29..f31e43b179b 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -1058,8 +1058,8 @@ def _binaryop(self, other, fn, fill_value=None, reflect=False):
         rhs = self._normalize_binop_value(rhs)
 
         if fn == "truediv":
-            if str(lhs.dtype) in truediv_int_dtype_corrections:
-                truediv_type = truediv_int_dtype_corrections[str(lhs.dtype)]
+            if lhs.dtype.name in truediv_int_dtype_corrections:
+                truediv_type = truediv_int_dtype_corrections[lhs.dtype.name]
                 lhs = lhs.astype(truediv_type)
 
         if fill_value is not None:
@@ -1381,27 +1381,17 @@ def __rtruediv__(self, other):
     __div__ = __truediv__
 
     def _bitwise_binop(self, other, op):
-        if (
-            np.issubdtype(self.dtype, np.bool_)
-            or np.issubdtype(self.dtype, np.integer)
-        ) and (
-            np.issubdtype(other.dtype, np.bool_)
-            or np.issubdtype(other.dtype, np.integer)
-        ):
-            # TODO: This doesn't work on Series (op) DataFrame
-            # because dataframe doesn't have dtype
+        if (isinstance(self.dtype, (cudf.BooleanDtype, cudf.Integer))) and (isinstance(other.dtype, (cudf.BooleanDtype, cudf.Integer))):
             ser = self._binaryop(other, op)
-            if np.issubdtype(self.dtype, np.bool_) or np.issubdtype(
-                other.dtype, np.bool_
-            ):
-                ser = ser.astype(np.bool_)
-            return ser
+            if isinstance(self.dtype, cudf.BooleanDtype) or isinstance(other.dtype, cudf.BooleanDtype):
+                ser = ser.astype(cudf.BooleanDtype())
         else:
             raise TypeError(
                 f"Operation 'bitwise {op}' not supported between "
                 f"{self.dtype.type.__name__} and {other.dtype.type.__name__}"
             )
-
+        return ser
+        
     def __and__(self, other):
         """Performs vectorized bitwise and (&) on corresponding elements of two
         series.
@@ -2125,7 +2115,6 @@ def astype(self, dtype, copy=False, errors="raise"):
             return self.copy(deep=copy)
         try:
             data = self._column.astype(dtype)
-
             return self._copy_construct(
                 data=data.copy(deep=True) if copy else data, index=self.index
             )
@@ -4239,16 +4228,16 @@ def keys(self):
 
 
 truediv_int_dtype_corrections = {
-    "int8": "float32",
-    "int16": "float32",
-    "int32": "float32",
-    "int64": "float64",
-    "uint8": "float32",
-    "uint16": "float32",
-    "uint32": "float64",
-    "uint64": "float64",
-    "bool": "float32",
-    "int": "float",
+    "Int8": "Float32",
+    "Int16": "Float32",
+    "Int32": "Float32",
+    "Int64": "Float64",
+    "UInt8": "Float32",
+    "UInt16": "Float32",
+    "UInt32": "Float64",
+    "UInt64": "Float64",
+    "Boolean": "Float32",
+    "Int": "Float",
 }
 
 

From 92d1a644f0a40156f3c068323581710b0b321aae Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 25 Aug 2020 15:33:58 -0700
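Subject: [PATCH 23/80] more progress

Continue moving column, join, and utility code from NumPy dtypes to cudf
dtype objects:

* timedelta.py, join.py, series.py: replace pandas/NumPy dtype checks with
  cudf.api.types helpers and isinstance checks against cudf dtype classes.
* numerical.py, categorical.py, utils/dtypes.py: go through `.to_numpy`
  where NumPy promotion, ordering, or finfo/iinfo is still required.
* dtypes.py: add pa.duration mappings, accept pd.CategoricalDtype and the
  "category" string, add a __repr__ that delegates to to_pandas(), and
  drop the is_boolean class attribute from BooleanDtype.
* column.pyx: read the dtype from `col` rather than `self`.
* stream_compaction.pyx: assert the mask dtype is BooleanDtype.
* test_column.py: only check string-column child sizes when col.size > 0.

Note: this commit leaves a pdb.set_trace() call in binary_operator in
timedelta.py; it is removed in PATCH 24/80.
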
---
 python/cudf/cudf/__init__.py                |  1 +
 python/cudf/cudf/_lib/column.pyx            |  2 +-
 python/cudf/cudf/_lib/stream_compaction.pyx |  3 +-
 python/cudf/cudf/core/column/categorical.py |  5 ++-
 python/cudf/cudf/core/column/column.py      |  4 +-
 python/cudf/cudf/core/column/numerical.py   | 19 +++++----
 python/cudf/cudf/core/column/string.py      |  2 +-
 python/cudf/cudf/core/column/timedelta.py   | 47 +++++++++------------
 python/cudf/cudf/core/dtypes.py             | 16 ++++++-
 python/cudf/cudf/core/join/join.py          | 20 ++++-----
 python/cudf/cudf/core/series.py             |  4 +-
 python/cudf/cudf/tests/test_column.py       |  9 ++--
 python/cudf/cudf/utils/dtypes.py            | 16 ++++---
 13 files changed, 84 insertions(+), 64 deletions(-)

diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
index 56892f6787f..cff9df9f032 100644
--- a/python/cudf/cudf/__init__.py
+++ b/python/cudf/cudf/__init__.py
@@ -8,6 +8,7 @@
 
 import rmm
 
+import cudf.api.types
 from cudf import core, datasets, testing
 from cudf._version import get_versions
 from cudf.core import (
diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
index 812c42c8c51..5021778be44 100644
--- a/python/cudf/cudf/_lib/column.pyx
+++ b/python/cudf/cudf/_lib/column.pyx
@@ -365,7 +365,7 @@ cdef class Column:
         else:
             col = self
 
-        cdef _Dtype pydtype = self.dtype
+        cdef _Dtype pydtype = col.dtype
         cdef libcudf_types.data_type dtype = pydtype.get_libcudf_type()
         cdef libcudf_types.size_type offset = self.offset
         cdef vector[column_view] children
diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx
index 2d81eb49f00..aa59fe14e2c 100644
--- a/python/cudf/cudf/_lib/stream_compaction.pyx
+++ b/python/cudf/cudf/_lib/stream_compaction.pyx
@@ -1,6 +1,7 @@
 # Copyright (c) 2020, NVIDIA CORPORATION.
 
 import pandas as pd
+from cudf.core.dtypes import BooleanDtype
 
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
@@ -99,7 +100,7 @@ def apply_boolean_mask(Table source_table, Column boolean_mask):
     Table obtained from applying mask
     """
 
-    assert pd.api.types.is_bool_dtype(boolean_mask.dtype)
+    assert isinstance(boolean_mask.dtype, BooleanDtype)
 
     cdef unique_ptr[table] c_result
     cdef table_view source_table_view = source_table.view()
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 3a8df934264..5185660c13c 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -308,8 +308,9 @@ def add_categories(self, new_categories, **kwargs):
                 f"existing categories."
             )
         common_dtype = np.find_common_type(
-            [old_categories.dtype, new_categories.dtype], []
+            [old_categories.dtype.to_numpy, new_categories.dtype.to_numpy], []
         )
+        common_dtype = cudf.dtype(common_dtype)
 
         new_categories = new_categories.astype(common_dtype, copy=False)
         old_categories = old_categories.astype(common_dtype, copy=False)
@@ -1254,7 +1255,7 @@ def _create_empty_categorical_column(categorical_column, dtype):
             cudf.utils.utils.scalar_broadcast_to(
                 categorical_column.default_na_value(),
                 categorical_column.size,
-                np.dtype(categorical_column.cat().codes),
+                categorical_column.cat().codes.dtype,
             )
         ),
         offset=categorical_column.offset,
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index c89de55148c..33c0b98d203 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -516,9 +516,9 @@ def __getitem__(self, arg):
             arg = as_column(arg)
             if len(arg) == 0:
                 arg = as_column([], dtype="int32")
-            if pd.api.types.is_integer_dtype(arg.dtype):
+            if pd.api.types.is_integer_dtype(arg.dtype) or isinstance(arg.dtype, cudf.Integer):
                 return self.take(arg)
-            if pd.api.types.is_bool_dtype(arg.dtype):
+            if pd.api.types.is_bool_dtype(arg.dtype) or isinstance(arg.dtype, cudf.BooleanDtype):
                 return self.apply_boolean_mask(arg)
             raise NotImplementedError(type(arg))
 
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 8cb8ecaf752..b0e4c563fd8 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -272,6 +272,7 @@ def find_and_replace(self, to_replace, replacement, all_nan):
         """
         Return col with *to_replace* replaced with *value*.
         """
+
         to_replace_col = _normalize_find_and_replace_input(
             self.dtype, to_replace
         )
@@ -379,14 +380,15 @@ def can_cast_safely(self, to_dtype):
         safely cast to dtype
         """
         if self.dtype.kind == to_dtype.kind:
-            if self.dtype <= to_dtype:
+            # todo: implement >, < for cudf.Dtype
+            if self.dtype.to_numpy <= to_dtype.to_numpy:
                 return True
             else:
                 # Kinds are the same but to_dtype is smaller
-                if "float" in to_dtype.name:
-                    info = np.finfo(to_dtype)
-                elif "int" in to_dtype.name:
-                    info = np.iinfo(to_dtype)
+                if isinstance(to_dtype, cudf.Floating):
+                    info = np.finfo(to_dtype.to_numpy)
+                elif isinstance(to_dtype, cudf.Integer):
+                    info = np.iinfo(to_dtype.to_numpy)
                 min_, max_ = info.min, info.max
 
                 if (self.min() > min_) and (self.max() < max_):
@@ -396,7 +398,7 @@ def can_cast_safely(self, to_dtype):
 
         # want to cast int to float
         elif to_dtype.kind == "f" and self.dtype.kind in {"i", "u"}:
-            info = np.finfo(to_dtype)
+            info = np.finfo(to_dtype.to_numpy)
             biggest_exact_int = 2 ** (info.nmant + 1)
             if (self.min() >= -biggest_exact_int) and (
                 self.max() <= biggest_exact_int
@@ -415,7 +417,7 @@ def can_cast_safely(self, to_dtype):
 
         # want to cast float to int:
         elif to_dtype.kind in {"i", "u"} and self.dtype.kind == "f":
-            info = np.iinfo(to_dtype)
+            info = np.iinfo(to_dtype.to_numpy)
             min_, max_ = info.min, info.max
             # best we can do is hope to catch it here and avoid compare
             if (self.min() >= min_) and (self.max() <= max_):
@@ -503,11 +505,10 @@ def _normalize_find_and_replace_input(input_column_dtype, col_to_normalize):
         col_to_normalize_dtype = col_to_normalize.dtype
     else:
         raise TypeError(f"Type {type(col_to_normalize)} not supported")
-
     if (
         col_to_normalize_dtype.kind == "f"
         and input_column_dtype.kind in {"i", "u"}
-    ) or (col_to_normalize_dtype.num > input_column_dtype.num):
+    ) or (col_to_normalize_dtype.to_numpy.num > input_column_dtype.to_numpy.num):
         raise TypeError(
             f"Potentially unsafe cast for non-equivalent "
             f"{col_to_normalize_dtype.name} "
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index e47e42c1f13..be96e18c148 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -4740,7 +4740,7 @@ def fillna(self, fill_value):
 
     def _find_first_and_last(self, value):
         found_indices = self.str().contains(f"^{value}$")
-        found_indices = libcudf.unary.cast(found_indices, dtype=np.int32)
+        found_indices = libcudf.unary.cast(found_indices, dtype=cudf.Int32Dtype())
         first = column.as_column(found_indices).find_first_value(1)
         last = column.as_column(found_indices).find_last_value(1)
         return first, last
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index 521d422a233..644c06bcd80 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -44,7 +44,7 @@ def __init__(
             The number of null values.
             If None, it is calculated automatically.
         """
-        dtype = np.dtype(dtype)
+        dtype = cudf.dtype(dtype)
         if data.size % dtype.itemsize:
             raise ValueError("Buffer size must be divisible by element size")
 
@@ -60,11 +60,9 @@ def __init__(
         if not (self.dtype.type is np.timedelta64):
             raise TypeError(f"{self.dtype} is not a supported duration type")
 
-        self._time_unit, _ = np.datetime_data(self.dtype)
-
     def __contains__(self, item):
         try:
-            item = np.timedelta64(item, self._time_unit)
+            item = np.timedelta64(item, self.dtype._time_unit)
         except ValueError:
             # If item cannot be converted to duration type
             # np.timedelta64 raises ValueError, hence `item`
@@ -111,7 +109,7 @@ def to_arrow(self):
 
     def _binary_op_floordiv(self, rhs):
         lhs, rhs = self, rhs
-        if pd.api.types.is_timedelta64_dtype(rhs.dtype):
+        if cudf.api.types.is_timedelta64_dtype(rhs.dtype):
             common_dtype = determine_out_dtype(self.dtype, rhs.dtype)
             lhs = lhs.astype(common_dtype).astype("float64")
 
@@ -148,7 +146,7 @@ def _binary_op_mul(self, rhs):
         return out_dtype
 
     def _binary_op_mod(self, rhs):
-        if pd.api.types.is_timedelta64_dtype(rhs.dtype):
+        if cudf.api.types.is_timedelta64_dtype(rhs.dtype):
             out_dtype = determine_out_dtype(self.dtype, rhs.dtype)
         elif rhs.dtype.kind in ("f", "i", "u"):
             out_dtype = self.dtype
@@ -160,8 +158,8 @@ def _binary_op_mod(self, rhs):
         return out_dtype
 
     def _binary_op_eq_ne(self, rhs):
-        if pd.api.types.is_timedelta64_dtype(rhs.dtype):
-            out_dtype = np.bool
+        if cudf.api.types.is_timedelta64_dtype(rhs.dtype):
+            out_dtype = cudf.BooleanDtype()
         else:
             raise TypeError(
                 f"Equality of {self.dtype} with {rhs.dtype} "
@@ -170,8 +168,8 @@ def _binary_op_eq_ne(self, rhs):
         return out_dtype
 
     def _binary_op_lt_gt_le_ge(self, rhs):
-        if pd.api.types.is_timedelta64_dtype(rhs.dtype):
-            return np.bool
+        if cudf.api.types.is_timedelta64_dtype(rhs.dtype):
+            return cudf.BooleanDtype()
         else:
             raise TypeError(
                 f"Invalid comparison between dtype={self.dtype}"
@@ -180,7 +178,7 @@ def _binary_op_lt_gt_le_ge(self, rhs):
 
     def _binary_op_truediv(self, rhs):
         lhs, rhs = self, rhs
-        if pd.api.types.is_timedelta64_dtype(rhs.dtype):
+        if cudf.api.types.is_timedelta64_dtype(rhs.dtype):
             common_dtype = determine_out_dtype(self.dtype, rhs.dtype)
             lhs = lhs.astype(common_dtype).astype("float64")
 
@@ -234,7 +232,8 @@ def binary_operator(self, op, rhs, reflect=False):
 
         if reflect:
             lhs, rhs = rhs, lhs
-
+        import pdb
+        pdb.set_trace()
         return binop(lhs, rhs, op=op, out_dtype=out_dtype)
 
     def normalize_binop_value(self, other):
@@ -575,24 +574,24 @@ def binop(lhs, rhs, op, out_dtype):
 
 
 def determine_out_dtype(lhs_dtype, rhs_dtype):
-    if np.can_cast(np.dtype(lhs_dtype), np.dtype(rhs_dtype)):
-        return rhs_dtype
-    elif np.can_cast(np.dtype(rhs_dtype), np.dtype(lhs_dtype)):
-        return lhs_dtype
+    if np.can_cast(cudf.dtype(lhs_dtype).to_numpy, cudf.dtype(rhs_dtype).to_numpy):
+        return cudf.dtype(rhs_dtype)
+    elif np.can_cast(cudf.dtype(rhs_dtype).to_numpy, cudf.dtype(lhs_dtype).to_numpy):
+        return cudf.dtype(lhs_dtype)
     else:
         raise TypeError(f"Cannot type-cast {lhs_dtype} and {rhs_dtype}")
 
 
 def _timedelta_binary_op_add(lhs, rhs):
-    if pd.api.types.is_timedelta64_dtype(rhs.dtype):
+    if isinstance(rhs.dtype, cudf.Timedelta):
         out_dtype = determine_out_dtype(lhs.dtype, rhs.dtype)
-    elif pd.api.types.is_datetime64_dtype(rhs.dtype):
+    elif isinstance(rhs.dtype, cudf.Datetime):
         units = ["s", "ms", "us", "ns"]
         lhs_time_unit = cudf.utils.dtypes.get_time_unit(lhs)
         lhs_unit = units.index(lhs_time_unit)
         rhs_time_unit = cudf.utils.dtypes.get_time_unit(rhs)
         rhs_unit = units.index(rhs_time_unit)
-        out_dtype = np.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]")
+        out_dtype = cudf.dtype(np.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]"))
     else:
         raise TypeError(
             f"Addition of {lhs.dtype} with {rhs.dtype} "
@@ -603,19 +602,15 @@ def _timedelta_binary_op_add(lhs, rhs):
 
 
 def _timedelta_binary_op_sub(lhs, rhs):
-    if pd.api.types.is_timedelta64_dtype(
-        lhs.dtype
-    ) and pd.api.types.is_timedelta64_dtype(rhs.dtype):
+    if isinstance(lhs.dtype, cudf.Timedelta) and isinstance(rhs.dtype, cudf.Timedelta):
         out_dtype = determine_out_dtype(lhs.dtype, rhs.dtype)
-    elif pd.api.types.is_timedelta64_dtype(
-        rhs.dtype
-    ) and pd.api.types.is_datetime64_dtype(lhs.dtype):
+    elif isinstance(rhs.dtype, cudf.Timedelta) and isinstance(lhs.dtype, cudf.Datetime):
         units = ["s", "ms", "us", "ns"]
         lhs_time_unit = cudf.utils.dtypes.get_time_unit(lhs)
         lhs_unit = units.index(lhs_time_unit)
         rhs_time_unit = cudf.utils.dtypes.get_time_unit(rhs)
         rhs_unit = units.index(rhs_time_unit)
-        out_dtype = np.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]")
+        out_dtype = cudf.dtype(np.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]"))
     else:
         raise TypeError(
             f"Subtraction of {lhs.dtype} with {rhs.dtype} "
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 791dc74be54..4fb67ce1424 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -26,6 +26,10 @@
     pa.timestamp("us"): np.dtype("datetime64[us]"),
     pa.timestamp("ms"): np.dtype("datetime64[ms]"),
     pa.timestamp("s"): np.dtype("datetime64[s]"),
+    pa.duration("ns"): np.dtype('timedelta64[ns]'),
+    pa.duration("us"): np.dtype('timedelta64[us]'),
+    pa.duration("ms"): np.dtype('timedelta64[ms]'),
+    pa.duration("s"): np.dtype('timedelta64[s]'),
 }
 
 pa_to_np_dtypes = {
@@ -45,6 +49,10 @@
     pa.timestamp("us"): np.dtype("datetime64[us]"),
     pa.timestamp("ms"): np.dtype("datetime64[ms]"),
     pa.timestamp("s"): np.dtype("datetime64[s]"),
+    pa.duration("ns"): np.dtype('timedelta64[ns]'),
+    pa.duration("us"): np.dtype('timedelta64[us]'),
+    pa.duration("ms"): np.dtype('timedelta64[ms]'),
+    pa.duration("s"): np.dtype('timedelta64[s]'),
     None: None,
 }
 
@@ -192,7 +200,6 @@ def __init__(self):
 
 
 class BooleanDtype(Generic):
-    is_boolean = True
 
     def __init__(self):
         self.pa_type = pa.bool_()
@@ -289,6 +296,8 @@ def make_dtype_from_string(obj):
             return Float32Dtype()
     elif "bool" in obj:
         return BooleanDtype()
+    elif "category" in obj:
+        return "category"
 
 
 def make_dtype_from_numpy(obj):
@@ -300,6 +309,8 @@ def make_dtype_from_numpy(obj):
 def dtype(obj):
     if obj is None:
         return None
+    if isinstance(obj, pd.CategoricalDtype):
+        return cudf.CategoricalDtype.from_pandas(obj)
     if isinstance(obj, CategoricalDtype):
         return obj
     elif isinstance(obj, Generic):
@@ -336,6 +347,9 @@ def __init__(self, categories=None, ordered=None):
         self._categories = self._init_categories(categories)
         self.ordered = ordered
 
+    def __repr__(self):
+        return self.to_pandas().__repr__()
+
     @property
     def categories(self):
         if self._categories is None:
diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py
index 0aadcf875cb..459d8e215c4 100644
--- a/python/cudf/cudf/core/join/join.py
+++ b/python/cudf/cudf/core/join/join.py
@@ -356,7 +356,7 @@ def input_to_libcudf_casting_rules(self, lcol, rcol, how):
             dtype_r, CategoricalDtype
         ):
             # categories are not equal
-            libcudf_join_type = np.dtype("O")
+            libcudf_join_type = cudf.StringDtype()
         elif how == "left":
             check_col = rcol.fillna(0)
             if not check_col.can_cast_safely(dtype_l):
@@ -393,20 +393,20 @@ def input_to_libcudf_casting_rules(self, lcol, rcol, how):
                 raise ValueError(ctgry_err.format(lcol, "left"))
             libcudf_join_type = rcol.cat().categories.dtype
         elif how in {"inner", "outer"}:
-            if (np.issubdtype(dtype_l, np.number)) and (
-                np.issubdtype(dtype_r, np.number)
+            if (isinstance(dtype_l, cudf.Number)) and (
+                isinstance(dtype_r, cudf.Number)
             ):
                 if dtype_l.kind == dtype_r.kind:
                     # both ints or both floats
-                    libcudf_join_type = max(dtype_l, dtype_r)
+                    libcudf_join_type = cudf.dtype(max(dtype_l.to_numpy, dtype_r.to_numpy))
                 else:
-                    libcudf_join_type = np.find_common_type(
-                        [], [dtype_l, dtype_r]
-                    )
-            elif np.issubdtype(dtype_l, np.datetime64) and np.issubdtype(
-                dtype_r, np.datetime64
+                    libcudf_join_type = cudf.dtype(np.find_common_type(
+                        [], [dtype_l.to_numpy, dtype_r.to_numpy]
+                    ))
+            elif isinstance(dtype_l, cudf.Datetime) and isinstance(
+                dtype_r, cudf.Datetime
             ):
-                libcudf_join_type = max(dtype_l, dtype_r)
+                libcudf_join_type = cudf.dtype(max(dtype_l, dtype_r))
         return libcudf_join_type
 
     def libcudf_to_output_casting_rules(self, lcol, rcol, how):
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index f31e43b179b..83720446ab0 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -1588,9 +1588,9 @@ def __invert__(self):
 
         Returns a new Series.
         """
-        if np.issubdtype(self.dtype, np.integer):
+        if isinstance(self.dtype, cudf.Integer):
             return self._unaryop("invert")
-        elif np.issubdtype(self.dtype, np.bool_):
+        elif isinstance(self.dtype, cudf.BooleanDtype):
             return self._unaryop("not")
         else:
             raise TypeError(
diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py
index 3fb6120f53d..7ac4df4e514 100644
--- a/python/cudf/cudf/tests/test_column.py
+++ b/python/cudf/cudf/tests/test_column.py
@@ -55,10 +55,11 @@ def test_column_offset_and_size(pandas_input, offset, size):
         assert col.size == col.codes.size
         assert col.size == (col.codes.data.size / col.codes.dtype.itemsize)
     elif isinstance(col.dtype, cudf.StringDtype):
-        assert col.size == (col.children[0].size - 1)
-        assert col.size == (
-            (col.children[0].data.size / col.children[0].dtype.itemsize) - 1
-        )
+        if col.size > 0:
+            assert col.size == (col.children[0].size - 1)
+            assert col.size == (
+                (col.children[0].data.size / col.children[0].dtype.itemsize) - 1
+            )
     else:
         assert col.size == (col.data.size / col.dtype.itemsize)
 
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 11596163c32..9d9bece3e10 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -70,6 +70,8 @@
 def np_to_pa_dtype(dtype):
     """Util to convert numpy dtype to PyArrow dtype.
     """
+    if isinstance(dtype, cudf.Generic):
+        return dtype.pa_type
     # special case when dtype is np.datetime64
     if dtype.kind == "M":
         time_unit, _ = np.datetime_data(dtype)
@@ -102,7 +104,7 @@ def get_numeric_type_info(dtype):
 def numeric_normalize_types(*args):
     """Cast all args to a common type using numpy promotion logic
     """
-    dtype = np.result_type(*[a.dtype for a in args])
+    dtype = np.result_type(*[a.dtype.to_numpy for a in args])
     return [a.astype(dtype) for a in args]
 
 
@@ -255,6 +257,8 @@ def to_cudf_compatible_scalar(val, dtype=None):
     val = pd.api.types.pandas_dtype(type(val)).type(val)
 
     if dtype is not None:
+        if isinstance(dtype, cudf.Generic):
+            dtype = dtype.to_numpy
         val = val.astype(dtype)
 
     if val.dtype.type is np.datetime64:
@@ -381,25 +385,27 @@ def min_column_type(x, expected_type):
     If the column is not a subtype of `np.signedinteger` or `np.floating`
     returns the same dtype as the dtype of `x` without modification
     """
-
     if not isinstance(x, cudf.core.column.NumericalColumn):
         raise TypeError("Argument x must be of type column.NumericalColumn")
     if x.valid_count == 0:
         return x.dtype
+    x_np_dtype = x.dtype.to_numpy
+    expected_type = cudf.dtype(expected_type).to_numpy
 
-    if np.issubdtype(x.dtype, np.floating):
+    if np.issubdtype(x_np_dtype, np.floating):
         max_bound_dtype = np.min_scalar_type(x.max())
         min_bound_dtype = np.min_scalar_type(x.min())
         result_type = np.promote_types(max_bound_dtype, min_bound_dtype)
         if result_type == np.dtype("float16"):
             # cuDF does not support float16 dtype
             result_type = np.dtype("float32")
-        return result_type
+        return cudf.dtype(result_type)
 
     if np.issubdtype(expected_type, np.integer):
         max_bound_dtype = np.min_scalar_type(x.max())
         min_bound_dtype = np.min_scalar_type(x.min())
-        return np.promote_types(max_bound_dtype, min_bound_dtype)
+        result = np.promote_types(max_bound_dtype, min_bound_dtype)
+        return cudf.dtype(result)
 
     return x.dtype
 

From 59b3673aed72465c135e43ca405150ae51d1b520 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 26 Aug 2020 05:37:11 -0700
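Subject: [PATCH 24/80] all column tests pass

* timedelta.py: key _dtype_to_format_conversion by names of the form
  "Timedelta64NS", index _numeric_to_str_typecast_functions by self.dtype
  directly, and remove the pdb.set_trace() left in binary_operator by
  PATCH 23/80.
* dtypes.py: make_dtype_from_string now handles the strings
  "timedelta64[ns]", "timedelta64[us]", "timedelta64[ms]" and
  "timedelta64[s]".
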
---
 python/cudf/cudf/core/column/timedelta.py | 12 +++++-------
 python/cudf/cudf/core/dtypes.py           | 10 +++++++++-
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index 644c06bcd80..2ab0fadae82 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -16,10 +16,10 @@
 from cudf.utils.utils import buffers_from_pyarrow
 
 _dtype_to_format_conversion = {
-    "timedelta64[ns]": "%D days %H:%M:%S",
-    "timedelta64[us]": "%D days %H:%M:%S",
-    "timedelta64[ms]": "%D days %H:%M:%S",
-    "timedelta64[s]": "%D days %H:%M:%S",
+    "Timedelta64NS": "%D days %H:%M:%S",
+    "Timedelta64US": "%D days %H:%M:%S",
+    "Timedelta64MS": "%D days %H:%M:%S",
+    "Timedelta64S": "%D days %H:%M:%S",
 }
 
 
@@ -232,8 +232,6 @@ def binary_operator(self, op, rhs, reflect=False):
 
         if reflect:
             lhs, rhs = rhs, lhs
-        import pdb
-        pdb.set_trace()
         return binop(lhs, rhs, op=op, out_dtype=out_dtype)
 
     def normalize_binop_value(self, other):
@@ -342,7 +340,7 @@ def as_string_column(self, dtype, **kwargs):
             kwargs["format"] = fmt
         if len(self) > 0:
             return string._numeric_to_str_typecast_functions[
-                np.dtype(self.dtype)
+                self.dtype
             ](self, **kwargs)
         else:
             return column.column_empty(0, dtype="object", masked=False)
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 4fb67ce1424..0931b3af5c4 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -298,7 +298,15 @@ def make_dtype_from_string(obj):
         return BooleanDtype()
     elif "category" in obj:
         return "category"
-
+    elif "timedelta" in obj:
+        if obj == 'timedelta64[ns]':
+            return Timedelta64NSDtype()
+        if obj == 'timedelta64[us]':
+            return Timedelta64USDtype()
+        if obj == 'timedelta64[ms]':
+            return Timedelta64MSDtype()
+        if obj == 'timedelta64[s]':
+            return Timedelta64SDtype()
 
 def make_dtype_from_numpy(obj):
     np_to_pd_types = {v: k for k, v in pd_to_np_dtypes.items()}

From 297a31a484ef0f59c78baca2eaacb07b7423fa67 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 26 Aug 2020 06:27:47 -0700
Subject: [PATCH 25/80] move more stuff to cudf.api.types

---
 python/cudf/cudf/_lib/column.pyx            |  2 +-
 python/cudf/cudf/_lib/parquet.pyx           |  3 +-
 python/cudf/cudf/_lib/transpose.pyx         |  2 +-
 python/cudf/cudf/core/column/categorical.py |  2 +-
 python/cudf/cudf/core/column/column.py      |  6 +-
 python/cudf/cudf/core/column/lists.py       |  2 +-
 python/cudf/cudf/core/column/string.py      |  7 +--
 python/cudf/cudf/core/dataframe.py          |  4 +-
 python/cudf/cudf/core/frame.py              |  9 ++-
 python/cudf/cudf/core/index.py              |  2 +-
 python/cudf/cudf/core/indexing.py           |  2 +-
 python/cudf/cudf/core/reshape.py            |  4 +-
 python/cudf/cudf/core/series.py             |  3 +-
 python/cudf/cudf/testing/testing.py         |  2 +-
 python/cudf/cudf/utils/dtypes.py            | 64 +--------------------
 15 files changed, 23 insertions(+), 91 deletions(-)

diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
index 5021778be44..ee9978316fa 100644
--- a/python/cudf/cudf/_lib/column.pyx
+++ b/python/cudf/cudf/_lib/column.pyx
@@ -7,7 +7,7 @@ import rmm
 
 import cudf
 from cudf.core.buffer import Buffer
-from cudf.utils.dtypes import is_categorical_dtype, is_list_dtype
+from cudf.api.types import is_categorical_dtype, is_list_dtype
 import cudf._lib as libcudfxx
 
 from cpython.buffer cimport PyObject_CheckBuffer
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 26e2e02402c..fd7e2cd847c 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -11,7 +11,8 @@ import json
 from cython.operator import dereference
 import numpy as np
 
-from cudf.utils.dtypes import np_to_pa_dtype, is_categorical_dtype
+from cudf.utils.dtypes import np_to_pa_dtype
+from cudf.api.types import is_categorical_dtype
 from libc.stdlib cimport free
 from libc.stdint cimport uint8_t
 from libcpp.memory cimport shared_ptr, unique_ptr, make_unique
diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx
index 1c31e3f5d3f..ad4edebf1cf 100644
--- a/python/cudf/cudf/_lib/transpose.pyx
+++ b/python/cudf/cudf/_lib/transpose.pyx
@@ -1,7 +1,7 @@
 # Copyright (c) 2020, NVIDIA CORPORATION.
 
 import cudf
-from cudf.utils.dtypes import is_categorical_dtype
+from cudf.api.types import is_categorical_dtype
 
 from libcpp.memory cimport unique_ptr
 from libcpp.pair cimport pair
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 5185660c13c..2f1e677b898 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -14,11 +14,11 @@
 from cudf.core.column.methods import ColumnMethodsMixin
 from cudf.core.dtypes import CategoricalDtype
 from cudf.utils.dtypes import (
-    is_categorical_dtype,
     is_mixed_with_object_dtype,
     min_signed_type,
     min_unsigned_type,
 )
+from cudf.api.types import is_categorical_dtype
 
 
 class CategoricalAccessor(ColumnMethodsMixin):
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 33c0b98d203..02269c71b85 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -30,15 +30,13 @@
     NUMERIC_TYPES,
     check_cast_unsupported_dtype,
     get_time_unit,
-    is_categorical_dtype,
-    is_list_dtype,
-    is_numerical_dtype,
     is_scalar,
-    is_string_dtype,
     min_unsigned_type,
     np_to_pa_dtype,
 )
 from cudf.utils.utils import mask_dtype
+from cudf.api.types import is_categorical_dtype, is_list_dtype, is_numerical_dtype, is_string_dtype
+
 
 class ColumnBase(Column, Serializable):
     def __init__(
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index f33a5923a74..13552e20647 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -5,7 +5,7 @@
 from cudf.core.column import ColumnBase
 from cudf.core.column.methods import ColumnMethodsMixin
 from cudf.core.dtypes import ListDtype
-from cudf.utils.dtypes import is_list_dtype
+from cudf.api.types import is_list_dtype
 from cudf.utils.utils import buffers_from_pyarrow
 
 
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index be96e18c148..dae0b8ef6c1 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -140,13 +140,10 @@
 from cudf.utils import utils
 from cudf.utils.docutils import copy_docstring
 from cudf.utils.dtypes import (
-    can_convert_to_column,
-    is_list_dtype,
-    is_scalar,
-    is_string_dtype,
+    can_convert_to_column, is_scalar
 )
 from cudf.utils.utils import buffers_from_pyarrow
-
+from cudf.api.types import is_list_dtype, is_string_dtype
 from cudf.core.dtypes import dtype
 
 _str_to_numeric_typecast_functions = {
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 1023db69104..76f77d52f2f 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -40,13 +40,11 @@
 from cudf.utils.docutils import copy_docstring
 from cudf.utils.dtypes import (
     cudf_dtype_from_pydata_dtype,
-    is_categorical_dtype,
-    is_list_dtype,
     is_list_like,
     is_scalar,
-    is_string_dtype,
     numeric_normalize_types,
 )
+from cudf.api.types import is_categorical_dtype, is_list_dtype, is_string_dtype
 from cudf.utils.utils import OrderedColumnDict
 
 
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 8d826977ba7..7251f364dad 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -15,12 +15,11 @@
 from cudf.core.column import as_column, build_categorical_column
 from cudf.utils import utils
 from cudf.utils.dtypes import (
-    is_categorical_dtype,
     is_column_like,
-    is_numerical_dtype,
     is_scalar,
     min_scalar_type,
 )
+from cudf.api.types import is_numerical_dtype, is_categorical_dtype
 
 
 class Frame(libcudf.table.Table):
@@ -270,9 +269,9 @@ def find_common_dtypes_and_categories(non_null_columns, dtypes):
                 dtypes[idx] = cols[0].dtype
                 # If all the non-null dtypes are int/float, find a common dtype
                 if all(is_numerical_dtype(col.dtype) for col in cols):
-                    dtypes[idx] = np.find_common_type(
-                        [col.dtype for col in cols], []
-                    )
+                    dtypes[idx] = cudf.dtype(np.find_common_type(
+                        [col.dtype.to_numpy for col in cols], []
+                    ))
                 # If all categorical dtypes, combine the categories
                 elif all(
                     isinstance(col, cudf.core.column.CategoricalColumn)
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 2663ad8e22d..e5a89a23077 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -26,12 +26,12 @@
 from cudf.utils import ioutils, utils
 from cudf.utils.docutils import copy_docstring
 from cudf.utils.dtypes import (
-    is_categorical_dtype,
     is_list_like,
     is_mixed_with_object_dtype,
     is_scalar,
     numeric_normalize_types,
 )
+from cudf.api.types import is_categorical_dtype
 from cudf.utils.utils import cached_property
 from cudf.core.dtypes import dtype
 
diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py
index ce3c6806d54..5f6d4a69bd5 100755
--- a/python/cudf/cudf/core/indexing.py
+++ b/python/cudf/cudf/core/indexing.py
@@ -6,12 +6,12 @@
 import cudf
 from cudf._lib.nvtx import annotate
 from cudf.utils.dtypes import (
-    is_categorical_dtype,
     is_column_like,
     is_list_like,
     is_scalar,
     to_cudf_compatible_scalar,
 )
+from cudf.api.types import is_categorical_dtype
 
 
 def indices_from_labels(obj, labels):
diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index c2603a8d177..b423a46b88b 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -10,8 +10,8 @@
     build_categorical_column,
 )
 from cudf.utils import cudautils
-from cudf.utils.dtypes import is_categorical_dtype, is_list_like
-
+from cudf.utils.dtypes import is_list_like
+from cudf.api.types import is_categorical_dtype
 _axis_map = {0: 0, 1: 1, "index": 0, "columns": 1}
 
 
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 83720446ab0..4a2423ecd99 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -42,14 +42,13 @@
 from cudf.utils.docutils import copy_docstring
 from cudf.utils.dtypes import (
     can_convert_to_column,
-    is_list_dtype,
     is_list_like,
     is_mixed_with_object_dtype,
     is_scalar,
-    is_string_dtype,
     min_scalar_type,
     numeric_normalize_types,
 )
+from cudf.api.types import is_list_dtype, is_string_dtype
 
 
 class Series(Frame, Serializable):
diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py
index eba8d4c7f62..1dcf43c3367 100644
--- a/python/cudf/cudf/testing/testing.py
+++ b/python/cudf/cudf/testing/testing.py
@@ -5,7 +5,7 @@
 import pandas as pd
 
 import cudf
-from cudf.utils.dtypes import is_categorical_dtype
+from cudf.api.types import is_categorical_dtype
 
 
 def _check_isinstance(left, right, obj):
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 9d9bece3e10..29e767fe179 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -12,6 +12,8 @@
 
 import cudf
 from cudf._lib.scalar import Scalar
+from cudf.api.types import is_categorical_dtype
+
 
 _NA_REP = "<NA>"
 _np_pa_dtypes = {
@@ -131,68 +133,6 @@ def is_datetime_dtype(obj):
         return False
     return "M8" in obj.str
 
-
-def is_categorical_dtype(obj):
-    """Infer whether a given pandas, numpy, or cuDF Column, Series, or dtype
-    is a pandas CategoricalDtype.
-    """
-    if obj is None:
-        return False
-    if isinstance(obj, cudf.CategoricalDtype):
-        return True
-    if obj is cudf.CategoricalDtype:
-        return True
-    if isinstance(obj, np.dtype):
-        return False
-    if isinstance(obj, CategoricalDtype):
-        return True
-    if obj is CategoricalDtype:
-        return True
-    if obj is CategoricalDtypeType:
-        return True
-    if isinstance(obj, str) and obj == "category":
-        return True
-    if isinstance(
-        obj,
-        (
-            CategoricalDtype,
-            cudf.core.index.CategoricalIndex,
-            cudf.core.column.CategoricalColumn,
-            pd.Categorical,
-            pd.CategoricalIndex,
-        ),
-    ):
-        return True
-    if isinstance(obj, np.ndarray):
-        return False
-    if isinstance(
-        obj,
-        (
-            cudf.Index,
-            cudf.Series,
-            cudf.core.column.ColumnBase,
-            pd.Index,
-            pd.Series,
-        ),
-    ):
-        return is_categorical_dtype(obj.dtype)
-    if hasattr(obj, "type"):
-        if obj.type is CategoricalDtypeType:
-            return True
-    return pd.api.types.is_categorical_dtype(obj)
-
-
-def is_list_dtype(obj):
-    return (
-        type(obj) is cudf.core.dtypes.ListDtype
-        or obj is cudf.core.dtypes.ListDtype
-        or type(obj) is cudf.core.column.ListColumn
-        or obj is cudf.core.column.ListColumn
-        or (isinstance(obj, str) and obj == cudf.core.dtypes.ListDtype.name)
-        or (hasattr(obj, "dtype") and is_list_dtype(obj.dtype))
-    )
-
-
 def cudf_dtype_from_pydata_dtype(dtype):
     """ Given a numpy or pandas dtype, converts it into the equivalent cuDF
         Python dtype.

From e5def6e162b00966e3983fdbca3381cfc5858cb6 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 26 Aug 2020 06:34:56 -0700
Subject: [PATCH 26/80] forgot entire api/ folder

---
 python/cudf/cudf/api/__init__.py |  0
 python/cudf/cudf/api/types.py    | 85 ++++++++++++++++++++++++++++++++
 2 files changed, 85 insertions(+)
 create mode 100644 python/cudf/cudf/api/__init__.py
 create mode 100644 python/cudf/cudf/api/types.py

diff --git a/python/cudf/cudf/api/__init__.py b/python/cudf/cudf/api/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py
new file mode 100644
index 00000000000..143930332b1
--- /dev/null
+++ b/python/cudf/cudf/api/types.py
@@ -0,0 +1,85 @@
+import pandas as pd
+import cudf
+import numpy as np
+from pandas.core.dtypes.dtypes import CategoricalDtype, CategoricalDtypeType
+
+def is_datetime64_dtype(obj):  # True for cudf.Datetime instances; anything else is decided by pandas
+    return isinstance(obj, cudf.Datetime) or pd.api.types.is_datetime64_dtype(obj)
+
+def is_timedelta64_dtype(obj):  # True for cudf.Timedelta instances; anything else is decided by pandas
+    return isinstance(obj, cudf.Timedelta) or pd.api.types.is_timedelta64_dtype(obj)
+
+def is_string_dtype(obj):  # cudf.StringDtype instances, or whatever pandas calls string-like unless it is categorical
+    return isinstance(obj, cudf.StringDtype) or (pd.api.types.is_string_dtype(obj) and not is_categorical_dtype(obj))
+
+def is_numerical_dtype(obj):
+    if isinstance(obj, cudf.Generic):
+        return isinstance(obj, cudf.Number)
+    if is_categorical_dtype(obj):
+        return False
+    if is_list_dtype(obj):
+        return False
+    return (  # remaining inputs are classified through NumPy's dtype hierarchy
+        np.issubdtype(obj, np.bool_)
+        or np.issubdtype(obj, np.floating)
+        or np.issubdtype(obj, np.signedinteger)  # NOTE(review): unsigned integer dtypes match none of these three checks -- confirm intended
+    )
+
+def is_categorical_dtype(obj):
+    """Infer whether a given pandas, numpy, or cuDF Column, Series, or dtype
+    is a pandas CategoricalDtype.
+    """
+    if obj is None:
+        return False
+    if isinstance(obj, cudf.CategoricalDtype):
+        return True
+    if obj is cudf.CategoricalDtype:  # the class object itself, not an instance
+        return True
+    if isinstance(obj, np.dtype):  # a NumPy dtype is never reported as categorical
+        return False
+    if isinstance(obj, CategoricalDtype):  # pandas CategoricalDtype instance
+        return True
+    if obj is CategoricalDtype:  # the pandas class object itself
+        return True
+    if obj is CategoricalDtypeType:
+        return True
+    if isinstance(obj, str) and obj == "category":  # only the exact name "category" is accepted
+        return True
+    if isinstance(  # categorical containers (CategoricalDtype was already handled above)
+        obj,
+        (
+            CategoricalDtype,
+            cudf.core.index.CategoricalIndex,
+            cudf.core.column.CategoricalColumn,
+            pd.Categorical,
+            pd.CategoricalIndex,
+        ),
+    ):
+        return True
+    if isinstance(obj, np.ndarray):  # a NumPy array is never reported as categorical
+        return False
+    if isinstance(  # other Index/Series/Column objects are judged by their dtype
+        obj,
+        (
+            cudf.Index,
+            cudf.Series,
+            cudf.core.column.ColumnBase,
+            pd.Index,
+            pd.Series,
+        ),
+    ):
+        return is_categorical_dtype(obj.dtype)
+    if hasattr(obj, "type"):  # objects exposing a ``type`` attribute are matched on it
+        if obj.type is CategoricalDtypeType:
+            return True
+    return pd.api.types.is_categorical_dtype(obj)  # everything else is decided by pandas
+
+def is_list_dtype(obj):  # list check for dtypes, columns, dtype names and objects carrying a ``dtype``
+    return (
+        type(obj) is cudf.core.dtypes.ListDtype  # exact type comparison, so subclasses do not match
+        or obj is cudf.core.dtypes.ListDtype  # the class object itself
+        or type(obj) is cudf.core.column.ListColumn
+        or obj is cudf.core.column.ListColumn
+        or (isinstance(obj, str) and obj == cudf.core.dtypes.ListDtype.name)  # string equal to ListDtype.name
+        or (hasattr(obj, "dtype") and is_list_dtype(obj.dtype))  # recurse into the object's dtype
+    )

From b4d344f7d405390fb2ba4067ef14d3387645be90 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 26 Aug 2020 06:52:23 -0700
Subject: [PATCH 27/80] fix mutable_column_view

---
 python/cudf/cudf/_lib/column.pyx | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
index ee9978316fa..9953c8924e6 100644
--- a/python/cudf/cudf/_lib/column.pyx
+++ b/python/cudf/cudf/_lib/column.pyx
@@ -307,14 +307,8 @@ cdef class Column:
             col = self.base_children[0]
         else:
             col = self
-        data_dtype = col.dtype
-
-        cdef libcudf_types.type_id tid = <libcudf_types.type_id> (
-            <underlying_type_t_type_id> (
-                np_to_cudf_types[np.dtype(data_dtype)]
-            )
-        )
-        cdef libcudf_types.data_type dtype = libcudf_types.data_type(tid)
+        cdef _Dtype pydtype = col.dtype
+        cdef libcudf_types.data_type dtype = pydtype.get_libcudf_type()
         cdef libcudf_types.size_type offset = self.offset
         cdef vector[mutable_column_view] children
         cdef void* data

From 22fd5d94941f03bd7a8ea1f667ad2752684a1cab Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 26 Aug 2020 14:24:47 -0700
Subject: [PATCH 28/80] working through dataframe.py tests

---
 python/cudf/cudf/_lib/reduce.pyx          |  2 +-
 python/cudf/cudf/api/types.py             |  4 +++-
 python/cudf/cudf/core/column/column.py    |  5 ++---
 python/cudf/cudf/core/column/datetime.py  |  2 +-
 python/cudf/cudf/core/column/numerical.py | 16 ++++++++++------
 python/cudf/cudf/core/dataframe.py        | 22 +++++++++++-----------
 python/cudf/cudf/core/frame.py            |  4 +++-
 python/cudf/cudf/core/index.py            |  5 ++---
 python/cudf/cudf/core/series.py           |  8 ++------
 9 files changed, 35 insertions(+), 33 deletions(-)

diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx
index 39aad31f570..69592894cae 100644
--- a/python/cudf/cudf/_lib/reduce.pyx
+++ b/python/cudf/cudf/_lib/reduce.pyx
@@ -34,7 +34,7 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs):
     col_dtype = incol.dtype
     if reduction_op in ['sum', 'sum_of_squares', 'product']:
         col_dtype = np.find_common_type([col_dtype.to_numpy], [np.uint64])
-    col_dtype = cudf_dtype(col_dtype)
+    col_dtype = cudf_dtype(col_dtype) if dtype is None else cudf_dtype(dtype)
 
     cdef column_view c_incol_view = incol.view()
     cdef unique_ptr[scalar] c_result
diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py
index 143930332b1..ea6e503782c 100644
--- a/python/cudf/cudf/api/types.py
+++ b/python/cudf/cudf/api/types.py
@@ -14,7 +14,7 @@ def is_string_dtype(obj):
 
 def is_numerical_dtype(obj):
     if isinstance(obj, cudf.Generic):
-        return isinstance(obj, cudf.Number)
+        return isinstance(obj, (cudf.Number, cudf.BooleanDtype))
     if is_categorical_dtype(obj):
         return False
     if is_list_dtype(obj):
@@ -29,6 +29,8 @@ def is_categorical_dtype(obj):
     """Infer whether a given pandas, numpy, or cuDF Column, Series, or dtype
     is a pandas CategoricalDtype.
     """
+    if isinstance(obj, cudf.Generic) and not isinstance(obj, cudf.CategoricalDtype):
+        return False
     if obj is None:
         return False
     if isinstance(obj, cudf.CategoricalDtype):
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 02269c71b85..2786ca45124 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -200,7 +200,7 @@ def _concat(cls, objs, dtype=None):
                 [
                     o
                     for o in not_null_cols
-                    if not isinstance(o.dtype, (cudf.Number, cudf.Datetime))
+                    if not isinstance(o.dtype, (cudf.Number)) or isinstance(o.dtype, cudf.Datetime)
                 ]
             )
             == 0
@@ -1421,8 +1421,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
             if pa.types.is_dictionary(pa_type):
                 new_dtype = "category"
             else:
-                new_dtype = np.dtype(pa_type.to_pandas_dtype())
-
+                new_dtype = cudf.dtype(pa_type)
         data = ColumnBase._concat(gpu_cols, dtype=new_dtype)
 
     elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 807f0803e7f..c205b841af5 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -151,7 +151,7 @@ def as_numerical(self):
         )
 
     def as_datetime_column(self, dtype, **kwargs):
-        dtype = np.dtype(dtype)
+        dtype = cudf.dtype(dtype)
         if dtype == self.dtype:
             return self
         return libcudf.unary.cast(self, dtype=dtype)
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index b0e4c563fd8..92dab1a15de 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -17,7 +17,7 @@
     numeric_normalize_types,
 )
 from cudf.utils.utils import buffers_from_pyarrow
-
+from cudf.core.dtypes import Float64Dtype
 class NumericalColumn(column.ColumnBase):
     def __init__(
         self, data, dtype, mask=None, size=None, offset=0, null_count=None
@@ -200,18 +200,22 @@ def to_arrow(self):
             return out
 
     def sum(self, dtype=None):
-        return libcudf.reduce.reduce("sum", self, dtype=dtype)
+        """Return the libcudf "sum" reduction of this column.
+
+        ``dtype`` is passed to the reduction as-is; errors reach the caller.
+        """
+        return libcudf.reduce.reduce("sum", self, dtype=dtype)
 
     def product(self, dtype=None):
         return libcudf.reduce.reduce("product", self, dtype=dtype)
 
-    def mean(self, dtype=np.float64):
-        return libcudf.reduce.reduce("mean", self, dtype=dtype)
+    def mean(self, dtype=Float64Dtype()):  # libcudf "mean" reduction; dtype defaults to Float64Dtype()
+        return libcudf.reduce.reduce("mean", self, dtype=dtype)
 
-    def var(self, ddof=1, dtype=np.float64):
+    def var(self, ddof=1, dtype=Float64Dtype()):
         return libcudf.reduce.reduce("var", self, dtype=dtype, ddof=ddof)
 
-    def std(self, ddof=1, dtype=np.float64):
+    def std(self, ddof=1, dtype=Float64Dtype()):
         return libcudf.reduce.reduce("std", self, dtype=dtype, ddof=ddof)
 
     def sum_of_squares(self, dtype=None):
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 76f77d52f2f..2717d64c0d4 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -3179,11 +3179,11 @@ def as_gpu_matrix(self, columns=None, order="F"):
             return cuda.as_cuda_array(matrix)
 
         if any(
-            (is_categorical_dtype(c) or np.issubdtype(c, np.dtype("object")))
+            (is_categorical_dtype(c) or isinstance(c.dtype, cudf.StringDtype))
             for c in cols
         ):
             raise TypeError("non-numeric data not yet supported")
-        dtype = np.find_common_type(cols, [])
+        dtype = np.find_common_type([c.dtype.to_numpy for c in cols], [])
         for k, c in self._data.items():
             if c.has_nulls:
                 errmsg = (
@@ -6003,7 +6003,7 @@ def kurtosis(
             msg = "Kurtosis only supports int, float, and bool dtypes."
             raise NotImplementedError(msg)
 
-        self = self.select_dtypes(include=[np.number, np.bool])
+        self = self.select_dtypes(include=[cudf.Number(), cudf.BooleanDtype()])
         return self._apply_support_method(
             "kurtosis",
             axis=axis,
@@ -6313,7 +6313,7 @@ def select_dtypes(self, include=None, exclude=None):
             )
 
         include, exclude = map(
-            lambda x: frozenset(map(cudf_dtype_from_pydata_dtype, x)),
+            lambda x: frozenset(map(cudf.dtype, x)),
             selection,
         )
 
@@ -6332,9 +6332,9 @@ def select_dtypes(self, include=None, exclude=None):
                 # category handling
                 if is_categorical_dtype(i_dtype):
                     include_subtypes.add(i_dtype)
-                elif issubclass(dtype.type, i_dtype):
-                    include_subtypes.add(dtype.type)
-
+                elif issubclass(dtype, i_dtype):
+                    include_subtypes.add(dtype)
+    
         # exclude all subtypes
         exclude_subtypes = set()
         for dtype in self.dtypes:
@@ -6342,11 +6342,11 @@ def select_dtypes(self, include=None, exclude=None):
                 # category handling
                 if is_categorical_dtype(e_dtype):
                     exclude_subtypes.add(e_dtype)
-                elif issubclass(dtype.type, e_dtype):
-                    exclude_subtypes.add(dtype.type)
+                elif issubclass(dtype, e_dtype):
+                    exclude_subtypes.add(dtype)
 
         include_all = set(
-            [cudf_dtype_from_pydata_dtype(d) for d in self.dtypes]
+            [cudf.dtype(d) for d in self.dtypes]
         )
 
         if include:
@@ -6359,7 +6359,7 @@ def select_dtypes(self, include=None, exclude=None):
         inclusion = inclusion - exclude_subtypes
 
         for k, col in self._data.items():
-            infered_type = cudf_dtype_from_pydata_dtype(col.dtype)
+            infered_type = cudf.dtype(col.dtype)
             if infered_type in inclusion:
                 df.insert(len(df._data), k, col)
 
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 7251f364dad..f9f36ed1798 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -289,8 +289,10 @@ def find_common_dtypes_and_categories(non_null_columns, dtypes):
                     dtypes[idx] = min_scalar_type(len(categories[idx]))
                 # Otherwise raise an error if columns have different dtypes
                 elif not all(
-                    is_dtype_equal(c.dtype, dtypes[idx]) for c in cols
+                    c.dtype == dtypes[idx] for c in cols
                 ):
+                    import pdb
+                    pdb.set_trace()
                     raise ValueError("All columns must be the same type")
             return categories
 
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index e5a89a23077..217079fbf03 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -1512,7 +1512,7 @@ def _num_rows(self):
     @cached_property
     def _values(self):
         if len(self) > 0:
-            vals = cupy.arange(self._start, self._stop, dtype=self.dtype)
+            vals = cupy.arange(self._start, self._stop, dtype=self.dtype.to_numpy)
             return column.as_column(vals)
         else:
             return column.column_empty(0, masked=False, dtype=self.dtype)
@@ -1625,7 +1625,7 @@ def dtype(self):
         """
         `dtype` of the range of values in RangeIndex.
         """
-        return np.dtype(np.int64)
+        return cudf.Int64Dtype()
 
     @property
     def is_contiguous(self):
@@ -2524,7 +2524,6 @@ def as_index(arbitrary, **kwargs):
         - DatetimeIndex for Datetime input.
         - GenericIndex for all other inputs.
     """
-
     kwargs = _setdefault_name(arbitrary, **kwargs)
     if isinstance(arbitrary, cudf.MultiIndex):
         return arbitrary
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 4a2423ecd99..513a8336f29 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -3099,9 +3099,7 @@ def cumsum(self, axis=0, skipna=True, *args, **kwargs):
                 result_col[first_index:] = None
 
         # pandas always returns int64 dtype if original dtype is int or `bool`
-        if np.issubdtype(result_col.dtype, np.integer) or np.issubdtype(
-            result_col.dtype, np.bool_
-        ):
+        if isinstance(result_col.dtype, (cudf.Integer, cudf.BooleanDtype)):
             return Series(
                 result_col.astype(np.int64)._apply_scan_op("sum"),
                 name=self.name,
@@ -3161,9 +3159,7 @@ def cumprod(self, axis=0, skipna=True, *args, **kwargs):
                 result_col[first_index:] = None
 
         # pandas always returns int64 dtype if original dtype is int or `bool`
-        if np.issubdtype(result_col.dtype, np.integer) or np.issubdtype(
-            result_col.dtype, np.bool_
-        ):
+        if isinstance(result_col.dtype, (cudf.Integer, cudf.BooleanDtype)):
             return Series(
                 result_col.astype(np.int64)._apply_scan_op("product"),
                 name=self.name,

From c5a0b62f02072cf65930d98f839d5ce8d40aa6a3 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 27 Aug 2020 11:12:08 -0700
Subject: [PATCH 29/80] pass join tests

---
 python/cudf/cudf/_lib/string_casting.pyx | 10 +++-------
 python/cudf/cudf/core/column/column.py   |  3 ++-
 python/cudf/cudf/core/column/string.py   |  6 +++---
 python/cudf/cudf/core/dtypes.py          |  6 ++++--
 python/cudf/cudf/core/join/join.py       |  2 +-
 python/cudf/cudf/tests/test_dataframe.py | 14 ++++++--------
 python/cudf/cudf/tests/test_joining.py   |  7 +++++--
 7 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx
index 4dbb2d99db3..aee554017af 100644
--- a/python/cudf/cudf/_lib/string_casting.pyx
+++ b/python/cudf/cudf/_lib/string_casting.pyx
@@ -7,7 +7,7 @@ from cudf._lib.move cimport move
 from cudf._lib.scalar import as_scalar
 from cudf._lib.scalar cimport Scalar
 from cudf._lib.types import np_to_cudf_types
-from cudf._lib.types cimport underlying_type_t_type_id
+from cudf._lib.types cimport underlying_type_t_type_id, _Dtype
 
 from cudf.core.column.column import as_column
 
@@ -555,12 +555,8 @@ def timestamp2int(
     if input_col.size == 0:
         return as_column([], dtype=kwargs.get('dtype'))
     cdef column_view input_column_view = input_col.view()
-    cdef type_id tid = <type_id> (
-        <underlying_type_t_type_id> (
-            np_to_cudf_types[kwargs.get('dtype')]
-        )
-    )
-    cdef data_type out_type = data_type(tid)
+    cdef _Dtype pydtype = kwargs.get('dtype')
+    cdef data_type out_type = pydtype.get_libcudf_type()
     cdef string c_timestamp_format = kwargs.get('format').encode('UTF-8')
     cdef unique_ptr[column] c_result
     with nogil:
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 2786ca45124..fc836f67d45 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -143,7 +143,7 @@ def values(self):
         Return a CuPy representation of the Column.
         """
         if len(self) == 0:
-            return cupy.asarray([], dtype=self.dtype)
+            return cupy.asarray([], dtype=self.dtype.to_numpy)
 
         if self.has_nulls:
             raise ValueError("Column must have no nulls.")
@@ -1632,6 +1632,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
             except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                 if is_categorical_dtype(dtype):
                     sr = pd.Series(arbitrary, dtype="category")
+                    dtype = cudf.CategoricalDtype.from_pandas(sr.dtype)
                     data = as_column(sr, nan_as_null=nan_as_null, dtype=dtype)
                 elif isinstance(cudf.dtype(dtype), cudf.StringDtype):
                     sr = pd.Series(arbitrary, dtype="str")
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index dae0b8ef6c1..96c064c7328 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -4532,7 +4532,7 @@ def _nbytes(self):
 
     def as_numerical_column(self, dtype, **kwargs):
 
-        out_dtype = np.dtype(dtype)
+        out_dtype = cudf.dtype(dtype)
         kwargs.update(dtype=out_dtype)
 
         if out_dtype.type is np.datetime64:
@@ -4554,7 +4554,7 @@ def as_numerical_column(self, dtype, **kwargs):
                 raise ValueError("Could not convert `None` value to datetime")
 
             boolean_match = self.binary_operator("eq", "NaT")
-        elif out_dtype.type is np.timedelta64:
+        elif out_dtype.type is cudf.Timedelta:
             if "format" not in kwargs:
                 if len(self) > 0:
                     kwargs.update(format="%D days %H:%M:%S")
@@ -4577,7 +4577,7 @@ def as_numerical_column(self, dtype, **kwargs):
             self, **kwargs
         )
         if (
-            out_dtype.type in (np.datetime64, np.timedelta64)
+            isinstance(out_dtype, (cudf.Datetime, cudf.Timedelta))
         ) and boolean_match.any():
             result_col[boolean_match] = None
         return result_col
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 0931b3af5c4..6e5e37351e8 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -326,6 +326,8 @@ def dtype(obj):
     elif issubclass(obj.__class__, Generic):
         return obj()
     if isinstance(obj, np.dtype):
+        if obj.type is np.str_:
+            return StringDtype()
         return np_to_cudf_dtypes[obj]
     elif isinstance(obj, pa.lib.DataType):
         return pa_to_cudf_dtypes[obj]
@@ -448,7 +450,7 @@ def deserialize(cls, header, frames):
         return cls(categories=categories, ordered=ordered)
 
 
-class ListDtype(ExtensionDtype):
+class ListDtype(Generic):
 
     name = "list"
 
@@ -483,7 +485,7 @@ def type(self):
 
     @classmethod
     def from_arrow(cls, typ):
-        obj = object.__new__(cls)
+        obj = ListDtype.__new__(ListDtype)
         obj._typ = typ
         return obj
 
diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py
index 459d8e215c4..ebc52490417 100644
--- a/python/cudf/cudf/core/join/join.py
+++ b/python/cudf/cudf/core/join/join.py
@@ -406,7 +406,7 @@ def input_to_libcudf_casting_rules(self, lcol, rcol, how):
             elif isinstance(dtype_l, cudf.Datetime) and isinstance(
                 dtype_r, cudf.Datetime
             ):
-                libcudf_join_type = cudf.dtype(max(dtype_l, dtype_r))
+                libcudf_join_type = cudf.dtype(max(dtype_l.to_numpy, dtype_r.to_numpy))
         return libcudf_join_type
 
     def libcudf_to_output_casting_rules(self, lcol, rcol, how):
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 57951879b7e..655bd6c28af 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -190,7 +190,7 @@ def test_series_init_none():
     sr1 = Series()
     got = sr1.to_string()
     print(got)
-    expect = "Series([], dtype: float64)"
+    expect = "Series([], dtype: Float64)"
     # values should match despite whitespace difference
     assert got.split() == expect.split()
 
@@ -198,7 +198,7 @@ def test_series_init_none():
     sr2 = Series(None)
     got = sr2.to_string()
     print(got)
-    expect = "Series([], dtype: float64)"
+    expect = "Series([], dtype: Float64)"
     # values should match despite whitespace difference
     assert got.split() == expect.split()
 
@@ -449,9 +449,9 @@ def test_dataframe_astype(nelem):
     df = DataFrame()
     data = np.asarray(range(nelem), dtype=np.int32)
     df["a"] = data
-    assert df["a"].dtype is np.dtype(np.int32)
+    assert df["a"].dtype == gd.Int32Dtype()
     df["b"] = df["a"].astype(np.float32)
-    assert df["b"].dtype is np.dtype(np.float32)
+    assert df["b"].dtype == gd.Float32Dtype()
     np.testing.assert_equal(df["a"].to_array(), df["b"].to_array())
 
 
@@ -460,9 +460,9 @@ def test_index_astype(nelem):
     df = DataFrame()
     data = np.asarray(range(nelem), dtype=np.int32)
     df["a"] = data
-    assert df.index.dtype is np.dtype(np.int64)
+    assert df.index.dtype == gd.Int64Dtype()
     df.index = df.index.astype(np.float32)
-    assert df.index.dtype is np.dtype(np.float32)
+    assert df.index.dtype == gd.Float32Dtype()
     df["a"] = df["a"].astype(np.float32)
     np.testing.assert_equal(df.index.to_array(), df["a"].to_array())
     df["b"] = df["a"]
@@ -1545,9 +1545,7 @@ def gdf(pdf):
 @pytest.mark.parametrize("skipna", [True, False, None])
 def test_dataframe_reductions(data, func, skipna):
     pdf = pd.DataFrame(data=data)
-    print(func(pdf, skipna=skipna))
     gdf = DataFrame.from_pandas(pdf)
-    print(func(gdf, skipna=skipna))
     assert_eq(func(pdf, skipna=skipna), func(gdf, skipna=skipna))
 
 
diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py
index 5cf8fb325d7..536ab79ddb0 100644
--- a/python/cudf/cudf/tests/test_joining.py
+++ b/python/cudf/cudf/tests/test_joining.py
@@ -802,7 +802,7 @@ def test_join_empty_table_dtype():
     gright = DataFrame.from_pandas(right)
     pd_merge = left.merge(right, how="left", left_on=["a"], right_on=["b"])
     gd_merge = gleft.merge(gright, how="left", left_on=["a"], right_on=["b"])
-    assert_eq(pd_merge["a"].dtype, gd_merge["a"].dtype)
+    assert gd_merge['a'].dtype == pd_merge['a'].dtype
 
 
 @pytest.mark.parametrize("how", ["outer", "inner", "left", "right"])
@@ -1108,11 +1108,14 @@ def test_typecast_on_join_overflow_unsafe(dtypes):
     lhs = cudf.DataFrame({"a": [1, 2, 3, 4, 5]}, dtype=dtype_l)
     rhs = cudf.DataFrame({"a": [1, 2, 3, 4, dtype_l_max + 1]}, dtype=dtype_r)
 
+    l_typ_warn = cudf.dtype(dtype_l).name
+    r_typ_warn = cudf.dtype(dtype_r).name
+
     with pytest.warns(
         UserWarning,
         match=(
             f"can't safely cast column"
-            f" from right with type {dtype_r} to {dtype_l}"
+            f" from right with type {r_typ_warn} to {l_typ_warn}"
         ),
     ):
         merged = lhs.merge(rhs, on="a", how="left")  # noqa: F841

From d47de0361299560e490c3c5a72f9507cacd9942f Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 27 Aug 2020 12:49:02 -0700
Subject: [PATCH 30/80] fix categorical tests

---
 python/cudf/cudf/core/column/column.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index fc836f67d45..599046a6b26 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1607,7 +1607,6 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
         mask = bools_to_mask(as_column(mask).unary_operator("not"))
 
         data = data.set_mask(mask)
-
     else:
         try:
             data = as_column(
@@ -1631,9 +1630,10 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
 
             except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                 if is_categorical_dtype(dtype):
-                    sr = pd.Series(arbitrary, dtype="category")
-                    dtype = cudf.CategoricalDtype.from_pandas(sr.dtype)
-                    data = as_column(sr, nan_as_null=nan_as_null, dtype=dtype)
+                    if isinstance(dtype, pd.CategoricalDtype) or (isinstance(dtype, str) and dtype == "category"):  # compare by value; `is` on a str literal only tests identity
+                        data = as_column(pd.Series(arbitrary, dtype=dtype), nan_as_null=nan_as_null)
+                    else:
+                        data = as_column(arbitrary, nan_as_null=nan_as_null).astype(dtype)
                 elif isinstance(cudf.dtype(dtype), cudf.StringDtype):
                     sr = pd.Series(arbitrary, dtype="str")
                     data = as_column(sr, nan_as_null=nan_as_null)

From fe180a3bfbfab73819b245f7a2e2ec37d9b07554 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 27 Aug 2020 15:19:06 -0700
Subject: [PATCH 31/80] more bugfixes

---
 python/cudf/cudf/_lib/copying.pyx        |  2 +-
 python/cudf/cudf/api/types.py            |  4 ++++
 python/cudf/cudf/core/column/column.py   |  4 ++--
 python/cudf/cudf/core/dataframe.py       |  8 ++++----
 python/cudf/cudf/core/frame.py           |  2 --
 python/cudf/cudf/core/indexing.py        |  4 ++--
 python/cudf/cudf/tests/test_numerical.py | 20 ++++++++++----------
 7 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx
index 21813c38253..357b019c0f3 100644
--- a/python/cudf/cudf/_lib/copying.pyx
+++ b/python/cudf/cudf/_lib/copying.pyx
@@ -203,7 +203,7 @@ def _scatter_scalar(scalars, Column scatter_map,
     cdef bool c_bounds_check = bounds_check
     cdef Scalar slr
     for val, col in zip(scalars, target_table._columns):
-        slr = as_scalar(val, col.dtype)
+        slr = as_scalar(val, col.dtype.to_numpy)
         source_scalars.push_back(move(slr.c_value))
     cdef column_view scatter_map_view = scatter_map.view()
     cdef table_view target_table_view = target_table.data_view()
diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py
index ea6e503782c..e00023b492d 100644
--- a/python/cudf/cudf/api/types.py
+++ b/python/cudf/cudf/api/types.py
@@ -3,6 +3,10 @@
 import numpy as np
 from pandas.core.dtypes.dtypes import CategoricalDtype, CategoricalDtypeType
 
+def is_bool_dtype(obj):
+    # True for a cudf.BooleanDtype instance; anything else is deferred to pandas. TODO: the original note questions whether the pandas check should be relied on here at all - revisit.
+    return isinstance(obj, cudf.BooleanDtype) or pd.api.types.is_bool_dtype(obj)
+
 def is_datetime64_dtype(obj):
     return isinstance(obj, cudf.Datetime) or pd.api.types.is_datetime64_dtype(obj)
 
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 599046a6b26..3ad9ebb1551 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -35,7 +35,7 @@
     np_to_pa_dtype,
 )
 from cudf.utils.utils import mask_dtype
-from cudf.api.types import is_categorical_dtype, is_list_dtype, is_numerical_dtype, is_string_dtype
+from cudf.api.types import is_categorical_dtype, is_list_dtype, is_numerical_dtype, is_string_dtype, is_bool_dtype
 
 
 class ColumnBase(Column, Serializable):
@@ -552,7 +552,7 @@ def __setitem__(self, key, value):
                 nelem = abs(key_stop - key_start)
         else:
             key = as_column(key)
-            if pd.api.types.is_bool_dtype(key.dtype):
+            if is_bool_dtype(key.dtype):
                 if not len(key) == len(self):
                     raise ValueError(
                         "Boolean mask must be of same length as column"
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 2717d64c0d4..84088a54fe7 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -4921,8 +4921,8 @@ def to_records(self, index=True):
         -------
         numpy recarray
         """
-        members = [("index", self.index.dtype)] if index else []
-        members += [(col, self[col].dtype) for col in self._data.names]
+        members = [("index", self.index.dtype.to_numpy)] if index else []
+        members += [(col, self[col].dtype.to_numpy) for col in self._data.names]
         dtype = np.dtype(members)
         ret = np.recarray(len(self), dtype=dtype)
         if index:
@@ -6049,7 +6049,7 @@ def skew(
             msg = "Skew only supports int, float, and bool dtypes."
             raise NotImplementedError(msg)
 
-        self = self.select_dtypes(include=[np.number, np.bool])
+        self = self.select_dtypes(include=[cudf.Number(), cudf.BooleanDtype()])
         return self._apply_support_method(
             "skew",
             axis=axis,
@@ -6332,7 +6332,7 @@ def select_dtypes(self, include=None, exclude=None):
                 # category handling
                 if is_categorical_dtype(i_dtype):
                     include_subtypes.add(i_dtype)
-                elif issubclass(dtype, i_dtype):
+                elif isinstance(dtype, i_dtype.__class__):
                     include_subtypes.add(dtype)
     
         # exclude all subtypes
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index f9f36ed1798..640ec747f72 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -291,8 +291,6 @@ def find_common_dtypes_and_categories(non_null_columns, dtypes):
                 elif not all(
                     c.dtype == dtypes[idx] for c in cols
                 ):
-                    import pdb
-                    pdb.set_trace()
                     raise ValueError("All columns must be the same type")
             return categories
 
diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py
index 5f6d4a69bd5..dcc9ddf9315 100755
--- a/python/cudf/cudf/core/indexing.py
+++ b/python/cudf/cudf/core/indexing.py
@@ -452,8 +452,8 @@ def _get_column_selection(self, arg):
 
 def _normalize_dtypes(df):
     if len(df.columns) > 0:
-        dtypes = df.dtypes.values.tolist()
-        normalized_dtype = np.result_type(*dtypes)
+        dtypes = [d.to_numpy for d in df.dtypes.values.tolist()]
+        normalized_dtype = cudf.dtype(np.result_type(*dtypes))
         for name, col in df._data.items():
             df[name] = col.astype(normalized_dtype)
     return df
diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py
index c6131fbcd14..48c6522a378 100644
--- a/python/cudf/cudf/tests/test_numerical.py
+++ b/python/cudf/cudf/tests/test_numerical.py
@@ -1,19 +1,19 @@
 import numpy as np
 import pandas as pd
 import pytest
-
+import cudf
 from cudf import Series
 from cudf.tests.utils import assert_eq
 
 
 def test_can_cast_safely_same_kind():
     data = Series([1, 2, 3], dtype="int32")._column
-    to_dtype = np.dtype("int64")
+    to_dtype = cudf.dtype("int64")
 
     assert data.can_cast_safely(to_dtype)
 
     data = Series([1, 2, 3], dtype="int64")._column
-    to_dtype = np.dtype("int32")
+    to_dtype = cudf.dtype("int32")
 
     assert data.can_cast_safely(to_dtype)
 
@@ -21,12 +21,12 @@ def test_can_cast_safely_same_kind():
     assert not data.can_cast_safely(to_dtype)
 
     data = Series([1, 2, 3], dtype="uint32")._column
-    to_dtype = np.dtype("uint64")
+    to_dtype = cudf.dtype("uint64")
 
     assert data.can_cast_safely(to_dtype)
 
     data = Series([1, 2, 3], dtype="uint64")._column
-    to_dtype = np.dtype("uint32")
+    to_dtype = cudf.dtype("uint32")
 
     assert data.can_cast_safely(to_dtype)
 
@@ -36,7 +36,7 @@ def test_can_cast_safely_same_kind():
 
 def test_can_cast_safely_mixed_kind():
     data = Series([1, 2, 3], dtype="int32")._column
-    to_dtype = np.dtype("float32")
+    to_dtype = cudf.dtype("float32")
     assert data.can_cast_safely(to_dtype)
 
     # too big to fit into f32 exactly
@@ -44,18 +44,18 @@ def test_can_cast_safely_mixed_kind():
     assert not data.can_cast_safely(to_dtype)
 
     data = Series([1, 2, 3], dtype="uint32")._column
-    to_dtype = np.dtype("float32")
+    to_dtype = cudf.dtype("float32")
     assert data.can_cast_safely(to_dtype)
 
     # too big to fit into f32 exactly
     data = Series([1, 2, 2 ** 24 + 1], dtype="uint32")._column
     assert not data.can_cast_safely(to_dtype)
 
-    to_dtype = np.dtype("float64")
+    to_dtype = cudf.dtype("float64")
     assert data.can_cast_safely(to_dtype)
 
     data = Series([1.0, 2.0, 3.0], dtype="float32")._column
-    to_dtype = np.dtype("int32")
+    to_dtype = cudf.dtype("int32")
     assert data.can_cast_safely(to_dtype)
 
     # not integer float
@@ -97,7 +97,7 @@ def test_to_pandas_nullable_bool():
 
 def test_can_cast_safely_has_nulls():
     data = Series([1, 2, 3, None], dtype="float32")._column
-    to_dtype = np.dtype("int64")
+    to_dtype = cudf.dtype("int64")
 
     assert data.can_cast_safely(to_dtype)
 

From cad48d0497fd9e5ce43e9a6521153bd6a1e84cea Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 28 Aug 2020 16:01:19 -0700
Subject: [PATCH 32/80] more progress

---
 python/cudf/cudf/_lib/reduce.pyx            |  3 ++-
 python/cudf/cudf/api/types.py               |  6 ++++++
 python/cudf/cudf/core/column/categorical.py |  5 +----
 python/cudf/cudf/core/column/column.py      | 10 ++++-----
 python/cudf/cudf/core/column/datetime.py    |  8 +++----
 python/cudf/cudf/core/column/numerical.py   |  9 +++++++-
 python/cudf/cudf/core/dataframe.py          | 15 +++++++-------
 python/cudf/cudf/core/dtypes.py             | 23 ++++++++++++++++++++-
 python/cudf/cudf/core/frame.py              |  4 +---
 python/cudf/cudf/core/join/join.py          |  4 +---
 python/cudf/cudf/core/series.py             |  4 ++--
 python/cudf/cudf/tests/test_dataframe.py    |  2 ++
 python/cudf/cudf/tests/test_replace.py      |  2 +-
 python/cudf/cudf/tests/test_repr.py         | 10 +++++----
 python/cudf/cudf/tests/test_string.py       |  1 +
 python/cudf/cudf/utils/dtypes.py            |  5 +++--
 16 files changed, 72 insertions(+), 39 deletions(-)

diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx
index 69592894cae..ac8065d2d6f 100644
--- a/python/cudf/cudf/_lib/reduce.pyx
+++ b/python/cudf/cudf/_lib/reduce.pyx
@@ -14,6 +14,7 @@ from cudf._lib.aggregation cimport make_aggregation, aggregation
 from libcpp.memory cimport unique_ptr
 import numpy as np
 from cudf.core.dtypes import dtype as cudf_dtype
+from cudf.api.types import find_common_type
 
 
 def reduce(reduction_op, Column incol, dtype=None, **kwargs):
@@ -33,7 +34,7 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs):
 
     col_dtype = incol.dtype
     if reduction_op in ['sum', 'sum_of_squares', 'product']:
-        col_dtype = np.find_common_type([col_dtype.to_numpy], [np.uint64])
+        col_dtype = find_common_type([col_dtype], [np.uint64])
     col_dtype = cudf_dtype(col_dtype) if dtype is None else cudf_dtype(dtype)
 
     cdef column_view c_incol_view = incol.view()
diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py
index e00023b492d..13494a4bdc3 100644
--- a/python/cudf/cudf/api/types.py
+++ b/python/cudf/cudf/api/types.py
@@ -89,3 +89,9 @@ def is_list_dtype(obj):
         or (isinstance(obj, str) and obj == cudf.core.dtypes.ListDtype.name)
         or (hasattr(obj, "dtype") and is_list_dtype(obj.dtype))
     )
+
+def find_common_type(array_types=(), scalar_types=()):
+    """Run ``np.find_common_type`` on cudf and/or numpy dtypes (cudf ones via ``.to_numpy``) and pass the result to ``cudf.dtype``."""
+    array_types = [d.to_numpy if isinstance(d, cudf.Generic) else d for d in array_types]
+    scalar_types = [d.to_numpy if isinstance(d, cudf.Generic) else d for d in scalar_types]
+    return cudf.dtype(np.find_common_type(array_types, scalar_types))
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 2f1e677b898..ae5c5d46562 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -307,10 +307,7 @@ def add_categories(self, new_categories, **kwargs):
                 f"type-cast new_categories to the same type as "
                 f"existing categories."
             )
-        common_dtype = np.find_common_type(
-            [old_categories.dtype.to_numpy, new_categories.dtype.to_numpy], []
-        )
-        common_dtype = cudf.dtype(common_dtype)
+        common_dtype = cudf.api.types.find_common_type([old_categories.dtype, new_categories.dtype], [])
 
         new_categories = new_categories.astype(common_dtype, copy=False)
         old_categories = old_categories.astype(common_dtype, copy=False)
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 3ad9ebb1551..94f05935537 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -205,12 +205,12 @@ def _concat(cls, objs, dtype=None):
             )
             == 0
         ):
-            np_col_dtypes = [o.dtype.to_numpy for o in not_null_cols]
+            cudf_col_dtypes = [o.dtype for o in not_null_cols]
             # Use NumPy to find a common dtype
-            np_common_dtype = np.find_common_type(np_col_dtypes, [])
+            cudf_common_dtype = cudf.api.types.find_common_type(cudf_col_dtypes, [])
             # Cast all columns to the common dtype
             for i in range(len(objs)):
-                objs[i] = objs[i].astype(cudf.dtype(np_common_dtype))
+                objs[i] = objs[i].astype(cudf_common_dtype)
 
         # Find the first non-null column:
         head = objs[0]
@@ -1010,7 +1010,7 @@ def serialize(self):
         header = {}
         frames = []
         header["type-serialized"] = pickle.dumps(type(self))
-        header["dtype"] = self.dtype.str
+        header["dtype"] = str(self.dtype)
 
         data_header, data_frames = self.data.serialize()
         header["data"] = data_header
@@ -1164,7 +1164,7 @@ def build_column(
             offset=offset,
             null_count=null_count,
         )
-    elif dtype.type is np.timedelta64:
+    elif isinstance(dtype, cudf.Timedelta):
         return cudf.core.column.TimeDeltaColumn(
             data=data,
             dtype=dtype,
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index c205b841af5..ba652c3d3d2 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -27,10 +27,10 @@
 }
 
 _dtype_to_format_conversion = {
-    "datetime64[ns]": "%Y-%m-%d %H:%M:%S.%9f",
-    "datetime64[us]": "%Y-%m-%d %H:%M:%S.%6f",
-    "datetime64[ms]": "%Y-%m-%d %H:%M:%S.%3f",
-    "datetime64[s]": "%Y-%m-%d %H:%M:%S",
+    "Datetime64NS": "%Y-%m-%d %H:%M:%S.%9f",
+    "Datetime64US": "%Y-%m-%d %H:%M:%S.%6f",
+    "Datetime64MS": "%Y-%m-%d %H:%M:%S.%3f",
+    "Datetime64S": "%Y-%m-%d %H:%M:%S",
 }
 
 
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 92dab1a15de..d467d0d0ddb 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -30,6 +30,13 @@ def __init__(
             The dtype associated with the data Buffer
         mask : Buffer, optional
         """
+        try:
+            cudf.dtype(dtype)
+            dtype.itemsize
+
+        except:
+            import pdb
+            pdb.set_trace()
         dtype = cudf.dtype(dtype)
         if data.size % dtype.itemsize:
             raise ValueError("Buffer size must be divisible by element size")
@@ -512,7 +519,7 @@ def _normalize_find_and_replace_input(input_column_dtype, col_to_normalize):
     if (
         col_to_normalize_dtype.kind == "f"
         and input_column_dtype.kind in {"i", "u"}
-    ) or (col_to_normalize_dtype.to_numpy.num > input_column_dtype.to_numpy.num):
+    ) or (col_to_normalize_dtype.num > input_column_dtype.num):
         raise TypeError(
             f"Potentially unsafe cast for non-equivalent "
             f"{col_to_normalize_dtype.name} "
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 84088a54fe7..7f998784d34 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -3183,7 +3183,7 @@ def as_gpu_matrix(self, columns=None, order="F"):
             for c in cols
         ):
             raise TypeError("non-numeric data not yet supported")
-        dtype = np.find_common_type([c.dtype.to_numpy for c in cols], [])
+        dtype = cudf.api.types.find_common_type([c.dtype for c in cols], [])
         for k, c in self._data.items():
             if c.has_nulls:
                 errmsg = (
@@ -3191,7 +3191,7 @@ def as_gpu_matrix(self, columns=None, order="F"):
                     "hint: use .fillna() to replace null values"
                 )
                 raise ValueError(errmsg.format(k))
-        cupy_dtype = dtype
+        cupy_dtype = dtype.to_numpy
         if np.issubdtype(cupy_dtype, np.datetime64):
             cupy_dtype = np.dtype("int64")
 
@@ -6313,7 +6313,7 @@ def select_dtypes(self, include=None, exclude=None):
             )
 
         include, exclude = map(
-            lambda x: frozenset(map(cudf.dtype, x)),
+            lambda x: frozenset(map(cudf_dtype_from_pydata_dtype, x)),
             selection,
         )
 
@@ -6332,8 +6332,8 @@ def select_dtypes(self, include=None, exclude=None):
                 # category handling
                 if is_categorical_dtype(i_dtype):
                     include_subtypes.add(i_dtype)
-                elif isinstance(dtype, i_dtype.__class__):
-                    include_subtypes.add(dtype)
+                elif isinstance(dtype, i_dtype):
+                    include_subtypes.add(dtype.__class__)
     
         # exclude all subtypes
         exclude_subtypes = set()
@@ -6346,9 +6346,8 @@ def select_dtypes(self, include=None, exclude=None):
                     exclude_subtypes.add(dtype)
 
         include_all = set(
-            [cudf.dtype(d) for d in self.dtypes]
+            [cudf_dtype_from_pydata_dtype(d) for d in self.dtypes]
         )
-
         if include:
             inclusion = include_all & include_subtypes
         elif exclude:
@@ -6359,7 +6358,7 @@ def select_dtypes(self, include=None, exclude=None):
         inclusion = inclusion - exclude_subtypes
 
         for k, col in self._data.items():
-            infered_type = cudf.dtype(col.dtype)
+            infered_type = cudf.dtype(col.dtype).__class__
             if infered_type in inclusion:
                 df.insert(len(df._data), k, col)
 
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 6e5e37351e8..682df0d7e9a 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -77,6 +77,13 @@ def __eq__(self, other):
             return True
         return False
 
+    def __str__(self):
+        return str(self.to_pandas)
+
+    @property
+    def num(self):
+        return self.to_numpy.num
+
     @property
     def to_numpy(self):
         return pa_to_np_dtypes[self.pa_type]
@@ -287,7 +294,7 @@ def make_dtype_from_string(obj):
             return UInt32Dtype()
         elif obj in {"uint16", "UInt16"}:
             return UInt16Dtype()
-        elif obj in {"uint8", "Uint8"}:
+        elif obj in {"uint8", "UInt8"}:
             return UInt8Dtype()
     elif "float" in obj or "Float" in obj:
         if obj in {"float64", "Float64"}:
@@ -315,11 +322,14 @@ def make_dtype_from_numpy(obj):
 
 
 def dtype(obj):
+
     if obj is None:
         return None
     if isinstance(obj, pd.CategoricalDtype):
         return cudf.CategoricalDtype.from_pandas(obj)
     if isinstance(obj, CategoricalDtype):
+        if obj is 'category':
+            return cudf.CategoricalDtype()
         return obj
     elif isinstance(obj, Generic):
         return obj
@@ -337,6 +347,14 @@ def dtype(obj):
         return pd_to_cudf_dtypes[obj]
     elif isinstance(obj, pd.core.arrays.numpy_.PandasDtype):
         return make_dtype_from_string(obj.name)
+    elif obj is np.number:
+        return cudf.Number
+    elif obj is np.datetime64:
+        return cudf.Datetime
+    elif obj is np.timedelta64:
+        return cudf.Timedelta
+
+
     else:
         try:
             if issubclass(obj, np.generic):
@@ -360,6 +378,9 @@ def __init__(self, categories=None, ordered=None):
     def __repr__(self):
         return self.to_pandas().__repr__()
 
+    def __hash__(self):
+        return hash(self.__repr__())  # hashes the repr string, i.e. self.to_pandas().__repr__()
+
     @property
     def categories(self):
         if self._categories is None:
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 640ec747f72..4509969de03 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -269,9 +269,7 @@ def find_common_dtypes_and_categories(non_null_columns, dtypes):
                 dtypes[idx] = cols[0].dtype
                 # If all the non-null dtypes are int/float, find a common dtype
                 if all(is_numerical_dtype(col.dtype) for col in cols):
-                    dtypes[idx] = cudf.dtype(np.find_common_type(
-                        [col.dtype.to_numpy for col in cols], []
-                    ))
+                    dtypes[idx] = cudf.api.types.find_common_type([col.dtype for col in cols], [])
                 # If all categorical dtypes, combine the categories
                 elif all(
                     isinstance(col, cudf.core.column.CategoricalColumn)
diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py
index ebc52490417..231a114aff7 100644
--- a/python/cudf/cudf/core/join/join.py
+++ b/python/cudf/cudf/core/join/join.py
@@ -400,9 +400,7 @@ def input_to_libcudf_casting_rules(self, lcol, rcol, how):
                     # both ints or both floats
                     libcudf_join_type = cudf.dtype(max(dtype_l.to_numpy, dtype_r.to_numpy))
                 else:
-                    libcudf_join_type = cudf.dtype(np.find_common_type(
-                        [], [dtype_l.to_numpy, dtype_r.to_numpy]
-                    ))
+                    libcudf_join_type = cudf.api.types.find_common_type([], [dtype_l, dtype_r])
             elif isinstance(dtype_l, cudf.Datetime) and isinstance(
                 dtype_r, cudf.Datetime
             ):
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 513a8336f29..c6227ce4105 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -1027,10 +1027,10 @@ def __repr__(self):
             else:
                 lines = lines[:-1]
                 lines[-1] = lines[-1] + "\n"
-            lines[-1] = lines[-1] + "dtype: %s" % self.dtype
+            lines[-1] = lines[-1] + "dtype: %s" % self.dtype.name
         else:
             lines = output.split(",")
-            return lines[0] + ", dtype: %s)" % self.dtype
+            return lines[0] + ", dtype: %s)" % self.dtype.name
         if isinstance(preprocess._column, cudf.core.column.CategoricalColumn):
             lines.append(category_memory)
         return "\n".join(lines)
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 655bd6c28af..000827cc9c8 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -2523,6 +2523,8 @@ def test_select_dtype():
         pdf.select_dtypes(include=["float64"]),
         gdf.select_dtypes(include=["float64"]),
     )
+    import pdb
+    pdb.set_trace()
     assert_eq(
         pdf.select_dtypes(include=["object", "int", "category"]),
         gdf.select_dtypes(include=["object", "int", "category"]),
diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py
index dc8331965fe..823538a0200 100644
--- a/python/cudf/cudf/tests/test_replace.py
+++ b/python/cudf/cudf/tests/test_replace.py
@@ -455,7 +455,7 @@ def test_series_fillna_invalid_dtype(data_dtype):
         gdf.fillna(fill_value)
     raises.match(
         "Cannot safely cast non-equivalent {} to {}".format(
-            type(fill_value).__name__, gdf.dtype.type.__name__
+            type(fill_value).__name__, gdf.dtype.name
         )
     )
 
diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py
index 7ab863c3fff..a214588d367 100644
--- a/python/cudf/cudf/tests/test_repr.py
+++ b/python/cudf/cudf/tests/test_repr.py
@@ -46,10 +46,12 @@ def test_null_series(nrows, dtype):
         psrepr = psrepr.replace(
             str(sr._column.default_na_value()) + "\n", "<NA>\n"
         )
-    if "UInt" in psrepr:
-        psrepr = psrepr.replace("UInt", "uint")
-    elif "Int" in psrepr:
-        psrepr = psrepr.replace("Int", "int")
+    if "uint" in psrepr:
+        psrepr = psrepr.replace("uint", "UInt")
+    elif "int" in psrepr:
+        psrepr = psrepr.replace("int", "Int")
+    elif 'float' in psrepr:
+        psrepr = psrepr.replace("float", "Float")
     assert psrepr.split() == sr.__repr__().split()
 
 
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index 22e873e5f25..dd0c7c71431 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -156,6 +156,7 @@ def test_string_repr(ps_gs, item):
 
     if got_out is not None and len(got_out) > 1:
         expect = expect.replace("None", "<NA>")
+    expect = expect.replace('object', 'String')
 
     assert expect == got
 
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 29e767fe179..7efc0cd049f 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -137,13 +137,14 @@ def cudf_dtype_from_pydata_dtype(dtype):
     """ Given a numpy or pandas dtype, converts it into the equivalent cuDF
         Python dtype.
     """
-
+    if isinstance(dtype, cudf.Generic):
+        return dtype.__class__
     if is_categorical_dtype(dtype):
         return cudf.core.dtypes.CategoricalDtype
     elif np.issubdtype(dtype, np.datetime64):
         dtype = np.datetime64
 
-    return infer_dtype_from_object(dtype)
+    return cudf.dtype(infer_dtype_from_object(dtype)).__class__
 
 
 def is_scalar(val):

From 6a1785c1e8e7218863cb71831b32fe0e49af04ee Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Sun, 30 Aug 2020 09:00:48 -0700
Subject: [PATCH 33/80] all repr tests pass

---
 python/cudf/cudf/core/column/timedelta.py |   4 +-
 python/cudf/cudf/core/dtypes.py           |   6 +-
 python/cudf/cudf/core/index.py            |   5 +-
 python/cudf/cudf/core/series.py           |  54 +++++++++++
 python/cudf/cudf/tests/test_repr.py       | 105 +++++++++++-----------
 5 files changed, 119 insertions(+), 55 deletions(-)

diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index 2ab0fadae82..b20d943d30c 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -291,7 +291,7 @@ def default_na_value(self):
 
     @property
     def time_unit(self):
-        return self._time_unit
+        return self.dtype._time_unit
 
     def fillna(self, fill_value):
         col = self
@@ -346,7 +346,7 @@ def as_string_column(self, dtype, **kwargs):
             return column.column_empty(0, dtype="object", masked=False)
 
     def as_timedelta_column(self, dtype, **kwargs):
-        dtype = np.dtype(dtype)
+        dtype = cudf.dtype(dtype)
         if dtype == self.dtype:
             return self
         return libcudf.unary.cast(self, dtype=dtype)
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 682df0d7e9a..5ac2fae2c03 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -78,7 +78,7 @@ def __eq__(self, other):
         return False
 
     def __str__(self):
-        return str(self.to_pandas)
+        return self.name
 
     @property
     def num(self):
@@ -243,21 +243,25 @@ class Timedelta64NSDtype(Timedelta):
     def __init__(self):
         self.pa_type = pa.duration('ns')
         self._name = "Timedelta64NS"
+        self._time_unit = 'ns'
 
 class Timedelta64USDtype(Timedelta):
     def __init__(self):
         self.pa_type = pa.duration('us')
         self._name = "Timedelta64US"
+        self._time_unit = 'us'
 
 class Timedelta64MSDtype(Timedelta):
     def __init__(self):
         self.pa_type = pa.duration('ms')
         self._name = "Timedelta64MS"
+        self._time_unit = 'ms'
 
 class Timedelta64SDtype(Timedelta):
     def __init__(self):
         self.pa_type = pa.duration('s')
         self._name = "Timedelta64S"
+        self._time_unit = 's'
 
 class StringDtype(Flexible):
     is_string = True
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 217079fbf03..9ee250e11e4 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -2474,7 +2474,7 @@ def take(self, indices):
     def __repr__(self):
         return (
             f"{self.__class__.__name__}({self._values.to_array()},"
-            f" dtype='object'"
+            f" dtype='String'"
             + (
                 f", name={pd.io.formats.printing.default_pprint(self.name)}"
                 if self.name is not None
@@ -2524,6 +2524,7 @@ def as_index(arbitrary, **kwargs):
         - DatetimeIndex for Datetime input.
         - GenericIndex for all other inputs.
     """
+
     kwargs = _setdefault_name(arbitrary, **kwargs)
     if isinstance(arbitrary, cudf.MultiIndex):
         return arbitrary
@@ -2533,7 +2534,7 @@ def as_index(arbitrary, **kwargs):
         return idx
     elif isinstance(arbitrary, NumericalColumn):
         try:
-            return _dtype_to_index[arbitrary.dtype.type](arbitrary, **kwargs)
+            return _dtype_to_index[arbitrary.dtype](arbitrary, **kwargs)
         except KeyError:
             return GenericIndex(arbitrary, **kwargs)
     elif isinstance(arbitrary, StringColumn):
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index c6227ce4105..aacabeed0b2 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -1008,6 +1008,7 @@ def __repr__(self):
         else:
             output = preprocess.to_pandas().__repr__()
 
+        output = _fix_nullable_dtype_repr(output)
         lines = output.split("\n")
 
         if isinstance(preprocess._column, cudf.core.column.CategoricalColumn):
@@ -5008,3 +5009,56 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False):
         result_col[equal_nulls] = True
 
     return Series(result_col, index=index)
+
+def _fix_nullable_dtype_repr(string):
+    """Return ``string`` with numpy dtype names swapped for cudf-style names."""
+    to_replace = [
+        'uint8',
+        'uint16',
+        'uint32', 
+        'uint64', 
+        'int8', 
+        'int16', 
+        'int32', 
+        'int64', 
+        'float32', 
+        'float64', 
+        'bool', 
+        'object', 
+        'datetime64[ns]', 
+        'datetime64[us]', 
+        'datetime64[ms]', 
+        'datetime64[s]'
+        'timedelta64[ns]',
+        'timedelta64[us]',
+        'timedelta64[ms]',
+        'timedelta64[s]'
+    ]
+    # Replacement names, matched to ``to_replace`` by position in the zip below;
+    # both lists must therefore keep the same order and length.
+    replacements = [
+        'UInt8',
+        'UInt16',
+        'UInt32',
+        'UInt64',
+        'Int8',
+        'Int16',
+        'Int32',
+        'Int64',
+        'Float32',
+        'Float64',
+        'Boolean',
+        'String',
+        'Datetime64NS',
+        'Datetime64US',
+        'Datetime64MS',
+        'Datetime64S',
+        'Timedelta64NS',
+        'Timedelta64US',
+        'Timedelta64MS',
+        'Timedelta64S'
+    ]
+
+    for tr, rp in zip(to_replace, replacements):
+        string = string.replace(tr, rp)
+    return string
diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py
index a214588d367..f7efd680374 100644
--- a/python/cudf/cudf/tests/test_repr.py
+++ b/python/cudf/cudf/tests/test_repr.py
@@ -6,6 +6,7 @@
 import pandas as pd
 import pytest
 from hypothesis import given, settings, strategies as st
+from cudf.core.series import _fix_nullable_dtype_repr
 
 import cudf
 from cudf.tests import utils
@@ -46,12 +47,11 @@ def test_null_series(nrows, dtype):
         psrepr = psrepr.replace(
             str(sr._column.default_na_value()) + "\n", "<NA>\n"
         )
-    if "uint" in psrepr:
-        psrepr = psrepr.replace("uint", "UInt")
-    elif "int" in psrepr:
-        psrepr = psrepr.replace("int", "Int")
-    elif 'float' in psrepr:
-        psrepr = psrepr.replace("float", "Float")
+    from cudf.core.series import _fix_nullable_dtype_repr
+    # todo: this is kind of self-fulfilling since this is what is
+    # called inside _repr_ as well
+    psrepr = _fix_nullable_dtype_repr(psrepr)
+
     assert psrepr.split() == sr.__repr__().split()
 
 
@@ -92,7 +92,8 @@ def test_full_series(nrows, dtype):
     ps = pd.Series(np.random.randint(0, 100, size)).astype(dtype)
     sr = cudf.from_pandas(ps)
     pd.options.display.max_rows = int(nrows)
-    assert ps.__repr__() == sr.__repr__()
+    psrepr = _fix_nullable_dtype_repr(ps.__repr__())
+    assert psrepr == sr.__repr__()
 
 
 @pytest.mark.parametrize("dtype", repr_categories)
@@ -154,9 +155,8 @@ def test_integer_dataframe(x):
 def test_integer_series(x):
     sr = cudf.Series(x)
     ps = pd.Series(x)
-    print(sr)
-    print(ps)
-    assert sr.__repr__() == ps.__repr__()
+    psrepr = _fix_nullable_dtype_repr(ps.__repr__())
+    assert sr.__repr__() == psrepr
 
 
 @given(st.lists(st.floats()))
@@ -172,7 +172,8 @@ def test_float_dataframe(x):
 def test_float_series(x):
     sr = cudf.Series(x, nan_as_null=False)
     ps = pd.Series(x)
-    assert sr.__repr__() == ps.__repr__()
+    psrepr = _fix_nullable_dtype_repr(ps.__repr__())
+    assert sr.__repr__() == psrepr
 
 
 @pytest.fixture
@@ -201,7 +202,11 @@ def test_mixed_dataframe(mixed_pdf, mixed_gdf):
 
 def test_mixed_series(mixed_pdf, mixed_gdf):
     for col in mixed_gdf.columns:
-        assert mixed_gdf[col].__repr__() == mixed_pdf[col].__repr__()
+        try:
+            assert mixed_gdf[col].__repr__() == _fix_nullable_dtype_repr(mixed_pdf[col].__repr__())
+        except:
+            import pdb
+            pdb.set_trace()
 
 
 def test_MI():
@@ -253,8 +258,8 @@ def test_generic_index(length, dtype):
         index=np.random.randint(0, high=100, size=length).astype(dtype),
     )
     gsr = cudf.Series.from_pandas(psr)
-
-    assert psr.index.__repr__() == gsr.index.__repr__()
+    psrepr = _fix_nullable_dtype_repr(psr.index.__repr__())
+    assert psrepr == gsr.index.__repr__()
 
 
 @pytest.mark.parametrize(
@@ -316,23 +321,23 @@ def test_dataframe_sliced(gdf, slice, max_seq_items, max_rows):
     [
         (
             cudf.Index([1, 2, 3, None]),
-            "Int64Index([1, 2, 3, <NA>], dtype='int64')",
+            "Int64Index([1, 2, 3, <NA>], dtype='Int64')",
         ),
         (
             cudf.Index([None, 2.2, 3.324342, None]),
-            "Float64Index([<NA>, 2.2, 3.324342, <NA>], dtype='float64')",
+            "Float64Index([<NA>, 2.2, 3.324342, <NA>], dtype='Float64')",
         ),
         (
             cudf.Index([None, None, None], name="hello"),
-            "Float64Index([<NA>, <NA>, <NA>], dtype='float64', name='hello')",
+            "Float64Index([<NA>, <NA>, <NA>], dtype='Float64', name='hello')",
         ),
         (
             cudf.Index([None], name="hello"),
-            "Float64Index([<NA>], dtype='float64', name='hello')",
+            "Float64Index([<NA>], dtype='Float64', name='hello')",
         ),
         (
-            cudf.Index([None], dtype="int8", name="hello"),
-            "Int8Index([<NA>], dtype='int8', name='hello')",
+            cudf.Index([None], dtype="Int8", name="hello"),
+            "Int8Index([<NA>], dtype='Int8', name='hello')",
         ),
         (
             cudf.Index([None] * 50, dtype="object"),
@@ -340,20 +345,20 @@ def test_dataframe_sliced(gdf, slice, max_seq_items, max_rows):
             "None None None None None None\n None None None None None None "
             "None None None None None None None None\n None None None None "
             "None None None None None None None None None None\n None None "
-            "None None None None None None], dtype='object')",
+            "None None None None None None], dtype='String')",
         ),
         (
             cudf.Index([None] * 20, dtype="uint32"),
             "UInt32Index([<NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, "
             "<NA>,\n       <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, "
-            "<NA>,\n       <NA>, <NA>],\n      dtype='uint32')",
+            "<NA>,\n       <NA>, <NA>],\n      dtype='UInt32')",
         ),
         (
             cudf.Index(
                 [None, 111, 22, 33, None, 23, 34, 2343, None], dtype="int16"
             ),
             "Int16Index([<NA>, 111, 22, 33, <NA>, 23, 34, 2343, <NA>], "
-            "dtype='int16')",
+            "dtype='Int16')",
         ),
         (
             cudf.Index([1, 2, 3, None], dtype="category"),
@@ -370,32 +375,32 @@ def test_dataframe_sliced(gdf, slice, max_seq_items, max_rows):
             "DatetimeIndex([1970-01-01 00:00:00.000000010, "
             "1970-01-01 00:00:00.000000020,"
             "\n       1970-01-01 00:00:00.000000030, <NA>],\n      "
-            "dtype='datetime64[ns]')",
+            "dtype='Datetime64NS')",
         ),
         (
             cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[s]")),
             "DatetimeIndex([1970-01-01 00:00:10, "
             "1970-01-01 00:00:20, 1970-01-01 00:00:30,\n"
-            "       <NA>],\n      dtype='datetime64[s]')",
+            "       <NA>],\n      dtype='Datetime64S')",
         ),
         (
             cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[us]")),
             "DatetimeIndex([1970-01-01 00:00:00.000010, "
             "1970-01-01 00:00:00.000020,\n       "
             "1970-01-01 00:00:00.000030, <NA>],\n      "
-            "dtype='datetime64[us]')",
+            "dtype='Datetime64US')",
         ),
         (
             cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[ms]")),
             "DatetimeIndex([1970-01-01 00:00:00.010, "
             "1970-01-01 00:00:00.020,\n       "
             "1970-01-01 00:00:00.030, <NA>],\n      "
-            "dtype='datetime64[ms]')",
+            "dtype='Datetime64MS')",
         ),
         (
             cudf.Index(np.array([None] * 10, dtype="datetime64[ms]")),
             "DatetimeIndex([<NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, "
-            "<NA>,\n       <NA>],\n      dtype='datetime64[ms]')",
+            "<NA>,\n       <NA>],\n      dtype='Datetime64MS')",
         ),
     ],
 )
@@ -576,7 +581,7 @@ def test_series_null_index_repr(sr, pandas_special_case):
         # Whereas cudf is consistent with strings `null` values
         # to be printed as `None` everywhere.
         actual_repr = gsr.__repr__().replace("None", "<NA>")
-    assert expected_repr.split() == actual_repr.split()
+    assert _fix_nullable_dtype_repr(expected_repr).split() == actual_repr.split()
 
 
 @pytest.mark.parametrize(
@@ -614,7 +619,7 @@ def test_timedelta_series_s_us_repr(data, dtype):
     psr = sr.to_pandas()
 
     expected = (
-        psr.__repr__().replace("timedelta64[ns]", dtype).replace("NaT", "<NA>")
+        psr.__repr__().replace("timedelta64[ns]", str(sr.dtype)).replace("NaT", "<NA>")
     )
     actual = sr.__repr__()
 
@@ -628,7 +633,7 @@ def test_timedelta_series_s_us_repr(data, dtype):
             cudf.Series([], dtype="timedelta64[ns]"),
             textwrap.dedent(
                 """
-            Series([], dtype: timedelta64[ns])
+            Series([], dtype: Timedelta64NS)
             """
             ),
         ),
@@ -636,7 +641,7 @@ def test_timedelta_series_s_us_repr(data, dtype):
             cudf.Series([], dtype="timedelta64[ms]"),
             textwrap.dedent(
                 """
-            Series([], dtype: timedelta64[ms])
+            Series([], dtype: Timedelta64MS)
             """
             ),
         ),
@@ -647,7 +652,7 @@ def test_timedelta_series_s_us_repr(data, dtype):
             0    00:00:00.001000000
             1    00:00:00.000200000
             2    00:00:00.003000000
-            dtype: timedelta64[ns]
+            dtype: Timedelta64NS
             """
             ),
         ),
@@ -658,7 +663,7 @@ def test_timedelta_series_s_us_repr(data, dtype):
             0    00:16:40
             1    00:03:20
             2    00:50:00
-            dtype: timedelta64[ms]
+            dtype: Timedelta64MS
             """
             ),
         ),
@@ -669,7 +674,7 @@ def test_timedelta_series_s_us_repr(data, dtype):
             0    00:00:00.001000000
             1    00:00:00.000200000
             2                  <NA>
-            dtype: timedelta64[ns]
+            dtype: Timedelta64NS
             """
             ),
         ),
@@ -680,7 +685,7 @@ def test_timedelta_series_s_us_repr(data, dtype):
             0    00:16:40
             1    00:03:20
             2        <NA>
-            dtype: timedelta64[ms]
+            dtype: Timedelta64MS
             """
             ),
         ),
@@ -695,7 +700,7 @@ def test_timedelta_series_s_us_repr(data, dtype):
             2    <NA>
             3    <NA>
             4    <NA>
-            dtype: timedelta64[ns]
+            dtype: Timedelta64NS
             """
             ),
         ),
@@ -710,7 +715,7 @@ def test_timedelta_series_s_us_repr(data, dtype):
             2    <NA>
             3    <NA>
             4    <NA>
-            dtype: timedelta64[ms]
+            dtype: Timedelta64MS
             """
             ),
         ),
@@ -726,7 +731,7 @@ def test_timedelta_series_s_us_repr(data, dtype):
             3    00:00:00.000000343
             4    00:00:00.004353534
             5    00:00:00.000435342
-            dtype: timedelta64[ns]
+            dtype: Timedelta64NS
             """
             ),
         ),
@@ -742,7 +747,7 @@ def test_timedelta_series_s_us_repr(data, dtype):
             3    00:00:00.343
             4    01:12:33.534
             5    00:07:15.342
-            dtype: timedelta64[ms]
+            dtype: Timedelta64MS
             """
             ),
         ),
@@ -760,7 +765,7 @@ def test_timedelta_series_s_us_repr(data, dtype):
             4              00:00:00
             5    00:00:00.000000332
             6    00:00:00.000000323
-            dtype: timedelta64[ns]
+            dtype: Timedelta64NS
             """
             ),
         ),
@@ -778,7 +783,7 @@ def test_timedelta_series_s_us_repr(data, dtype):
             4        00:00:00
             5    00:00:00.332
             6    00:00:00.323
-            dtype: timedelta64[ms]
+            dtype: Timedelta64MS
             """
             ),
         ),
@@ -804,7 +809,7 @@ def test_timedelta_series_s_us_repr(data, dtype):
             4     11573 days 23:39:03.241
             5        42 days 01:35:48.734
             6         0 days 00:00:23.234
-            dtype: timedelta64[ms]
+            dtype: Timedelta64MS
             """
             ),
         ),
@@ -830,7 +835,7 @@ def test_timedelta_series_s_us_repr(data, dtype):
             4    00:16:39.992343241
             5    00:00:03.634548734
             6    00:00:00.000023234
-            dtype: timedelta64[ns]
+            dtype: Timedelta64NS
             """
             ),
         ),
@@ -857,7 +862,7 @@ def test_timedelta_series_s_us_repr(data, dtype):
             4     11573 days 23:39:03.241
             5        42 days 01:35:48.734
             6         0 days 00:00:23.234
-            Name: abc, dtype: timedelta64[ms]
+            Name: abc, dtype: Timedelta64MS
             """
             ),
         ),
@@ -885,7 +890,7 @@ def test_timedelta_series_s_us_repr(data, dtype):
             y    00:16:39.992343241
             l    00:00:03.634548734
             m    00:00:00.000023234
-            Name: hello, dtype: timedelta64[ns]
+            Name: hello, dtype: Timedelta64NS
             """
             ),
         ),
@@ -1060,14 +1065,14 @@ def test_timedelta_dataframe_repr(df, expected_repr):
         (
             cudf.Index([1000000, 200000, 3000000], dtype="timedelta64[ms]"),
             "TimedeltaIndex(['00:16:40', '00:03:20', '00:50:00'], "
-            "dtype='timedelta64[ms]')",
+            "dtype='Timedelta64MS')",
         ),
         (
             cudf.Index(
                 [None, None, None, None, None], dtype="timedelta64[us]"
             ),
             "TimedeltaIndex([<NA>, <NA>, <NA>, <NA>, <NA>], "
-            "dtype='timedelta64[us]')",
+            "dtype='Timedelta64US')",
         ),
         (
             cudf.Index(
@@ -1085,7 +1090,7 @@ def test_timedelta_dataframe_repr(df, expected_repr):
             "TimedeltaIndex([00:02:16.457654, <NA>, 00:04:05.345345, "
             "00:03:43.432411, <NA>,"
             "       01:00:34.548734, 00:00:00.023234],"
-            "      dtype='timedelta64[us]')",
+            "      dtype='Timedelta64US')",
         ),
         (
             cudf.Index(
@@ -1103,7 +1108,7 @@ def test_timedelta_dataframe_repr(df, expected_repr):
             "TimedeltaIndex([1579 days 08:54:14, <NA>, 2839 days 15:29:05,"
             "       2586 days 00:33:31, <NA>, 42066 days 12:52:14, "
             "0 days 06:27:14],"
-            "      dtype='timedelta64[s]')",
+            "      dtype='Timedelta64S')",
         ),
     ],
 )

From 8552907566dd9dbd15a1a2b99fbd83d0f32cf4c9 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 31 Aug 2020 11:48:45 -0700
Subject: [PATCH 34/80] all timedelta tests pass

---
 python/cudf/cudf/_lib/types.pyx           | 105 +++++++++++++---------
 python/cudf/cudf/api/types.py             |   8 ++
 python/cudf/cudf/core/column/string.py    |   3 +-
 python/cudf/cudf/core/column/timedelta.py |  21 +++--
 python/cudf/cudf/core/dtypes.py           |   8 +-
 python/cudf/cudf/core/series.py           |   5 +-
 python/cudf/cudf/tests/test_timedelta.py  |  11 +--
 python/cudf/cudf/utils/dtypes.py          |   2 +-
 8 files changed, 97 insertions(+), 66 deletions(-)

diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx
index 457bf16eeab..876b0021fa5 100644
--- a/python/cudf/cudf/_lib/types.pyx
+++ b/python/cudf/cudf/_lib/types.pyx
@@ -18,6 +18,28 @@ from cudf.core.dtypes import ListDtype
 
 cimport cudf._lib.cpp.types as libcudf_types
 from cudf._lib.cpp.types cimport data_type
+from cudf.core.dtypes import (
+    Int8Dtype,
+    Int16Dtype,
+    Int32Dtype, 
+    Int64Dtype, 
+    UInt8Dtype,
+    UInt16Dtype,
+    UInt32Dtype,
+    UInt64Dtype,
+    Float32Dtype,
+    Float64Dtype,
+    StringDtype,
+    BooleanDtype,
+    Timedelta64NSDtype,
+    Timedelta64USDtype,
+    Timedelta64MSDtype,
+    Timedelta64SDtype,
+    Datetime64NSDtype,
+    Datetime64USDtype,
+    Datetime64MSDtype,
+    Datetime64SDtype,
+)
 
 
 class TypeId(IntEnum):
@@ -64,49 +86,49 @@ class TypeId(IntEnum):
 
 
 np_to_cudf_types = {
-    np.dtype("int8"): TypeId.INT8,
-    np.dtype("int16"): TypeId.INT16,
-    np.dtype("int32"): TypeId.INT32,
-    np.dtype("int64"): TypeId.INT64,
-    np.dtype("uint8"): TypeId.UINT8,
-    np.dtype("uint16"): TypeId.UINT16,
-    np.dtype("uint32"): TypeId.UINT32,
-    np.dtype("uint64"): TypeId.UINT64,
-    np.dtype("float32"): TypeId.FLOAT32,
-    np.dtype("float64"): TypeId.FLOAT64,
-    np.dtype("datetime64[s]"): TypeId.TIMESTAMP_SECONDS,
-    np.dtype("datetime64[ms]"): TypeId.TIMESTAMP_MILLISECONDS,
-    np.dtype("datetime64[us]"): TypeId.TIMESTAMP_MICROSECONDS,
-    np.dtype("datetime64[ns]"): TypeId.TIMESTAMP_NANOSECONDS,
-    np.dtype("object"): TypeId.STRING,
-    np.dtype("bool"): TypeId.BOOL8,
-    np.dtype("timedelta64[s]"): TypeId.DURATION_SECONDS,
-    np.dtype("timedelta64[ms]"): TypeId.DURATION_MILLISECONDS,
-    np.dtype("timedelta64[us]"): TypeId.DURATION_MICROSECONDS,
-    np.dtype("timedelta64[ns]"): TypeId.DURATION_NANOSECONDS,
+    Int8Dtype(): TypeId.INT8,
+    Int16Dtype(): TypeId.INT16,
+    Int32Dtype(): TypeId.INT32,
+    Int64Dtype(): TypeId.INT64,
+    UInt8Dtype(): TypeId.UINT8,
+    UInt16Dtype(): TypeId.UINT16,
+    UInt32Dtype(): TypeId.UINT32,
+    UInt64Dtype(): TypeId.UINT64,
+    Float32Dtype(): TypeId.FLOAT32,
+    Float64Dtype(): TypeId.FLOAT64,
+    Datetime64SDtype(): TypeId.TIMESTAMP_SECONDS,
+    Datetime64MSDtype(): TypeId.TIMESTAMP_MILLISECONDS,
+    Datetime64USDtype(): TypeId.TIMESTAMP_MICROSECONDS,
+    Datetime64NSDtype(): TypeId.TIMESTAMP_NANOSECONDS,
+    StringDtype(): TypeId.STRING,
+    BooleanDtype(): TypeId.BOOL8,
+    Timedelta64SDtype(): TypeId.DURATION_SECONDS,
+    Timedelta64MSDtype(): TypeId.DURATION_MILLISECONDS,
+    Timedelta64USDtype(): TypeId.DURATION_MICROSECONDS,
+    Timedelta64NSDtype(): TypeId.DURATION_NANOSECONDS,
 }
 
 cudf_to_np_types = {
-    TypeId.INT8: np.dtype("int8"),
-    TypeId.INT16: np.dtype("int16"),
-    TypeId.INT32: np.dtype("int32"),
-    TypeId.INT64: np.dtype("int64"),
-    TypeId.UINT8: np.dtype("uint8"),
-    TypeId.UINT16: np.dtype("uint16"),
-    TypeId.UINT32: np.dtype("uint32"),
-    TypeId.UINT64: np.dtype("uint64"),
-    TypeId.FLOAT32: np.dtype("float32"),
-    TypeId.FLOAT64: np.dtype("float64"),
-    TypeId.TIMESTAMP_SECONDS: np.dtype("datetime64[s]"),
-    TypeId.TIMESTAMP_MILLISECONDS: np.dtype("datetime64[ms]"),
-    TypeId.TIMESTAMP_MICROSECONDS: np.dtype("datetime64[us]"),
-    TypeId.TIMESTAMP_NANOSECONDS: np.dtype("datetime64[ns]"),
-    TypeId.STRING: np.dtype("object"),
-    TypeId.BOOL8: np.dtype("bool"),
-    TypeId.DURATION_SECONDS: np.dtype("timedelta64[s]"),
-    TypeId.DURATION_MILLISECONDS: np.dtype("timedelta64[ms]"),
-    TypeId.DURATION_MICROSECONDS: np.dtype("timedelta64[us]"),
-    TypeId.DURATION_NANOSECONDS: np.dtype("timedelta64[ns]"),
+    TypeId.INT8: Int8Dtype(),
+    TypeId.INT16: Int16Dtype(),
+    TypeId.INT32: Int32Dtype(),
+    TypeId.INT64: Int64Dtype(),
+    TypeId.UINT8: UInt8Dtype(),
+    TypeId.UINT16: UInt16Dtype(),
+    TypeId.UINT32: UInt32Dtype(),
+    TypeId.UINT64: UInt64Dtype(),
+    TypeId.FLOAT32: Float32Dtype(),
+    TypeId.FLOAT64: Float64Dtype(),
+    TypeId.TIMESTAMP_SECONDS: Datetime64SDtype(),
+    TypeId.TIMESTAMP_MILLISECONDS: Datetime64MSDtype(),
+    TypeId.TIMESTAMP_MICROSECONDS: Datetime64USDtype(),
+    TypeId.TIMESTAMP_NANOSECONDS: Datetime64NSDtype(),
+    TypeId.STRING: StringDtype(),
+    TypeId.BOOL8: BooleanDtype(),
+    TypeId.DURATION_SECONDS: Timedelta64SDtype(),
+    TypeId.DURATION_MILLISECONDS: Timedelta64MSDtype(),
+    TypeId.DURATION_MICROSECONDS: Timedelta64USDtype(),
+    TypeId.DURATION_NANOSECONDS: Timedelta64NSDtype(),
 }
 
 duration_unit_map = {
@@ -169,10 +191,9 @@ cdef class _Dtype:
         cdef data_type libcudf_type 
 
         if not isinstance(self, ListDtype):
-            np_dtype = self.to_numpy
             tid = <libcudf_types.type_id> (
                     <underlying_type_t_type_id> (
-                        np_to_cudf_types[np_dtype]
+                        np_to_cudf_types[self]
                     )
                 )
         else:
diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py
index 13494a4bdc3..df785906d0b 100644
--- a/python/cudf/cudf/api/types.py
+++ b/python/cudf/cudf/api/types.py
@@ -95,3 +95,11 @@ def find_common_type(array_types=[], scalar_types=[]):
     scalar_types = [d.to_numpy if isinstance(d, cudf.Generic) else d for d in scalar_types]
 
     return cudf.dtype(np.find_common_type(array_types, scalar_types))
+
+def can_cast(dtype_l, dtype_r):
+    """Like ``np.can_cast``, but cudf dtypes are first converted to numpy."""
+    if isinstance(dtype_l, cudf.Generic):
+        dtype_l = dtype_l.to_numpy
+    if isinstance(dtype_r, cudf.Generic):
+        dtype_r = dtype_r.to_numpy
+    return np.can_cast(dtype_l, dtype_r)
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 96c064c7328..d44ebcb474f 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -4531,7 +4531,6 @@ def _nbytes(self):
             return self.children[1].size
 
     def as_numerical_column(self, dtype, **kwargs):
-
         out_dtype = cudf.dtype(dtype)
         kwargs.update(dtype=out_dtype)
 
@@ -4554,7 +4553,7 @@ def as_numerical_column(self, dtype, **kwargs):
                 raise ValueError("Could not convert `None` value to datetime")
 
             boolean_match = self.binary_operator("eq", "NaT")
-        elif out_dtype.type is cudf.Timedelta:
+        elif out_dtype.type is np.timedelta64:
             if "format" not in kwargs:
                 if len(self) > 0:
                     kwargs.update(format="%D days %H:%M:%S")
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index b20d943d30c..25d0e711bbb 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -14,7 +14,7 @@
 from cudf.core.column.datetime import _numpy_to_pandas_conversion
 from cudf.utils.dtypes import is_scalar, np_to_pa_dtype
 from cudf.utils.utils import buffers_from_pyarrow
-
+from cudf.api.types import can_cast
 _dtype_to_format_conversion = {
     "Timedelta64NS": "%D days %H:%M:%S",
     "Timedelta64US": "%D days %H:%M:%S",
@@ -118,13 +118,13 @@ def _binary_op_floordiv(self, rhs):
                     if isinstance(rhs, Scalar):
                         rhs = np.timedelta64(rhs.value)
 
-                    rhs = rhs.astype(common_dtype).astype("float64")
+                    rhs = rhs.astype(common_dtype.to_numpy).astype("float64")
                 else:
                     rhs = as_scalar(None, "float64")
             else:
-                rhs = rhs.astype(common_dtype).astype("float64")
+                rhs = rhs.astype(common_dtype.to_numpy).astype("float64")
 
-            out_dtype = np.dtype("int64")
+            out_dtype = cudf.Int64Dtype()
         elif rhs.dtype.kind in ("f", "i", "u"):
             out_dtype = self.dtype
         else:
@@ -187,13 +187,13 @@ def _binary_op_truediv(self, rhs):
                     if isinstance(rhs, Scalar):
                         rhs = np.timedelta64(rhs.value)
 
-                    rhs = rhs.astype(common_dtype).astype("float64")
+                    rhs = rhs.astype(common_dtype.to_numpy).astype("float64")
                 else:
                     rhs = as_scalar(None, "float64")
             else:
                 rhs = rhs.astype(common_dtype).astype("float64")
 
-            out_dtype = np.dtype("float64")
+            out_dtype = cudf.Float64Dtype()
         elif rhs.dtype.kind in ("f", "i", "u"):
             out_dtype = self.dtype
         else:
@@ -206,7 +206,6 @@ def _binary_op_truediv(self, rhs):
 
     def binary_operator(self, op, rhs, reflect=False):
         lhs, rhs = self, rhs
-
         if op in ("eq", "ne"):
             out_dtype = self._binary_op_eq_ne(rhs)
         elif op in ("lt", "gt", "le", "ge"):
@@ -251,7 +250,7 @@ def normalize_binop_value(self, other):
                 other = other.astype("timedelta64[s]")
             else:
                 common_dtype = determine_out_dtype(self.dtype, other.dtype)
-                other = other.astype(common_dtype)
+                other = other.astype(common_dtype.to_numpy)
             return as_scalar(other)
         elif np.isscalar(other):
             return as_scalar(other)
@@ -298,7 +297,7 @@ def fillna(self, fill_value):
         if is_scalar(fill_value):
             if isinstance(fill_value, np.timedelta64):
                 dtype = determine_out_dtype(self.dtype, fill_value.dtype)
-                fill_value = fill_value.astype(dtype)
+                fill_value = fill_value.astype(dtype.to_numpy)
                 col = col.astype(dtype)
             elif not isinstance(fill_value, Scalar):
                 fill_value = np.timedelta64(fill_value)
@@ -572,9 +571,9 @@ def binop(lhs, rhs, op, out_dtype):
 
 
 def determine_out_dtype(lhs_dtype, rhs_dtype):
-    if np.can_cast(cudf.dtype(lhs_dtype).to_numpy, cudf.dtype(rhs_dtype).to_numpy):
+    if can_cast(lhs_dtype, rhs_dtype):
         return cudf.dtype(rhs_dtype)
-    elif np.can_cast(cudf.dtype(rhs_dtype).to_numpy, cudf.dtype(lhs_dtype).to_numpy):
+    elif can_cast(rhs_dtype, lhs_dtype):
         return cudf.dtype(lhs_dtype)
     else:
         raise TypeError(f"Cannot type-cast {lhs_dtype} and {rhs_dtype}")
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 5ac2fae2c03..e369494fcf9 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -301,7 +301,7 @@ def make_dtype_from_string(obj):
         elif obj in {"uint8", "UInt8"}:
             return UInt8Dtype()
     elif "float" in obj or "Float" in obj:
-        if obj in {"float64", "Float64"}:
+        if obj in {"float64", "Float64", 'float', 'Float'}:
             return Float64Dtype()
         elif obj in {"float32", "Float32"}:
             return Float32Dtype()
@@ -342,7 +342,11 @@ def dtype(obj):
     if isinstance(obj, np.dtype):
         if obj.type is np.str_:
             return StringDtype()
-        return np_to_cudf_dtypes[obj]
+        try:
+            return np_to_cudf_dtypes[obj]
+        except KeyError:
+            import pdb
+            pdb.set_trace()
     elif isinstance(obj, pa.lib.DataType):
         return pa_to_cudf_dtypes[obj]
     elif isinstance(obj, str):
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index aacabeed0b2..496c20fb677 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -1381,7 +1381,7 @@ def __rtruediv__(self, other):
     __div__ = __truediv__
 
     def _bitwise_binop(self, other, op):
-        if (isinstance(self.dtype, (cudf.BooleanDtype, cudf.Integer))) and (isinstance(other.dtype, (cudf.BooleanDtype, cudf.Integer))):
+        if (isinstance(self.dtype, (cudf.BooleanDtype, cudf.Integer, cudf.Timedelta))) and (isinstance(other.dtype, (cudf.BooleanDtype, cudf.Integer, cudf.Timedelta))):
             ser = self._binaryop(other, op)
             if isinstance(self.dtype, cudf.BooleanDtype) or isinstance(other.dtype, cudf.BooleanDtype):
                 ser = ser.astype(cudf.BooleanDtype())
@@ -5028,7 +5028,7 @@ def _fix_nullable_dtype_repr(string):
         'datetime64[ns]', 
         'datetime64[us]', 
         'datetime64[ms]', 
-        'datetime64[s]'
+        'datetime64[s]',
         'timedelta64[ns]',
         'timedelta64[us]',
         'timedelta64[ms]',
@@ -5058,7 +5058,6 @@ def _fix_nullable_dtype_repr(string):
         'Timedelta64MS',
         'Timedelta64S'
     ]
-
     for tr, rp in zip(to_replace, replacements):
         string = string.replace(tr, rp)
     return string
diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py
index 8a3e7acd68d..634517368d5 100644
--- a/python/cudf/cudf/tests/test_timedelta.py
+++ b/python/cudf/cudf/tests/test_timedelta.py
@@ -976,7 +976,7 @@ def test_timedelta_invalid_ops():
         with pytest.raises(
             TypeError,
             match=re.escape(
-                f"Addition of {sr.dtype} with {np.dtype('int64')} "
+                f"Addition of {sr.dtype} with Int64 "
                 f"cannot be performed."
             ),
         ):
@@ -990,7 +990,7 @@ def test_timedelta_invalid_ops():
         with pytest.raises(
             TypeError,
             match=re.escape(
-                f"Addition of {sr.dtype} with {np.dtype('object')} "
+                f"Addition of {sr.dtype} with String "
                 f"cannot be performed."
             ),
         ):
@@ -1021,7 +1021,7 @@ def test_timedelta_invalid_ops():
         with pytest.raises(
             TypeError,
             match=re.escape(
-                f"Modulus of {sr.dtype} with {np.dtype('object')} "
+                f"Modulus of {sr.dtype} with String "
                 f"cannot be performed."
             ),
         ):
@@ -1157,13 +1157,14 @@ def test_timedelta_invalid_ops():
 
 
 def test_timedelta_datetime_cast_invalid():
+    from cudf.core.series import _fix_nullable_dtype_repr
     sr = cudf.Series([1, 2, 3], dtype="timedelta64[ns]")
     psr = sr.to_pandas()
 
     try:
         psr.astype("datetime64[ns]")
     except TypeError as e:
-        with pytest.raises(type(e), match=re.escape(e.__str__())):
+        with pytest.raises(type(e), match=re.escape(_fix_nullable_dtype_repr(e.__str__()))):
             sr.astype("datetime64[ns]")
     else:
         raise AssertionError("Expected timedelta to datetime typecast to fail")
@@ -1174,7 +1175,7 @@ def test_timedelta_datetime_cast_invalid():
     try:
         psr.astype("timedelta64[ns]")
     except TypeError as e:
-        with pytest.raises(type(e), match=re.escape(e.__str__())):
+        with pytest.raises(type(e), match=re.escape(_fix_nullable_dtype_repr(e.__str__()))):
             sr.astype("timedelta64[ns]")
     else:
         raise AssertionError("Expected datetime to timedelta typecast to fail")
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 7efc0cd049f..67463b7317f 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -360,7 +360,7 @@ def check_cast_unsupported_dtype(dtype):
     else:
         dtype = np.dtype(dtype)
 
-    if dtype in cudf._lib.types.np_to_cudf_types:
+    if cudf.dtype(dtype) in cudf._lib.types.np_to_cudf_types:
         return dtype
 
     if dtype == np.dtype("float16"):

From 2b59285ed7e12e2faadf724b31b904a40a530aa6 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 31 Aug 2020 13:14:32 -0700
Subject: [PATCH 35/80] sorting tests pass

---
 python/cudf/cudf/_lib/string_casting.pyx | 35 ++++++++++++++++--------
 python/cudf/cudf/api/types.py            |  8 ++++++
 python/cudf/cudf/core/indexing.py        |  4 +--
 3 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx
index aee554017af..3f63bb23d6f 100644
--- a/python/cudf/cudf/_lib/string_casting.pyx
+++ b/python/cudf/cudf/_lib/string_casting.pyx
@@ -52,7 +52,18 @@ from cudf._lib.cpp.types cimport (
 
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-
+from cudf.core.dtypes import (
+    Int8Dtype,
+    Int16Dtype,
+    Int32Dtype, 
+    Int64Dtype, 
+    UInt8Dtype,
+    UInt16Dtype,
+    UInt32Dtype,
+    UInt64Dtype,
+    Float32Dtype,
+    Float64Dtype,
+)
 
 def floating_to_string(Column input_col):
     cdef column_view input_column_view = input_col.view()
@@ -112,7 +123,7 @@ def stod(Column input_col, **kwargs):
     A Column with strings cast to double
     """
 
-    return string_to_floating(input_col, np.dtype("float64"))
+    return string_to_floating(input_col, Float64Dtype())
 
 
 def ftos(Column input_col, **kwargs):
@@ -144,7 +155,7 @@ def stof(Column input_col, **kwargs):
     A Column with strings cast to float
     """
 
-    return string_to_floating(input_col, np.dtype("float32"))
+    return string_to_floating(input_col, Float32Dtype())
 
 
 def integer_to_string(Column input_col):
@@ -205,7 +216,7 @@ def stoi8(Column input_col, **kwargs):
     A Column with strings cast to int8
     """
 
-    return string_to_integer(input_col, np.dtype("int8"))
+    return string_to_integer(input_col, Int8Dtype())
 
 
 def i16tos(Column input_col, **kwargs):
@@ -237,7 +248,7 @@ def stoi16(Column input_col, **kwargs):
     A Column with strings cast to int16
     """
 
-    return string_to_integer(input_col, np.dtype("int16"))
+    return string_to_integer(input_col, Int16Dtype())
 
 
 def itos(Column input_col, **kwargs):
@@ -269,7 +280,7 @@ def stoi(Column input_col, **kwargs):
     A Column with strings cast to int32
     """
 
-    return string_to_integer(input_col, np.dtype("int32"))
+    return string_to_integer(input_col, Int32Dtype())
 
 
 def ltos(Column input_col, **kwargs):
@@ -301,7 +312,7 @@ def stol(Column input_col, **kwargs):
     A Column with strings cast to int64
     """
 
-    return string_to_integer(input_col, np.dtype("int64"))
+    return string_to_integer(input_col, Int64Dtype())
 
 
 def ui8tos(Column input_col, **kwargs):
@@ -333,7 +344,7 @@ def stoui8(Column input_col, **kwargs):
     A Column with strings cast to uint8
     """
 
-    return string_to_integer(input_col, np.dtype("uint8"))
+    return string_to_integer(input_col, UInt8Dtype())
 
 
 def ui16tos(Column input_col, **kwargs):
@@ -365,7 +376,7 @@ def stoui16(Column input_col, **kwargs):
     A Column with strings cast to uint16
     """
 
-    return string_to_integer(input_col, np.dtype("uint16"))
+    return string_to_integer(input_col, UInt16Dtype())
 
 
 def uitos(Column input_col, **kwargs):
@@ -397,7 +408,7 @@ def stoui(Column input_col, **kwargs):
     A Column with strings cast to uint32
     """
 
-    return string_to_integer(input_col, np.dtype("uint32"))
+    return string_to_integer(input_col, UInt32Dtype())
 
 
 def ultos(Column input_col, **kwargs):
@@ -429,7 +440,7 @@ def stoul(Column input_col, **kwargs):
     A Column with strings cast to uint64
     """
 
-    return string_to_integer(input_col, np.dtype("uint64"))
+    return string_to_integer(input_col, UInt64Dtype())
 
 
 def _to_booleans(Column input_col, object string_true="True"):
@@ -717,7 +728,7 @@ def htoi(Column input_col, **kwargs):
     cdef column_view input_column_view = input_col.view()
     cdef type_id tid = <type_id> (
         <underlying_type_t_type_id> (
-            np_to_cudf_types[kwargs.get('dtype', np.dtype("int64"))]
+            np_to_cudf_types[kwargs.get('dtype', Int64Dtype())]
         )
     )
     cdef data_type c_out_type = data_type(tid)
diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py
index df785906d0b..a89adf10c22 100644
--- a/python/cudf/cudf/api/types.py
+++ b/python/cudf/cudf/api/types.py
@@ -103,3 +103,11 @@ def can_cast(dtype_l, dtype_r):
         dtype_r = dtype_r.to_numpy
 
     return np.can_cast(dtype_l, dtype_r)
+
+def result_type(dtype_l, dtype_r):
+    if isinstance(dtype_l, cudf.Generic):
+        dtype_l = dtype_l.to_numpy
+    if isinstance(dtype_r, cudf.Generic):
+        dtype_r = dtype_r.to_numpy
+
+    return cudf.dtype(np.result_type(dtype_l, dtype_r))
diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py
index dcc9ddf9315..ba388b45f21 100755
--- a/python/cudf/cudf/core/indexing.py
+++ b/python/cudf/cudf/core/indexing.py
@@ -95,8 +95,8 @@ def __setitem__(self, key, value):
         ):
             # normalize types if necessary:
             if not pd.api.types.is_integer(key):
-                to_dtype = np.result_type(value.dtype, self._sr._column.dtype)
-                value = value.astype(to_dtype)
+                to_dtype = cudf.api.types.result_type(value.dtype, self._sr._column.dtype)
+                value = value.astype(to_dtype.to_numpy)
                 self._sr._column._mimic_inplace(
                     self._sr._column.astype(to_dtype), inplace=True
                 )

From b2851a2fa4248626bef45be06d0a9f77862ffa94 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 31 Aug 2020 13:45:16 -0700
Subject: [PATCH 36/80] fix more tests

---
 python/cudf/cudf/_lib/binaryop.pyx        | 11 ++---------
 python/cudf/cudf/core/column/column.py    |  4 ++--
 python/cudf/cudf/tests/test_udf_binops.py |  3 ++-
 python/cudf/cudf/utils/dtypes.py          |  2 +-
 4 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/python/cudf/cudf/_lib/binaryop.pyx b/python/cudf/cudf/_lib/binaryop.pyx
index 4323e1f4b79..18c72da25f9 100644
--- a/python/cudf/cudf/_lib/binaryop.pyx
+++ b/python/cudf/cudf/_lib/binaryop.pyx
@@ -224,15 +224,8 @@ def binaryop_udf(Column lhs, Column rhs, udf_ptx, dtype):
     """
     cdef column_view c_lhs = lhs.view()
     cdef column_view c_rhs = rhs.view()
-
-    cdef type_id tid = (
-        <type_id> (
-            <underlying_type_t_type_id> (
-                np_to_cudf_types[np.dtype(dtype)]
-            )
-        )
-    )
-    cdef data_type c_dtype = data_type(tid)
+    cdef _Dtype pydtype = dtype
+    cdef data_type c_dtype = pydtype.get_libcudf_type()
 
     cdef string cpp_str = udf_ptx.encode("UTF-8")
 
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 94f05935537..da4c9ab06b3 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -27,7 +27,7 @@
 from cudf.core.dtypes import CategoricalDtype
 from cudf.utils import ioutils, utils
 from cudf.utils.dtypes import (
-    NUMERIC_TYPES,
+    NEW_NUMERIC_TYPES,
     check_cast_unsupported_dtype,
     get_time_unit,
     is_scalar,
@@ -119,7 +119,7 @@ def __len__(self):
         return self.size
 
     def to_pandas(self, index=None, **kwargs):
-        if str(self.dtype) in NUMERIC_TYPES and self.null_count == 0:
+        if str(self.dtype) in NEW_NUMERIC_TYPES and self.null_count == 0:
             pd_series = pd.Series(cupy.asnumpy(self.values))
         else:
             pd_series = self.to_arrow().to_pandas(**kwargs)
diff --git a/python/cudf/cudf/tests/test_udf_binops.py b/python/cudf/cudf/tests/test_udf_binops.py
index e3d03b80ae2..dfae94f1279 100644
--- a/python/cudf/cudf/tests/test_udf_binops.py
+++ b/python/cudf/cudf/tests/test_udf_binops.py
@@ -5,6 +5,7 @@
 import numpy as np
 import pytest
 
+import cudf
 from cudf import _lib as libcudf
 from cudf.core import Series
 from cudf.utils import dtypes as dtypeutils
@@ -44,7 +45,7 @@ def generic_function(a, b):
     output_type = numpy_support.as_dtype(result.signature.return_type)
 
     out_col = libcudf.binaryop.binaryop_udf(
-        lhs_col, rhs_col, ptx_code, output_type.type
+        lhs_col, rhs_col, ptx_code, cudf.dtype(output_type.type)
     )
 
     result = lhs_arr ** 3 + rhs_arr
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 67463b7317f..bd5a1f4ab2c 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -67,7 +67,7 @@
 }
 OTHER_TYPES = {"bool", "category", "str"}
 ALL_TYPES = NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | OTHER_TYPES
-
+NEW_NUMERIC_TYPES = {'Int8', 'Int16', 'Int32', 'Int64', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'Float32', 'Float64'}
 
 def np_to_pa_dtype(dtype):
     """Util to convert numpy dtype to PyArrow dtype.

From 9540643209fb9d21a5466081eb1233f42896e599 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 1 Sep 2020 07:11:08 -0700
Subject: [PATCH 37/80] hackily pass select_dtype tests

---
 python/cudf/cudf/core/dataframe.py       | 14 +++++++-------
 python/cudf/cudf/core/series.py          |  2 +-
 python/cudf/cudf/tests/test_dataframe.py |  6 ++----
 python/cudf/cudf/tests/test_setitem.py   |  2 +-
 python/cudf/cudf/utils/dtypes.py         | 10 +++++++++-
 5 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 7f998784d34..93956f35a17 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -3206,7 +3206,7 @@ def as_gpu_matrix(self, columns=None, order="F"):
         for colidx, inpcol in enumerate(cols):
             dense = inpcol.astype(cupy_dtype)
             matrix[:, colidx] = dense
-        return cuda.as_cuda_array(matrix).view(dtype)
+        return cuda.as_cuda_array(matrix).view(cupy_dtype)
 
     def as_matrix(self, columns=None):
         """Convert to a matrix in host memory.
@@ -6324,20 +6324,20 @@ def select_dtypes(self, include=None, exclude=None):
                     inc_ex=(include & exclude)
                 )
             )
-
         # include all subtypes
+
         include_subtypes = set()
-        for dtype in self.dtypes:
+        for dtype in (d.__class__ for d in self.dtypes):
             for i_dtype in include:
                 # category handling
                 if is_categorical_dtype(i_dtype):
                     include_subtypes.add(i_dtype)
-                elif isinstance(dtype, i_dtype):
-                    include_subtypes.add(dtype.__class__)
+                elif issubclass(dtype, i_dtype):
+                    include_subtypes.add(dtype)
     
         # exclude all subtypes
         exclude_subtypes = set()
-        for dtype in self.dtypes:
+        for dtype in (d.__class__ for d in self.dtypes):
             for e_dtype in exclude:
                 # category handling
                 if is_categorical_dtype(e_dtype):
@@ -6367,7 +6367,7 @@ def select_dtypes(self, include=None, exclude=None):
     @ioutils.doc_to_parquet()
     def to_parquet(self, path, *args, **kwargs):
         """{docstring}"""
-        from cudf.io import parquet as pq
+        from cudf.io import parquet as pq6
 
         return pq.to_parquet(self, path, *args, **kwargs)
 
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 496c20fb677..bfe1cc74151 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -3902,7 +3902,7 @@ def describe_categorical(self):
             # pandas defaults
             percentiles = np.array([0.25, 0.5, 0.75])
 
-        if np.issubdtype(self.dtype, np.number):
+        if isinstance(self.dtype, cudf.Number):
             return describe_numeric(self)
         else:
             raise NotImplementedError(
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 000827cc9c8..6bb927bd51c 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -25,6 +25,7 @@
     does_not_raise,
     gen_rand,
 )
+from cudf.core.dtypes import Number
 
 
 def test_init_via_list_of_tuples():
@@ -2516,15 +2517,12 @@ def test_select_dtype():
         nrows=20, dtypes={"a": "category", "b": int, "c": float, "d": str}
     )
     pdf = gdf.to_pandas()
-
     assert_eq(pdf.select_dtypes("float64"), gdf.select_dtypes("float64"))
     assert_eq(pdf.select_dtypes(np.float64), gdf.select_dtypes(np.float64))
     assert_eq(
         pdf.select_dtypes(include=["float64"]),
         gdf.select_dtypes(include=["float64"]),
     )
-    import pdb
-    pdb.set_trace()
     assert_eq(
         pdf.select_dtypes(include=["object", "int", "category"]),
         gdf.select_dtypes(include=["object", "int", "category"]),
@@ -2536,7 +2534,7 @@ def test_select_dtype():
     )
     assert_eq(
         pdf.select_dtypes(include=np.number),
-        gdf.select_dtypes(include=np.number),
+        gdf.select_dtypes(include=Number),
     )
     assert_eq(
         pdf.select_dtypes(include=[np.int64, np.float64]),
diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py
index 9ae5c17da47..e0d35f2eb5c 100644
--- a/python/cudf/cudf/tests/test_setitem.py
+++ b/python/cudf/cudf/tests/test_setitem.py
@@ -143,7 +143,7 @@ def test_series_set_equal_length_object_by_mask(replace_data):
     # Lengths match in trivial case
     pd_bool_col = pd.Series([True] * len(psr))
     gd_bool_col = Series.from_pandas(pd_bool_col)
-
+    
     psr[pd_bool_col] = (
         replace_data.to_pandas()
         if hasattr(replace_data, "to_pandas")
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index bd5a1f4ab2c..a9c265247c8 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -13,6 +13,7 @@
 import cudf
 from cudf._lib.scalar import Scalar
 from cudf.api.types import is_categorical_dtype
+import inspect
 
 
 _NA_REP = "<NA>"
@@ -139,12 +140,19 @@ def cudf_dtype_from_pydata_dtype(dtype):
     """
     if isinstance(dtype, cudf.Generic):
         return dtype.__class__
+    if inspect.isclass(dtype):
+        if issubclass(dtype, cudf.Generic):
+            return dtype
     if is_categorical_dtype(dtype):
         return cudf.core.dtypes.CategoricalDtype
     elif np.issubdtype(dtype, np.datetime64):
         dtype = np.datetime64
 
-    return cudf.dtype(infer_dtype_from_object(dtype)).__class__
+    result = cudf.dtype(infer_dtype_from_object(dtype))
+    if isinstance(result, cudf.Generic): 
+        return result.__class__
+    elif inspect.isclass(result):
+        return result
 
 
 def is_scalar(val):

From 781b42ee8bc300d787c4fc6e15cc0913cb66e757 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 1 Sep 2020 11:41:32 -0700
Subject: [PATCH 38/80] all dataframe tests pass!

---
 python/cudf/cudf/core/column/column.py    |  3 +-
 python/cudf/cudf/core/column/numerical.py |  2 +-
 python/cudf/cudf/core/dataframe.py        | 10 ++--
 python/cudf/cudf/core/dtypes.py           | 10 ++++
 python/cudf/cudf/core/series.py           |  4 +-
 python/cudf/cudf/tests/test_dataframe.py  | 73 ++++++++++++-----------
 python/cudf/cudf/utils/dtypes.py          |  4 ++
 7 files changed, 60 insertions(+), 46 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index da4c9ab06b3..fb4362c7677 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -769,7 +769,7 @@ def isin(self, values):
             lhs_cats = lhs.cat().categories._values
             rhs_cats = rhs.cat().categories._values
 
-            if not np.issubdtype(rhs_cats.dtype, lhs_cats.dtype):
+            if not isinstance(rhs_cats.dtype, type(lhs_cats.dtype)):
                 # If they're not the same dtype, short-circuit if the values
                 # list doesn't have any nulls. If it does have nulls, make
                 # the values list a Categorical with a single null
@@ -1083,7 +1083,6 @@ def column_empty(row_count, dtype="object", masked=False):
     """
     dtype = pd.api.types.pandas_dtype(dtype)
     children = ()
-
     if is_categorical_dtype(dtype):
         data = None
         children = (
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index d467d0d0ddb..ccab749e3ba 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -233,7 +233,7 @@ def round(self, decimals=0):
             msg = "Decimal values < 0 are not yet supported."
             raise NotImplementedError(msg)
 
-        if np.issubdtype(self.dtype, np.integer):
+        if isinstance(self.dtype, cudf.Integer):
             return self
 
         data = Buffer(
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 93956f35a17..1bcbea8d8be 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -4522,7 +4522,7 @@ def _sizeof_fmt(num, size_qualifier):
                 deep = True
             else:
                 deep = False
-                if "object" in dtype_counts or self.index.dtype == "object":
+                if "String" in dtype_counts or self.index.dtype == cudf.StringDtype():
                     size_qualifier = "+"
             mem_usage = self.memory_usage(index=True, deep=deep).sum()
             lines.append(
@@ -5308,12 +5308,12 @@ def isin(self, values):
                     isinstance(
                         self[col]._column, cudf.core.column.CategoricalColumn
                     )
-                    or np.issubdtype(self[col].dtype, np.dtype("object"))
+                    or isinstance(self[col].dtype, cudf.StringDtype)
                 ) or (
                     isinstance(
                         values._column, cudf.core.column.CategoricalColumn
                     )
-                    or np.issubdtype(values.dtype, np.dtype("object"))
+                    or isinstance(values.dtype, cudf.StringDtype)
                 ):
                     result[col] = utils.scalar_broadcast_to(False, len(self))
                 else:
@@ -5371,8 +5371,8 @@ def _prepare_for_rowwise_op(self):
             )
             raise ValueError(msg)
 
-        filtered = self.select_dtypes(include=[np.number, np.bool])
-        common_dtype = np.find_common_type(filtered.dtypes, [])
+        filtered = self.select_dtypes(include=[cudf.Number, cudf.BooleanDtype])
+        common_dtype = cudf.api.types.find_common_type(filtered.dtypes, [])
         coerced = filtered.astype(common_dtype)
         return coerced
 
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index e369494fcf9..80c8d9ce7e8 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -329,6 +329,12 @@ def dtype(obj):
 
     if obj is None:
         return None
+    if obj is str:
+        return cudf.StringDtype()
+    if obj is int:
+        return cudf.Int64Dtype()
+    if obj is float:
+        return cudf.Float64Dtype()
     if isinstance(obj, pd.CategoricalDtype):
         return cudf.CategoricalDtype.from_pandas(obj)
     if isinstance(obj, CategoricalDtype):
@@ -478,6 +484,10 @@ def deserialize(cls, header, frames):
         )
         return cls(categories=categories, ordered=ordered)
 
+    @property
+    def kind(self):
+        return 'O'
+
 
 class ListDtype(Generic):
 
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index bfe1cc74151..a7dd73fbc4a 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -3945,7 +3945,7 @@ def diff(self, periods=1):
                 "Diff currently requires columns with no null values"
             )
 
-        if not np.issubdtype(self.dtype, np.number):
+        if not isinstance(self.dtype, cudf.Number):
             raise NotImplementedError(
                 "Diff currently only supports numeric dtypes"
             )
@@ -3953,7 +3953,7 @@ def diff(self, periods=1):
         # TODO: move this libcudf
         input_col = self._column
         output_col = column_empty_like(input_col)
-        output_mask = column_empty_like(input_col, dtype="bool")
+        output_mask = column_empty_like(input_col, dtype=cudf.BooleanDtype())
         if output_col.size > 0:
             cudautils.gpu_diff.forall(output_col.size)(
                 input_col, output_col, output_mask, periods
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 6bb927bd51c..60f7d93e467 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -3237,30 +3237,31 @@ def test_empty_dataframe_describe():
 
 def test_as_column_types():
     from cudf.core.column import column
+    from cudf import Float32Dtype, Float64Dtype, StringDtype
 
     col = column.as_column(Series([]))
-    assert_eq(col.dtype, np.dtype("float64"))
+    assert isinstance(col.dtype, Float64Dtype)
     gds = Series(col)
     pds = pd.Series(pd.Series([]))
 
     assert_eq(pds, gds)
 
     col = column.as_column(Series([]), dtype="float32")
-    assert_eq(col.dtype, np.dtype("float32"))
+    assert isinstance(col.dtype, Float32Dtype)
     gds = Series(col)
     pds = pd.Series(pd.Series([], dtype="float32"))
 
     assert_eq(pds, gds)
 
     col = column.as_column(Series([]), dtype="str")
-    assert_eq(col.dtype, np.dtype("object"))
+    assert isinstance(col.dtype, StringDtype)
     gds = Series(col)
     pds = pd.Series(pd.Series([], dtype="str"))
 
     assert_eq(pds, gds)
 
     col = column.as_column(Series([]), dtype="object")
-    assert_eq(col.dtype, np.dtype("object"))
+    assert isinstance(col.dtype, StringDtype)
     gds = Series(col)
     pds = pd.Series(pd.Series([], dtype="object"))
 
@@ -5619,17 +5620,17 @@ def test_dataframe_info_basic():
     Data columns (total 10 columns):
      #   Column  Non-Null Count  Dtype
     ---  ------  --------------  -----
-     0   0       10 non-null     float64
-     1   1       10 non-null     float64
-     2   2       10 non-null     float64
-     3   3       10 non-null     float64
-     4   4       10 non-null     float64
-     5   5       10 non-null     float64
-     6   6       10 non-null     float64
-     7   7       10 non-null     float64
-     8   8       10 non-null     float64
-     9   9       10 non-null     float64
-    dtypes: float64(10)
+     0   0       10 non-null     Float64
+     1   1       10 non-null     Float64
+     2   2       10 non-null     Float64
+     3   3       10 non-null     Float64
+     4   4       10 non-null     Float64
+     5   5       10 non-null     Float64
+     6   6       10 non-null     Float64
+     7   7       10 non-null     Float64
+     8   8       10 non-null     Float64
+     9   9       10 non-null     Float64
+    dtypes: Float64(10)
     memory usage: 859.0+ bytes
     """
     )
@@ -5652,9 +5653,9 @@ def test_dataframe_info_verbose_mem_usage():
     Data columns (total 2 columns):
      #   Column  Non-Null Count  Dtype
     ---  ------  --------------  -----
-     0   a       3 non-null      int64
-     1   b       3 non-null      object
-    dtypes: int64(1), object(1)
+     0   a       3 non-null      Int64
+     1   b       3 non-null      String
+    dtypes: Int64(1), String(1)
     memory usage: 56.0+ bytes
     """
     )
@@ -5670,7 +5671,7 @@ def test_dataframe_info_verbose_mem_usage():
     <class 'cudf.core.dataframe.DataFrame'>
     RangeIndex: 3 entries, 0 to 2
     Columns: 2 entries, a to b
-    dtypes: int64(1), object(1)
+    dtypes: Int64(1), String(1)
     memory usage: 56.0+ bytes
     """
     )
@@ -5692,9 +5693,9 @@ def test_dataframe_info_verbose_mem_usage():
     Data columns (total 2 columns):
      #   Column  Non-Null Count  Dtype
     ---  ------  --------------  -----
-     0   a       3 non-null      int64
-     1   b       3 non-null      object
-    dtypes: int64(1), object(1)
+     0   a       3 non-null      Int64
+     1   b       3 non-null      String
+    dtypes: Int64(1), String(1)
     memory usage: 91.0 bytes
     """
     )
@@ -5723,10 +5724,10 @@ def test_dataframe_info_verbose_mem_usage():
     Data columns (total 3 columns):
      #   Column     Non-Null Count  Dtype
     ---  ------     --------------  -----
-     0   int_col    5 non-null      int64
-     1   text_col   5 non-null      object
-     2   float_col  5 non-null      float64
-    dtypes: float64(1), int64(1), object(1)
+     0   int_col    5 non-null      Int64
+     1   text_col   5 non-null      String
+     2   float_col  5 non-null      Float64
+    dtypes: Float64(1), Int64(1), String(1)
     memory usage: 130.0 bytes
     """
     )
@@ -5758,10 +5759,10 @@ def test_dataframe_info_null_counts():
     Data columns (total 3 columns):
      #   Column     Dtype
     ---  ------     -----
-     0   int_col    int64
-     1   text_col   object
-     2   float_col  float64
-    dtypes: float64(1), int64(1), object(1)
+     0   int_col    Int64
+     1   text_col   String
+     2   float_col  Float64
+    dtypes: Float64(1), Int64(1), String(1)
     memory usage: 130.0+ bytes
     """
     )
@@ -5808,9 +5809,9 @@ def test_dataframe_info_null_counts():
     Data columns (total 2 columns):
      #   Column  Dtype
     ---  ------  -----
-     0   a       int64
-     1   b       object
-    dtypes: int64(1), object(1)
+     0   a       Int64
+     1   b       String
+    dtypes: Int64(1), String(1)
     memory usage: 238.0+ bytes
     """
     )
@@ -5830,9 +5831,9 @@ def test_dataframe_info_null_counts():
     Data columns (total 2 columns):
      #   Column  Non-Null Count  Dtype
     ---  ------  --------------  -----
-     0   a       6 non-null      int64
-     1   b       6 non-null      object
-    dtypes: int64(1), object(1)
+     0   a       6 non-null      Int64
+     1   b       6 non-null      String
+    dtypes: Int64(1), String(1)
     memory usage: 238.0+ bytes
     """
     )
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index a9c265247c8..7f609797397 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -360,6 +360,10 @@ def min_column_type(x, expected_type):
 
 
 def check_cast_unsupported_dtype(dtype):
+
+    if isinstance(dtype, cudf.Generic):
+        return dtype.to_numpy
+
     if is_categorical_dtype(dtype):
         return dtype
 

From 13fe291dbe9965d454a28ff4f98d1d13b8da7ad2 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 2 Sep 2020 08:14:48 -0700
Subject: [PATCH 39/80] much more progress

---
 python/cudf/cudf/_lib/transform.pyx        | 14 +++++---------
 python/cudf/cudf/api/types.py              |  2 ++
 python/cudf/cudf/core/column/column.py     |  9 +++++++--
 python/cudf/cudf/core/column/datetime.py   |  7 ++++---
 python/cudf/cudf/core/column/numerical.py  |  3 +++
 python/cudf/cudf/core/dataframe.py         |  6 +++++-
 python/cudf/cudf/core/dtypes.py            |  7 ++-----
 python/cudf/cudf/core/indexing.py          |  9 ++++-----
 python/cudf/cudf/core/join/join.py         |  3 +++
 python/cudf/cudf/core/tools/datetimes.py   |  4 ++--
 python/cudf/cudf/tests/test_categorical.py | 15 ++++++++-------
 python/cudf/cudf/tests/test_datetime.py    |  5 +++--
 python/cudf/cudf/tests/test_feather.py     |  1 -
 python/cudf/cudf/utils/dtypes.py           |  5 +++++
 14 files changed, 53 insertions(+), 37 deletions(-)

diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
index 52a83744fce..8fafa166471 100644
--- a/python/cudf/cudf/_lib/transform.pyx
+++ b/python/cudf/cudf/_lib/transform.pyx
@@ -24,7 +24,7 @@ from cudf._lib.cpp.types cimport (
     type_id,
 )
 from cudf._lib.types import np_to_cudf_types
-from cudf._lib.types cimport underlying_type_t_type_id
+from cudf._lib.types cimport underlying_type_t_type_id, _Dtype
 from cudf._lib.cpp.column.column_view cimport column_view
 
 try:
@@ -102,14 +102,10 @@ def transform(Column input, op):
     compiled_op = cudautils.compile_udf(op, nb_signature)
     c_str = compiled_op[0].encode('UTF-8')
     np_dtype = np.dtype(compiled_op[1])
-
-    try:
-        c_tid = <type_id> (
-            <underlying_type_t_type_id> np_to_cudf_types[np_dtype]
-        )
-        c_dtype = data_type(c_tid)
-
-    except KeyError:
+    cdef _Dtype pydtype = cudf.dtype(np_dtype)
+    if pydtype in np_to_cudf_types.keys():
+        c_dtype = pydtype.get_libcudf_type()
+    else:
         raise TypeError(
             "Result of window function has unsupported dtype {}"
             .format(np_dtype)
diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py
index a89adf10c22..54e2d64b80e 100644
--- a/python/cudf/cudf/api/types.py
+++ b/python/cudf/cudf/api/types.py
@@ -5,6 +5,8 @@
 
 def is_bool_dtype(obj):
     # todo - pd.api.types.is_bool_dtype should not give false, nor work at all probably
+    if hasattr(obj, "dtype"):
+        obj = obj.dtype
     return isinstance(obj, cudf.BooleanDtype) or pd.api.types.is_bool_dtype(obj)
 
 def is_datetime64_dtype(obj):
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index fb4362c7677..86eb7467db5 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1624,8 +1624,13 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
                     dtype = pd.api.types.pandas_dtype(dtype)
                     if is_categorical_dtype(dtype):
                         raise TypeError
-                pa_data = pa.array(arbitrary, type=dtype.pa_type if dtype is not None else None, from_pandas=True if nan_as_null is None else nan_as_null)
-                data = as_column(pa_data, dtype=cudf.dtype(pa_data.type), nan_as_null=nan_as_null)
+
+                pa_data = pa.array(arbitrary,
+                                   type=dtype.pa_type if dtype is not None else None, 
+                                   from_pandas=True if nan_as_null is None else nan_as_null)
+                # todo: fix this ???? ????????
+                as_column_dtype = cudf.dtype(pa_data.type) if not isinstance(pa_data.type, pa.lib.DictionaryType) else None
+                data = as_column(pa_data, dtype=as_column_dtype, nan_as_null=nan_as_null)
 
             except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                 if is_categorical_dtype(dtype):
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index ba652c3d3d2..95940df5944 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -293,10 +293,11 @@ def from_arrow(cls, array, dtype=None):
         )
 
     def can_cast_safely(self, to_dtype):
-        if np.issubdtype(to_dtype, np.datetime64):
+        to_dtype = cudf.dtype(to_dtype)
+        if isinstance(to_dtype, cudf.Datetime):
 
-            to_res, _ = np.datetime_data(to_dtype)
-            self_res, _ = np.datetime_data(self.dtype)
+            to_res, _ = np.datetime_data(to_dtype.to_numpy)
+            self_res, _ = np.datetime_data(self.dtype.to_numpy)
 
             max_int = np.iinfo(np.dtype("int64")).max
 
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index ccab749e3ba..71787be695e 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -174,6 +174,9 @@ def as_numerical_column(self, dtype, **kwargs):
         # expect a cudf dtype always here
         if dtype == self.dtype:
             return self
+        if dtype is None:
+            # Fail loudly; a pdb breakpoint would hang non-interactive runs.
+            raise TypeError("as_numerical_column requires a target dtype")
         return libcudf.unary.cast(self, dtype)
 
     @classmethod
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 1bcbea8d8be..833edc9e1c3 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -4821,8 +4821,12 @@ def to_arrow(self, preserve_index=True):
 
         # We may want to add additional metadata to this in the future, but
         # for now lets just piggyback off of what's done for Pandas
+
+        # egregious hack
+        metadata_df = self.head(0).to_pandas()
+
         metadata = pa.pandas_compat.construct_metadata(
-            self,
+            metadata_df,
             names,
             index_columns,
             index_descriptors,
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 80c8d9ce7e8..b47281e5c36 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -348,11 +348,8 @@ def dtype(obj):
     if isinstance(obj, np.dtype):
         if obj.type is np.str_:
             return StringDtype()
-        try:
-            return np_to_cudf_dtypes[obj]
-        except KeyError:
-            import pdb
-            pdb.set_trace()
+        else:
+            return np_to_cudf_dtypes.get(obj, None)
     elif isinstance(obj, pa.lib.DataType):
         return pa_to_cudf_dtypes[obj]
     elif isinstance(obj, str):
diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py
index ba388b45f21..9481d2a83e3 100755
--- a/python/cudf/cudf/core/indexing.py
+++ b/python/cudf/cudf/core/indexing.py
@@ -207,13 +207,13 @@ def _can_downcast_to_series(self, df, arg):
             ):
                 return False
             else:
-                if pd.api.types.is_bool_dtype(
+                if cudf.api.types.is_bool_dtype(
                     as_column(arg[0]).dtype
                 ) and not isinstance(arg[1], slice):
                     return True
             dtypes = df.dtypes.values.tolist()
             all_numeric = all(
-                [pd.api.types.is_numeric_dtype(t) for t in dtypes]
+                [cudf.api.types.is_numerical_dtype(t) for t in dtypes]
             )
             if all_numeric:
                 return True
@@ -316,8 +316,7 @@ def _getitem_tuple_arg(self, arg):
                 if len(tmp_arg[0]) == 0:
                     return columns_df._empty_like(keep_index=True)
                 tmp_arg = (column.as_column(tmp_arg[0]), tmp_arg[1])
-
-                if pd.api.types.is_bool_dtype(tmp_arg[0]):
+                if cudf.api.types.is_bool_dtype(tmp_arg[0]):
                     df = columns_df._apply_boolean_mask(tmp_arg[0])
                 else:
                     tmp_col_name = str(uuid4())
@@ -344,7 +343,7 @@ def _getitem_tuple_arg(self, arg):
                 df.index = as_index(start)
             else:
                 row_selection = column.as_column(arg[0])
-                if pd.api.types.is_bool_dtype(row_selection.dtype):
+                if cudf.api.types.is_bool_dtype(row_selection.dtype):
                     df.index = self._df.index.take(row_selection)
                 else:
                     df.index = as_index(row_selection)
diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py
index 231a114aff7..e2a0af5cef2 100644
--- a/python/cudf/cudf/core/join/join.py
+++ b/python/cudf/cudf/core/join/join.py
@@ -405,6 +405,9 @@ def input_to_libcudf_casting_rules(self, lcol, rcol, how):
                 dtype_r, cudf.Datetime
             ):
                 libcudf_join_type = cudf.dtype(max(dtype_l.to_numpy, dtype_r.to_numpy))
+        if libcudf_join_type is None:
+            # todo: test this
+            raise TypeError(f"Cant find an implicit common type for {dtype_l} and {dtype_r}")
         return libcudf_join_type
 
     def libcudf_to_output_casting_rules(self, lcol, rcol, how):
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index fb34c3c2f49..8abbc0e0ac2 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -184,7 +184,7 @@ def to_datetime(
                         column.datetime._numpy_to_pandas_conversion[u]
                         / (
                             column.datetime._numpy_to_pandas_conversion["s"]
-                            if np.datetime_data(col.dtype)[0] == "s"
+                            if np.datetime_data(col.dtype.to_numpy)[0] == "s"
                             else 1
                         )
                     )
@@ -261,7 +261,7 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format):
         return col
     elif col.dtype.kind == "m":
         raise TypeError(
-            f"dtype {col.dtype} cannot be converted to {_unit_dtype_map[unit]}"
+            f"dtype {col.dtype} cannot be converted to {str(cudf.dtype(_unit_dtype_map[unit]))}"
         )
 
     if col.dtype.kind in ("f"):
diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
index d3f3f2c2dd1..2e7233c0ff3 100644
--- a/python/cudf/cudf/tests/test_categorical.py
+++ b/python/cudf/cudf/tests/test_categorical.py
@@ -8,6 +8,7 @@
 from cudf.core import DataFrame, Series
 from cudf.core.index import as_index
 from cudf.tests.utils import assert_eq
+from cudf.core.series import _fix_nullable_dtype_repr
 
 
 @pytest.fixture
@@ -67,7 +68,7 @@ def test_categorical_integer():
 3 c
 4 a
 dtype: category
-Categories (3, object): [a, b, c]
+Categories (3, String): [a, b, c]
 """
     assert string.split() == expect_str.split()
 
@@ -360,7 +361,7 @@ def test_categorical_as_ordered(pd_str_cat, inplace):
 
     assert cd_sr_1.cat.ordered is True
     assert cd_sr_1.cat.ordered == pd_sr_1.cat.ordered
-    assert str(cd_sr_1) == str(pd_sr_1)
+    assert str(cd_sr_1) == _fix_nullable_dtype_repr(str(pd_sr_1))
 
 
 @pytest.mark.parametrize("inplace", [True, False])
@@ -379,7 +380,7 @@ def test_categorical_as_unordered(pd_str_cat, inplace):
 
     assert cd_sr_1.cat.ordered is False
     assert cd_sr_1.cat.ordered == pd_sr_1.cat.ordered
-    assert str(cd_sr_1) == str(pd_sr_1)
+    assert str(cd_sr_1) == _fix_nullable_dtype_repr(str(pd_sr_1))
 
 
 @pytest.mark.parametrize("from_ordered", [True, False])
@@ -394,7 +395,7 @@ def test_categorical_reorder_categories(
 
     assert_eq(pd_sr, cd_sr)
 
-    assert str(pd_sr) == str(cd_sr)
+    assert _fix_nullable_dtype_repr(str(pd_sr)) == str(cd_sr)
 
     kwargs = dict(ordered=to_ordered, inplace=inplace)
 
@@ -405,7 +406,7 @@ def test_categorical_reorder_categories(
 
     assert_eq(pd_sr_1, cd_sr_1)
 
-    assert str(cd_sr_1) == str(pd_sr_1)
+    assert str(cd_sr_1) == _fix_nullable_dtype_repr(str(pd_sr_1))
 
 
 @pytest.mark.parametrize("inplace", [True, False])
@@ -416,7 +417,7 @@ def test_categorical_add_categories(pd_str_cat, inplace):
 
     assert_eq(pd_sr, cd_sr)
 
-    assert str(pd_sr) == str(cd_sr)
+    assert _fix_nullable_dtype_repr(str(pd_sr)) == str(cd_sr)
 
     pd_sr_1 = pd_sr.cat.add_categories(["d"], inplace=inplace)
     cd_sr_1 = cd_sr.cat.add_categories(["d"], inplace=inplace)
@@ -437,7 +438,7 @@ def test_categorical_remove_categories(pd_str_cat, inplace):
 
     assert_eq(pd_sr, cd_sr)
 
-    assert str(pd_sr) == str(cd_sr)
+    assert _fix_nullable_dtype_repr(str(pd_sr)) == str(cd_sr)
 
     pd_sr_1 = pd_sr.cat.remove_categories(["a"], inplace=inplace)
     cd_sr_1 = cd_sr.cat.remove_categories(["a"], inplace=inplace)
diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
index a5e666fd57c..052cb1f6ad2 100644
--- a/python/cudf/cudf/tests/test_datetime.py
+++ b/python/cudf/cudf/tests/test_datetime.py
@@ -174,7 +174,7 @@ def test_dt_index(data, field):
 def test_setitem_datetime():
     df = DataFrame()
     df["date"] = pd.date_range("20010101", "20010105").values
-    assert np.issubdtype(df.date.dtype, np.datetime64)
+    assert isinstance(df.date.dtype, cudf.Datetime)
 
 
 def test_sort_datetime():
@@ -630,6 +630,7 @@ def test_cudf_to_datetime(data, dayfirst, infer_datetime_format):
     ],
 )
 def test_to_datetime_errors(data):
+    from cudf.core.series import _fix_nullable_dtype_repr
     pd_data = data
     if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)):
         gd_data = cudf.from_pandas(pd_data)
@@ -639,7 +640,7 @@ def test_to_datetime_errors(data):
     try:
         pd.to_datetime(pd_data)
     except Exception as e:
-        with pytest.raises(type(e), match=re.escape(str(e))):
+        with pytest.raises(type(e), match=re.escape(_fix_nullable_dtype_repr(str(e)))):
             cudf.to_datetime(gd_data)
     else:
         raise AssertionError("Was expecting `pd.to_datetime` to fail")
diff --git a/python/cudf/cudf/tests/test_feather.py b/python/cudf/cudf/tests/test_feather.py
index 7f4608fd514..953258bed9b 100644
--- a/python/cudf/cudf/tests/test_feather.py
+++ b/python/cudf/cudf/tests/test_feather.py
@@ -80,7 +80,6 @@ def test_feather_reader(feather_file, columns):
         .to_arrow(preserve_index=False)
         .to_pandas()
     )
-
     assert_eq(expect, got, check_categorical=False)
 
 
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 7f609797397..29a7de22436 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -398,6 +398,11 @@ def get_time_unit(obj):
         ),
     ):
         return obj.time_unit
+    elif isinstance(obj, cudf.Generic):
+        return obj._time_unit
+    elif isinstance(obj.dtype, cudf.Generic):
+        return obj.dtype._time_unit
 
     time_unit, _ = np.datetime_data(obj.dtype)
+
     return time_unit

From 3c047ef2c4e282a179eb6e934fefe3e319088ecc Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 2 Sep 2020 11:37:51 -0700
Subject: [PATCH 40/80] fix indexing tests

---
 python/cudf/cudf/core/column/column.py | 2 +-
 python/cudf/cudf/core/dtypes.py        | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 86eb7467db5..198329717e8 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1629,7 +1629,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
                                    type=dtype.pa_type if dtype is not None else None, 
                                    from_pandas=True if nan_as_null is None else nan_as_null)
                 # todo: fix this ???? ????????
-                as_column_dtype = cudf.dtype(pa_data.type) if not isinstance(pa_data.type, pa.lib.DictionaryType) else None
+                as_column_dtype = cudf.dtype(pa_data.type) if not isinstance(pa_data.type, (pa.lib.DictionaryType, pa.lib.ListType)) else None
                 data = as_column(pa_data, dtype=as_column_dtype, nan_as_null=nan_as_null)
 
             except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index b47281e5c36..d6618422043 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -495,7 +495,7 @@ def __init__(self, element_type):
             self._typ = pa.list_(element_type._typ)
         else:
             element_type = cudf.utils.dtypes.np_to_pa_dtype(
-                np.dtype(element_type)
+                cudf.dtype(element_type)
             )
             self._typ = pa.list_(element_type)
 
@@ -513,6 +513,10 @@ def leaf_type(self):
         else:
             return self.element_type
 
+    @property
+    def kind(self):
+        return 'O'
+
     @property
     def type(self):
         # TODO: we should change this to return something like a

From a1395718ae710a204ad8983da1acaafd0972724a Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 3 Sep 2020 06:17:10 -0700
Subject: [PATCH 41/80] less than 10 tests still failing

---
 python/cudf/cudf/_lib/parquet.pyx      |  6 ++----
 python/cudf/cudf/api/types.py          |  9 +++------
 python/cudf/cudf/core/column/string.py |  2 +-
 python/cudf/cudf/core/dataframe.py     |  6 +++---
 python/cudf/cudf/core/dtypes.py        | 25 +++++++++++++++----------
 python/cudf/cudf/core/reshape.py       |  2 +-
 python/cudf/cudf/io/parquet.py         |  1 +
 python/cudf/cudf/tests/test_orc.py     |  2 +-
 8 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index fd7e2cd847c..244a28a2868 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -103,7 +103,6 @@ cpdef generate_pandas_metadata(Table table, index):
             )
         else:
             types.append(np_to_pa_dtype(col.dtype))
-
     # Indexes
     if index is not False:
         for name in table._index.names:
@@ -135,16 +134,15 @@ cpdef generate_pandas_metadata(Table table, index):
                 index_descriptors.append(descr)
             else:
                 col_names.append(name)
-
+    metadata_df = table.head(0).to_pandas()
     metadata = pa.pandas_compat.construct_metadata(
-        table,
+        metadata_df,
         col_names,
         index_levels,
         index_descriptors,
         index,
         types,
     )
-
     md = metadata[b'pandas']
     json_str = md.decode("utf-8")
     return json_str
diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py
index 54e2d64b80e..3f8fe33e43f 100644
--- a/python/cudf/cudf/api/types.py
+++ b/python/cudf/cudf/api/types.py
@@ -106,10 +106,7 @@ def can_cast(dtype_l, dtype_r):
 
     return np.can_cast(dtype_l, dtype_r)
 
-def result_type(dtype_l, dtype_r):
-    if isinstance(dtype_l, cudf.Generic):
-        dtype_l = dtype_l.to_numpy
-    if isinstance(dtype_r, cudf.Generic):
-        dtype_r = dtype_r.to_numpy
+def result_type(*arrays_and_dtypes):
 
-    return cudf.dtype(np.result_type(dtype_l, dtype_r))
+    arrays_and_dtypes = (d.to_numpy if isinstance(d, cudf.Generic) else d for d in arrays_and_dtypes)
+    return cudf.dtype(np.result_type(*arrays_and_dtypes))
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index d44ebcb474f..9f1535c5f95 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -4531,7 +4531,7 @@ def _nbytes(self):
             return self.children[1].size
 
     def as_numerical_column(self, dtype, **kwargs):
-        out_dtype = cudf.dtype(dtype)
+        out_dtype = cudf.dtype(dtype) if dtype is not None else cudf.Float64Dtype()
         kwargs.update(dtype=out_dtype)
 
         if out_dtype.type is np.datetime64:
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 833edc9e1c3..34e179e748f 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -6371,7 +6371,7 @@ def select_dtypes(self, include=None, exclude=None):
     @ioutils.doc_to_parquet()
     def to_parquet(self, path, *args, **kwargs):
         """{docstring}"""
-        from cudf.io import parquet as pq6
+        from cudf.io import parquet as pq
 
         return pq.to_parquet(self, path, *args, **kwargs)
 
@@ -6473,12 +6473,12 @@ def stack(self, level=-1, dropna=True):
         )
 
         # Collect datatypes and cast columns as that type
-        common_type = np.result_type(*self.dtypes)
+        common_type = cudf.api.types.result_type(*self.dtypes)
         homogenized = DataFrame(
             {
                 c: (
                     self._data[c].astype(common_type)
-                    if not np.issubdtype(self._data[c].dtype, common_type)
+                    if not isinstance(self._data[c].dtype, type(common_type))
                     else self._data[c]
                 )
                 for c in self._data
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index d6618422043..9e7bdb670a8 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -274,14 +274,14 @@ def __init__(self):
 def make_dtype_from_string(obj):
     if obj in {"str", "string", "object", "O"}:
         return StringDtype()
-    elif "datetime" in obj:
-        if obj == "datetime64[ns]":
+    elif "datetime" in obj or "Datetime" in obj:
+        if obj in {"datetime64[ns]", 'Datetime64NS'}:
             return Datetime64NSDtype()
-        elif obj == "datetime64[us]":
+        elif obj in {"datetime64[us]", "Datetime64US"}:
             return Datetime64USDtype()
-        elif obj == "datetime64[ms]":
+        elif obj in {"datetime64[ms]", "Datetime64MS"}:
             return Datetime64MSDtype()
-        elif obj == "datetime64[s]":
+        elif obj in {"datetime64[s]", "Datetime64MS"}:
             return Datetime64SDtype()
     elif "int" in obj or "Int" in obj:
         if obj in {"int", "Int", "int64", "Int64"}:
@@ -310,15 +310,19 @@ def make_dtype_from_string(obj):
     elif "category" in obj:
         return "category"
     elif "timedelta" in obj:
-        if obj == 'timedelta64[ns]':
+        if obj in {'timedelta64[ns]', "Timedelta64NS"}:
             return Timedelta64NSDtype()
-        if obj == 'timedelta64[us]':
+        if obj in {'timedelta64[us]', "Timedelta64US"}:
             return Timedelta64USDtype()
-        if obj == 'timedelta64[ms]':
+        if obj in {'timedelta64[ms]', "Timedelta64MS"}:
             return Timedelta64MSDtype()
-        if obj == 'timedelta64[s]':
+        if obj in {'timedelta64[s]', "Timedelta64S"}:
             return Timedelta64SDtype()
-
+    else:
+        try:
+            return np_to_cudf_dtypes[np.dtype(obj)]
+        except:
+            return None
 def make_dtype_from_numpy(obj):
     np_to_pd_types = {v: k for k, v in pd_to_np_dtypes.items()}
     result = np_to_pd_types.get(obj)
@@ -570,6 +574,7 @@ def __repr__(self):
     pa.duration("us"): Timedelta64USDtype(),
     pa.duration("ms"): Timedelta64MSDtype(),
     pa.duration("s"): Timedelta64SDtype(),
+    pa.date32(): Datetime64NSDtype(),
     pa.null(): None
 }
 
diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index b423a46b88b..c549a609769 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -604,7 +604,7 @@ def length_check(obj, name):
                 unique = df[name].unique()
 
             if not dummy_na:
-                if np.issubdtype(unique.dtype, np.floating):
+                if isinstance(unique.dtype, cudf.Floating):
                     unique = unique.nans_to_nulls()
                 unique = unique.dropna()
 
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 50bbe1c20c2..18f26c6b2dc 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -6,6 +6,7 @@
 from pyarrow import parquet as pq
 from pyarrow.compat import guid
 
+
 import cudf
 from cudf._lib import parquet as libparquet
 from cudf.utils import ioutils
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index 763f810f715..28d84561f5d 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -156,7 +156,7 @@ def test_orc_reader_trailing_nulls(datadir):
     # PANDAS uses NaN to represent invalid data, which forces float dtype
     # For comparison, we can replace NaN with 0 and cast to the cuDF dtype
     for col in expect.columns:
-        expect[col] = expect[col].astype(got[col].dtype)
+        expect[col] = expect[col].astype(got[col].dtype.to_numpy)
 
     assert_eq(expect, got, check_categorical=False)
 

From ea24184ac1c4879e828379234583861a7113c12e Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 3 Sep 2020 09:41:30 -0700
Subject: [PATCH 42/80] fix bugs

---
 python/cudf/cudf/core/column/lists.py     | 1 -
 python/cudf/cudf/core/column/numerical.py | 5 -----
 2 files changed, 6 deletions(-)

diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index eed89885b2c..295c2fa250f 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -4,7 +4,6 @@
 from cudf.core.column.methods import ColumnMethodsMixin
 from cudf.core.dtypes import ListDtype
 from cudf.api.types import is_list_dtype
-from cudf.utils.utils import buffers_from_pyarrow
 
 
 class ListColumn(ColumnBase):
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 03989c130ed..a55c2684656 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -14,13 +14,8 @@
     min_signed_type,
     numeric_normalize_types,
 )
-<<<<<<< HEAD
-from cudf.utils.utils import buffers_from_pyarrow
 from cudf.core.dtypes import Float64Dtype
-=======
 
-
->>>>>>> branch-0.16
 class NumericalColumn(column.ColumnBase):
     def __init__(
         self, data, dtype, mask=None, size=None, offset=0, null_count=None

From ddf340b51cb986fd7267f479a85c8361d095c792 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 3 Sep 2020 12:56:27 -0700
Subject: [PATCH 43/80] fix a few more bugs

---
 python/cudf/cudf/_lib/aggregation.pyx  | 18 +++++++-----------
 python/cudf/cudf/_lib/groupby.pyx      |  2 +-
 python/cudf/cudf/core/dataframe.py     |  3 ++-
 python/cudf/cudf/core/dtypes.py        |  2 +-
 python/cudf/cudf/tests/test_parquet.py |  5 ++---
 5 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx
index 7392432bb64..19634d78061 100644
--- a/python/cudf/cudf/_lib/aggregation.pyx
+++ b/python/cudf/cudf/_lib/aggregation.pyx
@@ -16,8 +16,10 @@ from cudf._lib.types cimport (
     underlying_type_t_interpolation,
     underlying_type_t_null_policy,
     underlying_type_t_type_id,
+    _Dtype
 )
 from cudf._lib.types import Interpolation
+from cudf.core.dtypes import dtype as cudf_dtype
 
 try:
     # Numba >= 0.49
@@ -241,24 +243,18 @@ cdef class _AggregationFactory:
         cdef string cpp_str
 
         # Handling UDF type
-        nb_type = numpy_support.from_dtype(kwargs['dtype'])
+        nb_type = numpy_support.from_dtype(kwargs['dtype'].to_numpy)
         type_signature = (nb_type[:],)
         compiled_op = cudautils.compile_udf(op, type_signature)
-        output_np_dtype = np.dtype(compiled_op[1])
+        output_np_dtype = cudf_dtype(np.dtype(compiled_op[1]))
         cpp_str = compiled_op[0].encode('UTF-8')
-        if output_np_dtype not in np_to_cudf_types:
+        if cudf_dtype(output_np_dtype) not in np_to_cudf_types:
             raise TypeError(
                 "Result of window function has unsupported dtype {}"
                 .format(op[1])
             )
-        tid = (
-            <libcudf_types.type_id> (
-                <underlying_type_t_type_id> (
-                    np_to_cudf_types[output_np_dtype]
-                )
-            )
-        )
-        out_dtype = libcudf_types.data_type(tid)
+        cdef _Dtype pydtype = output_np_dtype
+        out_dtype = pydtype.get_libcudf_type()
 
         agg.c_obj = move(libcudf_aggregation.make_udf_aggregation(
             libcudf_aggregation.udf_type.PTX, cpp_str, out_dtype
diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index 5175aafe9cb..a909d5f5762 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -178,7 +178,7 @@ def _drop_unsupported_aggs(Table values, aggs):
     if all(len(v) == 0 for v in aggs.values()):
         return aggs
 
-    from cudf.utils.dtypes import (
+    from cudf.api.types import (
         is_categorical_dtype,
         is_string_dtype,
         is_list_dtype
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index aa6de4b5835..d5348def80e 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -4882,8 +4882,9 @@ def to_arrow(self, preserve_index=True):
             index_descr.append(descr)
 
         out = super(DataFrame, data).to_arrow()
+        metadata_df = self.head(0).to_pandas()
         metadata = pa.pandas_compat.construct_metadata(
-            self,
+            metadata_df,
             out.schema.names,
             [self.index],
             index_descr,
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 9e7bdb670a8..1693e6683d6 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -281,7 +281,7 @@ def make_dtype_from_string(obj):
             return Datetime64USDtype()
         elif obj in {"datetime64[ms]", "Datetime64MS"}:
             return Datetime64MSDtype()
-        elif obj in {"datetime64[s]", "Datetime64MS"}:
+        elif obj in {"datetime64[s]", "Datetime64S"}:
             return Datetime64SDtype()
     elif "int" in obj or "Int" in obj:
         if obj in {"int", "Int", "int64", "Int64"}:
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index e5398befd4f..cf926f39da2 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -378,9 +378,8 @@ def test_parquet_read_filtered_everything(tmpdir):
     # Check filter
     df_filtered = cudf.read_parquet(fname, filters=[("x", "==", 12)])
     assert_eq(len(df_filtered), 0)
-    assert_eq(df_filtered["x"].dtype, "int64")
-    assert_eq(df_filtered["y"].dtype, "object")
-
+    assert isinstance(df_filtered["x"].dtype, cudf.Int64Dtype)
+    assert isinstance(df_filtered["y"].dtype, cudf.StringDtype)
 
 def test_parquet_read_filtered_multiple_files(tmpdir):
     # Generate data

From 4a140425a41cb5d47632b8d82eeb1248c8c145b1 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 3 Sep 2020 13:22:20 -0700
Subject: [PATCH 44/80] construct from string tests

---
 python/cudf/cudf/tests/test_dtypes.py | 48 +++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py
index a02f01327bf..9a3f9a285a5 100644
--- a/python/cudf/cudf/tests/test_dtypes.py
+++ b/python/cudf/cudf/tests/test_dtypes.py
@@ -75,3 +75,51 @@ def test_nested_dtype():
     expect = ListDtype("int32")
     got = dt.element_type
     assert expect == got
+
+@pytest.mark.parametrize('string,dtype', [
+    ('uint8', cudf.UInt8Dtype),
+    ('uint16', cudf.UInt16Dtype),
+    ('uint32', cudf.UInt32Dtype),
+    ('uint64', cudf.UInt64Dtype),
+    ('UInt8', cudf.UInt8Dtype),
+    ('UInt16', cudf.UInt16Dtype),
+    ('UInt32', cudf.UInt32Dtype),
+    ('UInt64', cudf.UInt64Dtype),
+    ('int8', cudf.Int8Dtype),
+    ('int16', cudf.Int16Dtype),
+    ('int32', cudf.Int32Dtype),
+    ('int64', cudf.Int64Dtype),
+    ('Int8', cudf.Int8Dtype),
+    ('Int16', cudf.Int16Dtype),
+    ('Int32', cudf.Int32Dtype),
+    ('Int64', cudf.Int64Dtype),
+    ('int', cudf.Int64Dtype),
+    ('float32', cudf.Float32Dtype),
+    ('float64', cudf.Float64Dtype),
+    ('Float32', cudf.Float32Dtype),
+    ('Float64', cudf.Float64Dtype),
+    ('float', cudf.Float64Dtype),
+    ('bool', cudf.BooleanDtype),
+    ('Boolean', cudf.BooleanDtype),
+    ('string', cudf.StringDtype),
+    ('String', cudf.StringDtype),
+    ('object', cudf.StringDtype),
+    ('datetime64[ns]', cudf.Datetime64NSDtype),
+    ('datetime64[us]', cudf.Datetime64USDtype),
+    ('datetime64[ms]', cudf.Datetime64MSDtype),
+    ('datetime64[s]', cudf.Datetime64SDtype),
+    ('Datetime64NS', cudf.Datetime64NSDtype),
+    ('Datetime64US', cudf.Datetime64USDtype),
+    ('Datetime64MS', cudf.Datetime64MSDtype),
+    ('Datetime64S', cudf.Datetime64SDtype),
+    ('timedelta64[ns]', cudf.Timedelta64NSDtype),
+    ('timedelta64[us]', cudf.Timedelta64USDtype),
+    ('timedelta64[ms]', cudf.Timedelta64MSDtype),
+    ('timedelta64[s]', cudf.Timedelta64SDtype),
+    ('Timedelta64NS', cudf.Timedelta64NSDtype),
+    ('Timedelta64US', cudf.Timedelta64USDtype),
+    ('Timedelta64MS', cudf.Timedelta64MSDtype),
+    ('Timedelta64S', cudf.Timedelta64SDtype),
+    ])
+def test_cudf_dtype_string_construction(string, dtype):
+    assert isinstance(cudf.dtype(string), dtype)  # check the instance itself

From 55cec7e9458c02598f0f45546b9d4863cf3ae9c6 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 3 Sep 2020 15:12:33 -0700
Subject: [PATCH 45/80] clean up dtypes.py

---
 python/cudf/cudf/core/dtypes.py | 203 ++++++++++++++------------------
 1 file changed, 90 insertions(+), 113 deletions(-)

diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 1693e6683d6..d54a73533e6 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -105,9 +105,6 @@ def type(self):
 
     @property
     def kind(self):
-        if isinstance(self, Floating):
-            return "f"
-        else:
             return self.to_pandas.kind
 
     @property
@@ -115,32 +112,49 @@ def name(self):
         return self._name
 
     def __repr__(self):
-        return self.pa_type.__repr__()
+        return self.name
 
     def __hash__(self):
         return hash(self.__repr__())
+    
+    def _raise_construction_error(self):
+        raise TypeError(f"Cannot create {type(self)} instances")
+
+
 
 class Number(Generic):
-    pass
+    def __init__(self):
+        self._raise_construction_error()
 
 class Integer(Number):
-    pass
+    def __init__(self):
+        self._raise_construction_error()
 
 class SignedInteger(Integer):
-    pass
-
+    def __init__(self):
+        self._raise_construction_error()
+        
 class UnsignedInteger(Integer):
-    pass
+    def __init__(self):
+        self._raise_construction_error()
+        
 
 class Inexact(Number):
-    pass
-
+    def __init__(self):
+        self._raise_construction_error()
+        
 class Floating(Inexact):
-    pass
+    def __init__(self):
+        self._raise_construction_error()
+        
+    @property
+    def kind(self):
+        return "f"
 
 class Flexible(Generic):
-    pass
-
+    def __init__(self):
+        self._construction_error()
+        
 class Datetime(Generic):    
     pass
 
@@ -264,120 +278,59 @@ def __init__(self):
         self._time_unit = 's'
 
 class StringDtype(Flexible):
-    is_string = True
 
     def __init__(self):
         self.pa_type = pa.string()
         self._name = "String"
 
 
-def make_dtype_from_string(obj):
-    if obj in {"str", "string", "object", "O"}:
-        return StringDtype()
-    elif "datetime" in obj or "Datetime" in obj:
-        if obj in {"datetime64[ns]", 'Datetime64NS'}:
-            return Datetime64NSDtype()
-        elif obj in {"datetime64[us]", "Datetime64US"}:
-            return Datetime64USDtype()
-        elif obj in {"datetime64[ms]", "Datetime64MS"}:
-            return Datetime64MSDtype()
-        elif obj in {"datetime64[s]", "Datetime64S"}:
-            return Datetime64SDtype()
-    elif "int" in obj or "Int" in obj:
-        if obj in {"int", "Int", "int64", "Int64"}:
-            return Int64Dtype()
-        elif obj in {"int32", "Int32"}:
-            return Int32Dtype()
-        elif obj in {"int16", "Int16"}:
-            return Int16Dtype()
-        elif obj in {"int8", "Int8"}:
-            return Int8Dtype()
-        elif obj in {"uint64", "UInt64"}:
-            return UInt64Dtype()
-        elif obj in {"uint32", "UInt32"}:
-            return UInt32Dtype()
-        elif obj in {"uint16", "UInt16"}:
-            return UInt16Dtype()
-        elif obj in {"uint8", "UInt8"}:
-            return UInt8Dtype()
-    elif "float" in obj or "Float" in obj:
-        if obj in {"float64", "Float64", 'float', 'Float'}:
-            return Float64Dtype()
-        elif obj in {"float32", "Float32"}:
-            return Float32Dtype()
-    elif "bool" in obj:
-        return BooleanDtype()
-    elif "category" in obj:
-        return "category"
-    elif "timedelta" in obj:
-        if obj in {'timedelta64[ns]', "Timedelta64NS"}:
-            return Timedelta64NSDtype()
-        if obj in {'timedelta64[us]', "Timedelta64US"}:
-            return Timedelta64USDtype()
-        if obj in {'timedelta64[ms]', "Timedelta64MS"}:
-            return Timedelta64MSDtype()
-        if obj in {'timedelta64[s]', "Timedelta64S"}:
-            return Timedelta64SDtype()
-    else:
-        try:
-            return np_to_cudf_dtypes[np.dtype(obj)]
-        except:
-            return None
-def make_dtype_from_numpy(obj):
-    np_to_pd_types = {v: k for k, v in pd_to_np_dtypes.items()}
-    result = np_to_pd_types.get(obj)
-    return result
+def cudf_dtype_from_string(obj):
+    try:
+        np_dtype = np.dtype(obj)
+        return cudf_dtype_from_numpy(np_dtype)
+    except TypeError:
+        return _cudf_dtype_from_string.get(obj, None)
 
 
-def dtype(obj):
+def cudf_dtype_from_numpy(obj):
+    if obj is np.str_:
+        return StringDtype()
+    elif obj is np.number:
+        return cudf.Number
+    elif obj is np.datetime64:
+        return cudf.Datetime
+    elif obj is np.timedelta64:
+        return cudf.Timedelta
+    dtype = np.dtype(obj)
+    return _cudf_dtype_from_numpy.get(obj, None)
 
-    if obj is None:
-        return None
-    if obj is str:
-        return cudf.StringDtype()
-    if obj is int:
-        return cudf.Int64Dtype()
-    if obj is float:
-        return cudf.Float64Dtype()
+def dtype(obj):
+    if isinstance(obj, Generic):
+        return obj
+    elif type(obj) is type and issubclass(obj, Generic):
+        return obj()
+    elif isinstance(obj, np.dtype) or (isinstance(obj, type) and issubclass(obj, np.generic)):
+        return cudf_dtype_from_numpy(obj)
+    elif isinstance(obj, str):
+        return cudf_dtype_from_string(obj)
     if isinstance(obj, pd.CategoricalDtype):
         return cudf.CategoricalDtype.from_pandas(obj)
     if isinstance(obj, CategoricalDtype):
         if obj is 'category':
             return cudf.CategoricalDtype()
         return obj
-    elif isinstance(obj, Generic):
-        return obj
-    elif issubclass(obj.__class__, Generic):
-        return obj()
-    if isinstance(obj, np.dtype):
-        if obj.type is np.str_:
-            return StringDtype()
-        else:
-            return np_to_cudf_dtypes.get(obj, None)
-    elif isinstance(obj, pa.lib.DataType):
-        return pa_to_cudf_dtypes[obj]
-    elif isinstance(obj, str):
-        return make_dtype_from_string(obj)
-    elif obj in pd_to_cudf_dtypes.keys():
-        return pd_to_cudf_dtypes[obj]
+    elif obj in _pd_to_cudf_dtypes.keys():
+        return _pd_to_cudf_dtypes[obj]
     elif isinstance(obj, pd.core.arrays.numpy_.PandasDtype):
-        return make_dtype_from_string(obj.name)
-    elif obj is np.number:
-        return cudf.Number
-    elif obj is np.datetime64:
-        return cudf.Datetime
-    elif obj is np.timedelta64:
-        return cudf.Timedelta
-
-
+        return cudf_dtype_from_string(obj.name)
+    elif obj is str:
+        return cudf.StringDtype()
+    elif obj is int:
+        return cudf.Int64Dtype()
+    elif obj in {float, None}:
+        return cudf.Float64Dtype()
     else:
-        try:
-            if issubclass(obj, np.generic):
-                return np_to_cudf_dtypes[np.dtype(obj)]
-        except:
-            import pdb
-            pdb.set_trace()
-    
+        raise TypeError(f"Could not find a cuDF dtype matching {obj}")
 
 
 class CategoricalDtype(Generic):
@@ -578,7 +531,7 @@ def __repr__(self):
     pa.null(): None
 }
 
-np_to_cudf_dtypes = {
+_cudf_dtype_from_numpy = {
     np.dtype("int8"): Int8Dtype(),
     np.dtype("int16"): Int16Dtype(),
     np.dtype("int32"): Int32Dtype(),
@@ -602,7 +555,7 @@ def __repr__(self):
     np.dtype("timedelta64[s]"): Timedelta64SDtype(),
 }
 
-pd_to_cudf_dtypes = {
+_pd_to_cudf_dtypes = {
     pd.Int8Dtype(): Int8Dtype(),
     pd.Int16Dtype(): Int16Dtype(),
     pd.Int32Dtype(): Int32Dtype(),
@@ -614,3 +567,27 @@ def __repr__(self):
     pd.BooleanDtype(): BooleanDtype(),
     pd.StringDtype(): StringDtype(),
 }
+
+_cudf_dtype_from_string = {
+    'UInt8': UInt8Dtype,
+    'UInt16': UInt16Dtype,
+    'UInt32': UInt32Dtype,
+    'UInt64': UInt64Dtype,
+    'Int8': Int8Dtype,
+    'Int16': Int16Dtype,
+    'Int32': Int32Dtype,
+    'Int64': Int64Dtype,
+    'Float': Float64Dtype,
+    'Float32': Float32Dtype,
+    'Float64': Float64Dtype,
+    'Boolean': BooleanDtype,
+    'String': StringDtype,
+    'Datetime64NS': Datetime64NSDtype,
+    'Datetime64US': Datetime64USDtype,
+    'Datetime64MS': Datetime64MSDtype,
+    'Datetime64S': Datetime64SDtype,
+    'Timedelta64NS': Timedelta64NSDtype,
+    'Timedelta64US': Timedelta64USDtype,
+    'Timedelta64MS': Timedelta64MSDtype,
+    'Timedelta64S': Timedelta64SDtype,
+}

From c28c7b6fea0d370bbb7068f4f0d88fd765362900 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 4 Sep 2020 06:13:01 -0700
Subject: [PATCH 46/80] fixed some bugs

---
 python/cudf/cudf/core/column/column.py |  2 +-
 python/cudf/cudf/core/dataframe.py     |  4 ++--
 python/cudf/cudf/core/dtypes.py        | 22 +++++++++++++---------
 python/cudf/cudf/core/series.py        |  3 ---
 4 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 8914cd8ab39..b6d66d4ba8b 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1398,7 +1398,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
     * pandas.Categorical objects
     """
 
-    dtype = cudf.dtype(dtype) if dtype is not None else None
+    dtype = cudf.dtype(dtype)
 
     if isinstance(arbitrary, ColumnBase):
         if dtype is not None:
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index d5348def80e..ed12e34a688 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -6088,7 +6088,7 @@ def kurtosis(
             msg = "Kurtosis only supports int, float, and bool dtypes."
             raise NotImplementedError(msg)
 
-        self = self.select_dtypes(include=[cudf.Number(), cudf.BooleanDtype()])
+        self = self.select_dtypes(include=[cudf.Number, cudf.BooleanDtype])
         return self._apply_support_method(
             "kurtosis",
             axis=axis,
@@ -6134,7 +6134,7 @@ def skew(
             msg = "Skew only supports int, float, and bool dtypes."
             raise NotImplementedError(msg)
 
-        self = self.select_dtypes(include=[cudf.Number(), cudf.BooleanDtype()])
+        self = self.select_dtypes(include=[cudf.Number, cudf.BooleanDtype])
         return self._apply_support_method(
             "skew",
             axis=axis,
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index d54a73533e6..509266af722 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -285,6 +285,8 @@ def __init__(self):
 
 
 def cudf_dtype_from_string(obj):
+    if obj == 'category':
+        return obj
     try:
         np_dtype = np.dtype(obj)
         return cudf_dtype_from_numpy(np_dtype)
@@ -302,35 +304,37 @@ def cudf_dtype_from_numpy(obj):
     elif obj is np.timedelta64:
         return cudf.Timedelta
     dtype = np.dtype(obj)
-    return _cudf_dtype_from_numpy.get(obj, None)
+    return _cudf_dtype_from_numpy.get(dtype, None)
 
 def dtype(obj):
     if isinstance(obj, Generic):
         return obj
     elif type(obj) is type and issubclass(obj, Generic):
         return obj()
-    elif isinstance(obj, np.dtype) or (isinstance(obj, type) and issubclass(obj, np.generic)):
+    elif isinstance(obj, np.dtype) or (isinstance(obj, type) and issubclass(obj, (np.generic, np.dtype))):
         return cudf_dtype_from_numpy(obj)
     elif isinstance(obj, str):
         return cudf_dtype_from_string(obj)
     if isinstance(obj, pd.CategoricalDtype):
         return cudf.CategoricalDtype.from_pandas(obj)
-    if isinstance(obj, CategoricalDtype):
-        if obj is 'category':
-            return cudf.CategoricalDtype()
-        return obj
     elif obj in _pd_to_cudf_dtypes.keys():
         return _pd_to_cudf_dtypes[obj]
     elif isinstance(obj, pd.core.arrays.numpy_.PandasDtype):
         return cudf_dtype_from_string(obj.name)
+    elif isinstance(obj, pa.lib.DataType):
+        return cudf_dtype_from_pyarrow[obj]
     elif obj is str:
         return cudf.StringDtype()
     elif obj is int:
         return cudf.Int64Dtype()
-    elif obj in {float, None}:
+    elif obj is float:
         return cudf.Float64Dtype()
+    elif obj is None:
+        return None
     else:
-        raise TypeError(f"Could not find a cuDF dtype matching {obj}")
+        raise TypeError
+        
+        #raise TypeError(f"Could not find a cuDF dtype matching {obj}")
 
 
 class CategoricalDtype(Generic):
@@ -506,7 +510,7 @@ def __repr__(self):
             return f"ListDtype({self.element_type})"
 
 
-pa_to_cudf_dtypes = {
+cudf_dtype_from_pyarrow = {
     pa.uint8(): UInt8Dtype(),
     pa.uint16(): UInt16Dtype(),
     pa.uint32(): UInt32Dtype(),
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 5aefad92e1e..91f7e7d0a45 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -12,8 +12,6 @@
 from pandas._config import get_option
 from pandas.api.types import is_dict_like
 
-from cudf.core.dtypes import dtype as cudf_dtype
-
 import cudf
 from cudf import _lib as libcudf
 from cudf._lib.nvtx import annotate
@@ -145,7 +143,6 @@ def __init__(
             ``null`` values.
             If ``False``, leaves ``np.nan`` values as is.
         """
-        dtype = cudf_dtype(dtype)
         if isinstance(data, pd.Series):
             if name is None:
                 name = data.name

From bad1dc231d6fb5ec8d5aee6208c2d8f2ffe6609f Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 4 Sep 2020 12:10:51 -0700
Subject: [PATCH 47/80] a little iteration on dtypes.py

---
 python/cudf/cudf/core/column/numerical.py |   7 --
 python/cudf/cudf/core/dtypes.py           | 134 ++++++++++------------
 python/cudf/cudf/core/series.py           |   2 +-
 3 files changed, 60 insertions(+), 83 deletions(-)

diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index a55c2684656..d3b3fe7d0ee 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -28,13 +28,6 @@ def __init__(
             The dtype associated with the data Buffer
         mask : Buffer, optional
         """
-        try:
-            cudf.dtype(dtype)
-            dtype.itemsize
-
-        except:
-            import pdb
-            pdb.set_trace()
         dtype = cudf.dtype(dtype)
         if data.size % dtype.itemsize:
             raise ValueError("Buffer size must be divisible by element size")
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 509266af722..d9d0ea07183 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -9,54 +9,6 @@
 from cudf._lib.types import _Dtype
 import cudf
 
-pa_to_pd_dtypes = {
-    pa.uint8(): pd.UInt8Dtype(),
-    pa.uint16(): pd.UInt16Dtype(),
-    pa.uint32(): pd.UInt32Dtype(),
-    pa.uint64(): pd.UInt64Dtype(),
-    pa.int8(): pd.Int8Dtype(),
-    pa.int16(): pd.Int16Dtype(),
-    pa.int32(): pd.Int32Dtype(),
-    pa.int64(): pd.Int64Dtype(),
-    pa.bool_(): pd.BooleanDtype(),
-    pa.string(): pd.StringDtype(),
-    pa.float32(): np.float32(),
-    pa.float64(): np.float64(),
-    pa.timestamp("ns"): np.dtype("datetime64[ns]"),
-    pa.timestamp("us"): np.dtype("datetime64[us]"),
-    pa.timestamp("ms"): np.dtype("datetime64[ms]"),
-    pa.timestamp("s"): np.dtype("datetime64[s]"),
-    pa.duration("ns"): np.dtype('timedelta64[ns]'),
-    pa.duration("us"): np.dtype('timedelta64[us]'),
-    pa.duration("ms"): np.dtype('timedelta64[ms]'),
-    pa.duration("s"): np.dtype('timedelta64[s]'),
-}
-
-pa_to_np_dtypes = {
-    pa.uint8(): np.dtype("uint8"),
-    pa.uint16(): np.dtype("uint16"),
-    pa.uint32(): np.dtype("uint32"),
-    pa.uint64(): np.dtype("uint64"),
-    pa.int8(): np.dtype("int8"),
-    pa.int16(): np.dtype("int16"),
-    pa.int32(): np.dtype("int32"),
-    pa.int64(): np.dtype("int64"),
-    pa.bool_(): np.dtype("bool"),
-    pa.string(): np.dtype("object"),
-    pa.float32(): np.dtype("float32"),
-    pa.float64(): np.dtype("float64"),
-    pa.timestamp("ns"): np.dtype("datetime64[ns]"),
-    pa.timestamp("us"): np.dtype("datetime64[us]"),
-    pa.timestamp("ms"): np.dtype("datetime64[ms]"),
-    pa.timestamp("s"): np.dtype("datetime64[s]"),
-    pa.duration("ns"): np.dtype('timedelta64[ns]'),
-    pa.duration("us"): np.dtype('timedelta64[us]'),
-    pa.duration("ms"): np.dtype('timedelta64[ms]'),
-    pa.duration("s"): np.dtype('timedelta64[s]'),
-    None: None,
-}
-
-
 class Generic(ExtensionDtype, _Dtype):
     pa_type = None
 
@@ -86,11 +38,11 @@ def num(self):
 
     @property
     def to_numpy(self):
-        return pa_to_np_dtypes[self.pa_type]
+        return np.dtype(self.pa_type.to_pandas_dtype())
 
     @property
     def to_pandas(self):
-        return pa_to_pd_dtypes[self.pa_type]
+        return pd.api.types.pandas_dtype(self.name)
 
     @property
     def itemsize(self):
@@ -138,7 +90,6 @@ class UnsignedInteger(Integer):
     def __init__(self):
         self._raise_construction_error()
         
-
 class Inexact(Number):
     def __init__(self):
         self._raise_construction_error()
@@ -155,11 +106,27 @@ class Flexible(Generic):
     def __init__(self):
         self._construction_error()
         
-class Datetime(Generic):    
-    pass
+class Datetime(Generic):   
+
+    @property
+    def to_numpy(self):
+        return {v:k for k,v in _cudf_dtype_from_numpy.items()}[self]
+
+    @property
+    def to_pandas(self):
+        # pandas only supports nanos
+        return np.dtype('datetime64[ns]')
 
 class Timedelta(Generic):
-    pass
+
+    @property
+    def to_numpy(self):
+        return {v:k for k,v in _cudf_dtype_from_numpy.items()}[self]
+
+    @property
+    def to_pandas(self):
+        # pandas only supports nanos
+        return np.dtype('timedelta64[ns]')
 
 class UInt8Dtype(UnsignedInteger):
     def __init__(self):
@@ -224,7 +191,7 @@ class BooleanDtype(Generic):
 
     def __init__(self):
         self.pa_type = pa.bool_()
-        self._name = "Boolean"
+        self._name = "boolean"
 
 class Datetime64NSDtype(Datetime):
     def __init__(self):
@@ -281,7 +248,7 @@ class StringDtype(Flexible):
 
     def __init__(self):
         self.pa_type = pa.string()
-        self._name = "String"
+        self._name = "string"
 
 
 def cudf_dtype_from_string(obj):
@@ -291,7 +258,10 @@ def cudf_dtype_from_string(obj):
         np_dtype = np.dtype(obj)
         return cudf_dtype_from_numpy(np_dtype)
     except TypeError:
-        return _cudf_dtype_from_string.get(obj, None)
+        result = _cudf_dtype_from_string.get(obj, None)
+        if not result:
+            raise TypeError(f"Could not find a cuDF dtype matching {obj}")
+        return result
 
 
 def cudf_dtype_from_numpy(obj):
@@ -304,7 +274,22 @@ def cudf_dtype_from_numpy(obj):
     elif obj is np.timedelta64:
         return cudf.Timedelta
     dtype = np.dtype(obj)
-    return _cudf_dtype_from_numpy.get(dtype, None)
+    if dtype.type is np.str_:
+        return StringDtype()
+    result = _cudf_dtype_from_numpy.get(dtype, None)
+    if not result:
+        raise TypeError(f"Could not find a cuDF dtype matching {obj}")
+    return result
+
+def cudf_dtype_from_pandas(obj):
+    if isinstance(obj, pd.core.arrays.numpy_.PandasDtype):
+        try:
+            return cudf_dtype_from_numpy(obj.numpy_dtype)
+        except TypeError:
+            result = _cudf_dtype_from_pandas.get(obj, None)
+            if not result:
+                raise TypeError(f"Could not find a cuDF dtype matching {obj}")
+            return result
 
 def dtype(obj):
     if isinstance(obj, Generic):
@@ -317,10 +302,8 @@ def dtype(obj):
         return cudf_dtype_from_string(obj)
     if isinstance(obj, pd.CategoricalDtype):
         return cudf.CategoricalDtype.from_pandas(obj)
-    elif obj in _pd_to_cudf_dtypes.keys():
-        return _pd_to_cudf_dtypes[obj]
-    elif isinstance(obj, pd.core.arrays.numpy_.PandasDtype):
-        return cudf_dtype_from_string(obj.name)
+    elif isinstance(obj, (ExtensionDtype, pd.core.arrays.numpy_.PandasDtype)):
+        return cudf_dtype_from_pandas(obj)
     elif isinstance(obj, pa.lib.DataType):
         return cudf_dtype_from_pyarrow[obj]
     elif obj is str:
@@ -332,6 +315,7 @@ def dtype(obj):
     elif obj is None:
         return None
     else:
+
         raise TypeError
         
         #raise TypeError(f"Could not find a cuDF dtype matching {obj}")
@@ -559,19 +543,6 @@ def __repr__(self):
     np.dtype("timedelta64[s]"): Timedelta64SDtype(),
 }
 
-_pd_to_cudf_dtypes = {
-    pd.Int8Dtype(): Int8Dtype(),
-    pd.Int16Dtype(): Int16Dtype(),
-    pd.Int32Dtype(): Int32Dtype(),
-    pd.Int64Dtype(): Int64Dtype(),
-    pd.UInt8Dtype(): UInt8Dtype(),
-    pd.UInt16Dtype(): UInt16Dtype(),
-    pd.UInt32Dtype(): UInt32Dtype(),
-    pd.UInt64Dtype(): UInt64Dtype(),
-    pd.BooleanDtype(): BooleanDtype(),
-    pd.StringDtype(): StringDtype(),
-}
-
 _cudf_dtype_from_string = {
     'UInt8': UInt8Dtype,
     'UInt16': UInt16Dtype,
@@ -595,3 +566,16 @@ def __repr__(self):
     'Timedelta64MS': Timedelta64MSDtype,
     'Timedelta64S': Timedelta64SDtype,
 }
+
+_cudf_dtype_from_pandas = {
+    pd.UInt8Dtype(): UInt8Dtype(),
+    pd.UInt16Dtype():  UInt16Dtype(),
+    pd.UInt32Dtype(): UInt32Dtype(),
+    pd.UInt64Dtype(): UInt64Dtype(),
+    pd.Int8Dtype():  Int8Dtype(),
+    pd.Int16Dtype(): Int16Dtype(),
+    pd.Int32Dtype(): Int32Dtype(),
+    pd.Int64Dtype(): Int64Dtype(),
+    pd.StringDtype(): StringDtype(),
+    pd.BooleanDtype(): BooleanDtype(),
+}
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 91f7e7d0a45..17c4d5b8c58 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -4363,7 +4363,7 @@ def keys(self):
     "UInt16": "Float32",
     "UInt32": "Float64",
     "UInt64": "Float64",
-    "Boolean": "Float32",
+    "boolean": "Float32",
     "Int": "Float",
 }
 

From 09385079084170a5d3e518f66cb3f495b3e958d5 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 4 Sep 2020 12:35:06 -0700
Subject: [PATCH 48/80] implement the scalar type attribute

---
 python/cudf/cudf/_lib/copying.pyx         |  3 ++-
 python/cudf/cudf/_lib/scalar.pyx          |  8 ++++----
 python/cudf/cudf/api/types.py             |  3 +++
 python/cudf/cudf/core/column/datetime.py  |  2 +-
 python/cudf/cudf/core/column/timedelta.py |  2 +-
 python/cudf/cudf/core/dtypes.py           | 11 +++++++----
 python/cudf/cudf/core/index.py            |  2 +-
 7 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx
index 357b019c0f3..cab42bce789 100644
--- a/python/cudf/cudf/_lib/copying.pyx
+++ b/python/cudf/cudf/_lib/copying.pyx
@@ -1,6 +1,7 @@
 # Copyright (c) 2020, NVIDIA CORPORATION.
 
 import pandas as pd
+from cudf.api.types import is_integer_dtype
 
 from libcpp cimport bool
 from libcpp.memory cimport make_unique, unique_ptr
@@ -129,7 +130,7 @@ def copy_range(Column input_column,
 
 
 def gather(Table source_table, Column gather_map, bool keep_index=True):
-    assert pd.api.types.is_integer_dtype(gather_map.dtype)
+    assert is_integer_dtype(gather_map.dtype)
 
     cdef unique_ptr[table] c_result
     cdef table_view source_table_view
diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx
index ba2ecef1cd5..773ce54be31 100644
--- a/python/cudf/cudf/_lib/scalar.pyx
+++ b/python/cudf/cudf/_lib/scalar.pyx
@@ -107,13 +107,13 @@ cdef class Scalar:
         """
         Returns a host copy of the underlying device scalar.
         """
-        if pd.api.types.is_string_dtype(self.dtype):
+        if cudf.api.types.is_string_dtype(self.dtype):
             return _get_py_string_from_string(self.c_value)
-        elif pd.api.types.is_numeric_dtype(self.dtype):
+        elif cudf.api.types.is_numerical_dtype(self.dtype):
             return _get_np_scalar_from_numeric(self.c_value)
-        elif pd.api.types.is_datetime64_dtype(self.dtype):
+        elif cudf.api.types.is_datetime64_dtype(self.dtype):
             return _get_np_scalar_from_timestamp64(self.c_value)
-        elif pd.api.types.is_timedelta64_dtype(self.dtype):
+        elif cudf.api.types.is_timedelta64_dtype(self.dtype):
             return _get_np_scalar_from_timedelta64(self.c_value)
         else:
             raise ValueError(
diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py
index 3f8fe33e43f..f3a90e25765 100644
--- a/python/cudf/cudf/api/types.py
+++ b/python/cudf/cudf/api/types.py
@@ -18,6 +18,9 @@ def is_timedelta64_dtype(obj):
 def is_string_dtype(obj):
     return isinstance(obj, cudf.StringDtype) or (pd.api.types.is_string_dtype(obj) and not is_categorical_dtype(obj))
 
+def is_integer_dtype(obj):
+    return isinstance(obj, cudf.Integer) or pd.api.types.is_integer_dtype(obj)
+
 def is_numerical_dtype(obj):
     if isinstance(obj, cudf.Generic):
         return isinstance(obj, (cudf.Number, cudf.BooleanDtype))
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index f293f480eb3..f1974b10ef7 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -61,7 +61,7 @@ def __init__(
             null_count=null_count,
         )
 
-        if not (self.dtype.type is np.datetime64):
+        if not isinstance(self.dtype, cudf.Datetime):
             raise TypeError(f"{self.dtype} is not a supported datetime type")
 
     def __contains__(self, item):
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index a43f9ee98dd..e7d38223736 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -56,7 +56,7 @@ def __init__(
             null_count=null_count,
         )
 
-        if not (self.dtype.type is np.timedelta64):
+        if not isinstance(self.dtype, cudf.Timedelta):
             raise TypeError(f"{self.dtype} is not a supported duration type")
 
     def __contains__(self, item):
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index d9d0ea07183..a01c580aced 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -50,10 +50,7 @@ def itemsize(self):
 
     @property
     def type(self):
-        if isinstance(self, (Floating, Datetime)):
-            return self.to_numpy.type
-        else:
-            return self.to_pandas.type
+        return CUDFType(self)
 
     @property
     def kind(self):
@@ -250,6 +247,12 @@ def __init__(self):
         self.pa_type = pa.string()
         self._name = "string"
 
+class CUDFType(object):
+    def __init__(self, parent_dtype):
+        self.parent_dtype = parent_dtype
+
+    def __call__(self, arg):
+        return cudf._lib.scalar.Scalar(arg, dtype=self.parent_dtype)
 
 def cudf_dtype_from_string(obj):
     if obj == 'category':
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 420c99afe92..a5e7ddb7b6c 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -1719,7 +1719,7 @@ def to_pandas(self):
         return pd.RangeIndex(
             start=self._start,
             stop=self._stop,
-            dtype=self.dtype,
+            dtype=self.dtype.to_pandas,
             name=self.name,
         )
 

From e5a489dbcaf04d5d411ffac68e6f1425fc340f44 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 4 Sep 2020 13:00:35 -0700
Subject: [PATCH 49/80] cleanup and style

---
 python/cudf/cudf/__init__.py                |  48 ++++---
 python/cudf/cudf/_lib/binaryop.pyx          |   2 +-
 python/cudf/cudf/api/types.py               |  49 +++++--
 python/cudf/cudf/core/column/categorical.py |   6 +-
 python/cudf/cudf/core/column/column.py      |  65 +++++++---
 python/cudf/cudf/core/column/datetime.py    |  15 +--
 python/cudf/cudf/core/column/lists.py       |   2 +-
 python/cudf/cudf/core/column/numerical.py   |  17 ++-
 python/cudf/cudf/core/column/string.py      |  22 ++--
 python/cudf/cudf/core/column/timedelta.py   |  25 ++--
 python/cudf/cudf/core/dataframe.py          |  13 +-
 python/cudf/cudf/core/dtypes.py             | 134 +++++++++++---------
 python/cudf/cudf/core/frame.py              |  10 +-
 python/cudf/cudf/core/index.py              |   4 +-
 python/cudf/cudf/core/indexing.py           |   6 +-
 python/cudf/cudf/core/join/join.py          |  18 ++-
 python/cudf/cudf/core/reshape.py            |   3 +-
 python/cudf/cudf/core/series.py             | 100 ++++++++-------
 python/cudf/cudf/io/parquet.py              |   4 +-
 python/cudf/cudf/tests/test_categorical.py  |   2 +-
 python/cudf/cudf/tests/test_column.py       |   3 +-
 python/cudf/cudf/tests/test_dataframe.py    |   4 +-
 python/cudf/cudf/tests/test_datetime.py     |   5 +-
 python/cudf/cudf/tests/test_dtypes.py       |  94 +++++++-------
 python/cudf/cudf/tests/test_joining.py      |   2 +-
 python/cudf/cudf/tests/test_numerical.py    |   2 +-
 python/cudf/cudf/tests/test_parquet.py      |   6 +-
 python/cudf/cudf/tests/test_repr.py         |  20 +--
 python/cudf/cudf/tests/test_setitem.py      |   2 +-
 python/cudf/cudf/tests/test_string.py       |   2 +-
 python/cudf/cudf/tests/test_timedelta.py    |  18 +--
 python/cudf/cudf/utils/dtypes.py            |  40 +++---
 python/cudf/cudf/utils/utils.py             |   4 +-
 33 files changed, 437 insertions(+), 310 deletions(-)

diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
index cff9df9f032..6e644cf09be 100644
--- a/python/cudf/cudf/__init__.py
+++ b/python/cudf/cudf/__init__.py
@@ -34,38 +34,36 @@
     merge,
 )
 from cudf.core.dtypes import (
-    dtype,
-    Generic,
+    BooleanDtype,
+    CategoricalDtype,
     Datetime,
-    Floating,
-    Number,
-    Integer,
+    Datetime64MSDtype,
+    Datetime64NSDtype,
+    Datetime64SDtype,
+    Datetime64USDtype,
     Flexible,
-    Datetime,
-    Timedelta,
-    CategoricalDtype, 
+    Float32Dtype,
+    Float64Dtype,
+    Floating,
+    Generic,
     Int8Dtype,
-    Int16Dtype, 
-    Int32Dtype, 
-    Int64Dtype, 
-    UInt8Dtype, 
-    UInt16Dtype,
-    UInt32Dtype, 
-    UInt64Dtype, 
+    Int16Dtype,
+    Int32Dtype,
+    Int64Dtype,
+    Integer,
+    Number,
     StringDtype,
-    Float32Dtype,
-    Float64Dtype, 
-    BooleanDtype,
-    Datetime64NSDtype,
-    Datetime64USDtype, 
-    Datetime64MSDtype,
-    Datetime64SDtype,
+    Timedelta,
+    Timedelta64MSDtype,
     Timedelta64NSDtype,
+    Timedelta64SDtype,
     Timedelta64USDtype,
-    Timedelta64MSDtype,
-    Timedelta64SDtype
+    UInt8Dtype,
+    UInt16Dtype,
+    UInt32Dtype,
+    UInt64Dtype,
+    dtype,
 )
-
 from cudf.core.groupby import Grouper
 from cudf.core.ops import (
     add,
diff --git a/python/cudf/cudf/_lib/binaryop.pyx b/python/cudf/cudf/_lib/binaryop.pyx
index 18c72da25f9..b2b2c217db2 100644
--- a/python/cudf/cudf/_lib/binaryop.pyx
+++ b/python/cudf/cudf/_lib/binaryop.pyx
@@ -23,7 +23,7 @@ from cudf._lib.cpp.types cimport (
     type_id,
 )
 
-from cudf.utils.dtypes import is_string_dtype
+from cudf.api.types import is_string_dtype
 
 from cudf._lib.cpp.binaryop cimport binary_operator
 cimport cudf._lib.cpp.binaryop as cpp_binaryop
diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py
index f3a90e25765..732828085b4 100644
--- a/python/cudf/cudf/api/types.py
+++ b/python/cudf/cudf/api/types.py
@@ -1,26 +1,41 @@
-import pandas as pd
-import cudf
 import numpy as np
+import pandas as pd
 from pandas.core.dtypes.dtypes import CategoricalDtype, CategoricalDtypeType
 
+import cudf
+
+
 def is_bool_dtype(obj):
     # todo - pd.api.types.is_bool_dtype should not give false, nor work at all probably
     if hasattr(obj, "dtype"):
         obj = obj.dtype
-    return isinstance(obj, cudf.BooleanDtype) or pd.api.types.is_bool_dtype(obj)
+    return isinstance(obj, cudf.BooleanDtype) or pd.api.types.is_bool_dtype(
+        obj
+    )
+
 
 def is_datetime64_dtype(obj):
-    return isinstance(obj, cudf.Datetime) or pd.api.types.is_datetime64_dtype(obj)
+    return isinstance(obj, cudf.Datetime) or pd.api.types.is_datetime64_dtype(
+        obj
+    )
+
 
 def is_timedelta64_dtype(obj):
-    return isinstance(obj, cudf.Timedelta) or pd.api.types.is_timedelta64_dtype(obj)
+    return isinstance(
+        obj, cudf.Timedelta
+    ) or pd.api.types.is_timedelta64_dtype(obj)
+
 
 def is_string_dtype(obj):
-    return isinstance(obj, cudf.StringDtype) or (pd.api.types.is_string_dtype(obj) and not is_categorical_dtype(obj))
+    return isinstance(obj, cudf.StringDtype) or (
+        pd.api.types.is_string_dtype(obj) and not is_categorical_dtype(obj)
+    )
+
 
 def is_integer_dtype(obj):
     return isinstance(obj, cudf.Integer) or pd.api.types.is_integer_dtype(obj)
 
+
 def is_numerical_dtype(obj):
     if isinstance(obj, cudf.Generic):
         return isinstance(obj, (cudf.Number, cudf.BooleanDtype))
@@ -34,11 +49,14 @@ def is_numerical_dtype(obj):
         or np.issubdtype(obj, np.signedinteger)
     )
 
+
 def is_categorical_dtype(obj):
     """Infer whether a given pandas, numpy, or cuDF Column, Series, or dtype
     is a pandas CategoricalDtype.
     """
-    if isinstance(obj, cudf.Generic) and not isinstance(obj, cudf.CategoricalDtype):
+    if isinstance(obj, cudf.Generic) and not isinstance(
+        obj, cudf.CategoricalDtype
+    ):
         return False
     if obj is None:
         return False
@@ -85,6 +103,7 @@ def is_categorical_dtype(obj):
             return True
     return pd.api.types.is_categorical_dtype(obj)
 
+
 def is_list_dtype(obj):
     return (
         type(obj) is cudf.core.dtypes.ListDtype
@@ -95,12 +114,18 @@ def is_list_dtype(obj):
         or (hasattr(obj, "dtype") and is_list_dtype(obj.dtype))
     )
 
+
 def find_common_type(array_types=[], scalar_types=[]):
-    array_types = [d.to_numpy if isinstance(d, cudf.Generic) else d for d in array_types]
-    scalar_types = [d.to_numpy if isinstance(d, cudf.Generic) else d for d in scalar_types]
+    array_types = [
+        d.to_numpy if isinstance(d, cudf.Generic) else d for d in array_types
+    ]
+    scalar_types = [
+        d.to_numpy if isinstance(d, cudf.Generic) else d for d in scalar_types
+    ]
 
     return cudf.dtype(np.find_common_type(array_types, scalar_types))
 
+
 def can_cast(dtype_l, dtype_r):
     if isinstance(dtype_l, cudf.Generic):
         dtype_l = dtype_l.to_numpy
@@ -109,7 +134,11 @@ def can_cast(dtype_l, dtype_r):
 
     return np.can_cast(dtype_l, dtype_r)
 
+
 def result_type(*arrays_and_dtypes):
 
-    arrays_and_dtypes = (d.to_numpy if isinstance(d, cudf.Generic) else d for d in arrays_and_dtypes)
+    arrays_and_dtypes = (
+        d.to_numpy if isinstance(d, cudf.Generic) else d
+        for d in arrays_and_dtypes
+    )
     return cudf.dtype(np.result_type(*arrays_and_dtypes))
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index d5931e439c5..f9108f4be64 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -7,6 +7,7 @@
 import cudf
 from cudf import _lib as libcudf
 from cudf._lib.transform import bools_to_mask
+from cudf.api.types import is_categorical_dtype
 from cudf.core.buffer import Buffer
 from cudf.core.column import column
 from cudf.core.column.methods import ColumnMethodsMixin
@@ -16,7 +17,6 @@
     min_signed_type,
     min_unsigned_type,
 )
-from cudf.api.types import is_categorical_dtype
 
 
 class CategoricalAccessor(ColumnMethodsMixin):
@@ -305,7 +305,9 @@ def add_categories(self, new_categories, **kwargs):
                 f"type-cast new_categories to the same type as "
                 f"existing categories."
             )
-        common_dtype = cudf.api.types.find_common_type([old_categories.dtype, new_categories.dtype], [])
+        common_dtype = cudf.api.types.find_common_type(
+            [old_categories.dtype, new_categories.dtype], []
+        )
 
         new_categories = new_categories.astype(common_dtype, copy=False)
         old_categories = old_categories.astype(common_dtype, copy=False)
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index b6d66d4ba8b..0f5f29913b0 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -23,6 +23,12 @@
 from cudf._lib.scalar import as_scalar
 from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count
 from cudf._lib.transform import bools_to_mask
+from cudf.api.types import (
+    is_bool_dtype,
+    is_categorical_dtype,
+    is_list_dtype,
+    is_string_dtype,
+)
 from cudf.core.abc import Serializable
 from cudf.core.buffer import Buffer
 from cudf.core.dtypes import CategoricalDtype
@@ -34,10 +40,8 @@
     is_scalar,
     min_signed_type,
     min_unsigned_type,
-    np_to_pa_dtype,
 )
 from cudf.utils.utils import mask_dtype
-from cudf.api.types import is_categorical_dtype, is_list_dtype, is_numerical_dtype, is_string_dtype, is_bool_dtype
 
 
 class ColumnBase(Column, Serializable):
@@ -202,14 +206,17 @@ def _concat(cls, objs, dtype=None):
                 [
                     o
                     for o in not_null_cols
-                    if not isinstance(o.dtype, (cudf.Number)) or isinstance(o.dtype, cudf.Datetime)
+                    if not isinstance(o.dtype, (cudf.Number))
+                    or isinstance(o.dtype, cudf.Datetime)
                 ]
             )
             == 0
         ):
             cudf_col_dtypes = [o.dtype for o in not_null_cols]
             # Use NumPy to find a common dtype
-            cudf_common_dtype = cudf.api.types.find_common_type(cudf_col_dtypes, [])
+            cudf_common_dtype = cudf.api.types.find_common_type(
+                cudf_col_dtypes, []
+            )
             # Cast all columns to the common dtype
             for i in range(len(objs)):
                 objs[i] = objs[i].astype(cudf_common_dtype)
@@ -635,9 +642,13 @@ def __getitem__(self, arg):
             arg = as_column(arg)
             if len(arg) == 0:
                 arg = as_column([], dtype="int32")
-            if pd.api.types.is_integer_dtype(arg.dtype) or isinstance(arg.dtype, cudf.Integer):
+            if pd.api.types.is_integer_dtype(arg.dtype) or isinstance(
+                arg.dtype, cudf.Integer
+            ):
                 return self.take(arg)
-            if pd.api.types.is_bool_dtype(arg.dtype) or isinstance(arg.dtype, cudf.BooleanDtype):
+            if pd.api.types.is_bool_dtype(arg.dtype) or isinstance(
+                arg.dtype, cudf.BooleanDtype
+            ):
                 return self.apply_boolean_mask(arg)
             raise NotImplementedError(type(arg))
 
@@ -1607,9 +1618,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
         elif arb_dtype.kind in ("O", "U"):
 
             pa_data = pa.Array.from_pandas(arbitrary)
-            data = as_column(
-                pa_data, dtype=cudf.dtype(pa_data.type)
-            )
+            data = as_column(pa_data, dtype=cudf.dtype(pa_data.type))
             # There is no cast operation available for pa.Array from int to
             # str, Hence instead of handling in pa.Array block, we
             # will have to type-cast here.
@@ -1681,24 +1690,44 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
                     if is_categorical_dtype(dtype):
                         raise TypeError
 
-                pa_data = pa.array(arbitrary,
-                                   type=dtype.pa_type if dtype is not None else None, 
-                                   from_pandas=True if nan_as_null is None else nan_as_null)
+                pa_data = pa.array(
+                    arbitrary,
+                    type=dtype.pa_type if dtype is not None else None,
+                    from_pandas=True if nan_as_null is None else nan_as_null,
+                )
                 # todo: fix this ???? ????????
-                as_column_dtype = cudf.dtype(pa_data.type) if not isinstance(pa_data.type, (pa.lib.DictionaryType, pa.lib.ListType)) else None
-                data = as_column(pa_data, dtype=as_column_dtype, nan_as_null=nan_as_null)
+                as_column_dtype = (
+                    cudf.dtype(pa_data.type)
+                    if not isinstance(
+                        pa_data.type, (pa.lib.DictionaryType, pa.lib.ListType)
+                    )
+                    else None
+                )
+                data = as_column(
+                    pa_data, dtype=as_column_dtype, nan_as_null=nan_as_null
+                )
 
             except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                 if is_categorical_dtype(dtype):
-                    if isinstance(dtype, pd.CategoricalDtype) or dtype is 'category':
-                        data = as_column(pd.Series(arbitrary, dtype=dtype), nan_as_null=nan_as_null)
+                    # `is` on a str literal is unreliable; compare by value.
+                    if isinstance(dtype, pd.CategoricalDtype) or (
+                        isinstance(dtype, str) and dtype == "category"
+                    ):
+                        data = as_column(
+                            pd.Series(arbitrary, dtype=dtype),
+                            nan_as_null=nan_as_null,
+                        )
                     else:
-                        data = as_column(arbitrary, nan_as_null=nan_as_null).astype(dtype)
+                        data = as_column(
+                            arbitrary, nan_as_null=nan_as_null
+                        ).astype(dtype)
                 elif isinstance(cudf.dtype(dtype), cudf.StringDtype):
                     sr = pd.Series(arbitrary, dtype="str")
                     data = as_column(sr, nan_as_null=nan_as_null)
                 else:
-                    native_dtype = dtype.to_numpy if dtype is not None else None
+                    native_dtype = (
+                        dtype.to_numpy if dtype is not None else None
+                    )
                     if dtype is None and pd.api.types.infer_dtype(
                         arbitrary
                     ) in ("mixed", "mixed-integer"):
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index f1974b10ef7..3894b5dd0dc 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -5,11 +5,11 @@
 import numpy as np
 import pandas as pd
 
-import cudf.core.dtypes as cudf_dtypes
 import cudf
 from cudf import _lib as libcudf
 from cudf._lib.nvtx import annotate
 from cudf._lib.scalar import Scalar, as_scalar
+from cudf.core import dtypes as cudf_dtypes
 from cudf.core.column import column, string
 from cudf.utils.dtypes import is_scalar
 
@@ -170,9 +170,9 @@ def as_string_column(self, dtype, **kwargs):
             )
             kwargs["format"] = fmt
         if len(self) > 0:
-            return string._numeric_to_str_typecast_functions[
-                self.dtype
-            ](self, **kwargs)
+            return string._numeric_to_str_typecast_functions[self.dtype](
+                self, **kwargs
+            )
         else:
             return column.column_empty(0, dtype="object", masked=False)
 
@@ -184,7 +184,6 @@ def default_na_value(self):
     def binary_operator(self, op, rhs, reflect=False):
         lhs, rhs = self, rhs
 
-        lhs_dtype = cudf.dtype(lhs.dtype)
         rhs_dtype = cudf.dtype(rhs.dtype)
 
         if op in ("eq", "ne", "lt", "gt", "le", "ge"):
@@ -203,9 +202,9 @@ def binary_operator(self, op, rhs, reflect=False):
             lhs_unit = units.index(lhs_time_unit)
             rhs_time_unit = cudf.utils.dtypes.get_time_unit(rhs)
             rhs_unit = units.index(rhs_time_unit)
-            out_dtype = cudf.dtype(np.dtype(
-                f"timedelta64[{units[max(lhs_unit, rhs_unit)]}]"
-            ))
+            out_dtype = cudf.dtype(
+                np.dtype(f"timedelta64[{units[max(lhs_unit, rhs_unit)]}]")
+            )
         else:
             raise TypeError(
                 f"Series of dtype {self.dtype} cannot perform "
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index 295c2fa250f..fb308ce09cd 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -1,9 +1,9 @@
 # Copyright (c) 2020, NVIDIA CORPORATION.
 
+from cudf.api.types import is_list_dtype
 from cudf.core.column import ColumnBase
 from cudf.core.column.methods import ColumnMethodsMixin
 from cudf.core.dtypes import ListDtype
-from cudf.api.types import is_list_dtype
 
 
 class ListColumn(ColumnBase):
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index d3b3fe7d0ee..2fa16c8458d 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -8,13 +8,14 @@
 from cudf._lib.scalar import Scalar
 from cudf.core.buffer import Buffer
 from cudf.core.column import as_column, build_column, column, string
+from cudf.core.dtypes import Float64Dtype
 from cudf.utils import cudautils, utils
 from cudf.utils.dtypes import (
     min_column_type,
     min_signed_type,
     numeric_normalize_types,
 )
-from cudf.core.dtypes import Float64Dtype
+
 
 class NumericalColumn(column.ColumnBase):
     def __init__(
@@ -79,7 +80,9 @@ def binary_operator(self, binop, rhs, reflect=False):
         if reflect:
             tmp = self
         if isinstance(rhs, (NumericalColumn, Scalar)) or np.isscalar(rhs):
-            out_dtype = np.result_type(cudf.dtype(self.dtype).to_numpy, cudf.dtype(rhs.dtype).to_numpy)
+            out_dtype = np.result_type(
+                cudf.dtype(self.dtype).to_numpy, cudf.dtype(rhs.dtype).to_numpy
+            )
             out_dtype = cudf.dtype(out_dtype)
             if binop in ["mod", "floordiv"]:
                 if (cudf.dtype(tmp.dtype) in int_dtypes) and (
@@ -134,9 +137,9 @@ def int2ip(self):
     def as_string_column(self, dtype, **kwargs):
 
         if len(self) > 0:
-            return string._numeric_to_str_typecast_functions[
-                self.dtype
-            ](self, **kwargs)
+            return string._numeric_to_str_typecast_functions[self.dtype](
+                self, **kwargs
+            )
         else:
             return as_column([], dtype="object")
 
@@ -167,6 +170,7 @@ def as_numerical_column(self, dtype, **kwargs):
             return self
         if dtype is None:
             import pdb
+
             pdb.set_trace()
         return libcudf.unary.cast(self, dtype)
 
@@ -175,13 +179,14 @@ def sum(self, dtype=None):
             return libcudf.reduce.reduce("sum", self, dtype=dtype)
         except:
             import pdb
+
             pdb.set_trace()
 
     def product(self, dtype=None):
         return libcudf.reduce.reduce("product", self, dtype=dtype)
 
     def mean(self, dtype=Float64Dtype()):
-            return libcudf.reduce.reduce("mean", self, dtype=dtype)
+        return libcudf.reduce.reduce("mean", self, dtype=dtype)
 
     def var(self, ddof=1, dtype=Float64Dtype()):
         return libcudf.reduce.reduce("var", self, dtype=dtype, ddof=ddof)
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index a5978ff9e0a..224211d5b2d 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -132,20 +132,18 @@
     slice_strings as cpp_slice_strings,
 )
 from cudf._lib.strings.translate import (
-    translate as cpp_translate,
     filter_characters as cpp_filter_characters,
+    translate as cpp_translate,
 )
 from cudf._lib.strings.wrap import wrap as cpp_wrap
+from cudf.api.types import is_list_dtype, is_string_dtype
 from cudf.core.buffer import Buffer
 from cudf.core.column import column, datetime
 from cudf.core.column.methods import ColumnMethodsMixin
+from cudf.core.dtypes import dtype
 from cudf.utils import utils
 from cudf.utils.docutils import copy_docstring
-from cudf.utils.dtypes import (
-    can_convert_to_column, is_scalar
-)
-from cudf.api.types import is_list_dtype, is_string_dtype
-from cudf.core.dtypes import dtype
+from cudf.utils.dtypes import can_convert_to_column, is_scalar
 
 _str_to_numeric_typecast_functions = {
     dtype("int8"): str_cast.stoi8,
@@ -4564,7 +4562,9 @@ def _nbytes(self):
             return self.children[1].size
 
     def as_numerical_column(self, dtype, **kwargs):
-        out_dtype = cudf.dtype(dtype) if dtype is not None else cudf.Float64Dtype()
+        out_dtype = (
+            cudf.dtype(dtype) if dtype is not None else cudf.Float64Dtype()
+        )
         kwargs.update(dtype=out_dtype)
 
         if out_dtype.type is np.datetime64:
@@ -4744,7 +4744,9 @@ def fillna(self, fill_value):
 
     def _find_first_and_last(self, value):
         found_indices = self.str().contains(f"^{value}$")
-        found_indices = libcudf.unary.cast(found_indices, dtype=cudf.Int32Dtype())
+        found_indices = libcudf.unary.cast(
+            found_indices, dtype=cudf.Int32Dtype()
+        )
         first = column.as_column(found_indices).find_first_value(1)
         last = column.as_column(found_indices).find_last_value(1)
         return first, last
@@ -4776,7 +4778,9 @@ def binary_operator(self, op, rhs, reflect=False):
         if isinstance(rhs, StringColumn) and op == "add":
             return lhs.str().cat(others=rhs)
         elif op in ("eq", "ne", "gt", "lt", "ge", "le"):
-            return _string_column_binop(self, rhs, op=op, out_dtype=cudf.BooleanDtype())
+            return _string_column_binop(
+                self, rhs, op=op, out_dtype=cudf.BooleanDtype()
+            )
         else:
             msg = "{!r} operator not supported between {} and {}"
             raise TypeError(msg.format(op, type(self), type(rhs)))
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index e7d38223736..60b3f027efe 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -10,10 +10,11 @@
 from cudf import _lib as libcudf
 from cudf._lib.nvtx import annotate
 from cudf._lib.scalar import Scalar, as_scalar
+from cudf.api.types import can_cast
 from cudf.core.column import column, string
 from cudf.core.column.datetime import _numpy_to_pandas_conversion
 from cudf.utils.dtypes import is_scalar, np_to_pa_dtype
-from cudf.api.types import can_cast
+
 _dtype_to_format_conversion = {
     "Timedelta64NS": "%D days %H:%M:%S",
     "Timedelta64US": "%D days %H:%M:%S",
@@ -298,9 +299,9 @@ def as_string_column(self, dtype, **kwargs):
             )
             kwargs["format"] = fmt
         if len(self) > 0:
-            return string._numeric_to_str_typecast_functions[
-                self.dtype
-            ](self, **kwargs)
+            return string._numeric_to_str_typecast_functions[self.dtype](
+                self, **kwargs
+            )
         else:
             return column.column_empty(0, dtype="object", masked=False)
 
@@ -548,7 +549,9 @@ def _timedelta_binary_op_add(lhs, rhs):
         lhs_unit = units.index(lhs_time_unit)
         rhs_time_unit = cudf.utils.dtypes.get_time_unit(rhs)
         rhs_unit = units.index(rhs_time_unit)
-        out_dtype = cudf.dtype(np.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]"))
+        out_dtype = cudf.dtype(
+            np.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]")
+        )
     else:
         raise TypeError(
             f"Addition of {lhs.dtype} with {rhs.dtype} "
@@ -559,15 +562,21 @@ def _timedelta_binary_op_add(lhs, rhs):
 
 
 def _timedelta_binary_op_sub(lhs, rhs):
-    if isinstance(lhs.dtype, cudf.Timedelta) and isinstance(rhs.dtype, cudf.Timedelta):
+    if isinstance(lhs.dtype, cudf.Timedelta) and isinstance(
+        rhs.dtype, cudf.Timedelta
+    ):
         out_dtype = determine_out_dtype(lhs.dtype, rhs.dtype)
-    elif isinstance(rhs.dtype, cudf.Timedelta) and isinstance(lhs.dtype, cudf.Datetime):
+    elif isinstance(rhs.dtype, cudf.Timedelta) and isinstance(
+        lhs.dtype, cudf.Datetime
+    ):
         units = ["s", "ms", "us", "ns"]
         lhs_time_unit = cudf.utils.dtypes.get_time_unit(lhs)
         lhs_unit = units.index(lhs_time_unit)
         rhs_time_unit = cudf.utils.dtypes.get_time_unit(rhs)
         rhs_unit = units.index(rhs_time_unit)
-        out_dtype = cudf.dtype(np.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]"))
+        out_dtype = cudf.dtype(
+            np.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]")
+        )
     else:
         raise TypeError(
             f"Subtraction of {lhs.dtype} with {rhs.dtype} "
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index ed12e34a688..0f59bda56a5 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -26,6 +26,7 @@
 from cudf import _lib as libcudf
 from cudf._lib.null_mask import MaskState, create_null_mask
 from cudf._lib.nvtx import annotate
+from cudf.api.types import is_categorical_dtype, is_list_dtype, is_string_dtype
 from cudf.core import column
 from cudf.core.abc import Serializable
 from cudf.core.column import as_column, column_empty
@@ -44,7 +45,6 @@
     is_scalar,
     numeric_normalize_types,
 )
-from cudf.api.types import is_categorical_dtype, is_list_dtype, is_string_dtype
 from cudf.utils.utils import OrderedColumnDict
 
 
@@ -4520,7 +4520,10 @@ def _sizeof_fmt(num, size_qualifier):
                 deep = True
             else:
                 deep = False
-                if "String" in dtype_counts or self.index.dtype == cudf.StringDtype():
+                if (
+                    "String" in dtype_counts
+                    or self.index.dtype == cudf.StringDtype()
+                ):
                     size_qualifier = "+"
             mem_usage = self.memory_usage(index=True, deep=deep).sum()
             lines.append(
@@ -4907,7 +4910,9 @@ def to_records(self, index=True):
         numpy recarray
         """
         members = [("index", self.index.dtype.to_numpy)] if index else []
-        members += [(col, self[col].dtype.to_numpy) for col in self._data.names]
+        members += [
+            (col, self[col].dtype.to_numpy) for col in self._data.names
+        ]
         dtype = np.dtype(members)
         ret = np.recarray(len(self), dtype=dtype)
         if index:
@@ -6419,7 +6424,7 @@ def select_dtypes(self, include=None, exclude=None):
                     include_subtypes.add(i_dtype)
                 elif issubclass(dtype, i_dtype):
                     include_subtypes.add(dtype)
-    
+
         # exclude all subtypes
         exclude_subtypes = set()
         for dtype in (d.__class__ for d in self.dtypes):
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index a01c580aced..360de63efae 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -6,8 +6,10 @@
 import pandas as pd
 import pyarrow as pa
 from pandas.api.extensions import ExtensionDtype
-from cudf._lib.types import _Dtype
+
 import cudf
+from cudf._lib.types import _Dtype
+
 
 class Generic(ExtensionDtype, _Dtype):
     pa_type = None
@@ -15,7 +17,9 @@ class Generic(ExtensionDtype, _Dtype):
     def __eq__(self, other):
         if isinstance(other, self.__class__):
             return True
-        if isinstance(other, Generic) and not isinstance(other, self.__class__):
+        if isinstance(other, Generic) and not isinstance(
+            other, self.__class__
+        ):
             return False
         if (
             isinstance(other, self.to_pandas.__class__)
@@ -54,7 +58,7 @@ def type(self):
 
     @property
     def kind(self):
-            return self.to_pandas.kind
+        return self.to_pandas.kind
 
     @property
     def name(self):
@@ -65,65 +69,71 @@ def __repr__(self):
 
     def __hash__(self):
         return hash(self.__repr__())
-    
+
     def _raise_construction_error(self):
         raise TypeError(f"Cannot create {type(self)} instances")
 
 
-
 class Number(Generic):
     def __init__(self):
         self._raise_construction_error()
 
+
 class Integer(Number):
     def __init__(self):
         self._raise_construction_error()
 
+
 class SignedInteger(Integer):
     def __init__(self):
         self._raise_construction_error()
-        
+
+
 class UnsignedInteger(Integer):
     def __init__(self):
         self._raise_construction_error()
-        
+
+
 class Inexact(Number):
     def __init__(self):
         self._raise_construction_error()
-        
+
+
 class Floating(Inexact):
     def __init__(self):
         self._raise_construction_error()
-        
+
     @property
     def kind(self):
         return "f"
 
+
 class Flexible(Generic):
     def __init__(self):
-        self._construction_error()
+        self._raise_construction_error()
-        
-class Datetime(Generic):   
 
+
+class Datetime(Generic):
     @property
     def to_numpy(self):
-        return {v:k for k,v in _cudf_dtype_from_numpy.items()}[self]
+        return {v: k for k, v in _cudf_dtype_from_numpy.items()}[self]
 
     @property
     def to_pandas(self):
         # pandas only supports nanos
-        return np.dtype('datetime64[ns]')
+        return np.dtype("datetime64[ns]")
 
-class Timedelta(Generic):
 
+class Timedelta(Generic):
     @property
     def to_numpy(self):
-        return {v:k for k,v in _cudf_dtype_from_numpy.items()}[self]
+        return {v: k for k, v in _cudf_dtype_from_numpy.items()}[self]
 
     @property
     def to_pandas(self):
         # pandas only supports nanos
-        return np.dtype('timedelta64[ns]')
+        return np.dtype("timedelta64[ns]")
+
 
 class UInt8Dtype(UnsignedInteger):
     def __init__(self):
@@ -172,6 +182,7 @@ def __init__(self):
         self.pa_type = pa.int64()
         self._name = "Int64"
 
+
 class Float32Dtype(Floating):
     def __init__(self):
         self.pa_type = pa.float32()
@@ -185,11 +196,11 @@ def __init__(self):
 
 
 class BooleanDtype(Generic):
-
     def __init__(self):
         self.pa_type = pa.bool_()
         self._name = "boolean"
 
+
 class Datetime64NSDtype(Datetime):
     def __init__(self):
         self.pa_type = pa.timestamp("ns")
@@ -217,36 +228,41 @@ def __init__(self):
         self._name = "Datetime64S"
         self._time_unit = "s"
 
+
 class Timedelta64NSDtype(Timedelta):
     def __init__(self):
-        self.pa_type = pa.duration('ns')
+        self.pa_type = pa.duration("ns")
         self._name = "Timedelta64NS"
-        self._time_unit = 'ns'
+        self._time_unit = "ns"
+
 
 class Timedelta64USDtype(Timedelta):
     def __init__(self):
-        self.pa_type = pa.duration('us')
+        self.pa_type = pa.duration("us")
         self._name = "Timedelta64US"
-        self._time_unit = 'us'
+        self._time_unit = "us"
+
 
 class Timedelta64MSDtype(Timedelta):
     def __init__(self):
-        self.pa_type = pa.duration('ms')
+        self.pa_type = pa.duration("ms")
         self._name = "Timedelta64MS"
-        self._time_unit = 'ms'
+        self._time_unit = "ms"
+
 
 class Timedelta64SDtype(Timedelta):
     def __init__(self):
-        self.pa_type = pa.duration('s')
+        self.pa_type = pa.duration("s")
         self._name = "Timedelta64S"
-        self._time_unit = 's'
+        self._time_unit = "s"
 
-class StringDtype(Flexible):
 
+class StringDtype(Flexible):
     def __init__(self):
         self.pa_type = pa.string()
         self._name = "string"
 
+
 class CUDFType(object):
     def __init__(self, parent_dtype):
         self.parent_dtype = parent_dtype
@@ -254,8 +270,9 @@ def __init__(self, parent_dtype):
     def __call__(self, arg):
         return cudf._lib.scalar.Scalar(arg, dtype=self.parent_dtype)
 
+
 def cudf_dtype_from_string(obj):
-    if obj == 'category':
+    if obj == "category":
         return obj
     try:
         np_dtype = np.dtype(obj)
@@ -284,6 +301,7 @@ def cudf_dtype_from_numpy(obj):
         raise TypeError(f"Could not find a cuDF dtype matching {obj}")
     return result
 
+
 def cudf_dtype_from_pandas(obj):
     if isinstance(obj, pd.core.arrays.numpy_.PandasDtype):
         try:
@@ -294,12 +312,15 @@ def cudf_dtype_from_pandas(obj):
                 raise TypeError(f"Could not find a cuDF dtype matching {obj}")
             return result
 
+
 def dtype(obj):
     if isinstance(obj, Generic):
         return obj
     elif type(obj) is type and issubclass(obj, Generic):
         return obj()
-    elif isinstance(obj, np.dtype) or (isinstance(obj, type) and issubclass(obj, (np.generic, np.dtype))):
+    elif isinstance(obj, np.dtype) or (
+        isinstance(obj, type) and issubclass(obj, (np.generic, np.dtype))
+    ):
         return cudf_dtype_from_numpy(obj)
     elif isinstance(obj, str):
         return cudf_dtype_from_string(obj)
@@ -320,12 +341,11 @@ def dtype(obj):
     else:
 
         raise TypeError
-        
-        #raise TypeError(f"Could not find a cuDF dtype matching {obj}")
 
+        # raise TypeError(f"Could not find a cuDF dtype matching {obj}")
 
-class CategoricalDtype(Generic):
 
+class CategoricalDtype(Generic):
     def __init__(self, categories=None, ordered=None):
         """
         dtype similar to pd.CategoricalDtype with the categories
@@ -431,7 +451,7 @@ def deserialize(cls, header, frames):
 
     @property
     def kind(self):
-        return 'O'
+        return "O"
 
 
 class ListDtype(Generic):
@@ -463,7 +483,7 @@ def leaf_type(self):
 
     @property
     def kind(self):
-        return 'O'
+        return "O"
 
     @property
     def type(self):
@@ -519,7 +539,7 @@ def __repr__(self):
     pa.duration("ms"): Timedelta64MSDtype(),
     pa.duration("s"): Timedelta64SDtype(),
     pa.date32(): Datetime64NSDtype(),
-    pa.null(): None
+    pa.null(): None,
 }
 
 _cudf_dtype_from_numpy = {
@@ -547,35 +567,35 @@ def __repr__(self):
 }
 
 _cudf_dtype_from_string = {
-    'UInt8': UInt8Dtype,
-    'UInt16': UInt16Dtype,
-    'UInt32': UInt32Dtype,
-    'UInt64': UInt64Dtype,
-    'Int8': Int8Dtype,
-    'Int16': Int16Dtype,
-    'Int32': Int32Dtype,
-    'Int64': Int64Dtype,
-    'Float': Float64Dtype,
-    'Float32': Float32Dtype,
-    'Float64': Float64Dtype,
-    'Boolean': BooleanDtype,
-    'String': StringDtype,
-    'Datetime64NS': Datetime64NSDtype,
-    'Datetime64US': Datetime64USDtype,
-    'Datetime64MS': Datetime64MSDtype,
-    'Datetime64S': Datetime64SDtype,
-    'Timedelta64NS': Timedelta64NSDtype,
-    'Timedelta64US': Timedelta64USDtype,
-    'Timedelta64MS': Timedelta64MSDtype,
-    'Timedelta64S': Timedelta64SDtype,
+    "UInt8": UInt8Dtype,
+    "UInt16": UInt16Dtype,
+    "UInt32": UInt32Dtype,
+    "UInt64": UInt64Dtype,
+    "Int8": Int8Dtype,
+    "Int16": Int16Dtype,
+    "Int32": Int32Dtype,
+    "Int64": Int64Dtype,
+    "Float": Float64Dtype,
+    "Float32": Float32Dtype,
+    "Float64": Float64Dtype,
+    "Boolean": BooleanDtype,
+    "String": StringDtype,
+    "Datetime64NS": Datetime64NSDtype,
+    "Datetime64US": Datetime64USDtype,
+    "Datetime64MS": Datetime64MSDtype,
+    "Datetime64S": Datetime64SDtype,
+    "Timedelta64NS": Timedelta64NSDtype,
+    "Timedelta64US": Timedelta64USDtype,
+    "Timedelta64MS": Timedelta64MSDtype,
+    "Timedelta64S": Timedelta64SDtype,
 }
 
 _cudf_dtype_from_pandas = {
     pd.UInt8Dtype(): UInt8Dtype(),
-    pd.UInt16Dtype():  UInt16Dtype(),
+    pd.UInt16Dtype(): UInt16Dtype(),
     pd.UInt32Dtype(): UInt32Dtype(),
     pd.UInt64Dtype(): UInt64Dtype(),
-    pd.Int8Dtype():  Int8Dtype(),
+    pd.Int8Dtype(): Int8Dtype(),
     pd.Int16Dtype(): Int16Dtype(),
     pd.Int32Dtype(): Int32Dtype(),
     pd.Int64Dtype(): Int64Dtype(),
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 649a0af9fd8..31de28f408f 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -13,6 +13,7 @@
 import cudf
 from cudf import _lib as libcudf
 from cudf._lib.nvtx import annotate
+from cudf.api.types import is_categorical_dtype, is_numerical_dtype
 from cudf.core.column import as_column, build_categorical_column
 from cudf.utils import utils
 from cudf.utils.dtypes import (
@@ -21,7 +22,6 @@
     min_scalar_type,
     min_signed_type,
 )
-from cudf.api.types import is_numerical_dtype, is_categorical_dtype
 
 
 class Frame(libcudf.table.Table):
@@ -277,7 +277,9 @@ def find_common_dtypes_and_categories(non_null_columns, dtypes):
                 dtypes[idx] = cols[0].dtype
                 # If all the non-null dtypes are int/float, find a common dtype
                 if all(is_numerical_dtype(col.dtype) for col in cols):
-                    dtypes[idx] = cudf.api.types.find_common_type([col.dtype for col in cols], [])
+                    dtypes[idx] = cudf.api.types.find_common_type(
+                        [col.dtype for col in cols], []
+                    )
                 # If all categorical dtypes, combine the categories
                 elif all(
                     isinstance(col, cudf.core.column.CategoricalColumn)
@@ -294,9 +296,7 @@ def find_common_dtypes_and_categories(non_null_columns, dtypes):
                     # will be re-assigned at the end
                     dtypes[idx] = min_scalar_type(len(categories[idx]))
                 # Otherwise raise an error if columns have different dtypes
-                elif not all(
-                    c.dtype == dtypes[idx] for c in cols
-                ):
+                elif not all(c.dtype == dtypes[idx] for c in cols):
                     raise ValueError("All columns must be the same type")
             return categories
 
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index a5e7ddb7b6c..31afdf10a33 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -11,6 +11,7 @@
 
 import cudf
 from cudf._lib.nvtx import annotate
+from cudf.api.types import is_categorical_dtype
 from cudf.core.abc import Serializable
 from cudf.core.column import (
     CategoricalColumn,
@@ -22,6 +23,7 @@
     column,
 )
 from cudf.core.column.string import StringMethods as StringMethods
+from cudf.core.dtypes import dtype
 from cudf.core.frame import Frame
 from cudf.utils import ioutils, utils
 from cudf.utils.docutils import copy_docstring
@@ -31,9 +33,7 @@
     is_scalar,
     numeric_normalize_types,
 )
-from cudf.api.types import is_categorical_dtype
 from cudf.utils.utils import cached_property
-from cudf.core.dtypes import dtype
 
 
 def _to_frame(this_index, index=True, name=None):
diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py
index e1324c15268..83dfc0cb768 100755
--- a/python/cudf/cudf/core/indexing.py
+++ b/python/cudf/cudf/core/indexing.py
@@ -4,13 +4,13 @@
 
 import cudf
 from cudf._lib.nvtx import annotate
+from cudf.api.types import is_categorical_dtype
 from cudf.utils.dtypes import (
     is_column_like,
     is_list_like,
     is_scalar,
     to_cudf_compatible_scalar,
 )
-from cudf.api.types import is_categorical_dtype
 
 
 def indices_from_labels(obj, labels):
@@ -94,7 +94,9 @@ def __setitem__(self, key, value):
         ):
             # normalize types if necessary:
             if not pd.api.types.is_integer(key):
-                to_dtype = cudf.api.types.result_type(value.dtype, self._sr._column.dtype)
+                to_dtype = cudf.api.types.result_type(
+                    value.dtype, self._sr._column.dtype
+                )
                 value = value.astype(to_dtype.to_numpy)
                 self._sr._column._mimic_inplace(
                     self._sr._column.astype(to_dtype), inplace=True
diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py
index e2a0af5cef2..95a1a05b377 100644
--- a/python/cudf/cudf/core/join/join.py
+++ b/python/cudf/cudf/core/join/join.py
@@ -2,7 +2,6 @@
 import itertools
 import warnings
 
-import numpy as np
 import pandas as pd
 
 import cudf
@@ -398,16 +397,25 @@ def input_to_libcudf_casting_rules(self, lcol, rcol, how):
             ):
                 if dtype_l.kind == dtype_r.kind:
                     # both ints or both floats
-                    libcudf_join_type = cudf.dtype(max(dtype_l.to_numpy, dtype_r.to_numpy))
+                    libcudf_join_type = cudf.dtype(
+                        max(dtype_l.to_numpy, dtype_r.to_numpy)
+                    )
                 else:
-                    libcudf_join_type = cudf.api.types.find_common_type([], [dtype_l, dtype_r])
+                    libcudf_join_type = cudf.api.types.find_common_type(
+                        [], [dtype_l, dtype_r]
+                    )
             elif isinstance(dtype_l, cudf.Datetime) and isinstance(
                 dtype_r, cudf.Datetime
             ):
-                libcudf_join_type = cudf.dtype(max(dtype_l.to_numpy, dtype_r.to_numpy))
+                libcudf_join_type = cudf.dtype(
+                    max(dtype_l.to_numpy, dtype_r.to_numpy)
+                )
         if libcudf_join_type is None:
             # todo: test this
-            raise TypeError(f"Cant find an implicit common type for {dtype_l} and {dtype_r}")
+            raise TypeError(
+                f"Cant find an implicit common \
+                type for {dtype_l} and {dtype_r}"
+            )
         return libcudf_join_type
 
     def libcudf_to_output_casting_rules(self, lcol, rcol, how):
diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index dbed1510866..4c2816deeea 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -3,6 +3,7 @@
 import pandas as pd
 
 import cudf
+from cudf.api.types import is_categorical_dtype
 from cudf.core import DataFrame, Index, Series
 from cudf.core.column import (
     CategoricalColumn,
@@ -10,7 +11,7 @@
     build_categorical_column,
 )
 from cudf.utils.dtypes import is_list_like
-from cudf.api.types import is_categorical_dtype
+
 _axis_map = {0: 0, 1: 1, "index": 0, "columns": 1}
 
 
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 17c4d5b8c58..8804aff2e38 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -16,6 +16,7 @@
 from cudf import _lib as libcudf
 from cudf._lib.nvtx import annotate
 from cudf._lib.transform import bools_to_mask
+from cudf.api.types import is_list_dtype, is_string_dtype
 from cudf.core.abc import Serializable
 from cudf.core.column import (
     ColumnBase,
@@ -46,7 +47,6 @@
     min_scalar_type,
     numeric_normalize_types,
 )
-from cudf.api.types import is_list_dtype, is_string_dtype
 
 
 class Series(Frame, Serializable):
@@ -1396,9 +1396,19 @@ def __rtruediv__(self, other):
     __div__ = __truediv__
 
     def _bitwise_binop(self, other, op):
-        if (isinstance(self.dtype, (cudf.BooleanDtype, cudf.Integer, cudf.Timedelta))) and (isinstance(other.dtype, (cudf.BooleanDtype, cudf.Integer, cudf.Timedelta))):
+        if (
+            isinstance(
+                self.dtype, (cudf.BooleanDtype, cudf.Integer, cudf.Timedelta)
+            )
+        ) and (
+            isinstance(
+                other.dtype, (cudf.BooleanDtype, cudf.Integer, cudf.Timedelta)
+            )
+        ):
             ser = self._binaryop(other, op)
-            if isinstance(self.dtype, cudf.BooleanDtype) or isinstance(other.dtype, cudf.BooleanDtype):
+            if isinstance(self.dtype, cudf.BooleanDtype) or isinstance(
+                other.dtype, cudf.BooleanDtype
+            ):
                 ser = ser.astype(cudf.BooleanDtype())
         else:
             raise TypeError(
@@ -1406,7 +1416,7 @@ def _bitwise_binop(self, other, op):
                 f"{self.dtype.type.__name__} and {other.dtype.type.__name__}"
             )
         return ser
-        
+
     def __and__(self, other):
         """Performs vectorized bitwise and (&) on corresponding elements of two
         series.
@@ -5177,53 +5187,53 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False):
 
     return Series(result_col, index=index)
 
+
 def _fix_nullable_dtype_repr(string):
 
     to_replace = [
-        'uint8',
-        'uint16',
-        'uint32', 
-        'uint64', 
-        'int8', 
-        'int16', 
-        'int32', 
-        'int64', 
-        'float32', 
-        'float64', 
-        'bool', 
-        'object', 
-        'datetime64[ns]', 
-        'datetime64[us]', 
-        'datetime64[ms]', 
-        'datetime64[s]',
-        'timedelta64[ns]',
-        'timedelta64[us]',
-        'timedelta64[ms]',
-        'timedelta64[s]'
+        "uint8",
+        "uint16",
+        "uint32",
+        "uint64",
+        "int8",
+        "int16",
+        "int32",
+        "int64",
+        "float32",
+        "float64",
+        "bool",
+        "object",
+        "datetime64[ns]",
+        "datetime64[us]",
+        "datetime64[ms]",
+        "datetime64[s]",
+        "timedelta64[ns]",
+        "timedelta64[us]",
+        "timedelta64[ms]",
+        "timedelta64[s]",
     ]
 
-
     replacements = [
-        'UInt8',
-        'UInt16',
-        'UInt32',
-        'UInt64',
-        'Int8',
-        'Int16',
-        'Int32',
-        'Int64',
-        'Float32',
-        'Float64',
-        'Boolean',
-        'String',
-        'Datetime64NS',
-        'Datetime64US',
-        'Datetime64MS',
-        'Datetime64S',
-        'Timedelta64NS',
-        'Timedelta64US',
-        'Timedelta64MS',
-        'Timedelta64S'
+        "UInt8",
+        "UInt16",
+        "UInt32",
+        "UInt64",
+        "Int8",
+        "Int16",
+        "Int32",
+        "Int64",
+        "Float32",
+        "Float64",
+        "Boolean",
+        "String",
+        "Datetime64NS",
+        "Datetime64US",
+        "Datetime64MS",
+        "Datetime64S",
+        "Timedelta64NS",
+        "Timedelta64US",
+        "Timedelta64MS",
+        "Timedelta64S",
     ]
     for tr, rp in zip(to_replace, replacements):
         string = string.replace(tr, rp)
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index acaab744344..0ca741a5b0c 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -4,10 +4,8 @@
 from collections import defaultdict
 
 from fsspec.core import get_fs_token_paths
-from pyarrow import parquet as pq
+from pyarrow import dataset as ds, parquet as pq
 from pyarrow.compat import guid
-from pyarrow import dataset as ds
-
 
 import cudf
 from cudf._lib import parquet as libparquet
diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
index 1155a1b91ce..ebbbd7b8cd5 100644
--- a/python/cudf/cudf/tests/test_categorical.py
+++ b/python/cudf/cudf/tests/test_categorical.py
@@ -8,8 +8,8 @@
 from cudf.core import DataFrame, Series
 from cudf.core._compat import PANDAS_GE_110
 from cudf.core.index import as_index
-from cudf.tests.utils import assert_eq
 from cudf.core.series import _fix_nullable_dtype_repr
+from cudf.tests.utils import assert_eq
 
 
 @pytest.fixture
diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py
index 7ac4df4e514..438838c9f36 100644
--- a/python/cudf/cudf/tests/test_column.py
+++ b/python/cudf/cudf/tests/test_column.py
@@ -58,7 +58,8 @@ def test_column_offset_and_size(pandas_input, offset, size):
         if col.size > 0:
             assert col.size == (col.children[0].size - 1)
             assert col.size == (
-                (col.children[0].data.size / col.children[0].dtype.itemsize) - 1
+                (col.children[0].data.size / col.children[0].dtype.itemsize)
+                - 1
             )
     else:
         assert col.size == (col.data.size / col.dtype.itemsize)
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index d414185a540..4edcf0955f5 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -17,6 +17,7 @@
 from cudf.core._compat import PANDAS_GE_110
 from cudf.core.column import column
 from cudf.core.dataframe import DataFrame, Series
+from cudf.core.dtypes import Number
 from cudf.tests import utils
 from cudf.tests.utils import (
     ALL_TYPES,
@@ -26,7 +27,6 @@
     does_not_raise,
     gen_rand,
 )
-from cudf.core.dtypes import Number
 
 
 def test_init_via_list_of_tuples():
@@ -3242,8 +3242,8 @@ def test_empty_dataframe_describe():
 
 
 def test_as_column_types():
-    from cudf.core.column import column
     from cudf import Float32Dtype, Float64Dtype, StringDtype
+    from cudf.core.column import column
 
     col = column.as_column(Series([]))
     assert isinstance(col.dtype, Float64Dtype)
diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
index a6d43686812..f043e045a7b 100644
--- a/python/cudf/cudf/tests/test_datetime.py
+++ b/python/cudf/cudf/tests/test_datetime.py
@@ -640,6 +640,7 @@ def test_cudf_to_datetime(data, dayfirst, infer_datetime_format):
 )
 def test_to_datetime_errors(data):
     from cudf.core.series import _fix_nullable_dtype_repr
+
     pd_data = data
     if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)):
         gd_data = cudf.from_pandas(pd_data)
@@ -649,7 +650,9 @@ def test_to_datetime_errors(data):
     try:
         pd.to_datetime(pd_data)
     except Exception as e:
-        with pytest.raises(type(e), match=re.escape(_fix_nullable_dtype_repr(str(e)))):
+        with pytest.raises(
+            type(e), match=re.escape(_fix_nullable_dtype_repr(str(e)))
+        ):
             cudf.to_datetime(gd_data)
     else:
         raise AssertionError("Was expecting `pd.to_datetime` to fail")
diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py
index 9a3f9a285a5..2b6abd951b2 100644
--- a/python/cudf/cudf/tests/test_dtypes.py
+++ b/python/cudf/cudf/tests/test_dtypes.py
@@ -76,50 +76,54 @@ def test_nested_dtype():
     got = dt.element_type
     assert expect == got
 
-@pytest.mark.parametrize('string,dtype', [
-    ('uint8', cudf.UInt8Dtype),
-    ('uint16', cudf.UInt16Dtype),
-    ('uint32', cudf.UInt32Dtype),
-    ('uint64', cudf.UInt64Dtype),
-    ('UInt8', cudf.UInt8Dtype),
-    ('UInt16', cudf.UInt16Dtype),
-    ('UInt32', cudf.UInt32Dtype),
-    ('UInt64', cudf.UInt64Dtype),
-    ('int8', cudf.Int8Dtype),
-    ('int16', cudf.Int16Dtype),
-    ('int32', cudf.Int32Dtype),
-    ('int64', cudf.Int64Dtype),
-    ('Int8', cudf.Int8Dtype),
-    ('Int16', cudf.Int16Dtype),
-    ('Int32', cudf.Int32Dtype),
-    ('Int64', cudf.Int64Dtype),
-    ('int', cudf.Int64Dtype),
-    ('float32', cudf.Float32Dtype),
-    ('float64', cudf.Float64Dtype),
-    ('Float32', cudf.Float32Dtype),
-    ('Float64', cudf.Float64Dtype),
-    ('float', cudf.Float64Dtype),
-    ('bool', cudf.BooleanDtype),
-    ('Boolean', cudf.BooleanDtype),
-    ('string', cudf.StringDtype),
-    ('String', cudf.StringDtype),
-    ('object', cudf.StringDtype),
-    ('datetime64[ns]', cudf.Datetime64NSDtype),
-    ('datetime64[us]', cudf.Datetime64USDtype),
-    ('datetime64[ms]', cudf.Datetime64MSDtype),
-    ('datetime64[s]', cudf.Datetime64SDtype),
-    ('Datetime64NS', cudf.Datetime64NSDtype),
-    ('Datetime64US', cudf.Datetime64USDtype),
-    ('Datetime64MS', cudf.Datetime64MSDtype),
-    ('Datetime64S', cudf.Datetime64SDtype),
-    ('timedelta64[ns]', cudf.Timedelta64NSDtype),
-    ('timedelta64[us]', cudf.Timedelta64USDtype),
-    ('timedelta64[ms]', cudf.Timedelta64MSDtype),
-    ('timedelta64[s]', cudf.Timedelta64SDtype),
-    ('Timedelta64NS', cudf.Timedelta64NSDtype),
-    ('Timedelta64US', cudf.Timedelta64USDtype),
-    ('Timedelta64MS', cudf.Timedelta64MSDtype),
-    ('Timedelta64S', cudf.Timedelta64SDtype),
-    ])
+
+@pytest.mark.parametrize(
+    "string,dtype",
+    [
+        ("uint8", cudf.UInt8Dtype),
+        ("uint16", cudf.UInt16Dtype),
+        ("uint32", cudf.UInt32Dtype),
+        ("uint64", cudf.UInt64Dtype),
+        ("UInt8", cudf.UInt8Dtype),
+        ("UInt16", cudf.UInt16Dtype),
+        ("UInt32", cudf.UInt32Dtype),
+        ("UInt64", cudf.UInt64Dtype),
+        ("int8", cudf.Int8Dtype),
+        ("int16", cudf.Int16Dtype),
+        ("int32", cudf.Int32Dtype),
+        ("int64", cudf.Int64Dtype),
+        ("Int8", cudf.Int8Dtype),
+        ("Int16", cudf.Int16Dtype),
+        ("Int32", cudf.Int32Dtype),
+        ("Int64", cudf.Int64Dtype),
+        ("int", cudf.Int64Dtype),
+        ("float32", cudf.Float32Dtype),
+        ("float64", cudf.Float64Dtype),
+        ("Float32", cudf.Float32Dtype),
+        ("Float64", cudf.Float64Dtype),
+        ("float", cudf.Float64Dtype),
+        ("bool", cudf.BooleanDtype),
+        ("Boolean", cudf.BooleanDtype),
+        ("string", cudf.StringDtype),
+        ("String", cudf.StringDtype),
+        ("object", cudf.StringDtype),
+        ("datetime64[ns]", cudf.Datetime64NSDtype),
+        ("datetime64[us]", cudf.Datetime64USDtype),
+        ("datetime64[ms]", cudf.Datetime64MSDtype),
+        ("datetime64[s]", cudf.Datetime64SDtype),
+        ("Datetime64NS", cudf.Datetime64NSDtype),
+        ("Datetime64US", cudf.Datetime64USDtype),
+        ("Datetime64MS", cudf.Datetime64MSDtype),
+        ("Datetime64S", cudf.Datetime64SDtype),
+        ("timedelta64[ns]", cudf.Timedelta64NSDtype),
+        ("timedelta64[us]", cudf.Timedelta64USDtype),
+        ("timedelta64[ms]", cudf.Timedelta64MSDtype),
+        ("timedelta64[s]", cudf.Timedelta64SDtype),
+        ("Timedelta64NS", cudf.Timedelta64NSDtype),
+        ("Timedelta64US", cudf.Timedelta64USDtype),
+        ("Timedelta64MS", cudf.Timedelta64MSDtype),
+        ("Timedelta64S", cudf.Timedelta64SDtype),
+    ],
+)
 def test_cudf_dtype_string_construction(string, dtype):
     assert type(cudf.dtype(string) == dtype)
diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py
index 536ab79ddb0..f7e3ba3be94 100644
--- a/python/cudf/cudf/tests/test_joining.py
+++ b/python/cudf/cudf/tests/test_joining.py
@@ -802,7 +802,7 @@ def test_join_empty_table_dtype():
     gright = DataFrame.from_pandas(right)
     pd_merge = left.merge(right, how="left", left_on=["a"], right_on=["b"])
     gd_merge = gleft.merge(gright, how="left", left_on=["a"], right_on=["b"])
-    assert gd_merge['a'].dtype == pd_merge['a'].dtype
+    assert gd_merge["a"].dtype == pd_merge["a"].dtype
 
 
 @pytest.mark.parametrize("how", ["outer", "inner", "left", "right"])
diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py
index 48c6522a378..c10d1879ccf 100644
--- a/python/cudf/cudf/tests/test_numerical.py
+++ b/python/cudf/cudf/tests/test_numerical.py
@@ -1,6 +1,6 @@
-import numpy as np
 import pandas as pd
 import pytest
+
 import cudf
 from cudf import Series
 from cudf.tests.utils import assert_eq
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index cf926f39da2..27a075c80f7 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -5,20 +5,19 @@
 from glob import glob
 from io import BytesIO
 from string import ascii_letters
-from packaging import version
 
 import numpy as np
 import pandas as pd
 import pyarrow as pa
 import pytest
+from packaging import version
 from pyarrow import parquet as pq
 
 import cudf
 from cudf.io.parquet import ParquetWriter, merge_parquet_filemetadata
+from cudf.tests import dataset_generator as dg
 from cudf.tests.utils import assert_eq
 
-import cudf.tests.dataset_generator as dg
-
 
 @pytest.fixture(scope="module")
 def datadir(datadir):
@@ -381,6 +380,7 @@ def test_parquet_read_filtered_everything(tmpdir):
     assert isinstance(df_filtered["x"].dtype, cudf.Int64Dtype)
     assert isinstance(df_filtered["y"].dtype, cudf.StringDtype)
 
+
 def test_parquet_read_filtered_multiple_files(tmpdir):
     # Generate data
     fname_0 = tmpdir.join("filtered_multiple_files_0.parquet")
diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py
index 115570fb8fe..57b72caffa3 100644
--- a/python/cudf/cudf/tests/test_repr.py
+++ b/python/cudf/cudf/tests/test_repr.py
@@ -6,10 +6,10 @@
 import pandas as pd
 import pytest
 from hypothesis import given, settings, strategies as st
-from cudf.core.series import _fix_nullable_dtype_repr
 
 import cudf
 from cudf.core._compat import PANDAS_GE_110
+from cudf.core.series import _fix_nullable_dtype_repr
 from cudf.tests import utils
 from cudf.utils.dtypes import cudf_dtypes_to_pandas_dtypes
 
@@ -49,6 +49,7 @@ def test_null_series(nrows, dtype):
             str(sr._column.default_na_value()) + "\n", "<NA>\n"
         )
     from cudf.core.series import _fix_nullable_dtype_repr
+
     # todo: this is kind of self-fulfilling since this is what is
     # called inside _repr_ as well
     psrepr = _fix_nullable_dtype_repr(psrepr)
@@ -203,12 +204,9 @@ def test_mixed_dataframe(mixed_pdf, mixed_gdf):
 
 def test_mixed_series(mixed_pdf, mixed_gdf):
     for col in mixed_gdf.columns:
-        try:
-            assert mixed_gdf[col].__repr__() == _fix_nullable_dtype_repr(mixed_pdf[col].__repr__())
-        except:
-            import pdb
-            pdb.set_trace()
-
+        assert mixed_gdf[col].__repr__() == _fix_nullable_dtype_repr(
+            mixed_pdf[col].__repr__()
+        )
 
 def test_MI():
     gdf = cudf.DataFrame(
@@ -582,7 +580,9 @@ def test_series_null_index_repr(sr, pandas_special_case):
         # Whereas cudf is consistent with strings `null` values
         # to be printed as `None` everywhere.
         actual_repr = gsr.__repr__().replace("None", "<NA>")
-    assert _fix_nullable_dtype_repr(expected_repr).split() == actual_repr.split()
+    assert (
+        _fix_nullable_dtype_repr(expected_repr).split() == actual_repr.split()
+    )
 
 
 @pytest.mark.parametrize(
@@ -622,7 +622,9 @@ def test_timedelta_series_s_us_repr(data, dtype):
     psr = sr.to_pandas()
 
     expected = (
-        psr.__repr__().replace("timedelta64[ns]", str(sr.dtype)).replace("NaT", "<NA>")
+        psr.__repr__()
+        .replace("timedelta64[ns]", str(sr.dtype))
+        .replace("NaT", "<NA>")
     )
     actual = sr.__repr__()
 
diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py
index e0d35f2eb5c..9ae5c17da47 100644
--- a/python/cudf/cudf/tests/test_setitem.py
+++ b/python/cudf/cudf/tests/test_setitem.py
@@ -143,7 +143,7 @@ def test_series_set_equal_length_object_by_mask(replace_data):
     # Lengths match in trivial case
     pd_bool_col = pd.Series([True] * len(psr))
     gd_bool_col = Series.from_pandas(pd_bool_col)
-    
+
     psr[pd_bool_col] = (
         replace_data.to_pandas()
         if hasattr(replace_data, "to_pandas")
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index 8e2e2585c27..99ee4878f11 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -156,7 +156,7 @@ def test_string_repr(ps_gs, item):
 
     if got_out is not None and len(got_out) > 1:
         expect = expect.replace("None", "<NA>")
-    expect = expect.replace('object', 'String')
+    expect = expect.replace("object", "String")
 
     assert expect == got
 
diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py
index 634517368d5..564271c217f 100644
--- a/python/cudf/cudf/tests/test_timedelta.py
+++ b/python/cudf/cudf/tests/test_timedelta.py
@@ -976,8 +976,7 @@ def test_timedelta_invalid_ops():
         with pytest.raises(
             TypeError,
             match=re.escape(
-                f"Addition of {sr.dtype} with Int64 "
-                f"cannot be performed."
+                f"Addition of {sr.dtype} with Int64 " f"cannot be performed."
             ),
         ):
             sr + 1
@@ -990,8 +989,7 @@ def test_timedelta_invalid_ops():
         with pytest.raises(
             TypeError,
             match=re.escape(
-                f"Addition of {sr.dtype} with String "
-                f"cannot be performed."
+                f"Addition of {sr.dtype} with String " f"cannot be performed."
             ),
         ):
             sr + "a"
@@ -1021,8 +1019,7 @@ def test_timedelta_invalid_ops():
         with pytest.raises(
             TypeError,
             match=re.escape(
-                f"Modulus of {sr.dtype} with String "
-                f"cannot be performed."
+                f"Modulus of {sr.dtype} with String " f"cannot be performed."
             ),
         ):
             sr % "a"
@@ -1158,13 +1155,16 @@ def test_timedelta_invalid_ops():
 
 def test_timedelta_datetime_cast_invalid():
     from cudf.core.series import _fix_nullable_dtype_repr
+
     sr = cudf.Series([1, 2, 3], dtype="timedelta64[ns]")
     psr = sr.to_pandas()
 
     try:
         psr.astype("datetime64[ns]")
     except TypeError as e:
-        with pytest.raises(type(e), match=re.escape(_fix_nullable_dtype_repr(e.__str__()))):
+        with pytest.raises(
+            type(e), match=re.escape(_fix_nullable_dtype_repr(e.__str__()))
+        ):
             sr.astype("datetime64[ns]")
     else:
         raise AssertionError("Expected timedelta to datetime typecast to fail")
@@ -1175,7 +1175,9 @@ def test_timedelta_datetime_cast_invalid():
     try:
         psr.astype("timedelta64[ns]")
     except TypeError as e:
-        with pytest.raises(type(e), match=re.escape(_fix_nullable_dtype_repr(e.__str__()))):
+        with pytest.raises(
+            type(e), match=re.escape(_fix_nullable_dtype_repr(e.__str__()))
+        ):
             sr.astype("timedelta64[ns]")
     else:
         raise AssertionError("Expected datetime to timedelta typecast to fail")
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 4959a4a5419..aaf37b635a2 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -1,4 +1,5 @@
 import datetime as dt
+import inspect
 import numbers
 from collections import namedtuple
 from collections.abc import Sequence
@@ -8,13 +9,10 @@
 import pandas as pd
 import pyarrow as pa
 from pandas.core.dtypes.common import infer_dtype_from_object
-from pandas.core.dtypes.dtypes import CategoricalDtype, CategoricalDtypeType
 
 import cudf
 from cudf._lib.scalar import Scalar
 from cudf.api.types import is_categorical_dtype
-import inspect
-
 
 _NA_REP = "<NA>"
 _np_pa_dtypes = {
@@ -68,7 +66,19 @@
 }
 OTHER_TYPES = {"bool", "category", "str"}
 ALL_TYPES = NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | OTHER_TYPES
-NEW_NUMERIC_TYPES = {'Int8', 'Int16', 'Int32', 'Int64', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'Float32', 'Float64'}
+NEW_NUMERIC_TYPES = {
+    "Int8",
+    "Int16",
+    "Int32",
+    "Int64",
+    "UInt8",
+    "UInt16",
+    "UInt32",
+    "UInt64",
+    "Float32",
+    "Float64",
+}
+
 
 def np_to_pa_dtype(dtype):
     """Util to convert numpy dtype to PyArrow dtype.
@@ -110,23 +120,6 @@ def numeric_normalize_types(*args):
     dtype = np.result_type(*[a.dtype.to_numpy for a in args])
     return [a.astype(dtype) for a in args]
 
-
-def is_numerical_dtype(obj):
-    if is_categorical_dtype(obj):
-        return False
-    if is_list_dtype(obj):
-        return False
-    return (
-        np.issubdtype(obj, np.bool_)
-        or np.issubdtype(obj, np.floating)
-        or np.issubdtype(obj, np.signedinteger)
-    )
-
-
-def is_string_dtype(obj):
-    return pd.api.types.is_string_dtype(obj) and not is_categorical_dtype(obj)
-
-
 def is_datetime_dtype(obj):
     if obj is None:
         return False
@@ -134,6 +127,7 @@ def is_datetime_dtype(obj):
         return False
     return "M8" in obj.str
 
+
 def cudf_dtype_from_pydata_dtype(dtype):
     """ Given a numpy or pandas dtype, converts it into the equivalent cuDF
         Python dtype.
@@ -151,7 +145,7 @@ def cudf_dtype_from_pydata_dtype(dtype):
         dtype = np.datetime64
 
     result = cudf.dtype(infer_dtype_from_object(dtype))
-    if isinstance(result, cudf.Generic): 
+    if isinstance(result, cudf.Generic):
         return result.__class__
     elif inspect.isclass(result):
         return result
@@ -193,7 +187,7 @@ def to_cudf_compatible_scalar(val, dtype=None):
     if isinstance(val, (np.ndarray, cp.ndarray)) and val.ndim == 0:
         val = val.item()
 
-    if ((dtype is None) and isinstance(val, str)) or is_string_dtype(dtype):
+    if ((dtype is None) and isinstance(val, str)) or cudf.api.types.is_string_dtype(dtype):
         dtype = "str"
 
     if isinstance(val, dt.datetime):
diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py
index 7af5e6f8caf..c8193c7226a 100644
--- a/python/cudf/cudf/utils/utils.py
+++ b/python/cudf/cudf/utils/utils.py
@@ -349,7 +349,9 @@ def time_col_replace_nulls(input_col):
         column.as_column(
             Buffer(
                 np.array(
-                    [input_col.default_na_value()], dtype=input_col.dtype.to_numpy).view("|u1")
+                    [input_col.default_na_value()],
+                    dtype=input_col.dtype.to_numpy,
+                ).view("|u1")
             ),
             dtype=input_col.dtype,
         ),

From 62a7d5bf4d46590273b87ab159f8cf2c9ea2e453 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 8 Sep 2020 18:47:48 -0700
Subject: [PATCH 50/80] bug fixes and type attribute plumbing/iteration

---
 python/cudf/cudf/_lib/reduce.pyx           |  4 +-
 python/cudf/cudf/_lib/scalar.pyx           |  6 +++
 python/cudf/cudf/api/types.py              | 37 +++++++++++----
 python/cudf/cudf/core/column/column.py     |  5 +--
 python/cudf/cudf/core/column/numerical.py  | 16 +++----
 python/cudf/cudf/core/column/string.py     |  4 +-
 python/cudf/cudf/core/dataframe.py         |  2 +-
 python/cudf/cudf/core/dtypes.py            | 18 ++++----
 python/cudf/cudf/core/frame.py             |  4 +-
 python/cudf/cudf/core/index.py             |  4 +-
 python/cudf/cudf/core/indexing.py          |  2 +-
 python/cudf/cudf/core/series.py            | 52 ----------------------
 python/cudf/cudf/tests/test_categorical.py |  2 +-
 python/cudf/cudf/tests/test_dataframe.py   | 26 +++++------
 python/cudf/cudf/tests/test_index.py       |  8 ++--
 python/cudf/cudf/tests/test_string.py      |  2 +-
 python/cudf/cudf/tests/utils.py            | 52 ++++++++++++++++++++++
 17 files changed, 133 insertions(+), 111 deletions(-)

diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx
index ac8065d2d6f..63e3f28b450 100644
--- a/python/cudf/cudf/_lib/reduce.pyx
+++ b/python/cudf/cudf/_lib/reduce.pyx
@@ -48,9 +48,9 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs):
     # check empty case
     if len(incol) <= incol.null_count:
         if reduction_op == 'sum' or reduction_op == 'sum_of_squares':
-            return incol.dtype.type(0)
+            return incol.dtype.type(0).value
         if reduction_op == 'product':
-            return incol.dtype.type(1)
+            return incol.dtype.type(1).value
         return np.nan
 
     with nogil:
diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx
index 773ce54be31..47a0d55816c 100644
--- a/python/cudf/cudf/_lib/scalar.pyx
+++ b/python/cudf/cudf/_lib/scalar.pyx
@@ -93,6 +93,11 @@ cdef class Scalar:
                 f"{type(value).__name__} to cudf scalar"
             )
 
+    def __eq__(self, other):
+        if isinstance(other, Scalar):
+            other = other.value
+        return self.value == other
+
     @property
     def dtype(self):
         """
@@ -352,6 +357,7 @@ cdef _get_np_scalar_from_timedelta64(unique_ptr[scalar]& s):
 
 
 def as_scalar(val, dtype=None):
+    dtype = cudf.dtype(dtype)
     if isinstance(val, Scalar):
         if (dtype is None or dtype == val.dtype):
             return val
diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py
index 732828085b4..484b9f1bfd1 100644
--- a/python/cudf/cudf/api/types.py
+++ b/python/cudf/cudf/api/types.py
@@ -6,37 +6,53 @@
 
 
 def is_bool_dtype(obj):
+    if hasattr(obj, 'dtype'):
+        obj = obj.dtype
     # todo - pd.api.types.is_bool_dtype should not give false, nor work at all probably
     if hasattr(obj, "dtype"):
         obj = obj.dtype
-    return isinstance(obj, cudf.BooleanDtype) or pd.api.types.is_bool_dtype(
+    return isinstance(obj, cudf.BooleanDtype) or (not isinstance(obj, cudf.Generic) and pd.api.types.is_bool_dtype(
         obj
-    )
+    ))
 
 
 def is_datetime64_dtype(obj):
-    return isinstance(obj, cudf.Datetime) or pd.api.types.is_datetime64_dtype(
+    if hasattr(obj, 'dtype'):
+        obj = obj.dtype
+    return isinstance(obj, cudf.Datetime) or (not isinstance(obj, cudf.Generic) and pd.api.types.is_datetime64_dtype(
         obj
-    )
+    ))
 
 
 def is_timedelta64_dtype(obj):
+    if hasattr(obj, 'dtype'):
+        obj = obj.dtype
     return isinstance(
         obj, cudf.Timedelta
-    ) or pd.api.types.is_timedelta64_dtype(obj)
+    ) or (not isinstance(obj, cudf.Generic) and pd.api.types.is_timedelta64_dtype(obj))
 
 
 def is_string_dtype(obj):
-    return isinstance(obj, cudf.StringDtype) or (
+    if hasattr(obj, 'dtype'):
+        obj = obj.dtype
+    return isinstance(obj, cudf.StringDtype) or (not isinstance(obj, cudf.Generic) and (
         pd.api.types.is_string_dtype(obj) and not is_categorical_dtype(obj)
-    )
+    ))
 
 
 def is_integer_dtype(obj):
-    return isinstance(obj, cudf.Integer) or pd.api.types.is_integer_dtype(obj)
+    if hasattr(obj, 'dtype'):
+        obj = obj.dtype
+    try:
+        return isinstance(obj, cudf.Integer) or (not isinstance(obj, cudf.Generic) and pd.api.types.is_integer_dtype(obj))
+    except:
+        import pdb
+        pdb.set_trace()
 
 
 def is_numerical_dtype(obj):
+    if hasattr(obj, 'dtype'):
+        obj = obj.dtype
     if isinstance(obj, cudf.Generic):
         return isinstance(obj, (cudf.Number, cudf.BooleanDtype))
     if is_categorical_dtype(obj):
@@ -142,3 +158,10 @@ def result_type(*arrays_and_dtypes):
         for d in arrays_and_dtypes
     )
     return cudf.dtype(np.result_type(*arrays_and_dtypes))
+
+
+def isnan(obj):
+    """Return ``np.isnan`` of ``obj`` (a cudf ``Scalar`` is unwrapped)."""
+    if isinstance(obj, cudf._lib.scalar.Scalar):
+        obj = obj.value
+    return np.isnan(obj)
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 0f5f29913b0..f26c87c08ff 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -642,11 +642,11 @@ def __getitem__(self, arg):
             arg = as_column(arg)
             if len(arg) == 0:
                 arg = as_column([], dtype="int32")
-            if pd.api.types.is_integer_dtype(arg.dtype) or isinstance(
+            if cudf.api.types.is_integer_dtype(arg.dtype) or isinstance(
                 arg.dtype, cudf.Integer
             ):
                 return self.take(arg)
-            if pd.api.types.is_bool_dtype(arg.dtype) or isinstance(
+            if cudf.api.types.is_bool_dtype(arg.dtype) or isinstance(
                 arg.dtype, cudf.BooleanDtype
             ):
                 return self.apply_boolean_mask(arg)
@@ -1616,7 +1616,6 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
                 data=buffer, mask=mask, dtype=arbitrary.dtype
             )
         elif arb_dtype.kind in ("O", "U"):
-
             pa_data = pa.Array.from_pandas(arbitrary)
             data = as_column(pa_data, dtype=cudf.dtype(pa_data.type))
             # There is no cast operation available for pa.Array from int to
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 2fa16c8458d..e56f87aac21 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -1,6 +1,6 @@
 # Copyright (c) 2018-2020, NVIDIA CORPORATION.
 import numpy as np
-from pandas.api.types import is_integer_dtype
+from cudf.api.types import is_integer_dtype
 
 import cudf
 from cudf import _lib as libcudf
@@ -175,12 +175,7 @@ def as_numerical_column(self, dtype, **kwargs):
         return libcudf.unary.cast(self, dtype)
 
     def sum(self, dtype=None):
-        try:
-            return libcudf.reduce.reduce("sum", self, dtype=dtype)
-        except:
-            import pdb
-
-            pdb.set_trace()
+        return libcudf.reduce.reduce("sum", self, dtype=dtype)
 
     def product(self, dtype=None):
         return libcudf.reduce.reduce("product", self, dtype=dtype)
@@ -236,7 +231,7 @@ def default_na_value(self):
         """
         dkind = self.dtype.kind
         if dkind == "f":
-            return self.dtype.type(np.nan)
+            return self.dtype.type(np.nan).value
         elif dkind == "i":
             return np.iinfo(self.dtype.to_numpy).min
         elif dkind == "u":
@@ -280,7 +275,8 @@ def fillna(self, fill_value):
         """
         Fill null values with *fill_value*
         """
-        if np.isscalar(fill_value):
+
+        if np.isscalar(fill_value) and not isinstance(fill_value, libcudf.scalar.Scalar):
             # castsafely to the same dtype as self
             # TODO - produce a libcudf scalar directly
             fill_value_casted = self.dtype.to_numpy.type(fill_value)
@@ -291,6 +287,8 @@ def fillna(self, fill_value):
                     )
                 )
             fill_value = fill_value_casted
+        elif isinstance(fill_value, libcudf.scalar.Scalar):
+            fill_value = libcudf.scalar.as_scalar(fill_value, dtype=self.dtype)
         else:
             fill_value = column.as_column(fill_value, nan_as_null=False)
             # cast safely to the same dtype as self
@@ -471,7 +469,7 @@ def _normalize_find_and_replace_input(input_column_dtype, col_to_normalize):
             col_to_normalize_casted = input_column_dtype.type(
                 col_to_normalize[0]
             )
-            if not np.isnan(col_to_normalize_casted) and (
+            if not cudf.api.types.isnan(col_to_normalize_casted) and (
                 col_to_normalize_casted != col_to_normalize[0]
             ):
                 raise TypeError(
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 224211d5b2d..3641781c07b 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -4567,7 +4567,7 @@ def as_numerical_column(self, dtype, **kwargs):
         )
         kwargs.update(dtype=out_dtype)
 
-        if out_dtype.type is np.datetime64:
+        if isinstance(out_dtype, cudf.Datetime):
             if "format" not in kwargs:
                 if len(self) > 0:
                     # infer on host from the first not na element
@@ -4586,7 +4586,7 @@ def as_numerical_column(self, dtype, **kwargs):
                 raise ValueError("Could not convert `None` value to datetime")
 
             boolean_match = self.binary_operator("eq", "NaT")
-        elif out_dtype.type is np.timedelta64:
+        elif isinstance(out_dtype, cudf.Timedelta):
             if "format" not in kwargs:
                 if len(self) > 0:
                     kwargs.update(format="%D days %H:%M:%S")
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 0f59bda56a5..71f91432612 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -4521,7 +4521,7 @@ def _sizeof_fmt(num, size_qualifier):
             else:
                 deep = False
                 if (
-                    "String" in dtype_counts
+                    "string" in dtype_counts
                     or self.index.dtype == cudf.StringDtype()
                 ):
                     size_qualifier = "+"
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 360de63efae..ac55b34afc4 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -15,15 +15,15 @@ class Generic(ExtensionDtype, _Dtype):
     pa_type = None
 
     def __eq__(self, other):
-        if isinstance(other, self.__class__):
+        if isinstance(other, type(self)):
             return True
         if isinstance(other, Generic) and not isinstance(
-            other, self.__class__
+            other, type(self)
         ):
             return False
         if (
-            isinstance(other, self.to_pandas.__class__)
-            or other is self.to_pandas.__class__
+            isinstance(other, type(self.to_pandas))
+            or other is type(self.to_pandas)
         ):
             return True
 
@@ -338,12 +338,10 @@ def dtype(obj):
         return cudf.Float64Dtype()
     elif obj is None:
         return None
+    elif obj is np.object:
+        return 
     else:
-
-        raise TypeError
-
-        # raise TypeError(f"Could not find a cuDF dtype matching {obj}")
-
+        raise TypeError(f"Could not find cuDF dtype matching {obj}")
 
 class CategoricalDtype(Generic):
     def __init__(self, categories=None, ordered=None):
@@ -413,7 +411,7 @@ def __eq__(self, other):
             return other == self.name
         elif other is self:
             return True
-        elif not isinstance(other, self.__class__):
+        elif not isinstance(other, type(self)):
             return False
         elif self.ordered != other.ordered:
             return False
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 31de28f408f..8792dccba85 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -486,7 +486,7 @@ def _get_columns_by_index(self, indices):
         )
 
     def _gather(self, gather_map, keep_index=True):
-        if not pd.api.types.is_integer_dtype(gather_map.dtype):
+        if not cudf.api.types.is_integer_dtype(gather_map.dtype):
             gather_map = gather_map.astype("int32")
         result = self.__class__._from_table(
             libcudf.copying.gather(
@@ -3142,7 +3142,7 @@ def _get_replacement_values(to_replace, replacement, col_name, column):
             if all_nan:
                 replacement = [replacement] * len(to_replace)
             # Do not broadcast numeric dtypes
-            elif pd.api.types.is_numeric_dtype(column.dtype):
+            elif cudf.api.types.is_numerical_dtype(column.dtype):
                 if len(to_replace) > 0:
                     replacement = [replacement]
                 else:
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 31afdf10a33..a3ce9d8cb19 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -725,8 +725,8 @@ def append(self, other):
                     )
                     raise TypeError(
                         f"cudf does not support appending an Index of "
-                        f"dtype `{np.dtype('object')}` with an Index "
-                        f"of dtype `{got_dtype}`, please type-cast "
+                        f"dtype `{self.dtype}` with an Index "
+                        f"of dtype `{other.dtype}`, please type-cast "
                         f"either one of them to same dtypes."
                     )
 
diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py
index 83dfc0cb768..ef5ca3d6341 100755
--- a/python/cudf/cudf/core/indexing.py
+++ b/python/cudf/cudf/core/indexing.py
@@ -89,7 +89,7 @@ def __setitem__(self, key, value):
         else:
             value = column.as_column(value)
 
-        if hasattr(value, "dtype") and pd.api.types.is_numeric_dtype(
+        if hasattr(value, "dtype") and cudf.api.types.is_numerical_dtype(
             value.dtype
         ):
             # normalize types if necessary:
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 8804aff2e38..8efc6c3a0a1 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -5186,55 +5186,3 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False):
         result_col[equal_nulls] = True
 
     return Series(result_col, index=index)
-
-
-def _fix_nullable_dtype_repr(string):
-
-    to_replace = [
-        "uint8",
-        "uint16",
-        "uint32",
-        "uint64",
-        "int8",
-        "int16",
-        "int32",
-        "int64",
-        "float32",
-        "float64",
-        "bool",
-        "object",
-        "datetime64[ns]",
-        "datetime64[us]",
-        "datetime64[ms]",
-        "datetime64[s]",
-        "timedelta64[ns]",
-        "timedelta64[us]",
-        "timedelta64[ms]",
-        "timedelta64[s]",
-    ]
-
-    replacements = [
-        "UInt8",
-        "UInt16",
-        "UInt32",
-        "UInt64",
-        "Int8",
-        "Int16",
-        "Int32",
-        "Int64",
-        "Float32",
-        "Float64",
-        "Boolean",
-        "String",
-        "Datetime64NS",
-        "Datetime64US",
-        "Datetime64MS",
-        "Datetime64S",
-        "Timedelta64NS",
-        "Timedelta64US",
-        "Timedelta64MS",
-        "Timedelta64S",
-    ]
-    for tr, rp in zip(to_replace, replacements):
-        string = string.replace(tr, rp)
-    return string
diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
index ebbbd7b8cd5..3ccd8a2e85c 100644
--- a/python/cudf/cudf/tests/test_categorical.py
+++ b/python/cudf/cudf/tests/test_categorical.py
@@ -71,7 +71,7 @@ def test_categorical_integer():
 3 c
 4 a
 dtype: category
-Categories (3, String): ['a', 'b', 'c']
+Categories (3, string): ['a', 'b', 'c']
 """
     assert string.split() == expect_str.split()
 
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 4edcf0955f5..c2429504764 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -5576,8 +5576,8 @@ def test_dataframe_info_verbose_mem_usage():
      #   Column  Non-Null Count  Dtype
     ---  ------  --------------  -----
      0   a       3 non-null      Int64
-     1   b       3 non-null      String
-    dtypes: Int64(1), String(1)
+     1   b       3 non-null      string
+    dtypes: Int64(1), string(1)
     memory usage: 56.0+ bytes
     """
     )
@@ -5593,7 +5593,7 @@ def test_dataframe_info_verbose_mem_usage():
     <class 'cudf.core.dataframe.DataFrame'>
     RangeIndex: 3 entries, 0 to 2
     Columns: 2 entries, a to b
-    dtypes: Int64(1), String(1)
+    dtypes: Int64(1), string(1)
     memory usage: 56.0+ bytes
     """
     )
@@ -5616,8 +5616,8 @@ def test_dataframe_info_verbose_mem_usage():
      #   Column  Non-Null Count  Dtype
     ---  ------  --------------  -----
      0   a       3 non-null      Int64
-     1   b       3 non-null      String
-    dtypes: Int64(1), String(1)
+     1   b       3 non-null      string
+    dtypes: Int64(1), string(1)
     memory usage: 91.0 bytes
     """
     )
@@ -5647,9 +5647,9 @@ def test_dataframe_info_verbose_mem_usage():
      #   Column     Non-Null Count  Dtype
     ---  ------     --------------  -----
      0   int_col    5 non-null      Int64
-     1   text_col   5 non-null      String
+     1   text_col   5 non-null      string
      2   float_col  5 non-null      Float64
-    dtypes: Float64(1), Int64(1), String(1)
+    dtypes: Float64(1), Int64(1), string(1)
     memory usage: 130.0 bytes
     """
     )
@@ -5682,9 +5682,9 @@ def test_dataframe_info_null_counts():
      #   Column     Dtype
     ---  ------     -----
      0   int_col    Int64
-     1   text_col   String
+     1   text_col   string
      2   float_col  Float64
-    dtypes: Float64(1), Int64(1), String(1)
+    dtypes: Float64(1), Int64(1), string(1)
     memory usage: 130.0+ bytes
     """
     )
@@ -5732,8 +5732,8 @@ def test_dataframe_info_null_counts():
      #   Column  Dtype
     ---  ------  -----
      0   a       Int64
-     1   b       String
-    dtypes: Int64(1), String(1)
+     1   b       string
+    dtypes: Int64(1), string(1)
     memory usage: 238.0+ bytes
     """
     )
@@ -5754,8 +5754,8 @@ def test_dataframe_info_null_counts():
      #   Column  Non-Null Count  Dtype
     ---  ------  --------------  -----
      0   a       6 non-null      Int64
-     1   b       6 non-null      String
-    dtypes: Int64(1), String(1)
+     1   b       6 non-null      string
+    dtypes: Int64(1), string(1)
     memory usage: 238.0+ bytes
     """
     )
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index 463970eef8c..a28d10bd758 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -923,8 +923,8 @@ def test_index_append_error(data, other):
         TypeError,
         match=re.escape(
             f"cudf does not support appending an Index of "
-            f"dtype `{np.dtype('object')}` with an Index "
-            f"of dtype `{got_dtype}`, please type-cast "
+            f"dtype `{gd_data.dtype}` with an Index "
+            f"of dtype `{gd_other.dtype}`, please type-cast "
             f"either one of them to same dtypes."
         ),
     ):
@@ -934,8 +934,8 @@ def test_index_append_error(data, other):
         TypeError,
         match=re.escape(
             f"cudf does not support appending an Index of "
-            f"dtype `{np.dtype('object')}` with an Index "
-            f"of dtype `{got_dtype}`, please type-cast "
+            f"dtype `{gd_other.dtype}` with an Index "
+            f"of dtype `{gd_data.dtype}`, please type-cast "
             f"either one of them to same dtypes."
         ),
     ):
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index 99ee4878f11..3575a61503b 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -156,7 +156,7 @@ def test_string_repr(ps_gs, item):
 
     if got_out is not None and len(got_out) > 1:
         expect = expect.replace("None", "<NA>")
-    expect = expect.replace("object", "String")
+    expect = expect.replace("object", "string")
 
     assert expect == got
 
diff --git a/python/cudf/cudf/tests/utils.py b/python/cudf/cudf/tests/utils.py
index dea444dd560..f5646f67dde 100644
--- a/python/cudf/cudf/tests/utils.py
+++ b/python/cudf/cudf/tests/utils.py
@@ -152,6 +152,58 @@ def gen_rand_series(dtype, size, **kwargs):
     return cudf.Series(values)
 
 
+def _fix_nullable_dtype_repr(string):
+    """Map the dtype names in ``string`` to the replacements listed below."""
+    to_replace = [
+        "uint8",
+        "uint16",
+        "uint32",
+        "uint64",
+        "int8",
+        "int16",
+        "int32",
+        "int64",
+        "float32",
+        "float64",
+        "bool",
+        "object",
+        "datetime64[ns]",
+        "datetime64[us]",
+        "datetime64[ms]",
+        "datetime64[s]",
+        "timedelta64[ns]",
+        "timedelta64[us]",
+        "timedelta64[ms]",
+        "timedelta64[s]",
+    ]
+
+    replacements = [
+        "UInt8",
+        "UInt16",
+        "UInt32",
+        "UInt64",
+        "Int8",
+        "Int16",
+        "Int32",
+        "Int64",
+        "Float32",
+        "Float64",
+        "boolean",
+        "string",
+        "Datetime64NS",
+        "Datetime64US",
+        "Datetime64MS",
+        "Datetime64S",
+        "Timedelta64NS",
+        "Timedelta64US",
+        "Timedelta64MS",
+        "Timedelta64S",
+    ]
+    for tr, rp in zip(to_replace, replacements):
+        string = string.replace(tr, rp)
+    return string
+
+
 @contextmanager
 def does_not_raise():
     yield

From 80baff4f805f48fda88261c13d87bc562fe179c9 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 9 Sep 2020 08:00:19 -0700
Subject: [PATCH 51/80] fix repr and move around testing utilities

---
 python/cudf/cudf/core/series.py            |  4 +++-
 python/cudf/cudf/tests/test_categorical.py |  3 +--
 python/cudf/cudf/tests/test_repr.py        | 16 +++++++---------
 3 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 8efc6c3a0a1..8448f67f618 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -1023,11 +1023,13 @@ def __repr__(self):
         else:
             output = preprocess.to_pandas().__repr__()
 
-        output = _fix_nullable_dtype_repr(output)
         lines = output.split("\n")
 
         if isinstance(preprocess._column, cudf.core.column.CategoricalColumn):
             category_memory = lines[-1]
+            to_replace = str(self.dtype.categories.dtype.to_numpy)
+            replacement = str(self.dtype.categories.dtype.name)
+            category_memory = category_memory.replace(to_replace, replacement)
             lines = lines[:-1]
         if len(lines) > 1:
             if lines[-1].startswith("Name: "):
diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
index 3ccd8a2e85c..1577c2b39f2 100644
--- a/python/cudf/cudf/tests/test_categorical.py
+++ b/python/cudf/cudf/tests/test_categorical.py
@@ -8,8 +8,7 @@
 from cudf.core import DataFrame, Series
 from cudf.core._compat import PANDAS_GE_110
 from cudf.core.index import as_index
-from cudf.core.series import _fix_nullable_dtype_repr
-from cudf.tests.utils import assert_eq
+from cudf.tests.utils import assert_eq,  _fix_nullable_dtype_repr
 
 
 @pytest.fixture
diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py
index 57b72caffa3..f8226aada39 100644
--- a/python/cudf/cudf/tests/test_repr.py
+++ b/python/cudf/cudf/tests/test_repr.py
@@ -9,7 +9,6 @@
 
 import cudf
 from cudf.core._compat import PANDAS_GE_110
-from cudf.core.series import _fix_nullable_dtype_repr
 from cudf.tests import utils
 from cudf.utils.dtypes import cudf_dtypes_to_pandas_dtypes
 
@@ -48,11 +47,10 @@ def test_null_series(nrows, dtype):
         psrepr = psrepr.replace(
             str(sr._column.default_na_value()) + "\n", "<NA>\n"
         )
-    from cudf.core.series import _fix_nullable_dtype_repr
 
     # todo: this is kind of self-fulfilling since this is what is
     # called inside _repr_ as well
-    psrepr = _fix_nullable_dtype_repr(psrepr)
+    psrepr = utils._fix_nullable_dtype_repr(psrepr)
 
     assert psrepr.split() == sr.__repr__().split()
 
@@ -94,7 +92,7 @@ def test_full_series(nrows, dtype):
     ps = pd.Series(np.random.randint(0, 100, size)).astype(dtype)
     sr = cudf.from_pandas(ps)
     pd.options.display.max_rows = int(nrows)
-    psrepr = _fix_nullable_dtype_repr(ps.__repr__())
+    psrepr = utils._fix_nullable_dtype_repr(ps.__repr__())
     assert psrepr == sr.__repr__()
 
 
@@ -157,7 +155,7 @@ def test_integer_dataframe(x):
 def test_integer_series(x):
     sr = cudf.Series(x)
     ps = pd.Series(x)
-    psrepr = _fix_nullable_dtype_repr(ps.__repr__())
+    psrepr = utils._fix_nullable_dtype_repr(ps.__repr__())
     assert sr.__repr__() == psrepr
 
 
@@ -174,7 +172,7 @@ def test_float_dataframe(x):
 def test_float_series(x):
     sr = cudf.Series(x, nan_as_null=False)
     ps = pd.Series(x)
-    psrepr = _fix_nullable_dtype_repr(ps.__repr__())
+    psrepr = utils._fix_nullable_dtype_repr(ps.__repr__())
     assert sr.__repr__() == psrepr
 
 
@@ -204,7 +202,7 @@ def test_mixed_dataframe(mixed_pdf, mixed_gdf):
 
 def test_mixed_series(mixed_pdf, mixed_gdf):
     for col in mixed_gdf.columns:
-        assert mixed_gdf[col].__repr__() == _fix_nullable_dtype_repr(
+        assert mixed_gdf[col].__repr__() == utils._fix_nullable_dtype_repr(
             mixed_pdf[col].__repr__()
         )
 
@@ -257,7 +255,7 @@ def test_generic_index(length, dtype):
         index=np.random.randint(0, high=100, size=length).astype(dtype),
     )
     gsr = cudf.Series.from_pandas(psr)
-    psrepr = _fix_nullable_dtype_repr(psr.index.__repr__())
+    psrepr = utils._fix_nullable_dtype_repr(psr.index.__repr__())
     assert psrepr == gsr.index.__repr__()
 
 
@@ -581,7 +579,7 @@ def test_series_null_index_repr(sr, pandas_special_case):
         # to be printed as `None` everywhere.
         actual_repr = gsr.__repr__().replace("None", "<NA>")
     assert (
-        _fix_nullable_dtype_repr(expected_repr).split() == actual_repr.split()
+        utils._fix_nullable_dtype_repr(expected_repr).split() == actual_repr.split()
     )
 
 

From 38e11af5f123e18d11a54b1d87aa71781492c760 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 9 Sep 2020 08:08:16 -0700
Subject: [PATCH 52/80] clean up reduce.pyx

---
 python/cudf/cudf/_lib/reduce.pyx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx
index 63e3f28b450..5d5a3f5d2a7 100644
--- a/python/cudf/cudf/_lib/reduce.pyx
+++ b/python/cudf/cudf/_lib/reduce.pyx
@@ -31,11 +31,11 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs):
         A numpy data type to use for the output, defaults
         to the same type as the input column
     """
-
+    dtype = cudf_dtype(dtype)
     col_dtype = incol.dtype
     if reduction_op in ['sum', 'sum_of_squares', 'product']:
         col_dtype = find_common_type([col_dtype], [np.uint64])
-    col_dtype = cudf_dtype(col_dtype) if dtype is None else cudf_dtype(dtype)
+    col_dtype = col_dtype if dtype is None else dtype
 
     cdef column_view c_incol_view = incol.view()
     cdef unique_ptr[scalar] c_result

From 22b299d053c7c642e1bde282c4bb102a720a1960 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 9 Sep 2020 15:50:15 -0700
Subject: [PATCH 53/80] implement cudf::scalar -> cudf.Scalar -> Buffer, column

---
 python/cudf/cudf/_lib/cpp/scalar/scalar.pxd |  6 ++
 python/cudf/cudf/_lib/reduce.pyx            |  6 +-
 python/cudf/cudf/_lib/scalar.pxd            |  1 +
 python/cudf/cudf/_lib/scalar.pyx            | 93 ++++++++++++++++++++-
 python/cudf/cudf/core/buffer.py             | 13 ++-
 python/cudf/cudf/core/column/column.py      |  5 ++
 6 files changed, 116 insertions(+), 8 deletions(-)

diff --git a/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd b/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd
index 3eb11c2bfd0..6b5242b8e08 100644
--- a/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd
+++ b/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd
@@ -23,6 +23,7 @@ cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil:
         numeric_scalar(T value, bool is_valid) except +
         void set_value(T value) except +
         T value() except +
+        T* data() except +
 
     cdef cppclass timestamp_scalar[T](scalar):
         timestamp_scalar() except +
@@ -34,6 +35,8 @@ cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil:
         int64_t ticks_since_epoch_64 "ticks_since_epoch"() except +
         int32_t ticks_since_epoch_32 "ticks_since_epoch"() except +
         T value() except +
+        T* data() except +
+
 
     cdef cppclass duration_scalar[T](scalar):
         duration_scalar() except +
@@ -44,6 +47,8 @@ cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil:
         duration_scalar(int32_t value, bool is_valid) except +
         int64_t ticks "count"() except +
         T value() except +
+        T* data() except +
+
 
     cdef cppclass string_scalar(scalar):
         string_scalar() except +
@@ -51,3 +56,4 @@ cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil:
         string_scalar(string st, bool is_valid) except +
         string_scalar(string_scalar other) except +
         string to_string() except +
+        const char* data() except +
diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx
index 5d5a3f5d2a7..60780abbfb5 100644
--- a/python/cudf/cudf/_lib/reduce.pyx
+++ b/python/cudf/cudf/_lib/reduce.pyx
@@ -48,9 +48,9 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs):
     # check empty case
     if len(incol) <= incol.null_count:
         if reduction_op == 'sum' or reduction_op == 'sum_of_squares':
-            return incol.dtype.type(0).value
+            return incol.dtype.type(0)
         if reduction_op == 'product':
-            return incol.dtype.type(1).value
+            return incol.dtype.type(1)
         return np.nan
 
     with nogil:
@@ -61,7 +61,7 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs):
         ))
 
     py_result = Scalar.from_unique_ptr(move(c_result))
-    return py_result.value
+    return py_result
 
 
 def scan(scan_op, Column incol, inclusive, **kwargs):
diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd
index 34dfea5431a..6c8a2155c98 100644
--- a/python/cudf/cudf/_lib/scalar.pxd
+++ b/python/cudf/cudf/_lib/scalar.pxd
@@ -4,6 +4,7 @@ from libcpp.memory cimport unique_ptr
 from libcpp cimport bool
 
 from cudf._lib.cpp.scalar.scalar cimport scalar
+from libc.stdint cimport uintptr_t
 
 
 cdef class Scalar:
diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx
index 47a0d55816c..b18edcc531e 100644
--- a/python/cudf/cudf/_lib/scalar.pyx
+++ b/python/cudf/cudf/_lib/scalar.pyx
@@ -15,6 +15,7 @@ from libc.stdint cimport (
 )
 from libcpp.memory cimport unique_ptr
 from libcpp cimport bool
+from libc.stdint cimport uintptr_t
 
 import cudf
 from cudf._lib.types import cudf_to_np_types, duration_unit_map
@@ -43,7 +44,6 @@ from cudf._lib.cpp.scalar.scalar cimport (
 )
 cimport cudf._lib.cpp.types as libcudf_types
 
-
 cdef class Scalar:
 
     def __init__(self, value, dtype=None):
@@ -145,6 +145,11 @@ cdef class Scalar:
         cdef Scalar s = Scalar.__new__(Scalar)
         s.c_value = move(ptr)
         return s
+    
+    @property
+    def ptr(self):
+        return _get_ptr_from_scalar_any(self.c_value)
+            
 
 
 cdef _set_string_from_np_string(unique_ptr[scalar]& s, value, bool valid=True):
@@ -365,3 +370,89 @@ def as_scalar(val, dtype=None):
             return Scalar(val.value, dtype)
     else:
         return Scalar(value=val, dtype=dtype)
+
+cdef _get_ptr_from_scalar_any(unique_ptr[scalar]& s):
+    cdef scalar* s_ptr = s.get()
+    if not s_ptr[0].is_valid():
+        return None
+    # Invalid scalars returned None above; dispatch on the libcudf type id.
+    cdef libcudf_types.data_type cdtype = s_ptr[0].type()
+    # Each branch casts to the concrete scalar type and returns data() as int.
+    if cdtype.id() == libcudf_types.INT8:
+        return int(
+            <uintptr_t>(<numeric_scalar[int8_t]*>s_ptr)[0].data()
+        )
+    elif cdtype.id() == libcudf_types.INT16:
+        return int(
+            <uintptr_t>(<numeric_scalar[int16_t]*>s_ptr)[0].data()
+        )
+    elif cdtype.id() == libcudf_types.INT32:
+        return int(
+            <uintptr_t>(<numeric_scalar[int32_t]*>s_ptr)[0].data()
+        )
+    elif cdtype.id() == libcudf_types.INT64:
+        return int(
+            <uintptr_t>(<numeric_scalar[int64_t]*>s_ptr)[0].data()
+        )
+    elif cdtype.id() == libcudf_types.UINT8:
+        return int(
+            <uintptr_t>(<numeric_scalar[uint8_t]*>s_ptr)[0].data()
+        )
+    elif cdtype.id() == libcudf_types.UINT16:
+        return int(
+            <uintptr_t>(<numeric_scalar[uint16_t]*>s_ptr)[0].data()
+        )
+    elif cdtype.id() == libcudf_types.UINT32:
+        return int(
+            <uintptr_t>(<numeric_scalar[uint32_t]*>s_ptr)[0].data()
+        )
+    elif cdtype.id() == libcudf_types.UINT64:
+        return int(
+            <uintptr_t>(<numeric_scalar[uint64_t]*>s_ptr)[0].data()
+        )
+    elif cdtype.id() == libcudf_types.FLOAT32:
+        return int(
+            <uintptr_t>(<numeric_scalar[float]*>s_ptr)[0].data()
+        )
+    elif cdtype.id() == libcudf_types.FLOAT64:
+        return int(
+            <uintptr_t>(<numeric_scalar[double]*>s_ptr)[0].data()
+        )
+    elif cdtype.id() == libcudf_types.BOOL8:
+        return int(
+            <uintptr_t>(<numeric_scalar[bool]*>s_ptr)[0].data()
+        )
+    elif cdtype.id() == libcudf_types.TIMESTAMP_NANOSECONDS:
+        return int(
+            <uintptr_t>(<timestamp_scalar[timestamp_ns]*>s_ptr)[0].data()
+        ) 
+    elif cdtype.id() == libcudf_types.TIMESTAMP_MICROSECONDS:
+        return int(
+            <uintptr_t>(<timestamp_scalar[timestamp_us]*>s_ptr)[0].data()
+        )
+    elif cdtype.id() == libcudf_types.TIMESTAMP_MILLISECONDS:
+        return int(
+            <uintptr_t>(<timestamp_scalar[timestamp_ms]*>s_ptr)[0].data()
+        )
+    elif cdtype.id() == libcudf_types.TIMESTAMP_SECONDS:
+        return int(
+            <uintptr_t>(<timestamp_scalar[timestamp_s]*>s_ptr)[0].data()
+        )
+    elif cdtype.id() == libcudf_types.DURATION_NANOSECONDS:
+        return int(
+            <uintptr_t>(<duration_scalar[duration_ns]*>s_ptr)[0].data()
+        ) 
+    elif cdtype.id() == libcudf_types.DURATION_MICROSECONDS:
+        return int(
+            <uintptr_t>(<duration_scalar[duration_us]*>s_ptr)[0].data()
+        )
+    elif cdtype.id() == libcudf_types.DURATION_MILLISECONDS:
+        return int(
+            <uintptr_t>(<duration_scalar[duration_ms]*>s_ptr)[0].data()
+        )
+    elif cdtype.id() == libcudf_types.DURATION_SECONDS:
+        return int(
+            <uintptr_t>(<duration_scalar[duration_s]*>s_ptr)[0].data()
+        )  
+    else:
+        raise ValueError('Could not get pointer from cudf::scalar')
diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer.py
index 43ef5e42106..c12a08f04f5 100644
--- a/python/cudf/cudf/core/buffer.py
+++ b/python/cudf/cudf/core/buffer.py
@@ -6,7 +6,7 @@
 
 import rmm
 from rmm import DeviceBuffer
-
+import cudf
 from cudf.core.abc import Serializable
 
 
@@ -17,9 +17,10 @@ def __init__(self, data=None, size=None, owner=None):
 
         Parameters
         ----------
-        data : Buffer, array_like, int
-            An array-like object or integer representing a
-            device or host pointer to pre-allocated memory.
+        data : Buffer, array_like, int, Scalar
+            An array-like object, integer, or `Scalar`
+            representing a device or host pointer to
+            pre-allocated memory.
         size : int, optional
             Size of memory allocation. Required if a pointer
             is passed for `data`.
@@ -45,6 +46,10 @@ def __init__(self, data=None, size=None, owner=None):
             self.ptr = data
             self.size = size
             self._owner = owner
+        elif isinstance(data, cudf._lib.scalar.Scalar):
+            self.ptr = data.ptr
+            self.size = data.dtype.itemsize
+            self._owner = data
         elif data is None:
             self.ptr = 0
             self.size = 0
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index f26c87c08ff..a10397253c9 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1616,6 +1616,8 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
                 data=buffer, mask=mask, dtype=arbitrary.dtype
             )
         elif arb_dtype.kind in ("O", "U"):
+            import pdb
+            pdb.set_trace()
             pa_data = pa.Array.from_pandas(arbitrary)
             data = as_column(pa_data, dtype=cudf.dtype(pa_data.type))
             # There is no cast operation available for pa.Array from int to
@@ -1670,6 +1672,9 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
         mask = bools_to_mask(as_column(mask).unary_operator("not"))
 
         data = data.set_mask(mask)
+    elif isinstance(arbitrary, cudf._lib.scalar.Scalar):
+        buffer = Buffer(arbitrary)
+        data = as_column(buffer, dtype=arbitrary.dtype)
     else:
         try:
             data = as_column(

From 1552c0a6a414b0fe0b30ab7d9752cc3651a8afc9 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 10 Sep 2020 16:16:37 -0700
Subject: [PATCH 54/80] minor bugfixes

---
 python/cudf/cudf/tests/test_datetime.py  | 3 +--
 python/cudf/cudf/tests/test_timedelta.py | 4 +---
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
index f043e045a7b..9db1a04ab2a 100644
--- a/python/cudf/cudf/tests/test_datetime.py
+++ b/python/cudf/cudf/tests/test_datetime.py
@@ -13,6 +13,7 @@
 from cudf.core import DataFrame, Series
 from cudf.core.index import DatetimeIndex
 from cudf.tests.utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq
+from cudf.tests.utils import _fix_nullable_dtype_repr
 
 
 def data1():
@@ -639,8 +640,6 @@ def test_cudf_to_datetime(data, dayfirst, infer_datetime_format):
     ],
 )
 def test_to_datetime_errors(data):
-    from cudf.core.series import _fix_nullable_dtype_repr
-
     pd_data = data
     if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)):
         gd_data = cudf.from_pandas(pd_data)
diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py
index 564271c217f..7e4637c63ba 100644
--- a/python/cudf/cudf/tests/test_timedelta.py
+++ b/python/cudf/cudf/tests/test_timedelta.py
@@ -8,7 +8,7 @@
 import pytest
 
 import cudf
-from cudf.tests.utils import assert_eq
+from cudf.tests.utils import assert_eq, _fix_nullable_dtype_repr
 from cudf.utils import dtypes as dtypeutils
 
 _TIMEDELTA_DATA = [
@@ -1154,8 +1154,6 @@ def test_timedelta_invalid_ops():
 
 
 def test_timedelta_datetime_cast_invalid():
-    from cudf.core.series import _fix_nullable_dtype_repr
-
     sr = cudf.Series([1, 2, 3], dtype="timedelta64[ns]")
     psr = sr.to_pandas()
 

From 78caafae4907e0e152031455a2057e5b67562d66 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 10 Sep 2020 16:23:48 -0700
Subject: [PATCH 55/80] add __int__ and __float__ to scalar

---
 python/cudf/cudf/_lib/scalar.pyx | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx
index b18edcc531e..3ae4f18900d 100644
--- a/python/cudf/cudf/_lib/scalar.pyx
+++ b/python/cudf/cudf/_lib/scalar.pyx
@@ -149,7 +149,12 @@ cdef class Scalar:
     @property
     def ptr(self):
         return _get_ptr_from_scalar_any(self.c_value)
-            
+
+    def __int__(self):
+        return int(self.value)
+
+    def __float__(self):
+        return float(self.value)
 
 
 cdef _set_string_from_np_string(unique_ptr[scalar]& s, value, bool valid=True):

From a9fe2fb292d6e42a0893cee66935e5f3b6fa932e Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 11 Sep 2020 08:59:49 -0700
Subject: [PATCH 56/80] partially implement scalar binops

---
 python/cudf/cudf/_lib/scalar.pyx       | 56 +++++++++++++++++++++--
 python/cudf/cudf/core/column/column.py |  8 ++--
 python/cudf/cudf/tests/test_scalar.py  | 61 +++++++++++++++++++++++++-
 python/cudf/cudf/utils/dtypes.py       |  2 +-
 4 files changed, 117 insertions(+), 10 deletions(-)

diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx
index 3ae4f18900d..d1f892cd327 100644
--- a/python/cudf/cudf/_lib/scalar.pyx
+++ b/python/cudf/cudf/_lib/scalar.pyx
@@ -43,7 +43,7 @@ from cudf._lib.cpp.scalar.scalar cimport (
     string_scalar
 )
 cimport cudf._lib.cpp.types as libcudf_types
-
+from cudf.utils.dtypes import to_cudf_compatible_scalar
 cdef class Scalar:
 
     def __init__(self, value, dtype=None):
@@ -61,7 +61,7 @@ cdef class Scalar:
             A NumPy dtype.
         """
 
-        value = cudf.utils.dtypes.to_cudf_compatible_scalar(value, dtype=dtype)
+        value = to_cudf_compatible_scalar(value, dtype=dtype)
 
         valid = value is not None
 
@@ -133,7 +133,7 @@ cdef class Scalar:
 
     def __repr__(self):
         if self.value is None:
-            return f"Scalar({self.value}, {self.dtype.__repr__()})"
+            return f"Scalar(<NA>, {self.dtype.__repr__()})"
         else:
             return f"Scalar({self.value.__repr__()})"
 
@@ -156,6 +156,56 @@ cdef class Scalar:
     def __float__(self):
         return float(self.value)
 
+    def __add__(self, other):
+        return self._scalar_binop(other, '__add__')
+
+    def __sub__(self, other):
+        return self._scalar_binop(other, '__sub__')
+
+    def __mul__(self, other):
+        return self._scalar_binop(other, '__mul__')
+
+    def __div__(self, other):
+        return self._scalar_binop(other, '__div__')
+
+    def __mod__(self, other):
+        return self._scalar_binop(other, '__mod__')
+
+    def __divmod__(self, other):
+        return self._scalar_binop(other, '__divmod__')
+
+    def __and__(self, other):
+        return self._scalar_binop(other, '__and__')
+
+    def __xor__(self, other):
+        return self._scalar_binop(other, '__or__')
+
+    def _binop_result_dtype_or_error(self, other):
+
+        if (self.dtype.kind == 'O' and other.dtype.kind != 'O') or (self.dtype.kind != 'O' and other.dtype.kind == 'O'):
+            wrong_dtype = self.dtype if self.dtype.kind != 'O' else other.dtype
+            raise TypeError(f"Can only concatenate string (not {wrong_dtype}) to string")
+
+
+        return cudf.api.types.find_common_type([
+            self.dtype, other.dtype
+        ])
+
+    def _scalar_binop(self, other, op):
+        other = to_cudf_compatible_scalar(other)
+        out_dtype = self._binop_result_dtype_or_error(other)
+
+        valid = self.is_valid() and (isinstance(other, np.generic) or other.is_valid())
+        if not valid:
+            return cudf.Scalar(None, dtype=out_dtype)
+        else:
+            result = self._dispatch_scalar_binop(other, op)
+            return Scalar(result, dtype=out_dtype)
+
+    def _dispatch_scalar_binop(self, other, op):
+        if isinstance(other, Scalar):
+            other = other.value
+        return getattr(self.value, op)(other)
 
 cdef _set_string_from_np_string(unique_ptr[scalar]& s, value, bool valid=True):
     value = value if valid else ""
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index a10397253c9..61037d36905 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1616,8 +1616,6 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
                 data=buffer, mask=mask, dtype=arbitrary.dtype
             )
         elif arb_dtype.kind in ("O", "U"):
-            import pdb
-            pdb.set_trace()
             pa_data = pa.Array.from_pandas(arbitrary)
             data = as_column(pa_data, dtype=cudf.dtype(pa_data.type))
             # There is no cast operation available for pa.Array from int to
@@ -1672,9 +1670,9 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
         mask = bools_to_mask(as_column(mask).unary_operator("not"))
 
         data = data.set_mask(mask)
-    elif isinstance(arbitrary, cudf._lib.scalar.Scalar):
-        buffer = Buffer(arbitrary)
-        data = as_column(buffer, dtype=arbitrary.dtype)
+    #elif isinstance(arbitrary, cudf._lib.scalar.Scalar):
+    #    buffer = Buffer(arbitrary)
+    #    data = as_column(buffer, dtype=arbitrary.dtype)
     else:
         try:
             data = as_column(
diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py
index c8fb5e40d23..09fea834f59 100644
--- a/python/cudf/cudf/tests/test_scalar.py
+++ b/python/cudf/cudf/tests/test_scalar.py
@@ -7,7 +7,7 @@
 
 from cudf._lib.scalar import Scalar
 from cudf.tests.utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES
-
+import operator
 
 @pytest.mark.parametrize(
     "value",
@@ -144,3 +144,62 @@ def test_date_duration_scalars(value):
 
     np.testing.assert_equal(actual, expected)
     assert s.is_valid() is True
+
+@pytest.mark.parametrize('pairs', [
+    (1, 1),
+    (1, 1.5),
+    (-1.5, 1),
+    (1, 'a'),
+    ('a', 'b'),
+    (1.5, 'a'),
+    (1, False),
+    (False, True),
+    (1.5, False),
+    (True, 1.5),
+    ('a', False),
+])
+@pytest.mark.parametrize('dtype_l', [
+    np.dtype('uint8'),
+    np.dtype('uint16'),
+    np.dtype('uint32'),
+    np.dtype('uint64'),
+    np.dtype('int8'),
+    np.dtype('int16'),
+    np.dtype('int32'),
+    np.dtype('int64'),
+    np.dtype('float32'),
+    np.dtype('float64'),
+    np.dtype('bool'),
+    np.dtype('object')
+])
+@pytest.mark.parametrize('dtype_r', [
+    np.dtype('uint8'),
+    np.dtype('uint16'),
+    np.dtype('uint32'),
+    np.dtype('uint64'),
+    np.dtype('int8'),
+    np.dtype('int16'),
+    np.dtype('int32'),
+    np.dtype('int64'),
+    np.dtype('float32'),
+    np.dtype('float64'),
+    np.dtype('bool'),
+    np.dtype('object')
+])
+@pytest.mark.parametrize('op', [
+    operator.add,
+    operator.sub,
+    operator.mul,
+])
+def test_scalar_binops_value(pairs, dtype_l, dtype_r, op):
+    l, r = pairs
+    host_value_l = dtype_l.type(l)
+    host_value_r = dtype_r.type(r)
+
+    gpu_value_l = Scalar(l)
+    gpu_value_r = Scalar(r)
+
+    expect = op(host_value_l, host_value_r)
+    got = op(gpu_value_l, gpu_value_r)
+
+    assert expect == got.value
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index aaf37b635a2..26946f685a9 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -175,7 +175,7 @@ def to_cudf_compatible_scalar(val, dtype=None):
 
     If `val` is None, returns None.
     """
-    if val is None:
+    if val is None or isinstance(val, cudf._lib.scalar.Scalar):
         return val
 
     if not is_scalar(val):

From 455af02b92fea7e67b70e418f2324b342bb497c4 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 11 Sep 2020 13:03:30 -0700
Subject: [PATCH 57/80] partial tests for scalar binop result dtype

---
 python/cudf/cudf/_lib/scalar.pyx      |  2 +-
 python/cudf/cudf/tests/test_scalar.py | 78 +++++++++++++++++++++++----
 2 files changed, 70 insertions(+), 10 deletions(-)

diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx
index d1f892cd327..83e49f79cf2 100644
--- a/python/cudf/cudf/_lib/scalar.pyx
+++ b/python/cudf/cudf/_lib/scalar.pyx
@@ -197,7 +197,7 @@ cdef class Scalar:
 
         valid = self.is_valid() and (isinstance(other, np.generic) or other.is_valid())
         if not valid:
-            return cudf.Scalar(None, dtype=out_dtype)
+            return Scalar(None, dtype=out_dtype)
         else:
             result = self._dispatch_scalar_binop(other, op)
             return Scalar(result, dtype=out_dtype)
diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py
index 09fea834f59..3ca4b9f7a18 100644
--- a/python/cudf/cudf/tests/test_scalar.py
+++ b/python/cudf/cudf/tests/test_scalar.py
@@ -170,7 +170,7 @@ def test_date_duration_scalars(value):
     np.dtype('float32'),
     np.dtype('float64'),
     np.dtype('bool'),
-    np.dtype('object')
+    np.dtype('str')
 ])
 @pytest.mark.parametrize('dtype_r', [
     np.dtype('uint8'),
@@ -184,7 +184,7 @@ def test_date_duration_scalars(value):
     np.dtype('float32'),
     np.dtype('float64'),
     np.dtype('bool'),
-    np.dtype('object')
+    np.dtype('str')
 ])
 @pytest.mark.parametrize('op', [
     operator.add,
@@ -193,13 +193,73 @@ def test_date_duration_scalars(value):
 ])
 def test_scalar_binops_value(pairs, dtype_l, dtype_r, op):
     l, r = pairs
-    host_value_l = dtype_l.type(l)
-    host_value_r = dtype_r.type(r)
+    import re
+    try:
+        host_value_l = dtype_l.type(l)
+    except ValueError as e:
+        with pytest.raises(ValueError, match=re.escape(str(e))):
+            gpu_value_l = Scalar(l, dtype=dtype_l)
+        return
+    try:
+        host_value_r = dtype_r.type(r)
+    except ValueError as e:
+        with pytest.raises(ValueError, match=re.escape(str(e))):
+            gpu_value_r = Scalar(r, dtype=dtype_r)
+        return
 
-    gpu_value_l = Scalar(l)
-    gpu_value_r = Scalar(r)
-
-    expect = op(host_value_l, host_value_r)
+    gpu_value_l = Scalar(l, dtype=dtype_l)
+    gpu_value_r = Scalar(r, dtype=dtype_r)
+    try:
+        expect = op(host_value_l, host_value_r)
+    except np.core._exceptions.UFuncTypeError:
+        with pytest.raises(TypeError):
+            got = op(gpu_value_l, gpu_value_r)
+        return
     got = op(gpu_value_l, gpu_value_r)
-
     assert expect == got.value
+
+
+@pytest.mark.parametrize('dtype_l', [
+    np.dtype('uint8'),
+    np.dtype('uint16'),
+    np.dtype('uint32'),
+    np.dtype('uint64'),
+    np.dtype('int8'),
+    np.dtype('int16'),
+    np.dtype('int32'),
+    np.dtype('int64'),
+    np.dtype('float32'),
+    np.dtype('float64'),
+])
+@pytest.mark.parametrize('dtype_r', [
+    np.dtype('uint8'),
+    np.dtype('uint16'),
+    np.dtype('uint32'),
+    np.dtype('uint64'),
+    np.dtype('int8'),
+    np.dtype('int16'),
+    np.dtype('int32'),
+    np.dtype('int64'),
+    np.dtype('float32'),
+    np.dtype('float64'),
+])
+@pytest.mark.parametrize('op', [
+    operator.add,
+    operator.sub,
+    operator.mul,
+])
+@pytest.mark.parametrize('l_valid', [True, False])
+@pytest.mark.parametrize('r_valid', [True, False])
+def test_scalar_binops_dtype_and_validity(dtype_l, dtype_r, l_valid, r_valid, op):
+    l_value = 0 if l_valid else None
+    r_value = 0 if r_valid else None
+
+    expect_dtype = op(dtype_l.type(0), dtype_r.type(0)).dtype
+
+    scalar_l = Scalar(l_value, dtype=dtype_l)
+    scalar_r = Scalar(r_value, dtype=dtype_r)
+
+    got = op(scalar_l, scalar_r)
+
+    assert got.dtype == expect_dtype
+    assert got.is_valid() == (l_valid and r_valid)

From e4c0bf1105556ac3f3d215c39f753cba5be5c027 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Sun, 13 Sep 2020 16:47:46 -0700
Subject: [PATCH 58/80] scalar binop updates

---
 python/cudf/cudf/_lib/scalar.pyx | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx
index 83e49f79cf2..0abf4ded84f 100644
--- a/python/cudf/cudf/_lib/scalar.pyx
+++ b/python/cudf/cudf/_lib/scalar.pyx
@@ -180,6 +180,18 @@ cdef class Scalar:
     def __xor__(self, other):
         return self._scalar_binop(other, '__or__')
 
+    def __gt__(self, other):
+        return self._scalar_binop(other, '__gt__').value
+    
+    def __lt__(self, other):
+        return self._scalar_binop(other, '__gt__').value
+
+    def __ge__(self, other):
+        return self._scalar_binop(other, '__ge__').value
+
+    def __le__(self, other):
+        return self._scalar_binop(other, '__le__').value
+
     def _binop_result_dtype_or_error(self, other):
 
         if (self.dtype.kind == 'O' and other.dtype.kind != 'O') or (self.dtype.kind != 'O' and other.dtype.kind == 'O'):
@@ -193,8 +205,11 @@ cdef class Scalar:
 
     def _scalar_binop(self, other, op):
         other = to_cudf_compatible_scalar(other)
-        out_dtype = self._binop_result_dtype_or_error(other)
 
+        if op in  ['__eq__', '__lt__', '__gt__', '__le__', '__ge__']:
+            out_dtype = cudf.BooleanDtype()
+        else: 
+            out_dtype = self._binop_result_dtype_or_error(other)
         valid = self.is_valid() and (isinstance(other, np.generic) or other.is_valid())
         if not valid:
             return Scalar(None, dtype=out_dtype)

From 42828c0ef19bda2c90932fe4b0d9cc1927f9c032 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 14 Sep 2020 06:52:15 -0700
Subject: [PATCH 59/80] convert a list of cudf.Scalars into a contiguous column

---
 python/cudf/cudf/core/column/column.py |  6 +++---
 python/cudf/cudf/core/dataframe.py     | 17 +++++++++++++++++
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 61037d36905..4f2bb2bb4ef 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1670,9 +1670,9 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
         mask = bools_to_mask(as_column(mask).unary_operator("not"))
 
         data = data.set_mask(mask)
-    #elif isinstance(arbitrary, cudf._lib.scalar.Scalar):
-    #    buffer = Buffer(arbitrary)
-    #    data = as_column(buffer, dtype=arbitrary.dtype)
+    elif isinstance(arbitrary, cudf._lib.scalar.Scalar):
+        buffer = Buffer(arbitrary)
+        data = as_column(buffer, dtype=arbitrary.dtype)
     else:
         try:
             data = as_column(
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 71f91432612..7e3c091b51f 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -6261,6 +6261,10 @@ def _apply_support_method(self, method, axis=0, *args, **kwargs):
                 result = DataFrame(index=support_result[0].index)
                 for idx, col in enumerate(self._data.names):
                     result[col] = support_result[idx]
+            elif isinstance(result[0], cudf._lib.scalar.Scalar):
+                result = _gpu_scalars_to_column(result)
+                result = cudf.Series(result)
+                result = result.set_index(self._data.names)
             else:
                 result = Series(result)
                 result = result.set_index(self._data.names)
@@ -7067,3 +7071,16 @@ def _get_union_of_series_names(series_list):
         names_list = [*range(len(series_list))]
 
     return names_list
+
+
+def _gpu_scalars_to_column(list_of_scalars):
+    '''
+    Convert a list of cuDF scalars into a contiguous column
+    '''
+    ind = range(len(list_of_scalars))
+    cols_dict = {
+        k: v for k, v in zip(ind, [as_column(i) for i in list_of_scalars])
+    }
+
+    tbl = DataFrame(cols_dict)
+    return (tbl.T)[0]._column

From 0d3d6a0e08d23308fde92f4ae511c614d118fba1 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 14 Sep 2020 08:45:48 -0700
Subject: [PATCH 60/80] migrate scalar methods to python

---
 python/cudf/cudf/__init__.py           |  1 +
 python/cudf/cudf/_lib/reduce.pyx       |  5 +-
 python/cudf/cudf/_lib/scalar.pyx       | 76 --------------------------
 python/cudf/cudf/core/__init__.py      |  1 +
 python/cudf/cudf/core/column/column.py |  2 +-
 python/cudf/cudf/core/dataframe.py     |  2 +-
 python/cudf/cudf/tests/utils.py        |  4 ++
 7 files changed, 11 insertions(+), 80 deletions(-)

diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
index 6e644cf09be..ff02a9af1fc 100644
--- a/python/cudf/cudf/__init__.py
+++ b/python/cudf/cudf/__init__.py
@@ -32,6 +32,7 @@
     UInt64Index,
     from_pandas,
     merge,
+    Scalar
 )
 from cudf.core.dtypes import (
     BooleanDtype,
diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx
index 60780abbfb5..0d4dd41f1ca 100644
--- a/python/cudf/cudf/_lib/reduce.pyx
+++ b/python/cudf/cudf/_lib/reduce.pyx
@@ -15,6 +15,7 @@ from libcpp.memory cimport unique_ptr
 import numpy as np
 from cudf.core.dtypes import dtype as cudf_dtype
 from cudf.api.types import find_common_type
+from cudf.core.scalar import Scalar as PyScalar
 
 
 def reduce(reduction_op, Column incol, dtype=None, **kwargs):
@@ -60,8 +61,8 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs):
             c_out_dtype
         ))
 
-    py_result = Scalar.from_unique_ptr(move(c_result))
-    return py_result
+    cy_result = Scalar.from_unique_ptr(move(c_result))
+    return PyScalar(cy_result)
 
 
 def scan(scan_op, Column incol, inclusive, **kwargs):
diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx
index 0abf4ded84f..93259438869 100644
--- a/python/cudf/cudf/_lib/scalar.pyx
+++ b/python/cudf/cudf/_lib/scalar.pyx
@@ -93,11 +93,6 @@ cdef class Scalar:
                 f"{type(value).__name__} to cudf scalar"
             )
 
-    def __eq__(self, other):
-        if isinstance(other, Scalar):
-            other = other.value
-        return self.value == other
-
     @property
     def dtype(self):
         """
@@ -150,77 +145,6 @@ cdef class Scalar:
     def ptr(self):
         return _get_ptr_from_scalar_any(self.c_value)
 
-    def __int__(self):
-        return int(self.value)
-
-    def __float__(self):
-        return float(self.value)
-
-    def __add__(self, other):
-        return self._scalar_binop(other, '__add__')
-
-    def __sub__(self, other):
-        return self._scalar_binop(other, '__sub__')
-
-    def __mul__(self, other):
-        return self._scalar_binop(other, '__mul__')
-
-    def __div__(self, other):
-        return self._scalar_binop(other, '__div__')
-
-    def __mod__(self, other):
-        return self._scalar_binop(other, '__mod__')
-
-    def __divmod__(self, other):
-        return self._scalar_binop(other, '__divmod__')
-
-    def __and__(self, other):
-        return self._scalar_binop(other, '__and__')
-
-    def __xor__(self, other):
-        return self._scalar_binop(other, '__or__')
-
-    def __gt__(self, other):
-        return self._scalar_binop(other, '__gt__').value
-    
-    def __lt__(self, other):
-        return self._scalar_binop(other, '__gt__').value
-
-    def __ge__(self, other):
-        return self._scalar_binop(other, '__ge__').value
-
-    def __le__(self, other):
-        return self._scalar_binop(other, '__le__').value
-
-    def _binop_result_dtype_or_error(self, other):
-
-        if (self.dtype.kind == 'O' and other.dtype.kind != 'O') or (self.dtype.kind != 'O' and other.dtype.kind == 'O'):
-            wrong_dtype = self.dtype if self.dtype.kind != 'O' else other.dtype
-            raise TypeError(f"Can only concatenate string (not {wrong_dtype}) to string")
-
-
-        return cudf.api.types.find_common_type([
-            self.dtype, other.dtype
-        ])
-
-    def _scalar_binop(self, other, op):
-        other = to_cudf_compatible_scalar(other)
-
-        if op in  ['__eq__', '__lt__', '__gt__', '__le__', '__ge__']:
-            out_dtype = cudf.BooleanDtype()
-        else: 
-            out_dtype = self._binop_result_dtype_or_error(other)
-        valid = self.is_valid() and (isinstance(other, np.generic) or other.is_valid())
-        if not valid:
-            return Scalar(None, dtype=out_dtype)
-        else:
-            result = self._dispatch_scalar_binop(other, op)
-            return Scalar(result, dtype=out_dtype)
-
-    def _dispatch_scalar_binop(self, other, op):
-        if isinstance(other, Scalar):
-            other = other.value
-        return getattr(self.value, op)(other)
 
 cdef _set_string_from_np_string(unique_ptr[scalar]& s, value, bool valid=True):
     value = value if valid else ""
diff --git a/python/cudf/cudf/core/__init__.py b/python/cudf/cudf/core/__init__.py
index d30f949f72c..6fd61c0240f 100644
--- a/python/cudf/cudf/core/__init__.py
+++ b/python/cudf/cudf/core/__init__.py
@@ -22,3 +22,4 @@
 )
 from cudf.core.multiindex import MultiIndex
 from cudf.core.series import Series
+from cudf.core.scalar import Scalar
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 4f2bb2bb4ef..ef03a66ef73 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1670,7 +1670,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
         mask = bools_to_mask(as_column(mask).unary_operator("not"))
 
         data = data.set_mask(mask)
-    elif isinstance(arbitrary, cudf._lib.scalar.Scalar):
+    elif isinstance(arbitrary, cudf.Scalar):
         buffer = Buffer(arbitrary)
         data = as_column(buffer, dtype=arbitrary.dtype)
     else:
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 7e3c091b51f..1e68fe8b57b 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -6261,7 +6261,7 @@ def _apply_support_method(self, method, axis=0, *args, **kwargs):
                 result = DataFrame(index=support_result[0].index)
                 for idx, col in enumerate(self._data.names):
                     result[col] = support_result[idx]
-            elif isinstance(result[0], cudf._lib.scalar.Scalar):
+            elif isinstance(result[0], cudf.Scalar):
                 result = _gpu_scalars_to_column(result)
                 result = cudf.Series(result)
                 result = result.set_index(self._data.names)
diff --git a/python/cudf/cudf/tests/utils.py b/python/cudf/cudf/tests/utils.py
index f5646f67dde..def40067acf 100644
--- a/python/cudf/cudf/tests/utils.py
+++ b/python/cudf/cudf/tests/utils.py
@@ -93,6 +93,10 @@ def assert_eq(left, right, **kwargs):
         else:
             assert np.array_equal(left, right)
     else:
+        if isinstance(left, cudf._lib.scalar.Scalar):
+            left = left.value
+        if isinstance(right, cudf._lib.scalar.Scalar):
+            right = right.value
         if left == right:
             return True
         else:

From 63e1387f3ab927bbc9ab2d523a1b99d25d549ec2 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 14 Sep 2020 12:55:25 -0700
Subject: [PATCH 61/80] actually include scalar.py and update tests

---
 python/cudf/cudf/core/scalar.py       | 114 ++++++++++++++++++++++++++
 python/cudf/cudf/tests/test_scalar.py |  24 ++++--
 2 files changed, 129 insertions(+), 9 deletions(-)
 create mode 100644 python/cudf/cudf/core/scalar.py

diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py
new file mode 100644
index 00000000000..3a620880229
--- /dev/null
+++ b/python/cudf/cudf/core/scalar.py
@@ -0,0 +1,114 @@
+# Copyright (c) 2020, NVIDIA CORPORATION.
+import cudf._lib as libcudf
+from cudf.utils.dtypes import to_cudf_compatible_scalar
+from cudf.core.dtypes import BooleanDtype
+from cudf.api.types import find_common_type
+import numpy as np
+
+class Scalar(libcudf.scalar.Scalar):
+    def __init__(self, value, dtype=None):
+        if isinstance(value, libcudf.scalar.Scalar):
+            if dtype and not value.dtype == dtype:
+                raise TypeError
+            self._data = value
+        else:
+            self._data = libcudf.scalar.Scalar(value, dtype=dtype)
+
+    @property
+    def value(self):
+        return self._data.value
+
+    @property
+    def ptr(self):
+        return self._data.ptr
+
+    @property
+    def dtype(self):
+        return self._data.dtype
+
+    @property
+    def is_valid(self):
+        return self._data.is_valid
+
+    def __int__(self):
+        return int(self.value)
+
+    def __float__(self):
+        return float(self.value)
+
+    def __bool__(self):
+        return bool(self.value)
+
+    def __add__(self, other):
+        return self._scalar_binop(other, "__add__")
+
+    def __sub__(self, other):
+        return self._scalar_binop(other, "__sub__")
+
+    def __mul__(self, other):
+        return self._scalar_binop(other, "__mul__")
+
+    def __truediv__(self, other):
+        return self._scalar_binop(other, "__truediv__")
+
+    def __mod__(self, other):
+        return self._scalar_binop(other, "__mod__")
+
+    def __divmod__(self, other):
+        return self._scalar_binop(other, "__divmod__")
+
+    def __and__(self, other):
+        return self._scalar_binop(other, "__and__")
+
+    def __xor__(self, other):
+        return self._scalar_binop(other, "__or__")
+
+    def __gt__(self, other):
+        return self._scalar_binop(other, "__gt__").value
+
+    def __lt__(self, other):
+        return self._scalar_binop(other, "__lt__").value
+
+    def __ge__(self, other):
+        return self._scalar_binop(other, "__ge__").value
+
+    def __le__(self, other):
+        return self._scalar_binop(other, "__le__").value
+
+    def __eq__(self, other):
+        return self._scalar_binop(other, '__eq__').value
+
+    def _binop_result_dtype_or_error(self, other, op):
+
+        if (self.dtype.kind == "O" and other.dtype.kind != "O") or (
+            self.dtype.kind != "O" and other.dtype.kind == "O"
+        ):
+            wrong_dtype = self.dtype if self.dtype.kind != "O" else other.dtype
+            raise TypeError(
+                f"Can only concatenate string (not {wrong_dtype}) to string"
+            )
+        if (self.dtype.kind == "O" or other.dtype.kind == "O") and op != "__add__":
+            raise TypeError(f"{op} is not supported for string type scalars")
+
+        return find_common_type([self.dtype, other.dtype])
+
+    def _scalar_binop(self, other, op):
+        other = to_cudf_compatible_scalar(other)
+
+        if op in ["__eq__", "__lt__", "__gt__", "__le__", "__ge__"]:
+            out_dtype = BooleanDtype()
+        else:
+            out_dtype = self._binop_result_dtype_or_error(other, op)
+        valid = self.is_valid() and (
+            isinstance(other, np.generic) or other.is_valid()
+        )
+        if not valid:
+            return Scalar(None, dtype=out_dtype)
+        else:
+            result = self._dispatch_scalar_binop(other, op)
+            return Scalar(result, dtype=out_dtype)
+
+    def _dispatch_scalar_binop(self, other, op):
+        if isinstance(other, Scalar):
+            other = other.value
+        return getattr(self.value, op)(other)
diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py
index 3ca4b9f7a18..04cbe501f2e 100644
--- a/python/cudf/cudf/tests/test_scalar.py
+++ b/python/cudf/cudf/tests/test_scalar.py
@@ -5,7 +5,7 @@
 import pandas as pd
 import pytest
 
-from cudf._lib.scalar import Scalar
+from cudf import Scalar
 from cudf.tests.utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES
 import operator
 
@@ -190,28 +190,34 @@ def test_date_duration_scalars(value):
     operator.add,
     operator.sub,
     operator.mul,
+    operator.truediv
 ])
 def test_scalar_binops_value(pairs, dtype_l, dtype_r, op):
-    l, r = pairs
+    lval, rval = pairs
+    if (isinstance(lval, str) and dtype_l != np.dtype('str')) or (isinstance(rval, str) and dtype_r != np.dtype('str')):
+        pytest.skip("Invalid scalar/dtype combination")
+
+
+
     import re
     try:
-        host_value_l = dtype_l.type(l)
+        host_value_l = dtype_l.type(lval)
     except ValueError as e:
         with pytest.raises(ValueError, match=re.escape(str(e))):
-            gpu_value_l = Scalar(l, dtype=dtype_l)
+            gpu_value_l = Scalar(lval, dtype=dtype_l)
         return
     try:
-        host_value_r = dtype_r.type(r)
+        host_value_r = dtype_r.type(rval)
     except ValueError as e:
         with pytest.raises(ValueError, match=re.escape(str(e))):
-            gpu_value_r = Scalar(r, dtype=dtype_r)
+            gpu_value_r = Scalar(rval, dtype=dtype_r)
         return
 
-    gpu_value_l = Scalar(l, dtype=dtype_l)
-    gpu_value_r = Scalar(r, dtype=dtype_r)
+    gpu_value_l = Scalar(lval, dtype=dtype_l)
+    gpu_value_r = Scalar(rval, dtype=dtype_r)
     try:
         expect = op(host_value_l, host_value_r)
-    except np.core._exceptions.UFuncTypeError:
+    except TypeError:
         with pytest.raises(TypeError):
             got = op(gpu_value_l, gpu_value_r)
         return

From 2005d65f73af51169af6cd83b0c3b331f6db3937 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 14 Sep 2020 13:21:08 -0700
Subject: [PATCH 62/80] fix the rest of test_reductions.py

---
 python/cudf/cudf/core/scalar.py           | 18 ++++++++++++++++++
 python/cudf/cudf/tests/test_reductions.py |  4 ++--
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py
index 3a620880229..92f0f3ac269 100644
--- a/python/cudf/cudf/core/scalar.py
+++ b/python/cudf/cudf/core/scalar.py
@@ -42,15 +42,27 @@ def __bool__(self):
     def __add__(self, other):
         return self._scalar_binop(other, "__add__")
 
+    def __radd__(self, other):
+        return self._scalar_binop(other, '__radd__')
+
     def __sub__(self, other):
         return self._scalar_binop(other, "__sub__")
 
+    def __rsub__(self, other):
+        return self._scalar_binop(other, "__rsub__")
+
     def __mul__(self, other):
         return self._scalar_binop(other, "__mul__")
 
+    def __rmul__(self, other):
+        return self._scalar_binop(other, "__rmul__")
+
     def __truediv__(self, other):
         return self._scalar_binop(other, "__truediv__")
 
+    def __rtruediv__(self, other):
+        return self._scalar_binop(other, "__rtruediv__")
+
     def __mod__(self, other):
         return self._scalar_binop(other, "__mod__")
 
@@ -78,6 +90,9 @@ def __le__(self, other):
     def __eq__(self, other):
         return self._scalar_binop(other, '__eq__').value
 
+    def __abs__(self):
+        return self._scalar_unaop('__abs__')
+
     def _binop_result_dtype_or_error(self, other, op):
 
         if (self.dtype.kind == "O" and other.dtype.kind != "O") or (
@@ -112,3 +127,6 @@ def _dispatch_scalar_binop(self, other, op):
         if isinstance(other, Scalar):
             other = other.value
         return getattr(self.value, op)(other)
+
+    def _scalar_unaop(self, op):
+        return Scalar(getattr(self.value, op)())
diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py
index 9da3af36763..88ffa9a036f 100644
--- a/python/cudf/cudf/tests/test_reductions.py
+++ b/python/cudf/cudf/tests/test_reductions.py
@@ -81,7 +81,7 @@ def test_sum_of_squares(dtype, nelem):
 
     if np.dtype(dtype).kind in {"u", "i"}:
         if 0 <= expect <= np.iinfo(dtype).max:
-            np.testing.assert_array_almost_equal(expect, got)
+            np.testing.assert_array_almost_equal(expect, got.value)
         else:
             print("overflow, passing")
     else:
@@ -130,7 +130,7 @@ def test_sum_masked(nelem):
     expect = data[res_mask].sum()
 
     significant = 4 if dtype == np.float32 else 6
-    np.testing.assert_approx_equal(expect, got, significant=significant)
+    np.testing.assert_approx_equal(expect, got.value, significant=significant)
 
 
 def test_sum_boolean():

From 523919cef1eb1da226e16ce661fb3ec7fc1654c1 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 14 Sep 2020 13:52:35 -0700
Subject: [PATCH 63/80] fix indexing error

---
 python/cudf/cudf/core/column/column.py | 2 +-
 python/cudf/cudf/core/dtypes.py        | 2 +-
 python/cudf/cudf/core/scalar.py        | 5 ++++-
 python/cudf/cudf/utils/cudautils.py    | 2 +-
 4 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index ef03a66ef73..9537371bb64 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -600,7 +600,7 @@ def element_indexing(self, index):
 
     def __getitem__(self, arg):
 
-        if isinstance(arg, Number):
+        if isinstance(arg, (Number, cudf.Scalar)):
             arg = int(arg)
             return self.element_indexing(arg)
         elif isinstance(arg, slice):
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index ac55b34afc4..b6b343d04f3 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -268,7 +268,7 @@ def __init__(self, parent_dtype):
         self.parent_dtype = parent_dtype
 
     def __call__(self, arg):
-        return cudf._lib.scalar.Scalar(arg, dtype=self.parent_dtype)
+        return cudf.Scalar(arg, dtype=self.parent_dtype)
 
 
 def cudf_dtype_from_string(obj):
diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py
index 92f0f3ac269..43bf24deee6 100644
--- a/python/cudf/cudf/core/scalar.py
+++ b/python/cudf/cudf/core/scalar.py
@@ -90,6 +90,9 @@ def __le__(self, other):
     def __eq__(self, other):
         return self._scalar_binop(other, '__eq__').value
 
+    def __ne__(self, other):
+        return self._scalar_binop(other, "__ne__").value
+
     def __abs__(self):
         return self._scalar_unaop('__abs__')
 
@@ -110,7 +113,7 @@ def _binop_result_dtype_or_error(self, other, op):
     def _scalar_binop(self, other, op):
         other = to_cudf_compatible_scalar(other)
 
-        if op in ["__eq__", "__lt__", "__gt__", "__le__", "__ge__"]:
+        if op in ["__eq__", "__ne__", "__lt__", "__gt__", "__le__", "__ge__"]:
             out_dtype = BooleanDtype()
         else:
             out_dtype = self._binop_result_dtype_or_error(other, op)
diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py
index d1abf981a19..c8fb0b7a2ec 100755
--- a/python/cudf/cudf/utils/cudautils.py
+++ b/python/cudf/cudf/utils/cudautils.py
@@ -186,7 +186,7 @@ def find_first(arr, val, mask=None, compare="eq"):
     found_col = found_col.find_and_replace([arr.size], [None], True)
 
     min_index = found_col.min()
-    return -1 if min_index is None or np.isnan(min_index) else min_index
+    return -1 if min_index is None or cudf.api.types.isnan(min_index) else min_index
 
 
 def find_last(arr, val, mask=None, compare="eq"):

From c730301423ffd04459c723d10729307f63f56817 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 14 Sep 2020 14:35:04 -0700
Subject: [PATCH 64/80] fix as_scalar

---
 python/cudf/cudf/_lib/scalar.pyx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx
index 93259438869..ead3097f3b4 100644
--- a/python/cudf/cudf/_lib/scalar.pyx
+++ b/python/cudf/cudf/_lib/scalar.pyx
@@ -357,6 +357,8 @@ cdef _get_np_scalar_from_timedelta64(unique_ptr[scalar]& s):
 
 def as_scalar(val, dtype=None):
     dtype = cudf.dtype(dtype)
+    if isinstance(val, cudf.Scalar):
+        return as_scalar(val._data, dtype=dtype)
     if isinstance(val, Scalar):
         if (dtype is None or dtype == val.dtype):
             return val

From c5450c2d588f7ca02628a74e6759447ae957659d Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 14 Sep 2020 15:01:58 -0700
Subject: [PATCH 65/80] remove unnecessary code

---
 python/cudf/cudf/core/dtypes.py | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index b6b343d04f3..99528c6bde6 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -80,28 +80,19 @@ def __init__(self):
 
 
 class Integer(Number):
-    def __init__(self):
-        self._raise_construction_error()
-
+    pass
 
 class SignedInteger(Integer):
-    def __init__(self):
-        self._raise_construction_error()
-
+    pass
 
 class UnsignedInteger(Integer):
-    def __init__(self):
-        self._raise_construction_error()
-
+    pass
 
 class Inexact(Number):
-    def __init__(self):
-        self._raise_construction_error()
-
+    pass
 
 class Floating(Inexact):
-    def __init__(self):
-        self._raise_construction_error()
+    pass
 
     @property
     def kind(self):

From 7bc08936103fb3c35cf1de36db9cdb2a1488c3bc Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 14 Sep 2020 15:02:15 -0700
Subject: [PATCH 66/80] minor bugfixes

---
 python/cudf/cudf/utils/cudautils.py | 2 +-
 python/cudf/cudf/utils/utils.py     | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py
index c8fb0b7a2ec..1c3f4e773c6 100755
--- a/python/cudf/cudf/utils/cudautils.py
+++ b/python/cudf/cudf/utils/cudautils.py
@@ -207,7 +207,7 @@ def find_last(arr, val, mask=None, compare="eq"):
     found_col = found_col.find_and_replace([arr.size], [None], True)
 
     max_index = found_col.max()
-    return -1 if max_index is None or np.isnan(max_index) else max_index
+    return -1 if max_index is None or cudf.api.types.isnan(max_index) else max_index
 
 
 @cuda.jit
diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py
index c8193c7226a..35efdf94260 100644
--- a/python/cudf/cudf/utils/utils.py
+++ b/python/cudf/cudf/utils/utils.py
@@ -74,17 +74,17 @@ def scalar_broadcast_to(scalar, size, dtype=None):
     if isinstance(scalar, pd.Categorical):
         return scalar_broadcast_to(scalar.categories[0], size).astype(dtype)
 
-    scalar = to_cudf_compatible_scalar(scalar, dtype=dtype)
+    scalar = cudf.Scalar(to_cudf_compatible_scalar(scalar, dtype=dtype))
     dtype = scalar.dtype
 
-    if np.dtype(dtype).kind in ("O", "U"):
+    if dtype.kind in ("O", "U"):
         gather_map = column.full(size, 0, dtype="int32")
         scalar_str_col = column.as_column([scalar], dtype="str")
         return scalar_str_col[gather_map]
     else:
         out_col = column.column_empty(size, dtype=dtype)
         if out_col.size != 0:
-            out_col.data_array_view[:] = scalar
+            out_col.data_array_view[:] = scalar.value
         return out_col
 
 

From a3a48934761203d86391b2b08f6e3a4b1a98bca8 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 15 Sep 2020 11:58:59 -0700
Subject: [PATCH 67/80] scalar plumbing, cudf.api.types additions, bug fixes

---
 python/cudf/cudf/_lib/copying.pyx         |  2 +-
 python/cudf/cudf/api/types.py             | 33 ++++++++++++++++++-----
 python/cudf/cudf/core/column/numerical.py | 22 ++++++++-------
 python/cudf/cudf/core/scalar.py           |  3 +++
 python/cudf/cudf/utils/dtypes.py          | 16 +++++------
 python/cudf/cudf/utils/utils.py           |  2 +-
 6 files changed, 52 insertions(+), 26 deletions(-)

diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx
index cab42bce789..9174d611166 100644
--- a/python/cudf/cudf/_lib/copying.pyx
+++ b/python/cudf/cudf/_lib/copying.pyx
@@ -204,7 +204,7 @@ def _scatter_scalar(scalars, Column scatter_map,
     cdef bool c_bounds_check = bounds_check
     cdef Scalar slr
     for val, col in zip(scalars, target_table._columns):
-        slr = as_scalar(val, col.dtype.to_numpy)
+        slr = as_scalar(val, col.dtype)
         source_scalars.push_back(move(slr.c_value))
     cdef column_view scatter_map_view = scatter_map.view()
     cdef table_view target_table_view = target_table.data_view()
diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py
index 484b9f1bfd1..b82f6ac6145 100644
--- a/python/cudf/cudf/api/types.py
+++ b/python/cudf/cudf/api/types.py
@@ -142,13 +142,15 @@ def find_common_type(array_types=[], scalar_types=[]):
     return cudf.dtype(np.find_common_type(array_types, scalar_types))
 
 
-def can_cast(dtype_l, dtype_r):
-    if isinstance(dtype_l, cudf.Generic):
-        dtype_l = dtype_l.to_numpy
-    if isinstance(dtype_r, cudf.Generic):
-        dtype_r = dtype_r.to_numpy
+def can_cast(from_, to, casting='safe'):
+    if isinstance(from_, cudf.Generic):
+        from_ = from_.to_numpy
+    elif isinstance(from_, cudf.Scalar):
+        from_ = from_.value
+    if isinstance(to, cudf.Generic):
+        to = to.to_numpy
 
-    return np.can_cast(dtype_l, dtype_r)
+    return np.can_cast(from_, to, casting=casting)
 
 
 def result_type(*arrays_and_dtypes):
@@ -163,3 +165,22 @@ def isnan(obj):
     if isinstance(obj, cudf._lib.scalar.Scalar):
         obj = obj.value
     return np.isnan(obj)
+
+def min_scalar_type(a):
+    if isinstance(a, cudf.Scalar):
+        a = a.value
+    result = np.min_scalar_type(a)
+    if result == np.dtype('float16'):
+        return cudf.Float32Dtype()
+    return cudf.dtype(result)
+
+def promote_types(type1, type2):
+    if isinstance(type1, cudf.Generic):
+        type1 = type1.to_numpy
+    if isinstance(type2, cudf.Generic):
+        type2 = type2.to_numpy
+
+    result = np.promote_types(type1, type2)
+    if result == np.dtype('float16'):
+        return cudf.Float32Dtype()
+    return cudf.dtype(result)
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index e56f87aac21..45fc042a416 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -51,15 +51,17 @@ def __contains__(self, item):
         # Handles improper item types
         # Fails if item is of type None, so the handler.
         try:
-            if np.can_cast(item, self.data_array_view.dtype):
-                item = self.data_array_view.dtype.type(item)
+            if cudf.api.types.can_cast(item, self.dtype):
+                if isinstance(item, cudf.Scalar):
+                    item = item.value
+                item = cudf.Scalar(item, dtype=self.dtype)
             else:
                 return False
         except (TypeError, ValueError):
             return False
         # TODO: Use `scalar`-based `contains` wrapper
         return libcudf.search.contains(
-            self, column.as_column([item], dtype=self.dtype)
+            self, column.as_column(item, dtype=self.dtype)
         ).any()
 
     def unary_operator(self, unaryop):
@@ -105,16 +107,16 @@ def _apply_scan_op(self, op):
     def normalize_binop_value(self, other):
         if other is None:
             return other
-        other_dtype = np.min_scalar_type(other)
+        other_dtype = cudf.api.types.min_scalar_type(other)
         if other_dtype.kind in {"b", "i", "u", "f"}:
-            other_dtype = np.promote_types(self.dtype.to_numpy, other_dtype)
+            other_dtype = cudf.api.types.promote_types(self.dtype, other_dtype)
             if other_dtype == np.dtype("float16"):
                 other = np.dtype("float32").type(other)
                 other_dtype = other.dtype
             if self.dtype.kind == "b":
                 other_dtype = min_signed_type(other)
-            if np.isscalar(other):
-                other = np.dtype(other_dtype).type(other)
+            if np.isscalar(other) or isinstance(other, cudf.Scalar):
+                other = cudf.Scalar(other, dtype=other_dtype)
                 return other
             else:
                 ary = utils.scalar_broadcast_to(
@@ -169,9 +171,8 @@ def as_numerical_column(self, dtype, **kwargs):
         if dtype == self.dtype:
             return self
         if dtype is None:
-            import pdb
-
-            pdb.set_trace()
+            # dtype = None can cause segfault here
+            raise TypeError('libcudf.unary.cast requires a dtype')
         return libcudf.unary.cast(self, dtype)
 
     def sum(self, dtype=None):
@@ -455,6 +456,7 @@ def _safe_cast_to_int(col, dtype):
 
 
 def _normalize_find_and_replace_input(input_column_dtype, col_to_normalize):
+
     normalized_column = column.as_column(
         col_to_normalize,
         dtype=input_column_dtype if len(col_to_normalize) <= 0 else None,
diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py
index 43bf24deee6..ddf2b3f5211 100644
--- a/python/cudf/cudf/core/scalar.py
+++ b/python/cudf/cudf/core/scalar.py
@@ -75,6 +75,9 @@ def __and__(self, other):
     def __xor__(self, other):
         return self._scalar_binop(other, "__or__")
 
+    def __pow__(self, other):
+        return self._scalar_binop(other, "__pow__")
+
     def __gt__(self, other):
         return self._scalar_binop(other, "__gt__").value
 
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 26946f685a9..1b8a2d28cb2 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -330,25 +330,25 @@ def min_column_type(x, expected_type):
     If the column is not a subtype of `np.signedinteger` or `np.floating`
     returns the same dtype as the dtype of `x` without modification
     """
+
+    expected_type = cudf.dtype(expected_type)
     if not isinstance(x, cudf.core.column.NumericalColumn):
         raise TypeError("Argument x must be of type column.NumericalColumn")
     if x.valid_count == 0:
         return x.dtype
-    x_np_dtype = x.dtype.to_numpy
-    expected_type = cudf.dtype(expected_type).to_numpy
 
-    if np.issubdtype(x_np_dtype, np.floating):
-        max_bound_dtype = np.min_scalar_type(x.max())
-        min_bound_dtype = np.min_scalar_type(x.min())
+    if isinstance(x.dtype, cudf.Floating):
+        max_bound_dtype = np.min_scalar_type(x.max().value)
+        min_bound_dtype = np.min_scalar_type(x.min().value)
         result_type = np.promote_types(max_bound_dtype, min_bound_dtype)
         if result_type == np.dtype("float16"):
             # cuDF does not support float16 dtype
             result_type = np.dtype("float32")
         return cudf.dtype(result_type)
 
-    if np.issubdtype(expected_type, np.integer):
-        max_bound_dtype = np.min_scalar_type(x.max())
-        min_bound_dtype = np.min_scalar_type(x.min())
+    if isinstance(expected_type, cudf.Integer):
+        max_bound_dtype = np.min_scalar_type(x.max().value)
+        min_bound_dtype = np.min_scalar_type(x.min().value)
         result = np.promote_types(max_bound_dtype, min_bound_dtype)
         return cudf.dtype(result)
 
diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py
index 35efdf94260..fcf86b6bf82 100644
--- a/python/cudf/cudf/utils/utils.py
+++ b/python/cudf/cudf/utils/utils.py
@@ -79,7 +79,7 @@ def scalar_broadcast_to(scalar, size, dtype=None):
 
     if dtype.kind in ("O", "U"):
         gather_map = column.full(size, 0, dtype="int32")
-        scalar_str_col = column.as_column([scalar], dtype="str")
+        scalar_str_col = column.as_column([scalar.value], dtype="str")
         return scalar_str_col[gather_map]
     else:
         out_col = column.column_empty(size, dtype=dtype)

From 6bf121c25bf1bedaf0605155282f21bba7b0df23 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 15 Sep 2020 15:03:35 -0700
Subject: [PATCH 68/80] add cudf.api.types.isscalar(element)

---
 python/cudf/cudf/api/types.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py
index b82f6ac6145..192e708d147 100644
--- a/python/cudf/cudf/api/types.py
+++ b/python/cudf/cudf/api/types.py
@@ -184,3 +184,6 @@ def promote_types(type1, type2):
     if result == np.dtype('float16'):
         return cudf.Float32Dtype()
     return cudf.dtype(result)
+
+def isscalar(element):
+    return isinstance(element, cudf._lib.scalar.Scalar) or np.isscalar(element)

From 165f86c1b90c836d77841fa21447023368822240 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 15 Sep 2020 15:04:24 -0700
Subject: [PATCH 69/80] plumbing

---
 python/cudf/cudf/core/column/numerical.py | 6 +++---
 python/cudf/cudf/core/dtypes.py           | 4 ++--
 python/cudf/cudf/core/series.py           | 4 +++-
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 45fc042a416..715160e1b05 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -82,13 +82,13 @@ def binary_operator(self, binop, rhs, reflect=False):
         if reflect:
             tmp = self
         if isinstance(rhs, (NumericalColumn, Scalar)) or np.isscalar(rhs):
-            out_dtype = np.result_type(
-                cudf.dtype(self.dtype).to_numpy, cudf.dtype(rhs.dtype).to_numpy
+            out_dtype = cudf.api.types.result_type(
+                self.dtype, rhs.dtype
             )
             out_dtype = cudf.dtype(out_dtype)
             if binop in ["mod", "floordiv"]:
                 if (cudf.dtype(tmp.dtype) in int_dtypes) and (
-                    (np.isscalar(tmp) and (0 == tmp))
+                    (cudf.api.types.isscalar(tmp) and (0 == tmp))
                     or ((isinstance(tmp, NumericalColumn)) and (0.0 in tmp))
                 ):
                     out_dtype = cudf.Float64Dtype()
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 99528c6bde6..41d4f669061 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -22,8 +22,8 @@ def __eq__(self, other):
         ):
             return False
         if (
-            isinstance(other, type(self.to_pandas))
-            or other is type(self.to_pandas)
+            isinstance(other, self.to_pandas.type)
+            or other is self.to_pandas
         ):
             return True
 
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 8448f67f618..cc4555d9155 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -4029,7 +4029,9 @@ def describe_numeric(self):
                 + [self.max()]
             )
             data = _format_stats_values(data)
-
+            for i, d in enumerate(data):
+                if isinstance(d, cudf.Scalar):
+                    data[i] = d.value
             return Series(
                 data=data, index=names, nan_as_null=False, name=self.name,
             )

From cec9528ecb96f2b28309c186a28a9279c2fcfca8 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 15 Sep 2020 15:05:16 -0700
Subject: [PATCH 70/80] scalars may __round__

---
 python/cudf/cudf/core/dataframe.py | 2 +-
 python/cudf/cudf/core/scalar.py    | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 1e68fe8b57b..d3b81296fda 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -7079,7 +7079,7 @@ def _gpu_scalars_to_column(list_of_scalars):
     '''
     ind = range(len(list_of_scalars))
     cols_dict = {
-        k: v for k, v in zip(ind, [as_column(i) for i in list_of_scalars])
+        k: v for k, v in zip(ind, [as_column(cudf.Scalar(i)) for i in list_of_scalars])
     }
 
     tbl = DataFrame(cols_dict)
diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py
index ddf2b3f5211..0c35833c91e 100644
--- a/python/cudf/cudf/core/scalar.py
+++ b/python/cudf/cudf/core/scalar.py
@@ -99,6 +99,9 @@ def __ne__(self, other):
     def __abs__(self):
         return self._scalar_unaop('__abs__')
 
+    def __round__(self, n):
+        return self._scalar_binop(n, '__round__')
+
     def _binop_result_dtype_or_error(self, other, op):
 
         if (self.dtype.kind == "O" and other.dtype.kind != "O") or (

From a8b380b763d61a82cd54a5f0ce9ca5570101d1c4 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 16 Sep 2020 08:47:39 -0700
Subject: [PATCH 71/80] to_numpy -> numpy_dtype

---
 python/cudf/cudf/_lib/aggregation.pyx     |  2 +-
 python/cudf/cudf/_lib/transform.pyx       |  2 +-
 python/cudf/cudf/api/types.py             | 14 +++++++-------
 python/cudf/cudf/core/column/column.py    | 10 +++++-----
 python/cudf/cudf/core/column/datetime.py  |  6 +++---
 python/cudf/cudf/core/column/numerical.py | 16 ++++++++--------
 python/cudf/cudf/core/column/timedelta.py | 10 +++++-----
 python/cudf/cudf/core/dataframe.py        |  6 +++---
 python/cudf/cudf/core/dtypes.py           |  6 ++++--
 python/cudf/cudf/core/indexing.py         |  4 ++--
 python/cudf/cudf/core/join/join.py        |  4 ++--
 python/cudf/cudf/core/series.py           |  2 +-
 python/cudf/cudf/core/tools/datetimes.py  |  2 +-
 python/cudf/cudf/tests/test_avro.py       |  2 +-
 python/cudf/cudf/tests/test_orc.py        |  2 +-
 python/cudf/cudf/utils/dtypes.py          |  6 +++---
 python/cudf/cudf/utils/utils.py           |  2 +-
 17 files changed, 49 insertions(+), 47 deletions(-)

diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx
index 19634d78061..96b948d65e8 100644
--- a/python/cudf/cudf/_lib/aggregation.pyx
+++ b/python/cudf/cudf/_lib/aggregation.pyx
@@ -243,7 +243,7 @@ cdef class _AggregationFactory:
         cdef string cpp_str
 
         # Handling UDF type
-        nb_type = numpy_support.from_dtype(kwargs['dtype'].to_numpy)
+        nb_type = numpy_support.from_dtype(kwargs['dtype'].numpy_dtype)
         type_signature = (nb_type[:],)
         compiled_op = cudautils.compile_udf(op, type_signature)
         output_np_dtype = cudf_dtype(np.dtype(compiled_op[1]))
diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
index 8fafa166471..cc839659b13 100644
--- a/python/cudf/cudf/_lib/transform.pyx
+++ b/python/cudf/cudf/_lib/transform.pyx
@@ -97,7 +97,7 @@ def transform(Column input, op):
     cdef type_id c_tid
     cdef data_type c_dtype
 
-    nb_type = numpy_support.from_dtype(input.dtype.to_numpy)
+    nb_type = numpy_support.from_dtype(input.dtype.numpy_dtype)
     nb_signature = (nb_type,)
     compiled_op = cudautils.compile_udf(op, nb_signature)
     c_str = compiled_op[0].encode('UTF-8')
diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py
index 192e708d147..92cc561a1c8 100644
--- a/python/cudf/cudf/api/types.py
+++ b/python/cudf/cudf/api/types.py
@@ -133,10 +133,10 @@ def is_list_dtype(obj):
 
 def find_common_type(array_types=[], scalar_types=[]):
     array_types = [
-        d.to_numpy if isinstance(d, cudf.Generic) else d for d in array_types
+        d.numpy_dtype if isinstance(d, cudf.Generic) else d for d in array_types
     ]
     scalar_types = [
-        d.to_numpy if isinstance(d, cudf.Generic) else d for d in scalar_types
+        d.numpy_dtype if isinstance(d, cudf.Generic) else d for d in scalar_types
     ]
 
     return cudf.dtype(np.find_common_type(array_types, scalar_types))
@@ -144,11 +144,11 @@ def find_common_type(array_types=[], scalar_types=[]):
 
 def can_cast(from_, to, casting='safe'):
     if isinstance(from_, cudf.Generic):
-        from_ = from_.to_numpy
+        from_ = from_.numpy_dtype
     elif isinstance(from_, cudf.Scalar):
         from_ = from_.value
     if isinstance(to, cudf.Generic):
-        to = to.to_numpy
+        to = to.numpy_dtype
 
     return np.can_cast(from_, to, casting=casting)
 
@@ -156,7 +156,7 @@ def can_cast(from_, to, casting='safe'):
 def result_type(*arrays_and_dtypes):
 
     arrays_and_dtypes = (
-        d.to_numpy if isinstance(d, cudf.Generic) else d
+        d.numpy_dtype if isinstance(d, cudf.Generic) else d
         for d in arrays_and_dtypes
     )
     return cudf.dtype(np.result_type(*arrays_and_dtypes))
@@ -176,9 +176,9 @@ def min_scalar_type(a):
 
 def promote_types(type1, type2):
     if isinstance(type1, cudf.Generic):
-        type1 = type1.to_numpy
+        type1 = type1.numpy_dtype
     if isinstance(type2, cudf.Generic):
-        type2 = type2.to_numpy
+        type2 = type2.numpy_dtype
 
     result = np.promote_types(type1, type2)
     if result == np.dtype('float16'):
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 9537371bb64..3a3446d4b7f 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -98,7 +98,7 @@ def data_array_view(self):
         result = cuda.devicearray.DeviceNDArray(
             shape=(result.nbytes // dtype.itemsize,),
             strides=(dtype.itemsize,),
-            dtype=dtype.to_numpy,
+            dtype=dtype.numpy_dtype,
             gpu_data=result.gpu_data,
         )
         return result
@@ -149,7 +149,7 @@ def values(self):
         Return a CuPy representation of the Column.
         """
         if len(self) == 0:
-            return cupy.asarray([], dtype=self.dtype.to_numpy)
+            return cupy.asarray([], dtype=self.dtype.numpy_dtype)
 
         if self.has_nulls:
             raise ValueError("Column must have no nulls.")
@@ -1098,7 +1098,7 @@ def __cuda_array_interface__(self):
         output = {
             "shape": (len(self),),
             "strides": (self.dtype.itemsize,),
-            "typestr": self.dtype.to_numpy.str,
+            "typestr": self.dtype.numpy_dtype.str,
             "data": (self.data_ptr, False),
             "version": 1,
         }
@@ -1573,7 +1573,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
             arbitrary = np.ascontiguousarray(arbitrary)
 
         if dtype is not None:
-            arbitrary = arbitrary.astype(dtype.to_numpy)
+            arbitrary = arbitrary.astype(dtype.numpy_dtype)
 
         if arb_dtype.kind == "M":
 
@@ -1728,7 +1728,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
                     data = as_column(sr, nan_as_null=nan_as_null)
                 else:
                     native_dtype = (
-                        dtype.to_numpy if dtype is not None else None
+                        dtype.numpy_dtype if dtype is not None else None
                     )
                     if dtype is None and pd.api.types.infer_dtype(
                         arbitrary
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 3894b5dd0dc..8b80f609a74 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -123,7 +123,7 @@ def normalize_binop_value(self, other):
             if np.isnat(other):
                 return as_scalar(val=None, dtype=self.dtype)
 
-            other = other.astype(self.dtype.to_numpy)
+            other = other.astype(self.dtype.numpy_dtype)
             return as_scalar(other)
         elif isinstance(other, np.timedelta64):
             other_time_unit = cudf.utils.dtypes.get_time_unit(other)
@@ -265,8 +265,8 @@ def can_cast_safely(self, to_dtype):
         to_dtype = cudf.dtype(to_dtype)
         if isinstance(to_dtype, cudf.Datetime):
 
-            to_res, _ = np.datetime_data(to_dtype.to_numpy)
-            self_res, _ = np.datetime_data(self.dtype.to_numpy)
+            to_res, _ = np.datetime_data(to_dtype.numpy_dtype)
+            self_res, _ = np.datetime_data(self.dtype.numpy_dtype)
 
             max_int = np.iinfo(np.dtype("int64")).max
 
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 715160e1b05..3072f513f40 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -234,9 +234,9 @@ def default_na_value(self):
         if dkind == "f":
             return self.dtype.type(np.nan).value
         elif dkind == "i":
-            return np.iinfo(self.dtype.to_numpy).min
+            return np.iinfo(self.dtype.numpy_dtype).min
         elif dkind == "u":
-            return np.iinfo(self.dtype.to_numpy).max
+            return np.iinfo(self.dtype.numpy_dtype).max
         elif dkind == "b":
             return self.dtype.type(False)
         else:
@@ -280,7 +280,7 @@ def fillna(self, fill_value):
         if np.isscalar(fill_value) and not isinstance(fill_value, libcudf.scalar.Scalar):
             # castsafely to the same dtype as self
             # TODO - produce a libcudf scalar directly
-            fill_value_casted = self.dtype.to_numpy.type(fill_value)
+            fill_value_casted = self.dtype.numpy_dtype.type(fill_value)
             if not np.isnan(fill_value) and (fill_value_casted != fill_value):
                 raise TypeError(
                     "Cannot safely cast non-equivalent {} to {}".format(
@@ -360,14 +360,14 @@ def can_cast_safely(self, to_dtype):
         """
         if self.dtype.kind == to_dtype.kind:
             # todo: implement >, < for cudf.Dtype
-            if self.dtype.to_numpy <= to_dtype.to_numpy:
+            if self.dtype.numpy_dtype <= to_dtype.numpy_dtype:
                 return True
             else:
                 # Kinds are the same but to_dtype is smaller
                 if isinstance(to_dtype, cudf.Floating):
-                    info = np.finfo(to_dtype.to_numpy)
+                    info = np.finfo(to_dtype.numpy_dtype)
                 elif isinstance(to_dtype, cudf.Integer):
-                    info = np.iinfo(to_dtype.to_numpy)
+                    info = np.iinfo(to_dtype.numpy_dtype)
                 min_, max_ = info.min, info.max
 
                 if (self.min() > min_) and (self.max() < max_):
@@ -377,7 +377,7 @@ def can_cast_safely(self, to_dtype):
 
         # want to cast int to float
         elif to_dtype.kind == "f" and self.dtype.kind in {"i", "u"}:
-            info = np.finfo(to_dtype.to_numpy)
+            info = np.finfo(to_dtype.numpy_dtype)
             biggest_exact_int = 2 ** (info.nmant + 1)
             if (self.min() >= -biggest_exact_int) and (
                 self.max() <= biggest_exact_int
@@ -396,7 +396,7 @@ def can_cast_safely(self, to_dtype):
 
         # want to cast float to int:
         elif to_dtype.kind in {"i", "u"} and self.dtype.kind == "f":
-            info = np.iinfo(to_dtype.to_numpy)
+            info = np.iinfo(to_dtype.numpy_dtype)
             min_, max_ = info.min, info.max
             # best we can do is hope to catch it here and avoid compare
             if (self.min() >= min_) and (self.max() <= max_):
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index 60b3f027efe..24b0f01c4cd 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -94,11 +94,11 @@ def _binary_op_floordiv(self, rhs):
                     if isinstance(rhs, Scalar):
                         rhs = np.timedelta64(rhs.value)
 
-                    rhs = rhs.astype(common_dtype.to_numpy).astype("float64")
+                    rhs = rhs.astype(common_dtype.numpy_dtype).astype("float64")
                 else:
                     rhs = as_scalar(None, "float64")
             else:
-                rhs = rhs.astype(common_dtype.to_numpy).astype("float64")
+                rhs = rhs.astype(common_dtype.numpy_dtype).astype("float64")
 
             out_dtype = cudf.Int64Dtype()
         elif rhs.dtype.kind in ("f", "i", "u"):
@@ -163,7 +163,7 @@ def _binary_op_truediv(self, rhs):
                     if isinstance(rhs, Scalar):
                         rhs = np.timedelta64(rhs.value)
 
-                    rhs = rhs.astype(common_dtype.to_numpy).astype("float64")
+                    rhs = rhs.astype(common_dtype.numpy_dtype).astype("float64")
                 else:
                     rhs = as_scalar(None, "float64")
             else:
@@ -226,7 +226,7 @@ def normalize_binop_value(self, other):
                 other = other.astype("timedelta64[s]")
             else:
                 common_dtype = determine_out_dtype(self.dtype, other.dtype)
-                other = other.astype(common_dtype.to_numpy)
+                other = other.astype(common_dtype.numpy_dtype)
             return as_scalar(other)
         elif np.isscalar(other):
             return as_scalar(other)
@@ -258,7 +258,7 @@ def fillna(self, fill_value):
         if is_scalar(fill_value):
             if isinstance(fill_value, np.timedelta64):
                 dtype = determine_out_dtype(self.dtype, fill_value.dtype)
-                fill_value = fill_value.astype(dtype.to_numpy)
+                fill_value = fill_value.astype(dtype.numpy_dtype)
                 col = col.astype(dtype)
             elif not isinstance(fill_value, Scalar):
                 fill_value = np.timedelta64(fill_value)
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index d3b81296fda..e4bb2c6ec69 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -3189,7 +3189,7 @@ def as_gpu_matrix(self, columns=None, order="F"):
                     "hint: use .fillna() to replace null values"
                 )
                 raise ValueError(errmsg.format(k))
-        cupy_dtype = dtype.to_numpy
+        cupy_dtype = dtype.numpy_dtype
         if np.issubdtype(cupy_dtype, np.datetime64):
             cupy_dtype = np.dtype("int64")
 
@@ -4909,9 +4909,9 @@ def to_records(self, index=True):
         -------
         numpy recarray
         """
-        members = [("index", self.index.dtype.to_numpy)] if index else []
+        members = [("index", self.index.dtype.numpy_dtype)] if index else []
         members += [
-            (col, self[col].dtype.to_numpy) for col in self._data.names
+            (col, self[col].dtype.numpy_dtype) for col in self._data.names
         ]
         dtype = np.dtype(members)
         ret = np.recarray(len(self), dtype=dtype)
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 41d4f669061..a963daffefc 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -9,6 +9,8 @@
 
 import cudf
 from cudf._lib.types import _Dtype
+#from cudf.utils.utils import cached_property
+
 
 
 class Generic(ExtensionDtype, _Dtype):
@@ -41,11 +43,11 @@ def num(self):
         return self.to_numpy.num
 
     @property
-    def to_numpy(self):
+    def numpy_dtype(self):
         return np.dtype(self.pa_type.to_pandas_dtype())
 
     @property
-    def to_pandas(self):
+    def pandas_dtype(self):
         return pd.api.types.pandas_dtype(self.name)
 
     @property
diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py
index ef5ca3d6341..926f44c2ced 100755
--- a/python/cudf/cudf/core/indexing.py
+++ b/python/cudf/cudf/core/indexing.py
@@ -97,7 +97,7 @@ def __setitem__(self, key, value):
                 to_dtype = cudf.api.types.result_type(
                     value.dtype, self._sr._column.dtype
                 )
-                value = value.astype(to_dtype.to_numpy)
+                value = value.astype(to_dtype.numpy_dtype)
                 self._sr._column._mimic_inplace(
                     self._sr._column.astype(to_dtype), inplace=True
                 )
@@ -452,7 +452,7 @@ def _get_column_selection(self, arg):
 
 def _normalize_dtypes(df):
     if len(df.columns) > 0:
-        dtypes = [d.to_numpy for d in df.dtypes.values.tolist()]
+        dtypes = [d.numpy_dtype for d in df.dtypes.values.tolist()]
         normalized_dtype = cudf.dtype(np.result_type(*dtypes))
         for name, col in df._data.items():
             df[name] = col.astype(normalized_dtype)
diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py
index 95a1a05b377..b18babefe83 100644
--- a/python/cudf/cudf/core/join/join.py
+++ b/python/cudf/cudf/core/join/join.py
@@ -398,7 +398,7 @@ def input_to_libcudf_casting_rules(self, lcol, rcol, how):
                 if dtype_l.kind == dtype_r.kind:
                     # both ints or both floats
                     libcudf_join_type = cudf.dtype(
-                        max(dtype_l.to_numpy, dtype_r.to_numpy)
+                        max(dtype_l.numpy_dtype, dtype_r.numpy_dtype)
                     )
                 else:
                     libcudf_join_type = cudf.api.types.find_common_type(
@@ -408,7 +408,7 @@ def input_to_libcudf_casting_rules(self, lcol, rcol, how):
                 dtype_r, cudf.Datetime
             ):
                 libcudf_join_type = cudf.dtype(
-                    max(dtype_l.to_numpy, dtype_r.to_numpy)
+                    max(dtype_l.numpy_dtype, dtype_r.numpy_dtype)
                 )
         if libcudf_join_type is None:
             # todo: test this
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index cc4555d9155..7e42b06787a 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -1027,7 +1027,7 @@ def __repr__(self):
 
         if isinstance(preprocess._column, cudf.core.column.CategoricalColumn):
             category_memory = lines[-1]
-            to_replace = str(self.dtype.categories.dtype.to_numpy)
+            to_replace = str(self.dtype.categories.dtype.numpy_dtype)
             replacement = str(self.dtype.categories.dtype.name)
             category_memory = category_memory.replace(to_replace, replacement)
             lines = lines[:-1]
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index 16eee560f0e..54f5d4a1e1e 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -184,7 +184,7 @@ def to_datetime(
                         column.datetime._numpy_to_pandas_conversion[u]
                         / (
                             column.datetime._numpy_to_pandas_conversion["s"]
-                            if np.datetime_data(col.dtype.to_numpy)[0] == "s"
+                            if np.datetime_data(col.dtype.numpy_dtype)[0] == "s"
                             else 1
                         )
                     )
diff --git a/python/cudf/cudf/tests/test_avro.py b/python/cudf/cudf/tests/test_avro.py
index 059f5343e0b..3a79ae469c0 100644
--- a/python/cudf/cudf/tests/test_avro.py
+++ b/python/cudf/cudf/tests/test_avro.py
@@ -65,7 +65,7 @@ def test_avro_reader_basic(datadir, inputfile, columns, engine):
     # FASTAVRO produces int64 columns from avro int32 dtype, so convert
     # it back to int32 here
     for col in expect.columns:
-        expect[col] = expect[col].astype(got[col].dtype.to_numpy)
+        expect[col] = expect[col].astype(got[col].dtype.numpy_dtype)
 
     # fastavro appears to return columns in reverse order
     # (actual order may depend on pandas/python version)
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index 58a17b5a2ed..c92f0603e4e 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -156,7 +156,7 @@ def test_orc_reader_trailing_nulls(datadir):
     # PANDAS uses NaN to represent invalid data, which forces float dtype
     # For comparison, we can replace NaN with 0 and cast to the cuDF dtype
     for col in expect.columns:
-        expect[col] = expect[col].astype(got[col].dtype.to_numpy)
+        expect[col] = expect[col].astype(got[col].dtype.numpy_dtype)
 
     assert_eq(expect, got, check_categorical=False)
 
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 1b8a2d28cb2..391b9a800d3 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -117,7 +117,7 @@ def get_numeric_type_info(dtype):
 def numeric_normalize_types(*args):
     """Cast all args to a common type using numpy promotion logic
     """
-    dtype = np.result_type(*[a.dtype.to_numpy for a in args])
+    dtype = np.result_type(*[a.dtype.numpy_dtype for a in args])
     return [a.astype(dtype) for a in args]
 
 def is_datetime_dtype(obj):
@@ -203,7 +203,7 @@ def to_cudf_compatible_scalar(val, dtype=None):
 
     if dtype is not None:
         if isinstance(dtype, cudf.Generic):
-            dtype = dtype.to_numpy
+            dtype = dtype.numpy_dtype
         val = val.astype(dtype)
 
     if val.dtype.type is np.datetime64:
@@ -358,7 +358,7 @@ def min_column_type(x, expected_type):
 def check_cast_unsupported_dtype(dtype):
 
     if isinstance(dtype, cudf.Generic):
-        return dtype.to_numpy
+        return dtype.numpy_dtype
 
     if is_categorical_dtype(dtype):
         return dtype
diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py
index fcf86b6bf82..17cdc4a93c3 100644
--- a/python/cudf/cudf/utils/utils.py
+++ b/python/cudf/cudf/utils/utils.py
@@ -350,7 +350,7 @@ def time_col_replace_nulls(input_col):
             Buffer(
                 np.array(
                     [input_col.default_na_value()],
-                    dtype=input_col.dtype.to_numpy,
+                    dtype=input_col.dtype.numpy_dtype,
                 ).view("|u1")
             ),
             dtype=input_col.dtype,

From 1dc151ad2dd9f1198439d1b0fb9948d974788466 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 16 Sep 2020 11:11:43 -0700
Subject: [PATCH 72/80] extra to_numpy -> numpy_dtype that were missed

---
 python/cudf/cudf/core/dtypes.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index a963daffefc..79378a78e4a 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -29,9 +29,9 @@ def __eq__(self, other):
         ):
             return True
 
-        if self.to_numpy == other:
+        if self.numpy_dtype == other:
             return True
-        if isinstance(other, str) and str(self.to_numpy) == other:
+        if isinstance(other, str) and str(self.numpy_dtype) == other:
             return True
         return False
 
@@ -40,7 +40,7 @@ def __str__(self):
 
     @property
     def num(self):
-        return self.to_numpy.num
+        return self.numpy_dtype.num
 
     @property
     def numpy_dtype(self):
@@ -52,7 +52,7 @@ def pandas_dtype(self):
 
     @property
     def itemsize(self):
-        return self.to_numpy.itemsize
+        return self.numpy_dtype.itemsize
 
     @property
     def type(self):
@@ -108,7 +108,7 @@ def __init__(self):
 
 class Datetime(Generic):
     @property
-    def to_numpy(self):
+    def numpy_dtype(self):
         return {v: k for k, v in _cudf_dtype_from_numpy.items()}[self]
 
     @property
@@ -119,7 +119,7 @@ def to_pandas(self):
 
 class Timedelta(Generic):
     @property
-    def to_numpy(self):
+    def numpy_dtype(self):
         return {v: k for k, v in _cudf_dtype_from_numpy.items()}[self]
 
     @property

From 46a9c2f16e838767a2e861ba943561edfd3b2d72 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 16 Sep 2020 14:31:27 -0700
Subject: [PATCH 73/80] add docstrings, respond to reviews

---
 python/cudf/cudf/_lib/scalar.pyx  |   3 +-
 python/cudf/cudf/api/types.py     | 364 +++++++++++++++++++++++++++++-
 python/cudf/cudf/core/dtypes.py   |  14 +-
 python/cudf/cudf/core/frame.py    |   6 +-
 python/cudf/cudf/core/index.py    |   2 +-
 python/cudf/cudf/core/indexing.py |   4 +-
 6 files changed, 373 insertions(+), 20 deletions(-)

diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx
index ead3097f3b4..2a2d7a21f57 100644
--- a/python/cudf/cudf/_lib/scalar.pyx
+++ b/python/cudf/cudf/_lib/scalar.pyx
@@ -15,7 +15,6 @@ from libc.stdint cimport (
 )
 from libcpp.memory cimport unique_ptr
 from libcpp cimport bool
-from libc.stdint cimport uintptr_t
 
 import cudf
 from cudf._lib.types import cudf_to_np_types, duration_unit_map
@@ -109,7 +108,7 @@ cdef class Scalar:
         """
         if cudf.api.types.is_string_dtype(self.dtype):
             return _get_py_string_from_string(self.c_value)
-        elif cudf.api.types.is_numerical_dtype(self.dtype):
+        elif cudf.api.types.is_numeric_dtype(self.dtype):
             return _get_np_scalar_from_numeric(self.c_value)
         elif cudf.api.types.is_datetime64_dtype(self.dtype):
             return _get_np_scalar_from_timestamp64(self.c_value)
diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py
index 92cc561a1c8..b28ec5fcf4f 100644
--- a/python/cudf/cudf/api/types.py
+++ b/python/cudf/cudf/api/types.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2020, NVIDIA CORPORATION.
+
 import numpy as np
 import pandas as pd
 from pandas.core.dtypes.dtypes import CategoricalDtype, CategoricalDtypeType
@@ -6,9 +8,47 @@
 
 
 def is_bool_dtype(obj):
+    """
+    Check whether the provided array or dtype is of a boolean dtype.
+    Parameters
+    ----------
+    obj : array-like or dtype
+        The array or dtype to check.
+    Returns
+    -------
+    boolean
+        Whether or not the array or dtype is of a boolean dtype.
+    Notes
+    -----
+    Accepts cuDF, Pandas, or NumPy dtypes and arrays.
+
+    Examples
+    --------
+    >>> is_bool_dtype(cudf.BooleanDtype())
+    True
+    >>> is_bool_dtype(cudf.Series([True, False, None]))
+    True
+    >>> is_bool_dtype(str)
+    False
+    >>> is_bool_dtype(int)
+    False
+    >>> is_bool_dtype(bool)
+    True
+    >>> is_bool_dtype(np.bool_)
+    True
+    >>> is_bool_dtype(np.array(['a', 'b']))
+    False
+    >>> is_bool_dtype(pd.Series([1, 2]))
+    False
+    >>> is_bool_dtype(np.array([True, False]))
+    True
+    >>> is_bool_dtype(pd.Categorical([True, False]))
+    True
+    >>> is_bool_dtype(pd.arrays.SparseArray([True, False]))
+    True
+    """
     if hasattr(obj, 'dtype'):
         obj = obj.dtype
-    # todo - pd.api.types.is_bool_dtype should not give false, nor work at all probably
     if hasattr(obj, "dtype"):
         obj = obj.dtype
     return isinstance(obj, cudf.BooleanDtype) or (not isinstance(obj, cudf.Generic) and pd.api.types.is_bool_dtype(
@@ -17,6 +57,37 @@ def is_bool_dtype(obj):
 
 
 def is_datetime64_dtype(obj):
+    """
+    Check whether the provided array or dtype is of the datetime64 dtype.
+    Parameters
+    ----------
+    obj : array-like or dtype
+        The array or dtype to check.
+    Returns
+    -------
+    boolean
+        Whether or not the array or dtype is of the datetime64 dtype.
+    Notes
+    --------
+        Accepts cuDF, Pandas, or NumPy dtypes and arrays.
+
+    Examples
+    --------
+    >>> is_datetime64_dtype(cudf.Datetime64NSDtype())
+    True
+    >>> is_datetime64_dtype(cudf.Series([1, 2, 3], dtype='datetime64[ms]'))
+    True
+    >>> is_datetime64_dtype(object)
+    False
+    >>> is_datetime64_dtype(np.datetime64)
+    True
+    >>> is_datetime64_dtype(np.array([], dtype=int))
+    False
+    >>> is_datetime64_dtype(np.array([], dtype=np.datetime64))
+    True
+    >>> is_datetime64_dtype([1, 2, 3])
+    False
+    """
     if hasattr(obj, 'dtype'):
         obj = obj.dtype
     return isinstance(obj, cudf.Datetime) or (not isinstance(obj, cudf.Generic) and pd.api.types.is_datetime64_dtype(
@@ -25,6 +96,33 @@ def is_datetime64_dtype(obj):
 
 
 def is_timedelta64_dtype(obj):
+    """
+    Check whether an array or dtype is of the timedelta64 dtype.
+    Parameters
+    ----------
+    obj : array-like or dtype
+        The array or dtype to check.
+    Returns
+    -------
+    boolean
+        Whether or not the array or dtype is of the timedelta64 dtype.
+    Examples
+    --------
+    >>> is_timedelta64_dtype(cudf.Timedelta64NSDtype())
+    True
+    >>> is_timedelta64_dtype(cudf.Series([1,2,3], dtype='timedelta64[ns]'))
+    True
+    >>> is_timedelta64_dtype(object)
+    False
+    >>> is_timedelta64_dtype(np.timedelta64)
+    True
+    >>> is_timedelta64_dtype([1, 2, 3])
+    False
+    >>> is_timedelta64_dtype(pd.Series([], dtype="timedelta64[ns]"))
+    True
+    >>> is_timedelta64_dtype('0 days')
+    False
+    """
     if hasattr(obj, 'dtype'):
         obj = obj.dtype
     return isinstance(
@@ -33,6 +131,34 @@ def is_timedelta64_dtype(obj):
 
 
 def is_string_dtype(obj):
+    """
+    Check whether the provided array or dtype is of the string dtype.
+    Parameters
+    ----------
+    obj : array-like or dtype
+        The array or dtype to check.
+    Returns
+    -------
+    boolean
+        Whether or not the array or dtype is of the string dtype.
+    Examples
+    --------
+    >>> is_string_dtype(cudf.StringDtype())
+    True
+    >>> is_string_dtype(cudf.Series(['a','b','c']))
+    True
+    >>> is_string_dtype(str)
+    True
+    >>> is_string_dtype(object)
+    True
+    >>> is_string_dtype(int)
+    False
+    >>>
+    >>> is_string_dtype(np.array(['a', 'b']))
+    True
+    >>> is_string_dtype(pd.Series([1, 2]))
+    False
+    """
     if hasattr(obj, 'dtype'):
         obj = obj.dtype
     return isinstance(obj, cudf.StringDtype) or (not isinstance(obj, cudf.Generic) and (
@@ -41,6 +167,49 @@ def is_string_dtype(obj):
 
 
 def is_integer_dtype(obj):
+    """
+    Check whether the provided array or dtype is of an integer dtype.
+    Parameters
+    ----------
+    obj : array-like or dtype
+        The array or dtype to check.
+    Returns
+    -------
+    boolean
+        Whether or not the array or dtype is of an integer dtype.
+    Examples
+    --------
+    >>> is_integer_dtype(cudf.Int64Dtype())
+    True
+    >>> is_integer_dtype(cudf.Series([1,2,3], dtype='int64'))
+    True
+    >>> is_integer_dtype(str)
+    False
+    >>> is_integer_dtype(int)
+    True
+    >>> is_integer_dtype(float)
+    False
+    >>> is_integer_dtype(np.uint64)
+    True
+    >>> is_integer_dtype('int8')
+    True
+    >>> is_integer_dtype('Int8')
+    True
+    >>> is_integer_dtype(pd.Int8Dtype)
+    True
+    >>> is_integer_dtype(np.datetime64)
+    False
+    >>> is_integer_dtype(np.timedelta64)
+    False
+    >>> is_integer_dtype(np.array(['a', 'b']))
+    False
+    >>> is_integer_dtype(pd.Series([1, 2]))
+    True
+    >>> is_integer_dtype(np.array([], dtype=np.timedelta64))
+    False
+    >>> is_integer_dtype(pd.Index([1, 2.]))  # float
+    False
+    """
     if hasattr(obj, 'dtype'):
         obj = obj.dtype
     try:
@@ -50,7 +219,44 @@ def is_integer_dtype(obj):
         pdb.set_trace()
 
 
-def is_numerical_dtype(obj):
+def is_numeric_dtype(obj):
+    """
+    Check whether the provided array or dtype is of a numeric dtype.
+    Parameters
+    ----------
+    obj : array-like or dtype
+        The array or dtype to check.
+    Returns
+    -------
+    boolean
+        Whether or not the array or dtype is of a numeric dtype.
+    Examples
+    --------
+    >>> is_numeric_dtype(cudf.Float32Dtype())
+    True
+    >>> is_numeric_dtype(cudf.Series([1.0, 2.0, 3.0]))
+    True
+    >>> is_numeric_dtype(str)
+    False
+    >>> is_numeric_dtype(int)
+    True
+    >>> is_numeric_dtype(float)
+    True
+    >>> is_numeric_dtype(np.uint64)
+    True
+    >>> is_numeric_dtype(np.datetime64)
+    False
+    >>> is_numeric_dtype(np.timedelta64)
+    False
+    >>> is_numeric_dtype(np.array(['a', 'b']))
+    False
+    >>> is_numeric_dtype(pd.Series([1, 2]))
+    True
+    >>> is_numeric_dtype(pd.Index([1, 2.]))
+    True
+    >>> is_numeric_dtype(np.array([], dtype=np.timedelta64))
+    False
+    """
     if hasattr(obj, 'dtype'):
         obj = obj.dtype
     if isinstance(obj, cudf.Generic):
@@ -131,7 +337,32 @@ def is_list_dtype(obj):
     )
 
 
-def find_common_type(array_types=[], scalar_types=[]):
+def find_common_type(array_types, scalar_types):
+    """
+    Determine common type following numpy coercion rules.
+    Similar to numpy.find_common_type, but accepts both 
+    numpy and cuDF datatypes.
+
+    Parameters
+    ----------
+    array_types : sequence
+        A list of dtypes or dtype convertible objects representing arrays.
+    scalar_types : sequence
+        A list of dtypes or dtype convertible objects representing scalars.
+    Returns
+    -------
+    datatype : cuDF dtype
+        The common data type, which is the maximum of `array_types` ignoring
+        `scalar_types`, unless the maximum of `scalar_types` is of a
+        different kind (`dtype.kind`). 
+    See Also
+    --------
+    numpy.find_common_type
+    Notes
+    --------
+    Accepts numpy dtypes, cuDF dtypes, or a mix of both
+
+    """
     array_types = [
         d.numpy_dtype if isinstance(d, cudf.Generic) else d for d in array_types
     ]
@@ -143,6 +374,30 @@ def find_common_type(array_types=[], scalar_types=[]):
 
 
 def can_cast(from_, to, casting='safe'):
+    """
+    Returns True if cast between data types can occur according to the casting rule.
+    If from_ is a scalar or array scalar, also returns True if the scalar value
+    can be cast without overflow or truncation to an integer.
+
+    Parameters
+    ----------
+    from_ : dtype, dtype specifier, scalar, or array
+        Data type, scalar, or array to cast from.
+    to : dtype or dtype specifier
+        Data type to cast to.
+    casting : {‘no’, ‘equiv’, ‘safe’, ‘same_kind’, ‘unsafe’}, optional
+        Controls what kind of data casting may occur.
+        - ‘no’ means the data types should not be cast at all.
+        - ‘equiv’ means only byte-order changes are allowed.
+        - ‘safe’ means only casts which can preserve values are allowed.
+        - ‘same_kind’ means only safe casts or casts within a kind, 
+        like float64 to float32, are allowed
+        - ‘unsafe’ means any data conversions may be done.
+
+    Notes
+    --------
+    Accepts numpy dtypes, cuDF dtypes, or a mix of both
+    """
     if isinstance(from_, cudf.Generic):
         from_ = from_.numpy_dtype
     elif isinstance(from_, cudf.Scalar):
@@ -154,19 +409,73 @@ def can_cast(from_, to, casting='safe'):
 
 
 def result_type(*arrays_and_dtypes):
+    """
+    Returns the type that results from applying the NumPy type promotion rules to the arguments.
+    See numpy.result_type for details. 
+    
+    See Also
+    --------
+    numpy.result_type
 
+    Returns
+    -------
+    datatype : cuDF dtype
+
+    Notes
+    --------
+    Accepts numpy dtypes, cuDF dtypes, or a mix of both
+
+    """
     arrays_and_dtypes = (
         d.numpy_dtype if isinstance(d, cudf.Generic) else d
         for d in arrays_and_dtypes
     )
     return cudf.dtype(np.result_type(*arrays_and_dtypes))
 
-def isnan(obj):
-    if isinstance(obj, cudf._lib.scalar.Scalar):
-        obj = obj.value
-    return np.isnan(obj)
+def isnan(x):
+    """
+    Returns True if the input scalar is NaN.
+
+    Parameters
+    -------
+    x : cuDF or NumPy scalar
+
+    See Also
+    -------
+    numpy.isnan
+
+    Notes
+    --------
+    Accepts NumPy or cuDF scalars.
+
+    """
+    if isinstance(x, cudf._lib.scalar.Scalar):
+        x = x.value
+    return np.isnan(x)
 
 def min_scalar_type(a):
+    """
+    For scalar a, returns the data type with the smallest size and smallest
+    scalar kind which can hold its value. For non-scalar array a, returns
+    the vector’s dtype unmodified.
+
+    Parameters
+    -------
+    a : cuDF or NumPy scalar
+
+    Returns
+    -------
+    result : cuDF dtype
+
+    See Also
+    -------
+    numpy.min_scalar_type
+
+    Notes
+    --------
+    Accepts NumPy or cuDF scalars.
+
+    """
     if isinstance(a, cudf.Scalar):
         a = a.value
     result = np.min_scalar_type(a)
@@ -175,6 +484,28 @@ def min_scalar_type(a):
     return cudf.dtype(result)
 
 def promote_types(type1, type2):
+    """
+    Returns the data type with the smallest size and smallest scalar kind
+    to which both type1 and type2 may be safely cast.
+
+    Parameters
+    -------
+    type1 : cuDF or NumPy dtype
+    type2 : cuDF or NumPy dtype
+
+    Returns
+    -------
+    result : cuDF dtype, the promoted type
+
+    See Also
+    --------
+    numpy.promote_types
+
+    Notes
+    --------
+    Accepts numpy dtypes, cuDF dtypes, or a mix of both
+
+    """
     if isinstance(type1, cudf.Generic):
         type1 = type1.numpy_dtype
     if isinstance(type2, cudf.Generic):
@@ -186,4 +517,23 @@ def promote_types(type1, type2):
     return cudf.dtype(result)
 
 def isscalar(element):
+    """
+    Returns True if the type of `element` is a scalar type, 
+    including cuDF, NumPy, and standard python scalars
+
+    Parameters
+    ----------
+    element : any
+        Input argument, can be of any type.
+    Returns
+    -------
+    val : bool
+        True if `element` is a scalar type, False if it is not.
+
+    See Also
+    --------
+    numpy.isscalar
+
+    """
+
     return isinstance(element, cudf._lib.scalar.Scalar) or np.isscalar(element)
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 79378a78e4a..d2945eb0b22 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -24,8 +24,8 @@ def __eq__(self, other):
         ):
             return False
         if (
-            isinstance(other, self.to_pandas.type)
-            or other is self.to_pandas
+            isinstance(other, self.pandas_dtype.type)
+            or other is self.pandas_dtype
         ):
             return True
 
@@ -60,7 +60,7 @@ def type(self):
 
     @property
     def kind(self):
-        return self.to_pandas.kind
+        return self.pandas_dtype.kind
 
     @property
     def name(self):
@@ -112,7 +112,7 @@ def numpy_dtype(self):
         return {v: k for k, v in _cudf_dtype_from_numpy.items()}[self]
 
     @property
-    def to_pandas(self):
+    def pandas_dtype(self):
         # pandas only supports nanos
         return np.dtype("datetime64[ns]")
 
@@ -123,7 +123,7 @@ def numpy_dtype(self):
         return {v: k for k, v in _cudf_dtype_from_numpy.items()}[self]
 
     @property
-    def to_pandas(self):
+    def pandas_dtype(self):
         # pandas only supports nanos
         return np.dtype("timedelta64[ns]")
 
@@ -348,6 +348,10 @@ def __init__(self, categories=None, ordered=None):
     def __repr__(self):
         return self.to_pandas().__repr__()
 
+    @property
+    def pandas_dtype(self):
+        return self.to_pandas()
+
     def __hash__(self):
         return hash(self.__repr__())
 
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 8792dccba85..974638bcf23 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -13,7 +13,7 @@
 import cudf
 from cudf import _lib as libcudf
 from cudf._lib.nvtx import annotate
-from cudf.api.types import is_categorical_dtype, is_numerical_dtype
+from cudf.api.types import is_categorical_dtype, is_numeric_dtype
 from cudf.core.column import as_column, build_categorical_column
 from cudf.utils import utils
 from cudf.utils.dtypes import (
@@ -276,7 +276,7 @@ def find_common_dtypes_and_categories(non_null_columns, dtypes):
                 # default to the first non-null dtype
                 dtypes[idx] = cols[0].dtype
                 # If all the non-null dtypes are int/float, find a common dtype
-                if all(is_numerical_dtype(col.dtype) for col in cols):
+                if all(is_numeric_dtype(col.dtype) for col in cols):
                     dtypes[idx] = cudf.api.types.find_common_type(
                         [col.dtype for col in cols], []
                     )
@@ -3142,7 +3142,7 @@ def _get_replacement_values(to_replace, replacement, col_name, column):
             if all_nan:
                 replacement = [replacement] * len(to_replace)
             # Do not broadcast numeric dtypes
-            elif cudf.api.types.is_numerical_dtype(column.dtype):
+            elif cudf.api.types.is_numeric_dtype(column.dtype):
                 if len(to_replace) > 0:
                     replacement = [replacement]
                 else:
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index a3ce9d8cb19..d895a2760a1 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -1719,7 +1719,7 @@ def to_pandas(self):
         return pd.RangeIndex(
             start=self._start,
             stop=self._stop,
-            dtype=self.dtype.to_pandas,
+            dtype=self.dtype.pandas_dtype,
             name=self.name,
         )
 
diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py
index 926f44c2ced..962f5ba3bef 100755
--- a/python/cudf/cudf/core/indexing.py
+++ b/python/cudf/cudf/core/indexing.py
@@ -89,7 +89,7 @@ def __setitem__(self, key, value):
         else:
             value = column.as_column(value)
 
-        if hasattr(value, "dtype") and cudf.api.types.is_numerical_dtype(
+        if hasattr(value, "dtype") and cudf.api.types.is_numeric_dtype(
             value.dtype
         ):
             # normalize types if necessary:
@@ -214,7 +214,7 @@ def _can_downcast_to_series(self, df, arg):
                     return True
             dtypes = df.dtypes.values.tolist()
             all_numeric = all(
-                [cudf.api.types.is_numerical_dtype(t) for t in dtypes]
+                [cudf.api.types.is_numeric_dtype(t) for t in dtypes]
             )
             if all_numeric:
                 return True

From 81e60581961deed48b0cc8d85158dfd8e08d7dd8 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 17 Sep 2020 07:50:21 -0700
Subject: [PATCH 74/80] minor fixes and code removal

---
 python/cudf/cudf/api/types.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py
index b28ec5fcf4f..b553e9978c3 100644
--- a/python/cudf/cudf/api/types.py
+++ b/python/cudf/cudf/api/types.py
@@ -212,12 +212,7 @@ def is_integer_dtype(obj):
     """
     if hasattr(obj, 'dtype'):
         obj = obj.dtype
-    try:
-        return isinstance(obj, cudf.Integer) or (not isinstance(obj, cudf.Generic) and pd.api.types.is_integer_dtype(obj))
-    except:
-        import pdb
-        pdb.set_trace()
-
+    return isinstance(obj, cudf.Integer) or (not isinstance(obj, cudf.Generic) and pd.api.types.is_integer_dtype(obj))
 
 def is_numeric_dtype(obj):
     """
@@ -328,7 +323,7 @@ def is_categorical_dtype(obj):
 
 def is_list_dtype(obj):
     return (
-        type(obj) is cudf.core.dtypes.ListDtype
+        isinstance(obj, cudf.core.dtypes.ListDtype)
         or obj is cudf.core.dtypes.ListDtype
         or type(obj) is cudf.core.column.ListColumn
         or obj is cudf.core.column.ListColumn

From d7930eb43cad3f386d2aa0417d89fe92be8d309e Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 17 Sep 2020 07:52:02 -0700
Subject: [PATCH 75/80] remove cudf_dtype_from_pydata_dtype

---
 python/cudf/cudf/core/dataframe.py       | 18 ++++++++++--------
 python/cudf/cudf/core/dtypes.py          | 15 +++++++++++----
 python/cudf/cudf/tests/test_dataframe.py |  6 +++---
 python/cudf/cudf/utils/dtypes.py         | 24 ------------------------
 4 files changed, 24 insertions(+), 39 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index e4bb2c6ec69..3372fd35fb7 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -40,7 +40,6 @@
 from cudf.utils import applyutils, ioutils, queryutils, utils
 from cudf.utils.docutils import copy_docstring
 from cudf.utils.dtypes import (
-    cudf_dtype_from_pydata_dtype,
     is_list_like,
     is_scalar,
     numeric_normalize_types,
@@ -6406,8 +6405,12 @@ def select_dtypes(self, include=None, exclude=None):
                 "at least one of include or exclude must be nonempty"
             )
 
+        def cudf_dtype_type(d):
+            res = cudf.dtype(d)
+            return type(res) if isinstance(res, cudf.Generic) else res
+
         include, exclude = map(
-            lambda x: frozenset(map(cudf_dtype_from_pydata_dtype, x)),
+            lambda x: frozenset(map(cudf_dtype_type, x)),
             selection,
         )
 
@@ -6419,28 +6422,27 @@ def select_dtypes(self, include=None, exclude=None):
                 )
             )
         # include all subtypes
-
         include_subtypes = set()
-        for dtype in (d.__class__ for d in self.dtypes):
+        for dtype in (type(d) for d in self.dtypes):
             for i_dtype in include:
                 # category handling
                 if is_categorical_dtype(i_dtype):
                     include_subtypes.add(i_dtype)
-                elif issubclass(dtype, i_dtype):
+                elif isinstance(dtype, i_dtype) or issubclass(dtype, i_dtype):
                     include_subtypes.add(dtype)
 
         # exclude all subtypes
         exclude_subtypes = set()
-        for dtype in (d.__class__ for d in self.dtypes):
+        for dtype in (type(d) for d in self.dtypes):
             for e_dtype in exclude:
                 # category handling
                 if is_categorical_dtype(e_dtype):
                     exclude_subtypes.add(e_dtype)
-                elif issubclass(dtype, e_dtype):
+                elif isinstance(dtype, e_dtype) or issubclass(dtype, e_dtype):
                     exclude_subtypes.add(dtype)
 
         include_all = set(
-            [cudf_dtype_from_pydata_dtype(d) for d in self.dtypes]
+            [cudf_dtype_type(d) for d in self.dtypes]
         )
         if include:
             inclusion = include_all & include_subtypes
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index d2945eb0b22..0a4b69ff896 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -266,13 +266,15 @@ def __call__(self, arg):
 
 def cudf_dtype_from_string(obj):
     if obj == "category":
-        return obj
+        return CategoricalDtype()
     try:
         np_dtype = np.dtype(obj)
         return cudf_dtype_from_numpy(np_dtype)
     except TypeError:
         result = _cudf_dtype_from_string.get(obj, None)
         if not result:
+            import pdb
+            pdb.set_trace()
             raise TypeError(f"Could not find a cuDF dtype matching {obj}")
         return result
 
@@ -282,9 +284,9 @@ def cudf_dtype_from_numpy(obj):
         return StringDtype()
     elif obj is np.number:
         return cudf.Number
-    elif obj is np.datetime64:
+    elif obj in {np.datetime64, np.dtype('datetime64')}:
         return cudf.Datetime
-    elif obj is np.timedelta64:
+    elif obj in {np.timedelta64, np.dtype('timedelta64')}:
         return cudf.Timedelta
     dtype = np.dtype(obj)
     if dtype.type is np.str_:
@@ -310,7 +312,10 @@ def dtype(obj):
     if isinstance(obj, Generic):
         return obj
     elif type(obj) is type and issubclass(obj, Generic):
-        return obj()
+        if obj in cant_construct_dtypes:
+            return obj
+        else:
+            return obj()
     elif isinstance(obj, np.dtype) or (
         isinstance(obj, type) and issubclass(obj, (np.generic, np.dtype))
     ):
@@ -597,3 +602,5 @@ def __repr__(self):
     pd.StringDtype(): StringDtype(),
     pd.BooleanDtype(): BooleanDtype(),
 }
+
+cant_construct_dtypes = {Number, Integer, UnsignedInteger, Floating, Inexact, Timedelta}
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index c2429504764..5267b8af970 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -2635,11 +2635,11 @@ def test_select_dtype_datetime():
     assert_eq(gdf[["timestamp"]], gdf.select_dtypes("datetime64"))
     assert_eq(gdf[["timestamp"]], gdf.select_dtypes(np.dtype("datetime64")))
     assert_eq(gdf[["timestamp"]], gdf.select_dtypes(include="datetime64"))
-    assert_eq(gdf[["timestamp"]], gdf.select_dtypes("datetime64[ms]"))
+    assert_eq(gdf[["timestamp"]], gdf.select_dtypes("datetime64[ns]"))
     assert_eq(
-        gdf[["timestamp"]], gdf.select_dtypes(np.dtype("datetime64[ms]"))
+        gdf[["timestamp"]], gdf.select_dtypes(np.dtype("datetime64[ns]"))
     )
-    assert_eq(gdf[["timestamp"]], gdf.select_dtypes(include="datetime64[ms]"))
+    assert_eq(gdf[["timestamp"]], gdf.select_dtypes(include="datetime64[ns]"))
 
 
 def test_array_ufunc():
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 391b9a800d3..241a0bc0b4a 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -127,30 +127,6 @@ def is_datetime_dtype(obj):
         return False
     return "M8" in obj.str
 
-
-def cudf_dtype_from_pydata_dtype(dtype):
-    """ Given a numpy or pandas dtype, converts it into the equivalent cuDF
-        Python dtype.
-    """
-    if isinstance(dtype, cudf.Generic):
-        return dtype.__class__
-    if inspect.isclass(dtype):
-        if issubclass(dtype, cudf.Generic):
-            return dtype
-    if is_categorical_dtype(dtype):
-        return cudf.core.dtypes.CategoricalDtype
-    elif dtype in cudf._lib.types.np_to_cudf_types:
-        return dtype.type
-    elif np.issubdtype(dtype, np.datetime64):
-        dtype = np.datetime64
-
-    result = cudf.dtype(infer_dtype_from_object(dtype))
-    if isinstance(result, cudf.Generic):
-        return result.__class__
-    elif inspect.isclass(result):
-        return result
-
-
 def is_scalar(val):
     return (
         val is None

From c290a15180fde60275d767e41b2d2ae700a02246 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 17 Sep 2020 07:52:27 -0700
Subject: [PATCH 76/80] update api calls for find_common_type to be numpy-like

---
 python/cudf/cudf/core/scalar.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py
index 0c35833c91e..14830b4ccea 100644
--- a/python/cudf/cudf/core/scalar.py
+++ b/python/cudf/cudf/core/scalar.py
@@ -114,7 +114,7 @@ def _binop_result_dtype_or_error(self, other, op):
         if (self.dtype.kind == "O" or other.dtype.kind == "O") and op != "__add__":
             raise TypeError(f"{op} is not supported for string type scalars")
 
-        return find_common_type([self.dtype, other.dtype])
+        return find_common_type([self.dtype, other.dtype], [])
 
     def _scalar_binop(self, other, op):
         other = to_cudf_compatible_scalar(other)

From e90e3255bbbc6a4014fe19fbe15f25c5d5879442 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 17 Sep 2020 12:33:27 -0700
Subject: [PATCH 77/80] let pandas handle categorical edge cases

---
 python/cudf/cudf/core/column/column.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 3a3446d4b7f..65f5ef82c04 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1409,7 +1409,8 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
     * pandas.Categorical objects
     """
 
-    dtype = cudf.dtype(dtype)
+    if dtype and dtype is not 'category':
+        dtype = cudf.dtype(dtype) 
 
     if isinstance(arbitrary, ColumnBase):
         if dtype is not None:
@@ -1510,6 +1511,8 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
             data = as_column(
                 cupy.asarray(arbitrary), nan_as_null=nan_as_null, dtype=dtype
             )
+        elif dtype is 'category':
+            return as_column(pd.Series(arbitrary, dtype=dtype))
         else:
             data = as_column(
                 pa.array(arbitrary, from_pandas=nan_as_null),

From 3d8ca2f474c5f1818b6010af9333e772422d606c Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 17 Sep 2020 13:34:39 -0700
Subject: [PATCH 78/80] fix categorical creation and casting throughout cudf

---
 python/cudf/cudf/core/column/column.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 65f5ef82c04..b628def8ca1 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1004,10 +1004,10 @@ def distinct_count(self, method="sort", dropna=True):
         return cpp_distinct_count(self, ignore_nulls=dropna)
 
     def astype(self, dtype, **kwargs):
-        dtype = cudf.dtype(dtype)
         if is_categorical_dtype(dtype):
             return self.as_categorical_column(dtype, **kwargs)
-        elif isinstance(dtype, cudf.Datetime):
+        dtype = cudf.dtype(dtype)
+        if isinstance(dtype, cudf.Datetime):
             return self.as_datetime_column(dtype, **kwargs)
         elif isinstance(dtype, cudf.StringDtype):
             return self.as_string_column(dtype, **kwargs)
@@ -1496,6 +1496,8 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
             arbitrary.array, pd.core.arrays.masked.BaseMaskedArray
         ):
             return as_column(arbitrary.array)
+        elif dtype is 'category':
+            return as_column(pd.Series(arbitrary, dtype=dtype))
         if is_categorical_dtype(arbitrary):
             data = as_column(pa.array(arbitrary, from_pandas=True))
         elif arbitrary.dtype == np.bool:
@@ -1511,8 +1513,6 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
             data = as_column(
                 cupy.asarray(arbitrary), nan_as_null=nan_as_null, dtype=dtype
             )
-        elif dtype is 'category':
-            return as_column(pd.Series(arbitrary, dtype=dtype))
         else:
             data = as_column(
                 pa.array(arbitrary, from_pandas=nan_as_null),

From 265338409fee828c043c56f9d9323ee8127ad73e Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 17 Sep 2020 13:34:58 -0700
Subject: [PATCH 79/80] remove old code

---
 python/cudf/cudf/core/column/numerical.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 3072f513f40..8f50c9188d8 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -166,7 +166,6 @@ def as_timedelta_column(self, dtype, **kwargs):
         )
 
     def as_numerical_column(self, dtype, **kwargs):
-        # dtype = np.dtype(dtype)
         # expect a cudf dtype always here
         if dtype == self.dtype:
             return self

From 123784bdf9dbaf78d059054ccb245a017c8888f1 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 17 Sep 2020 14:59:50 -0700
Subject: [PATCH 80/80] continued bugfixes

---
 python/cudf/cudf/core/dtypes.py | 4 ++--
 python/cudf/cudf/core/index.py  | 2 +-
 python/cudf/cudf/core/scalar.py | 3 ++-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 0a4b69ff896..52155e5517f 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -265,8 +265,6 @@ def __call__(self, arg):
 
 
 def cudf_dtype_from_string(obj):
-    if obj == "category":
-        return CategoricalDtype()
     try:
         np_dtype = np.dtype(obj)
         return cudf_dtype_from_numpy(np_dtype)
@@ -579,6 +577,7 @@ def __repr__(self):
     "Float32": Float32Dtype,
     "Float64": Float64Dtype,
     "Boolean": BooleanDtype,
+    "string": StringDtype,
     "String": StringDtype,
     "Datetime64NS": Datetime64NSDtype,
     "Datetime64US": Datetime64USDtype,
@@ -588,6 +587,7 @@ def __repr__(self):
     "Timedelta64US": Timedelta64USDtype,
     "Timedelta64MS": Timedelta64MSDtype,
     "Timedelta64S": Timedelta64SDtype,
+    'category': CategoricalDtype
 }
 
 _cudf_dtype_from_pandas = {
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index d895a2760a1..75c48d5abe6 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -1941,7 +1941,7 @@ def find_label_range(self, first, last):
         if last is not None:
             end = col.find_last_value(last, closest=True)
             end += 1
-        return begin, end
+        return begin.value, end.value
 
     @property
     def is_unique(self):
diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py
index 14830b4ccea..3413f350ce0 100644
--- a/python/cudf/cudf/core/scalar.py
+++ b/python/cudf/cudf/core/scalar.py
@@ -9,7 +9,8 @@ class Scalar(libcudf.scalar.Scalar):
     def __init__(self, value, dtype=None):
         if isinstance(value, libcudf.scalar.Scalar):
             if dtype and not value.dtype == dtype:
-                raise TypeError
+                # TODO should be doable on the device
+                value = libcudf.scalar.Scalar(value.value, dtype=dtype)
             self._data = value
         else:
             self._data = libcudf.scalar.Scalar(value, dtype=dtype)