[WIP] cuDF.dtype objects #6160

Closed

Commits (82 total, all by brandon-b-miller; changes shown from 72 commits):
1c83eac  initial dtype work (Jul 28, 2020)
33bd96c  begin to plumb dtype (Jul 28, 2020)
baf138c  migrate dtypes to cudf main __init__ (Jul 29, 2020)
bdb87fa  numerical column plumbing (Jul 29, 2020)
4a3fe71  update dtype classes, mappings (Jul 29, 2020)
1cf2c3e  start to plumb stringcolumn (Jul 29, 2020)
dbc4970  inherit from basic cython class (Jul 30, 2020)
ba42bd8  plumb numerical column __repr__, default_na_value (Jul 30, 2020)
60272e2  plumb some parts of unary (Jul 30, 2020)
c03be40  make a factory and fix bugs (Jul 30, 2020)
7f6cb36  more progress on columns, dtype object (Jul 31, 2020)
a81c368  forgot string O (Jul 31, 2020)
572c39f  more progress (Jul 31, 2020)
4f6f316  column tests pass (Aug 3, 2020)
ee6ece5  working up through test_array_func (Aug 3, 2020)
62c5e17  more tests pass (Aug 4, 2020)
139465f  merge 0.15 and resolve conflicts (Aug 19, 2020)
ef5b9cb  handle list dtype in _Dtype (Aug 21, 2020)
9320755  fix series syntax error (Aug 21, 2020)
dac2940  add timedelta dtypes (Aug 21, 2020)
6eee9eb  fix some numericalcolumn bugs (Aug 21, 2020)
1ace460  fix index type mapping dicts (Aug 21, 2020)
df6426b  pass all binop tests (Aug 24, 2020)
92d1a64  more progress (Aug 25, 2020)
59b3673  all column tests pass (Aug 26, 2020)
297a31a  move more stuff to cudf.api.types (Aug 26, 2020)
e5def6e  forgot entire api/ folder (Aug 26, 2020)
b4d344f  fix mutable_column_view (Aug 26, 2020)
22fd5d9  working through dataframe.py tests (Aug 26, 2020)
c5a0b62  pass join tests (Aug 27, 2020)
d47de03  fix categorical tests (Aug 27, 2020)
fe180a3  more bugfixes (Aug 27, 2020)
cad48d0  more progress (Aug 28, 2020)
6a1785c  all repr tests pass (Aug 30, 2020)
8552907  all timedelta tests pass (Aug 31, 2020)
2b59285  sorting tests pass (Aug 31, 2020)
b2851a2  fix more tests (Aug 31, 2020)
9540643  hackily pass select_dtype tests (Sep 1, 2020)
781b42e  all dataframe tests pass! (Sep 1, 2020)
13fe291  much more progress (Sep 2, 2020)
3c047ef  fix indexing tests (Sep 2, 2020)
a139571  less than 10 tests still failing (Sep 3, 2020)
40a1699  merge 0.16 and resolve conflicts (Sep 3, 2020)
ea24184  fix bugs (Sep 3, 2020)
ddf340b  fix a few more bugs (Sep 3, 2020)
4a14042  construct from string tests (Sep 3, 2020)
55cec7e  clean up dtypes.py (Sep 3, 2020)
c28c7b6  fixed some bugs (Sep 4, 2020)
bad1dc2  a little iteration on dtypes.py (Sep 4, 2020)
0938507  implement the scalar type attribute (Sep 4, 2020)
e5a489d  cleanup and style (Sep 4, 2020)
62a7d5b  bug fixes and type attribute plumbing/iteration (Sep 9, 2020)
80baff4  fix repr and move around testing utilities (Sep 9, 2020)
38e11af  clean up reduce.pyx (Sep 9, 2020)
22b299d  implement cudf::scalar -> cudf.Scalar -> Buffer, column (Sep 9, 2020)
1552c0a  minor bugfixes (Sep 10, 2020)
78caafa  add __int__ and __float__ to scalar (Sep 10, 2020)
a9fe2fb  partially implement scalar binops (Sep 11, 2020)
455af02  partial tests for scalar binop result dtype (Sep 11, 2020)
e4c0bf1  scalar binop updates (Sep 13, 2020)
42828c0  convert a list of cudf.Scalars into a contiguous column (Sep 14, 2020)
0d3d6a0  migrate scalar methods to python (Sep 14, 2020)
63e1387  actually include scalar.py and update tests (Sep 14, 2020)
2005d65  fix the rest of test_reductions.py (Sep 14, 2020)
523919c  fix indexing error (Sep 14, 2020)
c730301  fix as_scalar (Sep 14, 2020)
c5450c2  remove unecessary code (Sep 14, 2020)
7bc0893  minor bugfixes (Sep 14, 2020)
a3a4893  scalar plumbing, cudf.api.types additions, bug fixes (Sep 15, 2020)
6bf121c  add cudf.api.types.isscalar(element) (Sep 15, 2020)
165f86c  plumbing (Sep 15, 2020)
cec9528  scalars may __round__ (Sep 15, 2020)
a8b380b  to_numpy -> numpy_dtype (Sep 16, 2020)
1dc151a  extra to_numpy -> numpy_dtype that were missed (Sep 16, 2020)
46a9c2f  add docstrings, respond to reviews (Sep 16, 2020)
81e6058  minor fixes and code removal (Sep 17, 2020)
d7930eb  remove cudf_dtype_from_pydata_dtype (Sep 17, 2020)
c290a15  update api calls for find_common_type to be numpy-like (Sep 17, 2020)
e90e325  let pandas handle categorical edge cases (Sep 17, 2020)
3d8ca2f  fix categorical creation and casting throughout cudf (Sep 17, 2020)
2653384  remove old code (Sep 17, 2020)
123784b  continued bugfixes (Sep 17, 2020)
34 changes: 33 additions & 1 deletion python/cudf/cudf/__init__.py
@@ -8,6 +8,7 @@

import rmm

import cudf.api.types
from cudf import core, datasets, testing
from cudf._version import get_versions
from cudf.core import (
@@ -31,8 +32,39 @@
UInt64Index,
from_pandas,
merge,
Scalar
)
from cudf.core.dtypes import (
BooleanDtype,
CategoricalDtype,
Datetime,
Datetime64MSDtype,
Datetime64NSDtype,
Datetime64SDtype,
Datetime64USDtype,
Flexible,
Float32Dtype,
Float64Dtype,
Floating,
Generic,
Int8Dtype,
Int16Dtype,
Int32Dtype,
Int64Dtype,
Integer,
Number,
StringDtype,
Timedelta,
Timedelta64MSDtype,
Timedelta64NSDtype,
Timedelta64SDtype,
Timedelta64USDtype,
UInt8Dtype,
UInt16Dtype,
UInt32Dtype,
UInt64Dtype,
dtype,
)
from cudf.core.dtypes import CategoricalDtype
from cudf.core.groupby import Grouper
from cudf.core.ops import (
add,
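For orientation, a minimal usage sketch of the dtype objects this diff exports; the zero-argument constructors and the string spelling accepted by cudf.dtype() are assumptions, since this WIP diff does not show their signatures:

```python
# Sketch only: exercising the newly exported cudf dtype objects.
import cudf

int_dtype = cudf.Int32Dtype()      # assumed zero-argument constructor
same_dtype = cudf.dtype("int32")   # assumed numpy-style string spelling

# A Series built with the explicit dtype object should report it back.
s = cudf.Series([1, 2, 3], dtype=int_dtype)
print(s.dtype)
```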
18 changes: 7 additions & 11 deletions python/cudf/cudf/_lib/aggregation.pyx
@@ -16,8 +16,10 @@ from cudf._lib.types cimport (
underlying_type_t_interpolation,
underlying_type_t_null_policy,
underlying_type_t_type_id,
_Dtype
)
from cudf._lib.types import Interpolation
from cudf.core.dtypes import dtype as cudf_dtype

try:
# Numba >= 0.49
@@ -241,24 +243,18 @@ cdef class _AggregationFactory:
cdef string cpp_str

# Handling UDF type
nb_type = numpy_support.from_dtype(kwargs['dtype'])
nb_type = numpy_support.from_dtype(kwargs['dtype'].to_numpy)
type_signature = (nb_type[:],)
compiled_op = cudautils.compile_udf(op, type_signature)
output_np_dtype = np.dtype(compiled_op[1])
output_np_dtype = cudf_dtype(np.dtype(compiled_op[1]))
cpp_str = compiled_op[0].encode('UTF-8')
if output_np_dtype not in np_to_cudf_types:
if cudf_dtype(output_np_dtype) not in np_to_cudf_types:
raise TypeError(
"Result of window function has unsupported dtype {}"
.format(op[1])
)
Comment on lines +251 to 255

Collaborator:

Wouldn't we already error in trying to construct the cudf_dtype object here instead of having to check in np_to_cudf_types?

brandon-b-miller (Contributor, Author), Sep 16, 2020:
It shouldn't need to check the dict, that's correct. The dtype should error upon construction. The only question is if this case deserves its own error. To figure that out I think I need to look closely here and figure out what exactly the user could be doing that would cause them to run into this. More to follow here.

tid = (
<libcudf_types.type_id> (
<underlying_type_t_type_id> (
np_to_cudf_types[output_np_dtype]
)
)
)
out_dtype = libcudf_types.data_type(tid)
cdef _Dtype pydtype = output_np_dtype
out_dtype = pydtype.get_libcudf_type()

agg.c_obj = move(libcudf_aggregation.make_udf_aggregation(
libcudf_aggregation.udf_type.PTX, cpp_str, out_dtype
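The edit above shows the central pattern of this PR: the per-call lookup in np_to_cudf_types is replaced by a method on the dtype object itself. A pure-Python sketch of that idea follows; the type ids and class body are placeholders for illustration, not the actual implementation:

```python
# Illustrative only: each dtype object carries its libcudf type id and
# exposes it via get_libcudf_type(), so call sites no longer index the
# np_to_cudf_types dict themselves. Enum values are placeholders.
from enum import IntEnum

import numpy as np


class TypeId(IntEnum):
    INT32 = 0
    FLOAT64 = 1


_NP_TO_TID = {
    np.dtype("int32"): TypeId.INT32,
    np.dtype("float64"): TypeId.FLOAT64,
}


class _Dtype:
    def __init__(self, np_dtype):
        # Mirrors the .to_numpy attribute used in the diff above.
        self.to_numpy = np.dtype(np_dtype)

    def get_libcudf_type(self):
        # The mapping now lives with the dtype, behind one method.
        return _NP_TO_TID[self.to_numpy]


out_dtype = _Dtype("float64").get_libcudf_type()
print(out_dtype)
```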
24 changes: 6 additions & 18 deletions python/cudf/cudf/_lib/binaryop.pyx
@@ -23,10 +23,11 @@ from cudf._lib.cpp.types cimport (
type_id,
)

from cudf.utils.dtypes import is_string_dtype
from cudf.api.types import is_string_dtype

from cudf._lib.cpp.binaryop cimport binary_operator
cimport cudf._lib.cpp.binaryop as cpp_binaryop
from cudf._lib.types cimport _Dtype


class BinaryOperation(IntEnum):
@@ -170,19 +171,13 @@ def binaryop(lhs, rhs, op, dtype):
"""
Dispatches a binary op call to the appropriate libcudf function:
"""
cdef _Dtype py_dtype = dtype
op = BinaryOperation[op.upper()]
cdef binary_operator c_op = <binary_operator> (
<underlying_type_t_binary_operator> op
)
cdef type_id tid = (
<type_id> (
<underlying_type_t_type_id> (
np_to_cudf_types[np.dtype(dtype)]
)
)
)

cdef data_type c_dtype = data_type(tid)
cdef data_type c_dtype = py_dtype.get_libcudf_type()

if isinstance(lhs, Scalar) or np.isscalar(lhs) or lhs is None:

@@ -229,15 +224,8 @@ def binaryop_udf(Column lhs, Column rhs, udf_ptx, dtype):
"""
cdef column_view c_lhs = lhs.view()
cdef column_view c_rhs = rhs.view()

cdef type_id tid = (
<type_id> (
<underlying_type_t_type_id> (
np_to_cudf_types[np.dtype(dtype)]
)
)
)
cdef data_type c_dtype = data_type(tid)
cdef _Dtype pydtype = dtype
cdef data_type c_dtype = pydtype.get_libcudf_type()

cdef string cpp_str = udf_ptx.encode("UTF-8")

29 changes: 7 additions & 22 deletions python/cudf/cudf/_lib/column.pyx
@@ -8,7 +8,7 @@ import rmm
import cudf

from cudf.core.buffer import Buffer
from cudf.utils.dtypes import is_categorical_dtype, is_list_dtype
from cudf.api.types import is_categorical_dtype, is_list_dtype
import cudf._lib as libcudfxx

from cpython.buffer cimport PyObject_CheckBuffer
@@ -41,6 +41,8 @@ from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
from cudf._lib.cpp.scalar.scalar cimport scalar
from cudf._lib.scalar cimport Scalar
cimport cudf._lib.cpp.types as libcudf_types
from cudf._lib.types cimport _Dtype

cimport cudf._lib.cpp.unary as libcudf_unary

cdef class Column:
@@ -316,14 +318,8 @@ cdef class Column:
col = self.base_children[0]
else:
col = self
data_dtype = col.dtype

cdef libcudf_types.type_id tid = <libcudf_types.type_id> (
<underlying_type_t_type_id> (
np_to_cudf_types[np.dtype(data_dtype)]
)
)
cdef libcudf_types.data_type dtype = libcudf_types.data_type(tid)
cdef _Dtype pydtype = col.dtype
cdef libcudf_types.data_type dtype = pydtype.get_libcudf_type()
cdef libcudf_types.size_type offset = self.offset
cdef vector[mutable_column_view] children
cdef void* data
@@ -374,19 +370,8 @@ cdef class Column:
else:
col = self

data_dtype = col.dtype
cdef libcudf_types.type_id tid

if not is_list_dtype(self.dtype):
tid = <libcudf_types.type_id> (
<underlying_type_t_type_id> (
np_to_cudf_types[np.dtype(data_dtype)]
)
)
else:
tid = libcudf_types.type_id.LIST

cdef libcudf_types.data_type dtype = libcudf_types.data_type(tid)
cdef _Dtype pydtype = col.dtype
cdef libcudf_types.data_type dtype = pydtype.get_libcudf_type()
cdef libcudf_types.size_type offset = self.offset
cdef vector[column_view] children
cdef void* data
3 changes: 2 additions & 1 deletion python/cudf/cudf/_lib/copying.pyx
@@ -1,6 +1,7 @@
# Copyright (c) 2020, NVIDIA CORPORATION.

import pandas as pd
from cudf.api.types import is_integer_dtype

from libcpp cimport bool
from libcpp.memory cimport make_unique, unique_ptr
@@ -129,7 +130,7 @@ def copy_range(Column input_column,


def gather(Table source_table, Column gather_map, bool keep_index=True):
assert pd.api.types.is_integer_dtype(gather_map.dtype)
assert is_integer_dtype(gather_map.dtype)

cdef unique_ptr[table] c_result
cdef table_view source_table_view
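A small sketch of the swapped predicate, assuming cudf.api.types.is_integer_dtype behaves like the pandas helper it replaces when given cudf dtypes:

```python
# Sketch: the gather-map dtype check now goes through cudf's own
# type predicate rather than pandas'.
import cudf
from cudf.api.types import is_integer_dtype

gather_map = cudf.Series([0, 2, 1])
assert is_integer_dtype(gather_map.dtype)
```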
6 changes: 6 additions & 0 deletions python/cudf/cudf/_lib/cpp/scalar/scalar.pxd
@@ -23,6 +23,7 @@ cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil:
numeric_scalar(T value, bool is_valid) except +
void set_value(T value) except +
T value() except +
T* data() except +

cdef cppclass timestamp_scalar[T](scalar):
timestamp_scalar() except +
@@ -34,6 +35,8 @@
int64_t ticks_since_epoch_64 "ticks_since_epoch"() except +
int32_t ticks_since_epoch_32 "ticks_since_epoch"() except +
T value() except +
T* data() except +


cdef cppclass duration_scalar[T](scalar):
duration_scalar() except +
@@ -44,10 +47,13 @@
duration_scalar(int32_t value, bool is_valid) except +
int64_t ticks "count"() except +
T value() except +
T* data() except +


cdef cppclass string_scalar(scalar):
string_scalar() except +
string_scalar(string st) except +
string_scalar(string st, bool is_valid) except +
string_scalar(string_scalar other) except +
string to_string() except +
const char* data() except +
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/groupby.pyx
@@ -178,7 +178,7 @@ def _drop_unsupported_aggs(Table values, aggs):
if all(len(v) == 0 for v in aggs.values()):
return aggs

from cudf.utils.dtypes import (
from cudf.api.types import (
is_categorical_dtype,
is_string_dtype,
is_list_dtype
9 changes: 4 additions & 5 deletions python/cudf/cudf/_lib/parquet.pyx
@@ -11,7 +11,8 @@ import json
from cython.operator import dereference
import numpy as np

from cudf.utils.dtypes import np_to_pa_dtype, is_categorical_dtype
from cudf.utils.dtypes import np_to_pa_dtype
from cudf.api.types import is_categorical_dtype
from libc.stdlib cimport free
from libc.stdint cimport uint8_t
from libcpp.memory cimport shared_ptr, unique_ptr, make_unique
@@ -102,7 +103,6 @@ cpdef generate_pandas_metadata(Table table, index):
)
else:
types.append(np_to_pa_dtype(col.dtype))

# Indexes
if index is not False:
for name in table._index.names:
@@ -134,16 +134,15 @@
index_descriptors.append(descr)
else:
col_names.append(name)

metadata_df = table.head(0).to_pandas()
Collaborator:

Is this needed because our duck typing no longer works in this situation?

brandon-b-miller (Contributor, Author), Sep 16, 2020:
Yes, I felt like this was a pretty bad hack. The pyarrow function here writes the metadata by subbing in the names of the dtype objects attached to the dataframe. So if we pass it our frame it writes a parquet file that other readers can't read because they don't understand the dtypes. I suspect there's a better way of doing this, but ideally that solution wouldn't need to expect that the pyarrow function works in a specific way, e.g. an actual API for doing this. We might be able to contribute that to pyarrow directly.

metadata = pa.pandas_compat.construct_metadata(
table,
metadata_df,
col_names,
index_levels,
index_descriptors,
index,
types,
)

md = metadata[b'pandas']
json_str = md.decode("utf-8")
return json_str
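A short sketch of the workaround discussed in the thread above: pyarrow records the dtype names of whatever frame it is handed, so an empty pandas view of the table keeps the written pandas metadata readable by other parquet consumers.

```python
# Sketch of the idea only; the real call passes metadata_df into
# pa.pandas_compat.construct_metadata as shown in the diff above.
import cudf

gdf = cudf.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# head(0) keeps the schema but no rows, and to_pandas() converts the
# cudf dtypes to pandas/numpy ones before pyarrow serialises their names.
metadata_df = gdf.head(0).to_pandas()
print(metadata_df.dtypes)
```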
24 changes: 10 additions & 14 deletions python/cudf/cudf/_lib/reduce.pyx
@@ -8,11 +8,14 @@ from cudf._lib.cpp.column.column cimport column
from cudf._lib.scalar cimport Scalar
from cudf._lib.column cimport Column
from cudf._lib.types import np_to_cudf_types
from cudf._lib.types cimport underlying_type_t_type_id
from cudf._lib.types cimport underlying_type_t_type_id, _Dtype
from cudf._lib.move cimport move
from cudf._lib.aggregation cimport make_aggregation, aggregation
from libcpp.memory cimport unique_ptr
import numpy as np
from cudf.core.dtypes import dtype as cudf_dtype
from cudf.api.types import find_common_type
from cudf.core.scalar import Scalar as PyScalar


def reduce(reduction_op, Column incol, dtype=None, **kwargs):
Expand All @@ -29,26 +32,19 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs):
A numpy data type to use for the output, defaults
to the same type as the input column
"""

dtype = cudf_dtype(dtype)
col_dtype = incol.dtype
if reduction_op in ['sum', 'sum_of_squares', 'product']:
col_dtype = np.find_common_type([col_dtype], [np.uint64])
col_dtype = find_common_type([col_dtype], [np.uint64])
col_dtype = col_dtype if dtype is None else dtype

cdef column_view c_incol_view = incol.view()
cdef unique_ptr[scalar] c_result
cdef unique_ptr[aggregation] c_agg = move(make_aggregation(
reduction_op, kwargs
))
cdef type_id tid = (
<type_id> (
<underlying_type_t_type_id> (
np_to_cudf_types[np.dtype(col_dtype)]
)
)
)

cdef data_type c_out_dtype = data_type(tid)
cdef _Dtype data_dtype = col_dtype
cdef data_type c_out_dtype = data_dtype.get_libcudf_type()

# check empty case
if len(incol) <= incol.null_count:
Expand All @@ -65,8 +61,8 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs):
c_out_dtype
))

py_result = Scalar.from_unique_ptr(move(c_result))
return py_result.value
cy_result = Scalar.from_unique_ptr(move(c_result))
return PyScalar(cy_result)


def scan(scan_op, Column incol, inclusive, **kwargs):
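The user-visible change in reduce() is the return type: the Cython-level Scalar is now wrapped in the Python-level cudf Scalar instead of being eagerly converted to a host value. A sketch of where a caller might notice this; how the frontend unwraps the result is an assumption, not shown in this diff:

```python
# Sketch only: user-facing reductions route through libcudf reduce internally.
import cudf

s = cudf.Series([1, 2, 3, 4])
total = s.sum()
# With this change the low-level reduce hands back a cudf Scalar object,
# so device-to-host transfer can be deferred until the value is needed.
print(total)
```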
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/scalar.pxd
@@ -4,6 +4,7 @@ from libcpp.memory cimport unique_ptr
from libcpp cimport bool

from cudf._lib.cpp.scalar.scalar cimport scalar
from libc.stdint cimport uintptr_t
Collaborator:
Why is this needed here? It doesn't look to be used in this header.



cdef class Scalar: