From 1c83eacc21fb0c0910d2c26861bb7dda3bbb6462 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 28 Jul 2020 06:47:01 -0700 Subject: [PATCH 01/80] initial dtype work --- python/cudf/cudf/core/dtypes.py | 172 ++++++++++++++++++++++++++++++++ python/cudf/cudf/core/series.py | 4 + 2 files changed, 176 insertions(+) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 8d313b19707..09eef0b1790 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -9,6 +9,178 @@ import cudf +pa_to_pd_dtypes = { + pa.uint8(): pd.UInt8Dtype(), + pa.uint16(): pd.UInt16Dtype(), + pa.uint32(): pd.UInt32Dtype(), + pa.uint64(): pd.UInt64Dtype(), + pa.int8(): pd.Int8Dtype(), + pa.int16(): pd.Int16Dtype(), + pa.int32(): pd.Int32Dtype(), + pa.int64(): pd.Int64Dtype(), + pa.bool_(): pd.BooleanDtype(), + pa.string(): pd.StringDtype(), + pa.float32(): np.float32(), + pa.float64(): np.float64(), + pa.timestamp('ns'): np.dtype('datetime64[ns]'), + pa.timestamp('us'): np.dtype('datetime64[us]'), + pa.timestamp('ms'): np.dtype('datetime64[ms]'), + pa.timestamp('s'): np.dtype('datetime64[s]'), +} + +pa_to_np_dtypes = { + pa.uint8(): np.dtype('uint8'), + pa.uint16(): np.dtype('uint16'), + pa.uint32(): np.dtype('uint32'), + pa.uint64(): np.dtype('uint64'), + pa.int8(): np.dtype('int8'), + pa.int16(): np.dtype('int16'), + pa.int32(): np.dtype('int32'), + pa.int64(): np.dtype('int64'), + pa.bool_(): np.dtype('bool'), + pa.string(): np.dtype('object'), + pa.float32(): np.dtype('float32'), + pa.float64(): np.dtype('float64'), + pa.timestamp('ns'): np.dtype('datetime64[ns]'), + pa.timestamp('us'): np.dtype('datetime64[us]'), + pa.timestamp('ms'): np.dtype('datetime64[ms]'), + pa.timestamp('s'): np.dtype('datetime64[s]'), +} + +class Dtype(ExtensionDtype): + def __init__(self, arg): + cudf_dtype = make_dtype_from_obj(arg) + cudf_dtype.__init__(self) + + def __eq__(self, other): + if isinstance(other, self.to_pandas.__class__) or other is self.to_pandas.__class__: + return True + if self.to_numpy == other: + return True + raise NotImplementedError + + @property + def to_numpy(self): + return pa_to_np_dtypes[self.pa_type] + + @property + def to_pandas(self): + return pa_to_pd_dtypes[self.pa_type] + + @property + def type(self): + return self.pandas_dtype().type + +class UInt8Dtype(Dtype): + def __init__(self): + self.pa_type = pa.uint8() + +class UInt16Dtype(Dtype): + def __init__(self): + self.pa_type = pa.uint16() + +class UInt32Dtype(Dtype): + def __init__(self): + self.pa_type = pa.uint32() + +class UInt64Dtype(Dtype): + def __init__(self): + self.pa_type = pa.uint64() + +class Int8Dtype(Dtype): + def __init__(self): + self.pa_type = pa.int8() + +class Int16Dtype(Dtype): + def __init__(self): + self.pa_type = pa.int16() + +class Int32Dtype(Dtype): + def __init__(self): + self.pa_type = pa.int32() + +class Int64Dtype(Dtype): + def __init__(self): + self.pa_type = pa.int64() + +class Float32Dtype(Dtype): + def __init__(self): + self.pa_type = pa.float32() + +class Float64Dtype(Dtype): + def __init__(self): + self.pa_type = pa.float64() + +class BooleanDtype(Dtype): + def __init__(self): + self.pa_type = pa.bool() + +class Datetime64NSDtype(Dtype): + def __init__(self): + self.pa_type = pa.timestamp('ns') + +class Datetime64USDtype(Dtype): + def __init__(self): + self.pa_type = pa.timestamp('us') + +class Datetime64MSDtype(Dtype): + def __init__(self): + self.pa_type = pa.timestamp('ms') + +class Datetime64SDtype(Dtype): + def __init__(self): + self.pa_type = pa.timestamp('s') + +class StringDtype(Dtype): + def __init__(self): + self.pa_type = pa.string() + +def make_dtype_from_string(obj): + if obj in {'str', 'string', 'object'}: + return StringDtype + elif 'datetime' in obj: + if obj == 'datetime64[ns]': + return Datetime64NSDtype + elif obj == 'datetime64[us]': + return Datetime64USDtype + elif obj == 'datetime64[ms]': + return Datetime64MSDtype + elif obj == 'datetime64[s]': + return Datetime64SDtype + elif 'int' in obj or 'Int' in obj: + if obj in {'int', 'Int', 'int64', 'Int64'}: + return Int64Dtype + elif obj in {'int32', 'Int32'}: + return Int32Dtype + elif obj in {'int16', 'Int16'}: + return Int16Dtype + elif obj in {'int8', 'Int8'}: + return Int8Dtype + elif obj in {'uint64', 'UInt64'}: + return UInt64Dtype + elif obj in {'uint32', 'UInt32'}: + return UInt32Dtype + elif obj in {'uint16', 'UInt16'}: + return UInt16Dtype + elif obj in {'uint8', 'Uint8'}: + return UInt8Dtype + elif 'float' in obj: + if obj in {'float64', 'Float64'}: + return Float64Dtype + elif obj in {'float32', 'Float32'}: + return Float32Dtype + elif 'bool' in obj: + return BooleanDtype + +def make_dtype_from_numpy(obj): + np_to_pd_types = {v: k for k, v in pd_to_np_dtypes.items()} + result = np_to_pd_types.get(obj) + +def make_dtype_from_obj(obj): + if isinstance(obj, np.dtype): + return make_dtype_from_numpy(obj) + elif isinstance(obj, str): + return make_dtype_from_string(obj) class CategoricalDtype(ExtensionDtype): def __init__(self, categories=None, ordered=None): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 5e01e110c28..277772331b9 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -10,6 +10,8 @@ from pandas._config import get_option from pandas.api.types import is_dict_like +from cudf.core.dtypes import Dtype + import cudf from cudf import _lib as libcudf from cudf._lib.nvtx import annotate @@ -143,6 +145,8 @@ def __init__( ``null`` values. If ``False``, leaves ``np.nan`` values as is. """ + if dtype: + dtype = Dtype(dtype) if isinstance(data, pd.Series): if name is None: name = data.name From 33bd96c83675f02c06a3bb22be59356006af2be4 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 28 Jul 2020 13:43:16 -0700 Subject: [PATCH 02/80] begin to plumb dtype --- python/cudf/cudf/core/column/column.py | 18 +++++------------- python/cudf/cudf/core/dtypes.py | 7 +++++++ python/cudf/cudf/core/series.py | 3 +-- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 4b68e4a1159..b73a88b7ee2 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1639,23 +1639,15 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): memoryview(arbitrary), dtype=dtype, nan_as_null=nan_as_null ) except TypeError: - pa_type = None - np_type = None try: if dtype is not None: dtype = pd.api.types.pandas_dtype(dtype) - if is_categorical_dtype(dtype): + if dtype.is_categorical_dtype: raise TypeError - else: - np_type = np.dtype(dtype).type - if np_type == np.bool_: - pa_type = pa.bool_() - else: - pa_type = np_to_pa_dtype(np.dtype(dtype)) data = as_column( pa.array( arbitrary, - type=pa_type, + type=dtype.pa_type, from_pandas=True if nan_as_null is None else nan_as_null, @@ -1664,14 +1656,14 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): nan_as_null=nan_as_null, ) except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError): - if is_categorical_dtype(dtype): + if dtype.is_categorical_dtype: sr = pd.Series(arbitrary, dtype="category") data = as_column(sr, nan_as_null=nan_as_null, dtype=dtype) - elif np_type == np.str_: + elif dtype.to_numpy == np.str_: sr = pd.Series(arbitrary, dtype="str") data = as_column(sr, nan_as_null=nan_as_null) else: - native_dtype = dtype + native_dtype = dtype.to_numpy if dtype is None and pd.api.types.infer_dtype( arbitrary ) in ("mixed", "mixed-integer"): diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 09eef0b1790..fbaa760c0b5 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -45,9 +45,13 @@ pa.timestamp('us'): np.dtype('datetime64[us]'), pa.timestamp('ms'): np.dtype('datetime64[ms]'), pa.timestamp('s'): np.dtype('datetime64[s]'), + None: None } class Dtype(ExtensionDtype): + + is_categorical_dtype = False + pa_type = None def __init__(self, arg): cudf_dtype = make_dtype_from_obj(arg) cudf_dtype.__init__(self) @@ -183,6 +187,9 @@ def make_dtype_from_obj(obj): return make_dtype_from_string(obj) class CategoricalDtype(ExtensionDtype): + + is_categorical_dtype = True + def __init__(self, categories=None, ordered=None): """ dtype similar to pd.CategoricalDtype with the categories diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 277772331b9..f4a026996ff 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -145,8 +145,7 @@ def __init__( ``null`` values. If ``False``, leaves ``np.nan`` values as is. """ - if dtype: - dtype = Dtype(dtype) + dtype = Dtype(dtype) if isinstance(data, pd.Series): if name is None: name = data.name From baf138c99789ce82684a26286625f6b9fadbf924 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 29 Jul 2020 06:42:19 -0700 Subject: [PATCH 03/80] migrate dtypes to cudf main __init__ --- python/cudf/cudf/__init__.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index d953f517e4a..38b31e5e7b7 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -31,7 +31,27 @@ from_pandas, merge, ) -from cudf.core.dtypes import CategoricalDtype +from cudf.core.dtypes import ( + Dtype, + CategoricalDtype, + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, + StringDtype, + Float32Dtype, + Float64Dtype, + BooleanDtype, + Datetime64NSDtype, + Datetime64USDtype, + Datetime64MSDtype, + Datetime64SDtype +) + from cudf.core.groupby import Grouper from cudf.core.ops import ( arccos, From bdb87fa9c887e8b43af86ac53a52ee81683ffd1a Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 29 Jul 2020 11:02:44 -0700 Subject: [PATCH 04/80] numerical column plumbing --- python/cudf/cudf/core/column/column.py | 25 ++++++++--------------- python/cudf/cudf/core/column/numerical.py | 4 ++-- 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index b73a88b7ee2..01284c11062 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -878,11 +878,11 @@ def distinct_count(self, method="sort", dropna=True): return cpp_distinct_count(self, ignore_nulls=dropna) def astype(self, dtype, **kwargs): - if is_categorical_dtype(dtype): + if dtype.is_categorical: return self.as_categorical_column(dtype, **kwargs) - elif np.issubdtype(dtype, np.datetime64): + elif dtype.is_datetime: return self.as_datetime_column(dtype, **kwargs) - elif pd.api.types.pandas_dtype(dtype).type in (np.str_, np.object_): + elif dtype.is_string: return self.as_string_column(dtype, **kwargs) else: return self.as_numerical_column(dtype, **kwargs) @@ -1447,7 +1447,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): ) data = cudf.core.column.NumericalColumn( data=padata, - dtype=np.dtype(arbitrary.type.to_pandas_dtype()), + dtype=dtype, mask=pamask, size=pa_size, offset=pa_offset, @@ -1642,19 +1642,12 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): try: if dtype is not None: dtype = pd.api.types.pandas_dtype(dtype) - if dtype.is_categorical_dtype: + if dtype.is_categorical: raise TypeError - data = as_column( - pa.array( - arbitrary, - type=dtype.pa_type, - from_pandas=True - if nan_as_null is None - else nan_as_null, - ), - dtype=dtype, - nan_as_null=nan_as_null, - ) + + pa_data = pa.array(arbitrary, type=dtype.pa_type, from_pandas=True if nan_as_null is None else nan_as_null) + data = as_column(pa_data, dtype=cudf.Dtype(pa_data.type), nan_as_null=nan_as_null) + except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError): if dtype.is_categorical_dtype: sr = pd.Series(arbitrary, dtype="category") diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 27281111993..02494d7617c 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -32,7 +32,7 @@ def __init__( The dtype associated with the data Buffer mask : Buffer, optional """ - dtype = np.dtype(dtype) + dtype = cudf.Dtype(dtype) if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") if size is None: @@ -139,7 +139,7 @@ def as_string_column(self, dtype, **kwargs): if len(self) > 0: return string._numeric_to_str_typecast_functions[ - np.dtype(self.dtype) + dtype.to_numpy ](self, **kwargs) else: return as_column([], dtype="object") From 4a3fe713e9d1ee58aa4ceb6c34c1c8694a027400 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 29 Jul 2020 11:03:36 -0700 Subject: [PATCH 05/80] update dtype classes, mappings --- python/cudf/cudf/core/dtypes.py | 110 +++++++++++++++++++++++++++----- 1 file changed, 93 insertions(+), 17 deletions(-) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index fbaa760c0b5..4fad10aaaee 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -49,10 +49,17 @@ } class Dtype(ExtensionDtype): + is_integer = False + is_string = False + is_boolean = False + is_categorical = False + is_datetime = False + is_list = False + is_float = False - is_categorical_dtype = False pa_type = None def __init__(self, arg): + cudf_dtype = make_dtype_from_obj(arg) cudf_dtype.__init__(self) @@ -71,71 +78,93 @@ def to_numpy(self): def to_pandas(self): return pa_to_pd_dtypes[self.pa_type] + @property + def itemsize(self): + return self.to_numpy.itemsize + @property def type(self): return self.pandas_dtype().type -class UInt8Dtype(Dtype): + def __repr__(self): + return self.pa_type.__repr__() + + def __hash__(self): + return hash(self.__repr__()) + +class IntDtype(Dtype): + is_integer = True + +class UInt8Dtype(IntDtype): def __init__(self): self.pa_type = pa.uint8() -class UInt16Dtype(Dtype): +class UInt16Dtype(IntDtype): def __init__(self): self.pa_type = pa.uint16() -class UInt32Dtype(Dtype): +class UInt32Dtype(IntDtype): def __init__(self): self.pa_type = pa.uint32() -class UInt64Dtype(Dtype): +class UInt64Dtype(IntDtype): def __init__(self): self.pa_type = pa.uint64() -class Int8Dtype(Dtype): +class Int8Dtype(IntDtype): def __init__(self): self.pa_type = pa.int8() -class Int16Dtype(Dtype): +class Int16Dtype(IntDtype): def __init__(self): self.pa_type = pa.int16() -class Int32Dtype(Dtype): +class Int32Dtype(IntDtype): def __init__(self): self.pa_type = pa.int32() -class Int64Dtype(Dtype): +class Int64Dtype(IntDtype): def __init__(self): self.pa_type = pa.int64() -class Float32Dtype(Dtype): + +class FloatDtype(Dtype): + is_float = True + +class Float32Dtype(FloatDtype): def __init__(self): self.pa_type = pa.float32() -class Float64Dtype(Dtype): +class Float64Dtype(FloatDtype): def __init__(self): self.pa_type = pa.float64() class BooleanDtype(Dtype): + is_boolean = True def __init__(self): - self.pa_type = pa.bool() + self.pa_type = pa.bool_() -class Datetime64NSDtype(Dtype): +class DatetimeDtype(Dtype): + is_datetime = True + +class Datetime64NSDtype(DatetimeDtype): def __init__(self): self.pa_type = pa.timestamp('ns') -class Datetime64USDtype(Dtype): +class Datetime64USDtype(DatetimeDtype): def __init__(self): self.pa_type = pa.timestamp('us') -class Datetime64MSDtype(Dtype): +class Datetime64MSDtype(DatetimeDtype): def __init__(self): self.pa_type = pa.timestamp('ms') -class Datetime64SDtype(Dtype): +class Datetime64SDtype(DatetimeDtype): def __init__(self): self.pa_type = pa.timestamp('s') class StringDtype(Dtype): + is_string = True def __init__(self): self.pa_type = pa.string() @@ -176,13 +205,20 @@ def make_dtype_from_string(obj): elif 'bool' in obj: return BooleanDtype + + def make_dtype_from_numpy(obj): np_to_pd_types = {v: k for k, v in pd_to_np_dtypes.items()} result = np_to_pd_types.get(obj) + return result def make_dtype_from_obj(obj): + if isinstance(obj, Dtype): + return np_to_cudf_dtypes[obj.to_numpy] if isinstance(obj, np.dtype): - return make_dtype_from_numpy(obj) + return np_to_cudf_dtypes[obj] + elif isinstance(obj, pa.lib.DataType): + return pa_to_cudf_dtypes[obj] elif isinstance(obj, str): return make_dtype_from_string(obj) @@ -346,3 +382,43 @@ def __repr__(self): return f"ListDtype({self.element_type.__repr__()})" else: return f"ListDtype({self.element_type})" + + +pa_to_cudf_dtypes = { + pa.uint8(): UInt8Dtype, + pa.uint16(): UInt16Dtype, + pa.uint32(): UInt32Dtype, + pa.uint64(): UInt64Dtype, + pa.int8(): Int8Dtype, + pa.int16(): Int16Dtype, + pa.int32(): Int32Dtype, + pa.int64(): Int64Dtype, + pa.bool_(): BooleanDtype, + pa.string(): StringDtype, + pa.float32(): Float32Dtype, + pa.float64(): Float64Dtype, + pa.timestamp('ns'): Datetime64NSDtype, + pa.timestamp('us'): Datetime64USDtype, + pa.timestamp('ms'): Datetime64MSDtype, + pa.timestamp('s'): Datetime64SDtype, + None: Dtype +} + +np_to_cudf_dtypes = { + np.dtype('int8'): Int8Dtype, + np.dtype('int16'): Int16Dtype, + np.dtype('int32'): Int32Dtype, + np.dtype('int64'): Int64Dtype, + np.dtype('uint8'): UInt8Dtype, + np.dtype('uint16'): UInt16Dtype, + np.dtype('uint32'): UInt32Dtype, + np.dtype('uint64'): UInt64Dtype, + np.dtype('bool'): BooleanDtype, + np.dtype('object'): StringDtype, + np.dtype('float32'): Float32Dtype, + np.dtype('float64'): Float64Dtype, + np.dtype('datetime64[ns]'): Datetime64NSDtype, + np.dtype('datetime64[us]'): Datetime64USDtype, + np.dtype('datetime64[ms]'): Datetime64MSDtype, + np.dtype('datetime64[s]'): Datetime64SDtype, +} From 1cf2c3ef58e805eaa55850520cbb5ed96849025a Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 29 Jul 2020 11:04:00 -0700 Subject: [PATCH 06/80] start to plumb stringcolumn --- python/cudf/cudf/core/column/string.py | 31 +++++++++++++------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 987c23c8139..ceaf6b4ff3e 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -134,25 +134,26 @@ is_scalar, is_string_dtype, ) +from cudf.core.dtypes import Dtype _str_to_numeric_typecast_functions = { - np.dtype("int8"): str_cast.stoi8, - np.dtype("int16"): str_cast.stoi16, - np.dtype("int32"): str_cast.stoi, - np.dtype("int64"): str_cast.stol, - np.dtype("uint8"): str_cast.stoui8, - np.dtype("uint16"): str_cast.stoui16, - np.dtype("uint32"): str_cast.stoui, - np.dtype("uint64"): str_cast.stoul, - np.dtype("float32"): str_cast.stof, - np.dtype("float64"): str_cast.stod, - np.dtype("bool"): str_cast.to_booleans, + Dtype("int8"): str_cast.stoi8, + Dtype("int16"): str_cast.stoi16, + Dtype("int32"): str_cast.stoi, + Dtype("int64"): str_cast.stol, + Dtype("uint8"): str_cast.stoui8, + Dtype("uint16"): str_cast.stoui16, + Dtype("uint32"): str_cast.stoui, + Dtype("uint64"): str_cast.stoul, + Dtype("float32"): str_cast.stof, + Dtype("float64"): str_cast.stod, + Dtype("bool"): str_cast.to_booleans, # TODO: support Date32 UNIX days # np.dtype("datetime64[D]"): str_cast.timestamp2int, - np.dtype("datetime64[s]"): str_cast.timestamp2int, - np.dtype("datetime64[ms]"): str_cast.timestamp2int, - np.dtype("datetime64[us]"): str_cast.timestamp2int, - np.dtype("datetime64[ns]"): str_cast.timestamp2int, + Dtype("datetime64[s]"): str_cast.timestamp2int, + Dtype("datetime64[ms]"): str_cast.timestamp2int, + Dtype("datetime64[us]"): str_cast.timestamp2int, + Dtype("datetime64[ns]"): str_cast.timestamp2int, } _numeric_to_str_typecast_functions = { From dbc4970057ca03a7de370f9a5882a9707e064e36 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 30 Jul 2020 07:22:36 -0700 Subject: [PATCH 07/80] inherit from basic cython class --- python/cudf/cudf/_lib/column.pyx | 10 +++------- python/cudf/cudf/_lib/types.pxd | 4 ++++ python/cudf/cudf/_lib/types.pyx | 13 +++++++++++++ python/cudf/cudf/core/dtypes.py | 4 ++-- 4 files changed, 22 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index d061d7065de..5b48d92f1b6 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -27,6 +27,7 @@ from cudf._lib.move cimport move from cudf._lib.cpp.column.column cimport column, column_contents from cudf._lib.cpp.column.column_view cimport column_view cimport cudf._lib.cpp.types as libcudf_types +from cudf._lib.types cimport _Dtype cdef class Column: @@ -352,13 +353,8 @@ cdef class Column: col = self.base_children[0] else: col = self - data_dtype = col.dtype - cdef libcudf_types.type_id tid = ( - ( - np_to_cudf_types[np.dtype(data_dtype)] - ) - ) - cdef libcudf_types.data_type dtype = libcudf_types.data_type(tid) + cdef _Dtype data_dtype = col.dtype + cdef libcudf_types.data_type dtype = data_dtype.get_libcudf_type() cdef libcudf_types.size_type offset = self.offset cdef vector[column_view] children cdef void* data diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd index 923fbe0aa7c..f6c0c39174a 100644 --- a/python/cudf/cudf/_lib/types.pxd +++ b/python/cudf/cudf/_lib/types.pxd @@ -2,6 +2,7 @@ from libc.stdint cimport int32_t from libcpp cimport bool +from cudf._lib.cpp.types cimport data_type ctypedef bool underlying_type_t_order ctypedef bool underlying_type_t_null_order @@ -9,3 +10,6 @@ ctypedef bool underlying_type_t_sorted ctypedef int32_t underlying_type_t_interpolation ctypedef int32_t underlying_type_t_type_id ctypedef bool underlying_type_t_null_policy + +cdef class _Dtype: + cdef data_type get_libcudf_type(self) diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index f5351d12b03..fc8d4fada22 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -11,6 +11,7 @@ from cudf._lib.types cimport ( underlying_type_t_interpolation ) cimport cudf._lib.cpp.types as libcudf_types +from cudf._lib.cpp.types cimport data_type class TypeId(IntEnum): @@ -119,3 +120,15 @@ class NullOrder(IntEnum): class NullHandling(IntEnum): INCLUDE = libcudf_types.null_policy.INCLUDE EXCLUDE = libcudf_types.null_policy.EXCLUDE + + +cdef class _Dtype: + cdef data_type get_libcudf_type(self): + np_dtype = self.to_numpy + cdef libcudf_types.type_id tid = ( + ( + np_to_cudf_types[np_dtype] + ) + ) + cdef data_type libcudf_type = libcudf_types.data_type(tid) + return libcudf_type diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 4fad10aaaee..b610f29b4b7 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -6,7 +6,7 @@ import pandas as pd import pyarrow as pa from pandas.api.extensions import ExtensionDtype - +from cudf._lib.types import _Dtype import cudf pa_to_pd_dtypes = { @@ -48,7 +48,7 @@ None: None } -class Dtype(ExtensionDtype): +class Dtype(ExtensionDtype, _Dtype): is_integer = False is_string = False is_boolean = False From ba42bd8140d8390b53bd56253f42b316bc98d28f Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 30 Jul 2020 08:11:32 -0700 Subject: [PATCH 08/80] plumb numerical column __repr__, default_na_value --- python/cudf/cudf/_lib/column.pyx | 2 +- python/cudf/cudf/core/column/column.py | 12 +++++----- python/cudf/cudf/core/column/numerical.py | 4 ++-- python/cudf/cudf/core/dtypes.py | 29 +++++++++++++++++++++-- 4 files changed, 36 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 5b48d92f1b6..7cf3549ed1c 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -349,7 +349,7 @@ cdef class Column: return self._view(c_null_count) cdef column_view _view(self, libcudf_types.size_type null_count) except *: - if is_categorical_dtype(self.dtype): + if self.dtype.is_categorical: col = self.base_children[0] else: col = self diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 01284c11062..baa8a329847 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -81,10 +81,10 @@ def data_array_view(self): """ View the data as a device array object """ - if self.dtype == "object": + if self.dtype.is_string: raise ValueError("Cannot get an array view of a StringColumn") - if is_categorical_dtype(self.dtype): + if self.dtype.is_categorical: return self.codes.data_array_view else: dtype = self.dtype @@ -95,7 +95,7 @@ def data_array_view(self): result = cuda.devicearray.DeviceNDArray( shape=(result.nbytes // dtype.itemsize,), strides=(dtype.itemsize,), - dtype=dtype, + dtype=dtype.to_numpy, gpu_data=result.gpu_data, ) return result @@ -1320,11 +1320,11 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): if isinstance(col, cudf.core.column.CategoricalColumn): return col - elif np.issubdtype(col.dtype, np.floating): + elif col.dtype.is_float: if nan_as_null or (mask is None and nan_as_null is None): mask = libcudf.transform.nans_to_nulls(col.fillna(np.nan)) col = col.set_mask(mask) - elif np.issubdtype(col.dtype, np.datetime64): + elif col.dtype.is_datetime: if nan_as_null or (mask is None and nan_as_null is None): col = utils.time_col_replace_nulls(col) return col @@ -1602,7 +1602,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): arb_dtype = np.dtype("O") else: arb_dtype = check_cast_unsupported_dtype(arbitrary.dtype) - if arb_dtype != arbitrary.dtype.numpy_dtype: + if arb_dtype != arbitrary.dtype.to_numpy: arbitrary = arbitrary.astype(arb_dtype) if arb_dtype.kind in ("O", "U"): data = as_column(pa.Array.from_pandas(arbitrary), dtype=arb_dtype) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 02494d7617c..8d45af88626 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -266,9 +266,9 @@ def default_na_value(self): if dkind == "f": return self.dtype.type(np.nan) elif dkind == "i": - return np.iinfo(self.dtype).min + return np.iinfo(self.dtype.to_numpy).min elif dkind == "u": - return np.iinfo(self.dtype).max + return np.iinfo(self.dtype.to_numpy).max elif dkind == "b": return self.dtype.type(False) else: diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index b610f29b4b7..fe4710a2de9 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -84,7 +84,15 @@ def itemsize(self): @property def type(self): - return self.pandas_dtype().type + return self.to_pandas.type + + @property + def kind(self): + return self.to_pandas.kind + + @property + def name(self): + return self._name def __repr__(self): return self.pa_type.__repr__() @@ -92,41 +100,50 @@ def __repr__(self): def __hash__(self): return hash(self.__repr__()) + + class IntDtype(Dtype): is_integer = True class UInt8Dtype(IntDtype): def __init__(self): self.pa_type = pa.uint8() + self._name = "UInt8" class UInt16Dtype(IntDtype): def __init__(self): self.pa_type = pa.uint16() + self._name = "UInt16" class UInt32Dtype(IntDtype): def __init__(self): self.pa_type = pa.uint32() + self._name = "UInt32" class UInt64Dtype(IntDtype): def __init__(self): self.pa_type = pa.uint64() + self._name = "UInt64" class Int8Dtype(IntDtype): def __init__(self): self.pa_type = pa.int8() + self._name = "Int8" class Int16Dtype(IntDtype): def __init__(self): self.pa_type = pa.int16() + self._name = "Int16" class Int32Dtype(IntDtype): def __init__(self): self.pa_type = pa.int32() + self._name = "Int32" class Int64Dtype(IntDtype): def __init__(self): self.pa_type = pa.int64() - + self._name = "Int64" class FloatDtype(Dtype): is_float = True @@ -134,15 +151,18 @@ class FloatDtype(Dtype): class Float32Dtype(FloatDtype): def __init__(self): self.pa_type = pa.float32() + self._name = "Float32" class Float64Dtype(FloatDtype): def __init__(self): self.pa_type = pa.float64() + self._name = "Float64" class BooleanDtype(Dtype): is_boolean = True def __init__(self): self.pa_type = pa.bool_() + self._name = "Boolean" class DatetimeDtype(Dtype): is_datetime = True @@ -150,23 +170,28 @@ class DatetimeDtype(Dtype): class Datetime64NSDtype(DatetimeDtype): def __init__(self): self.pa_type = pa.timestamp('ns') + self._name = "Datetime64NS" class Datetime64USDtype(DatetimeDtype): def __init__(self): self.pa_type = pa.timestamp('us') + self._name = "Datetime64US" class Datetime64MSDtype(DatetimeDtype): def __init__(self): self.pa_type = pa.timestamp('ms') + self._name = "Datetime64MS" class Datetime64SDtype(DatetimeDtype): def __init__(self): self.pa_type = pa.timestamp('s') + self._name = "Datetime64S" class StringDtype(Dtype): is_string = True def __init__(self): self.pa_type = pa.string() + self._name = "String" def make_dtype_from_string(obj): if obj in {'str', 'string', 'object'}: From 60272e2a8578ab02b4c7221b84652d717bf14881 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 30 Jul 2020 15:04:35 -0700 Subject: [PATCH 09/80] plumb some parts of unary --- python/cudf/cudf/_lib/unary.pyx | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/_lib/unary.pyx b/python/cudf/cudf/_lib/unary.pyx index 2511556656e..11f7524f934 100644 --- a/python/cudf/cudf/_lib/unary.pyx +++ b/python/cudf/cudf/_lib/unary.pyx @@ -6,12 +6,15 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr import numpy as np + +from cudf.core.dtypes import Float64Dtype from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport ( column_view, mutable_column_view ) from cudf._lib.types import np_to_cudf_types +from cudf._lib.types cimport _Dtype from cudf._lib.cpp.types cimport ( size_type, data_type, @@ -90,16 +93,11 @@ def is_valid(Column input): return Column.from_unique_ptr(move(c_result)) -def cast(Column input, object dtype=np.float64): +def cast(Column input, object dtype=Float64Dtype()): cdef column_view c_input = input.view() - cdef type_id tid = ( - ( - ( - np_to_cudf_types[np.dtype(dtype)] - ) - ) - ) - cdef data_type c_dtype = data_type(tid) + cdef _Dtype data_dtype = dtype + + cdef data_type c_dtype = data_dtype.get_libcudf_type() cdef unique_ptr[column] c_result with nogil: From c03be406e5cfea22cc7847c53b378096eac35415 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 30 Jul 2020 15:05:40 -0700 Subject: [PATCH 10/80] make a factory and fix bugs --- python/cudf/cudf/core/column/column.py | 16 ++-- python/cudf/cudf/core/column/numerical.py | 11 ++- python/cudf/cudf/core/column/string.py | 64 ++++++------- python/cudf/cudf/core/dtypes.py | 106 +++++++++++----------- python/cudf/cudf/core/series.py | 4 +- 5 files changed, 103 insertions(+), 98 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index baa8a329847..2365e904881 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -37,7 +37,7 @@ np_to_pa_dtype, ) from cudf.utils.utils import buffers_from_pyarrow, mask_dtype - +from cudf.core.dtypes import make_dtype_from_obj class ColumnBase(Column, Serializable): def __init__( @@ -878,6 +878,7 @@ def distinct_count(self, method="sort", dropna=True): return cpp_distinct_count(self, ignore_nulls=dropna) def astype(self, dtype, **kwargs): + dtype = make_dtype_from_obj(dtype) if dtype.is_categorical: return self.as_categorical_column(dtype, **kwargs) elif dtype.is_datetime: @@ -1263,6 +1264,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): * pyarrow array * pandas.Categorical objects """ + if isinstance(arbitrary, ColumnBase): if dtype is not None: return arbitrary.astype(dtype) @@ -1552,7 +1554,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): arbitrary = np.ascontiguousarray(arbitrary) if dtype is not None: - arbitrary = arbitrary.astype(dtype) + arbitrary = arbitrary.astype(dtype.to_numpy) if arb_dtype.kind == "M": @@ -1575,8 +1577,10 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): data=buffer, mask=mask, dtype=arbitrary.dtype ) elif arb_dtype.kind in ("O", "U"): + + pa_data = pa.Array.from_pandas(arbitrary) data = as_column( - pa.Array.from_pandas(arbitrary), dtype=arbitrary.dtype + pa_data, dtype=make_dtype_from_obj(pa_data.type) ) # There is no cast operation available for pa.Array from int to # str, Hence instead of handling in pa.Array block, we @@ -1645,11 +1649,11 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): if dtype.is_categorical: raise TypeError - pa_data = pa.array(arbitrary, type=dtype.pa_type, from_pandas=True if nan_as_null is None else nan_as_null) - data = as_column(pa_data, dtype=cudf.Dtype(pa_data.type), nan_as_null=nan_as_null) + pa_data = pa.array(arbitrary, type=dtype.pa_type if dtype is not None else None, from_pandas=True if nan_as_null is None else nan_as_null) + data = as_column(pa_data, dtype=make_dtype_from_obj(pa_data.type), nan_as_null=nan_as_null) except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError): - if dtype.is_categorical_dtype: + if dtype.is_categorical: sr = pd.Series(arbitrary, dtype="category") data = as_column(sr, nan_as_null=nan_as_null, dtype=dtype) elif dtype.to_numpy == np.str_: diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 8d45af88626..958b22136dd 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -18,7 +18,7 @@ np_to_pa_dtype, numeric_normalize_types, ) - +from cudf.core.dtypes import make_dtype_from_obj class NumericalColumn(column.ColumnBase): def __init__( @@ -32,7 +32,7 @@ def __init__( The dtype associated with the data Buffer mask : Buffer, optional """ - dtype = cudf.Dtype(dtype) + dtype = make_dtype_from_obj(dtype) if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") if size is None: @@ -139,7 +139,7 @@ def as_string_column(self, dtype, **kwargs): if len(self) > 0: return string._numeric_to_str_typecast_functions[ - dtype.to_numpy + self.dtype ](self, **kwargs) else: return as_column([], dtype="object") @@ -156,9 +156,12 @@ def as_datetime_column(self, dtype, **kwargs): ) def as_numerical_column(self, dtype, **kwargs): - dtype = np.dtype(dtype) + # dtype = np.dtype(dtype) + # expect a cudf dtype always here if dtype == self.dtype: return self + import pdb + pdb.set_trace() return libcudf.unary.cast(self, dtype) def to_pandas(self, index=None, nullable_pd_dtype=False): diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index ceaf6b4ff3e..2b8e030d49e 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -134,46 +134,46 @@ is_scalar, is_string_dtype, ) -from cudf.core.dtypes import Dtype +from cudf.core.dtypes import make_dtype_from_obj _str_to_numeric_typecast_functions = { - Dtype("int8"): str_cast.stoi8, - Dtype("int16"): str_cast.stoi16, - Dtype("int32"): str_cast.stoi, - Dtype("int64"): str_cast.stol, - Dtype("uint8"): str_cast.stoui8, - Dtype("uint16"): str_cast.stoui16, - Dtype("uint32"): str_cast.stoui, - Dtype("uint64"): str_cast.stoul, - Dtype("float32"): str_cast.stof, - Dtype("float64"): str_cast.stod, - Dtype("bool"): str_cast.to_booleans, + make_dtype_from_obj("int8"): str_cast.stoi8, + make_dtype_from_obj("int16"): str_cast.stoi16, + make_dtype_from_obj("int32"): str_cast.stoi, + make_dtype_from_obj("int64"): str_cast.stol, + make_dtype_from_obj("uint8"): str_cast.stoui8, + make_dtype_from_obj("uint16"): str_cast.stoui16, + make_dtype_from_obj("uint32"): str_cast.stoui, + make_dtype_from_obj("uint64"): str_cast.stoul, + make_dtype_from_obj("float32"): str_cast.stof, + make_dtype_from_obj("float64"): str_cast.stod, + make_dtype_from_obj("bool"): str_cast.to_booleans, # TODO: support Date32 UNIX days # np.dtype("datetime64[D]"): str_cast.timestamp2int, - Dtype("datetime64[s]"): str_cast.timestamp2int, - Dtype("datetime64[ms]"): str_cast.timestamp2int, - Dtype("datetime64[us]"): str_cast.timestamp2int, - Dtype("datetime64[ns]"): str_cast.timestamp2int, + make_dtype_from_obj("datetime64[s]"): str_cast.timestamp2int, + make_dtype_from_obj("datetime64[ms]"): str_cast.timestamp2int, + make_dtype_from_obj("datetime64[us]"): str_cast.timestamp2int, + make_dtype_from_obj("datetime64[ns]"): str_cast.timestamp2int, } _numeric_to_str_typecast_functions = { - np.dtype("int8"): str_cast.i8tos, - np.dtype("int16"): str_cast.i16tos, - np.dtype("int32"): str_cast.itos, - np.dtype("int64"): str_cast.ltos, - np.dtype("uint8"): str_cast.ui8tos, - np.dtype("uint16"): str_cast.ui16tos, - np.dtype("uint32"): str_cast.uitos, - np.dtype("uint64"): str_cast.ultos, - np.dtype("float32"): str_cast.ftos, - np.dtype("float64"): str_cast.dtos, - np.dtype("bool"): str_cast.from_booleans, + make_dtype_from_obj(np.dtype("int8")): str_cast.i8tos, + make_dtype_from_obj(np.dtype("int16")): str_cast.i16tos, + make_dtype_from_obj(np.dtype("int32")): str_cast.itos, + make_dtype_from_obj(np.dtype("int64")): str_cast.ltos, + make_dtype_from_obj(np.dtype("uint8")): str_cast.ui8tos, + make_dtype_from_obj(np.dtype("uint16")): str_cast.ui16tos, + make_dtype_from_obj(np.dtype("uint32")): str_cast.uitos, + make_dtype_from_obj(np.dtype("uint64")): str_cast.ultos, + make_dtype_from_obj(np.dtype("float32")): str_cast.ftos, + make_dtype_from_obj(np.dtype("float64")): str_cast.dtos, + make_dtype_from_obj(np.dtype("bool")): str_cast.from_booleans, # TODO: support Date32 UNIX days # np.dtype("datetime64[D]"): str_cast.int2timestamp, - np.dtype("datetime64[s]"): str_cast.int2timestamp, - np.dtype("datetime64[ms]"): str_cast.int2timestamp, - np.dtype("datetime64[us]"): str_cast.int2timestamp, - np.dtype("datetime64[ns]"): str_cast.int2timestamp, + make_dtype_from_obj(np.dtype("datetime64[s]")): str_cast.int2timestamp, + make_dtype_from_obj(np.dtype("datetime64[ms]")): str_cast.int2timestamp, + make_dtype_from_obj(np.dtype("datetime64[us]")): str_cast.int2timestamp, + make_dtype_from_obj(np.dtype("datetime64[ns]")): str_cast.int2timestamp, } @@ -4138,7 +4138,7 @@ def __init__( Two non-null columns containing the string data and offsets respectively """ - dtype = np.dtype("object") + dtype = cudf.StringDtype() if size is None: for child in children: diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index fe4710a2de9..46d07bfcea8 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -56,14 +56,10 @@ class Dtype(ExtensionDtype, _Dtype): is_datetime = False is_list = False is_float = False - pa_type = None - def __init__(self, arg): - - cudf_dtype = make_dtype_from_obj(arg) - cudf_dtype.__init__(self) - def __eq__(self, other): + if isinstance(other, self.__class__): + return True if isinstance(other, self.to_pandas.__class__) or other is self.to_pandas.__class__: return True if self.to_numpy == other: @@ -195,40 +191,40 @@ def __init__(self): def make_dtype_from_string(obj): if obj in {'str', 'string', 'object'}: - return StringDtype + return StringDtype() elif 'datetime' in obj: if obj == 'datetime64[ns]': - return Datetime64NSDtype + return Datetime64NSDtype() elif obj == 'datetime64[us]': - return Datetime64USDtype + return Datetime64USDtype() elif obj == 'datetime64[ms]': - return Datetime64MSDtype + return Datetime64MSDtype() elif obj == 'datetime64[s]': - return Datetime64SDtype + return Datetime64SDtype() elif 'int' in obj or 'Int' in obj: if obj in {'int', 'Int', 'int64', 'Int64'}: - return Int64Dtype + return Int64Dtype() elif obj in {'int32', 'Int32'}: - return Int32Dtype + return Int32Dtype() elif obj in {'int16', 'Int16'}: - return Int16Dtype + return Int16Dtype() elif obj in {'int8', 'Int8'}: - return Int8Dtype + return Int8Dtype() elif obj in {'uint64', 'UInt64'}: - return UInt64Dtype + return UInt64Dtype() elif obj in {'uint32', 'UInt32'}: - return UInt32Dtype + return UInt32Dtype() elif obj in {'uint16', 'UInt16'}: - return UInt16Dtype + return UInt16Dtype() elif obj in {'uint8', 'Uint8'}: - return UInt8Dtype + return UInt8Dtype() elif 'float' in obj: if obj in {'float64', 'Float64'}: - return Float64Dtype + return Float64Dtype() elif obj in {'float32', 'Float32'}: - return Float32Dtype + return Float32Dtype() elif 'bool' in obj: - return BooleanDtype + return BooleanDtype() @@ -246,6 +242,8 @@ def make_dtype_from_obj(obj): return pa_to_cudf_dtypes[obj] elif isinstance(obj, str): return make_dtype_from_string(obj) + else: + raise TypeError class CategoricalDtype(ExtensionDtype): @@ -410,40 +408,40 @@ def __repr__(self): pa_to_cudf_dtypes = { - pa.uint8(): UInt8Dtype, - pa.uint16(): UInt16Dtype, - pa.uint32(): UInt32Dtype, - pa.uint64(): UInt64Dtype, - pa.int8(): Int8Dtype, - pa.int16(): Int16Dtype, - pa.int32(): Int32Dtype, - pa.int64(): Int64Dtype, - pa.bool_(): BooleanDtype, - pa.string(): StringDtype, - pa.float32(): Float32Dtype, - pa.float64(): Float64Dtype, - pa.timestamp('ns'): Datetime64NSDtype, - pa.timestamp('us'): Datetime64USDtype, - pa.timestamp('ms'): Datetime64MSDtype, - pa.timestamp('s'): Datetime64SDtype, + pa.uint8(): UInt8Dtype(), + pa.uint16(): UInt16Dtype(), + pa.uint32(): UInt32Dtype(), + pa.uint64(): UInt64Dtype(), + pa.int8(): Int8Dtype(), + pa.int16(): Int16Dtype(), + pa.int32(): Int32Dtype(), + pa.int64(): Int64Dtype(), + pa.bool_(): BooleanDtype(), + pa.string(): StringDtype(), + pa.float32(): Float32Dtype(), + pa.float64(): Float64Dtype(), + pa.timestamp('ns'): Datetime64NSDtype(), + pa.timestamp('us'): Datetime64USDtype(), + pa.timestamp('ms'): Datetime64MSDtype(), + pa.timestamp('s'): Datetime64SDtype(), None: Dtype } np_to_cudf_dtypes = { - np.dtype('int8'): Int8Dtype, - np.dtype('int16'): Int16Dtype, - np.dtype('int32'): Int32Dtype, - np.dtype('int64'): Int64Dtype, - np.dtype('uint8'): UInt8Dtype, - np.dtype('uint16'): UInt16Dtype, - np.dtype('uint32'): UInt32Dtype, - np.dtype('uint64'): UInt64Dtype, - np.dtype('bool'): BooleanDtype, - np.dtype('object'): StringDtype, - np.dtype('float32'): Float32Dtype, - np.dtype('float64'): Float64Dtype, - np.dtype('datetime64[ns]'): Datetime64NSDtype, - np.dtype('datetime64[us]'): Datetime64USDtype, - np.dtype('datetime64[ms]'): Datetime64MSDtype, - np.dtype('datetime64[s]'): Datetime64SDtype, + np.dtype('int8'): Int8Dtype(), + np.dtype('int16'): Int16Dtype(), + np.dtype('int32'): Int32Dtype(), + np.dtype('int64'): Int64Dtype(), + np.dtype('uint8'): UInt8Dtype(), + np.dtype('uint16'): UInt16Dtype(), + np.dtype('uint32'): UInt32Dtype(), + np.dtype('uint64'): UInt64Dtype(), + np.dtype('bool'): BooleanDtype(), + np.dtype('object'): StringDtype(), + np.dtype('float32'): Float32Dtype(), + np.dtype('float64'): Float64Dtype(), + np.dtype('datetime64[ns]'): Datetime64NSDtype(), + np.dtype('datetime64[us]'): Datetime64USDtype(), + np.dtype('datetime64[ms]'): Datetime64MSDtype(), + np.dtype('datetime64[s]'): Datetime64SDtype(), } diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index f4a026996ff..4eda63c345c 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -145,7 +145,8 @@ def __init__( ``null`` values. If ``False``, leaves ``np.nan`` values as is. """ - dtype = Dtype(dtype) + from cudf.core.dtypes import make_dtype_from_obj + dtype = make_dtype_from_obj(dtype) if dtype is not None else None if isinstance(data, pd.Series): if name is None: name = data.name @@ -187,7 +188,6 @@ def __init__( ) else: data = {} - if not isinstance(data, column.ColumnBase): data = column.as_column(data, nan_as_null=nan_as_null, dtype=dtype) From 7f6cb360cd7de8db6504afac443b63ca09772028 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 31 Jul 2020 08:56:15 -0700 Subject: [PATCH 11/80] more progress on columns, dtype object --- python/cudf/cudf/_lib/binaryop.pyx | 11 ++----- python/cudf/cudf/core/column/column.py | 13 +++++---- python/cudf/cudf/core/column/datetime.py | 7 +++-- python/cudf/cudf/core/column/numerical.py | 33 +++++++++++---------- python/cudf/cudf/core/dtypes.py | 35 +++++++++++++++++++++-- python/cudf/cudf/tests/test_column.py | 4 +-- 6 files changed, 64 insertions(+), 39 deletions(-) diff --git a/python/cudf/cudf/_lib/binaryop.pyx b/python/cudf/cudf/_lib/binaryop.pyx index 7823ad20c5a..4323e1f4b79 100644 --- a/python/cudf/cudf/_lib/binaryop.pyx +++ b/python/cudf/cudf/_lib/binaryop.pyx @@ -27,6 +27,7 @@ from cudf.utils.dtypes import is_string_dtype from cudf._lib.cpp.binaryop cimport binary_operator cimport cudf._lib.cpp.binaryop as cpp_binaryop +from cudf._lib.types cimport _Dtype class BinaryOperation(IntEnum): @@ -170,19 +171,13 @@ def binaryop(lhs, rhs, op, dtype): """ Dispatches a binary op call to the appropriate libcudf function: """ + cdef _Dtype py_dtype = dtype op = BinaryOperation[op.upper()] cdef binary_operator c_op = ( op ) - cdef type_id tid = ( - ( - ( - np_to_cudf_types[np.dtype(dtype)] - ) - ) - ) - cdef data_type c_dtype = data_type(tid) + cdef data_type c_dtype = py_dtype.get_libcudf_type() if isinstance(lhs, Scalar) or np.isscalar(lhs) or lhs is None: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 2365e904881..ae0c939f74c 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1129,9 +1129,9 @@ def build_column( offset : int, optional children : tuple, optional """ - dtype = pd.api.types.pandas_dtype(dtype) + dtype = make_dtype_from_obj(dtype) - if is_categorical_dtype(dtype): + if dtype.is_categorical: if not len(children) == 1: raise ValueError( "Must specify exactly one child column for CategoricalColumn" @@ -1146,7 +1146,7 @@ def build_column( null_count=null_count, children=children, ) - elif dtype.type is np.datetime64: + elif dtype.is_datetime: return cudf.core.column.DatetimeColumn( data=data, dtype=dtype, @@ -1155,7 +1155,7 @@ def build_column( offset=offset, null_count=null_count, ) - elif dtype.type in (np.object_, np.str_): + elif dtype.is_string: return cudf.core.column.StringColumn( mask=mask, size=size, @@ -1265,6 +1265,8 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): * pandas.Categorical objects """ + dtype = make_dtype_from_obj(dtype) if dtype is not None else None + if isinstance(arbitrary, ColumnBase): if dtype is not None: return arbitrary.astype(dtype) @@ -1449,7 +1451,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): ) data = cudf.core.column.NumericalColumn( data=padata, - dtype=dtype, + dtype=make_dtype_from_obj(arbitrary.type), mask=pamask, size=pa_size, offset=pa_offset, @@ -1648,7 +1650,6 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): dtype = pd.api.types.pandas_dtype(dtype) if dtype.is_categorical: raise TypeError - pa_data = pa.array(arbitrary, type=dtype.pa_type if dtype is not None else None, from_pandas=True if nan_as_null is None else nan_as_null) data = as_column(pa_data, dtype=make_dtype_from_obj(pa_data.type), nan_as_null=nan_as_null) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 939123f2474..271351c3890 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -11,6 +11,7 @@ from cudf.core.column import column from cudf.utils import utils from cudf.utils.dtypes import is_scalar, np_to_pa_dtype +from cudf.core.dtypes import make_dtype_from_obj # nanoseconds per time_unit _numpy_to_pandas_conversion = { @@ -45,7 +46,7 @@ def __init__( mask : Buffer; optional The validity mask """ - dtype = np.dtype(dtype) + dtype = make_dtype_from_obj(dtype) if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") if size is None: @@ -60,7 +61,7 @@ def __init__( null_count=null_count, ) assert self.dtype.type is np.datetime64 - self._time_unit, _ = np.datetime_data(self.dtype) + self._time_unit, _ = np.datetime_data(self.dtype.to_numpy) def __contains__(self, item): # Handles improper item types @@ -164,7 +165,7 @@ def as_string_column(self, dtype, **kwargs): def to_pandas(self, index=None, nullable_pd_dtype=False): return pd.Series( - self.to_array(fillna="pandas").astype(self.dtype), index=index + self.to_array(fillna="pandas").astype(self.dtype.to_pandas), index=index ) def to_arrow(self): diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 958b22136dd..459461d94e5 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -70,26 +70,27 @@ def unary_operator(self, unaryop): def binary_operator(self, binop, rhs, reflect=False): int_dtypes = [ - np.dtype("int8"), - np.dtype("int16"), - np.dtype("int32"), - np.dtype("int64"), - np.dtype("uint8"), - np.dtype("uint16"), - np.dtype("uint32"), - np.dtype("uint64"), + cudf.Int8Dtype(), + cudf.Int16Dtype(), + cudf.Int32Dtype(), + cudf.Int64Dtype(), + cudf.UInt8Dtype(), + cudf.UInt16Dtype(), + cudf.UInt32Dtype(), + cudf.UInt64Dtype(), ] tmp = rhs if reflect: tmp = self if isinstance(rhs, (NumericalColumn, Scalar)) or np.isscalar(rhs): - out_dtype = np.result_type(self.dtype, rhs.dtype) + out_dtype = np.result_type(make_dtype_from_obj(self.dtype).to_numpy, make_dtype_from_obj(rhs.dtype).to_numpy) + out_dtype = make_dtype_from_obj(out_dtype) if binop in ["mod", "floordiv"]: if (tmp.dtype in int_dtypes) and ( (np.isscalar(tmp) and (0 == tmp)) or ((isinstance(tmp, NumericalColumn)) and (0.0 in tmp)) ): - out_dtype = np.dtype("float_") + out_dtype = cudf.Float64Dtype() elif rhs is None: out_dtype = self.dtype else: @@ -160,8 +161,6 @@ def as_numerical_column(self, dtype, **kwargs): # expect a cudf dtype always here if dtype == self.dtype: return self - import pdb - pdb.set_trace() return libcudf.unary.cast(self, dtype) def to_pandas(self, index=None, nullable_pd_dtype=False): @@ -198,14 +197,13 @@ def to_arrow(self): if self.nullable: mask = pa.py_buffer(self.mask_array_view.copy_to_host()) data = pa.py_buffer(self.data_array_view.copy_to_host()) - pa_dtype = np_to_pa_dtype(self.dtype) out = pa.Array.from_buffers( - type=pa_dtype, + type=self.dtype.pa_type, length=len(self), buffers=[mask, data], null_count=self.null_count, ) - if self.dtype == np.bool: + if self.dtype.is_boolean: return out.cast(pa.bool_()) else: return out @@ -312,7 +310,8 @@ def fillna(self, fill_value): """ if np.isscalar(fill_value): # castsafely to the same dtype as self - fill_value_casted = self.dtype.type(fill_value) + # TODO - produce a libcudf scalar directly + fill_value_casted = self.dtype.to_numpy.type(fill_value) if not np.isnan(fill_value) and (fill_value_casted != fill_value): raise TypeError( "Cannot safely cast non-equivalent {} to {}".format( @@ -455,7 +454,7 @@ def _numeric_column_binop(lhs, rhs, op, out_dtype, reflect=False): if is_op_comparison: out_dtype = "bool" - + out = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) if is_op_comparison: diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 46d07bfcea8..2d8c22307f5 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -60,8 +60,11 @@ class Dtype(ExtensionDtype, _Dtype): def __eq__(self, other): if isinstance(other, self.__class__): return True + if isinstance(other, Dtype) and not isinstance(other, self.__class__): + return False if isinstance(other, self.to_pandas.__class__) or other is self.to_pandas.__class__: return True + if self.to_numpy == other: return True raise NotImplementedError @@ -80,11 +83,16 @@ def itemsize(self): @property def type(self): - return self.to_pandas.type + if self.is_float or self.is_datetime: + return self.to_numpy.kind + else: self.to_pandas.type @property def kind(self): - return self.to_pandas.kind + if self.is_float: + return 'f' + else: + return self.to_pandas.kind @property def name(self): @@ -242,8 +250,16 @@ def make_dtype_from_obj(obj): return pa_to_cudf_dtypes[obj] elif isinstance(obj, str): return make_dtype_from_string(obj) + elif obj in pd_to_cudf_dtypes.keys(): + return pd_to_cudf_dtypes[obj] else: - raise TypeError + try: + if issubclass(obj, np.generic): + return np_to_cudf_dtypes[np.dtype(obj)] + except: + import pdb + pdb.set_trace() + raise TypeError('cant transform this object to a cudf dtype. ') class CategoricalDtype(ExtensionDtype): @@ -445,3 +461,16 @@ def __repr__(self): np.dtype('datetime64[ms]'): Datetime64MSDtype(), np.dtype('datetime64[s]'): Datetime64SDtype(), } + +pd_to_cudf_dtypes = { + pd.Int8Dtype(): Int8Dtype(), + pd.Int16Dtype(): Int16Dtype(), + pd.Int32Dtype(): Int32Dtype(), + pd.Int64Dtype(): Int64Dtype(), + pd.UInt8Dtype(): UInt8Dtype(), + pd.UInt16Dtype(): UInt16Dtype(), + pd.UInt32Dtype(): UInt32Dtype(), + pd.UInt64Dtype(): UInt64Dtype(), + pd.BooleanDtype(): BooleanDtype(), + pd.StringDtype(): StringDtype() +} diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 60165b51fc4..87b643de9a7 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -44,10 +44,10 @@ def test_column_offset_and_size(pandas_input, offset, size): children=col.base_children, ) - if cudf.utils.dtypes.is_categorical_dtype(col.dtype): + if col.dtype.is_categorical: assert col.size == col.codes.size assert col.size == (col.codes.data.size / col.codes.dtype.itemsize) - elif pd.api.types.is_string_dtype(col.dtype): + elif col.dtype.is_string: assert col.size == (col.children[0].size - 1) assert col.size == ( (col.children[0].data.size / col.children[0].dtype.itemsize) - 1 From a81c3681f3ae343b8aa01817ad19ce031eb7ebfd Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 31 Jul 2020 11:40:37 -0700 Subject: [PATCH 12/80] forgot string O --- python/cudf/cudf/core/dtypes.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 2d8c22307f5..ad0bf4c2a37 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -198,7 +198,9 @@ def __init__(self): self._name = "String" def make_dtype_from_string(obj): - if obj in {'str', 'string', 'object'}: + import pdb + pdb.set_trace() + if obj in {'str', 'string', 'object', 'O'}: return StringDtype() elif 'datetime' in obj: if obj == 'datetime64[ns]': From 572c39f950a32f4f188d9eff205ecae10df096b1 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 31 Jul 2020 13:27:41 -0700 Subject: [PATCH 13/80] more progress --- python/cudf/cudf/core/column/column.py | 14 +- python/cudf/cudf/core/dtypes.py | 193 +++++++++++++++---------- python/cudf/cudf/tests/test_column.py | 2 +- 3 files changed, 121 insertions(+), 88 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index ae0c939f74c..84cb50afecf 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -187,7 +187,7 @@ def _concat(cls, objs, dtype=None): if len(objs) == 0: dtype = pd.api.types.pandas_dtype(dtype) - if is_categorical_dtype(dtype): + if dtype.is_categorical: dtype = CategoricalDtype() return column_empty(0, dtype=dtype, masked=True) @@ -200,18 +200,18 @@ def _concat(cls, objs, dtype=None): [ o for o in not_null_cols - if not is_numerical_dtype(o.dtype) - or np.issubdtype(o.dtype, np.datetime64) + if not o.dtype.is_numeric + or o.dtype.is_datetime ] ) == 0 ): - col_dtypes = [o.dtype for o in not_null_cols] + np_col_dtypes = [o.dtype.to_numpy for o in not_null_cols] # Use NumPy to find a common dtype - common_dtype = np.find_common_type(col_dtypes, []) + np_common_dtype = np.find_common_type(np_col_dtypes, []) # Cast all columns to the common dtype for i in range(len(objs)): - objs[i] = objs[i].astype(common_dtype) + objs[i] = objs[i].astype(make_dtype_from_obj(np_common_dtype)) # Find the first non-null column: head = objs[0] @@ -232,7 +232,7 @@ def _concat(cls, objs, dtype=None): raise ValueError("All columns must be the same type") cats = None - is_categorical = all(is_categorical_dtype(o.dtype) for o in objs) + is_categorical = all(o.dtype.is_categorical for o in objs) # Combine CategoricalColumn categories if is_categorical: diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index ad0bf4c2a37..cf2614c8017 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -22,32 +22,33 @@ pa.string(): pd.StringDtype(), pa.float32(): np.float32(), pa.float64(): np.float64(), - pa.timestamp('ns'): np.dtype('datetime64[ns]'), - pa.timestamp('us'): np.dtype('datetime64[us]'), - pa.timestamp('ms'): np.dtype('datetime64[ms]'), - pa.timestamp('s'): np.dtype('datetime64[s]'), + pa.timestamp("ns"): np.dtype("datetime64[ns]"), + pa.timestamp("us"): np.dtype("datetime64[us]"), + pa.timestamp("ms"): np.dtype("datetime64[ms]"), + pa.timestamp("s"): np.dtype("datetime64[s]"), } pa_to_np_dtypes = { - pa.uint8(): np.dtype('uint8'), - pa.uint16(): np.dtype('uint16'), - pa.uint32(): np.dtype('uint32'), - pa.uint64(): np.dtype('uint64'), - pa.int8(): np.dtype('int8'), - pa.int16(): np.dtype('int16'), - pa.int32(): np.dtype('int32'), - pa.int64(): np.dtype('int64'), - pa.bool_(): np.dtype('bool'), - pa.string(): np.dtype('object'), - pa.float32(): np.dtype('float32'), - pa.float64(): np.dtype('float64'), - pa.timestamp('ns'): np.dtype('datetime64[ns]'), - pa.timestamp('us'): np.dtype('datetime64[us]'), - pa.timestamp('ms'): np.dtype('datetime64[ms]'), - pa.timestamp('s'): np.dtype('datetime64[s]'), - None: None + pa.uint8(): np.dtype("uint8"), + pa.uint16(): np.dtype("uint16"), + pa.uint32(): np.dtype("uint32"), + pa.uint64(): np.dtype("uint64"), + pa.int8(): np.dtype("int8"), + pa.int16(): np.dtype("int16"), + pa.int32(): np.dtype("int32"), + pa.int64(): np.dtype("int64"), + pa.bool_(): np.dtype("bool"), + pa.string(): np.dtype("object"), + pa.float32(): np.dtype("float32"), + pa.float64(): np.dtype("float64"), + pa.timestamp("ns"): np.dtype("datetime64[ns]"), + pa.timestamp("us"): np.dtype("datetime64[us]"), + pa.timestamp("ms"): np.dtype("datetime64[ms]"), + pa.timestamp("s"): np.dtype("datetime64[s]"), + None: None, } + class Dtype(ExtensionDtype, _Dtype): is_integer = False is_string = False @@ -56,18 +57,25 @@ class Dtype(ExtensionDtype, _Dtype): is_datetime = False is_list = False is_float = False + is_numeric = False pa_type = None + def __eq__(self, other): if isinstance(other, self.__class__): return True if isinstance(other, Dtype) and not isinstance(other, self.__class__): return False - if isinstance(other, self.to_pandas.__class__) or other is self.to_pandas.__class__: + if ( + isinstance(other, self.to_pandas.__class__) + or other is self.to_pandas.__class__ + ): return True - + if self.to_numpy == other: return True - raise NotImplementedError + if isinstance(other, str) and str(self.to_numpy) == other: + return True + return False @property def to_numpy(self): @@ -85,12 +93,13 @@ def itemsize(self): def type(self): if self.is_float or self.is_datetime: return self.to_numpy.kind - else: self.to_pandas.type + else: + return self.to_pandas.type @property def kind(self): if self.is_float: - return 'f' + return "f" else: return self.to_pandas.kind @@ -104,147 +113,169 @@ def __repr__(self): def __hash__(self): return hash(self.__repr__()) - class IntDtype(Dtype): is_integer = True + is_numeric = True + class UInt8Dtype(IntDtype): def __init__(self): self.pa_type = pa.uint8() self._name = "UInt8" - + + class UInt16Dtype(IntDtype): def __init__(self): self.pa_type = pa.uint16() self._name = "UInt16" + class UInt32Dtype(IntDtype): def __init__(self): self.pa_type = pa.uint32() self._name = "UInt32" + class UInt64Dtype(IntDtype): def __init__(self): self.pa_type = pa.uint64() self._name = "UInt64" + class Int8Dtype(IntDtype): def __init__(self): self.pa_type = pa.int8() self._name = "Int8" + class Int16Dtype(IntDtype): def __init__(self): self.pa_type = pa.int16() self._name = "Int16" + class Int32Dtype(IntDtype): def __init__(self): self.pa_type = pa.int32() self._name = "Int32" + class Int64Dtype(IntDtype): def __init__(self): self.pa_type = pa.int64() self._name = "Int64" + class FloatDtype(Dtype): is_float = True + is_numeric = True + class Float32Dtype(FloatDtype): def __init__(self): self.pa_type = pa.float32() self._name = "Float32" + class Float64Dtype(FloatDtype): def __init__(self): self.pa_type = pa.float64() self._name = "Float64" + class BooleanDtype(Dtype): is_boolean = True + def __init__(self): self.pa_type = pa.bool_() self._name = "Boolean" + class DatetimeDtype(Dtype): is_datetime = True + class Datetime64NSDtype(DatetimeDtype): def __init__(self): - self.pa_type = pa.timestamp('ns') + self.pa_type = pa.timestamp("ns") self._name = "Datetime64NS" + class Datetime64USDtype(DatetimeDtype): def __init__(self): - self.pa_type = pa.timestamp('us') + self.pa_type = pa.timestamp("us") self._name = "Datetime64US" + class Datetime64MSDtype(DatetimeDtype): def __init__(self): - self.pa_type = pa.timestamp('ms') + self.pa_type = pa.timestamp("ms") self._name = "Datetime64MS" + class Datetime64SDtype(DatetimeDtype): def __init__(self): - self.pa_type = pa.timestamp('s') + self.pa_type = pa.timestamp("s") self._name = "Datetime64S" + class StringDtype(Dtype): is_string = True + def __init__(self): self.pa_type = pa.string() self._name = "String" + def make_dtype_from_string(obj): - import pdb - pdb.set_trace() - if obj in {'str', 'string', 'object', 'O'}: + if obj in {"str", "string", "object", "O"}: return StringDtype() - elif 'datetime' in obj: - if obj == 'datetime64[ns]': + elif "datetime" in obj: + if obj == "datetime64[ns]": return Datetime64NSDtype() - elif obj == 'datetime64[us]': + elif obj == "datetime64[us]": return Datetime64USDtype() - elif obj == 'datetime64[ms]': + elif obj == "datetime64[ms]": return Datetime64MSDtype() - elif obj == 'datetime64[s]': + elif obj == "datetime64[s]": return Datetime64SDtype() - elif 'int' in obj or 'Int' in obj: - if obj in {'int', 'Int', 'int64', 'Int64'}: + elif "int" in obj or "Int" in obj: + if obj in {"int", "Int", "int64", "Int64"}: return Int64Dtype() - elif obj in {'int32', 'Int32'}: + elif obj in {"int32", "Int32"}: return Int32Dtype() - elif obj in {'int16', 'Int16'}: + elif obj in {"int16", "Int16"}: return Int16Dtype() - elif obj in {'int8', 'Int8'}: + elif obj in {"int8", "Int8"}: return Int8Dtype() - elif obj in {'uint64', 'UInt64'}: + elif obj in {"uint64", "UInt64"}: return UInt64Dtype() - elif obj in {'uint32', 'UInt32'}: + elif obj in {"uint32", "UInt32"}: return UInt32Dtype() - elif obj in {'uint16', 'UInt16'}: + elif obj in {"uint16", "UInt16"}: return UInt16Dtype() - elif obj in {'uint8', 'Uint8'}: + elif obj in {"uint8", "Uint8"}: return UInt8Dtype() - elif 'float' in obj: - if obj in {'float64', 'Float64'}: + elif "float" in obj: + if obj in {"float64", "Float64"}: return Float64Dtype() - elif obj in {'float32', 'Float32'}: + elif obj in {"float32", "Float32"}: return Float32Dtype() - elif 'bool' in obj: + elif "bool" in obj: return BooleanDtype() - def make_dtype_from_numpy(obj): np_to_pd_types = {v: k for k, v in pd_to_np_dtypes.items()} result = np_to_pd_types.get(obj) return result + def make_dtype_from_obj(obj): - if isinstance(obj, Dtype): + if isinstance(obj, CategoricalDtype): + return obj + elif isinstance(obj, Dtype): return np_to_cudf_dtypes[obj.to_numpy] if isinstance(obj, np.dtype): return np_to_cudf_dtypes[obj] @@ -260,12 +291,14 @@ def make_dtype_from_obj(obj): return np_to_cudf_dtypes[np.dtype(obj)] except: import pdb + pdb.set_trace() - raise TypeError('cant transform this object to a cudf dtype. ') + raise TypeError("cant transform this object to a cudf dtype. ") + -class CategoricalDtype(ExtensionDtype): +class CategoricalDtype(Dtype): - is_categorical_dtype = True + is_categorical = True def __init__(self, categories=None, ordered=None): """ @@ -438,30 +471,30 @@ def __repr__(self): pa.string(): StringDtype(), pa.float32(): Float32Dtype(), pa.float64(): Float64Dtype(), - pa.timestamp('ns'): Datetime64NSDtype(), - pa.timestamp('us'): Datetime64USDtype(), - pa.timestamp('ms'): Datetime64MSDtype(), - pa.timestamp('s'): Datetime64SDtype(), - None: Dtype + pa.timestamp("ns"): Datetime64NSDtype(), + pa.timestamp("us"): Datetime64USDtype(), + pa.timestamp("ms"): Datetime64MSDtype(), + pa.timestamp("s"): Datetime64SDtype(), + None: Dtype, } np_to_cudf_dtypes = { - np.dtype('int8'): Int8Dtype(), - np.dtype('int16'): Int16Dtype(), - np.dtype('int32'): Int32Dtype(), - np.dtype('int64'): Int64Dtype(), - np.dtype('uint8'): UInt8Dtype(), - np.dtype('uint16'): UInt16Dtype(), - np.dtype('uint32'): UInt32Dtype(), - np.dtype('uint64'): UInt64Dtype(), - np.dtype('bool'): BooleanDtype(), - np.dtype('object'): StringDtype(), - np.dtype('float32'): Float32Dtype(), - np.dtype('float64'): Float64Dtype(), - np.dtype('datetime64[ns]'): Datetime64NSDtype(), - np.dtype('datetime64[us]'): Datetime64USDtype(), - np.dtype('datetime64[ms]'): Datetime64MSDtype(), - np.dtype('datetime64[s]'): Datetime64SDtype(), + np.dtype("int8"): Int8Dtype(), + np.dtype("int16"): Int16Dtype(), + np.dtype("int32"): Int32Dtype(), + np.dtype("int64"): Int64Dtype(), + np.dtype("uint8"): UInt8Dtype(), + np.dtype("uint16"): UInt16Dtype(), + np.dtype("uint32"): UInt32Dtype(), + np.dtype("uint64"): UInt64Dtype(), + np.dtype("bool"): BooleanDtype(), + np.dtype("object"): StringDtype(), + np.dtype("float32"): Float32Dtype(), + np.dtype("float64"): Float64Dtype(), + np.dtype("datetime64[ns]"): Datetime64NSDtype(), + np.dtype("datetime64[us]"): Datetime64USDtype(), + np.dtype("datetime64[ms]"): Datetime64MSDtype(), + np.dtype("datetime64[s]"): Datetime64SDtype(), } pd_to_cudf_dtypes = { @@ -474,5 +507,5 @@ def __repr__(self): pd.UInt32Dtype(): UInt32Dtype(), pd.UInt64Dtype(): UInt64Dtype(), pd.BooleanDtype(): BooleanDtype(), - pd.StringDtype(): StringDtype() + pd.StringDtype(): StringDtype(), } diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 87b643de9a7..7cb1509cb7f 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -66,7 +66,7 @@ def test_column_offset_and_size(pandas_input, offset, size): slicer = slice(offset, size) expect = pandas_input.iloc[slicer].reset_index(drop=True) - + print(got) assert_eq(expect, got) From 4f6f316b670cc65c62b359234bd819dafbf6d42f Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 3 Aug 2020 12:45:34 -0700 Subject: [PATCH 14/80] column tests pass --- python/cudf/cudf/__init__.py | 9 ++- python/cudf/cudf/_lib/column.pyx | 2 +- python/cudf/cudf/_lib/types.pxd | 2 +- python/cudf/cudf/_lib/types.pyx | 2 +- python/cudf/cudf/core/column/column.py | 60 +++++++------- python/cudf/cudf/core/column/datetime.py | 8 +- python/cudf/cudf/core/column/numerical.py | 8 +- python/cudf/cudf/core/column/string.py | 63 +++++++-------- python/cudf/cudf/core/dtypes.py | 97 +++++++++++------------ python/cudf/cudf/core/series.py | 5 +- python/cudf/cudf/tests/test_column.py | 4 +- python/cudf/cudf/utils/utils.py | 2 +- 12 files changed, 130 insertions(+), 132 deletions(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 38b31e5e7b7..8d16edafa64 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -32,7 +32,12 @@ merge, ) from cudf.core.dtypes import ( - Dtype, + dtype, + Generic, + Datetime, + Floating, + Number, + Flexible, CategoricalDtype, Int8Dtype, Int16Dtype, @@ -49,7 +54,7 @@ Datetime64NSDtype, Datetime64USDtype, Datetime64MSDtype, - Datetime64SDtype + Datetime64SDtype, ) from cudf.core.groupby import Grouper diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 7cf3549ed1c..5b48d92f1b6 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -349,7 +349,7 @@ cdef class Column: return self._view(c_null_count) cdef column_view _view(self, libcudf_types.size_type null_count) except *: - if self.dtype.is_categorical: + if is_categorical_dtype(self.dtype): col = self.base_children[0] else: col = self diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd index f6c0c39174a..54b5d8cfae4 100644 --- a/python/cudf/cudf/_lib/types.pxd +++ b/python/cudf/cudf/_lib/types.pxd @@ -12,4 +12,4 @@ ctypedef int32_t underlying_type_t_type_id ctypedef bool underlying_type_t_null_policy cdef class _Dtype: - cdef data_type get_libcudf_type(self) + cdef data_type get_libcudf_type(self) except * diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index fc8d4fada22..53bae8d7389 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -123,7 +123,7 @@ class NullHandling(IntEnum): cdef class _Dtype: - cdef data_type get_libcudf_type(self): + cdef data_type get_libcudf_type(self) except *: np_dtype = self.to_numpy cdef libcudf_types.type_id tid = ( ( diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 84cb50afecf..f35687bb642 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -37,7 +37,6 @@ np_to_pa_dtype, ) from cudf.utils.utils import buffers_from_pyarrow, mask_dtype -from cudf.core.dtypes import make_dtype_from_obj class ColumnBase(Column, Serializable): def __init__( @@ -81,10 +80,10 @@ def data_array_view(self): """ View the data as a device array object """ - if self.dtype.is_string: + if isinstance(self.dtype, cudf.StringDtype): raise ValueError("Cannot get an array view of a StringColumn") - if self.dtype.is_categorical: + if is_categorical_dtype(self.dtype): return self.codes.data_array_view else: dtype = self.dtype @@ -187,7 +186,7 @@ def _concat(cls, objs, dtype=None): if len(objs) == 0: dtype = pd.api.types.pandas_dtype(dtype) - if dtype.is_categorical: + if is_categorical_dtype(dtype): dtype = CategoricalDtype() return column_empty(0, dtype=dtype, masked=True) @@ -200,8 +199,7 @@ def _concat(cls, objs, dtype=None): [ o for o in not_null_cols - if not o.dtype.is_numeric - or o.dtype.is_datetime + if not isinstance(o.dtype, (cudf.Number, cudf.Datetime)) ] ) == 0 @@ -211,7 +209,7 @@ def _concat(cls, objs, dtype=None): np_common_dtype = np.find_common_type(np_col_dtypes, []) # Cast all columns to the common dtype for i in range(len(objs)): - objs[i] = objs[i].astype(make_dtype_from_obj(np_common_dtype)) + objs[i] = objs[i].astype(cudf.dtype(np_common_dtype)) # Find the first non-null column: head = objs[0] @@ -232,7 +230,7 @@ def _concat(cls, objs, dtype=None): raise ValueError("All columns must be the same type") cats = None - is_categorical = all(o.dtype.is_categorical for o in objs) + is_categorical = all(is_categorical_dtype(o.dtype) for o in objs) # Combine CategoricalColumn categories if is_categorical: @@ -878,12 +876,12 @@ def distinct_count(self, method="sort", dropna=True): return cpp_distinct_count(self, ignore_nulls=dropna) def astype(self, dtype, **kwargs): - dtype = make_dtype_from_obj(dtype) - if dtype.is_categorical: + dtype = cudf.dtype(dtype) + if is_categorical_dtype(dtype): return self.as_categorical_column(dtype, **kwargs) - elif dtype.is_datetime: + elif isinstance(dtype, cudf.Datetime): return self.as_datetime_column(dtype, **kwargs) - elif dtype.is_string: + elif isinstance(dtype, cudf.StringDtype): return self.as_string_column(dtype, **kwargs) else: return self.as_numerical_column(dtype, **kwargs) @@ -1129,9 +1127,9 @@ def build_column( offset : int, optional children : tuple, optional """ - dtype = make_dtype_from_obj(dtype) + dtype = cudf.dtype(dtype) - if dtype.is_categorical: + if is_categorical_dtype(dtype): if not len(children) == 1: raise ValueError( "Must specify exactly one child column for CategoricalColumn" @@ -1146,7 +1144,7 @@ def build_column( null_count=null_count, children=children, ) - elif dtype.is_datetime: + elif isinstance(dtype, cudf.Datetime): return cudf.core.column.DatetimeColumn( data=data, dtype=dtype, @@ -1155,7 +1153,7 @@ def build_column( offset=offset, null_count=null_count, ) - elif dtype.is_string: + elif isinstance(dtype, cudf.StringDtype): return cudf.core.column.StringColumn( mask=mask, size=size, @@ -1265,7 +1263,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): * pandas.Categorical objects """ - dtype = make_dtype_from_obj(dtype) if dtype is not None else None + dtype = cudf.dtype(dtype) if dtype is not None else None if isinstance(arbitrary, ColumnBase): if dtype is not None: @@ -1324,11 +1322,11 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): if isinstance(col, cudf.core.column.CategoricalColumn): return col - elif col.dtype.is_float: + elif isinstance(col.dtype, cudf.Floating): if nan_as_null or (mask is None and nan_as_null is None): mask = libcudf.transform.nans_to_nulls(col.fillna(np.nan)) col = col.set_mask(mask) - elif col.dtype.is_datetime: + elif isinstance(col.dtype, cudf.Datetime): if nan_as_null or (mask is None and nan_as_null is None): col = utils.time_col_replace_nulls(col) return col @@ -1451,7 +1449,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): ) data = cudf.core.column.NumericalColumn( data=padata, - dtype=make_dtype_from_obj(arbitrary.type), + dtype=cudf.dtype(arbitrary.type), mask=pamask, size=pa_size, offset=pa_offset, @@ -1517,15 +1515,15 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): ): arbitrary = None if dtype is None: - dtype = np.dtype("float64") + dtype = cudf.Float64Dtype() data = as_column( utils.scalar_broadcast_to(arbitrary, length, dtype=dtype) ) if not nan_as_null: - if np.issubdtype(data.dtype, np.floating): + if isinstance(data.dtype, cudf.Floating): data = data.fillna(np.nan) - elif np.issubdtype(data.dtype, np.datetime64): + elif isinstance(data.dtype, cudf.Datetime): data = data.fillna(np.datetime64("NaT")) elif hasattr(arbitrary, "__array_interface__"): @@ -1582,7 +1580,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): pa_data = pa.Array.from_pandas(arbitrary) data = as_column( - pa_data, dtype=make_dtype_from_obj(pa_data.type) + pa_data, dtype=cudf.dtype(pa_data.type) ) # There is no cast operation available for pa.Array from int to # str, Hence instead of handling in pa.Array block, we @@ -1633,9 +1631,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): data = as_column(data, dtype=cudf_dtype) mask = arbitrary._mask - mask = bools_to_mask( - as_column(mask).binary_operator("eq", np.bool_(False)) - ) + mask = bools_to_mask(as_column(mask).unary_operator("not")) data = data.set_mask(mask) @@ -1648,20 +1644,20 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): try: if dtype is not None: dtype = pd.api.types.pandas_dtype(dtype) - if dtype.is_categorical: + if is_categorical_dtype(dtype): raise TypeError pa_data = pa.array(arbitrary, type=dtype.pa_type if dtype is not None else None, from_pandas=True if nan_as_null is None else nan_as_null) - data = as_column(pa_data, dtype=make_dtype_from_obj(pa_data.type), nan_as_null=nan_as_null) + data = as_column(pa_data, dtype=cudf.dtype(pa_data.type), nan_as_null=nan_as_null) except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError): - if dtype.is_categorical: + if is_categorical_dtype(dtype): sr = pd.Series(arbitrary, dtype="category") data = as_column(sr, nan_as_null=nan_as_null, dtype=dtype) - elif dtype.to_numpy == np.str_: + elif isinstance(cudf.dtype(dtype), cudf.StringDtype): sr = pd.Series(arbitrary, dtype="str") data = as_column(sr, nan_as_null=nan_as_null) else: - native_dtype = dtype.to_numpy + native_dtype = dtype.to_numpy if dtype is not None else None if dtype is None and pd.api.types.infer_dtype( arbitrary ) in ("mixed", "mixed-integer"): diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 271351c3890..3e3c5c5f077 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -5,13 +5,13 @@ import pandas as pd import pyarrow as pa +import cudf.core.dtypes as cudf_dtypes from cudf import _lib as libcudf from cudf._lib.nvtx import annotate from cudf.core.buffer import Buffer from cudf.core.column import column from cudf.utils import utils from cudf.utils.dtypes import is_scalar, np_to_pa_dtype -from cudf.core.dtypes import make_dtype_from_obj # nanoseconds per time_unit _numpy_to_pandas_conversion = { @@ -46,7 +46,7 @@ def __init__( mask : Buffer; optional The validity mask """ - dtype = make_dtype_from_obj(dtype) + dtype = cudf_dtypes.dtype(dtype) if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") if size is None: @@ -60,7 +60,7 @@ def __init__( offset=offset, null_count=null_count, ) - assert self.dtype.type is np.datetime64 + assert isinstance(self.dtype, cudf_dtypes.Datetime) self._time_unit, _ = np.datetime_data(self.dtype.to_numpy) def __contains__(self, item): @@ -158,7 +158,7 @@ def as_string_column(self, dtype, **kwargs): kwargs["format"] = fmt if len(self) > 0: return string._numeric_to_str_typecast_functions[ - np.dtype(self.dtype) + self.dtype ](self, **kwargs) else: return column.column_empty(0, dtype="object", masked=False) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 459461d94e5..82fa5e2a4e2 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -18,7 +18,6 @@ np_to_pa_dtype, numeric_normalize_types, ) -from cudf.core.dtypes import make_dtype_from_obj class NumericalColumn(column.ColumnBase): def __init__( @@ -32,7 +31,7 @@ def __init__( The dtype associated with the data Buffer mask : Buffer, optional """ - dtype = make_dtype_from_obj(dtype) + dtype = cudf.dtype(dtype) if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") if size is None: @@ -83,8 +82,8 @@ def binary_operator(self, binop, rhs, reflect=False): if reflect: tmp = self if isinstance(rhs, (NumericalColumn, Scalar)) or np.isscalar(rhs): - out_dtype = np.result_type(make_dtype_from_obj(self.dtype).to_numpy, make_dtype_from_obj(rhs.dtype).to_numpy) - out_dtype = make_dtype_from_obj(out_dtype) + out_dtype = np.result_type(cudf.dtype(self.dtype).to_numpy, cudf.dtype(rhs.dtype).to_numpy) + out_dtype = cudf.dtype(out_dtype) if binop in ["mod", "floordiv"]: if (tmp.dtype in int_dtypes) and ( (np.isscalar(tmp) and (0 == tmp)) @@ -454,7 +453,6 @@ def _numeric_column_binop(lhs, rhs, op, out_dtype, reflect=False): if is_op_comparison: out_dtype = "bool" - out = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) if is_op_comparison: diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 2b8e030d49e..4056e0dc7d5 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -134,46 +134,47 @@ is_scalar, is_string_dtype, ) -from cudf.core.dtypes import make_dtype_from_obj + +from cudf.core.dtypes import dtype _str_to_numeric_typecast_functions = { - make_dtype_from_obj("int8"): str_cast.stoi8, - make_dtype_from_obj("int16"): str_cast.stoi16, - make_dtype_from_obj("int32"): str_cast.stoi, - make_dtype_from_obj("int64"): str_cast.stol, - make_dtype_from_obj("uint8"): str_cast.stoui8, - make_dtype_from_obj("uint16"): str_cast.stoui16, - make_dtype_from_obj("uint32"): str_cast.stoui, - make_dtype_from_obj("uint64"): str_cast.stoul, - make_dtype_from_obj("float32"): str_cast.stof, - make_dtype_from_obj("float64"): str_cast.stod, - make_dtype_from_obj("bool"): str_cast.to_booleans, + dtype("int8"): str_cast.stoi8, + dtype("int16"): str_cast.stoi16, + dtype("int32"): str_cast.stoi, + dtype("int64"): str_cast.stol, + dtype("uint8"): str_cast.stoui8, + dtype("uint16"): str_cast.stoui16, + dtype("uint32"): str_cast.stoui, + dtype("uint64"): str_cast.stoul, + dtype("float32"): str_cast.stof, + dtype("float64"): str_cast.stod, + dtype("bool"): str_cast.to_booleans, # TODO: support Date32 UNIX days # np.dtype("datetime64[D]"): str_cast.timestamp2int, - make_dtype_from_obj("datetime64[s]"): str_cast.timestamp2int, - make_dtype_from_obj("datetime64[ms]"): str_cast.timestamp2int, - make_dtype_from_obj("datetime64[us]"): str_cast.timestamp2int, - make_dtype_from_obj("datetime64[ns]"): str_cast.timestamp2int, + dtype("datetime64[s]"): str_cast.timestamp2int, + dtype("datetime64[ms]"): str_cast.timestamp2int, + dtype("datetime64[us]"): str_cast.timestamp2int, + dtype("datetime64[ns]"): str_cast.timestamp2int, } _numeric_to_str_typecast_functions = { - make_dtype_from_obj(np.dtype("int8")): str_cast.i8tos, - make_dtype_from_obj(np.dtype("int16")): str_cast.i16tos, - make_dtype_from_obj(np.dtype("int32")): str_cast.itos, - make_dtype_from_obj(np.dtype("int64")): str_cast.ltos, - make_dtype_from_obj(np.dtype("uint8")): str_cast.ui8tos, - make_dtype_from_obj(np.dtype("uint16")): str_cast.ui16tos, - make_dtype_from_obj(np.dtype("uint32")): str_cast.uitos, - make_dtype_from_obj(np.dtype("uint64")): str_cast.ultos, - make_dtype_from_obj(np.dtype("float32")): str_cast.ftos, - make_dtype_from_obj(np.dtype("float64")): str_cast.dtos, - make_dtype_from_obj(np.dtype("bool")): str_cast.from_booleans, + dtype(np.dtype("int8")): str_cast.i8tos, + dtype(np.dtype("int16")): str_cast.i16tos, + dtype(np.dtype("int32")): str_cast.itos, + dtype(np.dtype("int64")): str_cast.ltos, + dtype(np.dtype("uint8")): str_cast.ui8tos, + dtype(np.dtype("uint16")): str_cast.ui16tos, + dtype(np.dtype("uint32")): str_cast.uitos, + dtype(np.dtype("uint64")): str_cast.ultos, + dtype(np.dtype("float32")): str_cast.ftos, + dtype(np.dtype("float64")): str_cast.dtos, + dtype(np.dtype("bool")): str_cast.from_booleans, # TODO: support Date32 UNIX days # np.dtype("datetime64[D]"): str_cast.int2timestamp, - make_dtype_from_obj(np.dtype("datetime64[s]")): str_cast.int2timestamp, - make_dtype_from_obj(np.dtype("datetime64[ms]")): str_cast.int2timestamp, - make_dtype_from_obj(np.dtype("datetime64[us]")): str_cast.int2timestamp, - make_dtype_from_obj(np.dtype("datetime64[ns]")): str_cast.int2timestamp, + dtype(np.dtype("datetime64[s]")): str_cast.int2timestamp, + dtype(np.dtype("datetime64[ms]")): str_cast.int2timestamp, + dtype(np.dtype("datetime64[us]")): str_cast.int2timestamp, + dtype(np.dtype("datetime64[ns]")): str_cast.int2timestamp, } diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index cf2614c8017..88bb9406144 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -49,21 +49,13 @@ } -class Dtype(ExtensionDtype, _Dtype): - is_integer = False - is_string = False - is_boolean = False - is_categorical = False - is_datetime = False - is_list = False - is_float = False - is_numeric = False +class Generic(ExtensionDtype, _Dtype): pa_type = None def __eq__(self, other): if isinstance(other, self.__class__): return True - if isinstance(other, Dtype) and not isinstance(other, self.__class__): + if isinstance(other, Generic) and not isinstance(other, self.__class__): return False if ( isinstance(other, self.to_pandas.__class__) @@ -91,14 +83,14 @@ def itemsize(self): @property def type(self): - if self.is_float or self.is_datetime: - return self.to_numpy.kind + if isinstance(self, (Floating, Datetime)): + return self.to_numpy.type else: return self.to_pandas.type @property def kind(self): - if self.is_float: + if isinstance(self, Floating): return "f" else: return self.to_pandas.kind @@ -113,78 +105,87 @@ def __repr__(self): def __hash__(self): return hash(self.__repr__()) +class Number(Generic): + pass -class IntDtype(Dtype): - is_integer = True - is_numeric = True +class Integer(Number): + pass +class SignedInteger(Integer): + pass -class UInt8Dtype(IntDtype): +class UnsignedInteger(Integer): + pass + +class Inexact(Number): + pass + +class Floating(Inexact): + pass + +class Flexible(Generic): + pass + +class UInt8Dtype(UnsignedInteger): def __init__(self): self.pa_type = pa.uint8() self._name = "UInt8" -class UInt16Dtype(IntDtype): +class UInt16Dtype(UnsignedInteger): def __init__(self): self.pa_type = pa.uint16() self._name = "UInt16" -class UInt32Dtype(IntDtype): +class UInt32Dtype(UnsignedInteger): def __init__(self): self.pa_type = pa.uint32() self._name = "UInt32" -class UInt64Dtype(IntDtype): +class UInt64Dtype(UnsignedInteger): def __init__(self): self.pa_type = pa.uint64() self._name = "UInt64" -class Int8Dtype(IntDtype): +class Int8Dtype(SignedInteger): def __init__(self): self.pa_type = pa.int8() self._name = "Int8" -class Int16Dtype(IntDtype): +class Int16Dtype(SignedInteger): def __init__(self): self.pa_type = pa.int16() self._name = "Int16" -class Int32Dtype(IntDtype): +class Int32Dtype(SignedInteger): def __init__(self): self.pa_type = pa.int32() self._name = "Int32" -class Int64Dtype(IntDtype): +class Int64Dtype(SignedInteger): def __init__(self): self.pa_type = pa.int64() self._name = "Int64" - -class FloatDtype(Dtype): - is_float = True - is_numeric = True - - -class Float32Dtype(FloatDtype): +class Float32Dtype(Floating): def __init__(self): self.pa_type = pa.float32() self._name = "Float32" -class Float64Dtype(FloatDtype): +class Float64Dtype(Floating): def __init__(self): self.pa_type = pa.float64() self._name = "Float64" -class BooleanDtype(Dtype): +class BooleanDtype(Generic): is_boolean = True def __init__(self): @@ -192,35 +193,34 @@ def __init__(self): self._name = "Boolean" -class DatetimeDtype(Dtype): - is_datetime = True - +class Datetime(Generic): + pass -class Datetime64NSDtype(DatetimeDtype): +class Datetime64NSDtype(Datetime): def __init__(self): self.pa_type = pa.timestamp("ns") self._name = "Datetime64NS" -class Datetime64USDtype(DatetimeDtype): +class Datetime64USDtype(Datetime): def __init__(self): self.pa_type = pa.timestamp("us") self._name = "Datetime64US" -class Datetime64MSDtype(DatetimeDtype): +class Datetime64MSDtype(Datetime): def __init__(self): self.pa_type = pa.timestamp("ms") self._name = "Datetime64MS" -class Datetime64SDtype(DatetimeDtype): +class Datetime64SDtype(Datetime): def __init__(self): self.pa_type = pa.timestamp("s") self._name = "Datetime64S" -class StringDtype(Dtype): +class StringDtype(Flexible): is_string = True def __init__(self): @@ -272,10 +272,12 @@ def make_dtype_from_numpy(obj): return result -def make_dtype_from_obj(obj): +def dtype(obj): + if obj is None: + return None if isinstance(obj, CategoricalDtype): return obj - elif isinstance(obj, Dtype): + elif isinstance(obj, Generic): return np_to_cudf_dtypes[obj.to_numpy] if isinstance(obj, np.dtype): return np_to_cudf_dtypes[obj] @@ -291,14 +293,11 @@ def make_dtype_from_obj(obj): return np_to_cudf_dtypes[np.dtype(obj)] except: import pdb - pdb.set_trace() - raise TypeError("cant transform this object to a cudf dtype. ") - + -class CategoricalDtype(Dtype): - is_categorical = True +class CategoricalDtype(Generic): def __init__(self, categories=None, ordered=None): """ @@ -475,7 +474,7 @@ def __repr__(self): pa.timestamp("us"): Datetime64USDtype(), pa.timestamp("ms"): Datetime64MSDtype(), pa.timestamp("s"): Datetime64SDtype(), - None: Dtype, + pa.null(): None } np_to_cudf_dtypes = { diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 4eda63c345c..914d8d0ec1a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -10,7 +10,7 @@ from pandas._config import get_option from pandas.api.types import is_dict_like -from cudf.core.dtypes import Dtype +from cudf.core.dtypes import dtype as cudf_dtype import cudf from cudf import _lib as libcudf @@ -145,8 +145,7 @@ def __init__( ``null`` values. If ``False``, leaves ``np.nan`` values as is. """ - from cudf.core.dtypes import make_dtype_from_obj - dtype = make_dtype_from_obj(dtype) if dtype is not None else None + dtype = cudf_dtype(dtype) if isinstance(data, pd.Series): if name is None: name = data.name diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 7cb1509cb7f..35b4806f127 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -44,10 +44,10 @@ def test_column_offset_and_size(pandas_input, offset, size): children=col.base_children, ) - if col.dtype.is_categorical: + if cudf.utils.dtypes.is_categorical_dtype(col.dtype): assert col.size == col.codes.size assert col.size == (col.codes.data.size / col.codes.dtype.itemsize) - elif col.dtype.is_string: + elif isinstance(col.dtype, cudf.StringDtype): assert col.size == (col.children[0].size - 1) assert col.size == ( (col.children[0].data.size / col.children[0].dtype.itemsize) - 1 diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 6b8a199550b..30c7528b329 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -417,7 +417,7 @@ def time_col_replace_nulls(input_col): input_col, column.as_column( Buffer( - np.array([np.datetime64("NaT")], dtype=input_col.dtype).view( + np.array([np.datetime64("NaT")], dtype=input_col.dtype.to_numpy).view( "|u1" ) ), From ee6ece5a73c664e604c06cc1abbc3f997d553ab4 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 3 Aug 2020 13:44:48 -0700 Subject: [PATCH 15/80] working up through test_array_func --- python/cudf/cudf/_lib/reduce.pyx | 18 ++++++------------ python/cudf/cudf/_lib/transform.pyx | 2 +- python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/core/dtypes.py | 2 ++ 4 files changed, 10 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx index 9657693582e..39aad31f570 100644 --- a/python/cudf/cudf/_lib/reduce.pyx +++ b/python/cudf/cudf/_lib/reduce.pyx @@ -8,11 +8,12 @@ from cudf._lib.cpp.column.column cimport column from cudf._lib.scalar cimport Scalar from cudf._lib.column cimport Column from cudf._lib.types import np_to_cudf_types -from cudf._lib.types cimport underlying_type_t_type_id +from cudf._lib.types cimport underlying_type_t_type_id, _Dtype from cudf._lib.move cimport move from cudf._lib.aggregation cimport make_aggregation, aggregation from libcpp.memory cimport unique_ptr import numpy as np +from cudf.core.dtypes import dtype as cudf_dtype def reduce(reduction_op, Column incol, dtype=None, **kwargs): @@ -32,23 +33,16 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs): col_dtype = incol.dtype if reduction_op in ['sum', 'sum_of_squares', 'product']: - col_dtype = np.find_common_type([col_dtype], [np.uint64]) - col_dtype = col_dtype if dtype is None else dtype + col_dtype = np.find_common_type([col_dtype.to_numpy], [np.uint64]) + col_dtype = cudf_dtype(col_dtype) cdef column_view c_incol_view = incol.view() cdef unique_ptr[scalar] c_result cdef unique_ptr[aggregation] c_agg = move(make_aggregation( reduction_op, kwargs )) - cdef type_id tid = ( - ( - ( - np_to_cudf_types[np.dtype(col_dtype)] - ) - ) - ) - - cdef data_type c_out_dtype = data_type(tid) + cdef _Dtype data_dtype = col_dtype + cdef data_type c_out_dtype = data_dtype.get_libcudf_type() # check empty case if len(incol) <= incol.null_count: diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index c58669d6910..e80b7d9252f 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -75,7 +75,7 @@ def transform(Column input, op): cdef type_id c_tid cdef data_type c_dtype - nb_type = numpy_support.from_dtype(input.dtype) + nb_type = numpy_support.from_dtype(input.dtype.to_numpy) nb_signature = (nb_type,) compiled_op = cudautils.compile_udf(op, nb_signature) c_str = compiled_op[0].encode('UTF-8') diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index f35687bb642..d2340abdc83 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1606,7 +1606,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): arb_dtype = np.dtype("O") else: arb_dtype = check_cast_unsupported_dtype(arbitrary.dtype) - if arb_dtype != arbitrary.dtype.to_numpy: + if cudf.dtype(arb_dtype) != cudf.dtype(arbitrary.dtype): arbitrary = arbitrary.astype(arb_dtype) if arb_dtype.kind in ("O", "U"): data = as_column(pa.Array.from_pandas(arbitrary), dtype=arb_dtype) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 88bb9406144..830c305f05a 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -287,6 +287,8 @@ def dtype(obj): return make_dtype_from_string(obj) elif obj in pd_to_cudf_dtypes.keys(): return pd_to_cudf_dtypes[obj] + elif isinstance(obj, pd.core.arrays.numpy_.PandasDtype): + return make_dtype_from_string(obj.name) else: try: if issubclass(obj, np.generic): From 62c5e17c48fb4feb2940e51920a94f5675792e52 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 4 Aug 2020 07:12:36 -0700 Subject: [PATCH 16/80] more tests pass --- python/cudf/cudf/core/column/numerical.py | 4 ++-- python/cudf/cudf/core/series.py | 1 + python/cudf/cudf/tests/test_avro.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 82fa5e2a4e2..3f2fcc05240 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -85,7 +85,7 @@ def binary_operator(self, binop, rhs, reflect=False): out_dtype = np.result_type(cudf.dtype(self.dtype).to_numpy, cudf.dtype(rhs.dtype).to_numpy) out_dtype = cudf.dtype(out_dtype) if binop in ["mod", "floordiv"]: - if (tmp.dtype in int_dtypes) and ( + if (cudf.dtype(tmp.dtype) in int_dtypes) and ( (np.isscalar(tmp) and (0 == tmp)) or ((isinstance(tmp, NumericalColumn)) and (0.0 in tmp)) ): @@ -107,7 +107,7 @@ def normalize_binop_value(self, other): return other other_dtype = np.min_scalar_type(other) if other_dtype.kind in {"b", "i", "u", "f"}: - other_dtype = np.promote_types(self.dtype, other_dtype) + other_dtype = np.promote_types(self.dtype.to_numpy, other_dtype) if other_dtype == np.dtype("float16"): other = np.dtype("float32").type(other) other_dtype = other.dtype diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 914d8d0ec1a..a1ae149bb22 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1388,6 +1388,7 @@ def __rtruediv__(self, other): __div__ = __truediv__ def _bitwise_binop(self, other, op): + if isinstance(self.dtype, (cudf.BooleanDtype, cudf.Integer) if ( np.issubdtype(self.dtype, np.bool_) or np.issubdtype(self.dtype, np.integer) diff --git a/python/cudf/cudf/tests/test_avro.py b/python/cudf/cudf/tests/test_avro.py index 4a8a8d1bbdb..059f5343e0b 100644 --- a/python/cudf/cudf/tests/test_avro.py +++ b/python/cudf/cudf/tests/test_avro.py @@ -65,7 +65,7 @@ def test_avro_reader_basic(datadir, inputfile, columns, engine): # FASTAVRO produces int64 columns from avro int32 dtype, so convert # it back to int32 here for col in expect.columns: - expect[col] = expect[col].astype(got[col].dtype) + expect[col] = expect[col].astype(got[col].dtype.to_numpy) # fastavro appears to return columns in reverse order # (actual order may depend on pandas/python version) From ef5b9cb2f2e6c948b85d63d9b2995a7ee79336f3 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 21 Aug 2020 07:43:35 -0700 Subject: [PATCH 17/80] handle list dtype in _Dtype --- python/cudf/cudf/_lib/column.pyx | 20 ++------------------ python/cudf/cudf/_lib/types.pyx | 20 ++++++++++++++------ 2 files changed, 16 insertions(+), 24 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 4b3a18b4a97..812c42c8c51 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -364,25 +364,9 @@ cdef class Column: col = self.base_children[0] else: col = self -<<<<<<< HEAD - cdef _Dtype data_dtype = col.dtype - cdef libcudf_types.data_type dtype = data_dtype.get_libcudf_type() -======= - data_dtype = col.dtype - cdef libcudf_types.type_id tid - - if not is_list_dtype(self.dtype): - tid = ( - ( - np_to_cudf_types[np.dtype(data_dtype)] - ) - ) - else: - tid = libcudf_types.type_id.LIST - - cdef libcudf_types.data_type dtype = libcudf_types.data_type(tid) ->>>>>>> branch-0.15 + cdef _Dtype pydtype = self.dtype + cdef libcudf_types.data_type dtype = pydtype.get_libcudf_type() cdef libcudf_types.size_type offset = self.offset cdef vector[column_view] children cdef void* data diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index 6f7739324f7..457bf16eeab 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -164,13 +164,21 @@ class NullHandling(IntEnum): cdef class _Dtype: cdef data_type get_libcudf_type(self) except *: - np_dtype = self.to_numpy - cdef libcudf_types.type_id tid = ( - ( - np_to_cudf_types[np_dtype] + + cdef libcudf_types.type_id tid + cdef data_type libcudf_type + + if not isinstance(self, ListDtype): + np_dtype = self.to_numpy + tid = ( + ( + np_to_cudf_types[np_dtype] + ) ) - ) - cdef data_type libcudf_type = libcudf_types.data_type(tid) + else: + tid = libcudf_types.type_id.LIST + + libcudf_type = libcudf_types.data_type(tid) return libcudf_type From 93207553d516a5d332fbb8d55e239b425c30c2a0 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 21 Aug 2020 07:43:44 -0700 Subject: [PATCH 18/80] fix series syntax error --- python/cudf/cudf/core/series.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 61b9fb5a26d..e5c8ecfed29 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1381,7 +1381,6 @@ def __rtruediv__(self, other): __div__ = __truediv__ def _bitwise_binop(self, other, op): - if isinstance(self.dtype, (cudf.BooleanDtype, cudf.Integer) if ( np.issubdtype(self.dtype, np.bool_) or np.issubdtype(self.dtype, np.integer) From dac2940270933cc25d312851ff65a36ecf27a7ab Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 21 Aug 2020 08:26:54 -0700 Subject: [PATCH 19/80] add timedelta dtypes --- python/cudf/cudf/__init__.py | 4 ++++ python/cudf/cudf/core/dtypes.py | 37 +++++++++++++++++++++++++++++---- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index af28742cfee..04dcc286968 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -56,6 +56,10 @@ Datetime64USDtype, Datetime64MSDtype, Datetime64SDtype, + Timedelta64NSDtype, + Timedelta64USDtype, + Timedelta64MSDtype, + Timedelta64SDtype ) from cudf.core.groupby import Grouper diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 4dc138059ee..dc77e30d882 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -126,6 +126,12 @@ class Floating(Inexact): class Flexible(Generic): pass +class Datetime(Generic): + pass + +class Timedelta(Generic): + pass + class UInt8Dtype(UnsignedInteger): def __init__(self): self.pa_type = pa.uint8() @@ -192,10 +198,6 @@ def __init__(self): self.pa_type = pa.bool_() self._name = "Boolean" - -class Datetime(Generic): - pass - class Datetime64NSDtype(Datetime): def __init__(self): self.pa_type = pa.timestamp("ns") @@ -219,6 +221,25 @@ def __init__(self): self.pa_type = pa.timestamp("s") self._name = "Datetime64S" +class Timedelta64NSDtype(Timedelta): + def __init__(self): + self.pa_type = pa.duration('ns') + self._name = "Timedelta64NS" + +class Timedelta64USDtype(Timedelta): + def __init__(self): + self.pa_type = pa.duration('us') + self._name = "Timedelta64US" + +class Timedelta64MSDtype(Timedelta): + def __init__(self): + self.pa_type = pa.duration('ms') + self._name = "Timedelta64MS" + +class Timedelta64SDtype(Timedelta): + def __init__(self): + self.pa_type = pa.duration('s') + self._name = "Timedelta64S" class StringDtype(Flexible): is_string = True @@ -475,6 +496,10 @@ def __repr__(self): pa.timestamp("us"): Datetime64USDtype(), pa.timestamp("ms"): Datetime64MSDtype(), pa.timestamp("s"): Datetime64SDtype(), + pa.duration("ns"): Timedelta64NSDtype(), + pa.duration("us"): Timedelta64USDtype(), + pa.duration("ms"): Timedelta64MSDtype(), + pa.duration("s"): Timedelta64SDtype(), pa.null(): None } @@ -495,6 +520,10 @@ def __repr__(self): np.dtype("datetime64[us]"): Datetime64USDtype(), np.dtype("datetime64[ms]"): Datetime64MSDtype(), np.dtype("datetime64[s]"): Datetime64SDtype(), + np.dtype("timedelta64[ns]"): Timedelta64NSDtype(), + np.dtype("timedelta64[us]"): Timedelta64USDtype(), + np.dtype("timedelta64[ms]"): Timedelta64MSDtype(), + np.dtype("timedelta64[s]"): Timedelta64SDtype(), } pd_to_cudf_dtypes = { From 6eee9eb122b02b7af2daade5f7604038950518f2 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 21 Aug 2020 08:27:13 -0700 Subject: [PATCH 20/80] fix some numericalcolumn bugs --- python/cudf/cudf/core/column/numerical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 7d06a9a01f8..2464e691ce7 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -194,7 +194,7 @@ def to_arrow(self): buffers=[mask, data], null_count=self.null_count, ) - if self.dtype.is_boolean: + if isinstance(self.dtype, cudf.core.dtypes.BooleanDtype): return out.cast(pa.bool_()) else: return out From 1ace46016eb601015afffaabe78700b99eb1f206 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 21 Aug 2020 08:27:26 -0700 Subject: [PATCH 21/80] fix index type mapping dicts --- python/cudf/cudf/core/index.py | 41 +++++++++++++++++----------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 69a5b0680a5..2663ad8e22d 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -33,6 +33,7 @@ numeric_normalize_types, ) from cudf.utils.utils import cached_property +from cudf.core.dtypes import dtype def _to_frame(this_index, index=True, name=None): @@ -2561,29 +2562,29 @@ def as_index(arbitrary, **kwargs): _dtype_to_index = { - np.int8: Int8Index, - np.int16: Int16Index, - np.int32: Int32Index, - np.int64: Int64Index, - np.uint8: UInt8Index, - np.uint16: UInt16Index, - np.uint32: UInt32Index, - np.uint64: UInt64Index, - np.float32: Float32Index, - np.float64: Float64Index, + dtype(np.int8): Int8Index, + dtype(np.int16): Int16Index, + dtype(np.int32): Int32Index, + dtype(np.int64): Int64Index, + dtype(np.uint8): UInt8Index, + dtype(np.uint16): UInt16Index, + dtype(np.uint32): UInt32Index, + dtype(np.uint64): UInt64Index, + dtype(np.float32): Float32Index, + dtype(np.float64): Float64Index, } _index_to_dtype = { - Int8Index: np.int8, - Int16Index: np.int16, - Int32Index: np.int32, - Int64Index: np.int64, - UInt8Index: np.uint8, - UInt16Index: np.uint16, - UInt32Index: np.uint32, - UInt64Index: np.uint64, - Float32Index: np.float32, - Float64Index: np.float64, + Int8Index: dtype(np.int8), + Int16Index: dtype(np.int16), + Int32Index: dtype(np.int32), + Int64Index: dtype(np.int64), + UInt8Index: dtype(np.uint8), + UInt16Index: dtype(np.uint16), + UInt32Index: dtype(np.uint32), + UInt64Index: dtype(np.uint64), + Float32Index: dtype(np.float32), + Float64Index: dtype(np.float64), } From df6426bbbe41ba307926551b28dfbde0eb60444e Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 24 Aug 2020 07:58:22 -0700 Subject: [PATCH 22/80] pass all binop tests --- python/cudf/cudf/__init__.py | 3 ++ python/cudf/cudf/_lib/scalar.pyx | 10 ++--- python/cudf/cudf/core/column/column.py | 6 +-- python/cudf/cudf/core/column/datetime.py | 24 ++++++------ python/cudf/cudf/core/column/numerical.py | 4 +- python/cudf/cudf/core/column/string.py | 2 +- python/cudf/cudf/core/dtypes.py | 13 +++++-- python/cudf/cudf/core/series.py | 45 +++++++++-------------- 8 files changed, 54 insertions(+), 53 deletions(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 04dcc286968..56892f6787f 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -38,7 +38,10 @@ Datetime, Floating, Number, + Integer, Flexible, + Datetime, + Timedelta, CategoricalDtype, Int8Dtype, Int16Dtype, diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 7757c5a8ad6..ba2ecef1cd5 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -73,17 +73,17 @@ cdef class Scalar: else: dtype = value.dtype - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) - if pd.api.types.is_string_dtype(dtype): + if isinstance(dtype, cudf.StringDtype): _set_string_from_np_string(self.c_value, value, valid) - elif pd.api.types.is_numeric_dtype(dtype): + elif isinstance(dtype, (cudf.Number, cudf.BooleanDtype)): _set_numeric_from_np_scalar(self.c_value, value, dtype, valid) - elif pd.api.types.is_datetime64_dtype(dtype): + elif isinstance(dtype, cudf.Datetime): _set_datetime64_from_np_scalar( self.c_value, value, dtype, valid ) - elif pd.api.types.is_timedelta64_dtype(dtype): + elif isinstance(dtype, cudf.Timedelta): _set_timedelta64_from_np_scalar( self.c_value, value, dtype, valid ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 6c741d01580..c89de55148c 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -887,9 +887,9 @@ def astype(self, dtype, **kwargs): "Casting list columns not currently supported" ) return self - elif np.issubdtype(dtype, np.datetime64): + elif isinstance(dtype, cudf.Datetime): return self.as_datetime_column(dtype, **kwargs) - elif np.issubdtype(dtype, np.timedelta64): + elif isinstance(dtype, cudf.Timedelta): return self.as_timedelta_column(dtype, **kwargs) else: return self.as_numerical_column(dtype, **kwargs) @@ -968,7 +968,7 @@ def __cuda_array_interface__(self): output = { "shape": (len(self),), "strides": (self.dtype.itemsize,), - "typestr": self.dtype.str, + "typestr": self.dtype.to_numpy.str, "data": (self.data_ptr, False), "version": 1, } diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 75c48273d42..807f0803e7f 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -66,11 +66,9 @@ def __init__( if not (self.dtype.type is np.datetime64): raise TypeError(f"{self.dtype} is not a supported datetime type") - self._time_unit, _ = np.datetime_data(self.dtype) - def __contains__(self, item): try: - item = np.datetime64(item, self._time_unit) + item = np.datetime64(item, self.dtype._time_unit) except ValueError: # If item cannot be converted to datetime type # np.datetime64 raises ValueError, hence `item` @@ -80,7 +78,7 @@ def __contains__(self, item): @property def time_unit(self): - return self._time_unit + return self.dtype._time_unit @property def year(self): @@ -127,7 +125,7 @@ def normalize_binop_value(self, other): if np.isnat(other): return as_scalar(val=None, dtype=self.dtype) - other = other.astype(self.dtype) + other = other.astype(self.dtype.to_numpy) return as_scalar(other) elif isinstance(other, np.timedelta64): other_time_unit = cudf.utils.dtypes.get_time_unit(other) @@ -200,25 +198,29 @@ def default_na_value(self): def binary_operator(self, op, rhs, reflect=False): lhs, rhs = self, rhs + + lhs_dtype = cudf.dtype(lhs.dtype) + rhs_dtype = cudf.dtype(rhs.dtype) + if op in ("eq", "ne", "lt", "gt", "le", "ge"): - out_dtype = np.bool - elif op == "add" and pd.api.types.is_timedelta64_dtype(rhs.dtype): + out_dtype = cudf.BooleanDtype() + elif op == "add" and isinstance(rhs_dtype, cudf.Timedelta): out_dtype = cudf.core.column.timedelta._timedelta_binary_op_add( rhs, lhs ) - elif op == "sub" and pd.api.types.is_timedelta64_dtype(rhs.dtype): + elif op == "sub" and isinstance(rhs_dtype, cudf.Timedelta): out_dtype = cudf.core.column.timedelta._timedelta_binary_op_sub( rhs if reflect else lhs, lhs if reflect else rhs ) - elif op == "sub" and pd.api.types.is_datetime64_dtype(rhs.dtype): + elif op == "sub" and isinstance(rhs.dtype, cudf.Datetime): units = ["s", "ms", "us", "ns"] lhs_time_unit = cudf.utils.dtypes.get_time_unit(lhs) lhs_unit = units.index(lhs_time_unit) rhs_time_unit = cudf.utils.dtypes.get_time_unit(rhs) rhs_unit = units.index(rhs_time_unit) - out_dtype = np.dtype( + out_dtype = cudf.dtype(np.dtype( f"timedelta64[{units[max(lhs_unit, rhs_unit)]}]" - ) + )) else: raise TypeError( f"Series of dtype {self.dtype} cannot perform " diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 2464e691ce7..8cb8ecaf752 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -189,7 +189,7 @@ def to_arrow(self): mask = pa.py_buffer(self.mask_array_view.copy_to_host()) data = pa.py_buffer(self.data_array_view.copy_to_host()) out = pa.Array.from_buffers( - type=self.dtype.pa_type, + type=self.dtype.pa_type if not isinstance(self.dtype, cudf.core.dtypes.BooleanDtype) else pa.int8(), length=len(self), buffers=[mask, data], null_count=self.null_count, @@ -437,7 +437,7 @@ def _numeric_column_binop(lhs, rhs, op, out_dtype, reflect=False): is_op_comparison = op in ["lt", "gt", "le", "ge", "eq", "ne"] if is_op_comparison: - out_dtype = "bool" + out_dtype = cudf.BooleanDtype() out = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) if is_op_comparison: diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 46d947e82f8..e47e42c1f13 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4772,7 +4772,7 @@ def binary_operator(self, op, rhs, reflect=False): if isinstance(rhs, StringColumn) and op == "add": return lhs.str().cat(others=rhs) elif op in ("eq", "ne", "gt", "lt", "ge", "le"): - return _string_column_binop(self, rhs, op=op, out_dtype="bool") + return _string_column_binop(self, rhs, op=op, out_dtype=cudf.BooleanDtype()) else: msg = "{!r} operator not supported between {} and {}" raise TypeError(msg.format(op, type(self), type(rhs))) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index dc77e30d882..791dc74be54 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -126,7 +126,7 @@ class Floating(Inexact): class Flexible(Generic): pass -class Datetime(Generic): +class Datetime(Generic): pass class Timedelta(Generic): @@ -202,24 +202,28 @@ class Datetime64NSDtype(Datetime): def __init__(self): self.pa_type = pa.timestamp("ns") self._name = "Datetime64NS" + self._time_unit = "ns" class Datetime64USDtype(Datetime): def __init__(self): self.pa_type = pa.timestamp("us") self._name = "Datetime64US" + self._time_unit = "us" class Datetime64MSDtype(Datetime): def __init__(self): self.pa_type = pa.timestamp("ms") self._name = "Datetime64MS" + self._time_unit = "ms" class Datetime64SDtype(Datetime): def __init__(self): self.pa_type = pa.timestamp("s") self._name = "Datetime64S" + self._time_unit = "s" class Timedelta64NSDtype(Timedelta): def __init__(self): @@ -278,7 +282,7 @@ def make_dtype_from_string(obj): return UInt16Dtype() elif obj in {"uint8", "Uint8"}: return UInt8Dtype() - elif "float" in obj: + elif "float" in obj or "Float" in obj: if obj in {"float64", "Float64"}: return Float64Dtype() elif obj in {"float32", "Float32"}: @@ -299,7 +303,9 @@ def dtype(obj): if isinstance(obj, CategoricalDtype): return obj elif isinstance(obj, Generic): - return np_to_cudf_dtypes[obj.to_numpy] + return obj + elif issubclass(obj.__class__, Generic): + return obj() if isinstance(obj, np.dtype): return np_to_cudf_dtypes[obj] elif isinstance(obj, pa.lib.DataType): @@ -513,6 +519,7 @@ def __repr__(self): np.dtype("uint32"): UInt32Dtype(), np.dtype("uint64"): UInt64Dtype(), np.dtype("bool"): BooleanDtype(), + np.dtype("U"): StringDtype(), np.dtype("object"): StringDtype(), np.dtype("float32"): Float32Dtype(), np.dtype("float64"): Float64Dtype(), diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index e5c8ecfed29..f31e43b179b 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1058,8 +1058,8 @@ def _binaryop(self, other, fn, fill_value=None, reflect=False): rhs = self._normalize_binop_value(rhs) if fn == "truediv": - if str(lhs.dtype) in truediv_int_dtype_corrections: - truediv_type = truediv_int_dtype_corrections[str(lhs.dtype)] + if lhs.dtype.name in truediv_int_dtype_corrections: + truediv_type = truediv_int_dtype_corrections[lhs.dtype.name] lhs = lhs.astype(truediv_type) if fill_value is not None: @@ -1381,27 +1381,17 @@ def __rtruediv__(self, other): __div__ = __truediv__ def _bitwise_binop(self, other, op): - if ( - np.issubdtype(self.dtype, np.bool_) - or np.issubdtype(self.dtype, np.integer) - ) and ( - np.issubdtype(other.dtype, np.bool_) - or np.issubdtype(other.dtype, np.integer) - ): - # TODO: This doesn't work on Series (op) DataFrame - # because dataframe doesn't have dtype + if (isinstance(self.dtype, (cudf.BooleanDtype, cudf.Integer))) and (isinstance(other.dtype, (cudf.BooleanDtype, cudf.Integer))): ser = self._binaryop(other, op) - if np.issubdtype(self.dtype, np.bool_) or np.issubdtype( - other.dtype, np.bool_ - ): - ser = ser.astype(np.bool_) - return ser + if isinstance(self.dtype, cudf.BooleanDtype) or isinstance(other.dtype, cudf.BooleanDtype): + ser = ser.astype(cudf.BooleanDtype()) else: raise TypeError( f"Operation 'bitwise {op}' not supported between " f"{self.dtype.type.__name__} and {other.dtype.type.__name__}" ) - + return ser + def __and__(self, other): """Performs vectorized bitwise and (&) on corresponding elements of two series. @@ -2125,7 +2115,6 @@ def astype(self, dtype, copy=False, errors="raise"): return self.copy(deep=copy) try: data = self._column.astype(dtype) - return self._copy_construct( data=data.copy(deep=True) if copy else data, index=self.index ) @@ -4239,16 +4228,16 @@ def keys(self): truediv_int_dtype_corrections = { - "int8": "float32", - "int16": "float32", - "int32": "float32", - "int64": "float64", - "uint8": "float32", - "uint16": "float32", - "uint32": "float64", - "uint64": "float64", - "bool": "float32", - "int": "float", + "Int8": "Float32", + "Int16": "Float32", + "Int32": "Float32", + "Int64": "Float64", + "UInt8": "Float32", + "UInt16": "Float32", + "UInt32": "Float64", + "UInt64": "Float64", + "Boolean": "Float32", + "Int": "Float", } From 92d1a644f0a40156f3c068323581710b0b321aae Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 25 Aug 2020 15:33:58 -0700 Subject: [PATCH 23/80] more progress --- python/cudf/cudf/__init__.py | 1 + python/cudf/cudf/_lib/column.pyx | 2 +- python/cudf/cudf/_lib/stream_compaction.pyx | 3 +- python/cudf/cudf/core/column/categorical.py | 5 ++- python/cudf/cudf/core/column/column.py | 4 +- python/cudf/cudf/core/column/numerical.py | 19 +++++---- python/cudf/cudf/core/column/string.py | 2 +- python/cudf/cudf/core/column/timedelta.py | 47 +++++++++------------ python/cudf/cudf/core/dtypes.py | 16 ++++++- python/cudf/cudf/core/join/join.py | 20 ++++----- python/cudf/cudf/core/series.py | 4 +- python/cudf/cudf/tests/test_column.py | 9 ++-- python/cudf/cudf/utils/dtypes.py | 16 ++++--- 13 files changed, 84 insertions(+), 64 deletions(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 56892f6787f..cff9df9f032 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -8,6 +8,7 @@ import rmm +import cudf.api.types from cudf import core, datasets, testing from cudf._version import get_versions from cudf.core import ( diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 812c42c8c51..5021778be44 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -365,7 +365,7 @@ cdef class Column: else: col = self - cdef _Dtype pydtype = self.dtype + cdef _Dtype pydtype = col.dtype cdef libcudf_types.data_type dtype = pydtype.get_libcudf_type() cdef libcudf_types.size_type offset = self.offset cdef vector[column_view] children diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index 2d81eb49f00..aa59fe14e2c 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -1,6 +1,7 @@ # Copyright (c) 2020, NVIDIA CORPORATION. import pandas as pd +from cudf.core.dtypes import BooleanDtype from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector @@ -99,7 +100,7 @@ def apply_boolean_mask(Table source_table, Column boolean_mask): Table obtained from applying mask """ - assert pd.api.types.is_bool_dtype(boolean_mask.dtype) + assert isinstance(boolean_mask.dtype, BooleanDtype) cdef unique_ptr[table] c_result cdef table_view source_table_view = source_table.view() diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 3a8df934264..5185660c13c 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -308,8 +308,9 @@ def add_categories(self, new_categories, **kwargs): f"existing categories." ) common_dtype = np.find_common_type( - [old_categories.dtype, new_categories.dtype], [] + [old_categories.dtype.to_numpy, new_categories.dtype.to_numpy], [] ) + common_dtype = cudf.dtype(common_dtype) new_categories = new_categories.astype(common_dtype, copy=False) old_categories = old_categories.astype(common_dtype, copy=False) @@ -1254,7 +1255,7 @@ def _create_empty_categorical_column(categorical_column, dtype): cudf.utils.utils.scalar_broadcast_to( categorical_column.default_na_value(), categorical_column.size, - np.dtype(categorical_column.cat().codes), + categorical_column.cat().codes.dtype, ) ), offset=categorical_column.offset, diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index c89de55148c..33c0b98d203 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -516,9 +516,9 @@ def __getitem__(self, arg): arg = as_column(arg) if len(arg) == 0: arg = as_column([], dtype="int32") - if pd.api.types.is_integer_dtype(arg.dtype): + if pd.api.types.is_integer_dtype(arg.dtype) or isinstance(arg.dtype, cudf.Integer): return self.take(arg) - if pd.api.types.is_bool_dtype(arg.dtype): + if pd.api.types.is_bool_dtype(arg.dtype) or isinstance(arg.dtype, cudf.BooleanDtype): return self.apply_boolean_mask(arg) raise NotImplementedError(type(arg)) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 8cb8ecaf752..b0e4c563fd8 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -272,6 +272,7 @@ def find_and_replace(self, to_replace, replacement, all_nan): """ Return col with *to_replace* replaced with *value*. """ + to_replace_col = _normalize_find_and_replace_input( self.dtype, to_replace ) @@ -379,14 +380,15 @@ def can_cast_safely(self, to_dtype): safely cast to dtype """ if self.dtype.kind == to_dtype.kind: - if self.dtype <= to_dtype: + # todo: implement >, < for cudf.Dtype + if self.dtype.to_numpy <= to_dtype.to_numpy: return True else: # Kinds are the same but to_dtype is smaller - if "float" in to_dtype.name: - info = np.finfo(to_dtype) - elif "int" in to_dtype.name: - info = np.iinfo(to_dtype) + if isinstance(to_dtype, cudf.Floating): + info = np.finfo(to_dtype.to_numpy) + elif isinstance(to_dtype, cudf.Integer): + info = np.iinfo(to_dtype.to_numpy) min_, max_ = info.min, info.max if (self.min() > min_) and (self.max() < max_): @@ -396,7 +398,7 @@ def can_cast_safely(self, to_dtype): # want to cast int to float elif to_dtype.kind == "f" and self.dtype.kind in {"i", "u"}: - info = np.finfo(to_dtype) + info = np.finfo(to_dtype.to_numpy) biggest_exact_int = 2 ** (info.nmant + 1) if (self.min() >= -biggest_exact_int) and ( self.max() <= biggest_exact_int @@ -415,7 +417,7 @@ def can_cast_safely(self, to_dtype): # want to cast float to int: elif to_dtype.kind in {"i", "u"} and self.dtype.kind == "f": - info = np.iinfo(to_dtype) + info = np.iinfo(to_dtype.to_numpy) min_, max_ = info.min, info.max # best we can do is hope to catch it here and avoid compare if (self.min() >= min_) and (self.max() <= max_): @@ -503,11 +505,10 @@ def _normalize_find_and_replace_input(input_column_dtype, col_to_normalize): col_to_normalize_dtype = col_to_normalize.dtype else: raise TypeError(f"Type {type(col_to_normalize)} not supported") - if ( col_to_normalize_dtype.kind == "f" and input_column_dtype.kind in {"i", "u"} - ) or (col_to_normalize_dtype.num > input_column_dtype.num): + ) or (col_to_normalize_dtype.to_numpy.num > input_column_dtype.to_numpy.num): raise TypeError( f"Potentially unsafe cast for non-equivalent " f"{col_to_normalize_dtype.name} " diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index e47e42c1f13..be96e18c148 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4740,7 +4740,7 @@ def fillna(self, fill_value): def _find_first_and_last(self, value): found_indices = self.str().contains(f"^{value}$") - found_indices = libcudf.unary.cast(found_indices, dtype=np.int32) + found_indices = libcudf.unary.cast(found_indices, dtype=cudf.Int32Dtype()) first = column.as_column(found_indices).find_first_value(1) last = column.as_column(found_indices).find_last_value(1) return first, last diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 521d422a233..644c06bcd80 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -44,7 +44,7 @@ def __init__( The number of null values. If None, it is calculated automatically. """ - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") @@ -60,11 +60,9 @@ def __init__( if not (self.dtype.type is np.timedelta64): raise TypeError(f"{self.dtype} is not a supported duration type") - self._time_unit, _ = np.datetime_data(self.dtype) - def __contains__(self, item): try: - item = np.timedelta64(item, self._time_unit) + item = np.timedelta64(item, self.dtype._time_unit) except ValueError: # If item cannot be converted to duration type # np.timedelta64 raises ValueError, hence `item` @@ -111,7 +109,7 @@ def to_arrow(self): def _binary_op_floordiv(self, rhs): lhs, rhs = self, rhs - if pd.api.types.is_timedelta64_dtype(rhs.dtype): + if cudf.api.types.is_timedelta64_dtype(rhs.dtype): common_dtype = determine_out_dtype(self.dtype, rhs.dtype) lhs = lhs.astype(common_dtype).astype("float64") @@ -148,7 +146,7 @@ def _binary_op_mul(self, rhs): return out_dtype def _binary_op_mod(self, rhs): - if pd.api.types.is_timedelta64_dtype(rhs.dtype): + if cudf.api.types.is_timedelta64_dtype(rhs.dtype): out_dtype = determine_out_dtype(self.dtype, rhs.dtype) elif rhs.dtype.kind in ("f", "i", "u"): out_dtype = self.dtype @@ -160,8 +158,8 @@ def _binary_op_mod(self, rhs): return out_dtype def _binary_op_eq_ne(self, rhs): - if pd.api.types.is_timedelta64_dtype(rhs.dtype): - out_dtype = np.bool + if cudf.api.types.is_timedelta64_dtype(rhs.dtype): + out_dtype = cudf.BooleanDtype() else: raise TypeError( f"Equality of {self.dtype} with {rhs.dtype} " @@ -170,8 +168,8 @@ def _binary_op_eq_ne(self, rhs): return out_dtype def _binary_op_lt_gt_le_ge(self, rhs): - if pd.api.types.is_timedelta64_dtype(rhs.dtype): - return np.bool + if cudf.api.types.is_timedelta64_dtype(rhs.dtype): + return cudf.BooleanDtype() else: raise TypeError( f"Invalid comparison between dtype={self.dtype}" @@ -180,7 +178,7 @@ def _binary_op_lt_gt_le_ge(self, rhs): def _binary_op_truediv(self, rhs): lhs, rhs = self, rhs - if pd.api.types.is_timedelta64_dtype(rhs.dtype): + if cudf.api.types.is_timedelta64_dtype(rhs.dtype): common_dtype = determine_out_dtype(self.dtype, rhs.dtype) lhs = lhs.astype(common_dtype).astype("float64") @@ -234,7 +232,8 @@ def binary_operator(self, op, rhs, reflect=False): if reflect: lhs, rhs = rhs, lhs - + import pdb + pdb.set_trace() return binop(lhs, rhs, op=op, out_dtype=out_dtype) def normalize_binop_value(self, other): @@ -575,24 +574,24 @@ def binop(lhs, rhs, op, out_dtype): def determine_out_dtype(lhs_dtype, rhs_dtype): - if np.can_cast(np.dtype(lhs_dtype), np.dtype(rhs_dtype)): - return rhs_dtype - elif np.can_cast(np.dtype(rhs_dtype), np.dtype(lhs_dtype)): - return lhs_dtype + if np.can_cast(cudf.dtype(lhs_dtype).to_numpy, cudf.dtype(rhs_dtype).to_numpy): + return cudf.dtype(rhs_dtype) + elif np.can_cast(cudf.dtype(rhs_dtype).to_numpy, cudf.dtype(lhs_dtype).to_numpy): + return cudf.dtype(lhs_dtype) else: raise TypeError(f"Cannot type-cast {lhs_dtype} and {rhs_dtype}") def _timedelta_binary_op_add(lhs, rhs): - if pd.api.types.is_timedelta64_dtype(rhs.dtype): + if isinstance(rhs.dtype, cudf.Timedelta): out_dtype = determine_out_dtype(lhs.dtype, rhs.dtype) - elif pd.api.types.is_datetime64_dtype(rhs.dtype): + elif isinstance(rhs.dtype, cudf.Datetime): units = ["s", "ms", "us", "ns"] lhs_time_unit = cudf.utils.dtypes.get_time_unit(lhs) lhs_unit = units.index(lhs_time_unit) rhs_time_unit = cudf.utils.dtypes.get_time_unit(rhs) rhs_unit = units.index(rhs_time_unit) - out_dtype = np.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]") + out_dtype = cudf.dtype(np.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]")) else: raise TypeError( f"Addition of {lhs.dtype} with {rhs.dtype} " @@ -603,19 +602,15 @@ def _timedelta_binary_op_add(lhs, rhs): def _timedelta_binary_op_sub(lhs, rhs): - if pd.api.types.is_timedelta64_dtype( - lhs.dtype - ) and pd.api.types.is_timedelta64_dtype(rhs.dtype): + if isinstance(lhs.dtype, cudf.Timedelta) and isinstance(rhs.dtype, cudf.Timedelta): out_dtype = determine_out_dtype(lhs.dtype, rhs.dtype) - elif pd.api.types.is_timedelta64_dtype( - rhs.dtype - ) and pd.api.types.is_datetime64_dtype(lhs.dtype): + elif isinstance(rhs.dtype, cudf.Timedelta) and isinstance(lhs.dtype, cudf.Datetime): units = ["s", "ms", "us", "ns"] lhs_time_unit = cudf.utils.dtypes.get_time_unit(lhs) lhs_unit = units.index(lhs_time_unit) rhs_time_unit = cudf.utils.dtypes.get_time_unit(rhs) rhs_unit = units.index(rhs_time_unit) - out_dtype = np.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]") + out_dtype = cudf.dtype(np.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]")) else: raise TypeError( f"Subtraction of {lhs.dtype} with {rhs.dtype} " diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 791dc74be54..4fb67ce1424 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -26,6 +26,10 @@ pa.timestamp("us"): np.dtype("datetime64[us]"), pa.timestamp("ms"): np.dtype("datetime64[ms]"), pa.timestamp("s"): np.dtype("datetime64[s]"), + pa.duration("ns"): np.dtype('timedelta64[ns]'), + pa.duration("us"): np.dtype('timedelta64[us]'), + pa.duration("ms"): np.dtype('timedelta64[ms]'), + pa.duration("s"): np.dtype('timedelta64[s]'), } pa_to_np_dtypes = { @@ -45,6 +49,10 @@ pa.timestamp("us"): np.dtype("datetime64[us]"), pa.timestamp("ms"): np.dtype("datetime64[ms]"), pa.timestamp("s"): np.dtype("datetime64[s]"), + pa.duration("ns"): np.dtype('timedelta64[ns]'), + pa.duration("us"): np.dtype('timedelta64[us]'), + pa.duration("ms"): np.dtype('timedelta64[ms]'), + pa.duration("s"): np.dtype('timedelta64[s]'), None: None, } @@ -192,7 +200,6 @@ def __init__(self): class BooleanDtype(Generic): - is_boolean = True def __init__(self): self.pa_type = pa.bool_() @@ -289,6 +296,8 @@ def make_dtype_from_string(obj): return Float32Dtype() elif "bool" in obj: return BooleanDtype() + elif "category" in obj: + return "category" def make_dtype_from_numpy(obj): @@ -300,6 +309,8 @@ def make_dtype_from_numpy(obj): def dtype(obj): if obj is None: return None + if isinstance(obj, pd.CategoricalDtype): + return cudf.CategoricalDtype.from_pandas(obj) if isinstance(obj, CategoricalDtype): return obj elif isinstance(obj, Generic): @@ -336,6 +347,9 @@ def __init__(self, categories=None, ordered=None): self._categories = self._init_categories(categories) self.ordered = ordered + def __repr__(self): + return self.to_pandas().__repr__() + @property def categories(self): if self._categories is None: diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 0aadcf875cb..459d8e215c4 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -356,7 +356,7 @@ def input_to_libcudf_casting_rules(self, lcol, rcol, how): dtype_r, CategoricalDtype ): # categories are not equal - libcudf_join_type = np.dtype("O") + libcudf_join_type = cudf.StringDtype() elif how == "left": check_col = rcol.fillna(0) if not check_col.can_cast_safely(dtype_l): @@ -393,20 +393,20 @@ def input_to_libcudf_casting_rules(self, lcol, rcol, how): raise ValueError(ctgry_err.format(lcol, "left")) libcudf_join_type = rcol.cat().categories.dtype elif how in {"inner", "outer"}: - if (np.issubdtype(dtype_l, np.number)) and ( - np.issubdtype(dtype_r, np.number) + if (isinstance(dtype_l, cudf.Number)) and ( + isinstance(dtype_r, cudf.Number) ): if dtype_l.kind == dtype_r.kind: # both ints or both floats - libcudf_join_type = max(dtype_l, dtype_r) + libcudf_join_type = cudf.dtype(max(dtype_l.to_numpy, dtype_r.to_numpy)) else: - libcudf_join_type = np.find_common_type( - [], [dtype_l, dtype_r] - ) - elif np.issubdtype(dtype_l, np.datetime64) and np.issubdtype( - dtype_r, np.datetime64 + libcudf_join_type = cudf.dtype(np.find_common_type( + [], [dtype_l.to_numpy, dtype_r.to_numpy] + )) + elif isinstance(dtype_l, cudf.Datetime) and isinstance( + dtype_r, cudf.Datetime ): - libcudf_join_type = max(dtype_l, dtype_r) + libcudf_join_type = cudf.dtype(max(dtype_l, dtype_r)) return libcudf_join_type def libcudf_to_output_casting_rules(self, lcol, rcol, how): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index f31e43b179b..83720446ab0 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1588,9 +1588,9 @@ def __invert__(self): Returns a new Series. """ - if np.issubdtype(self.dtype, np.integer): + if isinstance(self.dtype, cudf.Integer): return self._unaryop("invert") - elif np.issubdtype(self.dtype, np.bool_): + elif isinstance(self.dtype, cudf.BooleanDtype): return self._unaryop("not") else: raise TypeError( diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 3fb6120f53d..7ac4df4e514 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -55,10 +55,11 @@ def test_column_offset_and_size(pandas_input, offset, size): assert col.size == col.codes.size assert col.size == (col.codes.data.size / col.codes.dtype.itemsize) elif isinstance(col.dtype, cudf.StringDtype): - assert col.size == (col.children[0].size - 1) - assert col.size == ( - (col.children[0].data.size / col.children[0].dtype.itemsize) - 1 - ) + if col.size > 0: + assert col.size == (col.children[0].size - 1) + assert col.size == ( + (col.children[0].data.size / col.children[0].dtype.itemsize) - 1 + ) else: assert col.size == (col.data.size / col.dtype.itemsize) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 11596163c32..9d9bece3e10 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -70,6 +70,8 @@ def np_to_pa_dtype(dtype): """Util to convert numpy dtype to PyArrow dtype. """ + if isinstance(dtype, cudf.Generic): + return dtype.pa_type # special case when dtype is np.datetime64 if dtype.kind == "M": time_unit, _ = np.datetime_data(dtype) @@ -102,7 +104,7 @@ def get_numeric_type_info(dtype): def numeric_normalize_types(*args): """Cast all args to a common type using numpy promotion logic """ - dtype = np.result_type(*[a.dtype for a in args]) + dtype = np.result_type(*[a.dtype.to_numpy for a in args]) return [a.astype(dtype) for a in args] @@ -255,6 +257,8 @@ def to_cudf_compatible_scalar(val, dtype=None): val = pd.api.types.pandas_dtype(type(val)).type(val) if dtype is not None: + if isinstance(dtype, cudf.Generic): + dtype = dtype.to_numpy val = val.astype(dtype) if val.dtype.type is np.datetime64: @@ -381,25 +385,27 @@ def min_column_type(x, expected_type): If the column is not a subtype of `np.signedinteger` or `np.floating` returns the same dtype as the dtype of `x` without modification """ - if not isinstance(x, cudf.core.column.NumericalColumn): raise TypeError("Argument x must be of type column.NumericalColumn") if x.valid_count == 0: return x.dtype + x_np_dtype = x.dtype.to_numpy + expected_type = cudf.dtype(expected_type).to_numpy - if np.issubdtype(x.dtype, np.floating): + if np.issubdtype(x_np_dtype, np.floating): max_bound_dtype = np.min_scalar_type(x.max()) min_bound_dtype = np.min_scalar_type(x.min()) result_type = np.promote_types(max_bound_dtype, min_bound_dtype) if result_type == np.dtype("float16"): # cuDF does not support float16 dtype result_type = np.dtype("float32") - return result_type + return cudf.dtype(result_type) if np.issubdtype(expected_type, np.integer): max_bound_dtype = np.min_scalar_type(x.max()) min_bound_dtype = np.min_scalar_type(x.min()) - return np.promote_types(max_bound_dtype, min_bound_dtype) + result = np.promote_types(max_bound_dtype, min_bound_dtype) + return cudf.dtype(result) return x.dtype From 59b3673aed72465c135e43ca405150ae51d1b520 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 26 Aug 2020 05:37:11 -0700 Subject: [PATCH 24/80] all column tests pass --- python/cudf/cudf/core/column/timedelta.py | 12 +++++------- python/cudf/cudf/core/dtypes.py | 10 +++++++++- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 644c06bcd80..2ab0fadae82 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -16,10 +16,10 @@ from cudf.utils.utils import buffers_from_pyarrow _dtype_to_format_conversion = { - "timedelta64[ns]": "%D days %H:%M:%S", - "timedelta64[us]": "%D days %H:%M:%S", - "timedelta64[ms]": "%D days %H:%M:%S", - "timedelta64[s]": "%D days %H:%M:%S", + "Timedelta64NS": "%D days %H:%M:%S", + "Timedelta64US": "%D days %H:%M:%S", + "Timedelta64MS": "%D days %H:%M:%S", + "Timedelta64S": "%D days %H:%M:%S", } @@ -232,8 +232,6 @@ def binary_operator(self, op, rhs, reflect=False): if reflect: lhs, rhs = rhs, lhs - import pdb - pdb.set_trace() return binop(lhs, rhs, op=op, out_dtype=out_dtype) def normalize_binop_value(self, other): @@ -342,7 +340,7 @@ def as_string_column(self, dtype, **kwargs): kwargs["format"] = fmt if len(self) > 0: return string._numeric_to_str_typecast_functions[ - np.dtype(self.dtype) + self.dtype ](self, **kwargs) else: return column.column_empty(0, dtype="object", masked=False) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 4fb67ce1424..0931b3af5c4 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -298,7 +298,15 @@ def make_dtype_from_string(obj): return BooleanDtype() elif "category" in obj: return "category" - + elif "timedelta" in obj: + if obj == 'timedelta64[ns]': + return Timedelta64NSDtype() + if obj == 'timedelta64[us]': + return Timedelta64USDtype() + if obj == 'timedelta64[ms]': + return Timedelta64MSDtype() + if obj == 'timedelta64[s]': + return Timedelta64SDtype() def make_dtype_from_numpy(obj): np_to_pd_types = {v: k for k, v in pd_to_np_dtypes.items()} From 297a31a484ef0f59c78baca2eaacb07b7423fa67 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 26 Aug 2020 06:27:47 -0700 Subject: [PATCH 25/80] move more stuff to cudf.api.types --- python/cudf/cudf/_lib/column.pyx | 2 +- python/cudf/cudf/_lib/parquet.pyx | 3 +- python/cudf/cudf/_lib/transpose.pyx | 2 +- python/cudf/cudf/core/column/categorical.py | 2 +- python/cudf/cudf/core/column/column.py | 6 +- python/cudf/cudf/core/column/lists.py | 2 +- python/cudf/cudf/core/column/string.py | 7 +-- python/cudf/cudf/core/dataframe.py | 4 +- python/cudf/cudf/core/frame.py | 9 ++- python/cudf/cudf/core/index.py | 2 +- python/cudf/cudf/core/indexing.py | 2 +- python/cudf/cudf/core/reshape.py | 4 +- python/cudf/cudf/core/series.py | 3 +- python/cudf/cudf/testing/testing.py | 2 +- python/cudf/cudf/utils/dtypes.py | 64 +-------------------- 15 files changed, 23 insertions(+), 91 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 5021778be44..ee9978316fa 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -7,7 +7,7 @@ import rmm import cudf from cudf.core.buffer import Buffer -from cudf.utils.dtypes import is_categorical_dtype, is_list_dtype +from cudf.api.types import is_categorical_dtype, is_list_dtype import cudf._lib as libcudfxx from cpython.buffer cimport PyObject_CheckBuffer diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 26e2e02402c..fd7e2cd847c 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -11,7 +11,8 @@ import json from cython.operator import dereference import numpy as np -from cudf.utils.dtypes import np_to_pa_dtype, is_categorical_dtype +from cudf.utils.dtypes import np_to_pa_dtype +from cudf.api.types import is_categorical_dtype from libc.stdlib cimport free from libc.stdint cimport uint8_t from libcpp.memory cimport shared_ptr, unique_ptr, make_unique diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx index 1c31e3f5d3f..ad4edebf1cf 100644 --- a/python/cudf/cudf/_lib/transpose.pyx +++ b/python/cudf/cudf/_lib/transpose.pyx @@ -1,7 +1,7 @@ # Copyright (c) 2020, NVIDIA CORPORATION. import cudf -from cudf.utils.dtypes import is_categorical_dtype +from cudf.api.types import is_categorical_dtype from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 5185660c13c..2f1e677b898 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -14,11 +14,11 @@ from cudf.core.column.methods import ColumnMethodsMixin from cudf.core.dtypes import CategoricalDtype from cudf.utils.dtypes import ( - is_categorical_dtype, is_mixed_with_object_dtype, min_signed_type, min_unsigned_type, ) +from cudf.api.types import is_categorical_dtype class CategoricalAccessor(ColumnMethodsMixin): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 33c0b98d203..02269c71b85 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -30,15 +30,13 @@ NUMERIC_TYPES, check_cast_unsupported_dtype, get_time_unit, - is_categorical_dtype, - is_list_dtype, - is_numerical_dtype, is_scalar, - is_string_dtype, min_unsigned_type, np_to_pa_dtype, ) from cudf.utils.utils import mask_dtype +from cudf.api.types import is_categorical_dtype, is_list_dtype, is_numerical_dtype, is_string_dtype + class ColumnBase(Column, Serializable): def __init__( diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index f33a5923a74..13552e20647 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -5,7 +5,7 @@ from cudf.core.column import ColumnBase from cudf.core.column.methods import ColumnMethodsMixin from cudf.core.dtypes import ListDtype -from cudf.utils.dtypes import is_list_dtype +from cudf.api.types import is_list_dtype from cudf.utils.utils import buffers_from_pyarrow diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index be96e18c148..dae0b8ef6c1 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -140,13 +140,10 @@ from cudf.utils import utils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( - can_convert_to_column, - is_list_dtype, - is_scalar, - is_string_dtype, + can_convert_to_column, is_scalar ) from cudf.utils.utils import buffers_from_pyarrow - +from cudf.api.types import is_list_dtype, is_string_dtype from cudf.core.dtypes import dtype _str_to_numeric_typecast_functions = { diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1023db69104..76f77d52f2f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -40,13 +40,11 @@ from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( cudf_dtype_from_pydata_dtype, - is_categorical_dtype, - is_list_dtype, is_list_like, is_scalar, - is_string_dtype, numeric_normalize_types, ) +from cudf.api.types import is_categorical_dtype, is_list_dtype, is_string_dtype from cudf.utils.utils import OrderedColumnDict diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 8d826977ba7..7251f364dad 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -15,12 +15,11 @@ from cudf.core.column import as_column, build_categorical_column from cudf.utils import utils from cudf.utils.dtypes import ( - is_categorical_dtype, is_column_like, - is_numerical_dtype, is_scalar, min_scalar_type, ) +from cudf.api.types import is_numerical_dtype, is_categorical_dtype class Frame(libcudf.table.Table): @@ -270,9 +269,9 @@ def find_common_dtypes_and_categories(non_null_columns, dtypes): dtypes[idx] = cols[0].dtype # If all the non-null dtypes are int/float, find a common dtype if all(is_numerical_dtype(col.dtype) for col in cols): - dtypes[idx] = np.find_common_type( - [col.dtype for col in cols], [] - ) + dtypes[idx] = cudf.dtype(np.find_common_type( + [col.dtype.to_numpy for col in cols], [] + )) # If all categorical dtypes, combine the categories elif all( isinstance(col, cudf.core.column.CategoricalColumn) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 2663ad8e22d..e5a89a23077 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -26,12 +26,12 @@ from cudf.utils import ioutils, utils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( - is_categorical_dtype, is_list_like, is_mixed_with_object_dtype, is_scalar, numeric_normalize_types, ) +from cudf.api.types import is_categorical_dtype from cudf.utils.utils import cached_property from cudf.core.dtypes import dtype diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index ce3c6806d54..5f6d4a69bd5 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -6,12 +6,12 @@ import cudf from cudf._lib.nvtx import annotate from cudf.utils.dtypes import ( - is_categorical_dtype, is_column_like, is_list_like, is_scalar, to_cudf_compatible_scalar, ) +from cudf.api.types import is_categorical_dtype def indices_from_labels(obj, labels): diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index c2603a8d177..b423a46b88b 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -10,8 +10,8 @@ build_categorical_column, ) from cudf.utils import cudautils -from cudf.utils.dtypes import is_categorical_dtype, is_list_like - +from cudf.utils.dtypes import is_list_like +from cudf.api.types import is_categorical_dtype _axis_map = {0: 0, 1: 1, "index": 0, "columns": 1} diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 83720446ab0..4a2423ecd99 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -42,14 +42,13 @@ from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( can_convert_to_column, - is_list_dtype, is_list_like, is_mixed_with_object_dtype, is_scalar, - is_string_dtype, min_scalar_type, numeric_normalize_types, ) +from cudf.api.types import is_list_dtype, is_string_dtype class Series(Frame, Serializable): diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index eba8d4c7f62..1dcf43c3367 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -5,7 +5,7 @@ import pandas as pd import cudf -from cudf.utils.dtypes import is_categorical_dtype +from cudf.api.types import is_categorical_dtype def _check_isinstance(left, right, obj): diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 9d9bece3e10..29e767fe179 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -12,6 +12,8 @@ import cudf from cudf._lib.scalar import Scalar +from cudf.api.types import is_categorical_dtype + _NA_REP = "" _np_pa_dtypes = { @@ -131,68 +133,6 @@ def is_datetime_dtype(obj): return False return "M8" in obj.str - -def is_categorical_dtype(obj): - """Infer whether a given pandas, numpy, or cuDF Column, Series, or dtype - is a pandas CategoricalDtype. - """ - if obj is None: - return False - if isinstance(obj, cudf.CategoricalDtype): - return True - if obj is cudf.CategoricalDtype: - return True - if isinstance(obj, np.dtype): - return False - if isinstance(obj, CategoricalDtype): - return True - if obj is CategoricalDtype: - return True - if obj is CategoricalDtypeType: - return True - if isinstance(obj, str) and obj == "category": - return True - if isinstance( - obj, - ( - CategoricalDtype, - cudf.core.index.CategoricalIndex, - cudf.core.column.CategoricalColumn, - pd.Categorical, - pd.CategoricalIndex, - ), - ): - return True - if isinstance(obj, np.ndarray): - return False - if isinstance( - obj, - ( - cudf.Index, - cudf.Series, - cudf.core.column.ColumnBase, - pd.Index, - pd.Series, - ), - ): - return is_categorical_dtype(obj.dtype) - if hasattr(obj, "type"): - if obj.type is CategoricalDtypeType: - return True - return pd.api.types.is_categorical_dtype(obj) - - -def is_list_dtype(obj): - return ( - type(obj) is cudf.core.dtypes.ListDtype - or obj is cudf.core.dtypes.ListDtype - or type(obj) is cudf.core.column.ListColumn - or obj is cudf.core.column.ListColumn - or (isinstance(obj, str) and obj == cudf.core.dtypes.ListDtype.name) - or (hasattr(obj, "dtype") and is_list_dtype(obj.dtype)) - ) - - def cudf_dtype_from_pydata_dtype(dtype): """ Given a numpy or pandas dtype, converts it into the equivalent cuDF Python dtype. From e5def6e162b00966e3983fdbca3381cfc5858cb6 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 26 Aug 2020 06:34:56 -0700 Subject: [PATCH 26/80] forgot entire api/ folder --- python/cudf/cudf/api/__init__.py | 0 python/cudf/cudf/api/types.py | 85 ++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) create mode 100644 python/cudf/cudf/api/__init__.py create mode 100644 python/cudf/cudf/api/types.py diff --git a/python/cudf/cudf/api/__init__.py b/python/cudf/cudf/api/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py new file mode 100644 index 00000000000..143930332b1 --- /dev/null +++ b/python/cudf/cudf/api/types.py @@ -0,0 +1,85 @@ +import pandas as pd +import cudf +import numpy as np +from pandas.core.dtypes.dtypes import CategoricalDtype, CategoricalDtypeType + +def is_datetime64_dtype(obj): + return isinstance(obj, cudf.Datetime) or pd.api.types.is_datetime64_dtype(obj) + +def is_timedelta64_dtype(obj): + return isinstance(obj, cudf.Timedelta) or pd.api.types.is_timedelta64_dtype(obj) + +def is_string_dtype(obj): + return isinstance(obj, cudf.StringDtype) or (pd.api.types.is_string_dtype(obj) and not is_categorical_dtype(obj)) + +def is_numerical_dtype(obj): + if isinstance(obj, cudf.Generic): + return isinstance(obj, cudf.Number) + if is_categorical_dtype(obj): + return False + if is_list_dtype(obj): + return False + return ( + np.issubdtype(obj, np.bool_) + or np.issubdtype(obj, np.floating) + or np.issubdtype(obj, np.signedinteger) + ) + +def is_categorical_dtype(obj): + """Infer whether a given pandas, numpy, or cuDF Column, Series, or dtype + is a pandas CategoricalDtype. + """ + if obj is None: + return False + if isinstance(obj, cudf.CategoricalDtype): + return True + if obj is cudf.CategoricalDtype: + return True + if isinstance(obj, np.dtype): + return False + if isinstance(obj, CategoricalDtype): + return True + if obj is CategoricalDtype: + return True + if obj is CategoricalDtypeType: + return True + if isinstance(obj, str) and obj == "category": + return True + if isinstance( + obj, + ( + CategoricalDtype, + cudf.core.index.CategoricalIndex, + cudf.core.column.CategoricalColumn, + pd.Categorical, + pd.CategoricalIndex, + ), + ): + return True + if isinstance(obj, np.ndarray): + return False + if isinstance( + obj, + ( + cudf.Index, + cudf.Series, + cudf.core.column.ColumnBase, + pd.Index, + pd.Series, + ), + ): + return is_categorical_dtype(obj.dtype) + if hasattr(obj, "type"): + if obj.type is CategoricalDtypeType: + return True + return pd.api.types.is_categorical_dtype(obj) + +def is_list_dtype(obj): + return ( + type(obj) is cudf.core.dtypes.ListDtype + or obj is cudf.core.dtypes.ListDtype + or type(obj) is cudf.core.column.ListColumn + or obj is cudf.core.column.ListColumn + or (isinstance(obj, str) and obj == cudf.core.dtypes.ListDtype.name) + or (hasattr(obj, "dtype") and is_list_dtype(obj.dtype)) + ) From b4d344f7d405390fb2ba4067ef14d3387645be90 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 26 Aug 2020 06:52:23 -0700 Subject: [PATCH 27/80] fix mutable_column_view --- python/cudf/cudf/_lib/column.pyx | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index ee9978316fa..9953c8924e6 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -307,14 +307,8 @@ cdef class Column: col = self.base_children[0] else: col = self - data_dtype = col.dtype - - cdef libcudf_types.type_id tid = ( - ( - np_to_cudf_types[np.dtype(data_dtype)] - ) - ) - cdef libcudf_types.data_type dtype = libcudf_types.data_type(tid) + cdef _Dtype pydtype = col.dtype + cdef libcudf_types.data_type dtype = pydtype.get_libcudf_type() cdef libcudf_types.size_type offset = self.offset cdef vector[mutable_column_view] children cdef void* data From 22fd5d94941f03bd7a8ea1f667ad2752684a1cab Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 26 Aug 2020 14:24:47 -0700 Subject: [PATCH 28/80] working through dataframe.py tests --- python/cudf/cudf/_lib/reduce.pyx | 2 +- python/cudf/cudf/api/types.py | 4 +++- python/cudf/cudf/core/column/column.py | 5 ++--- python/cudf/cudf/core/column/datetime.py | 2 +- python/cudf/cudf/core/column/numerical.py | 16 ++++++++++------ python/cudf/cudf/core/dataframe.py | 22 +++++++++++----------- python/cudf/cudf/core/frame.py | 4 +++- python/cudf/cudf/core/index.py | 5 ++--- python/cudf/cudf/core/series.py | 8 ++------ 9 files changed, 35 insertions(+), 33 deletions(-) diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx index 39aad31f570..69592894cae 100644 --- a/python/cudf/cudf/_lib/reduce.pyx +++ b/python/cudf/cudf/_lib/reduce.pyx @@ -34,7 +34,7 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs): col_dtype = incol.dtype if reduction_op in ['sum', 'sum_of_squares', 'product']: col_dtype = np.find_common_type([col_dtype.to_numpy], [np.uint64]) - col_dtype = cudf_dtype(col_dtype) + col_dtype = cudf_dtype(col_dtype) if dtype is None else cudf_dtype(dtype) cdef column_view c_incol_view = incol.view() cdef unique_ptr[scalar] c_result diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 143930332b1..ea6e503782c 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -14,7 +14,7 @@ def is_string_dtype(obj): def is_numerical_dtype(obj): if isinstance(obj, cudf.Generic): - return isinstance(obj, cudf.Number) + return isinstance(obj, (cudf.Number, cudf.BooleanDtype)) if is_categorical_dtype(obj): return False if is_list_dtype(obj): @@ -29,6 +29,8 @@ def is_categorical_dtype(obj): """Infer whether a given pandas, numpy, or cuDF Column, Series, or dtype is a pandas CategoricalDtype. """ + if isinstance(obj, cudf.Generic) and not isinstance(obj, cudf.CategoricalDtype): + return False if obj is None: return False if isinstance(obj, cudf.CategoricalDtype): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 02269c71b85..2786ca45124 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -200,7 +200,7 @@ def _concat(cls, objs, dtype=None): [ o for o in not_null_cols - if not isinstance(o.dtype, (cudf.Number, cudf.Datetime)) + if not isinstance(o.dtype, (cudf.Number)) or isinstance(o.dtype, cudf.Datetime) ] ) == 0 @@ -1421,8 +1421,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): if pa.types.is_dictionary(pa_type): new_dtype = "category" else: - new_dtype = np.dtype(pa_type.to_pandas_dtype()) - + new_dtype = cudf.dtype(pa_type) data = ColumnBase._concat(gpu_cols, dtype=new_dtype) elif isinstance(arbitrary, (pd.Series, pd.Categorical)): diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 807f0803e7f..c205b841af5 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -151,7 +151,7 @@ def as_numerical(self): ) def as_datetime_column(self, dtype, **kwargs): - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype == self.dtype: return self return libcudf.unary.cast(self, dtype=dtype) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index b0e4c563fd8..92dab1a15de 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -17,7 +17,7 @@ numeric_normalize_types, ) from cudf.utils.utils import buffers_from_pyarrow - +from cudf.core.dtypes import Float64Dtype class NumericalColumn(column.ColumnBase): def __init__( self, data, dtype, mask=None, size=None, offset=0, null_count=None @@ -200,18 +200,22 @@ def to_arrow(self): return out def sum(self, dtype=None): - return libcudf.reduce.reduce("sum", self, dtype=dtype) + try: + return libcudf.reduce.reduce("sum", self, dtype=dtype) + except: + import pdb + pdb.set_trace() def product(self, dtype=None): return libcudf.reduce.reduce("product", self, dtype=dtype) - def mean(self, dtype=np.float64): - return libcudf.reduce.reduce("mean", self, dtype=dtype) + def mean(self, dtype=Float64Dtype()): + return libcudf.reduce.reduce("mean", self, dtype=dtype) - def var(self, ddof=1, dtype=np.float64): + def var(self, ddof=1, dtype=Float64Dtype()): return libcudf.reduce.reduce("var", self, dtype=dtype, ddof=ddof) - def std(self, ddof=1, dtype=np.float64): + def std(self, ddof=1, dtype=Float64Dtype()): return libcudf.reduce.reduce("std", self, dtype=dtype, ddof=ddof) def sum_of_squares(self, dtype=None): diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 76f77d52f2f..2717d64c0d4 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3179,11 +3179,11 @@ def as_gpu_matrix(self, columns=None, order="F"): return cuda.as_cuda_array(matrix) if any( - (is_categorical_dtype(c) or np.issubdtype(c, np.dtype("object"))) + (is_categorical_dtype(c) or isinstance(c.dtype, cudf.StringDtype)) for c in cols ): raise TypeError("non-numeric data not yet supported") - dtype = np.find_common_type(cols, []) + dtype = np.find_common_type([c.dtype.to_numpy for c in cols], []) for k, c in self._data.items(): if c.has_nulls: errmsg = ( @@ -6003,7 +6003,7 @@ def kurtosis( msg = "Kurtosis only supports int, float, and bool dtypes." raise NotImplementedError(msg) - self = self.select_dtypes(include=[np.number, np.bool]) + self = self.select_dtypes(include=[cudf.Number(), cudf.BooleanDtype()]) return self._apply_support_method( "kurtosis", axis=axis, @@ -6313,7 +6313,7 @@ def select_dtypes(self, include=None, exclude=None): ) include, exclude = map( - lambda x: frozenset(map(cudf_dtype_from_pydata_dtype, x)), + lambda x: frozenset(map(cudf.dtype, x)), selection, ) @@ -6332,9 +6332,9 @@ def select_dtypes(self, include=None, exclude=None): # category handling if is_categorical_dtype(i_dtype): include_subtypes.add(i_dtype) - elif issubclass(dtype.type, i_dtype): - include_subtypes.add(dtype.type) - + elif issubclass(dtype, i_dtype): + include_subtypes.add(dtype) + # exclude all subtypes exclude_subtypes = set() for dtype in self.dtypes: @@ -6342,11 +6342,11 @@ def select_dtypes(self, include=None, exclude=None): # category handling if is_categorical_dtype(e_dtype): exclude_subtypes.add(e_dtype) - elif issubclass(dtype.type, e_dtype): - exclude_subtypes.add(dtype.type) + elif issubclass(dtype, e_dtype): + exclude_subtypes.add(dtype) include_all = set( - [cudf_dtype_from_pydata_dtype(d) for d in self.dtypes] + [cudf.dtype(d) for d in self.dtypes] ) if include: @@ -6359,7 +6359,7 @@ def select_dtypes(self, include=None, exclude=None): inclusion = inclusion - exclude_subtypes for k, col in self._data.items(): - infered_type = cudf_dtype_from_pydata_dtype(col.dtype) + infered_type = cudf.dtype(col.dtype) if infered_type in inclusion: df.insert(len(df._data), k, col) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 7251f364dad..f9f36ed1798 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -289,8 +289,10 @@ def find_common_dtypes_and_categories(non_null_columns, dtypes): dtypes[idx] = min_scalar_type(len(categories[idx])) # Otherwise raise an error if columns have different dtypes elif not all( - is_dtype_equal(c.dtype, dtypes[idx]) for c in cols + c.dtype == dtypes[idx] for c in cols ): + import pdb + pdb.set_trace() raise ValueError("All columns must be the same type") return categories diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index e5a89a23077..217079fbf03 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1512,7 +1512,7 @@ def _num_rows(self): @cached_property def _values(self): if len(self) > 0: - vals = cupy.arange(self._start, self._stop, dtype=self.dtype) + vals = cupy.arange(self._start, self._stop, dtype=self.dtype.to_numpy) return column.as_column(vals) else: return column.column_empty(0, masked=False, dtype=self.dtype) @@ -1625,7 +1625,7 @@ def dtype(self): """ `dtype` of the range of values in RangeIndex. """ - return np.dtype(np.int64) + return cudf.Int64Dtype() @property def is_contiguous(self): @@ -2524,7 +2524,6 @@ def as_index(arbitrary, **kwargs): - DatetimeIndex for Datetime input. - GenericIndex for all other inputs. """ - kwargs = _setdefault_name(arbitrary, **kwargs) if isinstance(arbitrary, cudf.MultiIndex): return arbitrary diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 4a2423ecd99..513a8336f29 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3099,9 +3099,7 @@ def cumsum(self, axis=0, skipna=True, *args, **kwargs): result_col[first_index:] = None # pandas always returns int64 dtype if original dtype is int or `bool` - if np.issubdtype(result_col.dtype, np.integer) or np.issubdtype( - result_col.dtype, np.bool_ - ): + if isinstance(result_col.dtype, (cudf.Integer, cudf.BooleanDtype)): return Series( result_col.astype(np.int64)._apply_scan_op("sum"), name=self.name, @@ -3161,9 +3159,7 @@ def cumprod(self, axis=0, skipna=True, *args, **kwargs): result_col[first_index:] = None # pandas always returns int64 dtype if original dtype is int or `bool` - if np.issubdtype(result_col.dtype, np.integer) or np.issubdtype( - result_col.dtype, np.bool_ - ): + if isinstance(result_col.dtype, (cudf.Integer, cudf.BooleanDtype)): return Series( result_col.astype(np.int64)._apply_scan_op("product"), name=self.name, From c5a0b62f02072cf65930d98f839d5ce8d40aa6a3 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 27 Aug 2020 11:12:08 -0700 Subject: [PATCH 29/80] pass join tests --- python/cudf/cudf/_lib/string_casting.pyx | 10 +++------- python/cudf/cudf/core/column/column.py | 3 ++- python/cudf/cudf/core/column/string.py | 6 +++--- python/cudf/cudf/core/dtypes.py | 6 ++++-- python/cudf/cudf/core/join/join.py | 2 +- python/cudf/cudf/tests/test_dataframe.py | 14 ++++++-------- python/cudf/cudf/tests/test_joining.py | 7 +++++-- 7 files changed, 24 insertions(+), 24 deletions(-) diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 4dbb2d99db3..aee554017af 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -7,7 +7,7 @@ from cudf._lib.move cimport move from cudf._lib.scalar import as_scalar from cudf._lib.scalar cimport Scalar from cudf._lib.types import np_to_cudf_types -from cudf._lib.types cimport underlying_type_t_type_id +from cudf._lib.types cimport underlying_type_t_type_id, _Dtype from cudf.core.column.column import as_column @@ -555,12 +555,8 @@ def timestamp2int( if input_col.size == 0: return as_column([], dtype=kwargs.get('dtype')) cdef column_view input_column_view = input_col.view() - cdef type_id tid = ( - ( - np_to_cudf_types[kwargs.get('dtype')] - ) - ) - cdef data_type out_type = data_type(tid) + cdef _Dtype pydtype = kwargs.get('dtype') + cdef data_type out_type = pydtype.get_libcudf_type() cdef string c_timestamp_format = kwargs.get('format').encode('UTF-8') cdef unique_ptr[column] c_result with nogil: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 2786ca45124..fc836f67d45 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -143,7 +143,7 @@ def values(self): Return a CuPy representation of the Column. """ if len(self) == 0: - return cupy.asarray([], dtype=self.dtype) + return cupy.asarray([], dtype=self.dtype.to_numpy) if self.has_nulls: raise ValueError("Column must have no nulls.") @@ -1632,6 +1632,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError): if is_categorical_dtype(dtype): sr = pd.Series(arbitrary, dtype="category") + dtype = cudf.CategoricalDtype.from_pandas(sr.dtype) data = as_column(sr, nan_as_null=nan_as_null, dtype=dtype) elif isinstance(cudf.dtype(dtype), cudf.StringDtype): sr = pd.Series(arbitrary, dtype="str") diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index dae0b8ef6c1..96c064c7328 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4532,7 +4532,7 @@ def _nbytes(self): def as_numerical_column(self, dtype, **kwargs): - out_dtype = np.dtype(dtype) + out_dtype = cudf.dtype(dtype) kwargs.update(dtype=out_dtype) if out_dtype.type is np.datetime64: @@ -4554,7 +4554,7 @@ def as_numerical_column(self, dtype, **kwargs): raise ValueError("Could not convert `None` value to datetime") boolean_match = self.binary_operator("eq", "NaT") - elif out_dtype.type is np.timedelta64: + elif out_dtype.type is cudf.Timedelta: if "format" not in kwargs: if len(self) > 0: kwargs.update(format="%D days %H:%M:%S") @@ -4577,7 +4577,7 @@ def as_numerical_column(self, dtype, **kwargs): self, **kwargs ) if ( - out_dtype.type in (np.datetime64, np.timedelta64) + isinstance(out_dtype, (cudf.Datetime, cudf.Timedelta)) ) and boolean_match.any(): result_col[boolean_match] = None return result_col diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 0931b3af5c4..6e5e37351e8 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -326,6 +326,8 @@ def dtype(obj): elif issubclass(obj.__class__, Generic): return obj() if isinstance(obj, np.dtype): + if obj.type is np.str_: + return StringDtype() return np_to_cudf_dtypes[obj] elif isinstance(obj, pa.lib.DataType): return pa_to_cudf_dtypes[obj] @@ -448,7 +450,7 @@ def deserialize(cls, header, frames): return cls(categories=categories, ordered=ordered) -class ListDtype(ExtensionDtype): +class ListDtype(Generic): name = "list" @@ -483,7 +485,7 @@ def type(self): @classmethod def from_arrow(cls, typ): - obj = object.__new__(cls) + obj = ListDtype.__new__(ListDtype) obj._typ = typ return obj diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 459d8e215c4..ebc52490417 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -406,7 +406,7 @@ def input_to_libcudf_casting_rules(self, lcol, rcol, how): elif isinstance(dtype_l, cudf.Datetime) and isinstance( dtype_r, cudf.Datetime ): - libcudf_join_type = cudf.dtype(max(dtype_l, dtype_r)) + libcudf_join_type = cudf.dtype(max(dtype_l.to_numpy, dtype_r.to_numpy)) return libcudf_join_type def libcudf_to_output_casting_rules(self, lcol, rcol, how): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 57951879b7e..655bd6c28af 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -190,7 +190,7 @@ def test_series_init_none(): sr1 = Series() got = sr1.to_string() print(got) - expect = "Series([], dtype: float64)" + expect = "Series([], dtype: Float64)" # values should match despite whitespace difference assert got.split() == expect.split() @@ -198,7 +198,7 @@ def test_series_init_none(): sr2 = Series(None) got = sr2.to_string() print(got) - expect = "Series([], dtype: float64)" + expect = "Series([], dtype: Float64)" # values should match despite whitespace difference assert got.split() == expect.split() @@ -449,9 +449,9 @@ def test_dataframe_astype(nelem): df = DataFrame() data = np.asarray(range(nelem), dtype=np.int32) df["a"] = data - assert df["a"].dtype is np.dtype(np.int32) + assert df["a"].dtype == gd.Int32Dtype() df["b"] = df["a"].astype(np.float32) - assert df["b"].dtype is np.dtype(np.float32) + assert df["b"].dtype == gd.Float32Dtype() np.testing.assert_equal(df["a"].to_array(), df["b"].to_array()) @@ -460,9 +460,9 @@ def test_index_astype(nelem): df = DataFrame() data = np.asarray(range(nelem), dtype=np.int32) df["a"] = data - assert df.index.dtype is np.dtype(np.int64) + assert df.index.dtype == gd.Int64Dtype() df.index = df.index.astype(np.float32) - assert df.index.dtype is np.dtype(np.float32) + assert df.index.dtype == gd.Float32Dtype() df["a"] = df["a"].astype(np.float32) np.testing.assert_equal(df.index.to_array(), df["a"].to_array()) df["b"] = df["a"] @@ -1545,9 +1545,7 @@ def gdf(pdf): @pytest.mark.parametrize("skipna", [True, False, None]) def test_dataframe_reductions(data, func, skipna): pdf = pd.DataFrame(data=data) - print(func(pdf, skipna=skipna)) gdf = DataFrame.from_pandas(pdf) - print(func(gdf, skipna=skipna)) assert_eq(func(pdf, skipna=skipna), func(gdf, skipna=skipna)) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 5cf8fb325d7..536ab79ddb0 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -802,7 +802,7 @@ def test_join_empty_table_dtype(): gright = DataFrame.from_pandas(right) pd_merge = left.merge(right, how="left", left_on=["a"], right_on=["b"]) gd_merge = gleft.merge(gright, how="left", left_on=["a"], right_on=["b"]) - assert_eq(pd_merge["a"].dtype, gd_merge["a"].dtype) + assert gd_merge['a'].dtype == pd_merge['a'].dtype @pytest.mark.parametrize("how", ["outer", "inner", "left", "right"]) @@ -1108,11 +1108,14 @@ def test_typecast_on_join_overflow_unsafe(dtypes): lhs = cudf.DataFrame({"a": [1, 2, 3, 4, 5]}, dtype=dtype_l) rhs = cudf.DataFrame({"a": [1, 2, 3, 4, dtype_l_max + 1]}, dtype=dtype_r) + l_typ_warn = cudf.dtype(dtype_l).name + r_typ_warn = cudf.dtype(dtype_r).name + with pytest.warns( UserWarning, match=( f"can't safely cast column" - f" from right with type {dtype_r} to {dtype_l}" + f" from right with type {r_typ_warn} to {l_typ_warn}" ), ): merged = lhs.merge(rhs, on="a", how="left") # noqa: F841 From d47de0361299560e490c3c5a72f9507cacd9942f Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 27 Aug 2020 12:49:02 -0700 Subject: [PATCH 30/80] fix categorical tests --- python/cudf/cudf/core/column/column.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index fc836f67d45..599046a6b26 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1607,7 +1607,6 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): mask = bools_to_mask(as_column(mask).unary_operator("not")) data = data.set_mask(mask) - else: try: data = as_column( @@ -1631,9 +1630,10 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError): if is_categorical_dtype(dtype): - sr = pd.Series(arbitrary, dtype="category") - dtype = cudf.CategoricalDtype.from_pandas(sr.dtype) - data = as_column(sr, nan_as_null=nan_as_null, dtype=dtype) + if isinstance(dtype, pd.CategoricalDtype) or dtype is 'category': + data = as_column(pd.Series(arbitrary, dtype=dtype), nan_as_null=nan_as_null) + else: + data = as_column(arbitrary, nan_as_null=nan_as_null).astype(dtype) elif isinstance(cudf.dtype(dtype), cudf.StringDtype): sr = pd.Series(arbitrary, dtype="str") data = as_column(sr, nan_as_null=nan_as_null) From fe180a3bfbfab73819b245f7a2e2ec37d9b07554 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 27 Aug 2020 15:19:06 -0700 Subject: [PATCH 31/80] more bugfixes --- python/cudf/cudf/_lib/copying.pyx | 2 +- python/cudf/cudf/api/types.py | 4 ++++ python/cudf/cudf/core/column/column.py | 4 ++-- python/cudf/cudf/core/dataframe.py | 8 ++++---- python/cudf/cudf/core/frame.py | 2 -- python/cudf/cudf/core/indexing.py | 4 ++-- python/cudf/cudf/tests/test_numerical.py | 20 ++++++++++---------- 7 files changed, 23 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 21813c38253..357b019c0f3 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -203,7 +203,7 @@ def _scatter_scalar(scalars, Column scatter_map, cdef bool c_bounds_check = bounds_check cdef Scalar slr for val, col in zip(scalars, target_table._columns): - slr = as_scalar(val, col.dtype) + slr = as_scalar(val, col.dtype.to_numpy) source_scalars.push_back(move(slr.c_value)) cdef column_view scatter_map_view = scatter_map.view() cdef table_view target_table_view = target_table.data_view() diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index ea6e503782c..e00023b492d 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -3,6 +3,10 @@ import numpy as np from pandas.core.dtypes.dtypes import CategoricalDtype, CategoricalDtypeType +def is_bool_dtype(obj): + # todo - pd.api.types.is_bool_dtype should not give false, nor work at all probably + return isinstance(obj, cudf.BooleanDtype) or pd.api.types.is_bool_dtype(obj) + def is_datetime64_dtype(obj): return isinstance(obj, cudf.Datetime) or pd.api.types.is_datetime64_dtype(obj) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 599046a6b26..3ad9ebb1551 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -35,7 +35,7 @@ np_to_pa_dtype, ) from cudf.utils.utils import mask_dtype -from cudf.api.types import is_categorical_dtype, is_list_dtype, is_numerical_dtype, is_string_dtype +from cudf.api.types import is_categorical_dtype, is_list_dtype, is_numerical_dtype, is_string_dtype, is_bool_dtype class ColumnBase(Column, Serializable): @@ -552,7 +552,7 @@ def __setitem__(self, key, value): nelem = abs(key_stop - key_start) else: key = as_column(key) - if pd.api.types.is_bool_dtype(key.dtype): + if is_bool_dtype(key.dtype): if not len(key) == len(self): raise ValueError( "Boolean mask must be of same length as column" diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 2717d64c0d4..84088a54fe7 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -4921,8 +4921,8 @@ def to_records(self, index=True): ------- numpy recarray """ - members = [("index", self.index.dtype)] if index else [] - members += [(col, self[col].dtype) for col in self._data.names] + members = [("index", self.index.dtype.to_numpy)] if index else [] + members += [(col, self[col].dtype.to_numpy) for col in self._data.names] dtype = np.dtype(members) ret = np.recarray(len(self), dtype=dtype) if index: @@ -6049,7 +6049,7 @@ def skew( msg = "Skew only supports int, float, and bool dtypes." raise NotImplementedError(msg) - self = self.select_dtypes(include=[np.number, np.bool]) + self = self.select_dtypes(include=[cudf.Number(), cudf.BooleanDtype()]) return self._apply_support_method( "skew", axis=axis, @@ -6332,7 +6332,7 @@ def select_dtypes(self, include=None, exclude=None): # category handling if is_categorical_dtype(i_dtype): include_subtypes.add(i_dtype) - elif issubclass(dtype, i_dtype): + elif isinstance(dtype, i_dtype.__class__): include_subtypes.add(dtype) # exclude all subtypes diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index f9f36ed1798..640ec747f72 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -291,8 +291,6 @@ def find_common_dtypes_and_categories(non_null_columns, dtypes): elif not all( c.dtype == dtypes[idx] for c in cols ): - import pdb - pdb.set_trace() raise ValueError("All columns must be the same type") return categories diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index 5f6d4a69bd5..dcc9ddf9315 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -452,8 +452,8 @@ def _get_column_selection(self, arg): def _normalize_dtypes(df): if len(df.columns) > 0: - dtypes = df.dtypes.values.tolist() - normalized_dtype = np.result_type(*dtypes) + dtypes = [d.to_numpy for d in df.dtypes.values.tolist()] + normalized_dtype = cudf.dtype(np.result_type(*dtypes)) for name, col in df._data.items(): df[name] = col.astype(normalized_dtype) return df diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py index c6131fbcd14..48c6522a378 100644 --- a/python/cudf/cudf/tests/test_numerical.py +++ b/python/cudf/cudf/tests/test_numerical.py @@ -1,19 +1,19 @@ import numpy as np import pandas as pd import pytest - +import cudf from cudf import Series from cudf.tests.utils import assert_eq def test_can_cast_safely_same_kind(): data = Series([1, 2, 3], dtype="int32")._column - to_dtype = np.dtype("int64") + to_dtype = cudf.dtype("int64") assert data.can_cast_safely(to_dtype) data = Series([1, 2, 3], dtype="int64")._column - to_dtype = np.dtype("int32") + to_dtype = cudf.dtype("int32") assert data.can_cast_safely(to_dtype) @@ -21,12 +21,12 @@ def test_can_cast_safely_same_kind(): assert not data.can_cast_safely(to_dtype) data = Series([1, 2, 3], dtype="uint32")._column - to_dtype = np.dtype("uint64") + to_dtype = cudf.dtype("uint64") assert data.can_cast_safely(to_dtype) data = Series([1, 2, 3], dtype="uint64")._column - to_dtype = np.dtype("uint32") + to_dtype = cudf.dtype("uint32") assert data.can_cast_safely(to_dtype) @@ -36,7 +36,7 @@ def test_can_cast_safely_same_kind(): def test_can_cast_safely_mixed_kind(): data = Series([1, 2, 3], dtype="int32")._column - to_dtype = np.dtype("float32") + to_dtype = cudf.dtype("float32") assert data.can_cast_safely(to_dtype) # too big to fit into f32 exactly @@ -44,18 +44,18 @@ def test_can_cast_safely_mixed_kind(): assert not data.can_cast_safely(to_dtype) data = Series([1, 2, 3], dtype="uint32")._column - to_dtype = np.dtype("float32") + to_dtype = cudf.dtype("float32") assert data.can_cast_safely(to_dtype) # too big to fit into f32 exactly data = Series([1, 2, 2 ** 24 + 1], dtype="uint32")._column assert not data.can_cast_safely(to_dtype) - to_dtype = np.dtype("float64") + to_dtype = cudf.dtype("float64") assert data.can_cast_safely(to_dtype) data = Series([1.0, 2.0, 3.0], dtype="float32")._column - to_dtype = np.dtype("int32") + to_dtype = cudf.dtype("int32") assert data.can_cast_safely(to_dtype) # not integer float @@ -97,7 +97,7 @@ def test_to_pandas_nullable_bool(): def test_can_cast_safely_has_nulls(): data = Series([1, 2, 3, None], dtype="float32")._column - to_dtype = np.dtype("int64") + to_dtype = cudf.dtype("int64") assert data.can_cast_safely(to_dtype) From cad48d0497fd9e5ce43e9a6521153bd6a1e84cea Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 28 Aug 2020 16:01:19 -0700 Subject: [PATCH 32/80] more progress --- python/cudf/cudf/_lib/reduce.pyx | 3 ++- python/cudf/cudf/api/types.py | 6 ++++++ python/cudf/cudf/core/column/categorical.py | 5 +---- python/cudf/cudf/core/column/column.py | 10 ++++----- python/cudf/cudf/core/column/datetime.py | 8 +++---- python/cudf/cudf/core/column/numerical.py | 9 +++++++- python/cudf/cudf/core/dataframe.py | 15 +++++++------- python/cudf/cudf/core/dtypes.py | 23 ++++++++++++++++++++- python/cudf/cudf/core/frame.py | 4 +--- python/cudf/cudf/core/join/join.py | 4 +--- python/cudf/cudf/core/series.py | 4 ++-- python/cudf/cudf/tests/test_dataframe.py | 2 ++ python/cudf/cudf/tests/test_replace.py | 2 +- python/cudf/cudf/tests/test_repr.py | 10 +++++---- python/cudf/cudf/tests/test_string.py | 1 + python/cudf/cudf/utils/dtypes.py | 5 +++-- 16 files changed, 72 insertions(+), 39 deletions(-) diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx index 69592894cae..ac8065d2d6f 100644 --- a/python/cudf/cudf/_lib/reduce.pyx +++ b/python/cudf/cudf/_lib/reduce.pyx @@ -14,6 +14,7 @@ from cudf._lib.aggregation cimport make_aggregation, aggregation from libcpp.memory cimport unique_ptr import numpy as np from cudf.core.dtypes import dtype as cudf_dtype +from cudf.api.types import find_common_type def reduce(reduction_op, Column incol, dtype=None, **kwargs): @@ -33,7 +34,7 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs): col_dtype = incol.dtype if reduction_op in ['sum', 'sum_of_squares', 'product']: - col_dtype = np.find_common_type([col_dtype.to_numpy], [np.uint64]) + col_dtype = find_common_type([col_dtype], [np.uint64]) col_dtype = cudf_dtype(col_dtype) if dtype is None else cudf_dtype(dtype) cdef column_view c_incol_view = incol.view() diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index e00023b492d..13494a4bdc3 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -89,3 +89,9 @@ def is_list_dtype(obj): or (isinstance(obj, str) and obj == cudf.core.dtypes.ListDtype.name) or (hasattr(obj, "dtype") and is_list_dtype(obj.dtype)) ) + +def find_common_type(array_types=[], scalar_types=[]): + array_types = [d.to_numpy if isinstance(d, cudf.Generic) else d for d in array_types] + scalar_types = [d.to_numpy if isinstance(d, cudf.Generic) else d for d in scalar_types] + + return cudf.dtype(np.find_common_type(array_types, scalar_types)) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 2f1e677b898..ae5c5d46562 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -307,10 +307,7 @@ def add_categories(self, new_categories, **kwargs): f"type-cast new_categories to the same type as " f"existing categories." ) - common_dtype = np.find_common_type( - [old_categories.dtype.to_numpy, new_categories.dtype.to_numpy], [] - ) - common_dtype = cudf.dtype(common_dtype) + common_dtype = cudf.api.types.find_common_type([old_categories.dtype, new_categories.dtype], []) new_categories = new_categories.astype(common_dtype, copy=False) old_categories = old_categories.astype(common_dtype, copy=False) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 3ad9ebb1551..94f05935537 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -205,12 +205,12 @@ def _concat(cls, objs, dtype=None): ) == 0 ): - np_col_dtypes = [o.dtype.to_numpy for o in not_null_cols] + cudf_col_dtypes = [o.dtype for o in not_null_cols] # Use NumPy to find a common dtype - np_common_dtype = np.find_common_type(np_col_dtypes, []) + cudf_common_dtype = cudf.api.types.find_common_type(cudf_col_dtypes, []) # Cast all columns to the common dtype for i in range(len(objs)): - objs[i] = objs[i].astype(cudf.dtype(np_common_dtype)) + objs[i] = objs[i].astype(cudf_common_dtype) # Find the first non-null column: head = objs[0] @@ -1010,7 +1010,7 @@ def serialize(self): header = {} frames = [] header["type-serialized"] = pickle.dumps(type(self)) - header["dtype"] = self.dtype.str + header["dtype"] = str(self.dtype) data_header, data_frames = self.data.serialize() header["data"] = data_header @@ -1164,7 +1164,7 @@ def build_column( offset=offset, null_count=null_count, ) - elif dtype.type is np.timedelta64: + elif isinstance(dtype, cudf.Timedelta): return cudf.core.column.TimeDeltaColumn( data=data, dtype=dtype, diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index c205b841af5..ba652c3d3d2 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -27,10 +27,10 @@ } _dtype_to_format_conversion = { - "datetime64[ns]": "%Y-%m-%d %H:%M:%S.%9f", - "datetime64[us]": "%Y-%m-%d %H:%M:%S.%6f", - "datetime64[ms]": "%Y-%m-%d %H:%M:%S.%3f", - "datetime64[s]": "%Y-%m-%d %H:%M:%S", + "Datetime64NS": "%Y-%m-%d %H:%M:%S.%9f", + "Datetime64US": "%Y-%m-%d %H:%M:%S.%6f", + "Datetime64MS": "%Y-%m-%d %H:%M:%S.%3f", + "Datetime64S": "%Y-%m-%d %H:%M:%S", } diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 92dab1a15de..d467d0d0ddb 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -30,6 +30,13 @@ def __init__( The dtype associated with the data Buffer mask : Buffer, optional """ + try: + cudf.dtype(dtype) + dtype.itemsize + + except: + import pdb + pdb.set_trace() dtype = cudf.dtype(dtype) if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") @@ -512,7 +519,7 @@ def _normalize_find_and_replace_input(input_column_dtype, col_to_normalize): if ( col_to_normalize_dtype.kind == "f" and input_column_dtype.kind in {"i", "u"} - ) or (col_to_normalize_dtype.to_numpy.num > input_column_dtype.to_numpy.num): + ) or (col_to_normalize_dtype.num > input_column_dtype.num): raise TypeError( f"Potentially unsafe cast for non-equivalent " f"{col_to_normalize_dtype.name} " diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 84088a54fe7..7f998784d34 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3183,7 +3183,7 @@ def as_gpu_matrix(self, columns=None, order="F"): for c in cols ): raise TypeError("non-numeric data not yet supported") - dtype = np.find_common_type([c.dtype.to_numpy for c in cols], []) + dtype = cudf.api.types.find_common_type([c.dtype for c in cols], []) for k, c in self._data.items(): if c.has_nulls: errmsg = ( @@ -3191,7 +3191,7 @@ def as_gpu_matrix(self, columns=None, order="F"): "hint: use .fillna() to replace null values" ) raise ValueError(errmsg.format(k)) - cupy_dtype = dtype + cupy_dtype = dtype.to_numpy if np.issubdtype(cupy_dtype, np.datetime64): cupy_dtype = np.dtype("int64") @@ -6313,7 +6313,7 @@ def select_dtypes(self, include=None, exclude=None): ) include, exclude = map( - lambda x: frozenset(map(cudf.dtype, x)), + lambda x: frozenset(map(cudf_dtype_from_pydata_dtype, x)), selection, ) @@ -6332,8 +6332,8 @@ def select_dtypes(self, include=None, exclude=None): # category handling if is_categorical_dtype(i_dtype): include_subtypes.add(i_dtype) - elif isinstance(dtype, i_dtype.__class__): - include_subtypes.add(dtype) + elif isinstance(dtype, i_dtype): + include_subtypes.add(dtype.__class__) # exclude all subtypes exclude_subtypes = set() @@ -6346,9 +6346,8 @@ def select_dtypes(self, include=None, exclude=None): exclude_subtypes.add(dtype) include_all = set( - [cudf.dtype(d) for d in self.dtypes] + [cudf_dtype_from_pydata_dtype(d) for d in self.dtypes] ) - if include: inclusion = include_all & include_subtypes elif exclude: @@ -6359,7 +6358,7 @@ def select_dtypes(self, include=None, exclude=None): inclusion = inclusion - exclude_subtypes for k, col in self._data.items(): - infered_type = cudf.dtype(col.dtype) + infered_type = cudf.dtype(col.dtype).__class__ if infered_type in inclusion: df.insert(len(df._data), k, col) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 6e5e37351e8..682df0d7e9a 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -77,6 +77,13 @@ def __eq__(self, other): return True return False + def __str__(self): + return str(self.to_pandas) + + @property + def num(self): + return self.to_numpy.num + @property def to_numpy(self): return pa_to_np_dtypes[self.pa_type] @@ -287,7 +294,7 @@ def make_dtype_from_string(obj): return UInt32Dtype() elif obj in {"uint16", "UInt16"}: return UInt16Dtype() - elif obj in {"uint8", "Uint8"}: + elif obj in {"uint8", "UInt8"}: return UInt8Dtype() elif "float" in obj or "Float" in obj: if obj in {"float64", "Float64"}: @@ -315,11 +322,14 @@ def make_dtype_from_numpy(obj): def dtype(obj): + if obj is None: return None if isinstance(obj, pd.CategoricalDtype): return cudf.CategoricalDtype.from_pandas(obj) if isinstance(obj, CategoricalDtype): + if obj is 'category': + return cudf.CategoricalDtype() return obj elif isinstance(obj, Generic): return obj @@ -337,6 +347,14 @@ def dtype(obj): return pd_to_cudf_dtypes[obj] elif isinstance(obj, pd.core.arrays.numpy_.PandasDtype): return make_dtype_from_string(obj.name) + elif obj is np.number: + return cudf.Number + elif obj is np.datetime64: + return cudf.Datetime + elif obj is np.timedelta64: + return cudf.Timedelta + + else: try: if issubclass(obj, np.generic): @@ -360,6 +378,9 @@ def __init__(self, categories=None, ordered=None): def __repr__(self): return self.to_pandas().__repr__() + def __hash__(self): + return hash(self.__repr__()) + @property def categories(self): if self._categories is None: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 640ec747f72..4509969de03 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -269,9 +269,7 @@ def find_common_dtypes_and_categories(non_null_columns, dtypes): dtypes[idx] = cols[0].dtype # If all the non-null dtypes are int/float, find a common dtype if all(is_numerical_dtype(col.dtype) for col in cols): - dtypes[idx] = cudf.dtype(np.find_common_type( - [col.dtype.to_numpy for col in cols], [] - )) + dtypes[idx] = cudf.api.types.find_common_type([col.dtype for col in cols], []) # If all categorical dtypes, combine the categories elif all( isinstance(col, cudf.core.column.CategoricalColumn) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index ebc52490417..231a114aff7 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -400,9 +400,7 @@ def input_to_libcudf_casting_rules(self, lcol, rcol, how): # both ints or both floats libcudf_join_type = cudf.dtype(max(dtype_l.to_numpy, dtype_r.to_numpy)) else: - libcudf_join_type = cudf.dtype(np.find_common_type( - [], [dtype_l.to_numpy, dtype_r.to_numpy] - )) + libcudf_join_type = cudf.api.types.find_common_type([], [dtype_l, dtype_r]) elif isinstance(dtype_l, cudf.Datetime) and isinstance( dtype_r, cudf.Datetime ): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 513a8336f29..c6227ce4105 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1027,10 +1027,10 @@ def __repr__(self): else: lines = lines[:-1] lines[-1] = lines[-1] + "\n" - lines[-1] = lines[-1] + "dtype: %s" % self.dtype + lines[-1] = lines[-1] + "dtype: %s" % self.dtype.name else: lines = output.split(",") - return lines[0] + ", dtype: %s)" % self.dtype + return lines[0] + ", dtype: %s)" % self.dtype.name if isinstance(preprocess._column, cudf.core.column.CategoricalColumn): lines.append(category_memory) return "\n".join(lines) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 655bd6c28af..000827cc9c8 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2523,6 +2523,8 @@ def test_select_dtype(): pdf.select_dtypes(include=["float64"]), gdf.select_dtypes(include=["float64"]), ) + import pdb + pdb.set_trace() assert_eq( pdf.select_dtypes(include=["object", "int", "category"]), gdf.select_dtypes(include=["object", "int", "category"]), diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index dc8331965fe..823538a0200 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -455,7 +455,7 @@ def test_series_fillna_invalid_dtype(data_dtype): gdf.fillna(fill_value) raises.match( "Cannot safely cast non-equivalent {} to {}".format( - type(fill_value).__name__, gdf.dtype.type.__name__ + type(fill_value).__name__, gdf.dtype.name ) ) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 7ab863c3fff..a214588d367 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -46,10 +46,12 @@ def test_null_series(nrows, dtype): psrepr = psrepr.replace( str(sr._column.default_na_value()) + "\n", "\n" ) - if "UInt" in psrepr: - psrepr = psrepr.replace("UInt", "uint") - elif "Int" in psrepr: - psrepr = psrepr.replace("Int", "int") + if "uint" in psrepr: + psrepr = psrepr.replace("uint", "UInt") + elif "int" in psrepr: + psrepr = psrepr.replace("int", "Int") + elif 'float' in psrepr: + psrepr = psrepr.replace("float", "Float") assert psrepr.split() == sr.__repr__().split() diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 22e873e5f25..dd0c7c71431 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -156,6 +156,7 @@ def test_string_repr(ps_gs, item): if got_out is not None and len(got_out) > 1: expect = expect.replace("None", "") + expect = expect.replace('object', 'String') assert expect == got diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 29e767fe179..7efc0cd049f 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -137,13 +137,14 @@ def cudf_dtype_from_pydata_dtype(dtype): """ Given a numpy or pandas dtype, converts it into the equivalent cuDF Python dtype. """ - + if isinstance(dtype, cudf.Generic): + return dtype.__class__ if is_categorical_dtype(dtype): return cudf.core.dtypes.CategoricalDtype elif np.issubdtype(dtype, np.datetime64): dtype = np.datetime64 - return infer_dtype_from_object(dtype) + return cudf.dtype(infer_dtype_from_object(dtype)).__class__ def is_scalar(val): From 6a1785c1e8e7218863cb71831b32fe0e49af04ee Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Sun, 30 Aug 2020 09:00:48 -0700 Subject: [PATCH 33/80] all repr tests pass --- python/cudf/cudf/core/column/timedelta.py | 4 +- python/cudf/cudf/core/dtypes.py | 6 +- python/cudf/cudf/core/index.py | 5 +- python/cudf/cudf/core/series.py | 54 +++++++++++ python/cudf/cudf/tests/test_repr.py | 105 +++++++++++----------- 5 files changed, 119 insertions(+), 55 deletions(-) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 2ab0fadae82..b20d943d30c 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -291,7 +291,7 @@ def default_na_value(self): @property def time_unit(self): - return self._time_unit + return self.dtype._time_unit def fillna(self, fill_value): col = self @@ -346,7 +346,7 @@ def as_string_column(self, dtype, **kwargs): return column.column_empty(0, dtype="object", masked=False) def as_timedelta_column(self, dtype, **kwargs): - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype == self.dtype: return self return libcudf.unary.cast(self, dtype=dtype) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 682df0d7e9a..5ac2fae2c03 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -78,7 +78,7 @@ def __eq__(self, other): return False def __str__(self): - return str(self.to_pandas) + return self.name @property def num(self): @@ -243,21 +243,25 @@ class Timedelta64NSDtype(Timedelta): def __init__(self): self.pa_type = pa.duration('ns') self._name = "Timedelta64NS" + self._time_unit = 'ns' class Timedelta64USDtype(Timedelta): def __init__(self): self.pa_type = pa.duration('us') self._name = "Timedelta64US" + self._time_unit = 'us' class Timedelta64MSDtype(Timedelta): def __init__(self): self.pa_type = pa.duration('ms') self._name = "Timedelta64MS" + self._time_unit = 'ms' class Timedelta64SDtype(Timedelta): def __init__(self): self.pa_type = pa.duration('s') self._name = "Timedelta64S" + self._time_unit = 's' class StringDtype(Flexible): is_string = True diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 217079fbf03..9ee250e11e4 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2474,7 +2474,7 @@ def take(self, indices): def __repr__(self): return ( f"{self.__class__.__name__}({self._values.to_array()}," - f" dtype='object'" + f" dtype='String'" + ( f", name={pd.io.formats.printing.default_pprint(self.name)}" if self.name is not None @@ -2524,6 +2524,7 @@ def as_index(arbitrary, **kwargs): - DatetimeIndex for Datetime input. - GenericIndex for all other inputs. """ + kwargs = _setdefault_name(arbitrary, **kwargs) if isinstance(arbitrary, cudf.MultiIndex): return arbitrary @@ -2533,7 +2534,7 @@ def as_index(arbitrary, **kwargs): return idx elif isinstance(arbitrary, NumericalColumn): try: - return _dtype_to_index[arbitrary.dtype.type](arbitrary, **kwargs) + return _dtype_to_index[arbitrary.dtype](arbitrary, **kwargs) except KeyError: return GenericIndex(arbitrary, **kwargs) elif isinstance(arbitrary, StringColumn): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index c6227ce4105..aacabeed0b2 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1008,6 +1008,7 @@ def __repr__(self): else: output = preprocess.to_pandas().__repr__() + output = _fix_nullable_dtype_repr(output) lines = output.split("\n") if isinstance(preprocess._column, cudf.core.column.CategoricalColumn): @@ -5008,3 +5009,56 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): result_col[equal_nulls] = True return Series(result_col, index=index) + +def _fix_nullable_dtype_repr(string): + + to_replace = [ + 'uint8', + 'uint16', + 'uint32', + 'uint64', + 'int8', + 'int16', + 'int32', + 'int64', + 'float32', + 'float64', + 'bool', + 'object', + 'datetime64[ns]', + 'datetime64[us]', + 'datetime64[ms]', + 'datetime64[s]' + 'timedelta64[ns]', + 'timedelta64[us]', + 'timedelta64[ms]', + 'timedelta64[s]' + ] + + + replacements = [ + 'UInt8', + 'UInt16', + 'UInt32', + 'UInt64', + 'Int8', + 'Int16', + 'Int32', + 'Int64', + 'Float32', + 'Float64', + 'Boolean', + 'String', + 'Datetime64NS', + 'Datetime64US', + 'Datetime64MS', + 'Datetime64S', + 'Timedelta64NS', + 'Timedelta64US', + 'Timedelta64MS', + 'Timedelta64S' + ] + + for tr, rp in zip(to_replace, replacements): + string = string.replace(tr, rp) + return string diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index a214588d367..f7efd680374 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -6,6 +6,7 @@ import pandas as pd import pytest from hypothesis import given, settings, strategies as st +from cudf.core.series import _fix_nullable_dtype_repr import cudf from cudf.tests import utils @@ -46,12 +47,11 @@ def test_null_series(nrows, dtype): psrepr = psrepr.replace( str(sr._column.default_na_value()) + "\n", "\n" ) - if "uint" in psrepr: - psrepr = psrepr.replace("uint", "UInt") - elif "int" in psrepr: - psrepr = psrepr.replace("int", "Int") - elif 'float' in psrepr: - psrepr = psrepr.replace("float", "Float") + from cudf.core.series import _fix_nullable_dtype_repr + # todo: this is kind of self-fulfilling since this is what is + # called inside _repr_ as well + psrepr = _fix_nullable_dtype_repr(psrepr) + assert psrepr.split() == sr.__repr__().split() @@ -92,7 +92,8 @@ def test_full_series(nrows, dtype): ps = pd.Series(np.random.randint(0, 100, size)).astype(dtype) sr = cudf.from_pandas(ps) pd.options.display.max_rows = int(nrows) - assert ps.__repr__() == sr.__repr__() + psrepr = _fix_nullable_dtype_repr(ps.__repr__()) + assert psrepr == sr.__repr__() @pytest.mark.parametrize("dtype", repr_categories) @@ -154,9 +155,8 @@ def test_integer_dataframe(x): def test_integer_series(x): sr = cudf.Series(x) ps = pd.Series(x) - print(sr) - print(ps) - assert sr.__repr__() == ps.__repr__() + psrepr = _fix_nullable_dtype_repr(ps.__repr__()) + assert sr.__repr__() == psrepr @given(st.lists(st.floats())) @@ -172,7 +172,8 @@ def test_float_dataframe(x): def test_float_series(x): sr = cudf.Series(x, nan_as_null=False) ps = pd.Series(x) - assert sr.__repr__() == ps.__repr__() + psrepr = _fix_nullable_dtype_repr(ps.__repr__()) + assert sr.__repr__() == psrepr @pytest.fixture @@ -201,7 +202,11 @@ def test_mixed_dataframe(mixed_pdf, mixed_gdf): def test_mixed_series(mixed_pdf, mixed_gdf): for col in mixed_gdf.columns: - assert mixed_gdf[col].__repr__() == mixed_pdf[col].__repr__() + try: + assert mixed_gdf[col].__repr__() == _fix_nullable_dtype_repr(mixed_pdf[col].__repr__()) + except: + import pdb + pdb.set_trace() def test_MI(): @@ -253,8 +258,8 @@ def test_generic_index(length, dtype): index=np.random.randint(0, high=100, size=length).astype(dtype), ) gsr = cudf.Series.from_pandas(psr) - - assert psr.index.__repr__() == gsr.index.__repr__() + psrepr = _fix_nullable_dtype_repr(psr.index.__repr__()) + assert psrepr == gsr.index.__repr__() @pytest.mark.parametrize( @@ -316,23 +321,23 @@ def test_dataframe_sliced(gdf, slice, max_seq_items, max_rows): [ ( cudf.Index([1, 2, 3, None]), - "Int64Index([1, 2, 3, ], dtype='int64')", + "Int64Index([1, 2, 3, ], dtype='Int64')", ), ( cudf.Index([None, 2.2, 3.324342, None]), - "Float64Index([, 2.2, 3.324342, ], dtype='float64')", + "Float64Index([, 2.2, 3.324342, ], dtype='Float64')", ), ( cudf.Index([None, None, None], name="hello"), - "Float64Index([, , ], dtype='float64', name='hello')", + "Float64Index([, , ], dtype='Float64', name='hello')", ), ( cudf.Index([None], name="hello"), - "Float64Index([], dtype='float64', name='hello')", + "Float64Index([], dtype='Float64', name='hello')", ), ( - cudf.Index([None], dtype="int8", name="hello"), - "Int8Index([], dtype='int8', name='hello')", + cudf.Index([None], dtype="Int8", name="hello"), + "Int8Index([], dtype='Int8', name='hello')", ), ( cudf.Index([None] * 50, dtype="object"), @@ -340,20 +345,20 @@ def test_dataframe_sliced(gdf, slice, max_seq_items, max_rows): "None None None None None None\n None None None None None None " "None None None None None None None None\n None None None None " "None None None None None None None None None None\n None None " - "None None None None None None], dtype='object')", + "None None None None None None], dtype='String')", ), ( cudf.Index([None] * 20, dtype="uint32"), "UInt32Index([, , , , , , , , " ",\n , , , , , , , , " - ",\n , ],\n dtype='uint32')", + ",\n , ],\n dtype='UInt32')", ), ( cudf.Index( [None, 111, 22, 33, None, 23, 34, 2343, None], dtype="int16" ), "Int16Index([, 111, 22, 33, , 23, 34, 2343, ], " - "dtype='int16')", + "dtype='Int16')", ), ( cudf.Index([1, 2, 3, None], dtype="category"), @@ -370,32 +375,32 @@ def test_dataframe_sliced(gdf, slice, max_seq_items, max_rows): "DatetimeIndex([1970-01-01 00:00:00.000000010, " "1970-01-01 00:00:00.000000020," "\n 1970-01-01 00:00:00.000000030, ],\n " - "dtype='datetime64[ns]')", + "dtype='Datetime64NS')", ), ( cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[s]")), "DatetimeIndex([1970-01-01 00:00:10, " "1970-01-01 00:00:20, 1970-01-01 00:00:30,\n" - " ],\n dtype='datetime64[s]')", + " ],\n dtype='Datetime64S')", ), ( cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[us]")), "DatetimeIndex([1970-01-01 00:00:00.000010, " "1970-01-01 00:00:00.000020,\n " "1970-01-01 00:00:00.000030, ],\n " - "dtype='datetime64[us]')", + "dtype='Datetime64US')", ), ( cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[ms]")), "DatetimeIndex([1970-01-01 00:00:00.010, " "1970-01-01 00:00:00.020,\n " "1970-01-01 00:00:00.030, ],\n " - "dtype='datetime64[ms]')", + "dtype='Datetime64MS')", ), ( cudf.Index(np.array([None] * 10, dtype="datetime64[ms]")), "DatetimeIndex([, , , , , , , , " - ",\n ],\n dtype='datetime64[ms]')", + ",\n ],\n dtype='Datetime64MS')", ), ], ) @@ -576,7 +581,7 @@ def test_series_null_index_repr(sr, pandas_special_case): # Whereas cudf is consistent with strings `null` values # to be printed as `None` everywhere. actual_repr = gsr.__repr__().replace("None", "") - assert expected_repr.split() == actual_repr.split() + assert _fix_nullable_dtype_repr(expected_repr).split() == actual_repr.split() @pytest.mark.parametrize( @@ -614,7 +619,7 @@ def test_timedelta_series_s_us_repr(data, dtype): psr = sr.to_pandas() expected = ( - psr.__repr__().replace("timedelta64[ns]", dtype).replace("NaT", "") + psr.__repr__().replace("timedelta64[ns]", str(sr.dtype)).replace("NaT", "") ) actual = sr.__repr__() @@ -628,7 +633,7 @@ def test_timedelta_series_s_us_repr(data, dtype): cudf.Series([], dtype="timedelta64[ns]"), textwrap.dedent( """ - Series([], dtype: timedelta64[ns]) + Series([], dtype: Timedelta64NS) """ ), ), @@ -636,7 +641,7 @@ def test_timedelta_series_s_us_repr(data, dtype): cudf.Series([], dtype="timedelta64[ms]"), textwrap.dedent( """ - Series([], dtype: timedelta64[ms]) + Series([], dtype: Timedelta64MS) """ ), ), @@ -647,7 +652,7 @@ def test_timedelta_series_s_us_repr(data, dtype): 0 00:00:00.001000000 1 00:00:00.000200000 2 00:00:00.003000000 - dtype: timedelta64[ns] + dtype: Timedelta64NS """ ), ), @@ -658,7 +663,7 @@ def test_timedelta_series_s_us_repr(data, dtype): 0 00:16:40 1 00:03:20 2 00:50:00 - dtype: timedelta64[ms] + dtype: Timedelta64MS """ ), ), @@ -669,7 +674,7 @@ def test_timedelta_series_s_us_repr(data, dtype): 0 00:00:00.001000000 1 00:00:00.000200000 2 - dtype: timedelta64[ns] + dtype: Timedelta64NS """ ), ), @@ -680,7 +685,7 @@ def test_timedelta_series_s_us_repr(data, dtype): 0 00:16:40 1 00:03:20 2 - dtype: timedelta64[ms] + dtype: Timedelta64MS """ ), ), @@ -695,7 +700,7 @@ def test_timedelta_series_s_us_repr(data, dtype): 2 3 4 - dtype: timedelta64[ns] + dtype: Timedelta64NS """ ), ), @@ -710,7 +715,7 @@ def test_timedelta_series_s_us_repr(data, dtype): 2 3 4 - dtype: timedelta64[ms] + dtype: Timedelta64MS """ ), ), @@ -726,7 +731,7 @@ def test_timedelta_series_s_us_repr(data, dtype): 3 00:00:00.000000343 4 00:00:00.004353534 5 00:00:00.000435342 - dtype: timedelta64[ns] + dtype: Timedelta64NS """ ), ), @@ -742,7 +747,7 @@ def test_timedelta_series_s_us_repr(data, dtype): 3 00:00:00.343 4 01:12:33.534 5 00:07:15.342 - dtype: timedelta64[ms] + dtype: Timedelta64MS """ ), ), @@ -760,7 +765,7 @@ def test_timedelta_series_s_us_repr(data, dtype): 4 00:00:00 5 00:00:00.000000332 6 00:00:00.000000323 - dtype: timedelta64[ns] + dtype: Timedelta64NS """ ), ), @@ -778,7 +783,7 @@ def test_timedelta_series_s_us_repr(data, dtype): 4 00:00:00 5 00:00:00.332 6 00:00:00.323 - dtype: timedelta64[ms] + dtype: Timedelta64MS """ ), ), @@ -804,7 +809,7 @@ def test_timedelta_series_s_us_repr(data, dtype): 4 11573 days 23:39:03.241 5 42 days 01:35:48.734 6 0 days 00:00:23.234 - dtype: timedelta64[ms] + dtype: Timedelta64MS """ ), ), @@ -830,7 +835,7 @@ def test_timedelta_series_s_us_repr(data, dtype): 4 00:16:39.992343241 5 00:00:03.634548734 6 00:00:00.000023234 - dtype: timedelta64[ns] + dtype: Timedelta64NS """ ), ), @@ -857,7 +862,7 @@ def test_timedelta_series_s_us_repr(data, dtype): 4 11573 days 23:39:03.241 5 42 days 01:35:48.734 6 0 days 00:00:23.234 - Name: abc, dtype: timedelta64[ms] + Name: abc, dtype: Timedelta64MS """ ), ), @@ -885,7 +890,7 @@ def test_timedelta_series_s_us_repr(data, dtype): y 00:16:39.992343241 l 00:00:03.634548734 m 00:00:00.000023234 - Name: hello, dtype: timedelta64[ns] + Name: hello, dtype: Timedelta64NS """ ), ), @@ -1060,14 +1065,14 @@ def test_timedelta_dataframe_repr(df, expected_repr): ( cudf.Index([1000000, 200000, 3000000], dtype="timedelta64[ms]"), "TimedeltaIndex(['00:16:40', '00:03:20', '00:50:00'], " - "dtype='timedelta64[ms]')", + "dtype='Timedelta64MS')", ), ( cudf.Index( [None, None, None, None, None], dtype="timedelta64[us]" ), "TimedeltaIndex([, , , , ], " - "dtype='timedelta64[us]')", + "dtype='Timedelta64US')", ), ( cudf.Index( @@ -1085,7 +1090,7 @@ def test_timedelta_dataframe_repr(df, expected_repr): "TimedeltaIndex([00:02:16.457654, , 00:04:05.345345, " "00:03:43.432411, ," " 01:00:34.548734, 00:00:00.023234]," - " dtype='timedelta64[us]')", + " dtype='Timedelta64US')", ), ( cudf.Index( @@ -1103,7 +1108,7 @@ def test_timedelta_dataframe_repr(df, expected_repr): "TimedeltaIndex([1579 days 08:54:14, , 2839 days 15:29:05," " 2586 days 00:33:31, , 42066 days 12:52:14, " "0 days 06:27:14]," - " dtype='timedelta64[s]')", + " dtype='Timedelta64S')", ), ], ) From 8552907566dd9dbd15a1a2b99fbd83d0f32cf4c9 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 31 Aug 2020 11:48:45 -0700 Subject: [PATCH 34/80] all timedelta tests pass --- python/cudf/cudf/_lib/types.pyx | 105 +++++++++++++--------- python/cudf/cudf/api/types.py | 8 ++ python/cudf/cudf/core/column/string.py | 3 +- python/cudf/cudf/core/column/timedelta.py | 21 +++-- python/cudf/cudf/core/dtypes.py | 8 +- python/cudf/cudf/core/series.py | 5 +- python/cudf/cudf/tests/test_timedelta.py | 11 +-- python/cudf/cudf/utils/dtypes.py | 2 +- 8 files changed, 97 insertions(+), 66 deletions(-) diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index 457bf16eeab..876b0021fa5 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -18,6 +18,28 @@ from cudf.core.dtypes import ListDtype cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.cpp.types cimport data_type +from cudf.core.dtypes import ( + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, + Float32Dtype, + Float64Dtype, + StringDtype, + BooleanDtype, + Timedelta64NSDtype, + Timedelta64USDtype, + Timedelta64MSDtype, + Timedelta64SDtype, + Datetime64NSDtype, + Datetime64USDtype, + Datetime64MSDtype, + Datetime64SDtype, +) class TypeId(IntEnum): @@ -64,49 +86,49 @@ class TypeId(IntEnum): np_to_cudf_types = { - np.dtype("int8"): TypeId.INT8, - np.dtype("int16"): TypeId.INT16, - np.dtype("int32"): TypeId.INT32, - np.dtype("int64"): TypeId.INT64, - np.dtype("uint8"): TypeId.UINT8, - np.dtype("uint16"): TypeId.UINT16, - np.dtype("uint32"): TypeId.UINT32, - np.dtype("uint64"): TypeId.UINT64, - np.dtype("float32"): TypeId.FLOAT32, - np.dtype("float64"): TypeId.FLOAT64, - np.dtype("datetime64[s]"): TypeId.TIMESTAMP_SECONDS, - np.dtype("datetime64[ms]"): TypeId.TIMESTAMP_MILLISECONDS, - np.dtype("datetime64[us]"): TypeId.TIMESTAMP_MICROSECONDS, - np.dtype("datetime64[ns]"): TypeId.TIMESTAMP_NANOSECONDS, - np.dtype("object"): TypeId.STRING, - np.dtype("bool"): TypeId.BOOL8, - np.dtype("timedelta64[s]"): TypeId.DURATION_SECONDS, - np.dtype("timedelta64[ms]"): TypeId.DURATION_MILLISECONDS, - np.dtype("timedelta64[us]"): TypeId.DURATION_MICROSECONDS, - np.dtype("timedelta64[ns]"): TypeId.DURATION_NANOSECONDS, + Int8Dtype(): TypeId.INT8, + Int16Dtype(): TypeId.INT16, + Int32Dtype(): TypeId.INT32, + Int64Dtype(): TypeId.INT64, + UInt8Dtype(): TypeId.UINT8, + UInt16Dtype(): TypeId.UINT16, + UInt32Dtype(): TypeId.UINT32, + UInt64Dtype(): TypeId.UINT64, + Float32Dtype(): TypeId.FLOAT32, + Float64Dtype(): TypeId.FLOAT64, + Datetime64SDtype(): TypeId.TIMESTAMP_SECONDS, + Datetime64MSDtype(): TypeId.TIMESTAMP_MILLISECONDS, + Datetime64USDtype(): TypeId.TIMESTAMP_MICROSECONDS, + Datetime64NSDtype(): TypeId.TIMESTAMP_NANOSECONDS, + StringDtype(): TypeId.STRING, + BooleanDtype(): TypeId.BOOL8, + Timedelta64SDtype(): TypeId.DURATION_SECONDS, + Timedelta64MSDtype(): TypeId.DURATION_MILLISECONDS, + Timedelta64USDtype(): TypeId.DURATION_MICROSECONDS, + Timedelta64NSDtype(): TypeId.DURATION_NANOSECONDS, } cudf_to_np_types = { - TypeId.INT8: np.dtype("int8"), - TypeId.INT16: np.dtype("int16"), - TypeId.INT32: np.dtype("int32"), - TypeId.INT64: np.dtype("int64"), - TypeId.UINT8: np.dtype("uint8"), - TypeId.UINT16: np.dtype("uint16"), - TypeId.UINT32: np.dtype("uint32"), - TypeId.UINT64: np.dtype("uint64"), - TypeId.FLOAT32: np.dtype("float32"), - TypeId.FLOAT64: np.dtype("float64"), - TypeId.TIMESTAMP_SECONDS: np.dtype("datetime64[s]"), - TypeId.TIMESTAMP_MILLISECONDS: np.dtype("datetime64[ms]"), - TypeId.TIMESTAMP_MICROSECONDS: np.dtype("datetime64[us]"), - TypeId.TIMESTAMP_NANOSECONDS: np.dtype("datetime64[ns]"), - TypeId.STRING: np.dtype("object"), - TypeId.BOOL8: np.dtype("bool"), - TypeId.DURATION_SECONDS: np.dtype("timedelta64[s]"), - TypeId.DURATION_MILLISECONDS: np.dtype("timedelta64[ms]"), - TypeId.DURATION_MICROSECONDS: np.dtype("timedelta64[us]"), - TypeId.DURATION_NANOSECONDS: np.dtype("timedelta64[ns]"), + TypeId.INT8: Int8Dtype(), + TypeId.INT16: Int16Dtype(), + TypeId.INT32: Int32Dtype(), + TypeId.INT64: Int64Dtype(), + TypeId.UINT8: UInt8Dtype(), + TypeId.UINT16: UInt16Dtype(), + TypeId.UINT32: UInt32Dtype(), + TypeId.UINT64: UInt64Dtype(), + TypeId.FLOAT32: Float32Dtype(), + TypeId.FLOAT64: Float64Dtype(), + TypeId.TIMESTAMP_SECONDS: Datetime64SDtype(), + TypeId.TIMESTAMP_MILLISECONDS: Datetime64MSDtype(), + TypeId.TIMESTAMP_MICROSECONDS: Datetime64USDtype(), + TypeId.TIMESTAMP_NANOSECONDS: Datetime64NSDtype(), + TypeId.STRING: StringDtype(), + TypeId.BOOL8: BooleanDtype(), + TypeId.DURATION_SECONDS: Timedelta64SDtype(), + TypeId.DURATION_MILLISECONDS: Timedelta64MSDtype(), + TypeId.DURATION_MICROSECONDS: Timedelta64USDtype(), + TypeId.DURATION_NANOSECONDS: Timedelta64NSDtype(), } duration_unit_map = { @@ -169,10 +191,9 @@ cdef class _Dtype: cdef data_type libcudf_type if not isinstance(self, ListDtype): - np_dtype = self.to_numpy tid = ( ( - np_to_cudf_types[np_dtype] + np_to_cudf_types[self] ) ) else: diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 13494a4bdc3..df785906d0b 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -95,3 +95,11 @@ def find_common_type(array_types=[], scalar_types=[]): scalar_types = [d.to_numpy if isinstance(d, cudf.Generic) else d for d in scalar_types] return cudf.dtype(np.find_common_type(array_types, scalar_types)) + +def can_cast(dtype_l, dtype_r): + if isinstance(dtype_l, cudf.Generic): + dtype_l = dtype_l.to_numpy + if isinstance(dtype_r, cudf.Generic): + dtype_r = dtype_r.to_numpy + + return np.can_cast(dtype_l, dtype_r) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 96c064c7328..d44ebcb474f 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4531,7 +4531,6 @@ def _nbytes(self): return self.children[1].size def as_numerical_column(self, dtype, **kwargs): - out_dtype = cudf.dtype(dtype) kwargs.update(dtype=out_dtype) @@ -4554,7 +4553,7 @@ def as_numerical_column(self, dtype, **kwargs): raise ValueError("Could not convert `None` value to datetime") boolean_match = self.binary_operator("eq", "NaT") - elif out_dtype.type is cudf.Timedelta: + elif out_dtype.type is np.timedelta64: if "format" not in kwargs: if len(self) > 0: kwargs.update(format="%D days %H:%M:%S") diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index b20d943d30c..25d0e711bbb 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -14,7 +14,7 @@ from cudf.core.column.datetime import _numpy_to_pandas_conversion from cudf.utils.dtypes import is_scalar, np_to_pa_dtype from cudf.utils.utils import buffers_from_pyarrow - +from cudf.api.types import can_cast _dtype_to_format_conversion = { "Timedelta64NS": "%D days %H:%M:%S", "Timedelta64US": "%D days %H:%M:%S", @@ -118,13 +118,13 @@ def _binary_op_floordiv(self, rhs): if isinstance(rhs, Scalar): rhs = np.timedelta64(rhs.value) - rhs = rhs.astype(common_dtype).astype("float64") + rhs = rhs.astype(common_dtype.to_numpy).astype("float64") else: rhs = as_scalar(None, "float64") else: - rhs = rhs.astype(common_dtype).astype("float64") + rhs = rhs.astype(common_dtype.to_numpy).astype("float64") - out_dtype = np.dtype("int64") + out_dtype = cudf.Int64Dtype() elif rhs.dtype.kind in ("f", "i", "u"): out_dtype = self.dtype else: @@ -187,13 +187,13 @@ def _binary_op_truediv(self, rhs): if isinstance(rhs, Scalar): rhs = np.timedelta64(rhs.value) - rhs = rhs.astype(common_dtype).astype("float64") + rhs = rhs.astype(common_dtype.to_numpy).astype("float64") else: rhs = as_scalar(None, "float64") else: rhs = rhs.astype(common_dtype).astype("float64") - out_dtype = np.dtype("float64") + out_dtype = cudf.Float64Dtype() elif rhs.dtype.kind in ("f", "i", "u"): out_dtype = self.dtype else: @@ -206,7 +206,6 @@ def _binary_op_truediv(self, rhs): def binary_operator(self, op, rhs, reflect=False): lhs, rhs = self, rhs - if op in ("eq", "ne"): out_dtype = self._binary_op_eq_ne(rhs) elif op in ("lt", "gt", "le", "ge"): @@ -251,7 +250,7 @@ def normalize_binop_value(self, other): other = other.astype("timedelta64[s]") else: common_dtype = determine_out_dtype(self.dtype, other.dtype) - other = other.astype(common_dtype) + other = other.astype(common_dtype.to_numpy) return as_scalar(other) elif np.isscalar(other): return as_scalar(other) @@ -298,7 +297,7 @@ def fillna(self, fill_value): if is_scalar(fill_value): if isinstance(fill_value, np.timedelta64): dtype = determine_out_dtype(self.dtype, fill_value.dtype) - fill_value = fill_value.astype(dtype) + fill_value = fill_value.astype(dtype.to_numpy) col = col.astype(dtype) elif not isinstance(fill_value, Scalar): fill_value = np.timedelta64(fill_value) @@ -572,9 +571,9 @@ def binop(lhs, rhs, op, out_dtype): def determine_out_dtype(lhs_dtype, rhs_dtype): - if np.can_cast(cudf.dtype(lhs_dtype).to_numpy, cudf.dtype(rhs_dtype).to_numpy): + if can_cast(lhs_dtype, rhs_dtype): return cudf.dtype(rhs_dtype) - elif np.can_cast(cudf.dtype(rhs_dtype).to_numpy, cudf.dtype(lhs_dtype).to_numpy): + elif can_cast(rhs_dtype, lhs_dtype): return cudf.dtype(lhs_dtype) else: raise TypeError(f"Cannot type-cast {lhs_dtype} and {rhs_dtype}") diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 5ac2fae2c03..e369494fcf9 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -301,7 +301,7 @@ def make_dtype_from_string(obj): elif obj in {"uint8", "UInt8"}: return UInt8Dtype() elif "float" in obj or "Float" in obj: - if obj in {"float64", "Float64"}: + if obj in {"float64", "Float64", 'float', 'Float'}: return Float64Dtype() elif obj in {"float32", "Float32"}: return Float32Dtype() @@ -342,7 +342,11 @@ def dtype(obj): if isinstance(obj, np.dtype): if obj.type is np.str_: return StringDtype() - return np_to_cudf_dtypes[obj] + try: + return np_to_cudf_dtypes[obj] + except KeyError: + import pdb + pdb.set_trace() elif isinstance(obj, pa.lib.DataType): return pa_to_cudf_dtypes[obj] elif isinstance(obj, str): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index aacabeed0b2..496c20fb677 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1381,7 +1381,7 @@ def __rtruediv__(self, other): __div__ = __truediv__ def _bitwise_binop(self, other, op): - if (isinstance(self.dtype, (cudf.BooleanDtype, cudf.Integer))) and (isinstance(other.dtype, (cudf.BooleanDtype, cudf.Integer))): + if (isinstance(self.dtype, (cudf.BooleanDtype, cudf.Integer, cudf.Timedelta))) and (isinstance(other.dtype, (cudf.BooleanDtype, cudf.Integer, cudf.Timedelta))): ser = self._binaryop(other, op) if isinstance(self.dtype, cudf.BooleanDtype) or isinstance(other.dtype, cudf.BooleanDtype): ser = ser.astype(cudf.BooleanDtype()) @@ -5028,7 +5028,7 @@ def _fix_nullable_dtype_repr(string): 'datetime64[ns]', 'datetime64[us]', 'datetime64[ms]', - 'datetime64[s]' + 'datetime64[s]', 'timedelta64[ns]', 'timedelta64[us]', 'timedelta64[ms]', @@ -5058,7 +5058,6 @@ def _fix_nullable_dtype_repr(string): 'Timedelta64MS', 'Timedelta64S' ] - for tr, rp in zip(to_replace, replacements): string = string.replace(tr, rp) return string diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 8a3e7acd68d..634517368d5 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -976,7 +976,7 @@ def test_timedelta_invalid_ops(): with pytest.raises( TypeError, match=re.escape( - f"Addition of {sr.dtype} with {np.dtype('int64')} " + f"Addition of {sr.dtype} with Int64 " f"cannot be performed." ), ): @@ -990,7 +990,7 @@ def test_timedelta_invalid_ops(): with pytest.raises( TypeError, match=re.escape( - f"Addition of {sr.dtype} with {np.dtype('object')} " + f"Addition of {sr.dtype} with String " f"cannot be performed." ), ): @@ -1021,7 +1021,7 @@ def test_timedelta_invalid_ops(): with pytest.raises( TypeError, match=re.escape( - f"Modulus of {sr.dtype} with {np.dtype('object')} " + f"Modulus of {sr.dtype} with String " f"cannot be performed." ), ): @@ -1157,13 +1157,14 @@ def test_timedelta_invalid_ops(): def test_timedelta_datetime_cast_invalid(): + from cudf.core.series import _fix_nullable_dtype_repr sr = cudf.Series([1, 2, 3], dtype="timedelta64[ns]") psr = sr.to_pandas() try: psr.astype("datetime64[ns]") except TypeError as e: - with pytest.raises(type(e), match=re.escape(e.__str__())): + with pytest.raises(type(e), match=re.escape(_fix_nullable_dtype_repr(e.__str__()))): sr.astype("datetime64[ns]") else: raise AssertionError("Expected timedelta to datetime typecast to fail") @@ -1174,7 +1175,7 @@ def test_timedelta_datetime_cast_invalid(): try: psr.astype("timedelta64[ns]") except TypeError as e: - with pytest.raises(type(e), match=re.escape(e.__str__())): + with pytest.raises(type(e), match=re.escape(_fix_nullable_dtype_repr(e.__str__()))): sr.astype("timedelta64[ns]") else: raise AssertionError("Expected datetime to timedelta typecast to fail") diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 7efc0cd049f..67463b7317f 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -360,7 +360,7 @@ def check_cast_unsupported_dtype(dtype): else: dtype = np.dtype(dtype) - if dtype in cudf._lib.types.np_to_cudf_types: + if cudf.dtype(dtype) in cudf._lib.types.np_to_cudf_types: return dtype if dtype == np.dtype("float16"): From 2b59285ed7e12e2faadf724b31b904a40a530aa6 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 31 Aug 2020 13:14:32 -0700 Subject: [PATCH 35/80] sorting tests pass --- python/cudf/cudf/_lib/string_casting.pyx | 35 ++++++++++++++++-------- python/cudf/cudf/api/types.py | 8 ++++++ python/cudf/cudf/core/indexing.py | 4 +-- 3 files changed, 33 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index aee554017af..3f63bb23d6f 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -52,7 +52,18 @@ from cudf._lib.cpp.types cimport ( from libcpp.memory cimport unique_ptr from libcpp.string cimport string - +from cudf.core.dtypes import ( + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, + Float32Dtype, + Float64Dtype, +) def floating_to_string(Column input_col): cdef column_view input_column_view = input_col.view() @@ -112,7 +123,7 @@ def stod(Column input_col, **kwargs): A Column with strings cast to double """ - return string_to_floating(input_col, np.dtype("float64")) + return string_to_floating(input_col, Float64Dtype()) def ftos(Column input_col, **kwargs): @@ -144,7 +155,7 @@ def stof(Column input_col, **kwargs): A Column with strings cast to float """ - return string_to_floating(input_col, np.dtype("float32")) + return string_to_floating(input_col, Float32Dtype()) def integer_to_string(Column input_col): @@ -205,7 +216,7 @@ def stoi8(Column input_col, **kwargs): A Column with strings cast to int8 """ - return string_to_integer(input_col, np.dtype("int8")) + return string_to_integer(input_col, Int8Dtype()) def i16tos(Column input_col, **kwargs): @@ -237,7 +248,7 @@ def stoi16(Column input_col, **kwargs): A Column with strings cast to int16 """ - return string_to_integer(input_col, np.dtype("int16")) + return string_to_integer(input_col, Int16Dtype()) def itos(Column input_col, **kwargs): @@ -269,7 +280,7 @@ def stoi(Column input_col, **kwargs): A Column with strings cast to int32 """ - return string_to_integer(input_col, np.dtype("int32")) + return string_to_integer(input_col, Int32Dtype()) def ltos(Column input_col, **kwargs): @@ -301,7 +312,7 @@ def stol(Column input_col, **kwargs): A Column with strings cast to int64 """ - return string_to_integer(input_col, np.dtype("int64")) + return string_to_integer(input_col, Int64Dtype()) def ui8tos(Column input_col, **kwargs): @@ -333,7 +344,7 @@ def stoui8(Column input_col, **kwargs): A Column with strings cast to uint8 """ - return string_to_integer(input_col, np.dtype("uint8")) + return string_to_integer(input_col, UInt8Dtype()) def ui16tos(Column input_col, **kwargs): @@ -365,7 +376,7 @@ def stoui16(Column input_col, **kwargs): A Column with strings cast to uint16 """ - return string_to_integer(input_col, np.dtype("uint16")) + return string_to_integer(input_col, UInt16Dtype()) def uitos(Column input_col, **kwargs): @@ -397,7 +408,7 @@ def stoui(Column input_col, **kwargs): A Column with strings cast to uint32 """ - return string_to_integer(input_col, np.dtype("uint32")) + return string_to_integer(input_col, UInt32Dtype()) def ultos(Column input_col, **kwargs): @@ -429,7 +440,7 @@ def stoul(Column input_col, **kwargs): A Column with strings cast to uint64 """ - return string_to_integer(input_col, np.dtype("uint64")) + return string_to_integer(input_col, UInt64Dtype()) def _to_booleans(Column input_col, object string_true="True"): @@ -717,7 +728,7 @@ def htoi(Column input_col, **kwargs): cdef column_view input_column_view = input_col.view() cdef type_id tid = ( ( - np_to_cudf_types[kwargs.get('dtype', np.dtype("int64"))] + np_to_cudf_types[kwargs.get('dtype', Int64Dtype())] ) ) cdef data_type c_out_type = data_type(tid) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index df785906d0b..a89adf10c22 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -103,3 +103,11 @@ def can_cast(dtype_l, dtype_r): dtype_r = dtype_r.to_numpy return np.can_cast(dtype_l, dtype_r) + +def result_type(dtype_l, dtype_r): + if isinstance(dtype_l, cudf.Generic): + dtype_l = dtype_l.to_numpy + if isinstance(dtype_r, cudf.Generic): + dtype_r = dtype_r.to_numpy + + return cudf.dtype(np.result_type(dtype_l, dtype_r)) diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index dcc9ddf9315..ba388b45f21 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -95,8 +95,8 @@ def __setitem__(self, key, value): ): # normalize types if necessary: if not pd.api.types.is_integer(key): - to_dtype = np.result_type(value.dtype, self._sr._column.dtype) - value = value.astype(to_dtype) + to_dtype = cudf.api.types.result_type(value.dtype, self._sr._column.dtype) + value = value.astype(to_dtype.to_numpy) self._sr._column._mimic_inplace( self._sr._column.astype(to_dtype), inplace=True ) From b2851a2fa4248626bef45be06d0a9f77862ffa94 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 31 Aug 2020 13:45:16 -0700 Subject: [PATCH 36/80] fix more tests --- python/cudf/cudf/_lib/binaryop.pyx | 11 ++--------- python/cudf/cudf/core/column/column.py | 4 ++-- python/cudf/cudf/tests/test_udf_binops.py | 3 ++- python/cudf/cudf/utils/dtypes.py | 2 +- 4 files changed, 7 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/_lib/binaryop.pyx b/python/cudf/cudf/_lib/binaryop.pyx index 4323e1f4b79..18c72da25f9 100644 --- a/python/cudf/cudf/_lib/binaryop.pyx +++ b/python/cudf/cudf/_lib/binaryop.pyx @@ -224,15 +224,8 @@ def binaryop_udf(Column lhs, Column rhs, udf_ptx, dtype): """ cdef column_view c_lhs = lhs.view() cdef column_view c_rhs = rhs.view() - - cdef type_id tid = ( - ( - ( - np_to_cudf_types[np.dtype(dtype)] - ) - ) - ) - cdef data_type c_dtype = data_type(tid) + cdef _Dtype pydtype = dtype + cdef data_type c_dtype = pydtype.get_libcudf_type() cdef string cpp_str = udf_ptx.encode("UTF-8") diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 94f05935537..da4c9ab06b3 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -27,7 +27,7 @@ from cudf.core.dtypes import CategoricalDtype from cudf.utils import ioutils, utils from cudf.utils.dtypes import ( - NUMERIC_TYPES, + NEW_NUMERIC_TYPES, check_cast_unsupported_dtype, get_time_unit, is_scalar, @@ -119,7 +119,7 @@ def __len__(self): return self.size def to_pandas(self, index=None, **kwargs): - if str(self.dtype) in NUMERIC_TYPES and self.null_count == 0: + if str(self.dtype) in NEW_NUMERIC_TYPES and self.null_count == 0: pd_series = pd.Series(cupy.asnumpy(self.values)) else: pd_series = self.to_arrow().to_pandas(**kwargs) diff --git a/python/cudf/cudf/tests/test_udf_binops.py b/python/cudf/cudf/tests/test_udf_binops.py index e3d03b80ae2..dfae94f1279 100644 --- a/python/cudf/cudf/tests/test_udf_binops.py +++ b/python/cudf/cudf/tests/test_udf_binops.py @@ -5,6 +5,7 @@ import numpy as np import pytest +import cudf from cudf import _lib as libcudf from cudf.core import Series from cudf.utils import dtypes as dtypeutils @@ -44,7 +45,7 @@ def generic_function(a, b): output_type = numpy_support.as_dtype(result.signature.return_type) out_col = libcudf.binaryop.binaryop_udf( - lhs_col, rhs_col, ptx_code, output_type.type + lhs_col, rhs_col, ptx_code, cudf.dtype(output_type.type) ) result = lhs_arr ** 3 + rhs_arr diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 67463b7317f..bd5a1f4ab2c 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -67,7 +67,7 @@ } OTHER_TYPES = {"bool", "category", "str"} ALL_TYPES = NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | OTHER_TYPES - +NEW_NUMERIC_TYPES = {'Int8', 'Int16', 'Int32', 'Int64', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'Float32', 'Float64'} def np_to_pa_dtype(dtype): """Util to convert numpy dtype to PyArrow dtype. From 9540643209fb9d21a5466081eb1233f42896e599 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 1 Sep 2020 07:11:08 -0700 Subject: [PATCH 37/80] hackily pass select_dtype tests --- python/cudf/cudf/core/dataframe.py | 14 +++++++------- python/cudf/cudf/core/series.py | 2 +- python/cudf/cudf/tests/test_dataframe.py | 6 ++---- python/cudf/cudf/tests/test_setitem.py | 2 +- python/cudf/cudf/utils/dtypes.py | 10 +++++++++- 5 files changed, 20 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 7f998784d34..93956f35a17 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3206,7 +3206,7 @@ def as_gpu_matrix(self, columns=None, order="F"): for colidx, inpcol in enumerate(cols): dense = inpcol.astype(cupy_dtype) matrix[:, colidx] = dense - return cuda.as_cuda_array(matrix).view(dtype) + return cuda.as_cuda_array(matrix).view(cupy_dtype) def as_matrix(self, columns=None): """Convert to a matrix in host memory. @@ -6324,20 +6324,20 @@ def select_dtypes(self, include=None, exclude=None): inc_ex=(include & exclude) ) ) - # include all subtypes + include_subtypes = set() - for dtype in self.dtypes: + for dtype in (d.__class__ for d in self.dtypes): for i_dtype in include: # category handling if is_categorical_dtype(i_dtype): include_subtypes.add(i_dtype) - elif isinstance(dtype, i_dtype): - include_subtypes.add(dtype.__class__) + elif issubclass(dtype, i_dtype): + include_subtypes.add(dtype) # exclude all subtypes exclude_subtypes = set() - for dtype in self.dtypes: + for dtype in (d.__class__ for d in self.dtypes): for e_dtype in exclude: # category handling if is_categorical_dtype(e_dtype): @@ -6367,7 +6367,7 @@ def select_dtypes(self, include=None, exclude=None): @ioutils.doc_to_parquet() def to_parquet(self, path, *args, **kwargs): """{docstring}""" - from cudf.io import parquet as pq + from cudf.io import parquet as pq6 return pq.to_parquet(self, path, *args, **kwargs) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 496c20fb677..bfe1cc74151 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3902,7 +3902,7 @@ def describe_categorical(self): # pandas defaults percentiles = np.array([0.25, 0.5, 0.75]) - if np.issubdtype(self.dtype, np.number): + if isinstance(self.dtype, cudf.Number): return describe_numeric(self) else: raise NotImplementedError( diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 000827cc9c8..6bb927bd51c 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -25,6 +25,7 @@ does_not_raise, gen_rand, ) +from cudf.core.dtypes import Number def test_init_via_list_of_tuples(): @@ -2516,15 +2517,12 @@ def test_select_dtype(): nrows=20, dtypes={"a": "category", "b": int, "c": float, "d": str} ) pdf = gdf.to_pandas() - assert_eq(pdf.select_dtypes("float64"), gdf.select_dtypes("float64")) assert_eq(pdf.select_dtypes(np.float64), gdf.select_dtypes(np.float64)) assert_eq( pdf.select_dtypes(include=["float64"]), gdf.select_dtypes(include=["float64"]), ) - import pdb - pdb.set_trace() assert_eq( pdf.select_dtypes(include=["object", "int", "category"]), gdf.select_dtypes(include=["object", "int", "category"]), @@ -2536,7 +2534,7 @@ def test_select_dtype(): ) assert_eq( pdf.select_dtypes(include=np.number), - gdf.select_dtypes(include=np.number), + gdf.select_dtypes(include=Number), ) assert_eq( pdf.select_dtypes(include=[np.int64, np.float64]), diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 9ae5c17da47..e0d35f2eb5c 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -143,7 +143,7 @@ def test_series_set_equal_length_object_by_mask(replace_data): # Lengths match in trivial case pd_bool_col = pd.Series([True] * len(psr)) gd_bool_col = Series.from_pandas(pd_bool_col) - + psr[pd_bool_col] = ( replace_data.to_pandas() if hasattr(replace_data, "to_pandas") diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index bd5a1f4ab2c..a9c265247c8 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -13,6 +13,7 @@ import cudf from cudf._lib.scalar import Scalar from cudf.api.types import is_categorical_dtype +import inspect _NA_REP = "" @@ -139,12 +140,19 @@ def cudf_dtype_from_pydata_dtype(dtype): """ if isinstance(dtype, cudf.Generic): return dtype.__class__ + if inspect.isclass(dtype): + if issubclass(dtype, cudf.Generic): + return dtype if is_categorical_dtype(dtype): return cudf.core.dtypes.CategoricalDtype elif np.issubdtype(dtype, np.datetime64): dtype = np.datetime64 - return cudf.dtype(infer_dtype_from_object(dtype)).__class__ + result = cudf.dtype(infer_dtype_from_object(dtype)) + if isinstance(result, cudf.Generic): + return result.__class__ + elif inspect.isclass(result): + return result def is_scalar(val): From 781b42ee8bc300d787c4fc6e15cc0913cb66e757 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 1 Sep 2020 11:41:32 -0700 Subject: [PATCH 38/80] all dataframe tests pass! --- python/cudf/cudf/core/column/column.py | 3 +- python/cudf/cudf/core/column/numerical.py | 2 +- python/cudf/cudf/core/dataframe.py | 10 ++-- python/cudf/cudf/core/dtypes.py | 10 ++++ python/cudf/cudf/core/series.py | 4 +- python/cudf/cudf/tests/test_dataframe.py | 73 ++++++++++++----------- python/cudf/cudf/utils/dtypes.py | 4 ++ 7 files changed, 60 insertions(+), 46 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index da4c9ab06b3..fb4362c7677 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -769,7 +769,7 @@ def isin(self, values): lhs_cats = lhs.cat().categories._values rhs_cats = rhs.cat().categories._values - if not np.issubdtype(rhs_cats.dtype, lhs_cats.dtype): + if not isinstance(rhs_cats.dtype, type(lhs_cats.dtype)): # If they're not the same dtype, short-circuit if the values # list doesn't have any nulls. If it does have nulls, make # the values list a Categorical with a single null @@ -1083,7 +1083,6 @@ def column_empty(row_count, dtype="object", masked=False): """ dtype = pd.api.types.pandas_dtype(dtype) children = () - if is_categorical_dtype(dtype): data = None children = ( diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index d467d0d0ddb..ccab749e3ba 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -233,7 +233,7 @@ def round(self, decimals=0): msg = "Decimal values < 0 are not yet supported." raise NotImplementedError(msg) - if np.issubdtype(self.dtype, np.integer): + if isinstance(self.dtype, cudf.Integer): return self data = Buffer( diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 93956f35a17..1bcbea8d8be 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -4522,7 +4522,7 @@ def _sizeof_fmt(num, size_qualifier): deep = True else: deep = False - if "object" in dtype_counts or self.index.dtype == "object": + if "String" in dtype_counts or self.index.dtype == cudf.StringDtype(): size_qualifier = "+" mem_usage = self.memory_usage(index=True, deep=deep).sum() lines.append( @@ -5308,12 +5308,12 @@ def isin(self, values): isinstance( self[col]._column, cudf.core.column.CategoricalColumn ) - or np.issubdtype(self[col].dtype, np.dtype("object")) + or isinstance(self[col].dtype, cudf.StringDtype) ) or ( isinstance( values._column, cudf.core.column.CategoricalColumn ) - or np.issubdtype(values.dtype, np.dtype("object")) + or isinstance(values.dtype, cudf.StringDtype) ): result[col] = utils.scalar_broadcast_to(False, len(self)) else: @@ -5371,8 +5371,8 @@ def _prepare_for_rowwise_op(self): ) raise ValueError(msg) - filtered = self.select_dtypes(include=[np.number, np.bool]) - common_dtype = np.find_common_type(filtered.dtypes, []) + filtered = self.select_dtypes(include=[cudf.Number, cudf.BooleanDtype]) + common_dtype = cudf.api.types.find_common_type(filtered.dtypes, []) coerced = filtered.astype(common_dtype) return coerced diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index e369494fcf9..80c8d9ce7e8 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -329,6 +329,12 @@ def dtype(obj): if obj is None: return None + if obj is str: + return cudf.StringDtype() + if obj is int: + return cudf.Int64Dtype() + if obj is float: + return cudf.Float64Dtype() if isinstance(obj, pd.CategoricalDtype): return cudf.CategoricalDtype.from_pandas(obj) if isinstance(obj, CategoricalDtype): @@ -478,6 +484,10 @@ def deserialize(cls, header, frames): ) return cls(categories=categories, ordered=ordered) + @property + def kind(self): + return 'O' + class ListDtype(Generic): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index bfe1cc74151..a7dd73fbc4a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3945,7 +3945,7 @@ def diff(self, periods=1): "Diff currently requires columns with no null values" ) - if not np.issubdtype(self.dtype, np.number): + if not isinstance(self.dtype, cudf.Number): raise NotImplementedError( "Diff currently only supports numeric dtypes" ) @@ -3953,7 +3953,7 @@ def diff(self, periods=1): # TODO: move this libcudf input_col = self._column output_col = column_empty_like(input_col) - output_mask = column_empty_like(input_col, dtype="bool") + output_mask = column_empty_like(input_col, dtype=cudf.BooleanDtype()) if output_col.size > 0: cudautils.gpu_diff.forall(output_col.size)( input_col, output_col, output_mask, periods diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 6bb927bd51c..60f7d93e467 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3237,30 +3237,31 @@ def test_empty_dataframe_describe(): def test_as_column_types(): from cudf.core.column import column + from cudf import Float32Dtype, Float64Dtype, StringDtype col = column.as_column(Series([])) - assert_eq(col.dtype, np.dtype("float64")) + assert isinstance(col.dtype, Float64Dtype) gds = Series(col) pds = pd.Series(pd.Series([])) assert_eq(pds, gds) col = column.as_column(Series([]), dtype="float32") - assert_eq(col.dtype, np.dtype("float32")) + assert isinstance(col.dtype, Float32Dtype) gds = Series(col) pds = pd.Series(pd.Series([], dtype="float32")) assert_eq(pds, gds) col = column.as_column(Series([]), dtype="str") - assert_eq(col.dtype, np.dtype("object")) + assert isinstance(col.dtype, StringDtype) gds = Series(col) pds = pd.Series(pd.Series([], dtype="str")) assert_eq(pds, gds) col = column.as_column(Series([]), dtype="object") - assert_eq(col.dtype, np.dtype("object")) + assert isinstance(col.dtype, StringDtype) gds = Series(col) pds = pd.Series(pd.Series([], dtype="object")) @@ -5619,17 +5620,17 @@ def test_dataframe_info_basic(): Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- - 0 0 10 non-null float64 - 1 1 10 non-null float64 - 2 2 10 non-null float64 - 3 3 10 non-null float64 - 4 4 10 non-null float64 - 5 5 10 non-null float64 - 6 6 10 non-null float64 - 7 7 10 non-null float64 - 8 8 10 non-null float64 - 9 9 10 non-null float64 - dtypes: float64(10) + 0 0 10 non-null Float64 + 1 1 10 non-null Float64 + 2 2 10 non-null Float64 + 3 3 10 non-null Float64 + 4 4 10 non-null Float64 + 5 5 10 non-null Float64 + 6 6 10 non-null Float64 + 7 7 10 non-null Float64 + 8 8 10 non-null Float64 + 9 9 10 non-null Float64 + dtypes: Float64(10) memory usage: 859.0+ bytes """ ) @@ -5652,9 +5653,9 @@ def test_dataframe_info_verbose_mem_usage(): Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- - 0 a 3 non-null int64 - 1 b 3 non-null object - dtypes: int64(1), object(1) + 0 a 3 non-null Int64 + 1 b 3 non-null String + dtypes: Int64(1), String(1) memory usage: 56.0+ bytes """ ) @@ -5670,7 +5671,7 @@ def test_dataframe_info_verbose_mem_usage(): RangeIndex: 3 entries, 0 to 2 Columns: 2 entries, a to b - dtypes: int64(1), object(1) + dtypes: Int64(1), String(1) memory usage: 56.0+ bytes """ ) @@ -5692,9 +5693,9 @@ def test_dataframe_info_verbose_mem_usage(): Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- - 0 a 3 non-null int64 - 1 b 3 non-null object - dtypes: int64(1), object(1) + 0 a 3 non-null Int64 + 1 b 3 non-null String + dtypes: Int64(1), String(1) memory usage: 91.0 bytes """ ) @@ -5723,10 +5724,10 @@ def test_dataframe_info_verbose_mem_usage(): Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- - 0 int_col 5 non-null int64 - 1 text_col 5 non-null object - 2 float_col 5 non-null float64 - dtypes: float64(1), int64(1), object(1) + 0 int_col 5 non-null Int64 + 1 text_col 5 non-null String + 2 float_col 5 non-null Float64 + dtypes: Float64(1), Int64(1), String(1) memory usage: 130.0 bytes """ ) @@ -5758,10 +5759,10 @@ def test_dataframe_info_null_counts(): Data columns (total 3 columns): # Column Dtype --- ------ ----- - 0 int_col int64 - 1 text_col object - 2 float_col float64 - dtypes: float64(1), int64(1), object(1) + 0 int_col Int64 + 1 text_col String + 2 float_col Float64 + dtypes: Float64(1), Int64(1), String(1) memory usage: 130.0+ bytes """ ) @@ -5808,9 +5809,9 @@ def test_dataframe_info_null_counts(): Data columns (total 2 columns): # Column Dtype --- ------ ----- - 0 a int64 - 1 b object - dtypes: int64(1), object(1) + 0 a Int64 + 1 b String + dtypes: Int64(1), String(1) memory usage: 238.0+ bytes """ ) @@ -5830,9 +5831,9 @@ def test_dataframe_info_null_counts(): Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- - 0 a 6 non-null int64 - 1 b 6 non-null object - dtypes: int64(1), object(1) + 0 a 6 non-null Int64 + 1 b 6 non-null String + dtypes: Int64(1), String(1) memory usage: 238.0+ bytes """ ) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index a9c265247c8..7f609797397 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -360,6 +360,10 @@ def min_column_type(x, expected_type): def check_cast_unsupported_dtype(dtype): + + if isinstance(dtype, cudf.Generic): + return dtype.to_numpy + if is_categorical_dtype(dtype): return dtype From 13fe291dbe9965d454a28ff4f98d1d13b8da7ad2 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 2 Sep 2020 08:14:48 -0700 Subject: [PATCH 39/80] much more progress --- python/cudf/cudf/_lib/transform.pyx | 14 +++++--------- python/cudf/cudf/api/types.py | 2 ++ python/cudf/cudf/core/column/column.py | 9 +++++++-- python/cudf/cudf/core/column/datetime.py | 7 ++++--- python/cudf/cudf/core/column/numerical.py | 3 +++ python/cudf/cudf/core/dataframe.py | 6 +++++- python/cudf/cudf/core/dtypes.py | 7 ++----- python/cudf/cudf/core/indexing.py | 9 ++++----- python/cudf/cudf/core/join/join.py | 3 +++ python/cudf/cudf/core/tools/datetimes.py | 4 ++-- python/cudf/cudf/tests/test_categorical.py | 15 ++++++++------- python/cudf/cudf/tests/test_datetime.py | 5 +++-- python/cudf/cudf/tests/test_feather.py | 1 - python/cudf/cudf/utils/dtypes.py | 5 +++++ 14 files changed, 53 insertions(+), 37 deletions(-) diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index 52a83744fce..8fafa166471 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -24,7 +24,7 @@ from cudf._lib.cpp.types cimport ( type_id, ) from cudf._lib.types import np_to_cudf_types -from cudf._lib.types cimport underlying_type_t_type_id +from cudf._lib.types cimport underlying_type_t_type_id, _Dtype from cudf._lib.cpp.column.column_view cimport column_view try: @@ -102,14 +102,10 @@ def transform(Column input, op): compiled_op = cudautils.compile_udf(op, nb_signature) c_str = compiled_op[0].encode('UTF-8') np_dtype = np.dtype(compiled_op[1]) - - try: - c_tid = ( - np_to_cudf_types[np_dtype] - ) - c_dtype = data_type(c_tid) - - except KeyError: + cdef _Dtype pydtype = cudf.dtype(np_dtype) + if pydtype in np_to_cudf_types.keys(): + c_dtype = pydtype.get_libcudf_type() + else: raise TypeError( "Result of window function has unsupported dtype {}" .format(np_dtype) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index a89adf10c22..54e2d64b80e 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -5,6 +5,8 @@ def is_bool_dtype(obj): # todo - pd.api.types.is_bool_dtype should not give false, nor work at all probably + if hasattr(obj, "dtype"): + obj = obj.dtype return isinstance(obj, cudf.BooleanDtype) or pd.api.types.is_bool_dtype(obj) def is_datetime64_dtype(obj): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index fb4362c7677..86eb7467db5 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1624,8 +1624,13 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): dtype = pd.api.types.pandas_dtype(dtype) if is_categorical_dtype(dtype): raise TypeError - pa_data = pa.array(arbitrary, type=dtype.pa_type if dtype is not None else None, from_pandas=True if nan_as_null is None else nan_as_null) - data = as_column(pa_data, dtype=cudf.dtype(pa_data.type), nan_as_null=nan_as_null) + + pa_data = pa.array(arbitrary, + type=dtype.pa_type if dtype is not None else None, + from_pandas=True if nan_as_null is None else nan_as_null) + # todo: fix this ???? ???????? + as_column_dtype = cudf.dtype(pa_data.type) if not isinstance(pa_data.type, pa.lib.DictionaryType) else None + data = as_column(pa_data, dtype=as_column_dtype, nan_as_null=nan_as_null) except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError): if is_categorical_dtype(dtype): diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index ba652c3d3d2..95940df5944 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -293,10 +293,11 @@ def from_arrow(cls, array, dtype=None): ) def can_cast_safely(self, to_dtype): - if np.issubdtype(to_dtype, np.datetime64): + to_dtype = cudf.dtype(to_dtype) + if isinstance(to_dtype, cudf.Datetime): - to_res, _ = np.datetime_data(to_dtype) - self_res, _ = np.datetime_data(self.dtype) + to_res, _ = np.datetime_data(to_dtype.to_numpy) + self_res, _ = np.datetime_data(self.dtype.to_numpy) max_int = np.iinfo(np.dtype("int64")).max diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index ccab749e3ba..71787be695e 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -174,6 +174,9 @@ def as_numerical_column(self, dtype, **kwargs): # expect a cudf dtype always here if dtype == self.dtype: return self + if dtype is None: + import pdb + pdb.set_trace() return libcudf.unary.cast(self, dtype) @classmethod diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1bcbea8d8be..833edc9e1c3 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -4821,8 +4821,12 @@ def to_arrow(self, preserve_index=True): # We may want to add additional metadata to this in the future, but # for now lets just piggyback off of what's done for Pandas + + # egregious hack + metadata_df = self.head(0).to_pandas() + metadata = pa.pandas_compat.construct_metadata( - self, + metadata_df, names, index_columns, index_descriptors, diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 80c8d9ce7e8..b47281e5c36 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -348,11 +348,8 @@ def dtype(obj): if isinstance(obj, np.dtype): if obj.type is np.str_: return StringDtype() - try: - return np_to_cudf_dtypes[obj] - except KeyError: - import pdb - pdb.set_trace() + else: + return np_to_cudf_dtypes.get(obj, None) elif isinstance(obj, pa.lib.DataType): return pa_to_cudf_dtypes[obj] elif isinstance(obj, str): diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index ba388b45f21..9481d2a83e3 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -207,13 +207,13 @@ def _can_downcast_to_series(self, df, arg): ): return False else: - if pd.api.types.is_bool_dtype( + if cudf.api.types.is_bool_dtype( as_column(arg[0]).dtype ) and not isinstance(arg[1], slice): return True dtypes = df.dtypes.values.tolist() all_numeric = all( - [pd.api.types.is_numeric_dtype(t) for t in dtypes] + [cudf.api.types.is_numerical_dtype(t) for t in dtypes] ) if all_numeric: return True @@ -316,8 +316,7 @@ def _getitem_tuple_arg(self, arg): if len(tmp_arg[0]) == 0: return columns_df._empty_like(keep_index=True) tmp_arg = (column.as_column(tmp_arg[0]), tmp_arg[1]) - - if pd.api.types.is_bool_dtype(tmp_arg[0]): + if cudf.api.types.is_bool_dtype(tmp_arg[0]): df = columns_df._apply_boolean_mask(tmp_arg[0]) else: tmp_col_name = str(uuid4()) @@ -344,7 +343,7 @@ def _getitem_tuple_arg(self, arg): df.index = as_index(start) else: row_selection = column.as_column(arg[0]) - if pd.api.types.is_bool_dtype(row_selection.dtype): + if cudf.api.types.is_bool_dtype(row_selection.dtype): df.index = self._df.index.take(row_selection) else: df.index = as_index(row_selection) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 231a114aff7..e2a0af5cef2 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -405,6 +405,9 @@ def input_to_libcudf_casting_rules(self, lcol, rcol, how): dtype_r, cudf.Datetime ): libcudf_join_type = cudf.dtype(max(dtype_l.to_numpy, dtype_r.to_numpy)) + if libcudf_join_type is None: + # todo: test this + raise TypeError(f"Cant find an implicit common type for {dtype_l} and {dtype_r}") return libcudf_join_type def libcudf_to_output_casting_rules(self, lcol, rcol, how): diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index fb34c3c2f49..8abbc0e0ac2 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -184,7 +184,7 @@ def to_datetime( column.datetime._numpy_to_pandas_conversion[u] / ( column.datetime._numpy_to_pandas_conversion["s"] - if np.datetime_data(col.dtype)[0] == "s" + if np.datetime_data(col.dtype.to_numpy)[0] == "s" else 1 ) ) @@ -261,7 +261,7 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format): return col elif col.dtype.kind == "m": raise TypeError( - f"dtype {col.dtype} cannot be converted to {_unit_dtype_map[unit]}" + f"dtype {col.dtype} cannot be converted to {str(cudf.dtype(_unit_dtype_map[unit]))}" ) if col.dtype.kind in ("f"): diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index d3f3f2c2dd1..2e7233c0ff3 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -8,6 +8,7 @@ from cudf.core import DataFrame, Series from cudf.core.index import as_index from cudf.tests.utils import assert_eq +from cudf.core.series import _fix_nullable_dtype_repr @pytest.fixture @@ -67,7 +68,7 @@ def test_categorical_integer(): 3 c 4 a dtype: category -Categories (3, object): [a, b, c] +Categories (3, String): [a, b, c] """ assert string.split() == expect_str.split() @@ -360,7 +361,7 @@ def test_categorical_as_ordered(pd_str_cat, inplace): assert cd_sr_1.cat.ordered is True assert cd_sr_1.cat.ordered == pd_sr_1.cat.ordered - assert str(cd_sr_1) == str(pd_sr_1) + assert str(cd_sr_1) == _fix_nullable_dtype_repr(str(pd_sr_1)) @pytest.mark.parametrize("inplace", [True, False]) @@ -379,7 +380,7 @@ def test_categorical_as_unordered(pd_str_cat, inplace): assert cd_sr_1.cat.ordered is False assert cd_sr_1.cat.ordered == pd_sr_1.cat.ordered - assert str(cd_sr_1) == str(pd_sr_1) + assert str(cd_sr_1) == _fix_nullable_dtype_repr(str(pd_sr_1)) @pytest.mark.parametrize("from_ordered", [True, False]) @@ -394,7 +395,7 @@ def test_categorical_reorder_categories( assert_eq(pd_sr, cd_sr) - assert str(pd_sr) == str(cd_sr) + assert _fix_nullable_dtype_repr(str(pd_sr)) == str(cd_sr) kwargs = dict(ordered=to_ordered, inplace=inplace) @@ -405,7 +406,7 @@ def test_categorical_reorder_categories( assert_eq(pd_sr_1, cd_sr_1) - assert str(cd_sr_1) == str(pd_sr_1) + assert str(cd_sr_1) == _fix_nullable_dtype_repr(str(pd_sr_1)) @pytest.mark.parametrize("inplace", [True, False]) @@ -416,7 +417,7 @@ def test_categorical_add_categories(pd_str_cat, inplace): assert_eq(pd_sr, cd_sr) - assert str(pd_sr) == str(cd_sr) + assert _fix_nullable_dtype_repr(str(pd_sr)) == str(cd_sr) pd_sr_1 = pd_sr.cat.add_categories(["d"], inplace=inplace) cd_sr_1 = cd_sr.cat.add_categories(["d"], inplace=inplace) @@ -437,7 +438,7 @@ def test_categorical_remove_categories(pd_str_cat, inplace): assert_eq(pd_sr, cd_sr) - assert str(pd_sr) == str(cd_sr) + assert _fix_nullable_dtype_repr(str(pd_sr)) == str(cd_sr) pd_sr_1 = pd_sr.cat.remove_categories(["a"], inplace=inplace) cd_sr_1 = cd_sr.cat.remove_categories(["a"], inplace=inplace) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index a5e666fd57c..052cb1f6ad2 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -174,7 +174,7 @@ def test_dt_index(data, field): def test_setitem_datetime(): df = DataFrame() df["date"] = pd.date_range("20010101", "20010105").values - assert np.issubdtype(df.date.dtype, np.datetime64) + assert isinstance(df.date.dtype, cudf.Datetime) def test_sort_datetime(): @@ -630,6 +630,7 @@ def test_cudf_to_datetime(data, dayfirst, infer_datetime_format): ], ) def test_to_datetime_errors(data): + from cudf.core.series import _fix_nullable_dtype_repr pd_data = data if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)): gd_data = cudf.from_pandas(pd_data) @@ -639,7 +640,7 @@ def test_to_datetime_errors(data): try: pd.to_datetime(pd_data) except Exception as e: - with pytest.raises(type(e), match=re.escape(str(e))): + with pytest.raises(type(e), match=re.escape(_fix_nullable_dtype_repr(str(e)))): cudf.to_datetime(gd_data) else: raise AssertionError("Was expecting `pd.to_datetime` to fail") diff --git a/python/cudf/cudf/tests/test_feather.py b/python/cudf/cudf/tests/test_feather.py index 7f4608fd514..953258bed9b 100644 --- a/python/cudf/cudf/tests/test_feather.py +++ b/python/cudf/cudf/tests/test_feather.py @@ -80,7 +80,6 @@ def test_feather_reader(feather_file, columns): .to_arrow(preserve_index=False) .to_pandas() ) - assert_eq(expect, got, check_categorical=False) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 7f609797397..29a7de22436 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -398,6 +398,11 @@ def get_time_unit(obj): ), ): return obj.time_unit + elif isinstance(obj, cudf.Generic): + return obj._time_unit + elif isinstance(obj.dtype, cudf.Generic): + return obj.dtype._time_unit time_unit, _ = np.datetime_data(obj.dtype) + return time_unit From 3c047ef2c4e282a179eb6e934fefe3e319088ecc Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 2 Sep 2020 11:37:51 -0700 Subject: [PATCH 40/80] fix indexing tests --- python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/core/dtypes.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 86eb7467db5..198329717e8 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1629,7 +1629,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): type=dtype.pa_type if dtype is not None else None, from_pandas=True if nan_as_null is None else nan_as_null) # todo: fix this ???? ???????? - as_column_dtype = cudf.dtype(pa_data.type) if not isinstance(pa_data.type, pa.lib.DictionaryType) else None + as_column_dtype = cudf.dtype(pa_data.type) if not isinstance(pa_data.type, (pa.lib.DictionaryType, pa.lib.ListType)) else None data = as_column(pa_data, dtype=as_column_dtype, nan_as_null=nan_as_null) except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError): diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index b47281e5c36..d6618422043 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -495,7 +495,7 @@ def __init__(self, element_type): self._typ = pa.list_(element_type._typ) else: element_type = cudf.utils.dtypes.np_to_pa_dtype( - np.dtype(element_type) + cudf.dtype(element_type) ) self._typ = pa.list_(element_type) @@ -513,6 +513,10 @@ def leaf_type(self): else: return self.element_type + @property + def kind(self): + return 'O' + @property def type(self): # TODO: we should change this to return something like a From a1395718ae710a204ad8983da1acaafd0972724a Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Sep 2020 06:17:10 -0700 Subject: [PATCH 41/80] less than 10 tests still failing --- python/cudf/cudf/_lib/parquet.pyx | 6 ++---- python/cudf/cudf/api/types.py | 9 +++------ python/cudf/cudf/core/column/string.py | 2 +- python/cudf/cudf/core/dataframe.py | 6 +++--- python/cudf/cudf/core/dtypes.py | 25 +++++++++++++++---------- python/cudf/cudf/core/reshape.py | 2 +- python/cudf/cudf/io/parquet.py | 1 + python/cudf/cudf/tests/test_orc.py | 2 +- 8 files changed, 27 insertions(+), 26 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index fd7e2cd847c..244a28a2868 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -103,7 +103,6 @@ cpdef generate_pandas_metadata(Table table, index): ) else: types.append(np_to_pa_dtype(col.dtype)) - # Indexes if index is not False: for name in table._index.names: @@ -135,16 +134,15 @@ cpdef generate_pandas_metadata(Table table, index): index_descriptors.append(descr) else: col_names.append(name) - + metadata_df = table.head(0).to_pandas() metadata = pa.pandas_compat.construct_metadata( - table, + metadata_df, col_names, index_levels, index_descriptors, index, types, ) - md = metadata[b'pandas'] json_str = md.decode("utf-8") return json_str diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 54e2d64b80e..3f8fe33e43f 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -106,10 +106,7 @@ def can_cast(dtype_l, dtype_r): return np.can_cast(dtype_l, dtype_r) -def result_type(dtype_l, dtype_r): - if isinstance(dtype_l, cudf.Generic): - dtype_l = dtype_l.to_numpy - if isinstance(dtype_r, cudf.Generic): - dtype_r = dtype_r.to_numpy +def result_type(*arrays_and_dtypes): - return cudf.dtype(np.result_type(dtype_l, dtype_r)) + arrays_and_dtypes = (d.to_numpy if isinstance(d, cudf.Generic) else d for d in arrays_and_dtypes) + return cudf.dtype(np.result_type(*arrays_and_dtypes)) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index d44ebcb474f..9f1535c5f95 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4531,7 +4531,7 @@ def _nbytes(self): return self.children[1].size def as_numerical_column(self, dtype, **kwargs): - out_dtype = cudf.dtype(dtype) + out_dtype = cudf.dtype(dtype) if dtype is not None else cudf.Float64Dtype() kwargs.update(dtype=out_dtype) if out_dtype.type is np.datetime64: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 833edc9e1c3..34e179e748f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6371,7 +6371,7 @@ def select_dtypes(self, include=None, exclude=None): @ioutils.doc_to_parquet() def to_parquet(self, path, *args, **kwargs): """{docstring}""" - from cudf.io import parquet as pq6 + from cudf.io import parquet as pq return pq.to_parquet(self, path, *args, **kwargs) @@ -6473,12 +6473,12 @@ def stack(self, level=-1, dropna=True): ) # Collect datatypes and cast columns as that type - common_type = np.result_type(*self.dtypes) + common_type = cudf.api.types.result_type(*self.dtypes) homogenized = DataFrame( { c: ( self._data[c].astype(common_type) - if not np.issubdtype(self._data[c].dtype, common_type) + if not isinstance(self._data[c].dtype, type(common_type)) else self._data[c] ) for c in self._data diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index d6618422043..9e7bdb670a8 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -274,14 +274,14 @@ def __init__(self): def make_dtype_from_string(obj): if obj in {"str", "string", "object", "O"}: return StringDtype() - elif "datetime" in obj: - if obj == "datetime64[ns]": + elif "datetime" in obj or "Datetime" in obj: + if obj in {"datetime64[ns]", 'Datetime64NS'}: return Datetime64NSDtype() - elif obj == "datetime64[us]": + elif obj in {"datetime64[us]", "Datetime64US"}: return Datetime64USDtype() - elif obj == "datetime64[ms]": + elif obj in {"datetime64[ms]", "Datetime64MS"}: return Datetime64MSDtype() - elif obj == "datetime64[s]": + elif obj in {"datetime64[s]", "Datetime64MS"}: return Datetime64SDtype() elif "int" in obj or "Int" in obj: if obj in {"int", "Int", "int64", "Int64"}: @@ -310,15 +310,19 @@ def make_dtype_from_string(obj): elif "category" in obj: return "category" elif "timedelta" in obj: - if obj == 'timedelta64[ns]': + if obj in {'timedelta64[ns]', "Timedelta64NS"}: return Timedelta64NSDtype() - if obj == 'timedelta64[us]': + if obj in {'timedelta64[us]', "Timedelta64US"}: return Timedelta64USDtype() - if obj == 'timedelta64[ms]': + if obj in {'timedelta64[ms]', "Timedelta64MS"}: return Timedelta64MSDtype() - if obj == 'timedelta64[s]': + if obj in {'timedelta64[s]', "Timedelta64S"}: return Timedelta64SDtype() - + else: + try: + return np_to_cudf_dtypes[np.dtype(obj)] + except: + return None def make_dtype_from_numpy(obj): np_to_pd_types = {v: k for k, v in pd_to_np_dtypes.items()} result = np_to_pd_types.get(obj) @@ -570,6 +574,7 @@ def __repr__(self): pa.duration("us"): Timedelta64USDtype(), pa.duration("ms"): Timedelta64MSDtype(), pa.duration("s"): Timedelta64SDtype(), + pa.date32(): Datetime64NSDtype(), pa.null(): None } diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index b423a46b88b..c549a609769 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -604,7 +604,7 @@ def length_check(obj, name): unique = df[name].unique() if not dummy_na: - if np.issubdtype(unique.dtype, np.floating): + if isinstance(unique.dtype, cudf.Floating): unique = unique.nans_to_nulls() unique = unique.dropna() diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 50bbe1c20c2..18f26c6b2dc 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -6,6 +6,7 @@ from pyarrow import parquet as pq from pyarrow.compat import guid + import cudf from cudf._lib import parquet as libparquet from cudf.utils import ioutils diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 763f810f715..28d84561f5d 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -156,7 +156,7 @@ def test_orc_reader_trailing_nulls(datadir): # PANDAS uses NaN to represent invalid data, which forces float dtype # For comparison, we can replace NaN with 0 and cast to the cuDF dtype for col in expect.columns: - expect[col] = expect[col].astype(got[col].dtype) + expect[col] = expect[col].astype(got[col].dtype.to_numpy) assert_eq(expect, got, check_categorical=False) From ea24184ac1c4879e828379234583861a7113c12e Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Sep 2020 09:41:30 -0700 Subject: [PATCH 42/80] fix bugs --- python/cudf/cudf/core/column/lists.py | 1 - python/cudf/cudf/core/column/numerical.py | 5 ----- 2 files changed, 6 deletions(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index eed89885b2c..295c2fa250f 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -4,7 +4,6 @@ from cudf.core.column.methods import ColumnMethodsMixin from cudf.core.dtypes import ListDtype from cudf.api.types import is_list_dtype -from cudf.utils.utils import buffers_from_pyarrow class ListColumn(ColumnBase): diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 03989c130ed..a55c2684656 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -14,13 +14,8 @@ min_signed_type, numeric_normalize_types, ) -<<<<<<< HEAD -from cudf.utils.utils import buffers_from_pyarrow from cudf.core.dtypes import Float64Dtype -======= - ->>>>>>> branch-0.16 class NumericalColumn(column.ColumnBase): def __init__( self, data, dtype, mask=None, size=None, offset=0, null_count=None From ddf340b51cb986fd7267f479a85c8361d095c792 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Sep 2020 12:56:27 -0700 Subject: [PATCH 43/80] fix a few more bugs --- python/cudf/cudf/_lib/aggregation.pyx | 18 +++++++----------- python/cudf/cudf/_lib/groupby.pyx | 2 +- python/cudf/cudf/core/dataframe.py | 3 ++- python/cudf/cudf/core/dtypes.py | 2 +- python/cudf/cudf/tests/test_parquet.py | 5 ++--- 5 files changed, 13 insertions(+), 17 deletions(-) diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 7392432bb64..19634d78061 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -16,8 +16,10 @@ from cudf._lib.types cimport ( underlying_type_t_interpolation, underlying_type_t_null_policy, underlying_type_t_type_id, + _Dtype ) from cudf._lib.types import Interpolation +from cudf.core.dtypes import dtype as cudf_dtype try: # Numba >= 0.49 @@ -241,24 +243,18 @@ cdef class _AggregationFactory: cdef string cpp_str # Handling UDF type - nb_type = numpy_support.from_dtype(kwargs['dtype']) + nb_type = numpy_support.from_dtype(kwargs['dtype'].to_numpy) type_signature = (nb_type[:],) compiled_op = cudautils.compile_udf(op, type_signature) - output_np_dtype = np.dtype(compiled_op[1]) + output_np_dtype = cudf_dtype(np.dtype(compiled_op[1])) cpp_str = compiled_op[0].encode('UTF-8') - if output_np_dtype not in np_to_cudf_types: + if cudf_dtype(output_np_dtype) not in np_to_cudf_types: raise TypeError( "Result of window function has unsupported dtype {}" .format(op[1]) ) - tid = ( - ( - ( - np_to_cudf_types[output_np_dtype] - ) - ) - ) - out_dtype = libcudf_types.data_type(tid) + cdef _Dtype pydtype = output_np_dtype + out_dtype = pydtype.get_libcudf_type() agg.c_obj = move(libcudf_aggregation.make_udf_aggregation( libcudf_aggregation.udf_type.PTX, cpp_str, out_dtype diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 5175aafe9cb..a909d5f5762 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -178,7 +178,7 @@ def _drop_unsupported_aggs(Table values, aggs): if all(len(v) == 0 for v in aggs.values()): return aggs - from cudf.utils.dtypes import ( + from cudf.api.types import ( is_categorical_dtype, is_string_dtype, is_list_dtype diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index aa6de4b5835..d5348def80e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -4882,8 +4882,9 @@ def to_arrow(self, preserve_index=True): index_descr.append(descr) out = super(DataFrame, data).to_arrow() + metadata_df = self.head(0).to_pandas() metadata = pa.pandas_compat.construct_metadata( - self, + metadata_df, out.schema.names, [self.index], index_descr, diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 9e7bdb670a8..1693e6683d6 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -281,7 +281,7 @@ def make_dtype_from_string(obj): return Datetime64USDtype() elif obj in {"datetime64[ms]", "Datetime64MS"}: return Datetime64MSDtype() - elif obj in {"datetime64[s]", "Datetime64MS"}: + elif obj in {"datetime64[s]", "Datetime64S"}: return Datetime64SDtype() elif "int" in obj or "Int" in obj: if obj in {"int", "Int", "int64", "Int64"}: diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index e5398befd4f..cf926f39da2 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -378,9 +378,8 @@ def test_parquet_read_filtered_everything(tmpdir): # Check filter df_filtered = cudf.read_parquet(fname, filters=[("x", "==", 12)]) assert_eq(len(df_filtered), 0) - assert_eq(df_filtered["x"].dtype, "int64") - assert_eq(df_filtered["y"].dtype, "object") - + assert isinstance(df_filtered["x"].dtype, cudf.Int64Dtype) + assert isinstance(df_filtered["y"].dtype, cudf.StringDtype) def test_parquet_read_filtered_multiple_files(tmpdir): # Generate data From 4a140425a41cb5d47632b8d82eeb1248c8c145b1 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Sep 2020 13:22:20 -0700 Subject: [PATCH 44/80] construct from string tests --- python/cudf/cudf/tests/test_dtypes.py | 48 +++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index a02f01327bf..9a3f9a285a5 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -75,3 +75,51 @@ def test_nested_dtype(): expect = ListDtype("int32") got = dt.element_type assert expect == got + +@pytest.mark.parametrize('string,dtype', [ + ('uint8', cudf.UInt8Dtype), + ('uint16', cudf.UInt16Dtype), + ('uint32', cudf.UInt32Dtype), + ('uint64', cudf.UInt64Dtype), + ('UInt8', cudf.UInt8Dtype), + ('UInt16', cudf.UInt16Dtype), + ('UInt32', cudf.UInt32Dtype), + ('UInt64', cudf.UInt64Dtype), + ('int8', cudf.Int8Dtype), + ('int16', cudf.Int16Dtype), + ('int32', cudf.Int32Dtype), + ('int64', cudf.Int64Dtype), + ('Int8', cudf.Int8Dtype), + ('Int16', cudf.Int16Dtype), + ('Int32', cudf.Int32Dtype), + ('Int64', cudf.Int64Dtype), + ('int', cudf.Int64Dtype), + ('float32', cudf.Float32Dtype), + ('float64', cudf.Float64Dtype), + ('Float32', cudf.Float32Dtype), + ('Float64', cudf.Float64Dtype), + ('float', cudf.Float64Dtype), + ('bool', cudf.BooleanDtype), + ('Boolean', cudf.BooleanDtype), + ('string', cudf.StringDtype), + ('String', cudf.StringDtype), + ('object', cudf.StringDtype), + ('datetime64[ns]', cudf.Datetime64NSDtype), + ('datetime64[us]', cudf.Datetime64USDtype), + ('datetime64[ms]', cudf.Datetime64MSDtype), + ('datetime64[s]', cudf.Datetime64SDtype), + ('Datetime64NS', cudf.Datetime64NSDtype), + ('Datetime64US', cudf.Datetime64USDtype), + ('Datetime64MS', cudf.Datetime64MSDtype), + ('Datetime64S', cudf.Datetime64SDtype), + ('timedelta64[ns]', cudf.Timedelta64NSDtype), + ('timedelta64[us]', cudf.Timedelta64USDtype), + ('timedelta64[ms]', cudf.Timedelta64MSDtype), + ('timedelta64[s]', cudf.Timedelta64SDtype), + ('Timedelta64NS', cudf.Timedelta64NSDtype), + ('Timedelta64US', cudf.Timedelta64USDtype), + ('Timedelta64MS', cudf.Timedelta64MSDtype), + ('Timedelta64S', cudf.Timedelta64SDtype), + ]) +def test_cudf_dtype_string_construction(string, dtype): + assert type(cudf.dtype(string) == dtype) From 55cec7e9458c02598f0f45546b9d4863cf3ae9c6 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Sep 2020 15:12:33 -0700 Subject: [PATCH 45/80] clean up dtypes.py --- python/cudf/cudf/core/dtypes.py | 203 ++++++++++++++------------------ 1 file changed, 90 insertions(+), 113 deletions(-) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 1693e6683d6..d54a73533e6 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -105,9 +105,6 @@ def type(self): @property def kind(self): - if isinstance(self, Floating): - return "f" - else: return self.to_pandas.kind @property @@ -115,32 +112,49 @@ def name(self): return self._name def __repr__(self): - return self.pa_type.__repr__() + return self.name def __hash__(self): return hash(self.__repr__()) + + def _raise_construction_error(self): + raise TypeError(f"Cannot create {type(self)} instances") + + class Number(Generic): - pass + def __init__(self): + self._raise_construction_error() class Integer(Number): - pass + def __init__(self): + self._raise_construction_error() class SignedInteger(Integer): - pass - + def __init__(self): + self._raise_construction_error() + class UnsignedInteger(Integer): - pass + def __init__(self): + self._raise_construction_error() + class Inexact(Number): - pass - + def __init__(self): + self._raise_construction_error() + class Floating(Inexact): - pass + def __init__(self): + self._raise_construction_error() + + @property + def kind(self): + return "f" class Flexible(Generic): - pass - + def __init__(self): + self._construction_error() + class Datetime(Generic): pass @@ -264,120 +278,59 @@ def __init__(self): self._time_unit = 's' class StringDtype(Flexible): - is_string = True def __init__(self): self.pa_type = pa.string() self._name = "String" -def make_dtype_from_string(obj): - if obj in {"str", "string", "object", "O"}: - return StringDtype() - elif "datetime" in obj or "Datetime" in obj: - if obj in {"datetime64[ns]", 'Datetime64NS'}: - return Datetime64NSDtype() - elif obj in {"datetime64[us]", "Datetime64US"}: - return Datetime64USDtype() - elif obj in {"datetime64[ms]", "Datetime64MS"}: - return Datetime64MSDtype() - elif obj in {"datetime64[s]", "Datetime64S"}: - return Datetime64SDtype() - elif "int" in obj or "Int" in obj: - if obj in {"int", "Int", "int64", "Int64"}: - return Int64Dtype() - elif obj in {"int32", "Int32"}: - return Int32Dtype() - elif obj in {"int16", "Int16"}: - return Int16Dtype() - elif obj in {"int8", "Int8"}: - return Int8Dtype() - elif obj in {"uint64", "UInt64"}: - return UInt64Dtype() - elif obj in {"uint32", "UInt32"}: - return UInt32Dtype() - elif obj in {"uint16", "UInt16"}: - return UInt16Dtype() - elif obj in {"uint8", "UInt8"}: - return UInt8Dtype() - elif "float" in obj or "Float" in obj: - if obj in {"float64", "Float64", 'float', 'Float'}: - return Float64Dtype() - elif obj in {"float32", "Float32"}: - return Float32Dtype() - elif "bool" in obj: - return BooleanDtype() - elif "category" in obj: - return "category" - elif "timedelta" in obj: - if obj in {'timedelta64[ns]', "Timedelta64NS"}: - return Timedelta64NSDtype() - if obj in {'timedelta64[us]', "Timedelta64US"}: - return Timedelta64USDtype() - if obj in {'timedelta64[ms]', "Timedelta64MS"}: - return Timedelta64MSDtype() - if obj in {'timedelta64[s]', "Timedelta64S"}: - return Timedelta64SDtype() - else: - try: - return np_to_cudf_dtypes[np.dtype(obj)] - except: - return None -def make_dtype_from_numpy(obj): - np_to_pd_types = {v: k for k, v in pd_to_np_dtypes.items()} - result = np_to_pd_types.get(obj) - return result +def cudf_dtype_from_string(obj): + try: + np_dtype = np.dtype(obj) + return cudf_dtype_from_numpy(np_dtype) + except TypeError: + return _cudf_dtype_from_string.get(obj, None) -def dtype(obj): +def cudf_dtype_from_numpy(obj): + if obj is np.str_: + return StringDtype() + elif obj is np.number: + return cudf.Number + elif obj is np.datetime64: + return cudf.Datetime + elif obj is np.timedelta64: + return cudf.Timedelta + dtype = np.dtype(obj) + return _cudf_dtype_from_numpy.get(obj, None) - if obj is None: - return None - if obj is str: - return cudf.StringDtype() - if obj is int: - return cudf.Int64Dtype() - if obj is float: - return cudf.Float64Dtype() +def dtype(obj): + if isinstance(obj, Generic): + return obj + elif type(obj) is type and issubclass(obj, Generic): + return obj() + elif isinstance(obj, np.dtype) or (isinstance(obj, type) and issubclass(obj, np.generic)): + return cudf_dtype_from_numpy(obj) + elif isinstance(obj, str): + return cudf_dtype_from_string(obj) if isinstance(obj, pd.CategoricalDtype): return cudf.CategoricalDtype.from_pandas(obj) if isinstance(obj, CategoricalDtype): if obj is 'category': return cudf.CategoricalDtype() return obj - elif isinstance(obj, Generic): - return obj - elif issubclass(obj.__class__, Generic): - return obj() - if isinstance(obj, np.dtype): - if obj.type is np.str_: - return StringDtype() - else: - return np_to_cudf_dtypes.get(obj, None) - elif isinstance(obj, pa.lib.DataType): - return pa_to_cudf_dtypes[obj] - elif isinstance(obj, str): - return make_dtype_from_string(obj) - elif obj in pd_to_cudf_dtypes.keys(): - return pd_to_cudf_dtypes[obj] + elif obj in _pd_to_cudf_dtypes.keys(): + return _pd_to_cudf_dtypes[obj] elif isinstance(obj, pd.core.arrays.numpy_.PandasDtype): - return make_dtype_from_string(obj.name) - elif obj is np.number: - return cudf.Number - elif obj is np.datetime64: - return cudf.Datetime - elif obj is np.timedelta64: - return cudf.Timedelta - - + return cudf_dtype_from_string(obj.name) + elif obj is str: + return cudf.StringDtype() + elif obj is int: + return cudf.Int64Dtype() + elif obj in {float, None}: + return cudf.Float64Dtype() else: - try: - if issubclass(obj, np.generic): - return np_to_cudf_dtypes[np.dtype(obj)] - except: - import pdb - pdb.set_trace() - + raise TypeError(f"Could not find a cuDF dtype matching {obj}") class CategoricalDtype(Generic): @@ -578,7 +531,7 @@ def __repr__(self): pa.null(): None } -np_to_cudf_dtypes = { +_cudf_dtype_from_numpy = { np.dtype("int8"): Int8Dtype(), np.dtype("int16"): Int16Dtype(), np.dtype("int32"): Int32Dtype(), @@ -602,7 +555,7 @@ def __repr__(self): np.dtype("timedelta64[s]"): Timedelta64SDtype(), } -pd_to_cudf_dtypes = { +_pd_to_cudf_dtypes = { pd.Int8Dtype(): Int8Dtype(), pd.Int16Dtype(): Int16Dtype(), pd.Int32Dtype(): Int32Dtype(), @@ -614,3 +567,27 @@ def __repr__(self): pd.BooleanDtype(): BooleanDtype(), pd.StringDtype(): StringDtype(), } + +_cudf_dtype_from_string = { + 'UInt8': UInt8Dtype, + 'UInt16': UInt16Dtype, + 'UInt32': UInt32Dtype, + 'UInt64': UInt64Dtype, + 'Int8': Int8Dtype, + 'Int16': Int16Dtype, + 'Int32': Int32Dtype, + 'Int64': Int64Dtype, + 'Float': Float64Dtype, + 'Float32': Float32Dtype, + 'Float64': Float64Dtype, + 'Boolean': BooleanDtype, + 'String': StringDtype, + 'Datetime64NS': Datetime64NSDtype, + 'Datetime64US': Datetime64USDtype, + 'Datetime64MS': Datetime64MSDtype, + 'Datetime64S': Datetime64SDtype, + 'Timedelta64NS': Timedelta64NSDtype, + 'Timedelta64US': Timedelta64USDtype, + 'Timedelta64MS': Timedelta64MSDtype, + 'Timedelta64S': Timedelta64SDtype, +} From c28c7b6fea0d370bbb7068f4f0d88fd765362900 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 4 Sep 2020 06:13:01 -0700 Subject: [PATCH 46/80] fixed some bugs --- python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/core/dataframe.py | 4 ++-- python/cudf/cudf/core/dtypes.py | 22 +++++++++++++--------- python/cudf/cudf/core/series.py | 3 --- 4 files changed, 16 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 8914cd8ab39..b6d66d4ba8b 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1398,7 +1398,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): * pandas.Categorical objects """ - dtype = cudf.dtype(dtype) if dtype is not None else None + dtype = cudf.dtype(dtype) if isinstance(arbitrary, ColumnBase): if dtype is not None: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d5348def80e..ed12e34a688 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6088,7 +6088,7 @@ def kurtosis( msg = "Kurtosis only supports int, float, and bool dtypes." raise NotImplementedError(msg) - self = self.select_dtypes(include=[cudf.Number(), cudf.BooleanDtype()]) + self = self.select_dtypes(include=[cudf.Number, cudf.BooleanDtype]) return self._apply_support_method( "kurtosis", axis=axis, @@ -6134,7 +6134,7 @@ def skew( msg = "Skew only supports int, float, and bool dtypes." raise NotImplementedError(msg) - self = self.select_dtypes(include=[cudf.Number(), cudf.BooleanDtype()]) + self = self.select_dtypes(include=[cudf.Number, cudf.BooleanDtype]) return self._apply_support_method( "skew", axis=axis, diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index d54a73533e6..509266af722 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -285,6 +285,8 @@ def __init__(self): def cudf_dtype_from_string(obj): + if obj == 'category': + return obj try: np_dtype = np.dtype(obj) return cudf_dtype_from_numpy(np_dtype) @@ -302,35 +304,37 @@ def cudf_dtype_from_numpy(obj): elif obj is np.timedelta64: return cudf.Timedelta dtype = np.dtype(obj) - return _cudf_dtype_from_numpy.get(obj, None) + return _cudf_dtype_from_numpy.get(dtype, None) def dtype(obj): if isinstance(obj, Generic): return obj elif type(obj) is type and issubclass(obj, Generic): return obj() - elif isinstance(obj, np.dtype) or (isinstance(obj, type) and issubclass(obj, np.generic)): + elif isinstance(obj, np.dtype) or (isinstance(obj, type) and issubclass(obj, (np.generic, np.dtype))): return cudf_dtype_from_numpy(obj) elif isinstance(obj, str): return cudf_dtype_from_string(obj) if isinstance(obj, pd.CategoricalDtype): return cudf.CategoricalDtype.from_pandas(obj) - if isinstance(obj, CategoricalDtype): - if obj is 'category': - return cudf.CategoricalDtype() - return obj elif obj in _pd_to_cudf_dtypes.keys(): return _pd_to_cudf_dtypes[obj] elif isinstance(obj, pd.core.arrays.numpy_.PandasDtype): return cudf_dtype_from_string(obj.name) + elif isinstance(obj, pa.lib.DataType): + return cudf_dtype_from_pyarrow[obj] elif obj is str: return cudf.StringDtype() elif obj is int: return cudf.Int64Dtype() - elif obj in {float, None}: + elif obj is float: return cudf.Float64Dtype() + elif obj is None: + return None else: - raise TypeError(f"Could not find a cuDF dtype matching {obj}") + raise TypeError + + #raise TypeError(f"Could not find a cuDF dtype matching {obj}") class CategoricalDtype(Generic): @@ -506,7 +510,7 @@ def __repr__(self): return f"ListDtype({self.element_type})" -pa_to_cudf_dtypes = { +cudf_dtype_from_pyarrow = { pa.uint8(): UInt8Dtype(), pa.uint16(): UInt16Dtype(), pa.uint32(): UInt32Dtype(), diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 5aefad92e1e..91f7e7d0a45 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -12,8 +12,6 @@ from pandas._config import get_option from pandas.api.types import is_dict_like -from cudf.core.dtypes import dtype as cudf_dtype - import cudf from cudf import _lib as libcudf from cudf._lib.nvtx import annotate @@ -145,7 +143,6 @@ def __init__( ``null`` values. If ``False``, leaves ``np.nan`` values as is. """ - dtype = cudf_dtype(dtype) if isinstance(data, pd.Series): if name is None: name = data.name From bad1dc231d6fb5ec8d5aee6208c2d8f2ffe6609f Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 4 Sep 2020 12:10:51 -0700 Subject: [PATCH 47/80] a little iteration on dtypes.py --- python/cudf/cudf/core/column/numerical.py | 7 -- python/cudf/cudf/core/dtypes.py | 134 ++++++++++------------ python/cudf/cudf/core/series.py | 2 +- 3 files changed, 60 insertions(+), 83 deletions(-) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index a55c2684656..d3b3fe7d0ee 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -28,13 +28,6 @@ def __init__( The dtype associated with the data Buffer mask : Buffer, optional """ - try: - cudf.dtype(dtype) - dtype.itemsize - - except: - import pdb - pdb.set_trace() dtype = cudf.dtype(dtype) if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 509266af722..d9d0ea07183 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -9,54 +9,6 @@ from cudf._lib.types import _Dtype import cudf -pa_to_pd_dtypes = { - pa.uint8(): pd.UInt8Dtype(), - pa.uint16(): pd.UInt16Dtype(), - pa.uint32(): pd.UInt32Dtype(), - pa.uint64(): pd.UInt64Dtype(), - pa.int8(): pd.Int8Dtype(), - pa.int16(): pd.Int16Dtype(), - pa.int32(): pd.Int32Dtype(), - pa.int64(): pd.Int64Dtype(), - pa.bool_(): pd.BooleanDtype(), - pa.string(): pd.StringDtype(), - pa.float32(): np.float32(), - pa.float64(): np.float64(), - pa.timestamp("ns"): np.dtype("datetime64[ns]"), - pa.timestamp("us"): np.dtype("datetime64[us]"), - pa.timestamp("ms"): np.dtype("datetime64[ms]"), - pa.timestamp("s"): np.dtype("datetime64[s]"), - pa.duration("ns"): np.dtype('timedelta64[ns]'), - pa.duration("us"): np.dtype('timedelta64[us]'), - pa.duration("ms"): np.dtype('timedelta64[ms]'), - pa.duration("s"): np.dtype('timedelta64[s]'), -} - -pa_to_np_dtypes = { - pa.uint8(): np.dtype("uint8"), - pa.uint16(): np.dtype("uint16"), - pa.uint32(): np.dtype("uint32"), - pa.uint64(): np.dtype("uint64"), - pa.int8(): np.dtype("int8"), - pa.int16(): np.dtype("int16"), - pa.int32(): np.dtype("int32"), - pa.int64(): np.dtype("int64"), - pa.bool_(): np.dtype("bool"), - pa.string(): np.dtype("object"), - pa.float32(): np.dtype("float32"), - pa.float64(): np.dtype("float64"), - pa.timestamp("ns"): np.dtype("datetime64[ns]"), - pa.timestamp("us"): np.dtype("datetime64[us]"), - pa.timestamp("ms"): np.dtype("datetime64[ms]"), - pa.timestamp("s"): np.dtype("datetime64[s]"), - pa.duration("ns"): np.dtype('timedelta64[ns]'), - pa.duration("us"): np.dtype('timedelta64[us]'), - pa.duration("ms"): np.dtype('timedelta64[ms]'), - pa.duration("s"): np.dtype('timedelta64[s]'), - None: None, -} - - class Generic(ExtensionDtype, _Dtype): pa_type = None @@ -86,11 +38,11 @@ def num(self): @property def to_numpy(self): - return pa_to_np_dtypes[self.pa_type] + return np.dtype(self.pa_type.to_pandas_dtype()) @property def to_pandas(self): - return pa_to_pd_dtypes[self.pa_type] + return pd.api.types.pandas_dtype(self.name) @property def itemsize(self): @@ -138,7 +90,6 @@ class UnsignedInteger(Integer): def __init__(self): self._raise_construction_error() - class Inexact(Number): def __init__(self): self._raise_construction_error() @@ -155,11 +106,27 @@ class Flexible(Generic): def __init__(self): self._construction_error() -class Datetime(Generic): - pass +class Datetime(Generic): + + @property + def to_numpy(self): + return {v:k for k,v in _cudf_dtype_from_numpy.items()}[self] + + @property + def to_pandas(self): + # pandas only supports nanos + return np.dtype('datetime64[ns]') class Timedelta(Generic): - pass + + @property + def to_numpy(self): + return {v:k for k,v in _cudf_dtype_from_numpy.items()}[self] + + @property + def to_pandas(self): + # pandas only supports nanos + return np.dtype('timedelta64[ns]') class UInt8Dtype(UnsignedInteger): def __init__(self): @@ -224,7 +191,7 @@ class BooleanDtype(Generic): def __init__(self): self.pa_type = pa.bool_() - self._name = "Boolean" + self._name = "boolean" class Datetime64NSDtype(Datetime): def __init__(self): @@ -281,7 +248,7 @@ class StringDtype(Flexible): def __init__(self): self.pa_type = pa.string() - self._name = "String" + self._name = "string" def cudf_dtype_from_string(obj): @@ -291,7 +258,10 @@ def cudf_dtype_from_string(obj): np_dtype = np.dtype(obj) return cudf_dtype_from_numpy(np_dtype) except TypeError: - return _cudf_dtype_from_string.get(obj, None) + result = _cudf_dtype_from_string.get(obj, None) + if not result: + raise TypeError(f"Could not find a cuDF dtype matching {obj}") + return result def cudf_dtype_from_numpy(obj): @@ -304,7 +274,22 @@ def cudf_dtype_from_numpy(obj): elif obj is np.timedelta64: return cudf.Timedelta dtype = np.dtype(obj) - return _cudf_dtype_from_numpy.get(dtype, None) + if dtype.type is np.str_: + return StringDtype() + result = _cudf_dtype_from_numpy.get(dtype, None) + if not result: + raise TypeError(f"Could not find a cuDF dtype matching {obj}") + return result + +def cudf_dtype_from_pandas(obj): + if isinstance(obj, pd.core.arrays.numpy_.PandasDtype): + try: + return cudf_dtype_from_numpy(obj.numpy_dtype) + except TypeError: + result = _cudf_dtype_from_pandas.get(obj, None) + if not result: + raise TypeError(f"Could not find a cuDF dtype matching {obj}") + return result def dtype(obj): if isinstance(obj, Generic): @@ -317,10 +302,8 @@ def dtype(obj): return cudf_dtype_from_string(obj) if isinstance(obj, pd.CategoricalDtype): return cudf.CategoricalDtype.from_pandas(obj) - elif obj in _pd_to_cudf_dtypes.keys(): - return _pd_to_cudf_dtypes[obj] - elif isinstance(obj, pd.core.arrays.numpy_.PandasDtype): - return cudf_dtype_from_string(obj.name) + elif isinstance(obj, (ExtensionDtype, pd.core.arrays.numpy_.PandasDtype)): + return cudf_dtype_from_pandas(obj) elif isinstance(obj, pa.lib.DataType): return cudf_dtype_from_pyarrow[obj] elif obj is str: @@ -332,6 +315,7 @@ def dtype(obj): elif obj is None: return None else: + raise TypeError #raise TypeError(f"Could not find a cuDF dtype matching {obj}") @@ -559,19 +543,6 @@ def __repr__(self): np.dtype("timedelta64[s]"): Timedelta64SDtype(), } -_pd_to_cudf_dtypes = { - pd.Int8Dtype(): Int8Dtype(), - pd.Int16Dtype(): Int16Dtype(), - pd.Int32Dtype(): Int32Dtype(), - pd.Int64Dtype(): Int64Dtype(), - pd.UInt8Dtype(): UInt8Dtype(), - pd.UInt16Dtype(): UInt16Dtype(), - pd.UInt32Dtype(): UInt32Dtype(), - pd.UInt64Dtype(): UInt64Dtype(), - pd.BooleanDtype(): BooleanDtype(), - pd.StringDtype(): StringDtype(), -} - _cudf_dtype_from_string = { 'UInt8': UInt8Dtype, 'UInt16': UInt16Dtype, @@ -595,3 +566,16 @@ def __repr__(self): 'Timedelta64MS': Timedelta64MSDtype, 'Timedelta64S': Timedelta64SDtype, } + +_cudf_dtype_from_pandas = { + pd.UInt8Dtype(): UInt8Dtype(), + pd.UInt16Dtype(): UInt16Dtype(), + pd.UInt32Dtype(): UInt32Dtype(), + pd.UInt64Dtype(): UInt64Dtype(), + pd.Int8Dtype(): Int8Dtype(), + pd.Int16Dtype(): Int16Dtype(), + pd.Int32Dtype(): Int32Dtype(), + pd.Int64Dtype(): Int64Dtype(), + pd.StringDtype(): StringDtype(), + pd.BooleanDtype(): BooleanDtype(), +} diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 91f7e7d0a45..17c4d5b8c58 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4363,7 +4363,7 @@ def keys(self): "UInt16": "Float32", "UInt32": "Float64", "UInt64": "Float64", - "Boolean": "Float32", + "boolean": "Float32", "Int": "Float", } From 09385079084170a5d3e518f66cb3f495b3e958d5 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 4 Sep 2020 12:35:06 -0700 Subject: [PATCH 48/80] implement the scalar type attribute --- python/cudf/cudf/_lib/copying.pyx | 3 ++- python/cudf/cudf/_lib/scalar.pyx | 8 ++++---- python/cudf/cudf/api/types.py | 3 +++ python/cudf/cudf/core/column/datetime.py | 2 +- python/cudf/cudf/core/column/timedelta.py | 2 +- python/cudf/cudf/core/dtypes.py | 11 +++++++---- python/cudf/cudf/core/index.py | 2 +- 7 files changed, 19 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 357b019c0f3..cab42bce789 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -1,6 +1,7 @@ # Copyright (c) 2020, NVIDIA CORPORATION. import pandas as pd +from cudf.api.types import is_integer_dtype from libcpp cimport bool from libcpp.memory cimport make_unique, unique_ptr @@ -129,7 +130,7 @@ def copy_range(Column input_column, def gather(Table source_table, Column gather_map, bool keep_index=True): - assert pd.api.types.is_integer_dtype(gather_map.dtype) + assert is_integer_dtype(gather_map.dtype) cdef unique_ptr[table] c_result cdef table_view source_table_view diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index ba2ecef1cd5..773ce54be31 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -107,13 +107,13 @@ cdef class Scalar: """ Returns a host copy of the underlying device scalar. """ - if pd.api.types.is_string_dtype(self.dtype): + if cudf.api.types.is_string_dtype(self.dtype): return _get_py_string_from_string(self.c_value) - elif pd.api.types.is_numeric_dtype(self.dtype): + elif cudf.api.types.is_numerical_dtype(self.dtype): return _get_np_scalar_from_numeric(self.c_value) - elif pd.api.types.is_datetime64_dtype(self.dtype): + elif cudf.api.types.is_datetime64_dtype(self.dtype): return _get_np_scalar_from_timestamp64(self.c_value) - elif pd.api.types.is_timedelta64_dtype(self.dtype): + elif cudf.api.types.is_timedelta64_dtype(self.dtype): return _get_np_scalar_from_timedelta64(self.c_value) else: raise ValueError( diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 3f8fe33e43f..f3a90e25765 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -18,6 +18,9 @@ def is_timedelta64_dtype(obj): def is_string_dtype(obj): return isinstance(obj, cudf.StringDtype) or (pd.api.types.is_string_dtype(obj) and not is_categorical_dtype(obj)) +def is_integer_dtype(obj): + return isinstance(obj, cudf.Integer) or pd.api.types.is_integer_dtype(obj) + def is_numerical_dtype(obj): if isinstance(obj, cudf.Generic): return isinstance(obj, (cudf.Number, cudf.BooleanDtype)) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index f293f480eb3..f1974b10ef7 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -61,7 +61,7 @@ def __init__( null_count=null_count, ) - if not (self.dtype.type is np.datetime64): + if not isinstance(self.dtype, cudf.Datetime): raise TypeError(f"{self.dtype} is not a supported datetime type") def __contains__(self, item): diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index a43f9ee98dd..e7d38223736 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -56,7 +56,7 @@ def __init__( null_count=null_count, ) - if not (self.dtype.type is np.timedelta64): + if not isinstance(self.dtype, cudf.Timedelta): raise TypeError(f"{self.dtype} is not a supported duration type") def __contains__(self, item): diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index d9d0ea07183..a01c580aced 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -50,10 +50,7 @@ def itemsize(self): @property def type(self): - if isinstance(self, (Floating, Datetime)): - return self.to_numpy.type - else: - return self.to_pandas.type + return CUDFType(self) @property def kind(self): @@ -250,6 +247,12 @@ def __init__(self): self.pa_type = pa.string() self._name = "string" +class CUDFType(object): + def __init__(self, parent_dtype): + self.parent_dtype = parent_dtype + + def __call__(self, arg): + return cudf._lib.scalar.Scalar(arg, dtype=self.parent_dtype) def cudf_dtype_from_string(obj): if obj == 'category': diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 420c99afe92..a5e7ddb7b6c 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1719,7 +1719,7 @@ def to_pandas(self): return pd.RangeIndex( start=self._start, stop=self._stop, - dtype=self.dtype, + dtype=self.dtype.to_pandas, name=self.name, ) From e5a489dbcaf04d5d411ffac68e6f1425fc340f44 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 4 Sep 2020 13:00:35 -0700 Subject: [PATCH 49/80] cleanup and style --- python/cudf/cudf/__init__.py | 48 ++++--- python/cudf/cudf/_lib/binaryop.pyx | 2 +- python/cudf/cudf/api/types.py | 49 +++++-- python/cudf/cudf/core/column/categorical.py | 6 +- python/cudf/cudf/core/column/column.py | 65 +++++++--- python/cudf/cudf/core/column/datetime.py | 15 +-- python/cudf/cudf/core/column/lists.py | 2 +- python/cudf/cudf/core/column/numerical.py | 17 ++- python/cudf/cudf/core/column/string.py | 22 ++-- python/cudf/cudf/core/column/timedelta.py | 25 ++-- python/cudf/cudf/core/dataframe.py | 13 +- python/cudf/cudf/core/dtypes.py | 134 +++++++++++--------- python/cudf/cudf/core/frame.py | 10 +- python/cudf/cudf/core/index.py | 4 +- python/cudf/cudf/core/indexing.py | 6 +- python/cudf/cudf/core/join/join.py | 18 ++- python/cudf/cudf/core/reshape.py | 3 +- python/cudf/cudf/core/series.py | 100 ++++++++------- python/cudf/cudf/io/parquet.py | 4 +- python/cudf/cudf/tests/test_categorical.py | 2 +- python/cudf/cudf/tests/test_column.py | 3 +- python/cudf/cudf/tests/test_dataframe.py | 4 +- python/cudf/cudf/tests/test_datetime.py | 5 +- python/cudf/cudf/tests/test_dtypes.py | 94 +++++++------- python/cudf/cudf/tests/test_joining.py | 2 +- python/cudf/cudf/tests/test_numerical.py | 2 +- python/cudf/cudf/tests/test_parquet.py | 6 +- python/cudf/cudf/tests/test_repr.py | 20 +-- python/cudf/cudf/tests/test_setitem.py | 2 +- python/cudf/cudf/tests/test_string.py | 2 +- python/cudf/cudf/tests/test_timedelta.py | 18 +-- python/cudf/cudf/utils/dtypes.py | 40 +++--- python/cudf/cudf/utils/utils.py | 4 +- 33 files changed, 437 insertions(+), 310 deletions(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index cff9df9f032..6e644cf09be 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -34,38 +34,36 @@ merge, ) from cudf.core.dtypes import ( - dtype, - Generic, + BooleanDtype, + CategoricalDtype, Datetime, - Floating, - Number, - Integer, + Datetime64MSDtype, + Datetime64NSDtype, + Datetime64SDtype, + Datetime64USDtype, Flexible, - Datetime, - Timedelta, - CategoricalDtype, + Float32Dtype, + Float64Dtype, + Floating, + Generic, Int8Dtype, - Int16Dtype, - Int32Dtype, - Int64Dtype, - UInt8Dtype, - UInt16Dtype, - UInt32Dtype, - UInt64Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + Integer, + Number, StringDtype, - Float32Dtype, - Float64Dtype, - BooleanDtype, - Datetime64NSDtype, - Datetime64USDtype, - Datetime64MSDtype, - Datetime64SDtype, + Timedelta, + Timedelta64MSDtype, Timedelta64NSDtype, + Timedelta64SDtype, Timedelta64USDtype, - Timedelta64MSDtype, - Timedelta64SDtype + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, + dtype, ) - from cudf.core.groupby import Grouper from cudf.core.ops import ( add, diff --git a/python/cudf/cudf/_lib/binaryop.pyx b/python/cudf/cudf/_lib/binaryop.pyx index 18c72da25f9..b2b2c217db2 100644 --- a/python/cudf/cudf/_lib/binaryop.pyx +++ b/python/cudf/cudf/_lib/binaryop.pyx @@ -23,7 +23,7 @@ from cudf._lib.cpp.types cimport ( type_id, ) -from cudf.utils.dtypes import is_string_dtype +from cudf.api.types import is_string_dtype from cudf._lib.cpp.binaryop cimport binary_operator cimport cudf._lib.cpp.binaryop as cpp_binaryop diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index f3a90e25765..732828085b4 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -1,26 +1,41 @@ -import pandas as pd -import cudf import numpy as np +import pandas as pd from pandas.core.dtypes.dtypes import CategoricalDtype, CategoricalDtypeType +import cudf + + def is_bool_dtype(obj): # todo - pd.api.types.is_bool_dtype should not give false, nor work at all probably if hasattr(obj, "dtype"): obj = obj.dtype - return isinstance(obj, cudf.BooleanDtype) or pd.api.types.is_bool_dtype(obj) + return isinstance(obj, cudf.BooleanDtype) or pd.api.types.is_bool_dtype( + obj + ) + def is_datetime64_dtype(obj): - return isinstance(obj, cudf.Datetime) or pd.api.types.is_datetime64_dtype(obj) + return isinstance(obj, cudf.Datetime) or pd.api.types.is_datetime64_dtype( + obj + ) + def is_timedelta64_dtype(obj): - return isinstance(obj, cudf.Timedelta) or pd.api.types.is_timedelta64_dtype(obj) + return isinstance( + obj, cudf.Timedelta + ) or pd.api.types.is_timedelta64_dtype(obj) + def is_string_dtype(obj): - return isinstance(obj, cudf.StringDtype) or (pd.api.types.is_string_dtype(obj) and not is_categorical_dtype(obj)) + return isinstance(obj, cudf.StringDtype) or ( + pd.api.types.is_string_dtype(obj) and not is_categorical_dtype(obj) + ) + def is_integer_dtype(obj): return isinstance(obj, cudf.Integer) or pd.api.types.is_integer_dtype(obj) + def is_numerical_dtype(obj): if isinstance(obj, cudf.Generic): return isinstance(obj, (cudf.Number, cudf.BooleanDtype)) @@ -34,11 +49,14 @@ def is_numerical_dtype(obj): or np.issubdtype(obj, np.signedinteger) ) + def is_categorical_dtype(obj): """Infer whether a given pandas, numpy, or cuDF Column, Series, or dtype is a pandas CategoricalDtype. """ - if isinstance(obj, cudf.Generic) and not isinstance(obj, cudf.CategoricalDtype): + if isinstance(obj, cudf.Generic) and not isinstance( + obj, cudf.CategoricalDtype + ): return False if obj is None: return False @@ -85,6 +103,7 @@ def is_categorical_dtype(obj): return True return pd.api.types.is_categorical_dtype(obj) + def is_list_dtype(obj): return ( type(obj) is cudf.core.dtypes.ListDtype @@ -95,12 +114,18 @@ def is_list_dtype(obj): or (hasattr(obj, "dtype") and is_list_dtype(obj.dtype)) ) + def find_common_type(array_types=[], scalar_types=[]): - array_types = [d.to_numpy if isinstance(d, cudf.Generic) else d for d in array_types] - scalar_types = [d.to_numpy if isinstance(d, cudf.Generic) else d for d in scalar_types] + array_types = [ + d.to_numpy if isinstance(d, cudf.Generic) else d for d in array_types + ] + scalar_types = [ + d.to_numpy if isinstance(d, cudf.Generic) else d for d in scalar_types + ] return cudf.dtype(np.find_common_type(array_types, scalar_types)) + def can_cast(dtype_l, dtype_r): if isinstance(dtype_l, cudf.Generic): dtype_l = dtype_l.to_numpy @@ -109,7 +134,11 @@ def can_cast(dtype_l, dtype_r): return np.can_cast(dtype_l, dtype_r) + def result_type(*arrays_and_dtypes): - arrays_and_dtypes = (d.to_numpy if isinstance(d, cudf.Generic) else d for d in arrays_and_dtypes) + arrays_and_dtypes = ( + d.to_numpy if isinstance(d, cudf.Generic) else d + for d in arrays_and_dtypes + ) return cudf.dtype(np.result_type(*arrays_and_dtypes)) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index d5931e439c5..f9108f4be64 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -7,6 +7,7 @@ import cudf from cudf import _lib as libcudf from cudf._lib.transform import bools_to_mask +from cudf.api.types import is_categorical_dtype from cudf.core.buffer import Buffer from cudf.core.column import column from cudf.core.column.methods import ColumnMethodsMixin @@ -16,7 +17,6 @@ min_signed_type, min_unsigned_type, ) -from cudf.api.types import is_categorical_dtype class CategoricalAccessor(ColumnMethodsMixin): @@ -305,7 +305,9 @@ def add_categories(self, new_categories, **kwargs): f"type-cast new_categories to the same type as " f"existing categories." ) - common_dtype = cudf.api.types.find_common_type([old_categories.dtype, new_categories.dtype], []) + common_dtype = cudf.api.types.find_common_type( + [old_categories.dtype, new_categories.dtype], [] + ) new_categories = new_categories.astype(common_dtype, copy=False) old_categories = old_categories.astype(common_dtype, copy=False) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index b6d66d4ba8b..0f5f29913b0 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -23,6 +23,12 @@ from cudf._lib.scalar import as_scalar from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count from cudf._lib.transform import bools_to_mask +from cudf.api.types import ( + is_bool_dtype, + is_categorical_dtype, + is_list_dtype, + is_string_dtype, +) from cudf.core.abc import Serializable from cudf.core.buffer import Buffer from cudf.core.dtypes import CategoricalDtype @@ -34,10 +40,8 @@ is_scalar, min_signed_type, min_unsigned_type, - np_to_pa_dtype, ) from cudf.utils.utils import mask_dtype -from cudf.api.types import is_categorical_dtype, is_list_dtype, is_numerical_dtype, is_string_dtype, is_bool_dtype class ColumnBase(Column, Serializable): @@ -202,14 +206,17 @@ def _concat(cls, objs, dtype=None): [ o for o in not_null_cols - if not isinstance(o.dtype, (cudf.Number)) or isinstance(o.dtype, cudf.Datetime) + if not isinstance(o.dtype, (cudf.Number)) + or isinstance(o.dtype, cudf.Datetime) ] ) == 0 ): cudf_col_dtypes = [o.dtype for o in not_null_cols] # Use NumPy to find a common dtype - cudf_common_dtype = cudf.api.types.find_common_type(cudf_col_dtypes, []) + cudf_common_dtype = cudf.api.types.find_common_type( + cudf_col_dtypes, [] + ) # Cast all columns to the common dtype for i in range(len(objs)): objs[i] = objs[i].astype(cudf_common_dtype) @@ -635,9 +642,13 @@ def __getitem__(self, arg): arg = as_column(arg) if len(arg) == 0: arg = as_column([], dtype="int32") - if pd.api.types.is_integer_dtype(arg.dtype) or isinstance(arg.dtype, cudf.Integer): + if pd.api.types.is_integer_dtype(arg.dtype) or isinstance( + arg.dtype, cudf.Integer + ): return self.take(arg) - if pd.api.types.is_bool_dtype(arg.dtype) or isinstance(arg.dtype, cudf.BooleanDtype): + if pd.api.types.is_bool_dtype(arg.dtype) or isinstance( + arg.dtype, cudf.BooleanDtype + ): return self.apply_boolean_mask(arg) raise NotImplementedError(type(arg)) @@ -1607,9 +1618,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): elif arb_dtype.kind in ("O", "U"): pa_data = pa.Array.from_pandas(arbitrary) - data = as_column( - pa_data, dtype=cudf.dtype(pa_data.type) - ) + data = as_column(pa_data, dtype=cudf.dtype(pa_data.type)) # There is no cast operation available for pa.Array from int to # str, Hence instead of handling in pa.Array block, we # will have to type-cast here. @@ -1681,24 +1690,44 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): if is_categorical_dtype(dtype): raise TypeError - pa_data = pa.array(arbitrary, - type=dtype.pa_type if dtype is not None else None, - from_pandas=True if nan_as_null is None else nan_as_null) + pa_data = pa.array( + arbitrary, + type=dtype.pa_type if dtype is not None else None, + from_pandas=True if nan_as_null is None else nan_as_null, + ) # todo: fix this ???? ???????? - as_column_dtype = cudf.dtype(pa_data.type) if not isinstance(pa_data.type, (pa.lib.DictionaryType, pa.lib.ListType)) else None - data = as_column(pa_data, dtype=as_column_dtype, nan_as_null=nan_as_null) + as_column_dtype = ( + cudf.dtype(pa_data.type) + if not isinstance( + pa_data.type, (pa.lib.DictionaryType, pa.lib.ListType) + ) + else None + ) + data = as_column( + pa_data, dtype=as_column_dtype, nan_as_null=nan_as_null + ) except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError): if is_categorical_dtype(dtype): - if isinstance(dtype, pd.CategoricalDtype) or dtype is 'category': - data = as_column(pd.Series(arbitrary, dtype=dtype), nan_as_null=nan_as_null) + if ( + isinstance(dtype, pd.CategoricalDtype) + or dtype is "category" # noqa: F632 + ): + data = as_column( + pd.Series(arbitrary, dtype=dtype), + nan_as_null=nan_as_null, + ) else: - data = as_column(arbitrary, nan_as_null=nan_as_null).astype(dtype) + data = as_column( + arbitrary, nan_as_null=nan_as_null + ).astype(dtype) elif isinstance(cudf.dtype(dtype), cudf.StringDtype): sr = pd.Series(arbitrary, dtype="str") data = as_column(sr, nan_as_null=nan_as_null) else: - native_dtype = dtype.to_numpy if dtype is not None else None + native_dtype = ( + dtype.to_numpy if dtype is not None else None + ) if dtype is None and pd.api.types.infer_dtype( arbitrary ) in ("mixed", "mixed-integer"): diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index f1974b10ef7..3894b5dd0dc 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -5,11 +5,11 @@ import numpy as np import pandas as pd -import cudf.core.dtypes as cudf_dtypes import cudf from cudf import _lib as libcudf from cudf._lib.nvtx import annotate from cudf._lib.scalar import Scalar, as_scalar +from cudf.core import dtypes as cudf_dtypes from cudf.core.column import column, string from cudf.utils.dtypes import is_scalar @@ -170,9 +170,9 @@ def as_string_column(self, dtype, **kwargs): ) kwargs["format"] = fmt if len(self) > 0: - return string._numeric_to_str_typecast_functions[ - self.dtype - ](self, **kwargs) + return string._numeric_to_str_typecast_functions[self.dtype]( + self, **kwargs + ) else: return column.column_empty(0, dtype="object", masked=False) @@ -184,7 +184,6 @@ def default_na_value(self): def binary_operator(self, op, rhs, reflect=False): lhs, rhs = self, rhs - lhs_dtype = cudf.dtype(lhs.dtype) rhs_dtype = cudf.dtype(rhs.dtype) if op in ("eq", "ne", "lt", "gt", "le", "ge"): @@ -203,9 +202,9 @@ def binary_operator(self, op, rhs, reflect=False): lhs_unit = units.index(lhs_time_unit) rhs_time_unit = cudf.utils.dtypes.get_time_unit(rhs) rhs_unit = units.index(rhs_time_unit) - out_dtype = cudf.dtype(np.dtype( - f"timedelta64[{units[max(lhs_unit, rhs_unit)]}]" - )) + out_dtype = cudf.dtype( + np.dtype(f"timedelta64[{units[max(lhs_unit, rhs_unit)]}]") + ) else: raise TypeError( f"Series of dtype {self.dtype} cannot perform " diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 295c2fa250f..fb308ce09cd 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -1,9 +1,9 @@ # Copyright (c) 2020, NVIDIA CORPORATION. +from cudf.api.types import is_list_dtype from cudf.core.column import ColumnBase from cudf.core.column.methods import ColumnMethodsMixin from cudf.core.dtypes import ListDtype -from cudf.api.types import is_list_dtype class ListColumn(ColumnBase): diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index d3b3fe7d0ee..2fa16c8458d 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -8,13 +8,14 @@ from cudf._lib.scalar import Scalar from cudf.core.buffer import Buffer from cudf.core.column import as_column, build_column, column, string +from cudf.core.dtypes import Float64Dtype from cudf.utils import cudautils, utils from cudf.utils.dtypes import ( min_column_type, min_signed_type, numeric_normalize_types, ) -from cudf.core.dtypes import Float64Dtype + class NumericalColumn(column.ColumnBase): def __init__( @@ -79,7 +80,9 @@ def binary_operator(self, binop, rhs, reflect=False): if reflect: tmp = self if isinstance(rhs, (NumericalColumn, Scalar)) or np.isscalar(rhs): - out_dtype = np.result_type(cudf.dtype(self.dtype).to_numpy, cudf.dtype(rhs.dtype).to_numpy) + out_dtype = np.result_type( + cudf.dtype(self.dtype).to_numpy, cudf.dtype(rhs.dtype).to_numpy + ) out_dtype = cudf.dtype(out_dtype) if binop in ["mod", "floordiv"]: if (cudf.dtype(tmp.dtype) in int_dtypes) and ( @@ -134,9 +137,9 @@ def int2ip(self): def as_string_column(self, dtype, **kwargs): if len(self) > 0: - return string._numeric_to_str_typecast_functions[ - self.dtype - ](self, **kwargs) + return string._numeric_to_str_typecast_functions[self.dtype]( + self, **kwargs + ) else: return as_column([], dtype="object") @@ -167,6 +170,7 @@ def as_numerical_column(self, dtype, **kwargs): return self if dtype is None: import pdb + pdb.set_trace() return libcudf.unary.cast(self, dtype) @@ -175,13 +179,14 @@ def sum(self, dtype=None): return libcudf.reduce.reduce("sum", self, dtype=dtype) except: import pdb + pdb.set_trace() def product(self, dtype=None): return libcudf.reduce.reduce("product", self, dtype=dtype) def mean(self, dtype=Float64Dtype()): - return libcudf.reduce.reduce("mean", self, dtype=dtype) + return libcudf.reduce.reduce("mean", self, dtype=dtype) def var(self, ddof=1, dtype=Float64Dtype()): return libcudf.reduce.reduce("var", self, dtype=dtype, ddof=ddof) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index a5978ff9e0a..224211d5b2d 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -132,20 +132,18 @@ slice_strings as cpp_slice_strings, ) from cudf._lib.strings.translate import ( - translate as cpp_translate, filter_characters as cpp_filter_characters, + translate as cpp_translate, ) from cudf._lib.strings.wrap import wrap as cpp_wrap +from cudf.api.types import is_list_dtype, is_string_dtype from cudf.core.buffer import Buffer from cudf.core.column import column, datetime from cudf.core.column.methods import ColumnMethodsMixin +from cudf.core.dtypes import dtype from cudf.utils import utils from cudf.utils.docutils import copy_docstring -from cudf.utils.dtypes import ( - can_convert_to_column, is_scalar -) -from cudf.api.types import is_list_dtype, is_string_dtype -from cudf.core.dtypes import dtype +from cudf.utils.dtypes import can_convert_to_column, is_scalar _str_to_numeric_typecast_functions = { dtype("int8"): str_cast.stoi8, @@ -4564,7 +4562,9 @@ def _nbytes(self): return self.children[1].size def as_numerical_column(self, dtype, **kwargs): - out_dtype = cudf.dtype(dtype) if dtype is not None else cudf.Float64Dtype() + out_dtype = ( + cudf.dtype(dtype) if dtype is not None else cudf.Float64Dtype() + ) kwargs.update(dtype=out_dtype) if out_dtype.type is np.datetime64: @@ -4744,7 +4744,9 @@ def fillna(self, fill_value): def _find_first_and_last(self, value): found_indices = self.str().contains(f"^{value}$") - found_indices = libcudf.unary.cast(found_indices, dtype=cudf.Int32Dtype()) + found_indices = libcudf.unary.cast( + found_indices, dtype=cudf.Int32Dtype() + ) first = column.as_column(found_indices).find_first_value(1) last = column.as_column(found_indices).find_last_value(1) return first, last @@ -4776,7 +4778,9 @@ def binary_operator(self, op, rhs, reflect=False): if isinstance(rhs, StringColumn) and op == "add": return lhs.str().cat(others=rhs) elif op in ("eq", "ne", "gt", "lt", "ge", "le"): - return _string_column_binop(self, rhs, op=op, out_dtype=cudf.BooleanDtype()) + return _string_column_binop( + self, rhs, op=op, out_dtype=cudf.BooleanDtype() + ) else: msg = "{!r} operator not supported between {} and {}" raise TypeError(msg.format(op, type(self), type(rhs))) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index e7d38223736..60b3f027efe 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -10,10 +10,11 @@ from cudf import _lib as libcudf from cudf._lib.nvtx import annotate from cudf._lib.scalar import Scalar, as_scalar +from cudf.api.types import can_cast from cudf.core.column import column, string from cudf.core.column.datetime import _numpy_to_pandas_conversion from cudf.utils.dtypes import is_scalar, np_to_pa_dtype -from cudf.api.types import can_cast + _dtype_to_format_conversion = { "Timedelta64NS": "%D days %H:%M:%S", "Timedelta64US": "%D days %H:%M:%S", @@ -298,9 +299,9 @@ def as_string_column(self, dtype, **kwargs): ) kwargs["format"] = fmt if len(self) > 0: - return string._numeric_to_str_typecast_functions[ - self.dtype - ](self, **kwargs) + return string._numeric_to_str_typecast_functions[self.dtype]( + self, **kwargs + ) else: return column.column_empty(0, dtype="object", masked=False) @@ -548,7 +549,9 @@ def _timedelta_binary_op_add(lhs, rhs): lhs_unit = units.index(lhs_time_unit) rhs_time_unit = cudf.utils.dtypes.get_time_unit(rhs) rhs_unit = units.index(rhs_time_unit) - out_dtype = cudf.dtype(np.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]")) + out_dtype = cudf.dtype( + np.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]") + ) else: raise TypeError( f"Addition of {lhs.dtype} with {rhs.dtype} " @@ -559,15 +562,21 @@ def _timedelta_binary_op_add(lhs, rhs): def _timedelta_binary_op_sub(lhs, rhs): - if isinstance(lhs.dtype, cudf.Timedelta) and isinstance(rhs.dtype, cudf.Timedelta): + if isinstance(lhs.dtype, cudf.Timedelta) and isinstance( + rhs.dtype, cudf.Timedelta + ): out_dtype = determine_out_dtype(lhs.dtype, rhs.dtype) - elif isinstance(rhs.dtype, cudf.Timedelta) and isinstance(lhs.dtype, cudf.Datetime): + elif isinstance(rhs.dtype, cudf.Timedelta) and isinstance( + lhs.dtype, cudf.Datetime + ): units = ["s", "ms", "us", "ns"] lhs_time_unit = cudf.utils.dtypes.get_time_unit(lhs) lhs_unit = units.index(lhs_time_unit) rhs_time_unit = cudf.utils.dtypes.get_time_unit(rhs) rhs_unit = units.index(rhs_time_unit) - out_dtype = cudf.dtype(np.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]")) + out_dtype = cudf.dtype( + np.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]") + ) else: raise TypeError( f"Subtraction of {lhs.dtype} with {rhs.dtype} " diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index ed12e34a688..0f59bda56a5 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -26,6 +26,7 @@ from cudf import _lib as libcudf from cudf._lib.null_mask import MaskState, create_null_mask from cudf._lib.nvtx import annotate +from cudf.api.types import is_categorical_dtype, is_list_dtype, is_string_dtype from cudf.core import column from cudf.core.abc import Serializable from cudf.core.column import as_column, column_empty @@ -44,7 +45,6 @@ is_scalar, numeric_normalize_types, ) -from cudf.api.types import is_categorical_dtype, is_list_dtype, is_string_dtype from cudf.utils.utils import OrderedColumnDict @@ -4520,7 +4520,10 @@ def _sizeof_fmt(num, size_qualifier): deep = True else: deep = False - if "String" in dtype_counts or self.index.dtype == cudf.StringDtype(): + if ( + "String" in dtype_counts + or self.index.dtype == cudf.StringDtype() + ): size_qualifier = "+" mem_usage = self.memory_usage(index=True, deep=deep).sum() lines.append( @@ -4907,7 +4910,9 @@ def to_records(self, index=True): numpy recarray """ members = [("index", self.index.dtype.to_numpy)] if index else [] - members += [(col, self[col].dtype.to_numpy) for col in self._data.names] + members += [ + (col, self[col].dtype.to_numpy) for col in self._data.names + ] dtype = np.dtype(members) ret = np.recarray(len(self), dtype=dtype) if index: @@ -6419,7 +6424,7 @@ def select_dtypes(self, include=None, exclude=None): include_subtypes.add(i_dtype) elif issubclass(dtype, i_dtype): include_subtypes.add(dtype) - + # exclude all subtypes exclude_subtypes = set() for dtype in (d.__class__ for d in self.dtypes): diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index a01c580aced..360de63efae 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -6,8 +6,10 @@ import pandas as pd import pyarrow as pa from pandas.api.extensions import ExtensionDtype -from cudf._lib.types import _Dtype + import cudf +from cudf._lib.types import _Dtype + class Generic(ExtensionDtype, _Dtype): pa_type = None @@ -15,7 +17,9 @@ class Generic(ExtensionDtype, _Dtype): def __eq__(self, other): if isinstance(other, self.__class__): return True - if isinstance(other, Generic) and not isinstance(other, self.__class__): + if isinstance(other, Generic) and not isinstance( + other, self.__class__ + ): return False if ( isinstance(other, self.to_pandas.__class__) @@ -54,7 +58,7 @@ def type(self): @property def kind(self): - return self.to_pandas.kind + return self.to_pandas.kind @property def name(self): @@ -65,65 +69,71 @@ def __repr__(self): def __hash__(self): return hash(self.__repr__()) - + def _raise_construction_error(self): raise TypeError(f"Cannot create {type(self)} instances") - class Number(Generic): def __init__(self): self._raise_construction_error() + class Integer(Number): def __init__(self): self._raise_construction_error() + class SignedInteger(Integer): def __init__(self): self._raise_construction_error() - + + class UnsignedInteger(Integer): def __init__(self): self._raise_construction_error() - + + class Inexact(Number): def __init__(self): self._raise_construction_error() - + + class Floating(Inexact): def __init__(self): self._raise_construction_error() - + @property def kind(self): return "f" + class Flexible(Generic): def __init__(self): self._construction_error() - -class Datetime(Generic): + +class Datetime(Generic): @property def to_numpy(self): - return {v:k for k,v in _cudf_dtype_from_numpy.items()}[self] + return {v: k for k, v in _cudf_dtype_from_numpy.items()}[self] @property def to_pandas(self): # pandas only supports nanos - return np.dtype('datetime64[ns]') + return np.dtype("datetime64[ns]") -class Timedelta(Generic): +class Timedelta(Generic): @property def to_numpy(self): - return {v:k for k,v in _cudf_dtype_from_numpy.items()}[self] + return {v: k for k, v in _cudf_dtype_from_numpy.items()}[self] @property def to_pandas(self): # pandas only supports nanos - return np.dtype('timedelta64[ns]') + return np.dtype("timedelta64[ns]") + class UInt8Dtype(UnsignedInteger): def __init__(self): @@ -172,6 +182,7 @@ def __init__(self): self.pa_type = pa.int64() self._name = "Int64" + class Float32Dtype(Floating): def __init__(self): self.pa_type = pa.float32() @@ -185,11 +196,11 @@ def __init__(self): class BooleanDtype(Generic): - def __init__(self): self.pa_type = pa.bool_() self._name = "boolean" + class Datetime64NSDtype(Datetime): def __init__(self): self.pa_type = pa.timestamp("ns") @@ -217,36 +228,41 @@ def __init__(self): self._name = "Datetime64S" self._time_unit = "s" + class Timedelta64NSDtype(Timedelta): def __init__(self): - self.pa_type = pa.duration('ns') + self.pa_type = pa.duration("ns") self._name = "Timedelta64NS" - self._time_unit = 'ns' + self._time_unit = "ns" + class Timedelta64USDtype(Timedelta): def __init__(self): - self.pa_type = pa.duration('us') + self.pa_type = pa.duration("us") self._name = "Timedelta64US" - self._time_unit = 'us' + self._time_unit = "us" + class Timedelta64MSDtype(Timedelta): def __init__(self): - self.pa_type = pa.duration('ms') + self.pa_type = pa.duration("ms") self._name = "Timedelta64MS" - self._time_unit = 'ms' + self._time_unit = "ms" + class Timedelta64SDtype(Timedelta): def __init__(self): - self.pa_type = pa.duration('s') + self.pa_type = pa.duration("s") self._name = "Timedelta64S" - self._time_unit = 's' + self._time_unit = "s" -class StringDtype(Flexible): +class StringDtype(Flexible): def __init__(self): self.pa_type = pa.string() self._name = "string" + class CUDFType(object): def __init__(self, parent_dtype): self.parent_dtype = parent_dtype @@ -254,8 +270,9 @@ def __init__(self, parent_dtype): def __call__(self, arg): return cudf._lib.scalar.Scalar(arg, dtype=self.parent_dtype) + def cudf_dtype_from_string(obj): - if obj == 'category': + if obj == "category": return obj try: np_dtype = np.dtype(obj) @@ -284,6 +301,7 @@ def cudf_dtype_from_numpy(obj): raise TypeError(f"Could not find a cuDF dtype matching {obj}") return result + def cudf_dtype_from_pandas(obj): if isinstance(obj, pd.core.arrays.numpy_.PandasDtype): try: @@ -294,12 +312,15 @@ def cudf_dtype_from_pandas(obj): raise TypeError(f"Could not find a cuDF dtype matching {obj}") return result + def dtype(obj): if isinstance(obj, Generic): return obj elif type(obj) is type and issubclass(obj, Generic): return obj() - elif isinstance(obj, np.dtype) or (isinstance(obj, type) and issubclass(obj, (np.generic, np.dtype))): + elif isinstance(obj, np.dtype) or ( + isinstance(obj, type) and issubclass(obj, (np.generic, np.dtype)) + ): return cudf_dtype_from_numpy(obj) elif isinstance(obj, str): return cudf_dtype_from_string(obj) @@ -320,12 +341,11 @@ def dtype(obj): else: raise TypeError - - #raise TypeError(f"Could not find a cuDF dtype matching {obj}") + # raise TypeError(f"Could not find a cuDF dtype matching {obj}") -class CategoricalDtype(Generic): +class CategoricalDtype(Generic): def __init__(self, categories=None, ordered=None): """ dtype similar to pd.CategoricalDtype with the categories @@ -431,7 +451,7 @@ def deserialize(cls, header, frames): @property def kind(self): - return 'O' + return "O" class ListDtype(Generic): @@ -463,7 +483,7 @@ def leaf_type(self): @property def kind(self): - return 'O' + return "O" @property def type(self): @@ -519,7 +539,7 @@ def __repr__(self): pa.duration("ms"): Timedelta64MSDtype(), pa.duration("s"): Timedelta64SDtype(), pa.date32(): Datetime64NSDtype(), - pa.null(): None + pa.null(): None, } _cudf_dtype_from_numpy = { @@ -547,35 +567,35 @@ def __repr__(self): } _cudf_dtype_from_string = { - 'UInt8': UInt8Dtype, - 'UInt16': UInt16Dtype, - 'UInt32': UInt32Dtype, - 'UInt64': UInt64Dtype, - 'Int8': Int8Dtype, - 'Int16': Int16Dtype, - 'Int32': Int32Dtype, - 'Int64': Int64Dtype, - 'Float': Float64Dtype, - 'Float32': Float32Dtype, - 'Float64': Float64Dtype, - 'Boolean': BooleanDtype, - 'String': StringDtype, - 'Datetime64NS': Datetime64NSDtype, - 'Datetime64US': Datetime64USDtype, - 'Datetime64MS': Datetime64MSDtype, - 'Datetime64S': Datetime64SDtype, - 'Timedelta64NS': Timedelta64NSDtype, - 'Timedelta64US': Timedelta64USDtype, - 'Timedelta64MS': Timedelta64MSDtype, - 'Timedelta64S': Timedelta64SDtype, + "UInt8": UInt8Dtype, + "UInt16": UInt16Dtype, + "UInt32": UInt32Dtype, + "UInt64": UInt64Dtype, + "Int8": Int8Dtype, + "Int16": Int16Dtype, + "Int32": Int32Dtype, + "Int64": Int64Dtype, + "Float": Float64Dtype, + "Float32": Float32Dtype, + "Float64": Float64Dtype, + "Boolean": BooleanDtype, + "String": StringDtype, + "Datetime64NS": Datetime64NSDtype, + "Datetime64US": Datetime64USDtype, + "Datetime64MS": Datetime64MSDtype, + "Datetime64S": Datetime64SDtype, + "Timedelta64NS": Timedelta64NSDtype, + "Timedelta64US": Timedelta64USDtype, + "Timedelta64MS": Timedelta64MSDtype, + "Timedelta64S": Timedelta64SDtype, } _cudf_dtype_from_pandas = { pd.UInt8Dtype(): UInt8Dtype(), - pd.UInt16Dtype(): UInt16Dtype(), + pd.UInt16Dtype(): UInt16Dtype(), pd.UInt32Dtype(): UInt32Dtype(), pd.UInt64Dtype(): UInt64Dtype(), - pd.Int8Dtype(): Int8Dtype(), + pd.Int8Dtype(): Int8Dtype(), pd.Int16Dtype(): Int16Dtype(), pd.Int32Dtype(): Int32Dtype(), pd.Int64Dtype(): Int64Dtype(), diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 649a0af9fd8..31de28f408f 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -13,6 +13,7 @@ import cudf from cudf import _lib as libcudf from cudf._lib.nvtx import annotate +from cudf.api.types import is_categorical_dtype, is_numerical_dtype from cudf.core.column import as_column, build_categorical_column from cudf.utils import utils from cudf.utils.dtypes import ( @@ -21,7 +22,6 @@ min_scalar_type, min_signed_type, ) -from cudf.api.types import is_numerical_dtype, is_categorical_dtype class Frame(libcudf.table.Table): @@ -277,7 +277,9 @@ def find_common_dtypes_and_categories(non_null_columns, dtypes): dtypes[idx] = cols[0].dtype # If all the non-null dtypes are int/float, find a common dtype if all(is_numerical_dtype(col.dtype) for col in cols): - dtypes[idx] = cudf.api.types.find_common_type([col.dtype for col in cols], []) + dtypes[idx] = cudf.api.types.find_common_type( + [col.dtype for col in cols], [] + ) # If all categorical dtypes, combine the categories elif all( isinstance(col, cudf.core.column.CategoricalColumn) @@ -294,9 +296,7 @@ def find_common_dtypes_and_categories(non_null_columns, dtypes): # will be re-assigned at the end dtypes[idx] = min_scalar_type(len(categories[idx])) # Otherwise raise an error if columns have different dtypes - elif not all( - c.dtype == dtypes[idx] for c in cols - ): + elif not all(c.dtype == dtypes[idx] for c in cols): raise ValueError("All columns must be the same type") return categories diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index a5e7ddb7b6c..31afdf10a33 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -11,6 +11,7 @@ import cudf from cudf._lib.nvtx import annotate +from cudf.api.types import is_categorical_dtype from cudf.core.abc import Serializable from cudf.core.column import ( CategoricalColumn, @@ -22,6 +23,7 @@ column, ) from cudf.core.column.string import StringMethods as StringMethods +from cudf.core.dtypes import dtype from cudf.core.frame import Frame from cudf.utils import ioutils, utils from cudf.utils.docutils import copy_docstring @@ -31,9 +33,7 @@ is_scalar, numeric_normalize_types, ) -from cudf.api.types import is_categorical_dtype from cudf.utils.utils import cached_property -from cudf.core.dtypes import dtype def _to_frame(this_index, index=True, name=None): diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index e1324c15268..83dfc0cb768 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -4,13 +4,13 @@ import cudf from cudf._lib.nvtx import annotate +from cudf.api.types import is_categorical_dtype from cudf.utils.dtypes import ( is_column_like, is_list_like, is_scalar, to_cudf_compatible_scalar, ) -from cudf.api.types import is_categorical_dtype def indices_from_labels(obj, labels): @@ -94,7 +94,9 @@ def __setitem__(self, key, value): ): # normalize types if necessary: if not pd.api.types.is_integer(key): - to_dtype = cudf.api.types.result_type(value.dtype, self._sr._column.dtype) + to_dtype = cudf.api.types.result_type( + value.dtype, self._sr._column.dtype + ) value = value.astype(to_dtype.to_numpy) self._sr._column._mimic_inplace( self._sr._column.astype(to_dtype), inplace=True diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index e2a0af5cef2..95a1a05b377 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -2,7 +2,6 @@ import itertools import warnings -import numpy as np import pandas as pd import cudf @@ -398,16 +397,25 @@ def input_to_libcudf_casting_rules(self, lcol, rcol, how): ): if dtype_l.kind == dtype_r.kind: # both ints or both floats - libcudf_join_type = cudf.dtype(max(dtype_l.to_numpy, dtype_r.to_numpy)) + libcudf_join_type = cudf.dtype( + max(dtype_l.to_numpy, dtype_r.to_numpy) + ) else: - libcudf_join_type = cudf.api.types.find_common_type([], [dtype_l, dtype_r]) + libcudf_join_type = cudf.api.types.find_common_type( + [], [dtype_l, dtype_r] + ) elif isinstance(dtype_l, cudf.Datetime) and isinstance( dtype_r, cudf.Datetime ): - libcudf_join_type = cudf.dtype(max(dtype_l.to_numpy, dtype_r.to_numpy)) + libcudf_join_type = cudf.dtype( + max(dtype_l.to_numpy, dtype_r.to_numpy) + ) if libcudf_join_type is None: # todo: test this - raise TypeError(f"Cant find an implicit common type for {dtype_l} and {dtype_r}") + raise TypeError( + f"Cant find an implicit common \ + type for {dtype_l} and {dtype_r}" + ) return libcudf_join_type def libcudf_to_output_casting_rules(self, lcol, rcol, how): diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index dbed1510866..4c2816deeea 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -3,6 +3,7 @@ import pandas as pd import cudf +from cudf.api.types import is_categorical_dtype from cudf.core import DataFrame, Index, Series from cudf.core.column import ( CategoricalColumn, @@ -10,7 +11,7 @@ build_categorical_column, ) from cudf.utils.dtypes import is_list_like -from cudf.api.types import is_categorical_dtype + _axis_map = {0: 0, 1: 1, "index": 0, "columns": 1} diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 17c4d5b8c58..8804aff2e38 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -16,6 +16,7 @@ from cudf import _lib as libcudf from cudf._lib.nvtx import annotate from cudf._lib.transform import bools_to_mask +from cudf.api.types import is_list_dtype, is_string_dtype from cudf.core.abc import Serializable from cudf.core.column import ( ColumnBase, @@ -46,7 +47,6 @@ min_scalar_type, numeric_normalize_types, ) -from cudf.api.types import is_list_dtype, is_string_dtype class Series(Frame, Serializable): @@ -1396,9 +1396,19 @@ def __rtruediv__(self, other): __div__ = __truediv__ def _bitwise_binop(self, other, op): - if (isinstance(self.dtype, (cudf.BooleanDtype, cudf.Integer, cudf.Timedelta))) and (isinstance(other.dtype, (cudf.BooleanDtype, cudf.Integer, cudf.Timedelta))): + if ( + isinstance( + self.dtype, (cudf.BooleanDtype, cudf.Integer, cudf.Timedelta) + ) + ) and ( + isinstance( + other.dtype, (cudf.BooleanDtype, cudf.Integer, cudf.Timedelta) + ) + ): ser = self._binaryop(other, op) - if isinstance(self.dtype, cudf.BooleanDtype) or isinstance(other.dtype, cudf.BooleanDtype): + if isinstance(self.dtype, cudf.BooleanDtype) or isinstance( + other.dtype, cudf.BooleanDtype + ): ser = ser.astype(cudf.BooleanDtype()) else: raise TypeError( @@ -1406,7 +1416,7 @@ def _bitwise_binop(self, other, op): f"{self.dtype.type.__name__} and {other.dtype.type.__name__}" ) return ser - + def __and__(self, other): """Performs vectorized bitwise and (&) on corresponding elements of two series. @@ -5177,53 +5187,53 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): return Series(result_col, index=index) + def _fix_nullable_dtype_repr(string): to_replace = [ - 'uint8', - 'uint16', - 'uint32', - 'uint64', - 'int8', - 'int16', - 'int32', - 'int64', - 'float32', - 'float64', - 'bool', - 'object', - 'datetime64[ns]', - 'datetime64[us]', - 'datetime64[ms]', - 'datetime64[s]', - 'timedelta64[ns]', - 'timedelta64[us]', - 'timedelta64[ms]', - 'timedelta64[s]' + "uint8", + "uint16", + "uint32", + "uint64", + "int8", + "int16", + "int32", + "int64", + "float32", + "float64", + "bool", + "object", + "datetime64[ns]", + "datetime64[us]", + "datetime64[ms]", + "datetime64[s]", + "timedelta64[ns]", + "timedelta64[us]", + "timedelta64[ms]", + "timedelta64[s]", ] - replacements = [ - 'UInt8', - 'UInt16', - 'UInt32', - 'UInt64', - 'Int8', - 'Int16', - 'Int32', - 'Int64', - 'Float32', - 'Float64', - 'Boolean', - 'String', - 'Datetime64NS', - 'Datetime64US', - 'Datetime64MS', - 'Datetime64S', - 'Timedelta64NS', - 'Timedelta64US', - 'Timedelta64MS', - 'Timedelta64S' + "UInt8", + "UInt16", + "UInt32", + "UInt64", + "Int8", + "Int16", + "Int32", + "Int64", + "Float32", + "Float64", + "Boolean", + "String", + "Datetime64NS", + "Datetime64US", + "Datetime64MS", + "Datetime64S", + "Timedelta64NS", + "Timedelta64US", + "Timedelta64MS", + "Timedelta64S", ] for tr, rp in zip(to_replace, replacements): string = string.replace(tr, rp) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index acaab744344..0ca741a5b0c 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -4,10 +4,8 @@ from collections import defaultdict from fsspec.core import get_fs_token_paths -from pyarrow import parquet as pq +from pyarrow import dataset as ds, parquet as pq from pyarrow.compat import guid -from pyarrow import dataset as ds - import cudf from cudf._lib import parquet as libparquet diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 1155a1b91ce..ebbbd7b8cd5 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -8,8 +8,8 @@ from cudf.core import DataFrame, Series from cudf.core._compat import PANDAS_GE_110 from cudf.core.index import as_index -from cudf.tests.utils import assert_eq from cudf.core.series import _fix_nullable_dtype_repr +from cudf.tests.utils import assert_eq @pytest.fixture diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 7ac4df4e514..438838c9f36 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -58,7 +58,8 @@ def test_column_offset_and_size(pandas_input, offset, size): if col.size > 0: assert col.size == (col.children[0].size - 1) assert col.size == ( - (col.children[0].data.size / col.children[0].dtype.itemsize) - 1 + (col.children[0].data.size / col.children[0].dtype.itemsize) + - 1 ) else: assert col.size == (col.data.size / col.dtype.itemsize) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d414185a540..4edcf0955f5 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -17,6 +17,7 @@ from cudf.core._compat import PANDAS_GE_110 from cudf.core.column import column from cudf.core.dataframe import DataFrame, Series +from cudf.core.dtypes import Number from cudf.tests import utils from cudf.tests.utils import ( ALL_TYPES, @@ -26,7 +27,6 @@ does_not_raise, gen_rand, ) -from cudf.core.dtypes import Number def test_init_via_list_of_tuples(): @@ -3242,8 +3242,8 @@ def test_empty_dataframe_describe(): def test_as_column_types(): - from cudf.core.column import column from cudf import Float32Dtype, Float64Dtype, StringDtype + from cudf.core.column import column col = column.as_column(Series([])) assert isinstance(col.dtype, Float64Dtype) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index a6d43686812..f043e045a7b 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -640,6 +640,7 @@ def test_cudf_to_datetime(data, dayfirst, infer_datetime_format): ) def test_to_datetime_errors(data): from cudf.core.series import _fix_nullable_dtype_repr + pd_data = data if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)): gd_data = cudf.from_pandas(pd_data) @@ -649,7 +650,9 @@ def test_to_datetime_errors(data): try: pd.to_datetime(pd_data) except Exception as e: - with pytest.raises(type(e), match=re.escape(_fix_nullable_dtype_repr(str(e)))): + with pytest.raises( + type(e), match=re.escape(_fix_nullable_dtype_repr(str(e))) + ): cudf.to_datetime(gd_data) else: raise AssertionError("Was expecting `pd.to_datetime` to fail") diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index 9a3f9a285a5..2b6abd951b2 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -76,50 +76,54 @@ def test_nested_dtype(): got = dt.element_type assert expect == got -@pytest.mark.parametrize('string,dtype', [ - ('uint8', cudf.UInt8Dtype), - ('uint16', cudf.UInt16Dtype), - ('uint32', cudf.UInt32Dtype), - ('uint64', cudf.UInt64Dtype), - ('UInt8', cudf.UInt8Dtype), - ('UInt16', cudf.UInt16Dtype), - ('UInt32', cudf.UInt32Dtype), - ('UInt64', cudf.UInt64Dtype), - ('int8', cudf.Int8Dtype), - ('int16', cudf.Int16Dtype), - ('int32', cudf.Int32Dtype), - ('int64', cudf.Int64Dtype), - ('Int8', cudf.Int8Dtype), - ('Int16', cudf.Int16Dtype), - ('Int32', cudf.Int32Dtype), - ('Int64', cudf.Int64Dtype), - ('int', cudf.Int64Dtype), - ('float32', cudf.Float32Dtype), - ('float64', cudf.Float64Dtype), - ('Float32', cudf.Float32Dtype), - ('Float64', cudf.Float64Dtype), - ('float', cudf.Float64Dtype), - ('bool', cudf.BooleanDtype), - ('Boolean', cudf.BooleanDtype), - ('string', cudf.StringDtype), - ('String', cudf.StringDtype), - ('object', cudf.StringDtype), - ('datetime64[ns]', cudf.Datetime64NSDtype), - ('datetime64[us]', cudf.Datetime64USDtype), - ('datetime64[ms]', cudf.Datetime64MSDtype), - ('datetime64[s]', cudf.Datetime64SDtype), - ('Datetime64NS', cudf.Datetime64NSDtype), - ('Datetime64US', cudf.Datetime64USDtype), - ('Datetime64MS', cudf.Datetime64MSDtype), - ('Datetime64S', cudf.Datetime64SDtype), - ('timedelta64[ns]', cudf.Timedelta64NSDtype), - ('timedelta64[us]', cudf.Timedelta64USDtype), - ('timedelta64[ms]', cudf.Timedelta64MSDtype), - ('timedelta64[s]', cudf.Timedelta64SDtype), - ('Timedelta64NS', cudf.Timedelta64NSDtype), - ('Timedelta64US', cudf.Timedelta64USDtype), - ('Timedelta64MS', cudf.Timedelta64MSDtype), - ('Timedelta64S', cudf.Timedelta64SDtype), - ]) + +@pytest.mark.parametrize( + "string,dtype", + [ + ("uint8", cudf.UInt8Dtype), + ("uint16", cudf.UInt16Dtype), + ("uint32", cudf.UInt32Dtype), + ("uint64", cudf.UInt64Dtype), + ("UInt8", cudf.UInt8Dtype), + ("UInt16", cudf.UInt16Dtype), + ("UInt32", cudf.UInt32Dtype), + ("UInt64", cudf.UInt64Dtype), + ("int8", cudf.Int8Dtype), + ("int16", cudf.Int16Dtype), + ("int32", cudf.Int32Dtype), + ("int64", cudf.Int64Dtype), + ("Int8", cudf.Int8Dtype), + ("Int16", cudf.Int16Dtype), + ("Int32", cudf.Int32Dtype), + ("Int64", cudf.Int64Dtype), + ("int", cudf.Int64Dtype), + ("float32", cudf.Float32Dtype), + ("float64", cudf.Float64Dtype), + ("Float32", cudf.Float32Dtype), + ("Float64", cudf.Float64Dtype), + ("float", cudf.Float64Dtype), + ("bool", cudf.BooleanDtype), + ("Boolean", cudf.BooleanDtype), + ("string", cudf.StringDtype), + ("String", cudf.StringDtype), + ("object", cudf.StringDtype), + ("datetime64[ns]", cudf.Datetime64NSDtype), + ("datetime64[us]", cudf.Datetime64USDtype), + ("datetime64[ms]", cudf.Datetime64MSDtype), + ("datetime64[s]", cudf.Datetime64SDtype), + ("Datetime64NS", cudf.Datetime64NSDtype), + ("Datetime64US", cudf.Datetime64USDtype), + ("Datetime64MS", cudf.Datetime64MSDtype), + ("Datetime64S", cudf.Datetime64SDtype), + ("timedelta64[ns]", cudf.Timedelta64NSDtype), + ("timedelta64[us]", cudf.Timedelta64USDtype), + ("timedelta64[ms]", cudf.Timedelta64MSDtype), + ("timedelta64[s]", cudf.Timedelta64SDtype), + ("Timedelta64NS", cudf.Timedelta64NSDtype), + ("Timedelta64US", cudf.Timedelta64USDtype), + ("Timedelta64MS", cudf.Timedelta64MSDtype), + ("Timedelta64S", cudf.Timedelta64SDtype), + ], +) def test_cudf_dtype_string_construction(string, dtype): assert type(cudf.dtype(string) == dtype) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 536ab79ddb0..f7e3ba3be94 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -802,7 +802,7 @@ def test_join_empty_table_dtype(): gright = DataFrame.from_pandas(right) pd_merge = left.merge(right, how="left", left_on=["a"], right_on=["b"]) gd_merge = gleft.merge(gright, how="left", left_on=["a"], right_on=["b"]) - assert gd_merge['a'].dtype == pd_merge['a'].dtype + assert gd_merge["a"].dtype == pd_merge["a"].dtype @pytest.mark.parametrize("how", ["outer", "inner", "left", "right"]) diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py index 48c6522a378..c10d1879ccf 100644 --- a/python/cudf/cudf/tests/test_numerical.py +++ b/python/cudf/cudf/tests/test_numerical.py @@ -1,6 +1,6 @@ -import numpy as np import pandas as pd import pytest + import cudf from cudf import Series from cudf.tests.utils import assert_eq diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index cf926f39da2..27a075c80f7 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -5,20 +5,19 @@ from glob import glob from io import BytesIO from string import ascii_letters -from packaging import version import numpy as np import pandas as pd import pyarrow as pa import pytest +from packaging import version from pyarrow import parquet as pq import cudf from cudf.io.parquet import ParquetWriter, merge_parquet_filemetadata +from cudf.tests import dataset_generator as dg from cudf.tests.utils import assert_eq -import cudf.tests.dataset_generator as dg - @pytest.fixture(scope="module") def datadir(datadir): @@ -381,6 +380,7 @@ def test_parquet_read_filtered_everything(tmpdir): assert isinstance(df_filtered["x"].dtype, cudf.Int64Dtype) assert isinstance(df_filtered["y"].dtype, cudf.StringDtype) + def test_parquet_read_filtered_multiple_files(tmpdir): # Generate data fname_0 = tmpdir.join("filtered_multiple_files_0.parquet") diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 115570fb8fe..57b72caffa3 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -6,10 +6,10 @@ import pandas as pd import pytest from hypothesis import given, settings, strategies as st -from cudf.core.series import _fix_nullable_dtype_repr import cudf from cudf.core._compat import PANDAS_GE_110 +from cudf.core.series import _fix_nullable_dtype_repr from cudf.tests import utils from cudf.utils.dtypes import cudf_dtypes_to_pandas_dtypes @@ -49,6 +49,7 @@ def test_null_series(nrows, dtype): str(sr._column.default_na_value()) + "\n", "\n" ) from cudf.core.series import _fix_nullable_dtype_repr + # todo: this is kind of self-fulfilling since this is what is # called inside _repr_ as well psrepr = _fix_nullable_dtype_repr(psrepr) @@ -203,12 +204,9 @@ def test_mixed_dataframe(mixed_pdf, mixed_gdf): def test_mixed_series(mixed_pdf, mixed_gdf): for col in mixed_gdf.columns: - try: - assert mixed_gdf[col].__repr__() == _fix_nullable_dtype_repr(mixed_pdf[col].__repr__()) - except: - import pdb - pdb.set_trace() - + assert mixed_gdf[col].__repr__() == _fix_nullable_dtype_repr( + mixed_pdf[col].__repr__() + ) def test_MI(): gdf = cudf.DataFrame( @@ -582,7 +580,9 @@ def test_series_null_index_repr(sr, pandas_special_case): # Whereas cudf is consistent with strings `null` values # to be printed as `None` everywhere. actual_repr = gsr.__repr__().replace("None", "") - assert _fix_nullable_dtype_repr(expected_repr).split() == actual_repr.split() + assert ( + _fix_nullable_dtype_repr(expected_repr).split() == actual_repr.split() + ) @pytest.mark.parametrize( @@ -622,7 +622,9 @@ def test_timedelta_series_s_us_repr(data, dtype): psr = sr.to_pandas() expected = ( - psr.__repr__().replace("timedelta64[ns]", str(sr.dtype)).replace("NaT", "") + psr.__repr__() + .replace("timedelta64[ns]", str(sr.dtype)) + .replace("NaT", "") ) actual = sr.__repr__() diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index e0d35f2eb5c..9ae5c17da47 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -143,7 +143,7 @@ def test_series_set_equal_length_object_by_mask(replace_data): # Lengths match in trivial case pd_bool_col = pd.Series([True] * len(psr)) gd_bool_col = Series.from_pandas(pd_bool_col) - + psr[pd_bool_col] = ( replace_data.to_pandas() if hasattr(replace_data, "to_pandas") diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 8e2e2585c27..99ee4878f11 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -156,7 +156,7 @@ def test_string_repr(ps_gs, item): if got_out is not None and len(got_out) > 1: expect = expect.replace("None", "") - expect = expect.replace('object', 'String') + expect = expect.replace("object", "String") assert expect == got diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 634517368d5..564271c217f 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -976,8 +976,7 @@ def test_timedelta_invalid_ops(): with pytest.raises( TypeError, match=re.escape( - f"Addition of {sr.dtype} with Int64 " - f"cannot be performed." + f"Addition of {sr.dtype} with Int64 " f"cannot be performed." ), ): sr + 1 @@ -990,8 +989,7 @@ def test_timedelta_invalid_ops(): with pytest.raises( TypeError, match=re.escape( - f"Addition of {sr.dtype} with String " - f"cannot be performed." + f"Addition of {sr.dtype} with String " f"cannot be performed." ), ): sr + "a" @@ -1021,8 +1019,7 @@ def test_timedelta_invalid_ops(): with pytest.raises( TypeError, match=re.escape( - f"Modulus of {sr.dtype} with String " - f"cannot be performed." + f"Modulus of {sr.dtype} with String " f"cannot be performed." ), ): sr % "a" @@ -1158,13 +1155,16 @@ def test_timedelta_invalid_ops(): def test_timedelta_datetime_cast_invalid(): from cudf.core.series import _fix_nullable_dtype_repr + sr = cudf.Series([1, 2, 3], dtype="timedelta64[ns]") psr = sr.to_pandas() try: psr.astype("datetime64[ns]") except TypeError as e: - with pytest.raises(type(e), match=re.escape(_fix_nullable_dtype_repr(e.__str__()))): + with pytest.raises( + type(e), match=re.escape(_fix_nullable_dtype_repr(e.__str__())) + ): sr.astype("datetime64[ns]") else: raise AssertionError("Expected timedelta to datetime typecast to fail") @@ -1175,7 +1175,9 @@ def test_timedelta_datetime_cast_invalid(): try: psr.astype("timedelta64[ns]") except TypeError as e: - with pytest.raises(type(e), match=re.escape(_fix_nullable_dtype_repr(e.__str__()))): + with pytest.raises( + type(e), match=re.escape(_fix_nullable_dtype_repr(e.__str__())) + ): sr.astype("timedelta64[ns]") else: raise AssertionError("Expected datetime to timedelta typecast to fail") diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 4959a4a5419..aaf37b635a2 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -1,4 +1,5 @@ import datetime as dt +import inspect import numbers from collections import namedtuple from collections.abc import Sequence @@ -8,13 +9,10 @@ import pandas as pd import pyarrow as pa from pandas.core.dtypes.common import infer_dtype_from_object -from pandas.core.dtypes.dtypes import CategoricalDtype, CategoricalDtypeType import cudf from cudf._lib.scalar import Scalar from cudf.api.types import is_categorical_dtype -import inspect - _NA_REP = "" _np_pa_dtypes = { @@ -68,7 +66,19 @@ } OTHER_TYPES = {"bool", "category", "str"} ALL_TYPES = NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | OTHER_TYPES -NEW_NUMERIC_TYPES = {'Int8', 'Int16', 'Int32', 'Int64', 'UInt8', 'UInt16', 'UInt32', 'UInt64', 'Float32', 'Float64'} +NEW_NUMERIC_TYPES = { + "Int8", + "Int16", + "Int32", + "Int64", + "UInt8", + "UInt16", + "UInt32", + "UInt64", + "Float32", + "Float64", +} + def np_to_pa_dtype(dtype): """Util to convert numpy dtype to PyArrow dtype. @@ -110,23 +120,6 @@ def numeric_normalize_types(*args): dtype = np.result_type(*[a.dtype.to_numpy for a in args]) return [a.astype(dtype) for a in args] - -def is_numerical_dtype(obj): - if is_categorical_dtype(obj): - return False - if is_list_dtype(obj): - return False - return ( - np.issubdtype(obj, np.bool_) - or np.issubdtype(obj, np.floating) - or np.issubdtype(obj, np.signedinteger) - ) - - -def is_string_dtype(obj): - return pd.api.types.is_string_dtype(obj) and not is_categorical_dtype(obj) - - def is_datetime_dtype(obj): if obj is None: return False @@ -134,6 +127,7 @@ def is_datetime_dtype(obj): return False return "M8" in obj.str + def cudf_dtype_from_pydata_dtype(dtype): """ Given a numpy or pandas dtype, converts it into the equivalent cuDF Python dtype. @@ -151,7 +145,7 @@ def cudf_dtype_from_pydata_dtype(dtype): dtype = np.datetime64 result = cudf.dtype(infer_dtype_from_object(dtype)) - if isinstance(result, cudf.Generic): + if isinstance(result, cudf.Generic): return result.__class__ elif inspect.isclass(result): return result @@ -193,7 +187,7 @@ def to_cudf_compatible_scalar(val, dtype=None): if isinstance(val, (np.ndarray, cp.ndarray)) and val.ndim == 0: val = val.item() - if ((dtype is None) and isinstance(val, str)) or is_string_dtype(dtype): + if ((dtype is None) and isinstance(val, str)) or cudf.api.types.is_string_dtype(dtype): dtype = "str" if isinstance(val, dt.datetime): diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 7af5e6f8caf..c8193c7226a 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -349,7 +349,9 @@ def time_col_replace_nulls(input_col): column.as_column( Buffer( np.array( - [input_col.default_na_value()], dtype=input_col.dtype.to_numpy).view("|u1") + [input_col.default_na_value()], + dtype=input_col.dtype.to_numpy, + ).view("|u1") ), dtype=input_col.dtype, ), From 62a7d5bf4d46590273b87ab159f8cf2c9ea2e453 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 8 Sep 2020 18:47:48 -0700 Subject: [PATCH 50/80] bug fixes and type attribute plumbing/iteration --- python/cudf/cudf/_lib/reduce.pyx | 4 +- python/cudf/cudf/_lib/scalar.pyx | 6 +++ python/cudf/cudf/api/types.py | 37 +++++++++++---- python/cudf/cudf/core/column/column.py | 5 +-- python/cudf/cudf/core/column/numerical.py | 16 +++---- python/cudf/cudf/core/column/string.py | 4 +- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/dtypes.py | 18 ++++---- python/cudf/cudf/core/frame.py | 4 +- python/cudf/cudf/core/index.py | 4 +- python/cudf/cudf/core/indexing.py | 2 +- python/cudf/cudf/core/series.py | 52 ---------------------- python/cudf/cudf/tests/test_categorical.py | 2 +- python/cudf/cudf/tests/test_dataframe.py | 26 +++++------ python/cudf/cudf/tests/test_index.py | 8 ++-- python/cudf/cudf/tests/test_string.py | 2 +- python/cudf/cudf/tests/utils.py | 52 ++++++++++++++++++++++ 17 files changed, 133 insertions(+), 111 deletions(-) diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx index ac8065d2d6f..63e3f28b450 100644 --- a/python/cudf/cudf/_lib/reduce.pyx +++ b/python/cudf/cudf/_lib/reduce.pyx @@ -48,9 +48,9 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs): # check empty case if len(incol) <= incol.null_count: if reduction_op == 'sum' or reduction_op == 'sum_of_squares': - return incol.dtype.type(0) + return incol.dtype.type(0).value if reduction_op == 'product': - return incol.dtype.type(1) + return incol.dtype.type(1).value return np.nan with nogil: diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 773ce54be31..47a0d55816c 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -93,6 +93,11 @@ cdef class Scalar: f"{type(value).__name__} to cudf scalar" ) + def __eq__(self, other): + if isinstance(other, Scalar): + other = other.value + return self.value == other + @property def dtype(self): """ @@ -352,6 +357,7 @@ cdef _get_np_scalar_from_timedelta64(unique_ptr[scalar]& s): def as_scalar(val, dtype=None): + dtype = cudf.dtype(dtype) if isinstance(val, Scalar): if (dtype is None or dtype == val.dtype): return val diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 732828085b4..484b9f1bfd1 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -6,37 +6,53 @@ def is_bool_dtype(obj): + if hasattr(obj, 'dtype'): + obj = obj.dtype # todo - pd.api.types.is_bool_dtype should not give false, nor work at all probably if hasattr(obj, "dtype"): obj = obj.dtype - return isinstance(obj, cudf.BooleanDtype) or pd.api.types.is_bool_dtype( + return isinstance(obj, cudf.BooleanDtype) or (not isinstance(obj, cudf.Generic) and pd.api.types.is_bool_dtype( obj - ) + )) def is_datetime64_dtype(obj): - return isinstance(obj, cudf.Datetime) or pd.api.types.is_datetime64_dtype( + if hasattr(obj, 'dtype'): + obj = obj.dtype + return isinstance(obj, cudf.Datetime) or (not isinstance(obj, cudf.Generic) and pd.api.types.is_datetime64_dtype( obj - ) + )) def is_timedelta64_dtype(obj): + if hasattr(obj, 'dtype'): + obj = obj.dtype return isinstance( obj, cudf.Timedelta - ) or pd.api.types.is_timedelta64_dtype(obj) + ) or (not isinstance(obj, cudf.Generic) and pd.api.types.is_timedelta64_dtype(obj)) def is_string_dtype(obj): - return isinstance(obj, cudf.StringDtype) or ( + if hasattr(obj, 'dtype'): + obj = obj.dtype + return isinstance(obj, cudf.StringDtype) or (not isinstance(obj, cudf.Generic) and ( pd.api.types.is_string_dtype(obj) and not is_categorical_dtype(obj) - ) + )) def is_integer_dtype(obj): - return isinstance(obj, cudf.Integer) or pd.api.types.is_integer_dtype(obj) + if hasattr(obj, 'dtype'): + obj = obj.dtype + try: + return isinstance(obj, cudf.Integer) or (not isinstance(obj, cudf.Generic) and pd.api.types.is_integer_dtype(obj)) + except: + import pdb + pdb.set_trace() def is_numerical_dtype(obj): + if hasattr(obj, 'dtype'): + obj = obj.dtype if isinstance(obj, cudf.Generic): return isinstance(obj, (cudf.Number, cudf.BooleanDtype)) if is_categorical_dtype(obj): @@ -142,3 +158,8 @@ def result_type(*arrays_and_dtypes): for d in arrays_and_dtypes ) return cudf.dtype(np.result_type(*arrays_and_dtypes)) + +def isnan(obj): + if isinstance(obj, cudf._lib.scalar.Scalar): + obj = obj.value + return np.isnan(obj) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 0f5f29913b0..f26c87c08ff 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -642,11 +642,11 @@ def __getitem__(self, arg): arg = as_column(arg) if len(arg) == 0: arg = as_column([], dtype="int32") - if pd.api.types.is_integer_dtype(arg.dtype) or isinstance( + if cudf.api.types.is_integer_dtype(arg.dtype) or isinstance( arg.dtype, cudf.Integer ): return self.take(arg) - if pd.api.types.is_bool_dtype(arg.dtype) or isinstance( + if cudf.api.types.is_bool_dtype(arg.dtype) or isinstance( arg.dtype, cudf.BooleanDtype ): return self.apply_boolean_mask(arg) @@ -1616,7 +1616,6 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): data=buffer, mask=mask, dtype=arbitrary.dtype ) elif arb_dtype.kind in ("O", "U"): - pa_data = pa.Array.from_pandas(arbitrary) data = as_column(pa_data, dtype=cudf.dtype(pa_data.type)) # There is no cast operation available for pa.Array from int to diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 2fa16c8458d..e56f87aac21 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -1,6 +1,6 @@ # Copyright (c) 2018-2020, NVIDIA CORPORATION. import numpy as np -from pandas.api.types import is_integer_dtype +from cudf.api.types import is_integer_dtype import cudf from cudf import _lib as libcudf @@ -175,12 +175,7 @@ def as_numerical_column(self, dtype, **kwargs): return libcudf.unary.cast(self, dtype) def sum(self, dtype=None): - try: return libcudf.reduce.reduce("sum", self, dtype=dtype) - except: - import pdb - - pdb.set_trace() def product(self, dtype=None): return libcudf.reduce.reduce("product", self, dtype=dtype) @@ -236,7 +231,7 @@ def default_na_value(self): """ dkind = self.dtype.kind if dkind == "f": - return self.dtype.type(np.nan) + return self.dtype.type(np.nan).value elif dkind == "i": return np.iinfo(self.dtype.to_numpy).min elif dkind == "u": @@ -280,7 +275,8 @@ def fillna(self, fill_value): """ Fill null values with *fill_value* """ - if np.isscalar(fill_value): + + if np.isscalar(fill_value) and not isinstance(fill_value, libcudf.scalar.Scalar): # castsafely to the same dtype as self # TODO - produce a libcudf scalar directly fill_value_casted = self.dtype.to_numpy.type(fill_value) @@ -291,6 +287,8 @@ def fillna(self, fill_value): ) ) fill_value = fill_value_casted + elif isinstance(fill_value, libcudf.scalar.Scalar): + fill_value = libcudf.scalar.as_scalar(fill_value, dtype=self.dtype) else: fill_value = column.as_column(fill_value, nan_as_null=False) # cast safely to the same dtype as self @@ -471,7 +469,7 @@ def _normalize_find_and_replace_input(input_column_dtype, col_to_normalize): col_to_normalize_casted = input_column_dtype.type( col_to_normalize[0] ) - if not np.isnan(col_to_normalize_casted) and ( + if not cudf.api.types.isnan(col_to_normalize_casted) and ( col_to_normalize_casted != col_to_normalize[0] ): raise TypeError( diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 224211d5b2d..3641781c07b 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4567,7 +4567,7 @@ def as_numerical_column(self, dtype, **kwargs): ) kwargs.update(dtype=out_dtype) - if out_dtype.type is np.datetime64: + if isinstance(out_dtype, cudf.Datetime): if "format" not in kwargs: if len(self) > 0: # infer on host from the first not na element @@ -4586,7 +4586,7 @@ def as_numerical_column(self, dtype, **kwargs): raise ValueError("Could not convert `None` value to datetime") boolean_match = self.binary_operator("eq", "NaT") - elif out_dtype.type is np.timedelta64: + elif isinstance(out_dtype, cudf.Timedelta): if "format" not in kwargs: if len(self) > 0: kwargs.update(format="%D days %H:%M:%S") diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 0f59bda56a5..71f91432612 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -4521,7 +4521,7 @@ def _sizeof_fmt(num, size_qualifier): else: deep = False if ( - "String" in dtype_counts + "string" in dtype_counts or self.index.dtype == cudf.StringDtype() ): size_qualifier = "+" diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 360de63efae..ac55b34afc4 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -15,15 +15,15 @@ class Generic(ExtensionDtype, _Dtype): pa_type = None def __eq__(self, other): - if isinstance(other, self.__class__): + if isinstance(other, type(self)): return True if isinstance(other, Generic) and not isinstance( - other, self.__class__ + other, type(self) ): return False if ( - isinstance(other, self.to_pandas.__class__) - or other is self.to_pandas.__class__ + isinstance(other, type(self.to_pandas)) + or other is type(self.to_pandas) ): return True @@ -338,12 +338,10 @@ def dtype(obj): return cudf.Float64Dtype() elif obj is None: return None + elif obj is np.object: + return else: - - raise TypeError - - # raise TypeError(f"Could not find a cuDF dtype matching {obj}") - + raise TypeError(f"Could not find cuDF dtype matching {obj}") class CategoricalDtype(Generic): def __init__(self, categories=None, ordered=None): @@ -413,7 +411,7 @@ def __eq__(self, other): return other == self.name elif other is self: return True - elif not isinstance(other, self.__class__): + elif not isinstance(other, type(self)): return False elif self.ordered != other.ordered: return False diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 31de28f408f..8792dccba85 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -486,7 +486,7 @@ def _get_columns_by_index(self, indices): ) def _gather(self, gather_map, keep_index=True): - if not pd.api.types.is_integer_dtype(gather_map.dtype): + if not cudf.api.types.is_integer_dtype(gather_map.dtype): gather_map = gather_map.astype("int32") result = self.__class__._from_table( libcudf.copying.gather( @@ -3142,7 +3142,7 @@ def _get_replacement_values(to_replace, replacement, col_name, column): if all_nan: replacement = [replacement] * len(to_replace) # Do not broadcast numeric dtypes - elif pd.api.types.is_numeric_dtype(column.dtype): + elif cudf.api.types.is_numerical_dtype(column.dtype): if len(to_replace) > 0: replacement = [replacement] else: diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 31afdf10a33..a3ce9d8cb19 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -725,8 +725,8 @@ def append(self, other): ) raise TypeError( f"cudf does not support appending an Index of " - f"dtype `{np.dtype('object')}` with an Index " - f"of dtype `{got_dtype}`, please type-cast " + f"dtype `{self.dtype}` with an Index " + f"of dtype `{other.dtype}`, please type-cast " f"either one of them to same dtypes." ) diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index 83dfc0cb768..ef5ca3d6341 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -89,7 +89,7 @@ def __setitem__(self, key, value): else: value = column.as_column(value) - if hasattr(value, "dtype") and pd.api.types.is_numeric_dtype( + if hasattr(value, "dtype") and cudf.api.types.is_numerical_dtype( value.dtype ): # normalize types if necessary: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 8804aff2e38..8efc6c3a0a1 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -5186,55 +5186,3 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): result_col[equal_nulls] = True return Series(result_col, index=index) - - -def _fix_nullable_dtype_repr(string): - - to_replace = [ - "uint8", - "uint16", - "uint32", - "uint64", - "int8", - "int16", - "int32", - "int64", - "float32", - "float64", - "bool", - "object", - "datetime64[ns]", - "datetime64[us]", - "datetime64[ms]", - "datetime64[s]", - "timedelta64[ns]", - "timedelta64[us]", - "timedelta64[ms]", - "timedelta64[s]", - ] - - replacements = [ - "UInt8", - "UInt16", - "UInt32", - "UInt64", - "Int8", - "Int16", - "Int32", - "Int64", - "Float32", - "Float64", - "Boolean", - "String", - "Datetime64NS", - "Datetime64US", - "Datetime64MS", - "Datetime64S", - "Timedelta64NS", - "Timedelta64US", - "Timedelta64MS", - "Timedelta64S", - ] - for tr, rp in zip(to_replace, replacements): - string = string.replace(tr, rp) - return string diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index ebbbd7b8cd5..3ccd8a2e85c 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -71,7 +71,7 @@ def test_categorical_integer(): 3 c 4 a dtype: category -Categories (3, String): ['a', 'b', 'c'] +Categories (3, string): ['a', 'b', 'c'] """ assert string.split() == expect_str.split() diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 4edcf0955f5..c2429504764 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -5576,8 +5576,8 @@ def test_dataframe_info_verbose_mem_usage(): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 a 3 non-null Int64 - 1 b 3 non-null String - dtypes: Int64(1), String(1) + 1 b 3 non-null string + dtypes: Int64(1), string(1) memory usage: 56.0+ bytes """ ) @@ -5593,7 +5593,7 @@ def test_dataframe_info_verbose_mem_usage(): RangeIndex: 3 entries, 0 to 2 Columns: 2 entries, a to b - dtypes: Int64(1), String(1) + dtypes: Int64(1), string(1) memory usage: 56.0+ bytes """ ) @@ -5616,8 +5616,8 @@ def test_dataframe_info_verbose_mem_usage(): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 a 3 non-null Int64 - 1 b 3 non-null String - dtypes: Int64(1), String(1) + 1 b 3 non-null string + dtypes: Int64(1), string(1) memory usage: 91.0 bytes """ ) @@ -5647,9 +5647,9 @@ def test_dataframe_info_verbose_mem_usage(): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 int_col 5 non-null Int64 - 1 text_col 5 non-null String + 1 text_col 5 non-null string 2 float_col 5 non-null Float64 - dtypes: Float64(1), Int64(1), String(1) + dtypes: Float64(1), Int64(1), string(1) memory usage: 130.0 bytes """ ) @@ -5682,9 +5682,9 @@ def test_dataframe_info_null_counts(): # Column Dtype --- ------ ----- 0 int_col Int64 - 1 text_col String + 1 text_col string 2 float_col Float64 - dtypes: Float64(1), Int64(1), String(1) + dtypes: Float64(1), Int64(1), string(1) memory usage: 130.0+ bytes """ ) @@ -5732,8 +5732,8 @@ def test_dataframe_info_null_counts(): # Column Dtype --- ------ ----- 0 a Int64 - 1 b String - dtypes: Int64(1), String(1) + 1 b string + dtypes: Int64(1), string(1) memory usage: 238.0+ bytes """ ) @@ -5754,8 +5754,8 @@ def test_dataframe_info_null_counts(): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 a 6 non-null Int64 - 1 b 6 non-null String - dtypes: Int64(1), String(1) + 1 b 6 non-null string + dtypes: Int64(1), string(1) memory usage: 238.0+ bytes """ ) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 463970eef8c..a28d10bd758 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -923,8 +923,8 @@ def test_index_append_error(data, other): TypeError, match=re.escape( f"cudf does not support appending an Index of " - f"dtype `{np.dtype('object')}` with an Index " - f"of dtype `{got_dtype}`, please type-cast " + f"dtype `{gd_data.dtype}` with an Index " + f"of dtype `{gd_other.dtype}`, please type-cast " f"either one of them to same dtypes." ), ): @@ -934,8 +934,8 @@ def test_index_append_error(data, other): TypeError, match=re.escape( f"cudf does not support appending an Index of " - f"dtype `{np.dtype('object')}` with an Index " - f"of dtype `{got_dtype}`, please type-cast " + f"dtype `{gd_other.dtype}` with an Index " + f"of dtype `{gd_data.dtype}`, please type-cast " f"either one of them to same dtypes." ), ): diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 99ee4878f11..3575a61503b 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -156,7 +156,7 @@ def test_string_repr(ps_gs, item): if got_out is not None and len(got_out) > 1: expect = expect.replace("None", "") - expect = expect.replace("object", "String") + expect = expect.replace("object", "string") assert expect == got diff --git a/python/cudf/cudf/tests/utils.py b/python/cudf/cudf/tests/utils.py index dea444dd560..f5646f67dde 100644 --- a/python/cudf/cudf/tests/utils.py +++ b/python/cudf/cudf/tests/utils.py @@ -152,6 +152,58 @@ def gen_rand_series(dtype, size, **kwargs): return cudf.Series(values) +def _fix_nullable_dtype_repr(string): + + to_replace = [ + "uint8", + "uint16", + "uint32", + "uint64", + "int8", + "int16", + "int32", + "int64", + "float32", + "float64", + "bool", + "object", + "datetime64[ns]", + "datetime64[us]", + "datetime64[ms]", + "datetime64[s]", + "timedelta64[ns]", + "timedelta64[us]", + "timedelta64[ms]", + "timedelta64[s]", + ] + + replacements = [ + "UInt8", + "UInt16", + "UInt32", + "UInt64", + "Int8", + "Int16", + "Int32", + "Int64", + "Float32", + "Float64", + "boolean", + "string", + "Datetime64NS", + "Datetime64US", + "Datetime64MS", + "Datetime64S", + "Timedelta64NS", + "Timedelta64US", + "Timedelta64MS", + "Timedelta64S", + ] + for tr, rp in zip(to_replace, replacements): + string = string.replace(tr, rp) + return string + + @contextmanager def does_not_raise(): yield From 80baff4f805f48fda88261c13d87bc562fe179c9 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 9 Sep 2020 08:00:19 -0700 Subject: [PATCH 51/80] fix repr and move around testing utilities --- python/cudf/cudf/core/series.py | 4 +++- python/cudf/cudf/tests/test_categorical.py | 3 +-- python/cudf/cudf/tests/test_repr.py | 16 +++++++--------- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 8efc6c3a0a1..8448f67f618 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1023,11 +1023,13 @@ def __repr__(self): else: output = preprocess.to_pandas().__repr__() - output = _fix_nullable_dtype_repr(output) lines = output.split("\n") if isinstance(preprocess._column, cudf.core.column.CategoricalColumn): category_memory = lines[-1] + to_replace = str(self.dtype.categories.dtype.to_numpy) + replacement = str(self.dtype.categories.dtype.name) + category_memory = category_memory.replace(to_replace, replacement) lines = lines[:-1] if len(lines) > 1: if lines[-1].startswith("Name: "): diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 3ccd8a2e85c..1577c2b39f2 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -8,8 +8,7 @@ from cudf.core import DataFrame, Series from cudf.core._compat import PANDAS_GE_110 from cudf.core.index import as_index -from cudf.core.series import _fix_nullable_dtype_repr -from cudf.tests.utils import assert_eq +from cudf.tests.utils import assert_eq, _fix_nullable_dtype_repr @pytest.fixture diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 57b72caffa3..f8226aada39 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -9,7 +9,6 @@ import cudf from cudf.core._compat import PANDAS_GE_110 -from cudf.core.series import _fix_nullable_dtype_repr from cudf.tests import utils from cudf.utils.dtypes import cudf_dtypes_to_pandas_dtypes @@ -48,11 +47,10 @@ def test_null_series(nrows, dtype): psrepr = psrepr.replace( str(sr._column.default_na_value()) + "\n", "\n" ) - from cudf.core.series import _fix_nullable_dtype_repr # todo: this is kind of self-fulfilling since this is what is # called inside _repr_ as well - psrepr = _fix_nullable_dtype_repr(psrepr) + psrepr = utils._fix_nullable_dtype_repr(psrepr) assert psrepr.split() == sr.__repr__().split() @@ -94,7 +92,7 @@ def test_full_series(nrows, dtype): ps = pd.Series(np.random.randint(0, 100, size)).astype(dtype) sr = cudf.from_pandas(ps) pd.options.display.max_rows = int(nrows) - psrepr = _fix_nullable_dtype_repr(ps.__repr__()) + psrepr = utils._fix_nullable_dtype_repr(ps.__repr__()) assert psrepr == sr.__repr__() @@ -157,7 +155,7 @@ def test_integer_dataframe(x): def test_integer_series(x): sr = cudf.Series(x) ps = pd.Series(x) - psrepr = _fix_nullable_dtype_repr(ps.__repr__()) + psrepr = utils._fix_nullable_dtype_repr(ps.__repr__()) assert sr.__repr__() == psrepr @@ -174,7 +172,7 @@ def test_float_dataframe(x): def test_float_series(x): sr = cudf.Series(x, nan_as_null=False) ps = pd.Series(x) - psrepr = _fix_nullable_dtype_repr(ps.__repr__()) + psrepr = utils._fix_nullable_dtype_repr(ps.__repr__()) assert sr.__repr__() == psrepr @@ -204,7 +202,7 @@ def test_mixed_dataframe(mixed_pdf, mixed_gdf): def test_mixed_series(mixed_pdf, mixed_gdf): for col in mixed_gdf.columns: - assert mixed_gdf[col].__repr__() == _fix_nullable_dtype_repr( + assert mixed_gdf[col].__repr__() == utils._fix_nullable_dtype_repr( mixed_pdf[col].__repr__() ) @@ -257,7 +255,7 @@ def test_generic_index(length, dtype): index=np.random.randint(0, high=100, size=length).astype(dtype), ) gsr = cudf.Series.from_pandas(psr) - psrepr = _fix_nullable_dtype_repr(psr.index.__repr__()) + psrepr = utils._fix_nullable_dtype_repr(psr.index.__repr__()) assert psrepr == gsr.index.__repr__() @@ -581,7 +579,7 @@ def test_series_null_index_repr(sr, pandas_special_case): # to be printed as `None` everywhere. actual_repr = gsr.__repr__().replace("None", "") assert ( - _fix_nullable_dtype_repr(expected_repr).split() == actual_repr.split() + utils._fix_nullable_dtype_repr(expected_repr).split() == actual_repr.split() ) From 38e11af5f123e18d11a54b1d87aa71781492c760 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 9 Sep 2020 08:08:16 -0700 Subject: [PATCH 52/80] clean up reduce.pyx --- python/cudf/cudf/_lib/reduce.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx index 63e3f28b450..5d5a3f5d2a7 100644 --- a/python/cudf/cudf/_lib/reduce.pyx +++ b/python/cudf/cudf/_lib/reduce.pyx @@ -31,11 +31,11 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs): A numpy data type to use for the output, defaults to the same type as the input column """ - + dtype = cudf_dtype(dtype) col_dtype = incol.dtype if reduction_op in ['sum', 'sum_of_squares', 'product']: col_dtype = find_common_type([col_dtype], [np.uint64]) - col_dtype = cudf_dtype(col_dtype) if dtype is None else cudf_dtype(dtype) + col_dtype = col_dtype if dtype is None else dtype cdef column_view c_incol_view = incol.view() cdef unique_ptr[scalar] c_result From 22b299d053c7c642e1bde282c4bb102a720a1960 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 9 Sep 2020 15:50:15 -0700 Subject: [PATCH 53/80] implement cudf::scalar -> cudf.Scalar -> Buffer, column --- python/cudf/cudf/_lib/cpp/scalar/scalar.pxd | 6 ++ python/cudf/cudf/_lib/reduce.pyx | 6 +- python/cudf/cudf/_lib/scalar.pxd | 1 + python/cudf/cudf/_lib/scalar.pyx | 93 ++++++++++++++++++++- python/cudf/cudf/core/buffer.py | 13 ++- python/cudf/cudf/core/column/column.py | 5 ++ 6 files changed, 116 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd b/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd index 3eb11c2bfd0..6b5242b8e08 100644 --- a/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd +++ b/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd @@ -23,6 +23,7 @@ cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil: numeric_scalar(T value, bool is_valid) except + void set_value(T value) except + T value() except + + T* data() except + cdef cppclass timestamp_scalar[T](scalar): timestamp_scalar() except + @@ -34,6 +35,8 @@ cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil: int64_t ticks_since_epoch_64 "ticks_since_epoch"() except + int32_t ticks_since_epoch_32 "ticks_since_epoch"() except + T value() except + + T* data() except + + cdef cppclass duration_scalar[T](scalar): duration_scalar() except + @@ -44,6 +47,8 @@ cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil: duration_scalar(int32_t value, bool is_valid) except + int64_t ticks "count"() except + T value() except + + T* data() except + + cdef cppclass string_scalar(scalar): string_scalar() except + @@ -51,3 +56,4 @@ cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil: string_scalar(string st, bool is_valid) except + string_scalar(string_scalar other) except + string to_string() except + + const char* data() except + diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx index 5d5a3f5d2a7..60780abbfb5 100644 --- a/python/cudf/cudf/_lib/reduce.pyx +++ b/python/cudf/cudf/_lib/reduce.pyx @@ -48,9 +48,9 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs): # check empty case if len(incol) <= incol.null_count: if reduction_op == 'sum' or reduction_op == 'sum_of_squares': - return incol.dtype.type(0).value + return incol.dtype.type(0) if reduction_op == 'product': - return incol.dtype.type(1).value + return incol.dtype.type(1) return np.nan with nogil: @@ -61,7 +61,7 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs): )) py_result = Scalar.from_unique_ptr(move(c_result)) - return py_result.value + return py_result def scan(scan_op, Column incol, inclusive, **kwargs): diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd index 34dfea5431a..6c8a2155c98 100644 --- a/python/cudf/cudf/_lib/scalar.pxd +++ b/python/cudf/cudf/_lib/scalar.pxd @@ -4,6 +4,7 @@ from libcpp.memory cimport unique_ptr from libcpp cimport bool from cudf._lib.cpp.scalar.scalar cimport scalar +from libc.stdint cimport uintptr_t cdef class Scalar: diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 47a0d55816c..b18edcc531e 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -15,6 +15,7 @@ from libc.stdint cimport ( ) from libcpp.memory cimport unique_ptr from libcpp cimport bool +from libc.stdint cimport uintptr_t import cudf from cudf._lib.types import cudf_to_np_types, duration_unit_map @@ -43,7 +44,6 @@ from cudf._lib.cpp.scalar.scalar cimport ( ) cimport cudf._lib.cpp.types as libcudf_types - cdef class Scalar: def __init__(self, value, dtype=None): @@ -145,6 +145,11 @@ cdef class Scalar: cdef Scalar s = Scalar.__new__(Scalar) s.c_value = move(ptr) return s + + @property + def ptr(self): + return _get_ptr_from_scalar_any(self.c_value) + cdef _set_string_from_np_string(unique_ptr[scalar]& s, value, bool valid=True): @@ -365,3 +370,89 @@ def as_scalar(val, dtype=None): return Scalar(val.value, dtype) else: return Scalar(value=val, dtype=dtype) + +cdef _get_ptr_from_scalar_any(unique_ptr[scalar]& s): + cdef scalar* s_ptr = s.get() + if not s_ptr[0].is_valid(): + return None + + cdef libcudf_types.data_type cdtype = s_ptr[0].type() + + if cdtype.id() == libcudf_types.INT8: + return int( + (s_ptr)[0].data() + ) + elif cdtype.id() == libcudf_types.INT16: + return int( + (s_ptr)[0].data() + ) + elif cdtype.id() == libcudf_types.INT32: + return int( + (s_ptr)[0].data() + ) + elif cdtype.id() == libcudf_types.INT64: + return int( + (s_ptr)[0].data() + ) + elif cdtype.id() == libcudf_types.UINT8: + return int( + (s_ptr)[0].data() + ) + elif cdtype.id() == libcudf_types.UINT16: + return int( + (s_ptr)[0].data() + ) + elif cdtype.id() == libcudf_types.UINT32: + return int( + (s_ptr)[0].data() + ) + elif cdtype.id() == libcudf_types.UINT64: + return int( + (s_ptr)[0].data() + ) + elif cdtype.id() == libcudf_types.FLOAT32: + return int( + (s_ptr)[0].data() + ) + elif cdtype.id() == libcudf_types.FLOAT64: + return int( + (s_ptr)[0].data() + ) + elif cdtype.id() == libcudf_types.BOOL8: + return int( + (s_ptr)[0].data() + ) + elif cdtype.id() == libcudf_types.TIMESTAMP_NANOSECONDS: + return int( + (s_ptr)[0].data() + ) + elif cdtype.id() == libcudf_types.TIMESTAMP_MICROSECONDS: + return int( + (s_ptr)[0].data() + ) + elif cdtype.id() == libcudf_types.TIMESTAMP_MILLISECONDS: + return int( + (s_ptr)[0].data() + ) + elif cdtype.id() == libcudf_types.TIMESTAMP_SECONDS: + return int( + (s_ptr)[0].data() + ) + elif cdtype.id() == libcudf_types.DURATION_NANOSECONDS: + return int( + (s_ptr)[0].data() + ) + elif cdtype.id() == libcudf_types.DURATION_MICROSECONDS: + return int( + (s_ptr)[0].data() + ) + elif cdtype.id() == libcudf_types.DURATION_MILLISECONDS: + return int( + (s_ptr)[0].data() + ) + elif cdtype.id() == libcudf_types.DURATION_SECONDS: + return int( + (s_ptr)[0].data() + ) + else: + raise ValueError('Could not get pointer from cudf::scalar') diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer.py index 43ef5e42106..c12a08f04f5 100644 --- a/python/cudf/cudf/core/buffer.py +++ b/python/cudf/cudf/core/buffer.py @@ -6,7 +6,7 @@ import rmm from rmm import DeviceBuffer - +import cudf from cudf.core.abc import Serializable @@ -17,9 +17,10 @@ def __init__(self, data=None, size=None, owner=None): Parameters ---------- - data : Buffer, array_like, int - An array-like object or integer representing a - device or host pointer to pre-allocated memory. + data : Buffer, array_like, int, Scalar + An array-like object, integer, or `Scalar` + representing a device or host pointer to + pre-allocated memory. size : int, optional Size of memory allocation. Required if a pointer is passed for `data`. @@ -45,6 +46,10 @@ def __init__(self, data=None, size=None, owner=None): self.ptr = data self.size = size self._owner = owner + elif isinstance(data, cudf._lib.scalar.Scalar): + self.ptr = data.ptr + self.size = data.dtype.itemsize + self._owner = data elif data is None: self.ptr = 0 self.size = 0 diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index f26c87c08ff..a10397253c9 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1616,6 +1616,8 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): data=buffer, mask=mask, dtype=arbitrary.dtype ) elif arb_dtype.kind in ("O", "U"): + import pdb + pdb.set_trace() pa_data = pa.Array.from_pandas(arbitrary) data = as_column(pa_data, dtype=cudf.dtype(pa_data.type)) # There is no cast operation available for pa.Array from int to @@ -1670,6 +1672,9 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): mask = bools_to_mask(as_column(mask).unary_operator("not")) data = data.set_mask(mask) + elif isinstance(arbitrary, cudf._lib.scalar.Scalar): + buffer = Buffer(arbitrary) + data = as_column(buffer, dtype=arbitrary.dtype) else: try: data = as_column( From 1552c0a6a414b0fe0b30ab7d9752cc3651a8afc9 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 10 Sep 2020 16:16:37 -0700 Subject: [PATCH 54/80] minor bugfixes --- python/cudf/cudf/tests/test_datetime.py | 3 +-- python/cudf/cudf/tests/test_timedelta.py | 4 +--- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index f043e045a7b..9db1a04ab2a 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -13,6 +13,7 @@ from cudf.core import DataFrame, Series from cudf.core.index import DatetimeIndex from cudf.tests.utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq +from cudf.tests.utils import _fix_nullable_dtype_repr def data1(): @@ -639,8 +640,6 @@ def test_cudf_to_datetime(data, dayfirst, infer_datetime_format): ], ) def test_to_datetime_errors(data): - from cudf.core.series import _fix_nullable_dtype_repr - pd_data = data if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)): gd_data = cudf.from_pandas(pd_data) diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 564271c217f..7e4637c63ba 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -8,7 +8,7 @@ import pytest import cudf -from cudf.tests.utils import assert_eq +from cudf.tests.utils import assert_eq, _fix_nullable_dtype_repr from cudf.utils import dtypes as dtypeutils _TIMEDELTA_DATA = [ @@ -1154,8 +1154,6 @@ def test_timedelta_invalid_ops(): def test_timedelta_datetime_cast_invalid(): - from cudf.core.series import _fix_nullable_dtype_repr - sr = cudf.Series([1, 2, 3], dtype="timedelta64[ns]") psr = sr.to_pandas() From 78caafae4907e0e152031455a2057e5b67562d66 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 10 Sep 2020 16:23:48 -0700 Subject: [PATCH 55/80] add __int__ and __float__ to scalar --- python/cudf/cudf/_lib/scalar.pyx | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index b18edcc531e..3ae4f18900d 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -149,7 +149,12 @@ cdef class Scalar: @property def ptr(self): return _get_ptr_from_scalar_any(self.c_value) - + + def __int__(self): + return int(self.value) + + def __float__(self): + return float(self.value) cdef _set_string_from_np_string(unique_ptr[scalar]& s, value, bool valid=True): From a9fe2fb292d6e42a0893cee66935e5f3b6fa932e Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 11 Sep 2020 08:59:49 -0700 Subject: [PATCH 56/80] partially implement scalar binops --- python/cudf/cudf/_lib/scalar.pyx | 56 +++++++++++++++++++++-- python/cudf/cudf/core/column/column.py | 8 ++-- python/cudf/cudf/tests/test_scalar.py | 61 +++++++++++++++++++++++++- python/cudf/cudf/utils/dtypes.py | 2 +- 4 files changed, 117 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 3ae4f18900d..d1f892cd327 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -43,7 +43,7 @@ from cudf._lib.cpp.scalar.scalar cimport ( string_scalar ) cimport cudf._lib.cpp.types as libcudf_types - +from cudf.utils.dtypes import to_cudf_compatible_scalar cdef class Scalar: def __init__(self, value, dtype=None): @@ -61,7 +61,7 @@ cdef class Scalar: A NumPy dtype. """ - value = cudf.utils.dtypes.to_cudf_compatible_scalar(value, dtype=dtype) + value = to_cudf_compatible_scalar(value, dtype=dtype) valid = value is not None @@ -133,7 +133,7 @@ cdef class Scalar: def __repr__(self): if self.value is None: - return f"Scalar({self.value}, {self.dtype.__repr__()})" + return f"Scalar(, {self.dtype.__repr__()})" else: return f"Scalar({self.value.__repr__()})" @@ -156,6 +156,56 @@ cdef class Scalar: def __float__(self): return float(self.value) + def __add__(self, other): + return self._scalar_binop(other, '__add__') + + def __sub__(self, other): + return self._scalar_binop(other, '__sub__') + + def __mul__(self, other): + return self._scalar_binop(other, '__mul__') + + def __div__(self, other): + return self._scalar_binop(other, '__div__') + + def __mod__(self, other): + return self._scalar_binop(other, '__mod__') + + def __divmod__(self, other): + return self._scalar_binop(other, '__divmod__') + + def __and__(self, other): + return self._scalar_binop(other, '__and__') + + def __xor__(self, other): + return self._scalar_binop(other, '__or__') + + def _binop_result_dtype_or_error(self, other): + + if (self.dtype.kind == 'O' and other.dtype.kind != 'O') or (self.dtype.kind != 'O' and other.dtype.kind == 'O'): + wrong_dtype = self.dtype if self.dtype.kind != 'O' else other.dtype + raise TypeError(f"Can only concatenate string (not {wrong_dtype}) to string") + + + return cudf.api.types.find_common_type([ + self.dtype, other.dtype + ]) + + def _scalar_binop(self, other, op): + other = to_cudf_compatible_scalar(other) + out_dtype = self._binop_result_dtype_or_error(other) + + valid = self.is_valid() and (isinstance(other, np.generic) or other.is_valid()) + if not valid: + return cudf.Scalar(None, dtype=out_dtype) + else: + result = self._dispatch_scalar_binop(other, op) + return Scalar(result, dtype=out_dtype) + + def _dispatch_scalar_binop(self, other, op): + if isinstance(other, Scalar): + other = other.value + return getattr(self.value, op)(other) cdef _set_string_from_np_string(unique_ptr[scalar]& s, value, bool valid=True): value = value if valid else "" diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index a10397253c9..61037d36905 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1616,8 +1616,6 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): data=buffer, mask=mask, dtype=arbitrary.dtype ) elif arb_dtype.kind in ("O", "U"): - import pdb - pdb.set_trace() pa_data = pa.Array.from_pandas(arbitrary) data = as_column(pa_data, dtype=cudf.dtype(pa_data.type)) # There is no cast operation available for pa.Array from int to @@ -1672,9 +1670,9 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): mask = bools_to_mask(as_column(mask).unary_operator("not")) data = data.set_mask(mask) - elif isinstance(arbitrary, cudf._lib.scalar.Scalar): - buffer = Buffer(arbitrary) - data = as_column(buffer, dtype=arbitrary.dtype) + #elif isinstance(arbitrary, cudf._lib.scalar.Scalar): + # buffer = Buffer(arbitrary) + # data = as_column(buffer, dtype=arbitrary.dtype) else: try: data = as_column( diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index c8fb5e40d23..09fea834f59 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -7,7 +7,7 @@ from cudf._lib.scalar import Scalar from cudf.tests.utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES - +import operator @pytest.mark.parametrize( "value", @@ -144,3 +144,62 @@ def test_date_duration_scalars(value): np.testing.assert_equal(actual, expected) assert s.is_valid() is True + +@pytest.mark.parametrize('pairs', [ + (1, 1), + (1, 1.5), + (-1.5, 1), + (1, 'a'), + ('a', 'b'), + (1.5, 'a'), + (1, False), + (False, True), + (1.5, False), + (True, 1.5), + ('a', False), +]) +@pytest.mark.parametrize('dtype_l', [ + np.dtype('uint8'), + np.dtype('uint16'), + np.dtype('uint32'), + np.dtype('uint64'), + np.dtype('int8'), + np.dtype('int16'), + np.dtype('int32'), + np.dtype('int64'), + np.dtype('float32'), + np.dtype('float64'), + np.dtype('bool'), + np.dtype('object') +]) +@pytest.mark.parametrize('dtype_r', [ + np.dtype('uint8'), + np.dtype('uint16'), + np.dtype('uint32'), + np.dtype('uint64'), + np.dtype('int8'), + np.dtype('int16'), + np.dtype('int32'), + np.dtype('int64'), + np.dtype('float32'), + np.dtype('float64'), + np.dtype('bool'), + np.dtype('object') +]) +@pytest.mark.parametrize('op', [ + operator.add, + operator.sub, + operator.mul, +]) +def test_scalar_binops_value(pairs, dtype_l, dtype_r, op): + l, r = pairs + host_value_l = dtype_l.type(l) + host_value_r = dtype_r.type(r) + + gpu_value_l = Scalar(l) + gpu_value_r = Scalar(r) + + expect = op(host_value_l, host_value_r) + got = op(gpu_value_l, gpu_value_r) + + assert expect == got.value diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index aaf37b635a2..26946f685a9 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -175,7 +175,7 @@ def to_cudf_compatible_scalar(val, dtype=None): If `val` is None, returns None. """ - if val is None: + if val is None or isinstance(val, cudf._lib.scalar.Scalar): return val if not is_scalar(val): From 455af02b92fea7e67b70e418f2324b342bb497c4 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 11 Sep 2020 13:03:30 -0700 Subject: [PATCH 57/80] partial tests for scalar binop result dtype --- python/cudf/cudf/_lib/scalar.pyx | 2 +- python/cudf/cudf/tests/test_scalar.py | 78 +++++++++++++++++++++++---- 2 files changed, 70 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index d1f892cd327..83e49f79cf2 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -197,7 +197,7 @@ cdef class Scalar: valid = self.is_valid() and (isinstance(other, np.generic) or other.is_valid()) if not valid: - return cudf.Scalar(None, dtype=out_dtype) + return Scalar(None, dtype=out_dtype) else: result = self._dispatch_scalar_binop(other, op) return Scalar(result, dtype=out_dtype) diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index 09fea834f59..3ca4b9f7a18 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -170,7 +170,7 @@ def test_date_duration_scalars(value): np.dtype('float32'), np.dtype('float64'), np.dtype('bool'), - np.dtype('object') + np.dtype('str') ]) @pytest.mark.parametrize('dtype_r', [ np.dtype('uint8'), @@ -184,7 +184,7 @@ def test_date_duration_scalars(value): np.dtype('float32'), np.dtype('float64'), np.dtype('bool'), - np.dtype('object') + np.dtype('str') ]) @pytest.mark.parametrize('op', [ operator.add, @@ -193,13 +193,73 @@ def test_date_duration_scalars(value): ]) def test_scalar_binops_value(pairs, dtype_l, dtype_r, op): l, r = pairs - host_value_l = dtype_l.type(l) - host_value_r = dtype_r.type(r) + import re + try: + host_value_l = dtype_l.type(l) + except ValueError as e: + with pytest.raises(ValueError, match=re.escape(str(e))): + gpu_value_l = Scalar(l, dtype=dtype_l) + return + try: + host_value_r = dtype_r.type(r) + except ValueError as e: + with pytest.raises(ValueError, match=re.escape(str(e))): + gpu_value_r = Scalar(r, dtype=dtype_r) + return - gpu_value_l = Scalar(l) - gpu_value_r = Scalar(r) - - expect = op(host_value_l, host_value_r) + gpu_value_l = Scalar(l, dtype=dtype_l) + gpu_value_r = Scalar(r, dtype=dtype_r) + try: + expect = op(host_value_l, host_value_r) + except np.core._exceptions.UFuncTypeError: + with pytest.raises(TypeError): + got = op(gpu_value_l, gpu_value_r) + return got = op(gpu_value_l, gpu_value_r) - assert expect == got.value + + +@pytest.mark.parametrize('dtype_l', [ + np.dtype('uint8'), + np.dtype('uint16'), + np.dtype('uint32'), + np.dtype('uint64'), + np.dtype('int8'), + np.dtype('int16'), + np.dtype('int32'), + np.dtype('int64'), + np.dtype('float32'), + np.dtype('float64'), +]) +@pytest.mark.parametrize('dtype_r', [ + np.dtype('uint8'), + np.dtype('uint16'), + np.dtype('uint32'), + np.dtype('uint64'), + np.dtype('int8'), + np.dtype('int16'), + np.dtype('int32'), + np.dtype('int64'), + np.dtype('float32'), + np.dtype('float64'), +]) +@pytest.mark.parametrize('op', [ + operator.add, + operator.sub, + operator.mul, +]) +@pytest.mark.parametrize('l_valid', [True, False]) +@pytest.mark.parametrize('r_valid', [True, False]) +def test_scalar_binops_dtype_and_validity(dtype_l, dtype_r, l_valid, r_valid, op): + l_value = 0 if l_valid else None + r_value = 0 if r_valid else None + + expect_dtype = op(dtype_l.type(0), dtype_r.type(0)).dtype + + scalar_l = Scalar(l_value, dtype=dtype_l) + scalar_r = Scalar(r_value, dtype=dtype_r) + + got = op(scalar_l, scalar_r) + + assert got.dtype == expect_dtype + assert got.is_valid() == (l_valid and r_valid) From e4c0bf1105556ac3f3d215c39f753cba5be5c027 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Sun, 13 Sep 2020 16:47:46 -0700 Subject: [PATCH 58/80] scalar binop updates --- python/cudf/cudf/_lib/scalar.pyx | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 83e49f79cf2..0abf4ded84f 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -180,6 +180,18 @@ cdef class Scalar: def __xor__(self, other): return self._scalar_binop(other, '__or__') + def __gt__(self, other): + return self._scalar_binop(other, '__gt__').value + + def __lt__(self, other): + return self._scalar_binop(other, '__gt__').value + + def __ge__(self, other): + return self._scalar_binop(other, '__ge__').value + + def __le__(self, other): + return self._scalar_binop(other, '__le__').value + def _binop_result_dtype_or_error(self, other): if (self.dtype.kind == 'O' and other.dtype.kind != 'O') or (self.dtype.kind != 'O' and other.dtype.kind == 'O'): @@ -193,8 +205,11 @@ cdef class Scalar: def _scalar_binop(self, other, op): other = to_cudf_compatible_scalar(other) - out_dtype = self._binop_result_dtype_or_error(other) + if op in ['__eq__', '__lt__', '__gt__', '__le__', '__ge__']: + out_dtype = cudf.BooleanDtype() + else: + out_dtype = self._binop_result_dtype_or_error(other) valid = self.is_valid() and (isinstance(other, np.generic) or other.is_valid()) if not valid: return Scalar(None, dtype=out_dtype) From 42828c0ef19bda2c90932fe4b0d9cc1927f9c032 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 14 Sep 2020 06:52:15 -0700 Subject: [PATCH 59/80] convert a list of cudf.Scalars into a contiguous column --- python/cudf/cudf/core/column/column.py | 6 +++--- python/cudf/cudf/core/dataframe.py | 17 +++++++++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 61037d36905..4f2bb2bb4ef 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1670,9 +1670,9 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): mask = bools_to_mask(as_column(mask).unary_operator("not")) data = data.set_mask(mask) - #elif isinstance(arbitrary, cudf._lib.scalar.Scalar): - # buffer = Buffer(arbitrary) - # data = as_column(buffer, dtype=arbitrary.dtype) + elif isinstance(arbitrary, cudf._lib.scalar.Scalar): + buffer = Buffer(arbitrary) + data = as_column(buffer, dtype=arbitrary.dtype) else: try: data = as_column( diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 71f91432612..7e3c091b51f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6261,6 +6261,10 @@ def _apply_support_method(self, method, axis=0, *args, **kwargs): result = DataFrame(index=support_result[0].index) for idx, col in enumerate(self._data.names): result[col] = support_result[idx] + elif isinstance(result[0], cudf._lib.scalar.Scalar): + result = _gpu_scalars_to_column(result) + result = cudf.Series(result) + result = result.set_index(self._data.names) else: result = Series(result) result = result.set_index(self._data.names) @@ -7067,3 +7071,16 @@ def _get_union_of_series_names(series_list): names_list = [*range(len(series_list))] return names_list + + +def _gpu_scalars_to_column(list_of_scalars): + ''' + Convert a list of cuDF scalars into a contiguous column + ''' + ind = range(len(list_of_scalars)) + cols_dict = { + k: v for k, v in zip(ind, [as_column(i) for i in list_of_scalars]) + } + + tbl = DataFrame(cols_dict) + return (tbl.T)[0]._column From 0d3d6a0e08d23308fde92f4ae511c614d118fba1 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 14 Sep 2020 08:45:48 -0700 Subject: [PATCH 60/80] migrate scalar methods to python --- python/cudf/cudf/__init__.py | 1 + python/cudf/cudf/_lib/reduce.pyx | 5 +- python/cudf/cudf/_lib/scalar.pyx | 76 -------------------------- python/cudf/cudf/core/__init__.py | 1 + python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/tests/utils.py | 4 ++ 7 files changed, 11 insertions(+), 80 deletions(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 6e644cf09be..ff02a9af1fc 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -32,6 +32,7 @@ UInt64Index, from_pandas, merge, + Scalar ) from cudf.core.dtypes import ( BooleanDtype, diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx index 60780abbfb5..0d4dd41f1ca 100644 --- a/python/cudf/cudf/_lib/reduce.pyx +++ b/python/cudf/cudf/_lib/reduce.pyx @@ -15,6 +15,7 @@ from libcpp.memory cimport unique_ptr import numpy as np from cudf.core.dtypes import dtype as cudf_dtype from cudf.api.types import find_common_type +from cudf.core.scalar import Scalar as PyScalar def reduce(reduction_op, Column incol, dtype=None, **kwargs): @@ -60,8 +61,8 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs): c_out_dtype )) - py_result = Scalar.from_unique_ptr(move(c_result)) - return py_result + cy_result = Scalar.from_unique_ptr(move(c_result)) + return PyScalar(cy_result) def scan(scan_op, Column incol, inclusive, **kwargs): diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 0abf4ded84f..93259438869 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -93,11 +93,6 @@ cdef class Scalar: f"{type(value).__name__} to cudf scalar" ) - def __eq__(self, other): - if isinstance(other, Scalar): - other = other.value - return self.value == other - @property def dtype(self): """ @@ -150,77 +145,6 @@ cdef class Scalar: def ptr(self): return _get_ptr_from_scalar_any(self.c_value) - def __int__(self): - return int(self.value) - - def __float__(self): - return float(self.value) - - def __add__(self, other): - return self._scalar_binop(other, '__add__') - - def __sub__(self, other): - return self._scalar_binop(other, '__sub__') - - def __mul__(self, other): - return self._scalar_binop(other, '__mul__') - - def __div__(self, other): - return self._scalar_binop(other, '__div__') - - def __mod__(self, other): - return self._scalar_binop(other, '__mod__') - - def __divmod__(self, other): - return self._scalar_binop(other, '__divmod__') - - def __and__(self, other): - return self._scalar_binop(other, '__and__') - - def __xor__(self, other): - return self._scalar_binop(other, '__or__') - - def __gt__(self, other): - return self._scalar_binop(other, '__gt__').value - - def __lt__(self, other): - return self._scalar_binop(other, '__gt__').value - - def __ge__(self, other): - return self._scalar_binop(other, '__ge__').value - - def __le__(self, other): - return self._scalar_binop(other, '__le__').value - - def _binop_result_dtype_or_error(self, other): - - if (self.dtype.kind == 'O' and other.dtype.kind != 'O') or (self.dtype.kind != 'O' and other.dtype.kind == 'O'): - wrong_dtype = self.dtype if self.dtype.kind != 'O' else other.dtype - raise TypeError(f"Can only concatenate string (not {wrong_dtype}) to string") - - - return cudf.api.types.find_common_type([ - self.dtype, other.dtype - ]) - - def _scalar_binop(self, other, op): - other = to_cudf_compatible_scalar(other) - - if op in ['__eq__', '__lt__', '__gt__', '__le__', '__ge__']: - out_dtype = cudf.BooleanDtype() - else: - out_dtype = self._binop_result_dtype_or_error(other) - valid = self.is_valid() and (isinstance(other, np.generic) or other.is_valid()) - if not valid: - return Scalar(None, dtype=out_dtype) - else: - result = self._dispatch_scalar_binop(other, op) - return Scalar(result, dtype=out_dtype) - - def _dispatch_scalar_binop(self, other, op): - if isinstance(other, Scalar): - other = other.value - return getattr(self.value, op)(other) cdef _set_string_from_np_string(unique_ptr[scalar]& s, value, bool valid=True): value = value if valid else "" diff --git a/python/cudf/cudf/core/__init__.py b/python/cudf/cudf/core/__init__.py index d30f949f72c..6fd61c0240f 100644 --- a/python/cudf/cudf/core/__init__.py +++ b/python/cudf/cudf/core/__init__.py @@ -22,3 +22,4 @@ ) from cudf.core.multiindex import MultiIndex from cudf.core.series import Series +from cudf.core.scalar import Scalar diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 4f2bb2bb4ef..ef03a66ef73 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1670,7 +1670,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): mask = bools_to_mask(as_column(mask).unary_operator("not")) data = data.set_mask(mask) - elif isinstance(arbitrary, cudf._lib.scalar.Scalar): + elif isinstance(arbitrary, cudf.Scalar): buffer = Buffer(arbitrary) data = as_column(buffer, dtype=arbitrary.dtype) else: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 7e3c091b51f..1e68fe8b57b 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6261,7 +6261,7 @@ def _apply_support_method(self, method, axis=0, *args, **kwargs): result = DataFrame(index=support_result[0].index) for idx, col in enumerate(self._data.names): result[col] = support_result[idx] - elif isinstance(result[0], cudf._lib.scalar.Scalar): + elif isinstance(result[0], cudf.Scalar): result = _gpu_scalars_to_column(result) result = cudf.Series(result) result = result.set_index(self._data.names) diff --git a/python/cudf/cudf/tests/utils.py b/python/cudf/cudf/tests/utils.py index f5646f67dde..def40067acf 100644 --- a/python/cudf/cudf/tests/utils.py +++ b/python/cudf/cudf/tests/utils.py @@ -93,6 +93,10 @@ def assert_eq(left, right, **kwargs): else: assert np.array_equal(left, right) else: + if isinstance(left, cudf._lib.scalar.Scalar): + left = left.value + if isinstance(right, cudf._lib.scalar.Scalar): + right = right.value if left == right: return True else: From 63e1387f3ab927bbc9ab2d523a1b99d25d549ec2 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 14 Sep 2020 12:55:25 -0700 Subject: [PATCH 61/80] actually include scalar.py and update tests --- python/cudf/cudf/core/scalar.py | 114 ++++++++++++++++++++++++++ python/cudf/cudf/tests/test_scalar.py | 24 ++++-- 2 files changed, 129 insertions(+), 9 deletions(-) create mode 100644 python/cudf/cudf/core/scalar.py diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py new file mode 100644 index 00000000000..3a620880229 --- /dev/null +++ b/python/cudf/cudf/core/scalar.py @@ -0,0 +1,114 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. +import cudf._lib as libcudf +from cudf.utils.dtypes import to_cudf_compatible_scalar +from cudf.core.dtypes import BooleanDtype +from cudf.api.types import find_common_type +import numpy as np + +class Scalar(libcudf.scalar.Scalar): + def __init__(self, value, dtype=None): + if isinstance(value, libcudf.scalar.Scalar): + if dtype and not value.dtype == dtype: + raise TypeError + self._data = value + else: + self._data = libcudf.scalar.Scalar(value, dtype=dtype) + + @property + def value(self): + return self._data.value + + @property + def ptr(self): + return self._data.ptr + + @property + def dtype(self): + return self._data.dtype + + @property + def is_valid(self): + return self._data.is_valid + + def __int__(self): + return int(self.value) + + def __float__(self): + return float(self.value) + + def __bool__(self): + return bool(self.value) + + def __add__(self, other): + return self._scalar_binop(other, "__add__") + + def __sub__(self, other): + return self._scalar_binop(other, "__sub__") + + def __mul__(self, other): + return self._scalar_binop(other, "__mul__") + + def __truediv__(self, other): + return self._scalar_binop(other, "__truediv__") + + def __mod__(self, other): + return self._scalar_binop(other, "__mod__") + + def __divmod__(self, other): + return self._scalar_binop(other, "__divmod__") + + def __and__(self, other): + return self._scalar_binop(other, "__and__") + + def __xor__(self, other): + return self._scalar_binop(other, "__or__") + + def __gt__(self, other): + return self._scalar_binop(other, "__gt__").value + + def __lt__(self, other): + return self._scalar_binop(other, "__lt__").value + + def __ge__(self, other): + return self._scalar_binop(other, "__ge__").value + + def __le__(self, other): + return self._scalar_binop(other, "__le__").value + + def __eq__(self, other): + return self._scalar_binop(other, '__eq__').value + + def _binop_result_dtype_or_error(self, other, op): + + if (self.dtype.kind == "O" and other.dtype.kind != "O") or ( + self.dtype.kind != "O" and other.dtype.kind == "O" + ): + wrong_dtype = self.dtype if self.dtype.kind != "O" else other.dtype + raise TypeError( + f"Can only concatenate string (not {wrong_dtype}) to string" + ) + if (self.dtype.kind == "O" or other.dtype.kind == "O") and op != "__add__": + raise TypeError(f"{op} is not supported for string type scalars") + + return find_common_type([self.dtype, other.dtype]) + + def _scalar_binop(self, other, op): + other = to_cudf_compatible_scalar(other) + + if op in ["__eq__", "__lt__", "__gt__", "__le__", "__ge__"]: + out_dtype = BooleanDtype() + else: + out_dtype = self._binop_result_dtype_or_error(other, op) + valid = self.is_valid() and ( + isinstance(other, np.generic) or other.is_valid() + ) + if not valid: + return Scalar(None, dtype=out_dtype) + else: + result = self._dispatch_scalar_binop(other, op) + return Scalar(result, dtype=out_dtype) + + def _dispatch_scalar_binop(self, other, op): + if isinstance(other, Scalar): + other = other.value + return getattr(self.value, op)(other) diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index 3ca4b9f7a18..04cbe501f2e 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -5,7 +5,7 @@ import pandas as pd import pytest -from cudf._lib.scalar import Scalar +from cudf import Scalar from cudf.tests.utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES import operator @@ -190,28 +190,34 @@ def test_date_duration_scalars(value): operator.add, operator.sub, operator.mul, + operator.truediv ]) def test_scalar_binops_value(pairs, dtype_l, dtype_r, op): - l, r = pairs + lval, rval = pairs + if (isinstance(lval, str) and dtype_l != np.dtype('str')) or (isinstance(rval, str) and dtype_r != np.dtype('str')): + pytest.skip("Invalid scalar/dtype combination") + + + import re try: - host_value_l = dtype_l.type(l) + host_value_l = dtype_l.type(lval) except ValueError as e: with pytest.raises(ValueError, match=re.escape(str(e))): - gpu_value_l = Scalar(l, dtype=dtype_l) + gpu_value_l = Scalar(lval, dtype=dtype_l) return try: - host_value_r = dtype_r.type(r) + host_value_r = dtype_r.type(rval) except ValueError as e: with pytest.raises(ValueError, match=re.escape(str(e))): - gpu_value_r = Scalar(r, dtype=dtype_r) + gpu_value_r = Scalar(rval, dtype=dtype_r) return - gpu_value_l = Scalar(l, dtype=dtype_l) - gpu_value_r = Scalar(r, dtype=dtype_r) + gpu_value_l = Scalar(lval, dtype=dtype_l) + gpu_value_r = Scalar(rval, dtype=dtype_r) try: expect = op(host_value_l, host_value_r) - except np.core._exceptions.UFuncTypeError: + except TypeError: with pytest.raises(TypeError): got = op(gpu_value_l, gpu_value_r) return From 2005d65f73af51169af6cd83b0c3b331f6db3937 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 14 Sep 2020 13:21:08 -0700 Subject: [PATCH 62/80] fix the rest of test_reductions.py --- python/cudf/cudf/core/scalar.py | 18 ++++++++++++++++++ python/cudf/cudf/tests/test_reductions.py | 4 ++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 3a620880229..92f0f3ac269 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -42,15 +42,27 @@ def __bool__(self): def __add__(self, other): return self._scalar_binop(other, "__add__") + def __radd__(self, other): + return self._scalar_binop(other, '__radd__') + def __sub__(self, other): return self._scalar_binop(other, "__sub__") + def __rsub__(self, other): + return self._scalar_binop(other, "__rsub__") + def __mul__(self, other): return self._scalar_binop(other, "__mul__") + def __rmul__(self, other): + return self._scalar_binop(other, "__rmul__") + def __truediv__(self, other): return self._scalar_binop(other, "__truediv__") + def __rtruediv__(self, other): + return self._scalar_binop(other, "__rtruediv__") + def __mod__(self, other): return self._scalar_binop(other, "__mod__") @@ -78,6 +90,9 @@ def __le__(self, other): def __eq__(self, other): return self._scalar_binop(other, '__eq__').value + def __abs__(self): + return self._scalar_unaop('__abs__') + def _binop_result_dtype_or_error(self, other, op): if (self.dtype.kind == "O" and other.dtype.kind != "O") or ( @@ -112,3 +127,6 @@ def _dispatch_scalar_binop(self, other, op): if isinstance(other, Scalar): other = other.value return getattr(self.value, op)(other) + + def _scalar_unaop(self, op): + return Scalar(getattr(self.value, op)()) diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index 9da3af36763..88ffa9a036f 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -81,7 +81,7 @@ def test_sum_of_squares(dtype, nelem): if np.dtype(dtype).kind in {"u", "i"}: if 0 <= expect <= np.iinfo(dtype).max: - np.testing.assert_array_almost_equal(expect, got) + np.testing.assert_array_almost_equal(expect, got.value) else: print("overflow, passing") else: @@ -130,7 +130,7 @@ def test_sum_masked(nelem): expect = data[res_mask].sum() significant = 4 if dtype == np.float32 else 6 - np.testing.assert_approx_equal(expect, got, significant=significant) + np.testing.assert_approx_equal(expect, got.value, significant=significant) def test_sum_boolean(): From 523919cef1eb1da226e16ce661fb3ec7fc1654c1 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 14 Sep 2020 13:52:35 -0700 Subject: [PATCH 63/80] fix indexing error --- python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/core/dtypes.py | 2 +- python/cudf/cudf/core/scalar.py | 5 ++++- python/cudf/cudf/utils/cudautils.py | 2 +- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index ef03a66ef73..9537371bb64 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -600,7 +600,7 @@ def element_indexing(self, index): def __getitem__(self, arg): - if isinstance(arg, Number): + if isinstance(arg, (Number, cudf.Scalar)): arg = int(arg) return self.element_indexing(arg) elif isinstance(arg, slice): diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index ac55b34afc4..b6b343d04f3 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -268,7 +268,7 @@ def __init__(self, parent_dtype): self.parent_dtype = parent_dtype def __call__(self, arg): - return cudf._lib.scalar.Scalar(arg, dtype=self.parent_dtype) + return cudf.Scalar(arg, dtype=self.parent_dtype) def cudf_dtype_from_string(obj): diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 92f0f3ac269..43bf24deee6 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -90,6 +90,9 @@ def __le__(self, other): def __eq__(self, other): return self._scalar_binop(other, '__eq__').value + def __ne__(self, other): + return self._scalar_binop(other, "__ne__").value + def __abs__(self): return self._scalar_unaop('__abs__') @@ -110,7 +113,7 @@ def _binop_result_dtype_or_error(self, other, op): def _scalar_binop(self, other, op): other = to_cudf_compatible_scalar(other) - if op in ["__eq__", "__lt__", "__gt__", "__le__", "__ge__"]: + if op in ["__eq__", "__ne__", "__lt__", "__gt__", "__le__", "__ge__"]: out_dtype = BooleanDtype() else: out_dtype = self._binop_result_dtype_or_error(other, op) diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py index d1abf981a19..c8fb0b7a2ec 100755 --- a/python/cudf/cudf/utils/cudautils.py +++ b/python/cudf/cudf/utils/cudautils.py @@ -186,7 +186,7 @@ def find_first(arr, val, mask=None, compare="eq"): found_col = found_col.find_and_replace([arr.size], [None], True) min_index = found_col.min() - return -1 if min_index is None or np.isnan(min_index) else min_index + return -1 if min_index is None or cudf.api.types.isnan(min_index) else min_index def find_last(arr, val, mask=None, compare="eq"): From c730301423ffd04459c723d10729307f63f56817 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 14 Sep 2020 14:35:04 -0700 Subject: [PATCH 64/80] fix as_scalar --- python/cudf/cudf/_lib/scalar.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 93259438869..ead3097f3b4 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -357,6 +357,8 @@ cdef _get_np_scalar_from_timedelta64(unique_ptr[scalar]& s): def as_scalar(val, dtype=None): dtype = cudf.dtype(dtype) + if isinstance(val, cudf.Scalar): + return as_scalar(val._data, dtype=dtype) if isinstance(val, Scalar): if (dtype is None or dtype == val.dtype): return val From c5450c2d588f7ca02628a74e6759447ae957659d Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 14 Sep 2020 15:01:58 -0700 Subject: [PATCH 65/80] remove unecessary code --- python/cudf/cudf/core/dtypes.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index b6b343d04f3..99528c6bde6 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -80,28 +80,19 @@ def __init__(self): class Integer(Number): - def __init__(self): - self._raise_construction_error() - + pass class SignedInteger(Integer): - def __init__(self): - self._raise_construction_error() - + pass class UnsignedInteger(Integer): - def __init__(self): - self._raise_construction_error() - + pass class Inexact(Number): - def __init__(self): - self._raise_construction_error() - + pass class Floating(Inexact): - def __init__(self): - self._raise_construction_error() + pass @property def kind(self): From 7bc08936103fb3c35cf1de36db9cdb2a1488c3bc Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 14 Sep 2020 15:02:15 -0700 Subject: [PATCH 66/80] minor bugfixes --- python/cudf/cudf/utils/cudautils.py | 2 +- python/cudf/cudf/utils/utils.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py index c8fb0b7a2ec..1c3f4e773c6 100755 --- a/python/cudf/cudf/utils/cudautils.py +++ b/python/cudf/cudf/utils/cudautils.py @@ -207,7 +207,7 @@ def find_last(arr, val, mask=None, compare="eq"): found_col = found_col.find_and_replace([arr.size], [None], True) max_index = found_col.max() - return -1 if max_index is None or np.isnan(max_index) else max_index + return -1 if max_index is None or cudf.api.types.isnan(max_index) else max_index @cuda.jit diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index c8193c7226a..35efdf94260 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -74,17 +74,17 @@ def scalar_broadcast_to(scalar, size, dtype=None): if isinstance(scalar, pd.Categorical): return scalar_broadcast_to(scalar.categories[0], size).astype(dtype) - scalar = to_cudf_compatible_scalar(scalar, dtype=dtype) + scalar = cudf.Scalar(to_cudf_compatible_scalar(scalar, dtype=dtype)) dtype = scalar.dtype - if np.dtype(dtype).kind in ("O", "U"): + if dtype.kind in ("O", "U"): gather_map = column.full(size, 0, dtype="int32") scalar_str_col = column.as_column([scalar], dtype="str") return scalar_str_col[gather_map] else: out_col = column.column_empty(size, dtype=dtype) if out_col.size != 0: - out_col.data_array_view[:] = scalar + out_col.data_array_view[:] = scalar.value return out_col From a3a48934761203d86391b2b08f6e3a4b1a98bca8 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 15 Sep 2020 11:58:59 -0700 Subject: [PATCH 67/80] scalar plumbing, cudf.api.types additions, bug fixes --- python/cudf/cudf/_lib/copying.pyx | 2 +- python/cudf/cudf/api/types.py | 33 ++++++++++++++++++----- python/cudf/cudf/core/column/numerical.py | 22 ++++++++------- python/cudf/cudf/core/scalar.py | 3 +++ python/cudf/cudf/utils/dtypes.py | 16 +++++------ python/cudf/cudf/utils/utils.py | 2 +- 6 files changed, 52 insertions(+), 26 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index cab42bce789..9174d611166 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -204,7 +204,7 @@ def _scatter_scalar(scalars, Column scatter_map, cdef bool c_bounds_check = bounds_check cdef Scalar slr for val, col in zip(scalars, target_table._columns): - slr = as_scalar(val, col.dtype.to_numpy) + slr = as_scalar(val, col.dtype) source_scalars.push_back(move(slr.c_value)) cdef column_view scatter_map_view = scatter_map.view() cdef table_view target_table_view = target_table.data_view() diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 484b9f1bfd1..b82f6ac6145 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -142,13 +142,15 @@ def find_common_type(array_types=[], scalar_types=[]): return cudf.dtype(np.find_common_type(array_types, scalar_types)) -def can_cast(dtype_l, dtype_r): - if isinstance(dtype_l, cudf.Generic): - dtype_l = dtype_l.to_numpy - if isinstance(dtype_r, cudf.Generic): - dtype_r = dtype_r.to_numpy +def can_cast(from_, to, casting='safe'): + if isinstance(from_, cudf.Generic): + from_ = from_.to_numpy + elif isinstance(from_, cudf.Scalar): + from_ = from_.value + if isinstance(to, cudf.Generic): + to = to.to_numpy - return np.can_cast(dtype_l, dtype_r) + return np.can_cast(from_, to, casting=casting) def result_type(*arrays_and_dtypes): @@ -163,3 +165,22 @@ def isnan(obj): if isinstance(obj, cudf._lib.scalar.Scalar): obj = obj.value return np.isnan(obj) + +def min_scalar_type(a): + if isinstance(a, cudf.Scalar): + a = a.value + result = np.min_scalar_type(a) + if result == np.dtype('float16'): + return cudf.Float32Dtype() + return cudf.dtype(result) + +def promote_types(type1, type2): + if isinstance(type1, cudf.Generic): + type1 = type1.to_numpy + if isinstance(type2, cudf.Generic): + type2 = type2.to_numpy + + result = np.promote_types(type1, type2) + if result == np.dtype('float16'): + return cudf.Float32Dtype() + return cudf.dtype(result) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index e56f87aac21..45fc042a416 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -51,15 +51,17 @@ def __contains__(self, item): # Handles improper item types # Fails if item is of type None, so the handler. try: - if np.can_cast(item, self.data_array_view.dtype): - item = self.data_array_view.dtype.type(item) + if cudf.api.types.can_cast(item, self.dtype): + if isinstance(item, cudf.Scalar): + item = item.value + item = cudf.Scalar(item, dtype=self.dtype) else: return False except (TypeError, ValueError): return False # TODO: Use `scalar`-based `contains` wrapper return libcudf.search.contains( - self, column.as_column([item], dtype=self.dtype) + self, column.as_column(item, dtype=self.dtype) ).any() def unary_operator(self, unaryop): @@ -105,16 +107,16 @@ def _apply_scan_op(self, op): def normalize_binop_value(self, other): if other is None: return other - other_dtype = np.min_scalar_type(other) + other_dtype = cudf.api.types.min_scalar_type(other) if other_dtype.kind in {"b", "i", "u", "f"}: - other_dtype = np.promote_types(self.dtype.to_numpy, other_dtype) + other_dtype = cudf.api.types.promote_types(self.dtype, other_dtype) if other_dtype == np.dtype("float16"): other = np.dtype("float32").type(other) other_dtype = other.dtype if self.dtype.kind == "b": other_dtype = min_signed_type(other) - if np.isscalar(other): - other = np.dtype(other_dtype).type(other) + if np.isscalar(other) or isinstance(other, cudf.Scalar): + other = cudf.Scalar(other, dtype=other_dtype) return other else: ary = utils.scalar_broadcast_to( @@ -169,9 +171,8 @@ def as_numerical_column(self, dtype, **kwargs): if dtype == self.dtype: return self if dtype is None: - import pdb - - pdb.set_trace() + # dtype = None can cause segfault here + raise TypeError('libcudf.unary.cast requires a dtype') return libcudf.unary.cast(self, dtype) def sum(self, dtype=None): @@ -455,6 +456,7 @@ def _safe_cast_to_int(col, dtype): def _normalize_find_and_replace_input(input_column_dtype, col_to_normalize): + normalized_column = column.as_column( col_to_normalize, dtype=input_column_dtype if len(col_to_normalize) <= 0 else None, diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 43bf24deee6..ddf2b3f5211 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -75,6 +75,9 @@ def __and__(self, other): def __xor__(self, other): return self._scalar_binop(other, "__or__") + def __pow__(self, other): + return self._scalar_binop(other, "__pow__") + def __gt__(self, other): return self._scalar_binop(other, "__gt__").value diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 26946f685a9..1b8a2d28cb2 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -330,25 +330,25 @@ def min_column_type(x, expected_type): If the column is not a subtype of `np.signedinteger` or `np.floating` returns the same dtype as the dtype of `x` without modification """ + + expected_type = cudf.dtype(expected_type) if not isinstance(x, cudf.core.column.NumericalColumn): raise TypeError("Argument x must be of type column.NumericalColumn") if x.valid_count == 0: return x.dtype - x_np_dtype = x.dtype.to_numpy - expected_type = cudf.dtype(expected_type).to_numpy - if np.issubdtype(x_np_dtype, np.floating): - max_bound_dtype = np.min_scalar_type(x.max()) - min_bound_dtype = np.min_scalar_type(x.min()) + if isinstance(x.dtype, cudf.Floating): + max_bound_dtype = np.min_scalar_type(x.max().value) + min_bound_dtype = np.min_scalar_type(x.min().value) result_type = np.promote_types(max_bound_dtype, min_bound_dtype) if result_type == np.dtype("float16"): # cuDF does not support float16 dtype result_type = np.dtype("float32") return cudf.dtype(result_type) - if np.issubdtype(expected_type, np.integer): - max_bound_dtype = np.min_scalar_type(x.max()) - min_bound_dtype = np.min_scalar_type(x.min()) + if isinstance(expected_type, cudf.Integer): + max_bound_dtype = np.min_scalar_type(x.max().value) + min_bound_dtype = np.min_scalar_type(x.min().value) result = np.promote_types(max_bound_dtype, min_bound_dtype) return cudf.dtype(result) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 35efdf94260..fcf86b6bf82 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -79,7 +79,7 @@ def scalar_broadcast_to(scalar, size, dtype=None): if dtype.kind in ("O", "U"): gather_map = column.full(size, 0, dtype="int32") - scalar_str_col = column.as_column([scalar], dtype="str") + scalar_str_col = column.as_column([scalar.value], dtype="str") return scalar_str_col[gather_map] else: out_col = column.column_empty(size, dtype=dtype) From 6bf121c25bf1bedaf0605155282f21bba7b0df23 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 15 Sep 2020 15:03:35 -0700 Subject: [PATCH 68/80] add cudf.api.types.isscalar(element) --- python/cudf/cudf/api/types.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index b82f6ac6145..192e708d147 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -184,3 +184,6 @@ def promote_types(type1, type2): if result == np.dtype('float16'): return cudf.Float32Dtype() return cudf.dtype(result) + +def isscalar(element): + return isinstance(element, cudf._lib.scalar.Scalar) or np.isscalar(element) From 165f86c1b90c836d77841fa21447023368822240 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 15 Sep 2020 15:04:24 -0700 Subject: [PATCH 69/80] plumbing --- python/cudf/cudf/core/column/numerical.py | 6 +++--- python/cudf/cudf/core/dtypes.py | 4 ++-- python/cudf/cudf/core/series.py | 4 +++- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 45fc042a416..715160e1b05 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -82,13 +82,13 @@ def binary_operator(self, binop, rhs, reflect=False): if reflect: tmp = self if isinstance(rhs, (NumericalColumn, Scalar)) or np.isscalar(rhs): - out_dtype = np.result_type( - cudf.dtype(self.dtype).to_numpy, cudf.dtype(rhs.dtype).to_numpy + out_dtype = cudf.api.types.result_type( + self.dtype, rhs.dtype ) out_dtype = cudf.dtype(out_dtype) if binop in ["mod", "floordiv"]: if (cudf.dtype(tmp.dtype) in int_dtypes) and ( - (np.isscalar(tmp) and (0 == tmp)) + (cudf.api.types.isscalar(tmp) and (0 == tmp)) or ((isinstance(tmp, NumericalColumn)) and (0.0 in tmp)) ): out_dtype = cudf.Float64Dtype() diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 99528c6bde6..41d4f669061 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -22,8 +22,8 @@ def __eq__(self, other): ): return False if ( - isinstance(other, type(self.to_pandas)) - or other is type(self.to_pandas) + isinstance(other, self.to_pandas.type) + or other is self.to_pandas ): return True diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 8448f67f618..cc4555d9155 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4029,7 +4029,9 @@ def describe_numeric(self): + [self.max()] ) data = _format_stats_values(data) - + for i, d in enumerate(data): + if isinstance(d, cudf.Scalar): + data[i] = d.value return Series( data=data, index=names, nan_as_null=False, name=self.name, ) From cec9528ecb96f2b28309c186a28a9279c2fcfca8 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 15 Sep 2020 15:05:16 -0700 Subject: [PATCH 70/80] scalars may __round__ --- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/scalar.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1e68fe8b57b..d3b81296fda 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7079,7 +7079,7 @@ def _gpu_scalars_to_column(list_of_scalars): ''' ind = range(len(list_of_scalars)) cols_dict = { - k: v for k, v in zip(ind, [as_column(i) for i in list_of_scalars]) + k: v for k, v in zip(ind, [as_column(cudf.Scalar(i)) for i in list_of_scalars]) } tbl = DataFrame(cols_dict) diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index ddf2b3f5211..0c35833c91e 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -99,6 +99,9 @@ def __ne__(self, other): def __abs__(self): return self._scalar_unaop('__abs__') + def __round__(self, n): + return self._scalar_binop(n, '__round__') + def _binop_result_dtype_or_error(self, other, op): if (self.dtype.kind == "O" and other.dtype.kind != "O") or ( From a8b380b763d61a82cd54a5f0ce9ca5570101d1c4 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 16 Sep 2020 08:47:39 -0700 Subject: [PATCH 71/80] to_numpy -> numpy_dtype --- python/cudf/cudf/_lib/aggregation.pyx | 2 +- python/cudf/cudf/_lib/transform.pyx | 2 +- python/cudf/cudf/api/types.py | 14 +++++++------- python/cudf/cudf/core/column/column.py | 10 +++++----- python/cudf/cudf/core/column/datetime.py | 6 +++--- python/cudf/cudf/core/column/numerical.py | 16 ++++++++-------- python/cudf/cudf/core/column/timedelta.py | 10 +++++----- python/cudf/cudf/core/dataframe.py | 6 +++--- python/cudf/cudf/core/dtypes.py | 6 ++++-- python/cudf/cudf/core/indexing.py | 4 ++-- python/cudf/cudf/core/join/join.py | 4 ++-- python/cudf/cudf/core/series.py | 2 +- python/cudf/cudf/core/tools/datetimes.py | 2 +- python/cudf/cudf/tests/test_avro.py | 2 +- python/cudf/cudf/tests/test_orc.py | 2 +- python/cudf/cudf/utils/dtypes.py | 6 +++--- python/cudf/cudf/utils/utils.py | 2 +- 17 files changed, 49 insertions(+), 47 deletions(-) diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 19634d78061..96b948d65e8 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -243,7 +243,7 @@ cdef class _AggregationFactory: cdef string cpp_str # Handling UDF type - nb_type = numpy_support.from_dtype(kwargs['dtype'].to_numpy) + nb_type = numpy_support.from_dtype(kwargs['dtype'].numpy_dtype) type_signature = (nb_type[:],) compiled_op = cudautils.compile_udf(op, type_signature) output_np_dtype = cudf_dtype(np.dtype(compiled_op[1])) diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index 8fafa166471..cc839659b13 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -97,7 +97,7 @@ def transform(Column input, op): cdef type_id c_tid cdef data_type c_dtype - nb_type = numpy_support.from_dtype(input.dtype.to_numpy) + nb_type = numpy_support.from_dtype(input.dtype.numpy_dtype) nb_signature = (nb_type,) compiled_op = cudautils.compile_udf(op, nb_signature) c_str = compiled_op[0].encode('UTF-8') diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 192e708d147..92cc561a1c8 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -133,10 +133,10 @@ def is_list_dtype(obj): def find_common_type(array_types=[], scalar_types=[]): array_types = [ - d.to_numpy if isinstance(d, cudf.Generic) else d for d in array_types + d.numpy_dtype if isinstance(d, cudf.Generic) else d for d in array_types ] scalar_types = [ - d.to_numpy if isinstance(d, cudf.Generic) else d for d in scalar_types + d.numpy_dtype if isinstance(d, cudf.Generic) else d for d in scalar_types ] return cudf.dtype(np.find_common_type(array_types, scalar_types)) @@ -144,11 +144,11 @@ def find_common_type(array_types=[], scalar_types=[]): def can_cast(from_, to, casting='safe'): if isinstance(from_, cudf.Generic): - from_ = from_.to_numpy + from_ = from_.numpy_dtype elif isinstance(from_, cudf.Scalar): from_ = from_.value if isinstance(to, cudf.Generic): - to = to.to_numpy + to = to.numpy_dtype return np.can_cast(from_, to, casting=casting) @@ -156,7 +156,7 @@ def can_cast(from_, to, casting='safe'): def result_type(*arrays_and_dtypes): arrays_and_dtypes = ( - d.to_numpy if isinstance(d, cudf.Generic) else d + d.numpy_dtype if isinstance(d, cudf.Generic) else d for d in arrays_and_dtypes ) return cudf.dtype(np.result_type(*arrays_and_dtypes)) @@ -176,9 +176,9 @@ def min_scalar_type(a): def promote_types(type1, type2): if isinstance(type1, cudf.Generic): - type1 = type1.to_numpy + type1 = type1.numpy_dtype if isinstance(type2, cudf.Generic): - type2 = type2.to_numpy + type2 = type2.numpy_dtype result = np.promote_types(type1, type2) if result == np.dtype('float16'): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 9537371bb64..3a3446d4b7f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -98,7 +98,7 @@ def data_array_view(self): result = cuda.devicearray.DeviceNDArray( shape=(result.nbytes // dtype.itemsize,), strides=(dtype.itemsize,), - dtype=dtype.to_numpy, + dtype=dtype.numpy_dtype, gpu_data=result.gpu_data, ) return result @@ -149,7 +149,7 @@ def values(self): Return a CuPy representation of the Column. """ if len(self) == 0: - return cupy.asarray([], dtype=self.dtype.to_numpy) + return cupy.asarray([], dtype=self.dtype.numpy_dtype) if self.has_nulls: raise ValueError("Column must have no nulls.") @@ -1098,7 +1098,7 @@ def __cuda_array_interface__(self): output = { "shape": (len(self),), "strides": (self.dtype.itemsize,), - "typestr": self.dtype.to_numpy.str, + "typestr": self.dtype.numpy_dtype.str, "data": (self.data_ptr, False), "version": 1, } @@ -1573,7 +1573,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): arbitrary = np.ascontiguousarray(arbitrary) if dtype is not None: - arbitrary = arbitrary.astype(dtype.to_numpy) + arbitrary = arbitrary.astype(dtype.numpy_dtype) if arb_dtype.kind == "M": @@ -1728,7 +1728,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): data = as_column(sr, nan_as_null=nan_as_null) else: native_dtype = ( - dtype.to_numpy if dtype is not None else None + dtype.numpy_dtype if dtype is not None else None ) if dtype is None and pd.api.types.infer_dtype( arbitrary diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 3894b5dd0dc..8b80f609a74 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -123,7 +123,7 @@ def normalize_binop_value(self, other): if np.isnat(other): return as_scalar(val=None, dtype=self.dtype) - other = other.astype(self.dtype.to_numpy) + other = other.astype(self.dtype.numpy_dtype) return as_scalar(other) elif isinstance(other, np.timedelta64): other_time_unit = cudf.utils.dtypes.get_time_unit(other) @@ -265,8 +265,8 @@ def can_cast_safely(self, to_dtype): to_dtype = cudf.dtype(to_dtype) if isinstance(to_dtype, cudf.Datetime): - to_res, _ = np.datetime_data(to_dtype.to_numpy) - self_res, _ = np.datetime_data(self.dtype.to_numpy) + to_res, _ = np.datetime_data(to_dtype.numpy_dtype) + self_res, _ = np.datetime_data(self.dtype.numpy_dtype) max_int = np.iinfo(np.dtype("int64")).max diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 715160e1b05..3072f513f40 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -234,9 +234,9 @@ def default_na_value(self): if dkind == "f": return self.dtype.type(np.nan).value elif dkind == "i": - return np.iinfo(self.dtype.to_numpy).min + return np.iinfo(self.dtype.numpy_dtype).min elif dkind == "u": - return np.iinfo(self.dtype.to_numpy).max + return np.iinfo(self.dtype.numpy_dtype).max elif dkind == "b": return self.dtype.type(False) else: @@ -280,7 +280,7 @@ def fillna(self, fill_value): if np.isscalar(fill_value) and not isinstance(fill_value, libcudf.scalar.Scalar): # castsafely to the same dtype as self # TODO - produce a libcudf scalar directly - fill_value_casted = self.dtype.to_numpy.type(fill_value) + fill_value_casted = self.dtype.numpy_dtype.type(fill_value) if not np.isnan(fill_value) and (fill_value_casted != fill_value): raise TypeError( "Cannot safely cast non-equivalent {} to {}".format( @@ -360,14 +360,14 @@ def can_cast_safely(self, to_dtype): """ if self.dtype.kind == to_dtype.kind: # todo: implement >, < for cudf.Dtype - if self.dtype.to_numpy <= to_dtype.to_numpy: + if self.dtype.numpy_dtype <= to_dtype.numpy_dtype: return True else: # Kinds are the same but to_dtype is smaller if isinstance(to_dtype, cudf.Floating): - info = np.finfo(to_dtype.to_numpy) + info = np.finfo(to_dtype.numpy_dtype) elif isinstance(to_dtype, cudf.Integer): - info = np.iinfo(to_dtype.to_numpy) + info = np.iinfo(to_dtype.numpy_dtype) min_, max_ = info.min, info.max if (self.min() > min_) and (self.max() < max_): @@ -377,7 +377,7 @@ def can_cast_safely(self, to_dtype): # want to cast int to float elif to_dtype.kind == "f" and self.dtype.kind in {"i", "u"}: - info = np.finfo(to_dtype.to_numpy) + info = np.finfo(to_dtype.numpy_dtype) biggest_exact_int = 2 ** (info.nmant + 1) if (self.min() >= -biggest_exact_int) and ( self.max() <= biggest_exact_int @@ -396,7 +396,7 @@ def can_cast_safely(self, to_dtype): # want to cast float to int: elif to_dtype.kind in {"i", "u"} and self.dtype.kind == "f": - info = np.iinfo(to_dtype.to_numpy) + info = np.iinfo(to_dtype.numpy_dtype) min_, max_ = info.min, info.max # best we can do is hope to catch it here and avoid compare if (self.min() >= min_) and (self.max() <= max_): diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 60b3f027efe..24b0f01c4cd 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -94,11 +94,11 @@ def _binary_op_floordiv(self, rhs): if isinstance(rhs, Scalar): rhs = np.timedelta64(rhs.value) - rhs = rhs.astype(common_dtype.to_numpy).astype("float64") + rhs = rhs.astype(common_dtype.numpy_dtype).astype("float64") else: rhs = as_scalar(None, "float64") else: - rhs = rhs.astype(common_dtype.to_numpy).astype("float64") + rhs = rhs.astype(common_dtype.numpy_dtype).astype("float64") out_dtype = cudf.Int64Dtype() elif rhs.dtype.kind in ("f", "i", "u"): @@ -163,7 +163,7 @@ def _binary_op_truediv(self, rhs): if isinstance(rhs, Scalar): rhs = np.timedelta64(rhs.value) - rhs = rhs.astype(common_dtype.to_numpy).astype("float64") + rhs = rhs.astype(common_dtype.numpy_dtype).astype("float64") else: rhs = as_scalar(None, "float64") else: @@ -226,7 +226,7 @@ def normalize_binop_value(self, other): other = other.astype("timedelta64[s]") else: common_dtype = determine_out_dtype(self.dtype, other.dtype) - other = other.astype(common_dtype.to_numpy) + other = other.astype(common_dtype.numpy_dtype) return as_scalar(other) elif np.isscalar(other): return as_scalar(other) @@ -258,7 +258,7 @@ def fillna(self, fill_value): if is_scalar(fill_value): if isinstance(fill_value, np.timedelta64): dtype = determine_out_dtype(self.dtype, fill_value.dtype) - fill_value = fill_value.astype(dtype.to_numpy) + fill_value = fill_value.astype(dtype.numpy_dtype) col = col.astype(dtype) elif not isinstance(fill_value, Scalar): fill_value = np.timedelta64(fill_value) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d3b81296fda..e4bb2c6ec69 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3189,7 +3189,7 @@ def as_gpu_matrix(self, columns=None, order="F"): "hint: use .fillna() to replace null values" ) raise ValueError(errmsg.format(k)) - cupy_dtype = dtype.to_numpy + cupy_dtype = dtype.numpy_dtype if np.issubdtype(cupy_dtype, np.datetime64): cupy_dtype = np.dtype("int64") @@ -4909,9 +4909,9 @@ def to_records(self, index=True): ------- numpy recarray """ - members = [("index", self.index.dtype.to_numpy)] if index else [] + members = [("index", self.index.dtype.numpy_dtype)] if index else [] members += [ - (col, self[col].dtype.to_numpy) for col in self._data.names + (col, self[col].dtype.numpy_dtype) for col in self._data.names ] dtype = np.dtype(members) ret = np.recarray(len(self), dtype=dtype) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 41d4f669061..a963daffefc 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -9,6 +9,8 @@ import cudf from cudf._lib.types import _Dtype +#from cudf.utils.utils import cached_property + class Generic(ExtensionDtype, _Dtype): @@ -41,11 +43,11 @@ def num(self): return self.to_numpy.num @property - def to_numpy(self): + def numpy_dtype(self): return np.dtype(self.pa_type.to_pandas_dtype()) @property - def to_pandas(self): + def pandas_dtype(self): return pd.api.types.pandas_dtype(self.name) @property diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index ef5ca3d6341..926f44c2ced 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -97,7 +97,7 @@ def __setitem__(self, key, value): to_dtype = cudf.api.types.result_type( value.dtype, self._sr._column.dtype ) - value = value.astype(to_dtype.to_numpy) + value = value.astype(to_dtype.numpy_dtype) self._sr._column._mimic_inplace( self._sr._column.astype(to_dtype), inplace=True ) @@ -452,7 +452,7 @@ def _get_column_selection(self, arg): def _normalize_dtypes(df): if len(df.columns) > 0: - dtypes = [d.to_numpy for d in df.dtypes.values.tolist()] + dtypes = [d.numpy_dtype for d in df.dtypes.values.tolist()] normalized_dtype = cudf.dtype(np.result_type(*dtypes)) for name, col in df._data.items(): df[name] = col.astype(normalized_dtype) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 95a1a05b377..b18babefe83 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -398,7 +398,7 @@ def input_to_libcudf_casting_rules(self, lcol, rcol, how): if dtype_l.kind == dtype_r.kind: # both ints or both floats libcudf_join_type = cudf.dtype( - max(dtype_l.to_numpy, dtype_r.to_numpy) + max(dtype_l.numpy_dtype, dtype_r.numpy_dtype) ) else: libcudf_join_type = cudf.api.types.find_common_type( @@ -408,7 +408,7 @@ def input_to_libcudf_casting_rules(self, lcol, rcol, how): dtype_r, cudf.Datetime ): libcudf_join_type = cudf.dtype( - max(dtype_l.to_numpy, dtype_r.to_numpy) + max(dtype_l.numpy_dtype, dtype_r.numpy_dtype) ) if libcudf_join_type is None: # todo: test this diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index cc4555d9155..7e42b06787a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1027,7 +1027,7 @@ def __repr__(self): if isinstance(preprocess._column, cudf.core.column.CategoricalColumn): category_memory = lines[-1] - to_replace = str(self.dtype.categories.dtype.to_numpy) + to_replace = str(self.dtype.categories.dtype.numpy_dtype) replacement = str(self.dtype.categories.dtype.name) category_memory = category_memory.replace(to_replace, replacement) lines = lines[:-1] diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 16eee560f0e..54f5d4a1e1e 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -184,7 +184,7 @@ def to_datetime( column.datetime._numpy_to_pandas_conversion[u] / ( column.datetime._numpy_to_pandas_conversion["s"] - if np.datetime_data(col.dtype.to_numpy)[0] == "s" + if np.datetime_data(col.dtype.numpy_dtype)[0] == "s" else 1 ) ) diff --git a/python/cudf/cudf/tests/test_avro.py b/python/cudf/cudf/tests/test_avro.py index 059f5343e0b..3a79ae469c0 100644 --- a/python/cudf/cudf/tests/test_avro.py +++ b/python/cudf/cudf/tests/test_avro.py @@ -65,7 +65,7 @@ def test_avro_reader_basic(datadir, inputfile, columns, engine): # FASTAVRO produces int64 columns from avro int32 dtype, so convert # it back to int32 here for col in expect.columns: - expect[col] = expect[col].astype(got[col].dtype.to_numpy) + expect[col] = expect[col].astype(got[col].dtype.numpy_dtype) # fastavro appears to return columns in reverse order # (actual order may depend on pandas/python version) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 58a17b5a2ed..c92f0603e4e 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -156,7 +156,7 @@ def test_orc_reader_trailing_nulls(datadir): # PANDAS uses NaN to represent invalid data, which forces float dtype # For comparison, we can replace NaN with 0 and cast to the cuDF dtype for col in expect.columns: - expect[col] = expect[col].astype(got[col].dtype.to_numpy) + expect[col] = expect[col].astype(got[col].dtype.numpy_dtype) assert_eq(expect, got, check_categorical=False) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 1b8a2d28cb2..391b9a800d3 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -117,7 +117,7 @@ def get_numeric_type_info(dtype): def numeric_normalize_types(*args): """Cast all args to a common type using numpy promotion logic """ - dtype = np.result_type(*[a.dtype.to_numpy for a in args]) + dtype = np.result_type(*[a.dtype.numpy_dtype for a in args]) return [a.astype(dtype) for a in args] def is_datetime_dtype(obj): @@ -203,7 +203,7 @@ def to_cudf_compatible_scalar(val, dtype=None): if dtype is not None: if isinstance(dtype, cudf.Generic): - dtype = dtype.to_numpy + dtype = dtype.numpy_dtype val = val.astype(dtype) if val.dtype.type is np.datetime64: @@ -358,7 +358,7 @@ def min_column_type(x, expected_type): def check_cast_unsupported_dtype(dtype): if isinstance(dtype, cudf.Generic): - return dtype.to_numpy + return dtype.numpy_dtype if is_categorical_dtype(dtype): return dtype diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index fcf86b6bf82..17cdc4a93c3 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -350,7 +350,7 @@ def time_col_replace_nulls(input_col): Buffer( np.array( [input_col.default_na_value()], - dtype=input_col.dtype.to_numpy, + dtype=input_col.dtype.numpy_dtype, ).view("|u1") ), dtype=input_col.dtype, From 1dc151ad2dd9f1198439d1b0fb9948d974788466 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 16 Sep 2020 11:11:43 -0700 Subject: [PATCH 72/80] extra to_numpy -> numpy_dtype that were missed --- python/cudf/cudf/core/dtypes.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index a963daffefc..79378a78e4a 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -29,9 +29,9 @@ def __eq__(self, other): ): return True - if self.to_numpy == other: + if self.numpy_dtype == other: return True - if isinstance(other, str) and str(self.to_numpy) == other: + if isinstance(other, str) and str(self.numpy_dtype) == other: return True return False @@ -40,7 +40,7 @@ def __str__(self): @property def num(self): - return self.to_numpy.num + return self.numpy_dtype.num @property def numpy_dtype(self): @@ -52,7 +52,7 @@ def pandas_dtype(self): @property def itemsize(self): - return self.to_numpy.itemsize + return self.numpy_dtype.itemsize @property def type(self): @@ -108,7 +108,7 @@ def __init__(self): class Datetime(Generic): @property - def to_numpy(self): + def numpy_dtype(self): return {v: k for k, v in _cudf_dtype_from_numpy.items()}[self] @property @@ -119,7 +119,7 @@ def to_pandas(self): class Timedelta(Generic): @property - def to_numpy(self): + def numpy_dtype(self): return {v: k for k, v in _cudf_dtype_from_numpy.items()}[self] @property From 46a9c2f16e838767a2e861ba943561edfd3b2d72 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 16 Sep 2020 14:31:27 -0700 Subject: [PATCH 73/80] add docstrings, respond to reviews --- python/cudf/cudf/_lib/scalar.pyx | 3 +- python/cudf/cudf/api/types.py | 364 +++++++++++++++++++++++++++++- python/cudf/cudf/core/dtypes.py | 14 +- python/cudf/cudf/core/frame.py | 6 +- python/cudf/cudf/core/index.py | 2 +- python/cudf/cudf/core/indexing.py | 4 +- 6 files changed, 373 insertions(+), 20 deletions(-) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index ead3097f3b4..2a2d7a21f57 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -15,7 +15,6 @@ from libc.stdint cimport ( ) from libcpp.memory cimport unique_ptr from libcpp cimport bool -from libc.stdint cimport uintptr_t import cudf from cudf._lib.types import cudf_to_np_types, duration_unit_map @@ -109,7 +108,7 @@ cdef class Scalar: """ if cudf.api.types.is_string_dtype(self.dtype): return _get_py_string_from_string(self.c_value) - elif cudf.api.types.is_numerical_dtype(self.dtype): + elif cudf.api.types.is_numeric_dtype(self.dtype): return _get_np_scalar_from_numeric(self.c_value) elif cudf.api.types.is_datetime64_dtype(self.dtype): return _get_np_scalar_from_timestamp64(self.c_value) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 92cc561a1c8..b28ec5fcf4f 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -1,3 +1,5 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. + import numpy as np import pandas as pd from pandas.core.dtypes.dtypes import CategoricalDtype, CategoricalDtypeType @@ -6,9 +8,47 @@ def is_bool_dtype(obj): + """ + Check whether the provided array or dtype is of a boolean dtype. + Parameters + ---------- + arr_or_dtype : array-like + The array or dtype to check. + Returns + ------- + boolean + Whether or not the array or dtype is of a boolean dtype. + Notes + ----- + Accepts cuDF, Pandas, or NumPy dtypes and arrays. + + Examples + -------- + >>> is_bool_dtype(cudf.BooleanDtype()) + True + >>> is_bool_dtype(cudf.Series([True, False, None])) + True + >>> is_bool_dtype(str) + False + >>> is_bool_dtype(int) + False + >>> is_bool_dtype(bool) + True + >>> is_bool_dtype(np.bool_) + True + >>> is_bool_dtype(np.array(['a', 'b'])) + False + >>> is_bool_dtype(pd.Series([1, 2])) + False + >>> is_bool_dtype(np.array([True, False])) + True + >>> is_bool_dtype(pd.Categorical([True, False])) + True + >>> is_bool_dtype(pd.arrays.SparseArray([True, False])) + True + """ if hasattr(obj, 'dtype'): obj = obj.dtype - # todo - pd.api.types.is_bool_dtype should not give false, nor work at all probably if hasattr(obj, "dtype"): obj = obj.dtype return isinstance(obj, cudf.BooleanDtype) or (not isinstance(obj, cudf.Generic) and pd.api.types.is_bool_dtype( @@ -17,6 +57,37 @@ def is_bool_dtype(obj): def is_datetime64_dtype(obj): + """ + Check whether the provided array or dtype is of the datetime64 dtype. + Parameters + ---------- + arr_or_dtype : array-like + The array or dtype to check. + Returns + ------- + boolean + Whether or not the array or dtype is of the datetime64 dtype. + Notes + -------- + Accepts cuDF, Pandas, or NumPy dtypes and arrays. + + Examples + -------- + >>> is_datetime64_dtype(cudf.Datetime64NSDtype()) + True + >>> is_datetime64_dtype(cudf.Series([1, 2, 3], dtype='datetime64[ms]')) + True + >>> is_datetime64_dtype(object) + False + >>> is_datetime64_dtype(np.datetime64) + True + >>> is_datetime64_dtype(np.array([], dtype=int)) + False + >>> is_datetime64_dtype(np.array([], dtype=np.datetime64)) + True + >>> is_datetime64_dtype([1, 2, 3]) + False + """ if hasattr(obj, 'dtype'): obj = obj.dtype return isinstance(obj, cudf.Datetime) or (not isinstance(obj, cudf.Generic) and pd.api.types.is_datetime64_dtype( @@ -25,6 +96,33 @@ def is_datetime64_dtype(obj): def is_timedelta64_dtype(obj): + """ + Check whether an array or dtype is of the timedelta64 dtype. + Parameters + ---------- + arr_or_dtype : array-like + The array or dtype to check. + Returns + ------- + boolean + Whether or not the array or dtype is of the timedelta64 dtype. + Examples + -------- + >>> is_timedelta64_dtype(cudf.Timedelta64NSDtype()) + True + >>> is_timedelta64_dtype(cudf.Series([1,2,3], dtype='timedelta64[ns]')) + True + >>> is_timedelta64_dtype(object) + False + >>> is_timedelta64_dtype(np.timedelta64) + True + >>> is_timedelta64_dtype([1, 2, 3]) + False + >>> is_timedelta64_dtype(pd.Series([], dtype="timedelta64[ns]")) + True + >>> is_timedelta64_dtype('0 days') + False + """ if hasattr(obj, 'dtype'): obj = obj.dtype return isinstance( @@ -33,6 +131,34 @@ def is_timedelta64_dtype(obj): def is_string_dtype(obj): + """ + Check whether the provided array or dtype is of the string dtype. + Parameters + ---------- + arr_or_dtype : array-like + The array or dtype to check. + Returns + ------- + boolean + Whether or not the array or dtype is of the string dtype. + Examples + -------- + >>> is_string_dtype(cudf.StringDtype()) + True + >>> is_string_dtype(cudf.Series(['a','b','c'])) + True + >>> is_string_dtype(str) + True + >>> is_string_dtype(object) + True + >>> is_string_dtype(int) + False + >>> + >>> is_string_dtype(np.array(['a', 'b'])) + True + >>> is_string_dtype(pd.Series([1, 2])) + False + """ if hasattr(obj, 'dtype'): obj = obj.dtype return isinstance(obj, cudf.StringDtype) or (not isinstance(obj, cudf.Generic) and ( @@ -41,6 +167,49 @@ def is_string_dtype(obj): def is_integer_dtype(obj): + """ + Check whether the provided array or dtype is of an integer dtype. + Parameters + ---------- + arr_or_dtype : array-like + The array or dtype to check. + Returns + ------- + boolean + Whether or not the array or dtype is of an integer dtype and + Examples + -------- + >>> is_integer_dtype(cudf.Int64Dtype()) + True + >>> is_integer_dtype(cudf.Series([1,2,3], dtype='int64')) + True + >>> is_integer_dtype(str) + False + >>> is_integer_dtype(int) + True + >>> is_integer_dtype(float) + False + >>> is_integer_dtype(np.uint64) + True + >>> is_integer_dtype('int8') + True + >>> is_integer_dtype('Int8') + True + >>> is_integer_dtype(pd.Int8Dtype) + True + >>> is_integer_dtype(np.datetime64) + False + >>> is_integer_dtype(np.timedelta64) + False + >>> is_integer_dtype(np.array(['a', 'b'])) + False + >>> is_integer_dtype(pd.Series([1, 2])) + True + >>> is_integer_dtype(np.array([], dtype=np.timedelta64)) + False + >>> is_integer_dtype(pd.Index([1, 2.])) # float + False + """ if hasattr(obj, 'dtype'): obj = obj.dtype try: @@ -50,7 +219,44 @@ def is_integer_dtype(obj): pdb.set_trace() -def is_numerical_dtype(obj): +def is_numeric_dtype(obj): + """ + Check whether the provided array or dtype is of a numeric dtype. + Parameters + ---------- + arr_or_dtype : array-like + The array or dtype to check. + Returns + ------- + boolean + Whether or not the array or dtype is of a numeric dtype. + Examples + -------- + >>> is_numeric_dtype(cudf.Float32Dtype()) + True + >>> is_numeric_dtype(cudf.Series([1.0, 2.0, 3.0])) + True + >>> is_numeric_dtype(str) + False + >>> is_numeric_dtype(int) + True + >>> is_numeric_dtype(float) + True + >>> is_numeric_dtype(np.uint64) + True + >>> is_numeric_dtype(np.datetime64) + False + >>> is_numeric_dtype(np.timedelta64) + False + >>> is_numeric_dtype(np.array(['a', 'b'])) + False + >>> is_numeric_dtype(pd.Series([1, 2])) + True + >>> is_numeric_dtype(pd.Index([1, 2.])) + True + >>> is_numeric_dtype(np.array([], dtype=np.timedelta64)) + False + """ if hasattr(obj, 'dtype'): obj = obj.dtype if isinstance(obj, cudf.Generic): @@ -131,7 +337,32 @@ def is_list_dtype(obj): ) -def find_common_type(array_types=[], scalar_types=[]): +def find_common_type(array_types, scalar_types): + """ + Determine common type following numpy coercion rules. + Similar to numpy.find_common_type, but accepts both + numpy and cuDF datatypes. + + Parameters + ---------- + array_types : sequence + A list of dtypes or dtype convertible objects representing arrays. + scalar_types : sequence + A list of dtypes or dtype convertible objects representing scalars. + Returns + ------- + datatype : cuDF dtype + The common data type, which is the maximum of `array_types` ignoring + `scalar_types`, unless the maximum of `scalar_types` is of a + different kind (`dtype.kind`). + See Also + -------- + numpy.find_common_type + Notes + -------- + Accepts numpy dtypes, cuDF dtypes, or a mix of both + + """ array_types = [ d.numpy_dtype if isinstance(d, cudf.Generic) else d for d in array_types ] @@ -143,6 +374,30 @@ def find_common_type(array_types=[], scalar_types=[]): def can_cast(from_, to, casting='safe'): + """ + Returns True if cast between data types can occur according to the casting rule. + If from is a scalar or array scalar, also returns True if the scalar value + can be cast without overflow or truncation to an integer. + + Parameters + ---------- + from_ : dtype, dtype specifier, scalar, or array + Data type, scalar, or array to cast from. + to : dtype or dtype specifier + Data type to cast to. + casting : {‘no’, ‘equiv’, ‘safe’, ‘same_kind’, ‘unsafe’}, optional + Controls what kind of data casting may occur. + - ‘no’ means the data types should not be cast at all. + - ‘equiv’ means only byte-order changes are allowed. + - ‘safe’ means only casts which can preserve values are allowed. + - ‘same_kind’ means only safe casts or casts within a kind, + like float64 to float32, are allowed + - ‘unsafe’ means any data conversions may be done. + + Notes + -------- + Accepts numpy dtypes, cuDF dtypes, or a mix of both + """ if isinstance(from_, cudf.Generic): from_ = from_.numpy_dtype elif isinstance(from_, cudf.Scalar): @@ -154,19 +409,73 @@ def can_cast(from_, to, casting='safe'): def result_type(*arrays_and_dtypes): + """ + Returns the type that results from applying the NumPy type promotion rules to the arguments. + See numpy.result_type for details. + + See Also + -------- + numpy.result_type + Returns + ------- + datatype : cuDF dtype + + Notes + -------- + Accepts numpy dtypes, cuDF dtypes, or a mix of both + + """ arrays_and_dtypes = ( d.numpy_dtype if isinstance(d, cudf.Generic) else d for d in arrays_and_dtypes ) return cudf.dtype(np.result_type(*arrays_and_dtypes)) -def isnan(obj): - if isinstance(obj, cudf._lib.scalar.Scalar): - obj = obj.value - return np.isnan(obj) +def isnan(x): + """ + Returns true if an input scalar is equal to NaN. + + Parameters + ------- + x : cuDF or NumPy scalar + + See Also + ------- + numpy.isnan + + Notes + -------- + Accepts numpy dtypes, cuDF dtypes, or a mix of both + + """ + if isinstance(x, cudf._lib.scalar.Scalar): + x = x.value + return np.isnan(x) def min_scalar_type(a): + """ + For scalar a, returns the data type with the smallest size and smallest + scalar kind which can hold its value. For non-scalar array a, returns + the vector’s dtype unmodified. + + Parameters + ------- + a : cuDF or NumPy scalar + + Returns + ------- + result : cuDF dtype + + See Also + ------- + numpy.mim_scalar_type + + Notes + -------- + Accepts numpy dtypes, cuDF dtypes, or a mix of both + + """ if isinstance(a, cudf.Scalar): a = a.value result = np.min_scalar_type(a) @@ -175,6 +484,28 @@ def min_scalar_type(a): return cudf.dtype(result) def promote_types(type1, type2): + """ + Returns the data type with the smallest size and smallest scalar kind + to which both type1 and type2 may be safely cast. + + Parameters + ------- + type1 : cuDF or NumPy dtype + type2 : cuDF or NumPy dtype + + Returns + ------- + result : cuDF dtype, the promoted type + + See Also + -------- + numpy.promote_types + + Notes + -------- + Accepts numpy dtypes, cuDF dtypes, or a mix of both + + """ if isinstance(type1, cudf.Generic): type1 = type1.numpy_dtype if isinstance(type2, cudf.Generic): @@ -186,4 +517,23 @@ def promote_types(type1, type2): return cudf.dtype(result) def isscalar(element): + """ + Returns True if the type of `element` is a scalar type, + including cuDF, NumPy, and standard python scalars + + Parameters + ---------- + element : any + Input argument, can be of any type. + Returns + ------- + val : bool + True if `element` is a scalar type, False if it is not. + + See Also + -------- + numpy.isscalar + + """ + return isinstance(element, cudf._lib.scalar.Scalar) or np.isscalar(element) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 79378a78e4a..d2945eb0b22 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -24,8 +24,8 @@ def __eq__(self, other): ): return False if ( - isinstance(other, self.to_pandas.type) - or other is self.to_pandas + isinstance(other, self.pandas_dtype.type) + or other is self.pandas_dtype ): return True @@ -60,7 +60,7 @@ def type(self): @property def kind(self): - return self.to_pandas.kind + return self.pandas_dtype.kind @property def name(self): @@ -112,7 +112,7 @@ def numpy_dtype(self): return {v: k for k, v in _cudf_dtype_from_numpy.items()}[self] @property - def to_pandas(self): + def pandas_dtype(self): # pandas only supports nanos return np.dtype("datetime64[ns]") @@ -123,7 +123,7 @@ def numpy_dtype(self): return {v: k for k, v in _cudf_dtype_from_numpy.items()}[self] @property - def to_pandas(self): + def pandas_dtype(self): # pandas only supports nanos return np.dtype("timedelta64[ns]") @@ -348,6 +348,10 @@ def __init__(self, categories=None, ordered=None): def __repr__(self): return self.to_pandas().__repr__() + @property + def pandas_dtype(self): + return self.to_pandas() + def __hash__(self): return hash(self.__repr__()) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 8792dccba85..974638bcf23 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -13,7 +13,7 @@ import cudf from cudf import _lib as libcudf from cudf._lib.nvtx import annotate -from cudf.api.types import is_categorical_dtype, is_numerical_dtype +from cudf.api.types import is_categorical_dtype, is_numeric_dtype from cudf.core.column import as_column, build_categorical_column from cudf.utils import utils from cudf.utils.dtypes import ( @@ -276,7 +276,7 @@ def find_common_dtypes_and_categories(non_null_columns, dtypes): # default to the first non-null dtype dtypes[idx] = cols[0].dtype # If all the non-null dtypes are int/float, find a common dtype - if all(is_numerical_dtype(col.dtype) for col in cols): + if all(is_numeric_dtype(col.dtype) for col in cols): dtypes[idx] = cudf.api.types.find_common_type( [col.dtype for col in cols], [] ) @@ -3142,7 +3142,7 @@ def _get_replacement_values(to_replace, replacement, col_name, column): if all_nan: replacement = [replacement] * len(to_replace) # Do not broadcast numeric dtypes - elif cudf.api.types.is_numerical_dtype(column.dtype): + elif cudf.api.types.is_numeric_dtype(column.dtype): if len(to_replace) > 0: replacement = [replacement] else: diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index a3ce9d8cb19..d895a2760a1 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1719,7 +1719,7 @@ def to_pandas(self): return pd.RangeIndex( start=self._start, stop=self._stop, - dtype=self.dtype.to_pandas, + dtype=self.dtype.pandas_dtype, name=self.name, ) diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index 926f44c2ced..962f5ba3bef 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -89,7 +89,7 @@ def __setitem__(self, key, value): else: value = column.as_column(value) - if hasattr(value, "dtype") and cudf.api.types.is_numerical_dtype( + if hasattr(value, "dtype") and cudf.api.types.is_numeric_dtype( value.dtype ): # normalize types if necessary: @@ -214,7 +214,7 @@ def _can_downcast_to_series(self, df, arg): return True dtypes = df.dtypes.values.tolist() all_numeric = all( - [cudf.api.types.is_numerical_dtype(t) for t in dtypes] + [cudf.api.types.is_numeric_dtype(t) for t in dtypes] ) if all_numeric: return True From 81e60581961deed48b0cc8d85158dfd8e08d7dd8 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 17 Sep 2020 07:50:21 -0700 Subject: [PATCH 74/80] minor fixes and code removal --- python/cudf/cudf/api/types.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index b28ec5fcf4f..b553e9978c3 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -212,12 +212,7 @@ def is_integer_dtype(obj): """ if hasattr(obj, 'dtype'): obj = obj.dtype - try: - return isinstance(obj, cudf.Integer) or (not isinstance(obj, cudf.Generic) and pd.api.types.is_integer_dtype(obj)) - except: - import pdb - pdb.set_trace() - + return isinstance(obj, cudf.Integer) or (not isinstance(obj, cudf.Generic) and pd.api.types.is_integer_dtype(obj)) def is_numeric_dtype(obj): """ @@ -328,7 +323,7 @@ def is_categorical_dtype(obj): def is_list_dtype(obj): return ( - type(obj) is cudf.core.dtypes.ListDtype + isinstance(obj, cudf.core.dtypes.ListDtype) or obj is cudf.core.dtypes.ListDtype or type(obj) is cudf.core.column.ListColumn or obj is cudf.core.column.ListColumn From d7930eb43cad3f386d2aa0417d89fe92be8d309e Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 17 Sep 2020 07:52:02 -0700 Subject: [PATCH 75/80] remove cudf_dtype_from_pydata_dtype --- python/cudf/cudf/core/dataframe.py | 18 ++++++++++-------- python/cudf/cudf/core/dtypes.py | 15 +++++++++++---- python/cudf/cudf/tests/test_dataframe.py | 6 +++--- python/cudf/cudf/utils/dtypes.py | 24 ------------------------ 4 files changed, 24 insertions(+), 39 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e4bb2c6ec69..3372fd35fb7 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -40,7 +40,6 @@ from cudf.utils import applyutils, ioutils, queryutils, utils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( - cudf_dtype_from_pydata_dtype, is_list_like, is_scalar, numeric_normalize_types, @@ -6406,8 +6405,12 @@ def select_dtypes(self, include=None, exclude=None): "at least one of include or exclude must be nonempty" ) + def cudf_dtype_type(d): + res = cudf.dtype(d) + return type(res) if isinstance(res, cudf.Generic) else res + include, exclude = map( - lambda x: frozenset(map(cudf_dtype_from_pydata_dtype, x)), + lambda x: frozenset(map(cudf_dtype_type, x)), selection, ) @@ -6419,28 +6422,27 @@ def select_dtypes(self, include=None, exclude=None): ) ) # include all subtypes - include_subtypes = set() - for dtype in (d.__class__ for d in self.dtypes): + for dtype in (type(d) for d in self.dtypes): for i_dtype in include: # category handling if is_categorical_dtype(i_dtype): include_subtypes.add(i_dtype) - elif issubclass(dtype, i_dtype): + elif isinstance(dtype, i_dtype) or issubclass(dtype, i_dtype): include_subtypes.add(dtype) # exclude all subtypes exclude_subtypes = set() - for dtype in (d.__class__ for d in self.dtypes): + for dtype in (type(d) for d in self.dtypes): for e_dtype in exclude: # category handling if is_categorical_dtype(e_dtype): exclude_subtypes.add(e_dtype) - elif issubclass(dtype, e_dtype): + elif isinstance(dtype, e_dtype) or issubclass(dtype, e_dtype): exclude_subtypes.add(dtype) include_all = set( - [cudf_dtype_from_pydata_dtype(d) for d in self.dtypes] + [cudf_dtype_type(d) for d in self.dtypes] ) if include: inclusion = include_all & include_subtypes diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index d2945eb0b22..0a4b69ff896 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -266,13 +266,15 @@ def __call__(self, arg): def cudf_dtype_from_string(obj): if obj == "category": - return obj + return CategoricalDtype() try: np_dtype = np.dtype(obj) return cudf_dtype_from_numpy(np_dtype) except TypeError: result = _cudf_dtype_from_string.get(obj, None) if not result: + import pdb + pdb.set_trace() raise TypeError(f"Could not find a cuDF dtype matching {obj}") return result @@ -282,9 +284,9 @@ def cudf_dtype_from_numpy(obj): return StringDtype() elif obj is np.number: return cudf.Number - elif obj is np.datetime64: + elif obj in {np.datetime64, np.dtype('datetime64')}: return cudf.Datetime - elif obj is np.timedelta64: + elif obj in {np.timedelta64, np.dtype('timedelta64')}: return cudf.Timedelta dtype = np.dtype(obj) if dtype.type is np.str_: @@ -310,7 +312,10 @@ def dtype(obj): if isinstance(obj, Generic): return obj elif type(obj) is type and issubclass(obj, Generic): - return obj() + if obj in cant_construct_dtypes: + return obj + else: + return obj() elif isinstance(obj, np.dtype) or ( isinstance(obj, type) and issubclass(obj, (np.generic, np.dtype)) ): @@ -597,3 +602,5 @@ def __repr__(self): pd.StringDtype(): StringDtype(), pd.BooleanDtype(): BooleanDtype(), } + +cant_construct_dtypes = {Number, Integer, UnsignedInteger, Floating, Inexact, Timedelta} diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index c2429504764..5267b8af970 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2635,11 +2635,11 @@ def test_select_dtype_datetime(): assert_eq(gdf[["timestamp"]], gdf.select_dtypes("datetime64")) assert_eq(gdf[["timestamp"]], gdf.select_dtypes(np.dtype("datetime64"))) assert_eq(gdf[["timestamp"]], gdf.select_dtypes(include="datetime64")) - assert_eq(gdf[["timestamp"]], gdf.select_dtypes("datetime64[ms]")) + assert_eq(gdf[["timestamp"]], gdf.select_dtypes("datetime64[ns]")) assert_eq( - gdf[["timestamp"]], gdf.select_dtypes(np.dtype("datetime64[ms]")) + gdf[["timestamp"]], gdf.select_dtypes(np.dtype("datetime64[ns]")) ) - assert_eq(gdf[["timestamp"]], gdf.select_dtypes(include="datetime64[ms]")) + assert_eq(gdf[["timestamp"]], gdf.select_dtypes(include="datetime64[ns]")) def test_array_ufunc(): diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 391b9a800d3..241a0bc0b4a 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -127,30 +127,6 @@ def is_datetime_dtype(obj): return False return "M8" in obj.str - -def cudf_dtype_from_pydata_dtype(dtype): - """ Given a numpy or pandas dtype, converts it into the equivalent cuDF - Python dtype. - """ - if isinstance(dtype, cudf.Generic): - return dtype.__class__ - if inspect.isclass(dtype): - if issubclass(dtype, cudf.Generic): - return dtype - if is_categorical_dtype(dtype): - return cudf.core.dtypes.CategoricalDtype - elif dtype in cudf._lib.types.np_to_cudf_types: - return dtype.type - elif np.issubdtype(dtype, np.datetime64): - dtype = np.datetime64 - - result = cudf.dtype(infer_dtype_from_object(dtype)) - if isinstance(result, cudf.Generic): - return result.__class__ - elif inspect.isclass(result): - return result - - def is_scalar(val): return ( val is None From c290a15180fde60275d767e41b2d2ae700a02246 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 17 Sep 2020 07:52:27 -0700 Subject: [PATCH 76/80] update api calls for find_common_type to be numpy-like --- python/cudf/cudf/core/scalar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 0c35833c91e..14830b4ccea 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -114,7 +114,7 @@ def _binop_result_dtype_or_error(self, other, op): if (self.dtype.kind == "O" or other.dtype.kind == "O") and op != "__add__": raise TypeError(f"{op} is not supported for string type scalars") - return find_common_type([self.dtype, other.dtype]) + return find_common_type([self.dtype, other.dtype], []) def _scalar_binop(self, other, op): other = to_cudf_compatible_scalar(other) From e90e3255bbbc6a4014fe19fbe15f25c5d5879442 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 17 Sep 2020 12:33:27 -0700 Subject: [PATCH 77/80] let pandas handle categorical edge cases --- python/cudf/cudf/core/column/column.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 3a3446d4b7f..65f5ef82c04 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1409,7 +1409,8 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): * pandas.Categorical objects """ - dtype = cudf.dtype(dtype) + if dtype and dtype is not 'category': + dtype = cudf.dtype(dtype) if isinstance(arbitrary, ColumnBase): if dtype is not None: @@ -1510,6 +1511,8 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): data = as_column( cupy.asarray(arbitrary), nan_as_null=nan_as_null, dtype=dtype ) + elif dtype is 'category': + return as_column(pd.Series(arbitrary, dtype=dtype)) else: data = as_column( pa.array(arbitrary, from_pandas=nan_as_null), From 3d8ca2f474c5f1818b6010af9333e772422d606c Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 17 Sep 2020 13:34:39 -0700 Subject: [PATCH 78/80] fix categorical creation and casting throughout cudf --- python/cudf/cudf/core/column/column.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 65f5ef82c04..b628def8ca1 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1004,10 +1004,10 @@ def distinct_count(self, method="sort", dropna=True): return cpp_distinct_count(self, ignore_nulls=dropna) def astype(self, dtype, **kwargs): - dtype = cudf.dtype(dtype) if is_categorical_dtype(dtype): return self.as_categorical_column(dtype, **kwargs) - elif isinstance(dtype, cudf.Datetime): + dtype = cudf.dtype(dtype) + if isinstance(dtype, cudf.Datetime): return self.as_datetime_column(dtype, **kwargs) elif isinstance(dtype, cudf.StringDtype): return self.as_string_column(dtype, **kwargs) @@ -1496,6 +1496,8 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): arbitrary.array, pd.core.arrays.masked.BaseMaskedArray ): return as_column(arbitrary.array) + elif dtype is 'category': + return as_column(pd.Series(arbitrary, dtype=dtype)) if is_categorical_dtype(arbitrary): data = as_column(pa.array(arbitrary, from_pandas=True)) elif arbitrary.dtype == np.bool: @@ -1511,8 +1513,6 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): data = as_column( cupy.asarray(arbitrary), nan_as_null=nan_as_null, dtype=dtype ) - elif dtype is 'category': - return as_column(pd.Series(arbitrary, dtype=dtype)) else: data = as_column( pa.array(arbitrary, from_pandas=nan_as_null), From 265338409fee828c043c56f9d9323ee8127ad73e Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 17 Sep 2020 13:34:58 -0700 Subject: [PATCH 79/80] remove old code --- python/cudf/cudf/core/column/numerical.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 3072f513f40..8f50c9188d8 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -166,7 +166,6 @@ def as_timedelta_column(self, dtype, **kwargs): ) def as_numerical_column(self, dtype, **kwargs): - # dtype = np.dtype(dtype) # expect a cudf dtype always here if dtype == self.dtype: return self From 123784bdf9dbaf78d059054ccb245a017c8888f1 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 17 Sep 2020 14:59:50 -0700 Subject: [PATCH 80/80] continued bugfixes --- python/cudf/cudf/core/dtypes.py | 4 ++-- python/cudf/cudf/core/index.py | 2 +- python/cudf/cudf/core/scalar.py | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 0a4b69ff896..52155e5517f 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -265,8 +265,6 @@ def __call__(self, arg): def cudf_dtype_from_string(obj): - if obj == "category": - return CategoricalDtype() try: np_dtype = np.dtype(obj) return cudf_dtype_from_numpy(np_dtype) @@ -579,6 +577,7 @@ def __repr__(self): "Float32": Float32Dtype, "Float64": Float64Dtype, "Boolean": BooleanDtype, + "string": StringDtype, "String": StringDtype, "Datetime64NS": Datetime64NSDtype, "Datetime64US": Datetime64USDtype, @@ -588,6 +587,7 @@ def __repr__(self): "Timedelta64US": Timedelta64USDtype, "Timedelta64MS": Timedelta64MSDtype, "Timedelta64S": Timedelta64SDtype, + 'category': CategoricalDtype } _cudf_dtype_from_pandas = { diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index d895a2760a1..75c48d5abe6 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1941,7 +1941,7 @@ def find_label_range(self, first, last): if last is not None: end = col.find_last_value(last, closest=True) end += 1 - return begin, end + return begin.value, end.value @property def is_unique(self): diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 14830b4ccea..3413f350ce0 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -9,7 +9,8 @@ class Scalar(libcudf.scalar.Scalar): def __init__(self, value, dtype=None): if isinstance(value, libcudf.scalar.Scalar): if dtype and not value.dtype == dtype: - raise TypeError + # TODO should be doable on the device + value = libcudf.scalar.Scalar(value.value, dtype=dtype) self._data = value else: self._data = libcudf.scalar.Scalar(value, dtype=dtype)