From 844ce5d7e355e71e6d8a7079a65ab5450385f7d5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 22 Feb 2024 17:45:20 -0800 Subject: [PATCH 1/7] Clean up Columns.astype & cudf.dtype --- python/cudf/cudf/api/types.py | 2 ++ python/cudf/cudf/core/column/column.py | 47 +++++++------------------- python/cudf/cudf/core/dtypes.py | 34 ++++++++----------- python/cudf/cudf/utils/dtypes.py | 14 -------- 4 files changed, 29 insertions(+), 68 deletions(-) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index a422eb82231..417d8b0922a 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -504,6 +504,8 @@ def _is_pandas_nullable_extension_dtype(dtype_to_check) -> bool: ): return True elif isinstance(dtype_to_check, pd.CategoricalDtype): + if dtype_to_check.categories is None: + return False return _is_pandas_nullable_extension_dtype( dtype_to_check.categories.dtype ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 191c55a8a68..0d4f48b1faf 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -91,8 +91,6 @@ min_scalar_type, min_unsigned_type, np_to_pa_dtype, - pandas_dtypes_alias_to_cudf_alias, - pandas_dtypes_to_np_dtypes, ) from cudf.utils.utils import _array_ufunc, mask_dtype @@ -964,42 +962,14 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase: col = self.copy() else: col = self + if dtype == "category": + return col.as_categorical_column(dtype) + was_object = dtype == object or dtype == np.dtype(object) + dtype = cudf.dtype(dtype) if self.dtype == dtype: return col - if _is_categorical_dtype(dtype): - return col.as_categorical_column(dtype) - - if ( - isinstance(dtype, str) - and dtype in pandas_dtypes_alias_to_cudf_alias - ): - if cudf.get_option("mode.pandas_compatible"): - raise NotImplementedError("not supported") - else: - dtype = pandas_dtypes_alias_to_cudf_alias[dtype] - elif _is_pandas_nullable_extension_dtype(dtype) and cudf.get_option( - "mode.pandas_compatible" - ): - raise NotImplementedError("not supported") - else: - dtype = pandas_dtypes_to_np_dtypes.get(dtype, dtype) - if _is_non_decimal_numeric_dtype(dtype): - return col.as_numerical_column(dtype) - elif _is_categorical_dtype(dtype): + elif isinstance(dtype, CategoricalDtype): return col.as_categorical_column(dtype) - elif cudf.dtype(dtype).type in { - np.str_, - np.object_, - str, - }: - if cudf.get_option("mode.pandas_compatible") and np.dtype( - dtype - ).type in {np.object_}: - raise ValueError( - f"Casting to {dtype} is not supported, use " - "`.astype('str')` instead." - ) - return col.as_string_column(dtype) elif isinstance(dtype, IntervalDtype): return col.as_interval_column(dtype) elif isinstance(dtype, (ListDtype, StructDtype)): @@ -1014,6 +984,13 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase: return col.as_datetime_column(dtype) elif np.issubdtype(cast(Any, dtype), np.timedelta64): return col.as_timedelta_column(dtype) + elif dtype.kind == "O": + if cudf.get_option("mode.pandas_compatible") and was_object: + raise ValueError( + f"Casting to {dtype} is not supported, use " + "`.astype('str')` instead." + ) + return col.as_string_column(dtype) else: return col.as_numerical_column(dtype) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 26d2ea3e992..c8e3d8ba828 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -42,12 +42,12 @@ def dtype(arbitrary): # next, try interpreting arbitrary as a NumPy dtype that we support: try: np_dtype = np.dtype(arbitrary) - if np_dtype.kind in ("OU"): - return np.dtype("object") except TypeError: pass else: - if np_dtype not in cudf._lib.types.SUPPORTED_NUMPY_TO_LIBCUDF_TYPES: + if np_dtype.kind in set("OU"): + return np.dtype("object") + elif np_dtype not in cudf._lib.types.SUPPORTED_NUMPY_TO_LIBCUDF_TYPES: raise TypeError(f"Unsupported type {np_dtype}") return np_dtype @@ -55,25 +55,21 @@ def dtype(arbitrary): # `arbitrary` as a Pandas extension type. # Return the corresponding NumPy/cuDF type. pd_dtype = pd.api.types.pandas_dtype(arbitrary) - if cudf.get_option( - "mode.pandas_compatible" - ) and cudf.api.types._is_pandas_nullable_extension_dtype(pd_dtype): - raise NotImplementedError("not supported") - try: - return dtype(pd_dtype.numpy_dtype) - except AttributeError: - if isinstance(pd_dtype, pd.CategoricalDtype): - return cudf.CategoricalDtype.from_pandas(pd_dtype) + if cudf.api.types._is_pandas_nullable_extension_dtype(pd_dtype): + if cudf.get_option("mode.pandas_compatible"): + raise NotImplementedError("not supported") elif isinstance(pd_dtype, pd.StringDtype): return np.dtype("object") - elif isinstance(pd_dtype, pd.IntervalDtype): - return cudf.IntervalDtype.from_pandas(pd_dtype) - elif isinstance(pd_dtype, pd.DatetimeTZDtype): - return pd_dtype else: - raise TypeError( - f"Cannot interpret {arbitrary} as a valid cuDF dtype" - ) + return dtype(pd_dtype.numpy_dtype) + elif isinstance(pd_dtype, pd.CategoricalDtype): + return cudf.CategoricalDtype.from_pandas(pd_dtype) + elif isinstance(pd_dtype, pd.IntervalDtype): + return cudf.IntervalDtype.from_pandas(pd_dtype) + elif isinstance(pd_dtype, pd.DatetimeTZDtype): + return pd_dtype + else: + raise TypeError(f"Cannot interpret {arbitrary} as a valid cuDF dtype") def _decode_type( diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index c8aca94ba19..3780fcc627e 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -74,25 +74,11 @@ pd.StringDtype(): np.dtype("object"), } -pandas_dtypes_alias_to_cudf_alias = { - "UInt8": "uint8", - "UInt16": "uint16", - "UInt32": "uint32", - "UInt64": "uint64", - "Int8": "int8", - "Int16": "int16", - "Int32": "int32", - "Int64": "int64", - "boolean": "bool", -} - np_dtypes_to_pandas_dtypes[np.dtype("float32")] = pd.Float32Dtype() np_dtypes_to_pandas_dtypes[np.dtype("float64")] = pd.Float64Dtype() pandas_dtypes_to_np_dtypes[pd.Float32Dtype()] = np.dtype("float32") pandas_dtypes_to_np_dtypes[pd.Float64Dtype()] = np.dtype("float64") -pandas_dtypes_alias_to_cudf_alias["Float32"] = "float32" -pandas_dtypes_alias_to_cudf_alias["Float64"] = "float64" SIGNED_INTEGER_TYPES = {"int8", "int16", "int32", "int64"} UNSIGNED_TYPES = {"uint8", "uint16", "uint32", "uint64"} From 1bde06a269b568f431c31efda1ed4396e360d88f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 22 Feb 2024 17:48:03 -0800 Subject: [PATCH 2/7] add todo --- python/cudf/cudf/core/column/column.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 0d4f48b1faf..a24b3aa68b6 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -963,6 +963,8 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase: else: col = self if dtype == "category": + # TODO: Figure out why `cudf.dtype("category")` + # astype's different than just the string return col.as_categorical_column(dtype) was_object = dtype == object or dtype == np.dtype(object) dtype = cudf.dtype(dtype) From 11d01170303b95c08cbd1c96cb54229c27ed3fc3 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 23 Feb 2024 14:43:26 -0800 Subject: [PATCH 3/7] Fix errors --- python/cudf/cudf/core/column/column.py | 4 ++++ python/cudf/cudf/core/dtypes.py | 2 ++ 2 files changed, 6 insertions(+) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 231fe15b519..e141d75e73b 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -966,6 +966,10 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase: # TODO: Figure out why `cudf.dtype("category")` # astype's different than just the string return col.as_categorical_column(dtype) + elif dtype == "interval" and isinstance( + self.dtype, cudf.IntervalDtype + ): + return col was_object = dtype == object or dtype == np.dtype(object) dtype = cudf.dtype(dtype) if self.dtype == dtype: diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index c8e3d8ba828..2b712638a98 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -62,6 +62,8 @@ def dtype(arbitrary): return np.dtype("object") else: return dtype(pd_dtype.numpy_dtype) + elif isinstance(pd_dtype, pd.core.dtypes.dtypes.NumpyEADtype): + return dtype(pd_dtype.numpy_dtype) elif isinstance(pd_dtype, pd.CategoricalDtype): return cudf.CategoricalDtype.from_pandas(pd_dtype) elif isinstance(pd_dtype, pd.IntervalDtype): From 989f67d528c99979250b8bc1ebfb75f72e545cf9 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 23 Feb 2024 16:19:10 -0800 Subject: [PATCH 4/7] Trigger CI From 7870e44a9fa46c0b890ddb2d1fc578665a7536ea Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 26 Feb 2024 15:18:40 -0800 Subject: [PATCH 5/7] Trigger CI From c7ccea8a4a47a372072ba3018fadfa6735c54336 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 27 Feb 2024 14:51:37 -0800 Subject: [PATCH 6/7] Update python/cudf/cudf/core/dtypes.py Co-authored-by: Ashwin Srinath <3190405+shwina@users.noreply.github.com> --- python/cudf/cudf/core/dtypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 2b712638a98..b42eef4d2c7 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -57,7 +57,7 @@ def dtype(arbitrary): pd_dtype = pd.api.types.pandas_dtype(arbitrary) if cudf.api.types._is_pandas_nullable_extension_dtype(pd_dtype): if cudf.get_option("mode.pandas_compatible"): - raise NotImplementedError("not supported") + raise NotImplementedError("Nullable types not supported in pandas compatibility mode") elif isinstance(pd_dtype, pd.StringDtype): return np.dtype("object") else: From a09aca3106561c029ba9a2ed37ee9bbad38f1854 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 27 Feb 2024 15:00:17 -0800 Subject: [PATCH 7/7] style --- python/cudf/cudf/core/dtypes.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index b42eef4d2c7..c658701f851 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -57,7 +57,9 @@ def dtype(arbitrary): pd_dtype = pd.api.types.pandas_dtype(arbitrary) if cudf.api.types._is_pandas_nullable_extension_dtype(pd_dtype): if cudf.get_option("mode.pandas_compatible"): - raise NotImplementedError("Nullable types not supported in pandas compatibility mode") + raise NotImplementedError( + "Nullable types not supported in pandas compatibility mode" + ) elif isinstance(pd_dtype, pd.StringDtype): return np.dtype("object") else: