diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 9467bbeed15..654f936ec6b 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -71,7 +71,7 @@ get_time_unit, is_column_like, is_mixed_with_object_dtype, - min_scalar_type, + min_signed_type, min_unsigned_type, ) from cudf.utils.utils import _array_ufunc, mask_dtype @@ -1351,7 +1351,7 @@ def _label_encoding( self, cats: ColumnBase, dtype: Dtype | None = None, - na_sentinel: ScalarLike | None = None, + na_sentinel: cudf.Scalar | None = None, ): """ Convert each value in `self` into an integer code, with `cats` @@ -1391,7 +1391,7 @@ def _return_sentinel_column(): return as_column(na_sentinel, dtype=dtype, length=len(self)) if dtype is None: - dtype = min_scalar_type(max(len(cats), na_sentinel), 8) + dtype = min_signed_type(max(len(cats), na_sentinel.value), 8) if is_mixed_with_object_dtype(self, cats): return _return_sentinel_column() diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index cea68c88c90..31d2e6e0e28 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -29,10 +29,10 @@ from cudf.core.mixins import BinaryOperand from cudf.errors import MixedTypeError from cudf.utils.dtypes import ( + find_common_type, min_column_type, min_signed_type, np_dtypes_to_pandas_dtypes, - numeric_normalize_types, ) from .numerical_base import NumericalBaseColumn @@ -517,11 +517,15 @@ def find_and_replace( ) elif len(replacement_col) == 1 and len(to_replace_col) == 0: return self.copy() - to_replace_col, replacement_col, replaced = numeric_normalize_types( - to_replace_col, replacement_col, self + common_type = find_common_type( + (to_replace_col.dtype, replacement_col.dtype, self.dtype) ) + replaced = self.astype(common_type) df = cudf.DataFrame._from_data( - {"old": to_replace_col, "new": replacement_col} + { + "old": to_replace_col.astype(common_type), + "new": replacement_col.astype(common_type), + } ) df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True) if df._data["old"].null_count == 1: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b3d938829c9..c7a5656c319 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -83,8 +83,7 @@ cudf_dtype_from_pydata_dtype, find_common_type, is_column_like, - min_scalar_type, - numeric_normalize_types, + min_signed_type, ) from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api @@ -103,20 +102,6 @@ "var": "nanvar", } -_numeric_reduction_ops = ( - "mean", - "min", - "max", - "sum", - "product", - "prod", - "std", - "var", - "kurtosis", - "kurt", - "skew", -) - def _shape_mismatch_error(x, y): raise ValueError( @@ -923,7 +908,8 @@ def _init_from_series_list(self, data, columns, index): final_index = ensure_index(index) series_lengths = list(map(len, data)) - data = numeric_normalize_types(*data) + common_dtype = find_common_type([obj.dtype for obj in data]) + data = [obj.astype(common_dtype) for obj in data] if series_lengths.count(series_lengths[0]) == len(series_lengths): # Calculating the final dataframe columns by # getting union of all `index` of the Series objects. @@ -8305,7 +8291,7 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): )._column.unique() # Set the column dtype to the codes' dtype. The categories # will be re-assigned at the end - dtypes[idx] = min_scalar_type(len(categories[idx])) + dtypes[idx] = min_signed_type(len(categories[idx])) # Otherwise raise an error if columns have different dtypes elif not all(is_dtype_equal(c.dtype, dtypes[idx]) for c in cols): raise ValueError("All columns must be the same type") diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 4164f981fca..cd52a34e35e 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -52,11 +52,9 @@ from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( - _NUMPY_SCTYPES, _maybe_convert_to_default_type, find_common_type, is_mixed_with_object_dtype, - numeric_normalize_types, ) from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import _warn_no_dask_cudf, search_range @@ -357,12 +355,10 @@ def _data(self): @_performance_tracking def __contains__(self, item): hash(item) - if isinstance(item, bool) or not isinstance( - item, - tuple( - _NUMPY_SCTYPES["int"] + _NUMPY_SCTYPES["float"] + [int, float] - ), - ): + if not isinstance(item, (np.floating, np.integer, int, float)): + return False + elif isinstance(item, (np.timedelta64, np.datetime64, bool)): + # Cases that would pass the above check return False try: int_item = int(item) @@ -1601,9 +1597,13 @@ def append(self, other): f"either one of them to same dtypes." ) - if isinstance(self._values, cudf.core.column.NumericalColumn): - if self.dtype != other.dtype: - this, other = numeric_normalize_types(self, other) + if ( + isinstance(self._column, cudf.core.column.NumericalColumn) + and self.dtype != other.dtype + ): + common_type = find_common_type((self.dtype, other.dtype)) + this = this.astype(common_type) + other = other.astype(common_type) to_concat = [this, other] return self._concat(to_concat) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index af912bee342..69c268db149 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -89,10 +89,6 @@ BOOL_TYPES = {"bool"} ALL_TYPES = NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | OTHER_TYPES -# The NumPy scalar types are a bit of a mess as they align with the C types -# so for now we use the `sctypes` dict (although it was made private in 2.0) -_NUMPY_SCTYPES = np.sctypes if hasattr(np, "sctypes") else np._core.sctypes - def np_to_pa_dtype(dtype): """Util to convert numpy dtype to PyArrow dtype.""" @@ -114,12 +110,6 @@ def np_to_pa_dtype(dtype): return _np_pa_dtypes[cudf.dtype(dtype).type] -def numeric_normalize_types(*args): - """Cast all args to a common type using numpy promotion logic""" - dtype = np.result_type(*[a.dtype for a in args]) - return [a.astype(dtype) for a in args] - - def _find_common_type_decimal(dtypes): # Find the largest scale and the largest difference between # precision and scale of the columns to be concatenated @@ -330,32 +320,28 @@ def can_convert_to_column(obj): return is_column_like(obj) or cudf.api.types.is_list_like(obj) -def min_scalar_type(a, min_size=8): - return min_signed_type(a, min_size=min_size) - - -def min_signed_type(x, min_size=8): +def min_signed_type(x: int, min_size: int = 8) -> np.dtype: """ Return the smallest *signed* integer dtype that can represent the integer ``x`` """ - for int_dtype in _NUMPY_SCTYPES["int"]: + for int_dtype in (np.int8, np.int16, np.int32, np.int64): if (cudf.dtype(int_dtype).itemsize * 8) >= min_size: if np.iinfo(int_dtype).min <= x <= np.iinfo(int_dtype).max: - return int_dtype + return np.dtype(int_dtype) # resort to using `int64` and let numpy raise appropriate exception: return np.int64(x).dtype -def min_unsigned_type(x, min_size=8): +def min_unsigned_type(x: int, min_size: int = 8) -> np.dtype: """ Return the smallest *unsigned* integer dtype that can represent the integer ``x`` """ - for int_dtype in _NUMPY_SCTYPES["uint"]: + for int_dtype in (np.uint8, np.uint16, np.uint32, np.uint64): if (cudf.dtype(int_dtype).itemsize * 8) >= min_size: if 0 <= x <= np.iinfo(int_dtype).max: - return int_dtype + return np.dtype(int_dtype) # resort to using `uint64` and let numpy raise appropriate exception: return np.uint64(x).dtype