Clean unneeded/redudant dtype utils (#16309)

* Replace `min_scalar_type` with `min_signed_type` (the former just called the latter) * Replace `numeric_normalize_types` with `find_common_dtype` followed by a column `astype` * Removed `_NUMPY_SCTYPES` with just hardcoding the integer/floating types or using `np.integer`/`np.floating` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: #16309
rapidsai · Jul 19, 2024 · fa0d89d · fa0d89d
1 parent 18f5fe0
commit fa0d89d
Show file tree

Hide file tree

Showing 5 changed files with 32 additions and 56 deletions.
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -71,7 +71,7 @@
     get_time_unit,
     is_column_like,
     is_mixed_with_object_dtype,
-    min_scalar_type,
+    min_signed_type,
     min_unsigned_type,
 )
 from cudf.utils.utils import _array_ufunc, mask_dtype
@@ -1356,7 +1356,7 @@ def _label_encoding(
         self,
         cats: ColumnBase,
         dtype: Dtype | None = None,
-        na_sentinel: ScalarLike | None = None,
+        na_sentinel: cudf.Scalar | None = None,
     ):
         """
         Convert each value in `self` into an integer code, with `cats`
@@ -1396,7 +1396,7 @@ def _return_sentinel_column():
             return as_column(na_sentinel, dtype=dtype, length=len(self))
 
         if dtype is None:
-            dtype = min_scalar_type(max(len(cats), na_sentinel), 8)
+            dtype = min_signed_type(max(len(cats), na_sentinel.value), 8)
 
         if is_mixed_with_object_dtype(self, cats):
             return _return_sentinel_column()

diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
@@ -29,10 +29,10 @@
 from cudf.core.mixins import BinaryOperand
 from cudf.errors import MixedTypeError
 from cudf.utils.dtypes import (
+    find_common_type,
     min_column_type,
     min_signed_type,
     np_dtypes_to_pandas_dtypes,
-    numeric_normalize_types,
 )
 
 from .numerical_base import NumericalBaseColumn
@@ -517,11 +517,15 @@ def find_and_replace(
             )
         elif len(replacement_col) == 1 and len(to_replace_col) == 0:
             return self.copy()
-        to_replace_col, replacement_col, replaced = numeric_normalize_types(
-            to_replace_col, replacement_col, self
+        common_type = find_common_type(
+            (to_replace_col.dtype, replacement_col.dtype, self.dtype)
         )
+        replaced = self.astype(common_type)
         df = cudf.DataFrame._from_data(
-            {"old": to_replace_col, "new": replacement_col}
+            {
+                "old": to_replace_col.astype(common_type),
+                "new": replacement_col.astype(common_type),
+            }
         )
         df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True)
         if df._data["old"].null_count == 1:

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -83,8 +83,7 @@
     cudf_dtype_from_pydata_dtype,
     find_common_type,
     is_column_like,
-    min_scalar_type,
-    numeric_normalize_types,
+    min_signed_type,
 )
 from cudf.utils.performance_tracking import _performance_tracking
 from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api
@@ -103,20 +102,6 @@
     "var": "nanvar",
 }
 
-_numeric_reduction_ops = (
-    "mean",
-    "min",
-    "max",
-    "sum",
-    "product",
-    "prod",
-    "std",
-    "var",
-    "kurtosis",
-    "kurt",
-    "skew",
-)
-
 
 def _shape_mismatch_error(x, y):
     raise ValueError(
@@ -923,7 +908,8 @@ def _init_from_series_list(self, data, columns, index):
             final_index = ensure_index(index)
 
         series_lengths = list(map(len, data))
-        data = numeric_normalize_types(*data)
+        common_dtype = find_common_type([obj.dtype for obj in data])
+        data = [obj.astype(common_dtype) for obj in data]
         if series_lengths.count(series_lengths[0]) == len(series_lengths):
             # Calculating the final dataframe columns by
             # getting union of all `index` of the Series objects.
@@ -8304,7 +8290,7 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes):
             )._column.unique()
             # Set the column dtype to the codes' dtype. The categories
             # will be re-assigned at the end
-            dtypes[idx] = min_scalar_type(len(categories[idx]))
+            dtypes[idx] = min_signed_type(len(categories[idx]))
         # Otherwise raise an error if columns have different dtypes
         elif not all(is_dtype_equal(c.dtype, dtypes[idx]) for c in cols):
             raise ValueError("All columns must be the same type")

diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
@@ -52,11 +52,9 @@
 from cudf.core.single_column_frame import SingleColumnFrame
 from cudf.utils.docutils import copy_docstring
 from cudf.utils.dtypes import (
-    _NUMPY_SCTYPES,
     _maybe_convert_to_default_type,
     find_common_type,
     is_mixed_with_object_dtype,
-    numeric_normalize_types,
 )
 from cudf.utils.performance_tracking import _performance_tracking
 from cudf.utils.utils import _warn_no_dask_cudf, search_range
@@ -357,12 +355,10 @@ def _data(self):
     @_performance_tracking
     def __contains__(self, item):
         hash(item)
-        if isinstance(item, bool) or not isinstance(
-            item,
-            tuple(
-                _NUMPY_SCTYPES["int"] + _NUMPY_SCTYPES["float"] + [int, float]
-            ),
-        ):
+        if not isinstance(item, (np.floating, np.integer, int, float)):
+            return False
+        elif isinstance(item, (np.timedelta64, np.datetime64, bool)):
+            # Cases that would pass the above check
             return False
         try:
             int_item = int(item)
@@ -1601,9 +1597,13 @@ def append(self, other):
                         f"either one of them to same dtypes."
                     )
 
-                if isinstance(self._values, cudf.core.column.NumericalColumn):
-                    if self.dtype != other.dtype:
-                        this, other = numeric_normalize_types(self, other)
+                if (
+                    isinstance(self._column, cudf.core.column.NumericalColumn)
+                    and self.dtype != other.dtype
+                ):
+                    common_type = find_common_type((self.dtype, other.dtype))
+                    this = this.astype(common_type)
+                    other = other.astype(common_type)
                 to_concat = [this, other]
 
         return self._concat(to_concat)

diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
@@ -89,10 +89,6 @@
 BOOL_TYPES = {"bool"}
 ALL_TYPES = NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | OTHER_TYPES
 
-# The NumPy scalar types are a bit of a mess as they align with the C types
-# so for now we use the `sctypes` dict (although it was made private in 2.0)
-_NUMPY_SCTYPES = np.sctypes if hasattr(np, "sctypes") else np._core.sctypes
-
 
 def np_to_pa_dtype(dtype):
     """Util to convert numpy dtype to PyArrow dtype."""
@@ -114,12 +110,6 @@ def np_to_pa_dtype(dtype):
     return _np_pa_dtypes[cudf.dtype(dtype).type]
 
 
-def numeric_normalize_types(*args):
-    """Cast all args to a common type using numpy promotion logic"""
-    dtype = np.result_type(*[a.dtype for a in args])
-    return [a.astype(dtype) for a in args]
-
-
 def _find_common_type_decimal(dtypes):
     # Find the largest scale and the largest difference between
     # precision and scale of the columns to be concatenated
@@ -330,32 +320,28 @@ def can_convert_to_column(obj):
     return is_column_like(obj) or cudf.api.types.is_list_like(obj)
 
 
-def min_scalar_type(a, min_size=8):
-    return min_signed_type(a, min_size=min_size)
-
-
-def min_signed_type(x, min_size=8):
+def min_signed_type(x: int, min_size: int = 8) -> np.dtype:
     """
     Return the smallest *signed* integer dtype
     that can represent the integer ``x``
     """
-    for int_dtype in _NUMPY_SCTYPES["int"]:
+    for int_dtype in (np.int8, np.int16, np.int32, np.int64):
         if (cudf.dtype(int_dtype).itemsize * 8) >= min_size:
             if np.iinfo(int_dtype).min <= x <= np.iinfo(int_dtype).max:
-                return int_dtype
+                return np.dtype(int_dtype)
     # resort to using `int64` and let numpy raise appropriate exception:
     return np.int64(x).dtype
 
 
-def min_unsigned_type(x, min_size=8):
+def min_unsigned_type(x: int, min_size: int = 8) -> np.dtype:
     """
     Return the smallest *unsigned* integer dtype
     that can represent the integer ``x``
     """
-    for int_dtype in _NUMPY_SCTYPES["uint"]:
+    for int_dtype in (np.uint8, np.uint16, np.uint32, np.uint64):
         if (cudf.dtype(int_dtype).itemsize * 8) >= min_size:
             if 0 <= x <= np.iinfo(int_dtype).max:
-                return int_dtype
+                return np.dtype(int_dtype)
     # resort to using `uint64` and let numpy raise appropriate exception:
     return np.uint64(x).dtype