Skip to content

Commit

Permalink
Clean unneeded/redundant dtype utils (#16309)
Browse files Browse the repository at this point in the history
* Replace `min_scalar_type` with `min_signed_type` (the former just called the latter)
* Replace `numeric_normalize_types` with `find_common_type` followed by a column `astype`
* Replace `_NUMPY_SCTYPES` with hardcoded integer/floating types or with `np.integer`/`np.floating`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #16309
  • Loading branch information
mroeschke authored Jul 19, 2024
1 parent 18f5fe0 commit fa0d89d
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 56 deletions.
6 changes: 3 additions & 3 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@
get_time_unit,
is_column_like,
is_mixed_with_object_dtype,
min_scalar_type,
min_signed_type,
min_unsigned_type,
)
from cudf.utils.utils import _array_ufunc, mask_dtype
Expand Down Expand Up @@ -1356,7 +1356,7 @@ def _label_encoding(
self,
cats: ColumnBase,
dtype: Dtype | None = None,
na_sentinel: ScalarLike | None = None,
na_sentinel: cudf.Scalar | None = None,
):
"""
Convert each value in `self` into an integer code, with `cats`
Expand Down Expand Up @@ -1396,7 +1396,7 @@ def _return_sentinel_column():
return as_column(na_sentinel, dtype=dtype, length=len(self))

if dtype is None:
dtype = min_scalar_type(max(len(cats), na_sentinel), 8)
dtype = min_signed_type(max(len(cats), na_sentinel.value), 8)

if is_mixed_with_object_dtype(self, cats):
return _return_sentinel_column()
Expand Down
12 changes: 8 additions & 4 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@
from cudf.core.mixins import BinaryOperand
from cudf.errors import MixedTypeError
from cudf.utils.dtypes import (
find_common_type,
min_column_type,
min_signed_type,
np_dtypes_to_pandas_dtypes,
numeric_normalize_types,
)

from .numerical_base import NumericalBaseColumn
Expand Down Expand Up @@ -517,11 +517,15 @@ def find_and_replace(
)
elif len(replacement_col) == 1 and len(to_replace_col) == 0:
return self.copy()
to_replace_col, replacement_col, replaced = numeric_normalize_types(
to_replace_col, replacement_col, self
common_type = find_common_type(
(to_replace_col.dtype, replacement_col.dtype, self.dtype)
)
replaced = self.astype(common_type)
df = cudf.DataFrame._from_data(
{"old": to_replace_col, "new": replacement_col}
{
"old": to_replace_col.astype(common_type),
"new": replacement_col.astype(common_type),
}
)
df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True)
if df._data["old"].null_count == 1:
Expand Down
22 changes: 4 additions & 18 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,7 @@
cudf_dtype_from_pydata_dtype,
find_common_type,
is_column_like,
min_scalar_type,
numeric_normalize_types,
min_signed_type,
)
from cudf.utils.performance_tracking import _performance_tracking
from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api
Expand All @@ -103,20 +102,6 @@
"var": "nanvar",
}

_numeric_reduction_ops = (
"mean",
"min",
"max",
"sum",
"product",
"prod",
"std",
"var",
"kurtosis",
"kurt",
"skew",
)


def _shape_mismatch_error(x, y):
raise ValueError(
Expand Down Expand Up @@ -923,7 +908,8 @@ def _init_from_series_list(self, data, columns, index):
final_index = ensure_index(index)

series_lengths = list(map(len, data))
data = numeric_normalize_types(*data)
common_dtype = find_common_type([obj.dtype for obj in data])
data = [obj.astype(common_dtype) for obj in data]
if series_lengths.count(series_lengths[0]) == len(series_lengths):
# Calculating the final dataframe columns by
# getting union of all `index` of the Series objects.
Expand Down Expand Up @@ -8304,7 +8290,7 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes):
)._column.unique()
# Set the column dtype to the codes' dtype. The categories
# will be re-assigned at the end
dtypes[idx] = min_scalar_type(len(categories[idx]))
dtypes[idx] = min_signed_type(len(categories[idx]))
# Otherwise raise an error if columns have different dtypes
elif not all(is_dtype_equal(c.dtype, dtypes[idx]) for c in cols):
raise ValueError("All columns must be the same type")
Expand Down
22 changes: 11 additions & 11 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,9 @@
from cudf.core.single_column_frame import SingleColumnFrame
from cudf.utils.docutils import copy_docstring
from cudf.utils.dtypes import (
_NUMPY_SCTYPES,
_maybe_convert_to_default_type,
find_common_type,
is_mixed_with_object_dtype,
numeric_normalize_types,
)
from cudf.utils.performance_tracking import _performance_tracking
from cudf.utils.utils import _warn_no_dask_cudf, search_range
Expand Down Expand Up @@ -357,12 +355,10 @@ def _data(self):
@_performance_tracking
def __contains__(self, item):
hash(item)
if isinstance(item, bool) or not isinstance(
item,
tuple(
_NUMPY_SCTYPES["int"] + _NUMPY_SCTYPES["float"] + [int, float]
),
):
if not isinstance(item, (np.floating, np.integer, int, float)):
return False
elif isinstance(item, (np.timedelta64, np.datetime64, bool)):
# Cases that would pass the above check
return False
try:
int_item = int(item)
Expand Down Expand Up @@ -1601,9 +1597,13 @@ def append(self, other):
f"either one of them to same dtypes."
)

if isinstance(self._values, cudf.core.column.NumericalColumn):
if self.dtype != other.dtype:
this, other = numeric_normalize_types(self, other)
if (
isinstance(self._column, cudf.core.column.NumericalColumn)
and self.dtype != other.dtype
):
common_type = find_common_type((self.dtype, other.dtype))
this = this.astype(common_type)
other = other.astype(common_type)
to_concat = [this, other]

return self._concat(to_concat)
Expand Down
26 changes: 6 additions & 20 deletions python/cudf/cudf/utils/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,6 @@
BOOL_TYPES = {"bool"}
ALL_TYPES = NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | OTHER_TYPES

# The NumPy scalar types are a bit of a mess as they align with the C types
# so for now we use the `sctypes` dict (although it was made private in 2.0)
_NUMPY_SCTYPES = np.sctypes if hasattr(np, "sctypes") else np._core.sctypes


def np_to_pa_dtype(dtype):
"""Util to convert numpy dtype to PyArrow dtype."""
Expand All @@ -114,12 +110,6 @@ def np_to_pa_dtype(dtype):
return _np_pa_dtypes[cudf.dtype(dtype).type]


def numeric_normalize_types(*args):
"""Cast all args to a common type using numpy promotion logic"""
dtype = np.result_type(*[a.dtype for a in args])
return [a.astype(dtype) for a in args]


def _find_common_type_decimal(dtypes):
# Find the largest scale and the largest difference between
# precision and scale of the columns to be concatenated
Expand Down Expand Up @@ -330,32 +320,28 @@ def can_convert_to_column(obj):
return is_column_like(obj) or cudf.api.types.is_list_like(obj)


def min_scalar_type(a, min_size=8):
return min_signed_type(a, min_size=min_size)


def min_signed_type(x, min_size=8):
def min_signed_type(x: int, min_size: int = 8) -> np.dtype:
"""
Return the smallest *signed* integer dtype
that can represent the integer ``x``
"""
for int_dtype in _NUMPY_SCTYPES["int"]:
for int_dtype in (np.int8, np.int16, np.int32, np.int64):
if (cudf.dtype(int_dtype).itemsize * 8) >= min_size:
if np.iinfo(int_dtype).min <= x <= np.iinfo(int_dtype).max:
return int_dtype
return np.dtype(int_dtype)
# resort to using `int64` and let numpy raise appropriate exception:
return np.int64(x).dtype


def min_unsigned_type(x, min_size=8):
def min_unsigned_type(x: int, min_size: int = 8) -> np.dtype:
"""
Return the smallest *unsigned* integer dtype
that can represent the integer ``x``
"""
for int_dtype in _NUMPY_SCTYPES["uint"]:
for int_dtype in (np.uint8, np.uint16, np.uint32, np.uint64):
if (cudf.dtype(int_dtype).itemsize * 8) >= min_size:
if 0 <= x <= np.iinfo(int_dtype).max:
return int_dtype
return np.dtype(int_dtype)
# resort to using `uint64` and let numpy raise appropriate exception:
return np.uint64(x).dtype

Expand Down

0 comments on commit fa0d89d

Please sign in to comment.