From 1889c7c0f517c95143016a6e391275144a034f7a Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Mon, 15 Jul 2024 20:32:15 +0200 Subject: [PATCH] MAINT: Adapt to NumPy 2 promotion changes (#16141) Splitting out the non-API changes from gh-15897, the Scalar API change is required for the tests to pass with NumPy 2, but almost all changes should be relatively straightforward here on their own. (I will add inline comments.) --- This PR does not fix integer comparisons, there are currently no tests that run into these. xref: https://github.com/rapidsai/build-planning/issues/38 Authors: - Sebastian Berg (https://github.com/seberg) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/16141 --- python/cudf/cudf/core/_internals/where.py | 24 +++++++++++------- python/cudf/cudf/core/column/categorical.py | 4 ++- python/cudf/cudf/core/column/numerical.py | 27 ++++++++++++++++----- python/cudf/cudf/tests/test_binops.py | 21 +++++++++++++--- python/cudf/cudf/tests/test_doctests.py | 13 +++++++++- python/cudf/cudf/tests/test_dtypes.py | 1 - 6 files changed, 69 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 44ce0ddef25..f3183e6029d 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -54,13 +54,17 @@ def _check_and_cast_columns_with_other( other_is_scalar = is_scalar(other) if other_is_scalar: - if (isinstance(other, float) and not np.isnan(other)) and ( - source_dtype.type(other) != other - ): - raise TypeError( - f"Cannot safely cast non-equivalent " - f"{type(other).__name__} to {source_dtype.name}" - ) + if isinstance(other, float) and not np.isnan(other): + try: + is_safe = source_dtype.type(other) == other + except OverflowError: + is_safe = False + + if not is_safe: + raise TypeError( + f"Cannot safely cast non-equivalent " + f"{type(other).__name__} to {source_dtype.name}" + ) if 
cudf.utils.utils.is_na_like(other): return _normalize_categorical( @@ -84,8 +88,10 @@ def _check_and_cast_columns_with_other( ) return _normalize_categorical(source_col, other.astype(source_dtype)) - if _is_non_decimal_numeric_dtype(source_dtype) and _can_cast( - other, source_dtype + if ( + _is_non_decimal_numeric_dtype(source_dtype) + and not other_is_scalar # can-cast fails for Python scalars + and _can_cast(other, source_dtype) ): common_dtype = source_dtype elif ( diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index f763d3b4b0c..9aaccca349d 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -47,7 +47,9 @@ ) -_DEFAULT_CATEGORICAL_VALUE = -1 +# Using np.int8(-1) to allow silent wrap-around when casting to uint +# it may make sense to make this dtype specific or a function. +_DEFAULT_CATEGORICAL_VALUE = np.int8(-1) class CategoricalAccessor(ColumnMethods): diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index a0550bff72b..b8fa00e9643 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -301,15 +301,28 @@ def normalize_binop_value( if isinstance(other, cudf.Scalar): if self.dtype == other.dtype: return other + # expensive device-host transfer just to # adjust the dtype other = other.value + + # NumPy 2 needs a Python scalar to do weak promotion, but + # pandas forces weak promotion always + # TODO: We could use 0, 0.0, and 0j for promotion to avoid copies. + if other.dtype.kind in "ifc": + other = other.item() + elif not isinstance(other, (int, float, complex)): + # Go via NumPy to get the value + other = np.array(other) + if other.dtype.kind in "ifc": + other = other.item() + # Try and match pandas and hence numpy. Deduce the common - # dtype via the _value_ of other, and the dtype of self. 
TODO: - # When NEP50 is accepted, this might want changed or - # simplified. - # This is not at all simple: - # np.result_type(np.int64(0), np.uint8) + # dtype via the _value_ of other, and the dtype of self on NumPy 1.x + # with NumPy 2, we force weak promotion even for our/NumPy scalars + # to match pandas 2.2. + # Weak promotion is not at all simple: + # np.result_type(0, np.uint8) # => np.uint8 # np.result_type(np.asarray([0], dtype=np.int64), np.uint8) # => np.int64 @@ -626,7 +639,9 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: min_, max_ = iinfo.min, iinfo.max # best we can do is hope to catch it here and avoid compare - if (self.min() >= min_) and (self.max() <= max_): + # Use Python floats, which have precise comparison for float64. + # NOTE(seberg): it would make sense to limit to the mantissa range. + if (float(self.min()) >= min_) and (float(self.max()) <= max_): filled = self.fillna(0) return (cudf.Series(filled) % 1 == 0).all() else: diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 7d8c3b53115..5265278db4c 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -539,7 +539,14 @@ def test_series_reflected_ops_scalar(func, dtype, obj_class): if obj_class == "Index": gs = Index(gs) - gs_result = func(gs) + try: + gs_result = func(gs) + except OverflowError: + # An error is fine, if pandas raises the same error: + with pytest.raises(OverflowError): + func(random_series) + + return # class typing if obj_class == "Index": @@ -589,7 +596,14 @@ def test_series_reflected_ops_cudf_scalar(funcs, dtype, obj_class): if obj_class == "Index": gs = Index(gs) - gs_result = gpu_func(gs) + try: + gs_result = gpu_func(gs) + except OverflowError: + # An error is fine, if pandas raises the same error: + with pytest.raises(OverflowError): + cpu_func(random_series) + + return # class typing if obj_class == "Index": @@ -770,7 +784,8 @@ def test_operator_func_series_and_scalar( 
fill_value=fill_value, ) pdf_series_result = getattr(pdf_series, func)( - scalar, fill_value=fill_value + np.array(scalar)[()] if use_cudf_scalar else scalar, + fill_value=fill_value, ) assert_eq(pdf_series_result, gdf_series_result) diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py index 0da5c6b04d6..794660cffcb 100644 --- a/python/cudf/cudf/tests/test_doctests.py +++ b/python/cudf/cudf/tests/test_doctests.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. import contextlib import doctest import inspect @@ -8,6 +8,7 @@ import numpy as np import pytest +from packaging import version import cudf @@ -80,6 +81,16 @@ def chdir_to_tmp_path(cls, tmp_path): yield os.chdir(original_directory) + @pytest.fixture(autouse=True) + def prinoptions(cls): + # TODO: NumPy now prints scalars as `np.int8(1)`, etc. This should + # be adapted eventually. + if version.parse(np.__version__) >= version.parse("2.0"): + with np.printoptions(legacy="1.25"): + yield + else: + yield + @pytest.mark.parametrize( "docstring", itertools.chain(*[_find_doctests_in_obj(mod) for mod in tests]), diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index edb534a3618..c62b5889fdd 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -341,7 +341,6 @@ def test_dtype(in_dtype, expect): np.complex128, complex, "S", - "a", "V", "float16", np.float16,