From 089ce99b3e55778e8112ef478573846d77032b51 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 12 Jul 2024 09:58:38 -0700 Subject: [PATCH 1/4] Replace is checks with more standard checks --- python/cudf/cudf/core/_internals/where.py | 6 +++--- python/cudf/cudf/core/column/column.py | 10 +++++----- python/cudf/cudf/core/column/datetime.py | 2 +- python/cudf/cudf/core/column/lists.py | 9 +++++---- python/cudf/cudf/core/column/numerical.py | 4 ++-- python/cudf/cudf/core/series.py | 2 +- 6 files changed, 17 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 44ce0ddef25..5b32219d02c 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -54,9 +54,9 @@ def _check_and_cast_columns_with_other( other_is_scalar = is_scalar(other) if other_is_scalar: - if (isinstance(other, float) and not np.isnan(other)) and ( - source_dtype.type(other) != other - ): + if ( + isinstance(other, (float, np.floating)) and not np.isnan(other) + ) and (source_dtype.type(other) != other): raise TypeError( f"Cannot safely cast non-equivalent " f"{type(other).__name__} to {source_dtype.name}" diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index f633d527681..70be83b5ddb 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1426,9 +1426,10 @@ def column_empty_like( return column_empty(row_count, dtype, masked) -def _has_any_nan(arbitrary): +def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool: + """Check if an object dtype Series or array contains NaN.""" return any( - ((isinstance(x, float) or isinstance(x, np.floating)) and np.isnan(x)) + isinstance(x, (float, np.floating)) and np.isnan(x) for x in np.asarray(arbitrary) ) @@ -2280,9 +2281,8 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: # Notice, we can always cast pure null columns not_null_col_dtypes = [o.dtype for o in objs if o.null_count != len(o)] if len(not_null_col_dtypes) and all( - _is_non_decimal_numeric_dtype(dtyp) - and np.issubdtype(dtyp, np.datetime64) - for dtyp in not_null_col_dtypes + _is_non_decimal_numeric_dtype(dtype) and dtype.kind == "M" + for dtype in not_null_col_dtypes ): common_dtype = find_common_type(not_null_col_dtypes) # Cast all columns to the common dtype diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 214e84028d2..0b683758195 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -645,7 +645,7 @@ def isin(self, values: Sequence) -> ColumnBase: return cudf.core.tools.datetimes._isin_datetimelike(self, values) def can_cast_safely(self, to_dtype: Dtype) -> bool: - if np.issubdtype(to_dtype, np.datetime64): + if to_dtype.kind == "M": # type: ignore[union-attr] to_res, _ = np.datetime_data(to_dtype) self_res, _ = np.datetime_data(self.dtype) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index cc15e78314e..1489b5efa13 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -564,10 +564,11 @@ def take(self, lists_indices: ColumnLike) -> ParentType: raise ValueError( "lists_indices and list column is of different " "size." ) - if not _is_non_decimal_numeric_dtype( - lists_indices_col.children[1].dtype - ) or not np.issubdtype( - lists_indices_col.children[1].dtype, np.integer + if ( + not _is_non_decimal_numeric_dtype( + lists_indices_col.children[1].dtype + ) + or lists_indices_col.children[1].dtype.kind not in "iu" ): raise TypeError( "lists_indices should be column of values of index types." diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index a0550bff72b..7f9af7ee33c 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -232,8 +232,8 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: tmp = self if reflect else other # Guard against division by zero for integers. if ( - (tmp.dtype.type in int_float_dtype_mapping) - and (tmp.dtype.type != np.bool_) + tmp.dtype.type in int_float_dtype_mapping + and tmp.dtype.kind != "b" and ( ( ( diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 8c8fa75918c..83a21fed418 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -214,7 +214,7 @@ def __setitem__(self, key, value): and self._frame.dtype.categories.dtype.kind == "f" ) ) - and isinstance(value, (np.float32, np.float64)) + and isinstance(value, np.floating) and np.isnan(value) ): raise MixedTypeError( From e03c156dfc09fac3807cc3e6fcfa6b79d2b2e0c1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 12 Jul 2024 13:30:34 -0700 Subject: [PATCH 2/4] finish adjusting np is checks --- python/cudf/cudf/core/column/numerical.py | 14 ++------- python/cudf/cudf/core/join/_join_helpers.py | 33 ++++++++------------- python/cudf/cudf/testing/testing.py | 10 +++---- python/cudf/cudf/utils/dtypes.py | 4 +-- 4 files changed, 20 insertions(+), 41 deletions(-) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 7f9af7ee33c..a407d643b6b 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -235,18 +235,8 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: tmp.dtype.type in int_float_dtype_mapping and tmp.dtype.kind != "b" and ( - ( - ( - np.isscalar(tmp) - or ( - isinstance(tmp, cudf.Scalar) - # host to device copy - and tmp.is_valid() - ) - ) - and (0 == tmp) - ) - or ((isinstance(tmp, NumericalColumn)) and (0 in tmp)) + (is_scalar(tmp) and tmp == 0) + or (isinstance(tmp, NumericalColumn) and 0 in tmp) ) ): out_dtype = cudf.dtype("float64") diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index dd0a4f666a1..d825b8b3e78 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -9,7 +9,7 @@ import numpy as np import cudf -from cudf.api.types import is_decimal_dtype, is_dtype_equal +from cudf.api.types import is_decimal_dtype, is_dtype_equal, is_numeric_dtype from cudf.core.column import CategoricalColumn from cudf.core.dtypes import CategoricalDtype @@ -88,38 +88,29 @@ def _match_join_keys( ) if ( - np.issubdtype(ltype, np.number) - and np.issubdtype(rtype, np.number) - and not ( - np.issubdtype(ltype, np.timedelta64) - or np.issubdtype(rtype, np.timedelta64) - ) + is_numeric_dtype(ltype) + and is_numeric_dtype(rtype) + and not (ltype.dtype.kind == "m" or rtype.dtype.kind == "m") ): common_type = ( max(ltype, rtype) if ltype.kind == rtype.kind else np.result_type(ltype, rtype) ) - elif ( - np.issubdtype(ltype, np.datetime64) - and np.issubdtype(rtype, np.datetime64) - ) or ( - np.issubdtype(ltype, np.timedelta64) - and np.issubdtype(rtype, np.timedelta64) + elif (ltype.dtype.kind == "M" and rtype.dtype.kind == "M") or ( + ltype.dtype.kind == "m" and rtype.dtype.kind == "m" ): common_type = max(ltype, rtype) - elif ( - np.issubdtype(ltype, np.datetime64) - or np.issubdtype(ltype, np.timedelta64) - ) and not rcol.fillna(0).can_cast_safely(ltype): + elif ltype.dtype.kind in "mM" and not rcol.fillna(0).can_cast_safely( + ltype + ): raise TypeError( f"Cannot join between {ltype} and {rtype}, please type-cast both " "columns to the same type." ) - elif ( - np.issubdtype(rtype, np.datetime64) - or np.issubdtype(rtype, np.timedelta64) - ) and not lcol.fillna(0).can_cast_safely(rtype): + elif rtype.dtype.kind in "mM" and not lcol.fillna(0).can_cast_safely( + rtype + ): raise TypeError( f"Cannot join between {rtype} and {ltype}, please type-cast both " "columns to the same type." diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index e56c8d867cb..c2072d90e98 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -158,12 +158,12 @@ def assert_column_equal( return True if check_datetimelike_compat: - if np.issubdtype(left.dtype, np.datetime64): + if left.dtype.kind == "M": right = right.astype(left.dtype) - elif np.issubdtype(right.dtype, np.datetime64): + elif right.dtype.kind == "M": left = left.astype(right.dtype) - if np.issubdtype(left.dtype, np.datetime64): + if left.dtype.kind == "M": if not left.equals(right): raise AssertionError( f"[datetimelike_compat=True] {left.values} " @@ -779,9 +779,7 @@ def assert_eq(left, right, **kwargs): tm.assert_index_equal(left, right, **kwargs) elif isinstance(left, np.ndarray) and isinstance(right, np.ndarray): - if np.issubdtype(left.dtype, np.floating) and np.issubdtype( - right.dtype, np.floating - ): + if left.dtype.kind == "f" and right.dtype.kind == "f": assert np.allclose(left, right, equal_nan=True) else: assert np.array_equal(left, right) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 2aa3129ab30..7d5371fa637 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -373,10 +373,10 @@ def min_column_type(x, expected_type): if x.null_count == len(x): return x.dtype - if np.issubdtype(x.dtype, np.floating): + if x.dtype.kind == "f": return get_min_float_dtype(x) - elif np.issubdtype(expected_type, np.integer): + elif cudf.dtype(expected_type).kind in "iu": max_bound_dtype = np.min_scalar_type(x.max()) min_bound_dtype = np.min_scalar_type(x.min()) result_type = np.promote_types(max_bound_dtype, min_bound_dtype) From 7e5a3b646282eee2195e80d44c165764ac58a0d2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 12 Jul 2024 17:46:33 -0700 Subject: [PATCH 3/4] Handle NA, .dtype typo --- python/cudf/cudf/core/column/numerical.py | 3 ++- python/cudf/cudf/core/join/_join_helpers.py | 14 +++++--------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index a407d643b6b..54085055f38 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -234,8 +234,9 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: if ( tmp.dtype.type in int_float_dtype_mapping and tmp.dtype.kind != "b" + # tmp == 0 can return NA and ( - (is_scalar(tmp) and tmp == 0) + (is_scalar(tmp) and ((tmp == 0) is True)) or (isinstance(tmp, NumericalColumn) and 0 in tmp) ) ): diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index d825b8b3e78..32c84763401 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -90,27 +90,23 @@ def _match_join_keys( if ( is_numeric_dtype(ltype) and is_numeric_dtype(rtype) - and not (ltype.dtype.kind == "m" or rtype.dtype.kind == "m") + and not (ltype.kind == "m" or rtype.kind == "m") ): common_type = ( max(ltype, rtype) if ltype.kind == rtype.kind else np.result_type(ltype, rtype) ) - elif (ltype.dtype.kind == "M" and rtype.dtype.kind == "M") or ( - ltype.dtype.kind == "m" and rtype.dtype.kind == "m" + elif (ltype.kind == "M" and rtype.kind == "M") or ( + ltype.kind == "m" and rtype.kind == "m" ): common_type = max(ltype, rtype) - elif ltype.dtype.kind in "mM" and not rcol.fillna(0).can_cast_safely( - ltype - ): + elif ltype.kind in "mM" and not rcol.fillna(0).can_cast_safely(ltype): raise TypeError( f"Cannot join between {ltype} and {rtype}, please type-cast both " "columns to the same type." ) - elif rtype.dtype.kind in "mM" and not lcol.fillna(0).can_cast_safely( - rtype - ): + elif rtype.kind in "mM" and not lcol.fillna(0).can_cast_safely(rtype): raise TypeError( f"Cannot join between {rtype} and {ltype}, please type-cast both " "columns to the same type." From 7c0822c0176c43616b1fc69504485cbf61e7b840 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 15 Jul 2024 12:23:02 -0700 Subject: [PATCH 4/4] Split out binop conditional --- python/cudf/cudf/core/column/numerical.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 1e7f95c36c3..b156e75be7d 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -234,14 +234,15 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: if ( tmp.dtype.type in int_float_dtype_mapping and tmp.dtype.kind != "b" - # tmp == 0 can return NA - and ( - (is_scalar(tmp) and ((tmp == 0) is True)) - or (isinstance(tmp, NumericalColumn) and 0 in tmp) - ) ): - out_dtype = cudf.dtype("float64") - + if isinstance(tmp, NumericalColumn) and 0 in tmp: + out_dtype = cudf.dtype("float64") + elif isinstance(tmp, cudf.Scalar): + if tmp.is_valid() and tmp == 0: + # tmp == 0 can return NA + out_dtype = cudf.dtype("float64") + elif is_scalar(tmp) and tmp == 0: + out_dtype = cudf.dtype("float64") if op in { "__lt__", "__gt__",