From beda22ed28030bbed2faaa5a49509255f11976aa Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 15 Jul 2024 16:29:05 -1000 Subject: [PATCH] Replace is_bool_dtype with checking .dtype.kind (#16255) It appears this was called when we already had a dtype object, so we can instead simply check the .kind attribute. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16255 --- python/cudf/cudf/core/_base_index.py | 9 +++------ python/cudf/cudf/core/_internals/where.py | 8 ++------ python/cudf/cudf/core/column/column.py | 7 +++---- python/cudf/cudf/core/column/numerical.py | 5 ++--- python/cudf/cudf/core/dataframe.py | 13 ++++++------- python/cudf/cudf/core/groupby/groupby.py | 4 ++-- python/cudf/cudf/core/indexing_utils.py | 3 +-- python/cudf/cudf/core/multiindex.py | 4 ---- python/cudf/cudf/core/series.py | 11 +++++------ python/cudf/cudf/core/single_column_frame.py | 3 +-- python/cudf/cudf/tests/test_dataframe.py | 2 +- python/cudf/cudf/tests/test_index.py | 5 ++--- 12 files changed, 28 insertions(+), 46 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 9ba2d161619..479f87bb78b 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -20,7 +20,6 @@ from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import ( - is_bool_dtype, is_integer, is_integer_dtype, is_list_like, @@ -610,10 +609,8 @@ def union(self, other, sort=None): ) if cudf.get_option("mode.pandas_compatible"): - if ( - is_bool_dtype(self.dtype) and not is_bool_dtype(other.dtype) - ) or ( - not is_bool_dtype(self.dtype) and is_bool_dtype(other.dtype) + if (self.dtype.kind == "b" and other.dtype.kind != "b") or ( + self.dtype.kind != "b" and other.dtype.kind == "b" ): # Bools + other types will result 
in mixed type. # This is not yet consistent in pandas and specific to APIs. @@ -2154,7 +2151,7 @@ def _apply_boolean_mask(self, boolean_mask): Rows corresponding to `False` is dropped. """ boolean_mask = cudf.core.column.as_column(boolean_mask) - if not is_bool_dtype(boolean_mask.dtype): + if boolean_mask.dtype.kind != "b": raise ValueError("boolean_mask is not boolean type.") return self._from_columns_like_self( diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index f3183e6029d..4a36be76b6d 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -7,11 +7,7 @@ import numpy as np import cudf -from cudf.api.types import ( - _is_non_decimal_numeric_dtype, - is_bool_dtype, - is_scalar, -) +from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar from cudf.core.dtypes import CategoricalDtype from cudf.utils.dtypes import ( _can_cast, @@ -112,7 +108,7 @@ def _check_and_cast_columns_with_other( other = cudf.Scalar(other) if is_mixed_with_object_dtype(other, source_col) or ( - is_bool_dtype(source_dtype) and not is_bool_dtype(common_dtype) + source_dtype.kind == "b" and common_dtype.kind != "b" ): raise TypeError(mixed_err) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index fd3664ecac4..dbdf501e022 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -41,7 +41,6 @@ _is_non_decimal_numeric_dtype, _is_pandas_nullable_extension_dtype, infer_dtype, - is_bool_dtype, is_dtype_equal, is_scalar, is_string_dtype, @@ -619,7 +618,7 @@ def _scatter_by_column( key: cudf.core.column.NumericalColumn, value: cudf.core.scalar.Scalar | ColumnBase, ) -> Self: - if is_bool_dtype(key.dtype): + if key.dtype.kind == "b": # `key` is boolean mask if len(key) != len(self): raise ValueError( @@ -644,7 +643,7 @@ def _scatter_by_column( self._check_scatter_key_length(num_keys, value) - if 
is_bool_dtype(key.dtype): + if key.dtype.kind == "b": return libcudf.copying.boolean_mask_scatter([value], [self], key)[ 0 ]._with_type_metadata(self.dtype) @@ -1083,7 +1082,7 @@ def as_decimal_column( def apply_boolean_mask(self, mask) -> ColumnBase: mask = as_column(mask) - if not is_bool_dtype(mask.dtype): + if mask.dtype.kind != "b": raise ValueError("boolean_mask is not boolean type.") return apply_boolean_mask([self], mask)[0]._with_type_metadata( diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 7f05a5f91a1..cea68c88c90 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -13,7 +13,6 @@ from cudf import _lib as libcudf from cudf._lib import pylibcudf from cudf.api.types import ( - is_bool_dtype, is_float_dtype, is_integer, is_integer_dtype, @@ -159,7 +158,7 @@ def __setitem__(self, key: Any, value: Any): else as_column(value) ) - if not is_bool_dtype(self.dtype) and is_bool_dtype(device_value.dtype): + if self.dtype.kind != "b" and device_value.dtype.kind == "b": raise TypeError(f"Invalid value {value} for dtype {self.dtype}") else: device_value = device_value.astype(self.dtype) @@ -264,7 +263,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: f"{self.dtype.type.__name__} and " f"{other.dtype.type.__name__}" ) - if is_bool_dtype(self.dtype) or is_bool_dtype(other.dtype): + if self.dtype.kind == "b" or other.dtype.kind == "b": out_dtype = "bool" if ( diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 2121e623c1c..b3d938829c9 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -32,7 +32,6 @@ from cudf.api.extensions import no_default from cudf.api.types import ( _is_scalar_or_zero_d_array, - is_bool_dtype, is_dict_like, is_dtype_equal, is_list_like, @@ -171,7 +170,7 @@ def _can_downcast_to_series(self, df, arg): ): return False else: - if 
is_bool_dtype(as_column(arg[0]).dtype) and not isinstance( + if as_column(arg[0]).dtype.kind == "b" and not isinstance( arg[1], slice ): return True @@ -320,7 +319,7 @@ def _getitem_tuple_arg(self, arg): tmp_arg[1], ) - if is_bool_dtype(tmp_arg[0].dtype): + if tmp_arg[0].dtype.kind == "b": df = columns_df._apply_boolean_mask( BooleanMask(tmp_arg[0], len(columns_df)) ) @@ -3678,8 +3677,8 @@ def agg(self, aggs, axis=None): """ dtypes = [self[col].dtype for col in self._column_names] common_dtype = find_common_type(dtypes) - if not is_bool_dtype(common_dtype) and any( - is_bool_dtype(dtype) for dtype in dtypes + if common_dtype.kind != "b" and any( + dtype.kind == "b" for dtype in dtypes ): raise MixedTypeError("Cannot create a column with mixed types") @@ -6305,8 +6304,8 @@ def _reduce( and any( not is_object_dtype(dtype) for dtype in source_dtypes ) - or not is_bool_dtype(common_dtype) - and any(is_bool_dtype(dtype) for dtype in source_dtypes) + or common_dtype.kind != "b" + and any(dtype.kind == "b" for dtype in source_dtypes) ): raise TypeError( "Columns must all have the same dtype to " diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 8659d7c2392..d2c75715be2 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -22,7 +22,7 @@ from cudf._lib.sort import segmented_sort_by_key from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default -from cudf.api.types import is_bool_dtype, is_list_like, is_numeric_dtype +from cudf.api.types import is_list_like, is_numeric_dtype from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable from cudf.core.column.column import ColumnBase, StructDtype, as_column @@ -1534,7 +1534,7 @@ def mult(df): # For `sum` & `product`, boolean types # will need to result in `int64` type. 
for name, col in res._data.items(): - if is_bool_dtype(col.dtype): + if col.dtype.kind == "b": res._data[name] = col.astype("int") return res diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index a5fed02cbed..9c81b0eb607 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -10,7 +10,6 @@ import cudf from cudf.api.types import ( _is_scalar_or_zero_d_array, - is_bool_dtype, is_integer, is_integer_dtype, ) @@ -230,7 +229,7 @@ def parse_row_iloc_indexer(key: Any, n: int) -> IndexingSpec: key = cudf.core.column.as_column(key) if isinstance(key, cudf.core.column.CategoricalColumn): key = key.astype(key.codes.dtype) - if is_bool_dtype(key.dtype): + if key.dtype.kind == "b": return MaskIndexer(BooleanMask(key, n)) elif len(key) == 0: return EmptyIndexer() diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 3ed72ff812a..ff4b06c6334 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -841,10 +841,6 @@ def _get_row_major( | tuple[Any, ...] 
| list[tuple[Any, ...]], ) -> DataFrameOrSeries: - if pd.api.types.is_bool_dtype( - list(row_tuple) if isinstance(row_tuple, tuple) else row_tuple - ): - return df[row_tuple] if isinstance(row_tuple, slice): if row_tuple.start is None: row_tuple = slice(self[0], row_tuple.stop, row_tuple.step) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 8c8fa75918c..e12cc3d52fb 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -22,7 +22,6 @@ from cudf.api.types import ( _is_non_decimal_numeric_dtype, _is_scalar_or_zero_d_array, - is_bool_dtype, is_dict_like, is_integer, is_integer_dtype, @@ -221,10 +220,10 @@ def __setitem__(self, key, value): f"Cannot assign {value=} to " f"non-float dtype={self._frame.dtype}" ) - elif ( - self._frame.dtype.kind == "b" - and not is_bool_dtype(value) - and value not in {None, cudf.NA} + elif self._frame.dtype.kind == "b" and not ( + value in {None, cudf.NA} + or isinstance(value, (np.bool_, bool)) + or (isinstance(value, cudf.Scalar) and value.dtype.kind == "b") ): raise MixedTypeError( f"Cannot assign {value=} to " @@ -3221,7 +3220,7 @@ def describe( percentiles = np.array([0.25, 0.5, 0.75]) dtype = "str" - if is_bool_dtype(self.dtype): + if self.dtype.kind == "b": data = _describe_categorical(self, percentiles) elif isinstance(self._column, cudf.core.column.NumericalColumn): data = _describe_numeric(self, percentiles) diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index f9555aee6a2..04c7db7a53c 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -11,7 +11,6 @@ from cudf.api.extensions import no_default from cudf.api.types import ( _is_scalar_or_zero_d_array, - is_bool_dtype, is_integer, is_integer_dtype, is_numeric_dtype, @@ -361,7 +360,7 @@ def _get_elements_from_column(self, arg) -> ScalarLike | ColumnBase: arg = cudf.core.column.column_empty(0, 
dtype="int32") if is_integer_dtype(arg.dtype): return self._column.take(arg) - if is_bool_dtype(arg.dtype): + if arg.dtype.kind == "b": if (bn := len(arg)) != (n := len(self)): raise IndexError( f"Boolean mask has wrong length: {bn} not {n}" diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 7ccf83e424c..2009fc49ce5 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -5234,7 +5234,7 @@ def test_rowwise_ops(data, op, skipna, numeric_only): else (pdf[column].notna().count() == 0) ) or cudf.api.types.is_numeric_dtype(pdf[column].dtype) - or cudf.api.types.is_bool_dtype(pdf[column].dtype) + or pdf[column].dtype.kind == "b" for column in pdf ): with pytest.raises(TypeError): diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 05dcd85df6a..9eba6122d26 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -16,7 +16,6 @@ import cudf from cudf.api.extensions import no_default -from cudf.api.types import is_bool_dtype from cudf.core.index import CategoricalIndex, DatetimeIndex, Index, RangeIndex from cudf.testing import assert_eq from cudf.testing._utils import ( @@ -2397,8 +2396,8 @@ def test_intersection_index(idx1, idx2, sort, pandas_compatible): expected, actual, exact=False - if (is_bool_dtype(idx1.dtype) and not is_bool_dtype(idx2.dtype)) - or (not is_bool_dtype(idx1.dtype) or is_bool_dtype(idx2.dtype)) + if (idx1.dtype.kind == "b" and idx2.dtype.kind != "b") + or (idx1.dtype.kind != "b" or idx2.dtype.kind == "b") else True, )