From 9edb40591a996479d40e29166a5a63ba2518e774 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 11 Jul 2024 15:10:51 -0700 Subject: [PATCH 1/4] Replace is_float/integer_dtype checks with .kind checks --- python/cudf/cudf/core/_base_index.py | 20 +++----------- python/cudf/cudf/core/column/column.py | 29 ++++++++++---------- python/cudf/cudf/core/column/decimal.py | 4 +-- python/cudf/cudf/core/column/numerical.py | 14 +++------- python/cudf/cudf/core/index.py | 13 +++++---- python/cudf/cudf/core/indexing_utils.py | 3 +- python/cudf/cudf/core/series.py | 7 ++--- python/cudf/cudf/core/single_column_frame.py | 3 +- python/cudf/cudf/tests/test_dataframe.py | 2 +- python/cudf/cudf/utils/dtypes.py | 23 ++++++---------- 10 files changed, 45 insertions(+), 73 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index e160fa697ee..a3b11598e9a 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -19,15 +19,7 @@ ) from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default -from cudf.api.types import ( - is_bool_dtype, - is_integer, - is_integer_dtype, - is_list_like, - is_scalar, - is_signed_integer_dtype, - is_unsigned_integer_dtype, -) +from cudf.api.types import is_bool_dtype, is_integer, is_list_like, is_scalar from cudf.core.abc import Serializable from cudf.core.column import ColumnBase, column from cudf.errors import MixedTypeError @@ -616,12 +608,8 @@ def union(self, other, sort=None): # Bools + other types will result in mixed type. # This is not yet consistent in pandas and specific to APIs. raise MixedTypeError("Cannot perform union with mixed types") - if ( - is_signed_integer_dtype(self.dtype) - and is_unsigned_integer_dtype(other.dtype) - ) or ( - is_unsigned_integer_dtype(self.dtype) - and is_signed_integer_dtype(other.dtype) + if (self.dtype.kind == "i" and other.dtype.kind == "u") or ( + self.dtype.kind == "u" and other.dtype.kind == "i" ): # signed + unsigned types will result in # mixed type for union in pandas. @@ -2098,7 +2086,7 @@ def _gather(self, gather_map, nullify=False, check_bounds=True): # TODO: For performance, the check and conversion of gather map should # be done by the caller. This check will be removed in future release. - if not is_integer_dtype(gather_map.dtype): + if gather_map.dtype.kind not in "iu": gather_map = gather_map.astype(size_type_dtype) if not _gather_map_is_valid( diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index f633d527681..6046eecbd3c 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2186,25 +2186,26 @@ def as_column( and arbitrary.null_count > 0 ): arbitrary = arbitrary.cast(pa.float64()) - if cudf.get_option( - "default_integer_bitwidth" - ) and pa.types.is_integer(arbitrary.type): - dtype = _maybe_convert_to_default_type("int") - elif cudf.get_option( - "default_float_bitwidth" - ) and pa.types.is_floating(arbitrary.type): - dtype = _maybe_convert_to_default_type("float") + if ( + cudf.get_option("default_integer_bitwidth") + and pa.types.is_integer(arbitrary.type) + ) or ( + cudf.get_option("default_float_bitwidth") + and pa.types.is_floating(arbitrary.type) + ): + dtype = _maybe_convert_to_default_type( + arbitrary.type.to_pandas_dtype() + ) except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError): arbitrary = pd.Series(arbitrary) - if cudf.get_option( - "default_integer_bitwidth" - ) and arbitrary.dtype.kind in set("iu"): - dtype = _maybe_convert_to_default_type("int") - elif ( + if ( + cudf.get_option("default_integer_bitwidth") + and arbitrary.dtype.kind in set("iu") + ) or ( cudf.get_option("default_float_bitwidth") and arbitrary.dtype.kind == "f" ): - dtype = _maybe_convert_to_default_type("float") + dtype = _maybe_convert_to_default_type(arbitrary.dtype) return as_column(arbitrary, nan_as_null=nan_as_null, dtype=dtype) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index a63055ed527..6a7f338b065 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -15,7 +15,7 @@ from cudf._lib.strings.convert.convert_fixed_point import ( from_decimal as cpp_from_decimal, ) -from cudf.api.types import is_integer_dtype, is_scalar +from cudf.api.types import is_scalar from cudf.core.buffer import as_buffer from cudf.core.column import ColumnBase from cudf.core.dtypes import ( @@ -150,7 +150,7 @@ def _validate_fillna_value( def normalize_binop_value(self, other): if isinstance(other, ColumnBase): if isinstance(other, cudf.core.column.NumericalColumn): - if not is_integer_dtype(other.dtype): + if other.dtype.kind not in "iu": raise TypeError( "Decimal columns only support binary operations with " "integer numerical columns." diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index a0550bff72b..bbb08b07a00 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -14,13 +14,7 @@ from cudf import _lib as libcudf from cudf._lib import pylibcudf from cudf._lib.types import size_type_dtype -from cudf.api.types import ( - is_bool_dtype, - is_float_dtype, - is_integer, - is_integer_dtype, - is_scalar, -) +from cudf.api.types import is_bool_dtype, is_integer, is_scalar from cudf.core.column import ( ColumnBase, as_column, @@ -264,7 +258,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: out_dtype = "bool" if op in {"__and__", "__or__", "__xor__"}: - if is_float_dtype(self.dtype) or is_float_dtype(other.dtype): + if self.dtype.kind == "f" or other.dtype.kind == "f": raise TypeError( f"Operation 'bitwise {op[2:-2]}' not supported between " f"{self.dtype.type.__name__} and " @@ -275,8 +269,8 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: if ( op == "__pow__" - and is_integer_dtype(self.dtype) - and (is_integer(other) or is_integer_dtype(other.dtype)) + and self.dtype.kind in "iu" + and (is_integer(other) or other.dtype.kind in "iu") ): op = "INT_POW" diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index b398ee2343e..1b937af975f 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1458,18 +1458,19 @@ def notna(self): notnull = notna def _is_numeric(self): - return isinstance( - self._values, cudf.core.column.NumericalColumn - ) and self.dtype != cudf.dtype("bool") + return ( + isinstance(self._values, cudf.core.column.NumericalColumn) + and self.dtype.kind != "b" + ) def _is_boolean(self): - return self.dtype == cudf.dtype("bool") + return self.dtype.kind == "b" def _is_integer(self): - return cudf.api.types.is_integer_dtype(self.dtype) + return self.dtype.kind in "iu" def _is_floating(self): - return cudf.api.types.is_float_dtype(self.dtype) + return self.dtype.kind == "f" def _is_object(self): return isinstance(self._values, cudf.core.column.StringColumn) diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index a5fed02cbed..cfaa08d5f4f 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -12,7 +12,6 @@ _is_scalar_or_zero_d_array, is_bool_dtype, is_integer, - is_integer_dtype, ) from cudf.core.copy_types import BooleanMask, GatherMap @@ -234,7 +233,7 @@ def parse_row_iloc_indexer(key: Any, n: int) -> IndexingSpec: return MaskIndexer(BooleanMask(key, n)) elif len(key) == 0: return EmptyIndexer() - elif is_integer_dtype(key.dtype): + elif key.dtype.kind in "iu": return MapIndexer(GatherMap(key, n, nullify=False)) else: raise TypeError( diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 8c8fa75918c..20c791e6202 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -25,7 +25,6 @@ is_bool_dtype, is_dict_like, is_integer, - is_integer_dtype, is_scalar, ) from cudf.core import indexing_utils @@ -357,12 +356,10 @@ def _loc_to_iloc(self, arg): ) if not _is_non_decimal_numeric_dtype(index_dtype) and not ( isinstance(index_dtype, cudf.CategoricalDtype) - and is_integer_dtype(index_dtype.categories.dtype) + and index_dtype.categories.dtype.kind in "iu" ): # TODO: switch to cudf.utils.dtypes.is_integer(arg) - if isinstance(arg, cudf.Scalar) and is_integer_dtype( - arg.dtype - ): + if isinstance(arg, cudf.Scalar) and arg.dtype.kind in "iu": # Do not remove until pandas 3.0 support is added. assert ( PANDAS_LT_300 diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index f9555aee6a2..9ff8c2fd980 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -13,7 +13,6 @@ _is_scalar_or_zero_d_array, is_bool_dtype, is_integer, - is_integer_dtype, is_numeric_dtype, ) from cudf.core.column import ColumnBase, as_column @@ -359,7 +358,7 @@ def _get_elements_from_column(self, arg) -> ScalarLike | ColumnBase: arg = as_column(arg) if len(arg) == 0: arg = cudf.core.column.column_empty(0, dtype="int32") - if is_integer_dtype(arg.dtype): + if arg.dtype.kind in "iu": return self._column.take(arg) if is_bool_dtype(arg.dtype): if (bn := len(arg)) != (n := len(self)): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index f40106a30f4..d88816260bd 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10835,7 +10835,7 @@ def test_dataframe_contains(name, contains, other_names): expectation = contains is cudf.NA and name is cudf.NA assert (contains in pdf) == expectation assert (contains in gdf) == expectation - elif pd.api.types.is_float_dtype(gdf.columns.dtype): + elif gdf.columns.dtype.kind == "f": # In some cases, the columns are converted to an Index[float] based on # the other column names. That casts name values from None to np.nan. expectation = contains is np.nan and (name is None or name is np.nan) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 2aa3129ab30..e8a8b377e35 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -653,25 +653,18 @@ def _can_cast(from_dtype, to_dtype): return np.can_cast(from_dtype, to_dtype) -def _maybe_convert_to_default_type(dtype): +def _maybe_convert_to_default_type(dtype: DtypeObj) -> DtypeObj: """Convert `dtype` to default if specified by user. If not specified, return as is. """ - if cudf.get_option("default_integer_bitwidth"): - if cudf.api.types.is_signed_integer_dtype(dtype): - return cudf.dtype( - f'i{cudf.get_option("default_integer_bitwidth")//8}' - ) - elif cudf.api.types.is_unsigned_integer_dtype(dtype): - return cudf.dtype( - f'u{cudf.get_option("default_integer_bitwidth")//8}' - ) - if cudf.get_option( - "default_float_bitwidth" - ) and cudf.api.types.is_float_dtype(dtype): - return cudf.dtype(f'f{cudf.get_option("default_float_bitwidth")//8}') - + if ib := cudf.get_option("default_integer_bitwidth"): + if dtype.kind == "i": + return cudf.dtype(f"i{ib//8}") + elif dtype.kind == "u": + return cudf.dtype(f"u{ib//8}") + if fb := cudf.get_option("default_float_bitwidth") and dtype.kind == "f": + return cudf.dtype(f"f{fb//8}") return dtype From a1ca0a9a3b786df8550c956bbf108d95185d68b0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 11 Jul 2024 15:36:08 -0700 Subject: [PATCH 2/4] Another simplification --- python/cudf/cudf/api/types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index d97e9c815b6..294ae2fd985 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -90,7 +90,7 @@ def is_integer(obj): bool """ if isinstance(obj, cudf.Scalar): - return pd.api.types.is_integer_dtype(obj.dtype) + return obj.dtype.kind in "iu" return pd.api.types.is_integer(obj) From b7f0b40cbcdb225c2a35d0860d10e7964c4885a1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 12 Jul 2024 10:51:14 -0700 Subject: [PATCH 3/4] Address test failures --- python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/utils/dtypes.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 6046eecbd3c..ec3e2f7349e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2194,7 +2194,7 @@ def as_column( and pa.types.is_floating(arbitrary.type) ): dtype = _maybe_convert_to_default_type( - arbitrary.type.to_pandas_dtype() + cudf.dtype(arbitrary.type.to_pandas_dtype()) ) except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError): arbitrary = pd.Series(arbitrary) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index e8a8b377e35..d4e7a4e40c6 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -663,7 +663,7 @@ def _maybe_convert_to_default_type(dtype: DtypeObj) -> DtypeObj: return cudf.dtype(f"i{ib//8}") elif dtype.kind == "u": return cudf.dtype(f"u{ib//8}") - if fb := cudf.get_option("default_float_bitwidth") and dtype.kind == "f": + if (fb := cudf.get_option("default_float_bitwidth")) and dtype.kind == "f": return cudf.dtype(f"f{fb//8}") return dtype From 78de5fd4ebe352633af5434cc6cb29907b5112a7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 19 Jul 2024 12:07:40 -0700 Subject: [PATCH 4/4] put typing import in block --- python/cudf/cudf/utils/dtypes.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 573ba789b1c..bc0e2ebb5d7 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -1,7 +1,9 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations import datetime from decimal import Decimal +from typing import TYPE_CHECKING import cupy as cp import numpy as np @@ -10,7 +12,9 @@ from pandas.core.dtypes.common import infer_dtype_from_object import cudf -from cudf._typing import DtypeObj + +if TYPE_CHECKING: + from cudf._typing import DtypeObj """Map numpy dtype to pyarrow types. Note that np.bool_ bitwidth (8) is different from pa.bool_ (1). Special