From cbcfa678392cec07c33f57090a793c3631d19c7e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 15 Dec 2023 11:06:19 -1000 Subject: [PATCH] Use isinstance over is_foo_dtype internally (#14638) Internally IMO we should prefer `isinstance` over `is_*_dtype` when we know the input is a dtype instance in order to be stricter/faster that the `is_*_dtype` checks Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/14638 --- python/cudf/cudf/core/column/column.py | 43 +++++++----------------- python/cudf/cudf/core/column/interval.py | 5 ++- python/cudf/cudf/core/column/lists.py | 12 +++---- python/cudf/cudf/core/column/string.py | 9 ++--- python/cudf/cudf/core/column/struct.py | 3 +- python/cudf/cudf/core/dataframe.py | 6 ++-- python/cudf/cudf/core/dtypes.py | 10 ++++-- python/cudf/cudf/core/indexed_frame.py | 3 +- python/cudf/cudf/core/series.py | 15 ++++----- python/cudf/cudf/core/tools/numeric.py | 4 +-- python/cudf/cudf/testing/testing.py | 16 +++++---- python/cudf/cudf/utils/dtypes.py | 21 +++++++----- 12 files changed, 62 insertions(+), 85 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 7eb721336cc..064e4243cf9 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -55,9 +55,6 @@ is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, - is_decimal32_dtype, - is_decimal64_dtype, - is_decimal128_dtype, is_decimal_dtype, is_dtype_equal, is_integer_dtype, @@ -65,7 +62,6 @@ is_list_dtype, is_scalar, is_string_dtype, - is_struct_dtype, ) from cudf.core._compat import PANDAS_GE_150 from cudf.core.abc import Serializable @@ -1023,21 +1019,15 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: "`.astype('str')` instead." ) return self.as_string_column(dtype, **kwargs) - elif is_list_dtype(dtype): + elif isinstance(dtype, (ListDtype, StructDtype)): if not self.dtype == dtype: raise NotImplementedError( - "Casting list columns not currently supported" + f"Casting {self.dtype} columns not currently supported" ) return self - elif is_struct_dtype(dtype): - if not self.dtype == dtype: - raise NotImplementedError( - "Casting struct columns not currently supported" - ) - return self - elif is_interval_dtype(self.dtype): + elif isinstance(dtype, IntervalDtype): return self.as_interval_column(dtype, **kwargs) - elif is_decimal_dtype(dtype): + elif isinstance(dtype, cudf.core.dtypes.DecimalDtype): return self.as_decimal_column(dtype, **kwargs) elif np.issubdtype(cast(Any, dtype), np.datetime64): return self.as_datetime_column(dtype, **kwargs) @@ -1578,7 +1568,7 @@ def build_column( ) return col - if is_categorical_dtype(dtype): + if isinstance(dtype, CategoricalDtype): if not len(children) == 1: raise ValueError( "Must specify exactly one child column for CategoricalColumn" @@ -1604,7 +1594,7 @@ def build_column( offset=offset, null_count=null_count, ) - elif is_datetime64tz_dtype(dtype): + elif isinstance(dtype, pd.DatetimeTZDtype): if data is None: raise TypeError("Must specify data buffer") return cudf.core.column.datetime.DatetimeTZColumn( @@ -1634,7 +1624,7 @@ def build_column( children=children, null_count=null_count, ) - elif is_list_dtype(dtype): + elif isinstance(dtype, ListDtype): return cudf.core.column.ListColumn( size=size, dtype=dtype, @@ -1643,7 +1633,7 @@ def build_column( null_count=null_count, children=children, ) - elif is_interval_dtype(dtype): + elif isinstance(dtype, IntervalDtype): return cudf.core.column.IntervalColumn( dtype=dtype, mask=mask, @@ -1652,7 +1642,7 @@ def build_column( children=children, null_count=null_count, ) - elif is_struct_dtype(dtype): + elif isinstance(dtype, StructDtype): if size is None: raise TypeError("Must specify size") return cudf.core.column.StructColumn( @@ -1664,7 +1654,7 @@ def build_column( null_count=null_count, children=children, ) - elif is_decimal64_dtype(dtype): + elif isinstance(dtype, cudf.Decimal64Dtype): if size is None: raise TypeError("Must specify size") return cudf.core.column.Decimal64Column( @@ -1676,7 +1666,7 @@ def build_column( null_count=null_count, children=children, ) - elif is_decimal32_dtype(dtype): + elif isinstance(dtype, cudf.Decimal32Dtype): if size is None: raise TypeError("Must specify size") return cudf.core.column.Decimal32Column( @@ -1688,7 +1678,7 @@ def build_column( null_count=null_count, children=children, ) - elif is_decimal128_dtype(dtype): + elif isinstance(dtype, cudf.Decimal128Dtype): if size is None: raise TypeError("Must specify size") return cudf.core.column.Decimal128Column( @@ -1700,15 +1690,6 @@ def build_column( null_count=null_count, children=children, ) - elif is_interval_dtype(dtype): - return cudf.core.column.IntervalColumn( - dtype=dtype, - mask=mask, - size=size, - offset=offset, - null_count=null_count, - children=children, - ) else: raise TypeError(f"Unrecognized dtype: {dtype}") diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index 01e3cc159b9..eed7bba3628 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -5,7 +5,6 @@ import pyarrow as pa import cudf -from cudf.api.types import is_interval_dtype from cudf.core.column import StructColumn from cudf.core.dtypes import CategoricalDtype, IntervalDtype @@ -101,11 +100,11 @@ def copy(self, deep=True): ) def as_interval_column(self, dtype, **kwargs): - if is_interval_dtype(dtype): + if isinstance(dtype, IntervalDtype): if isinstance(self.dtype, CategoricalDtype): new_struct = self._get_decategorized_column() return IntervalColumn.from_struct_column(new_struct) - if is_interval_dtype(dtype): + else: # a user can directly input the string `interval` as the dtype # when creating an interval series or interval dataframe if dtype == "interval": diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 0bb9f70f851..a5653e66513 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -24,11 +24,7 @@ from cudf._lib.strings.convert.convert_lists import format_list_column from cudf._lib.types import size_type_dtype from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike -from cudf.api.types import ( - _is_non_decimal_numeric_dtype, - is_list_dtype, - is_scalar, -) +from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar from cudf.core.column import ColumnBase, as_column, column from cudf.core.column.methods import ColumnMethods, ParentType from cudf.core.dtypes import ListDtype @@ -298,7 +294,7 @@ class ListMethods(ColumnMethods): _column: ListColumn def __init__(self, parent: ParentType): - if not is_list_dtype(parent.dtype): + if not isinstance(parent.dtype, ListDtype): raise AttributeError( "Can only use .list accessor with a 'list' dtype" ) @@ -589,7 +585,7 @@ def unique(self) -> ParentType: dtype: list """ - if is_list_dtype(self._column.children[1].dtype): + if isinstance(self._column.children[1].dtype, ListDtype): raise NotImplementedError("Nested lists unique is not supported.") return self._return_or_inplace( @@ -642,7 +638,7 @@ def sort_values( raise NotImplementedError("`kind` not currently implemented.") if na_position not in {"first", "last"}: raise ValueError(f"Unknown `na_position` value {na_position}") - if is_list_dtype(self._column.children[1].dtype): + if isinstance(self._column.children[1].dtype, ListDtype): raise NotImplementedError("Nested lists sort is not supported.") return self._return_or_inplace( diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 0b3e87eb40a..7bf81f3e2d3 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -28,12 +28,7 @@ from cudf._lib import string_casting as str_cast, strings as libstrings from cudf._lib.column import Column from cudf._lib.types import size_type_dtype -from cudf.api.types import ( - is_integer, - is_list_dtype, - is_scalar, - is_string_dtype, -) +from cudf.api.types import is_integer, is_scalar, is_string_dtype from cudf.core.buffer import Buffer from cudf.core.column import column, datetime from cudf.core.column.column import ColumnBase @@ -126,7 +121,7 @@ class StringMethods(ColumnMethods): def __init__(self, parent): value_type = ( parent.dtype.leaf_type - if is_list_dtype(parent.dtype) + if isinstance(parent.dtype, cudf.ListDtype) else parent.dtype ) if not is_string_dtype(value_type): diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 35fb18cc5c7..6cfa8db0d96 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -9,7 +9,6 @@ import cudf from cudf._typing import Dtype -from cudf.api.types import is_struct_dtype from cudf.core.column import ColumnBase, build_struct_column from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import StructDtype @@ -158,7 +157,7 @@ class StructMethods(ColumnMethods): _column: StructColumn def __init__(self, parent=None): - if not is_struct_dtype(parent.dtype): + if not isinstance(parent.dtype, StructDtype): raise AttributeError( "Can only use .struct accessor with a 'struct' dtype" ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 151269ea180..01935fec8c3 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -50,13 +50,11 @@ is_datetime_dtype, is_dict_like, is_dtype_equal, - is_list_dtype, is_list_like, is_numeric_dtype, is_object_dtype, is_scalar, is_string_dtype, - is_struct_dtype, ) from cudf.core import column, df_protocol, indexing_utils, reshape from cudf.core.abc import Serializable @@ -1825,7 +1823,9 @@ def _clean_nulls_from_dataframe(self, df): filling with `` values. """ for col in df._data: - if is_list_dtype(df._data[col]) or is_struct_dtype(df._data[col]): + if isinstance( + df._data[col].dtype, (cudf.StructDtype, cudf.ListDtype) + ): # TODO we need to handle this pass elif df._data[col].has_nulls(): diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 07231bd1f2c..8ea659daea0 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -1050,7 +1050,10 @@ def is_list_dtype(obj): or type(obj) is cudf.core.column.ListColumn or obj is cudf.core.column.ListColumn or (isinstance(obj, str) and obj == cudf.core.dtypes.ListDtype.name) - or (hasattr(obj, "dtype") and is_list_dtype(obj.dtype)) + or ( + hasattr(obj, "dtype") + and isinstance(obj.dtype, cudf.core.dtypes.ListDtype) + ) ) @@ -1076,7 +1079,10 @@ def is_struct_dtype(obj): isinstance(obj, cudf.core.dtypes.StructDtype) or obj is cudf.core.dtypes.StructDtype or (isinstance(obj, str) and obj == cudf.core.dtypes.StructDtype.name) - or (hasattr(obj, "dtype") and is_struct_dtype(obj.dtype)) + or ( + hasattr(obj, "dtype") + and isinstance(obj.dtype, cudf.core.dtypes.StructDtype) + ) ) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index a2f3db681ec..ab089ceb103 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -43,7 +43,6 @@ is_bool_dtype, is_decimal_dtype, is_dict_like, - is_list_dtype, is_list_like, is_scalar, ) @@ -4101,7 +4100,7 @@ def _explode(self, explode_column: Any, ignore_index: bool): # specified nested column. Other columns' corresponding rows are # duplicated. If ignore_index is set, the original index is not # exploded and will be replaced with a `RangeIndex`. - if not is_list_dtype(self._data[explode_column].dtype): + if not isinstance(self._data[explode_column].dtype, ListDtype): data = self._data.copy(deep=True) idx = None if ignore_index else self._index.copy(deep=True) return self.__class__._from_data(data, index=idx) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index bb329619250..3562c83e797 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -40,15 +40,12 @@ _is_non_decimal_numeric_dtype, _is_scalar_or_zero_d_array, is_bool_dtype, - is_decimal_dtype, is_dict_like, is_float_dtype, is_integer, is_integer_dtype, - is_list_dtype, is_scalar, is_string_dtype, - is_struct_dtype, ) from cudf.core import indexing_utils from cudf.core.abc import Serializable @@ -1502,12 +1499,14 @@ def __repr__(self): if ( preprocess.nullable and not isinstance( - preprocess._column, cudf.core.column.CategoricalColumn + preprocess.dtype, + ( + cudf.CategoricalDtype, + cudf.ListDtype, + cudf.StructDtype, + cudf.core.dtypes.DecimalDtype, + ), ) - and not is_list_dtype(preprocess.dtype) - and not is_struct_dtype(preprocess.dtype) - and not is_decimal_dtype(preprocess.dtype) - and not is_struct_dtype(preprocess.dtype) ) or isinstance( preprocess._column, cudf.core.column.timedelta.TimeDeltaColumn, diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index f24d8572655..b1bcf4b98c5 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -11,9 +11,7 @@ from cudf.api.types import ( _is_non_decimal_numeric_dtype, is_datetime_dtype, - is_list_dtype, is_string_dtype, - is_struct_dtype, is_timedelta_dtype, ) from cudf.core.column import as_column @@ -132,7 +130,7 @@ def to_numeric(arg, errors="raise", downcast=None): return arg else: raise e - elif is_list_dtype(dtype) or is_struct_dtype(dtype): + elif isinstance(dtype, (cudf.ListDtype, cudf.StructDtype)): raise ValueError("Input does not support nested datatypes") elif _is_non_decimal_numeric_dtype(dtype): pass diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 762e646ee54..a45733a0f83 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -12,11 +12,8 @@ from cudf._lib.unary import is_nan from cudf.api.types import ( is_categorical_dtype, - is_decimal_dtype, - is_list_dtype, is_numeric_dtype, is_string_dtype, - is_struct_dtype, ) from cudf.core.missing import NA, NaT @@ -26,10 +23,15 @@ def dtype_can_compare_equal_to_other(dtype): # as equal to equal values of a different dtype return not ( is_string_dtype(dtype) - or is_list_dtype(dtype) - or is_struct_dtype(dtype) - or is_decimal_dtype(dtype) - or isinstance(dtype, cudf.IntervalDtype) + or isinstance( + dtype, + ( + cudf.IntervalDtype, + cudf.ListDtype, + cudf.StructDtype, + cudf.core.dtypes.DecimalDtype, + ), + ) ) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 73ea8e2cfc4..c65404445cb 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -202,12 +202,13 @@ def cudf_dtype_to_pa_type(dtype): """Given a cudf pandas dtype, converts it into the equivalent cuDF Python dtype. """ - if cudf.api.types.is_categorical_dtype(dtype): - raise NotImplementedError() - elif ( - cudf.api.types.is_list_dtype(dtype) - or cudf.api.types.is_struct_dtype(dtype) - or cudf.api.types.is_decimal_dtype(dtype) + if isinstance(dtype, cudf.CategoricalDtype): + raise NotImplementedError( + "No conversion from Categorical to pyarrow type" + ) + elif isinstance( + dtype, + (cudf.StructDtype, cudf.ListDtype, cudf.core.dtypes.DecimalDtype), ): return dtype.to_arrow() else: @@ -557,7 +558,9 @@ def find_common_type(dtypes): # Aggregate same types dtypes = set(dtypes) - if any(cudf.api.types.is_decimal_dtype(dtype) for dtype in dtypes): + if any( + isinstance(dtype, cudf.core.dtypes.DecimalDtype) for dtype in dtypes + ): if all( cudf.api.types.is_decimal_dtype(dtype) or cudf.api.types.is_numeric_dtype(dtype) @@ -572,7 +575,7 @@ def find_common_type(dtypes): ) else: return cudf.dtype("O") - if any(cudf.api.types.is_list_dtype(dtype) for dtype in dtypes): + if any(isinstance(dtype, cudf.ListDtype) for dtype in dtypes): if len(dtypes) == 1: return dtypes.get(0) else: @@ -585,7 +588,7 @@ def find_common_type(dtypes): "Finding a common type for `ListDtype` is currently " "not supported" ) - if any(cudf.api.types.is_struct_dtype(dtype) for dtype in dtypes): + if any(isinstance(dtype, cudf.StructDtype) for dtype in dtypes): if len(dtypes) == 1: return dtypes.get(0) else: