From 4ec6f329e13f35165c36aac5f62c08d6dba88311 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 18 Jan 2024 17:02:26 -0800 Subject: [PATCH 1/2] Refactor CategoricalIndex.__init__, get ordered and categories only from dtype --- python/cudf/cudf/core/column/categorical.py | 122 +++++++++----------- python/cudf/cudf/core/dtypes.py | 6 +- python/cudf/cudf/core/index.py | 68 +++++------ python/cudf/cudf/tests/test_categorical.py | 5 + 4 files changed, 89 insertions(+), 112 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 6b3ee0ba852..6671266048c 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -198,7 +198,7 @@ def as_ordered(self, inplace: bool = False) -> Optional[SeriesOrIndex]: FutureWarning, ) return self._return_or_inplace( - self._column.as_ordered(), inplace=inplace + self._column.as_ordered(ordered=True), inplace=inplace ) def as_unordered(self, inplace: bool = False) -> Optional[SeriesOrIndex]: @@ -280,7 +280,7 @@ def as_unordered(self, inplace: bool = False) -> Optional[SeriesOrIndex]: FutureWarning, ) return self._return_or_inplace( - self._column.as_unordered(), inplace=inplace + self._column.as_ordered(ordered=False), inplace=inplace ) def add_categories( @@ -728,8 +728,6 @@ def __init__( assert child.base_mask is None size = children[0].size size = size - offset - if isinstance(dtype, pd.api.types.CategoricalDtype): - dtype = CategoricalDtype.from_pandas(dtype) if not isinstance(dtype, CategoricalDtype): raise ValueError("dtype must be instance of CategoricalDtype") super().__init__( @@ -803,12 +801,6 @@ def children(self) -> Tuple[NumericalColumn]: def categories(self) -> ColumnBase: return self.dtype.categories._values - @categories.setter - def categories(self, value): - self._dtype = CategoricalDtype( - categories=value, ordered=self.dtype.ordered - ) - @property def codes(self) -> NumericalColumn: if self._codes is None: @@ -819,10 +811,6 @@ def codes(self) -> NumericalColumn: def ordered(self) -> bool: return self.dtype.ordered - @ordered.setter - def ordered(self, value: bool): - self.dtype.ordered = value - def unary_operator(self, unaryop: str): raise TypeError( f"Series of dtype `category` cannot perform the operation: " @@ -840,7 +828,9 @@ def __setitem__(self, key, value): else: arr = value to_add_categories = len( - cudf.Index(arr, nan_as_null=False).difference(self.categories) + cudf.Index(arr, nan_as_null=False).difference( + self.dtype.categories + ) ) if to_add_categories > 0: @@ -857,12 +847,12 @@ def __setitem__(self, key, value): codes = self.codes codes[key] = value out = cudf.core.column.build_categorical_column( - categories=self.categories, + categories=self.dtype.categories, codes=codes, mask=codes.base_mask, size=codes.size, offset=self.offset, - ordered=self.ordered, + ordered=self.dtype.ordered, ) self._mimic_inplace(out, inplace=True) @@ -898,7 +888,7 @@ def slice( codes.base_data, dtype=codes.dtype ), mask=codes.base_mask, - ordered=self.ordered, + ordered=self.dtype.ordered, size=codes.size, offset=codes.offset, ), @@ -911,7 +901,11 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: if not isinstance(other, CategoricalColumn): raise ValueError # Note: at this stage we are guaranteed that the dtypes are equal. - if not self.ordered and op not in {"__eq__", "__ne__", "NULL_EQUALS"}: + if not self.dtype.ordered and op not in { + "__eq__", + "__ne__", + "NULL_EQUALS", + }: raise TypeError( "The only binary operations supported by unordered " "categorical columns are equality and inequality." @@ -969,7 +963,7 @@ def to_pandas( if nullable: raise NotImplementedError(f"{nullable=} is not implemented.") - if self.categories.dtype.kind == "f": + if self.dtype.categories.dtype.kind == "f": new_mask = bools_to_mask(self.notnull()) col = column.build_categorical_column( categories=self.categories, @@ -1009,7 +1003,7 @@ def to_arrow(self) -> pa.Array: else np.int8 ) codes = self.codes.astype(signed_type) - categories = self.categories + categories = self.dtype.categories out_indices = codes.to_arrow() out_dictionary = categories.to_arrow() @@ -1017,7 +1011,7 @@ def to_arrow(self) -> pa.Array: return pa.DictionaryArray.from_arrays( out_indices, out_dictionary, - ordered=self.ordered, + ordered=self.dtype.ordered, ) @property @@ -1036,7 +1030,9 @@ def values(self): def clip(self, lo: ScalarLike, hi: ScalarLike) -> "column.ColumnBase": return ( - self.astype(self.categories.dtype).clip(lo, hi).astype(self.dtype) + self.astype(self.dtype.categories.dtype) + .clip(lo, hi) + .astype(self.dtype) ) def data_array_view( @@ -1052,7 +1048,7 @@ def unique(self) -> CategoricalColumn: mask=codes.base_mask, offset=codes.offset, size=codes.size, - ordered=self.ordered, + ordered=self.dtype.ordered, ) def _encode(self, value) -> ScalarLike: @@ -1074,10 +1070,12 @@ def find_and_replace( """ to_replace_col = column.as_column(to_replace) if len(to_replace_col) == to_replace_col.null_count: - to_replace_col = to_replace_col.astype(self.categories.dtype) + to_replace_col = to_replace_col.astype(self.dtype.categories.dtype) replacement_col = column.as_column(replacement) if len(replacement_col) == replacement_col.null_count: - replacement_col = replacement_col.astype(self.categories.dtype) + replacement_col = replacement_col.astype( + self.dtype.categories.dtype + ) if type(to_replace_col) != type(replacement_col): raise TypeError( @@ -1096,15 +1094,15 @@ def find_and_replace( .element_indexing(0) ) # TODO: This line of code does not work because we cannot use the - # `in` operator on self.categories (which is a column). mypy + # `in` operator on self.dtype.categories (which is a column). mypy # realizes that this is wrong because __iter__ is not implemented. # However, it seems that this functionality has been broken for a # long time so for now we're just having mypy ignore and we'll come # back to this. - if fill_value in self.categories: # type: ignore + if fill_value in self.dtype.categories: # type: ignore replaced = self.fillna(fill_value) else: - new_categories = self.categories.append( + new_categories = self.dtype.categories.append( column.as_column([fill_value]) ) replaced = self._set_categories(new_categories) @@ -1199,10 +1197,10 @@ def isnull(self) -> ColumnBase: """ result = libcudf.unary.is_null(self) - if self.categories.dtype.kind == "f": + if self.dtype.categories.dtype.kind == "f": # Need to consider `np.nan` values in case # of an underlying float column - categories = libcudf.unary.is_nan(self.categories) + categories = libcudf.unary.is_nan(self.dtype.categories) if categories.any(): code = self._encode(np.nan) result = result | (self.codes == cudf.Scalar(code)) @@ -1215,10 +1213,10 @@ def notnull(self) -> ColumnBase: """ result = libcudf.unary.is_valid(self) - if self.categories.dtype.kind == "f": + if self.dtype.categories.dtype.kind == "f": # Need to consider `np.nan` values in case # of an underlying float column - categories = libcudf.unary.is_nan(self.categories) + categories = libcudf.unary.is_nan(self.dtype.categories) if categories.any(): code = self._encode(np.nan) result = result & (self.codes != cudf.Scalar(code)) @@ -1260,7 +1258,7 @@ def fillna( # TODO: only required if fill_value has a subset of the # categories: fill_value = fill_value._set_categories( - self.categories, + self.dtype.categories, is_unique=True, ) fill_value = column.as_column(fill_value.codes).astype( @@ -1276,11 +1274,11 @@ def indices_of( @property def is_monotonic_increasing(self) -> bool: - return bool(self.ordered) and self.codes.is_monotonic_increasing + return bool(self.dtype.ordered) and self.codes.is_monotonic_increasing @property def is_monotonic_decreasing(self) -> bool: - return bool(self.ordered) and self.codes.is_monotonic_decreasing + return bool(self.dtype.ordered) and self.codes.is_monotonic_decreasing def as_categorical_column(self, dtype: Dtype) -> CategoricalColumn: if isinstance(dtype, str) and dtype == "category": @@ -1302,7 +1300,9 @@ def as_categorical_column(self, dtype: Dtype) -> CategoricalColumn: if not isinstance(dtype, CategoricalDtype): raise ValueError("dtype must be CategoricalDtype") - if not isinstance(self.categories, type(dtype.categories._values)): + if not isinstance( + self.dtype.categories, type(dtype.categories._values) + ): # If both categories are of different Column types, # return a column full of Nulls. return _create_empty_categorical_column(self, dtype) @@ -1337,24 +1337,26 @@ def as_timedelta_column( def _get_decategorized_column(self) -> ColumnBase: if self.null_count == len(self): - # self.categories is empty; just return codes + # self.dtype.categories is empty; just return codes return self.codes gather_map = self.codes.astype(libcudf.types.size_type_dtype).fillna(0) - out = self.categories.take(gather_map) + out = self.dtype.categories.take(gather_map) out = out.set_mask(self.mask) return out def copy(self, deep: bool = True) -> Self: result_col = super().copy(deep=deep) if deep: - result_col.categories = libcudf.copying.copy_column( - self.dtype._categories + dtype_copy = CategoricalDtype( + categories=self.dtype.categories._values.copy(), + ordered=self.dtype.ordered, ) + result_col = cast(Self, result_col._with_type_metadata(dtype_copy)) return result_col @cached_property def memory_usage(self) -> int: - return self.categories.memory_usage + self.codes.memory_usage + return self.dtype.categories.memory_usage + self.codes.memory_usage def _mimic_inplace( self, other_col: ColumnBase, inplace: bool = False @@ -1436,7 +1438,7 @@ def set_categories( ) -> CategoricalColumn: # See CategoricalAccessor.set_categories. - ordered = ordered if ordered is not None else self.ordered + ordered = ordered if ordered is not None else self.dtype.ordered new_categories = column.as_column(new_categories) if isinstance(new_categories, CategoricalColumn): @@ -1447,7 +1449,7 @@ def set_categories( # categories. if rename: # enforce same length - if len(new_categories) != len(self.categories): + if len(new_categories) != len(self.dtype.categories): raise ValueError( "new_categories must have the same " "number of items as old categories" @@ -1474,7 +1476,7 @@ def set_categories( ) elif ( not out_col._categories_equal(new_categories, ordered=True) - or not self.ordered == ordered + or not self.dtype.ordered == ordered ): out_col = out_col._set_categories( new_categories, @@ -1485,7 +1487,7 @@ def set_categories( def _categories_equal( self, new_categories: ColumnBase, ordered=False ) -> bool: - cur_categories = self.categories + cur_categories = self.dtype.categories if len(new_categories) != len(cur_categories): return False if new_categories.dtype != cur_categories.dtype: @@ -1514,7 +1516,7 @@ def _set_categories( Assumes ``new_categories`` is the same dtype as the current categories """ - cur_cats = column.as_column(self.categories) + cur_cats = column.as_column(self.dtype.categories) new_cats = column.as_column(new_categories) # Join the old and new categories to build a map from @@ -1556,7 +1558,7 @@ def _set_categories( df = df.sort_values(by="order") df.reset_index(drop=True, inplace=True) - ordered = ordered if ordered is not None else self.ordered + ordered = ordered if ordered is not None else self.dtype.ordered new_codes = df._data["new_codes"] # codes can't have masks, so take mask out before moving in @@ -1588,31 +1590,17 @@ def reorder_categories( ) return self._set_categories(new_categories, ordered=ordered) - def as_ordered(self): - out_col = self - if not out_col.ordered: - out_col = column.build_categorical_column( - categories=self.categories, - codes=self.codes, - mask=self.base_mask, - size=self.base_size, - offset=self.offset, - ordered=True, - ) - return out_col - - def as_unordered(self): - out_col = self - if out_col.ordered: - out_col = column.build_categorical_column( + def as_ordered(self, ordered: bool): + if self.dtype.ordered != ordered: + return column.build_categorical_column( categories=self.categories, codes=self.codes, mask=self.base_mask, size=self.base_size, offset=self.offset, - ordered=False, + ordered=ordered, ) - return out_col + return self def _create_empty_categorical_column( diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 8ea659daea0..b62475c7df5 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import decimal import operator @@ -212,10 +212,6 @@ def ordered(self) -> bool: """ return self._ordered - @ordered.setter - def ordered(self, value) -> None: - self._ordered = value - @classmethod def from_pandas(cls, dtype: pd.CategoricalDtype) -> "CategoricalDtype": """ diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 3e8f6bc2ccb..bef9eed1185 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -3003,50 +3003,38 @@ def __init__( copy=False, name=None, ): - if isinstance(dtype, (pd.CategoricalDtype, cudf.CategoricalDtype)): - if categories is not None or ordered is not None: + name = _setdefault_name(data, name=name)["name"] + if dtype is not None: + dtype = cudf.dtype(dtype) + if not isinstance(dtype, cudf.CategoricalDtype): + raise ValueError("dtype must be a CategoricalDtype") + elif categories is not None or ordered is not None: raise ValueError( "Cannot specify `categories` or " "`ordered` together with `dtype`." ) + col = column.as_column(data) if copy: - data = column.as_column(data, dtype=dtype).copy(deep=True) - kwargs = _setdefault_name(data, name=name) - if isinstance(data, CategoricalColumn): - data = data - elif isinstance(data, pd.Series) and ( - isinstance(data.dtype, pd.CategoricalDtype) - ): - codes_data = column.as_column(data.cat.codes.values) - data = column.build_categorical_column( - categories=data.cat.categories, - codes=codes_data, - ordered=data.cat.ordered, - ) - elif isinstance(data, (pd.Categorical, pd.CategoricalIndex)): - codes_data = column.as_column(data.codes) - data = column.build_categorical_column( - categories=data.categories, - codes=codes_data, - ordered=data.ordered, - ) + col = col.copy() + if not isinstance(col, CategoricalColumn): + if dtype is None: + dtype = cudf.CategoricalDtype( + categories=categories, ordered=ordered + ) + col = col.astype(dtype) else: - data = column.as_column( - data, dtype="category" if dtype is None else dtype - ) - # dtype has already been taken care - dtype = None - - if categories is not None: - data = data.set_categories(categories, ordered=ordered) - elif isinstance(dtype, (pd.CategoricalDtype, cudf.CategoricalDtype)): - data = data.set_categories(dtype.categories, ordered=ordered) - elif ordered is True and data.ordered is False: - data = data.as_ordered() - elif ordered is False and data.ordered is True: - data = data.as_unordered() - - super().__init__(data, **kwargs) + if dtype is not None: + col = col.set_categories( + dtype.categories, ordered=dtype.ordered + ) + else: + if categories is not None: + col = col.set_categories( + categories, ordered=col.dtype.ordered + ) + if ordered is not None: + col = col.as_ordered(ordered) + super().__init__(data, name=name) @property # type: ignore @_cudf_nvtx_annotate @@ -3054,7 +3042,7 @@ def codes(self): """ The category codes of this categorical. """ - return as_index(self._values.codes) + return self.dtype.codes @property # type: ignore @_cudf_nvtx_annotate @@ -3062,7 +3050,7 @@ def categories(self): """ The categories of this categorical. """ - return as_index(self._values.categories) + return self.dtype.categories def _is_boolean(self): return False diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 52b7236b965..55cb45a2cb4 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -955,3 +955,8 @@ def test_empty_series_category_cast(ordered): assert_eq(expected, actual) assert_eq(expected.dtype.ordered, actual.dtype.ordered) + + +def test_categoricaldtype_ordered_not_settable(): + with pytest.raises(AttributeError): + cudf.CategoricalDtype().ordered = False From df59afa76376de32e1888fe52a77c02611323b59 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 18 Jan 2024 17:18:58 -0800 Subject: [PATCH 2/2] Revet codes changes --- python/cudf/cudf/core/index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index bef9eed1185..c481f9e88db 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -3042,7 +3042,7 @@ def codes(self): """ The category codes of this categorical. """ - return self.dtype.codes + return as_index(self._values.codes) @property # type: ignore @_cudf_nvtx_annotate