diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 88bb4521a5b..c236967a2f6 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -165,7 +165,7 @@ def as_ordered(self) -> Optional[SeriesOrIndex]: dtype: category Categories (3, int64): [1 < 2 < 10] """ - return self._return_or_inplace(self._column.as_ordered()) + return self._return_or_inplace(self._column.as_ordered(ordered=True)) def as_unordered(self) -> Optional[SeriesOrIndex]: """ @@ -212,8 +212,7 @@ def as_unordered(self) -> Optional[SeriesOrIndex]: dtype: category Categories (3, int64): [1, 2, 10] """ - - return self._return_or_inplace(self._column.as_unordered()) + return self._return_or_inplace(self._column.as_ordered(ordered=False)) def add_categories(self, new_categories: Any) -> Optional[SeriesOrIndex]: """ @@ -540,8 +539,6 @@ def __init__( assert child.base_mask is None size = children[0].size size = size - offset - if isinstance(dtype, pd.api.types.CategoricalDtype): - dtype = CategoricalDtype.from_pandas(dtype) if not isinstance(dtype, CategoricalDtype): raise ValueError("dtype must be instance of CategoricalDtype") super().__init__( @@ -615,12 +612,6 @@ def children(self) -> Tuple[NumericalColumn]: def categories(self) -> ColumnBase: return self.dtype.categories._values - @categories.setter - def categories(self, value): - self._dtype = CategoricalDtype( - categories=value, ordered=self.dtype.ordered - ) - @property def codes(self) -> NumericalColumn: if self._codes is None: @@ -631,10 +622,6 @@ def codes(self) -> NumericalColumn: def ordered(self) -> bool: return self.dtype.ordered - @ordered.setter - def ordered(self, value: bool): - self.dtype.ordered = value - def __setitem__(self, key, value): if cudf.api.types.is_scalar( value @@ -646,7 +633,9 @@ def __setitem__(self, key, value): else: arr = value to_add_categories = len( - cudf.Index(arr, nan_as_null=False).difference(self.categories) + cudf.Index(arr, nan_as_null=False).difference( + self.dtype.categories + ) ) if to_add_categories > 0: @@ -663,12 +652,12 @@ def __setitem__(self, key, value): codes = self.codes codes[key] = value out = cudf.core.column.build_categorical_column( - categories=self.categories, + categories=self.dtype.categories, codes=codes, mask=codes.base_mask, size=codes.size, offset=self.offset, - ordered=self.ordered, + ordered=self.dtype.ordered, ) self._mimic_inplace(out, inplace=True) @@ -704,7 +693,7 @@ def slice( codes.base_data, dtype=codes.dtype ), mask=codes.base_mask, - ordered=self.ordered, + ordered=self.dtype.ordered, size=codes.size, offset=codes.offset, ), @@ -717,7 +706,11 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: if not isinstance(other, CategoricalColumn): raise ValueError # Note: at this stage we are guaranteed that the dtypes are equal. - if not self.ordered and op not in {"__eq__", "__ne__", "NULL_EQUALS"}: + if not self.dtype.ordered and op not in { + "__eq__", + "__ne__", + "NULL_EQUALS", + }: raise TypeError( "The only binary operations supported by unordered " "categorical columns are equality and inequality." @@ -781,7 +774,7 @@ def to_pandas( elif arrow_type: raise NotImplementedError(f"{arrow_type=} is not implemented.") - if self.categories.dtype.kind == "f": + if self.dtype.categories.dtype.kind == "f": new_mask = bools_to_mask(self.notnull()) col = column.build_categorical_column( categories=self.categories, @@ -822,7 +815,7 @@ def to_arrow(self) -> pa.Array: else np.int8 ) codes = self.codes.astype(signed_type) - categories = self.categories + categories = self.dtype.categories out_indices = codes.to_arrow() out_dictionary = categories.to_arrow() @@ -830,7 +823,7 @@ def to_arrow(self) -> pa.Array: return pa.DictionaryArray.from_arrays( out_indices, out_dictionary, - ordered=self.ordered, + ordered=self.dtype.ordered, ) @property @@ -849,7 +842,9 @@ def values(self): def clip(self, lo: ScalarLike, hi: ScalarLike) -> "column.ColumnBase": return ( - self.astype(self.categories.dtype).clip(lo, hi).astype(self.dtype) + self.astype(self.dtype.categories.dtype) + .clip(lo, hi) + .astype(self.dtype) ) def data_array_view( @@ -865,7 +860,7 @@ def unique(self) -> CategoricalColumn: mask=codes.base_mask, offset=codes.offset, size=codes.size, - ordered=self.ordered, + ordered=self.dtype.ordered, ) def _encode(self, value) -> ScalarLike: @@ -887,10 +882,12 @@ def find_and_replace( """ to_replace_col = column.as_column(to_replace) if len(to_replace_col) == to_replace_col.null_count: - to_replace_col = to_replace_col.astype(self.categories.dtype) + to_replace_col = to_replace_col.astype(self.dtype.categories.dtype) replacement_col = column.as_column(replacement) if len(replacement_col) == replacement_col.null_count: - replacement_col = replacement_col.astype(self.categories.dtype) + replacement_col = replacement_col.astype( + self.dtype.categories.dtype + ) if type(to_replace_col) != type(replacement_col): raise TypeError( @@ -909,15 +906,15 @@ def find_and_replace( .element_indexing(0) ) # TODO: This line of code does not work because we cannot use the - # `in` operator on self.categories (which is a column). mypy + # `in` operator on self.dtype.categories (which is a column). mypy # realizes that this is wrong because __iter__ is not implemented. # However, it seems that this functionality has been broken for a # long time so for now we're just having mypy ignore and we'll come # back to this. - if fill_value in self.categories: # type: ignore + if fill_value in self.dtype.categories: # type: ignore replaced = self.fillna(fill_value) else: - new_categories = self.categories.append( + new_categories = self.dtype.categories.append( column.as_column([fill_value]) ) replaced = self._set_categories(new_categories) @@ -1022,10 +1019,10 @@ def isnull(self) -> ColumnBase: """ result = libcudf.unary.is_null(self) - if self.categories.dtype.kind == "f": + if self.dtype.categories.dtype.kind == "f": # Need to consider `np.nan` values in case # of an underlying float column - categories = libcudf.unary.is_nan(self.categories) + categories = libcudf.unary.is_nan(self.dtype.categories) if categories.any(): code = self._encode(np.nan) result = result | (self.codes == cudf.Scalar(code)) @@ -1038,10 +1035,10 @@ def notnull(self) -> ColumnBase: """ result = libcudf.unary.is_valid(self) - if self.categories.dtype.kind == "f": + if self.dtype.categories.dtype.kind == "f": # Need to consider `np.nan` values in case # of an underlying float column - categories = libcudf.unary.is_nan(self.categories) + categories = libcudf.unary.is_nan(self.dtype.categories) if categories.any(): code = self._encode(np.nan) result = result & (self.codes != cudf.Scalar(code)) @@ -1083,7 +1080,7 @@ def fillna( # TODO: only required if fill_value has a subset of the # categories: fill_value = fill_value._set_categories( - self.categories, + self.dtype.categories, is_unique=True, ) fill_value = column.as_column(fill_value.codes).astype( @@ -1099,11 +1096,11 @@ def indices_of( @property def is_monotonic_increasing(self) -> bool: - return bool(self.ordered) and self.codes.is_monotonic_increasing + return bool(self.dtype.ordered) and self.codes.is_monotonic_increasing @property def is_monotonic_decreasing(self) -> bool: - return bool(self.ordered) and self.codes.is_monotonic_decreasing + return bool(self.dtype.ordered) and self.codes.is_monotonic_decreasing def as_categorical_column(self, dtype: Dtype) -> CategoricalColumn: if isinstance(dtype, str) and dtype == "category": @@ -1125,7 +1122,9 @@ def as_categorical_column(self, dtype: Dtype) -> CategoricalColumn: if not isinstance(dtype, CategoricalDtype): raise ValueError("dtype must be CategoricalDtype") - if not isinstance(self.categories, type(dtype.categories._values)): + if not isinstance( + self.dtype.categories, type(dtype.categories._values) + ): # If both categories are of different Column types, # return a column full of Nulls. return _create_empty_categorical_column(self, dtype) @@ -1160,24 +1159,26 @@ def as_timedelta_column( def _get_decategorized_column(self) -> ColumnBase: if self.null_count == len(self): - # self.categories is empty; just return codes + # self.dtype.categories is empty; just return codes return self.codes gather_map = self.codes.astype(libcudf.types.size_type_dtype).fillna(0) - out = self.categories.take(gather_map) + out = self.dtype.categories.take(gather_map) out = out.set_mask(self.mask) return out def copy(self, deep: bool = True) -> Self: result_col = super().copy(deep=deep) if deep: - result_col.categories = libcudf.copying.copy_column( - self.dtype._categories + dtype_copy = CategoricalDtype( + categories=self.dtype.categories._values.copy(), + ordered=self.dtype.ordered, ) + result_col = cast(Self, result_col._with_type_metadata(dtype_copy)) return result_col @cached_property def memory_usage(self) -> int: - return self.categories.memory_usage + self.codes.memory_usage + return self.dtype.categories.memory_usage + self.codes.memory_usage def _mimic_inplace( self, other_col: ColumnBase, inplace: bool = False @@ -1259,7 +1260,7 @@ def set_categories( ) -> CategoricalColumn: # See CategoricalAccessor.set_categories. - ordered = ordered if ordered is not None else self.ordered + ordered = ordered if ordered is not None else self.dtype.ordered new_categories = column.as_column(new_categories) if isinstance(new_categories, CategoricalColumn): @@ -1270,7 +1271,7 @@ def set_categories( # categories. if rename: # enforce same length - if len(new_categories) != len(self.categories): + if len(new_categories) != len(self.dtype.categories): raise ValueError( "new_categories must have the same " "number of items as old categories" @@ -1297,7 +1298,7 @@ def set_categories( ) elif ( not out_col._categories_equal(new_categories, ordered=True) - or not self.ordered == ordered + or not self.dtype.ordered == ordered ): out_col = out_col._set_categories( new_categories, @@ -1308,7 +1309,7 @@ def set_categories( def _categories_equal( self, new_categories: ColumnBase, ordered=False ) -> bool: - cur_categories = self.categories + cur_categories = self.dtype.categories if len(new_categories) != len(cur_categories): return False if new_categories.dtype != cur_categories.dtype: @@ -1337,7 +1338,7 @@ def _set_categories( Assumes ``new_categories`` is the same dtype as the current categories """ - cur_cats = column.as_column(self.categories) + cur_cats = column.as_column(self.dtype.categories) new_cats = column.as_column(new_categories) # Join the old and new categories to build a map from @@ -1379,7 +1380,7 @@ def _set_categories( df = df.sort_values(by="order") df.reset_index(drop=True, inplace=True) - ordered = ordered if ordered is not None else self.ordered + ordered = ordered if ordered is not None else self.dtype.ordered new_codes = df._data["new_codes"] # codes can't have masks, so take mask out before moving in @@ -1411,31 +1412,17 @@ def reorder_categories( ) return self._set_categories(new_categories, ordered=ordered) - def as_ordered(self): - out_col = self - if not out_col.ordered: - out_col = column.build_categorical_column( - categories=self.categories, - codes=self.codes, - mask=self.base_mask, - size=self.base_size, - offset=self.offset, - ordered=True, - ) - return out_col - - def as_unordered(self): - out_col = self - if out_col.ordered: - out_col = column.build_categorical_column( + def as_ordered(self, ordered: bool): + if self.dtype.ordered != ordered: + return column.build_categorical_column( categories=self.categories, codes=self.codes, mask=self.base_mask, size=self.base_size, offset=self.offset, - ordered=False, + ordered=ordered, ) - return out_col + return self def _create_empty_categorical_column( diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 3bd342e24c2..73617763221 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -205,10 +205,6 @@ def ordered(self) -> bool: """ return self._ordered - @ordered.setter - def ordered(self, value) -> None: - self._ordered = value - @classmethod def from_pandas(cls, dtype: pd.CategoricalDtype) -> "CategoricalDtype": """ diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index bd9dc1ae3da..7850d592eab 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2585,49 +2585,38 @@ def __init__( copy=False, name=None, ): - if isinstance(dtype, (pd.CategoricalDtype, cudf.CategoricalDtype)): - if categories is not None or ordered is not None: + name = _setdefault_name(data, name=name)["name"] + if dtype is not None: + dtype = cudf.dtype(dtype) + if not isinstance(dtype, cudf.CategoricalDtype): + raise ValueError("dtype must be a CategoricalDtype") + elif categories is not None or ordered is not None: raise ValueError( "Cannot specify `categories` or " "`ordered` together with `dtype`." ) + col = column.as_column(data) if copy: - data = column.as_column(data, dtype=dtype).copy(deep=True) - kwargs = _setdefault_name(data, name=name) - if isinstance(data, CategoricalColumn): - data = data - elif isinstance(data, pd.Series) and ( - isinstance(data.dtype, pd.CategoricalDtype) - ): - codes_data = column.as_column(data.cat.codes.values) - data = column.build_categorical_column( - categories=data.cat.categories, - codes=codes_data, - ordered=data.cat.ordered, - ) - elif isinstance(data, (pd.Categorical, pd.CategoricalIndex)): - codes_data = column.as_column(data.codes) - data = column.build_categorical_column( - categories=data.categories, - codes=codes_data, - ordered=data.ordered, - ) + col = col.copy() + if not isinstance(col, CategoricalColumn): + if dtype is None: + dtype = cudf.CategoricalDtype( + categories=categories, ordered=ordered + ) + col = col.astype(dtype) else: - data = column.as_column( - data, dtype="category" if dtype is None else dtype - ) - # dtype has already been taken care - dtype = None - - if categories is not None: - data = data.set_categories(categories, ordered=ordered) - elif isinstance(dtype, (pd.CategoricalDtype, cudf.CategoricalDtype)): - data = data.set_categories(dtype.categories, ordered=ordered) - elif ordered is True and data.ordered is False: - data = data.as_ordered() - elif ordered is False and data.ordered is True: - data = data.as_unordered() - super().__init__(data, **kwargs) + if dtype is not None: + col = col.set_categories( + dtype.categories, ordered=dtype.ordered + ) + else: + if categories is not None: + col = col.set_categories( + categories, ordered=col.dtype.ordered + ) + if ordered is not None: + col = col.as_ordered(ordered) + super().__init__(data, name=name) @property # type: ignore @_cudf_nvtx_annotate @@ -2643,7 +2632,7 @@ def categories(self): """ The categories of this categorical. """ - return as_index(self._values.categories) + return self.dtype.categories def _is_boolean(self): return False diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index ad32ebce01b..44b11754d33 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -846,3 +846,8 @@ def test_empty_series_category_cast(ordered): assert_eq(expected, actual) assert_eq(expected.dtype.ordered, actual.dtype.ordered) + + +def test_categoricaldtype_ordered_not_settable(): + with pytest.raises(AttributeError): + cudf.CategoricalDtype().ordered = False