diff --git a/CHANGELOG.md b/CHANGELOG.md index 58cd4fb7908..bd3389a143b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -293,6 +293,7 @@ - PR #4630 Remove dangling reference to RMM exec policy in drop duplicates tests. - PR #4625 Fix hash-based repartition bug in dask_cudf - PR #4662 Fix to handle `keep_index` in `partition_by_hash` +- PR #4683 Fix Slicing issue with categorical column in DataFrame - PR #4676 Fix bug in `_shuffle_group` for repartition diff --git a/python/cudf/cudf/_libxx/column.pyx b/python/cudf/cudf/_libxx/column.pyx index 23b7b9348b2..b77fcfef80c 100644 --- a/python/cudf/cudf/_libxx/column.pyx +++ b/python/cudf/cudf/_libxx/column.pyx @@ -294,7 +294,7 @@ cdef class Column: cdef mutable_column_view mutable_view(self) except *: if is_categorical_dtype(self.dtype): - col = self.codes + col = self.base_children[0] else: col = self data_dtype = col.dtype @@ -347,7 +347,7 @@ cdef class Column: cdef column_view _view(self, libcudf_types.size_type null_count) except *: if is_categorical_dtype(self.dtype): - col = self.codes + col = self.base_children[0] else: col = self data_dtype = col.dtype @@ -423,7 +423,7 @@ cdef class Column: column_owner = isinstance(owner, Column) if column_owner and is_categorical_dtype(owner.dtype): - owner = owner.codes + owner = owner.base_children[0] size = cv.size() offset = cv.offset() diff --git a/python/cudf/cudf/_libxx/transpose.pyx b/python/cudf/cudf/_libxx/transpose.pyx index b2754f5d401..e28725db62d 100644 --- a/python/cudf/cudf/_libxx/transpose.pyx +++ b/python/cudf/cudf/_libxx/transpose.pyx @@ -56,8 +56,10 @@ def transpose(Table source): if cats is not None: result = Table(index=result._index, data=[ (name, cudf.core.column.column.build_categorical_column( - codes=col, - mask=col.mask, + codes=cudf.core.column.column.as_column( + col.base_data, dtype=col.dtype), + mask=col.base_mask, + size=col.size, categories=cats )) for name, col in result._data.items() diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index baeee613c35..21143fb2049 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -100,11 +100,13 @@ def set_categories(self, new_categories, **kwargs): "new_categories must have the same " "number of items as old categories" ) + out_col = column.build_categorical_column( - new_categories, - self._column.children[0], - self._column.mask, - self._column.size, + categories=new_categories, + codes=self._column.base_children[0], + mask=self._column.base_mask, + size=self._column.size, + offset=self._column.offset, ordered=ordered, ) else: @@ -182,10 +184,13 @@ def _set_categories(self, new_categories, **kwargs): ordered = kwargs.get("ordered", self.ordered) new_codes = df["new_codes"]._column + # codes can't have masks, so take mask out before moving in return column.build_categorical_column( categories=new_cats, - codes=new_codes, - mask=new_codes.mask, + codes=column.as_column(new_codes.base_data, dtype=new_codes.dtype), + mask=new_codes.base_mask, + size=new_codes.size, + offset=new_codes.offset, ordered=ordered, ) @@ -228,6 +233,9 @@ def __init__(self, dtype, mask=None, size=None, offset=0, children=()): respectively """ if size is None: + for child in children: + assert child.offset == 0 + assert child.base_mask is None size = children[0].size size = size - offset if isinstance(dtype, pd.api.types.CategoricalDtype): @@ -294,7 +302,10 @@ def deserialize(cls, header, frames): header["mask"], [frames[n_dtype_frames + n_data_frames]] ) return column.build_column( - data=None, dtype=dtype, mask=mask, children=(data,) + data=None, + dtype=dtype, + mask=mask, + children=(column.as_column(data.base_data, dtype=data.dtype),), ) def set_base_data(self, value): @@ -318,12 +329,13 @@ def set_base_children(self, value): def children(self): if self._children is None: codes_column = self.base_children[0] + + buf = Buffer(codes_column.base_data) + buf.ptr = buf.ptr + (self.offset * codes_column.dtype.itemsize) + buf.size = self.size * codes_column.dtype.itemsize + codes_column = column.build_column( - data=codes_column.base_data, - dtype=codes_column.dtype, - mask=codes_column.base_mask, - size=self.size, - offset=self.offset + codes_column.offset, + data=buf, dtype=codes_column.dtype, size=self.size, ) self._children = (codes_column,) return self._children @@ -392,7 +404,7 @@ def normalize_binop_value(self, other): col = column.build_categorical_column( categories=self.dtype.categories, codes=column.as_column(ary), - mask=self.mask, + mask=self.base_mask, ordered=self.dtype.ordered, ) return col @@ -401,8 +413,9 @@ def sort_by_values(self, ascending=True, na_position="last"): codes, inds = self.as_numerical.sort_by_values(ascending, na_position) col = column.build_categorical_column( categories=self.dtype.categories, - codes=codes, - mask=self.mask, + codes=column.as_column(codes.base_data, dtype=codes.dtype), + mask=codes.base_mask, + size=codes.size, ordered=self.dtype.ordered, ) return col, inds @@ -439,8 +452,10 @@ def unique(self): codes = self.as_numerical.unique() return column.build_categorical_column( categories=self.categories, - codes=codes, - mask=codes.mask, + codes=column.as_column(codes.base_data, dtype=codes.dtype), + mask=codes.base_mask, + offset=codes.offset, + size=codes.size, ordered=self.ordered, ) @@ -479,8 +494,10 @@ def find_and_replace(self, to_replace, replacement, all_nan): return column.build_categorical_column( categories=self.dtype.categories, - codes=output, - mask=self.mask, + codes=column.as_column(output.base_data, dtype=output.dtype), + mask=output.base_mask, + offset=output.offset, + size=output.size, ordered=self.dtype.ordered, ) @@ -517,7 +534,9 @@ def fillna(self, fill_value): result = column.build_categorical_column( categories=self.dtype.categories, - codes=result, + codes=column.as_column(result.base_data, dtype=result.dtype), + offset=result.offset, + size=result.size, mask=None, ordered=self.dtype.ordered, ) @@ -582,18 +601,27 @@ def _get_decategorized_column(self): def copy(self, deep=True): if deep: copied_col = libcudfxx.copying.copy_column(self) + return column.build_categorical_column( categories=self.dtype.categories, - codes=copied_col, - mask=copied_col.mask, + codes=column.as_column( + copied_col.base_data, dtype=copied_col.dtype + ), + offset=copied_col.offset, + size=copied_col.size, + mask=copied_col.base_mask, ordered=self.dtype.ordered, ) else: return column.build_categorical_column( categories=self.dtype.categories, - codes=self.codes, - mask=self.mask, + codes=column.as_column( + self.codes.base_data, dtype=self.codes.dtype + ), + mask=self.base_mask, ordered=self.dtype.ordered, + offset=self.offset, + size=self.size, ) def __sizeof__(self): @@ -635,7 +663,8 @@ def pandas_categorical_as_column(categorical, codes=None): return column.build_categorical_column( categories=categorical.categories, - codes=codes, + codes=column.as_column(codes.base_data, dtype=codes.dtype), + size=codes.size, mask=mask, ordered=categorical.ordered, ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 6fb521d440d..1f3876786df 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -253,7 +253,11 @@ def _concat(cls, objs, dtype=None): if is_categorical: col = build_categorical_column( - categories=cats, codes=col, mask=col.mask + categories=cats, + codes=as_column(col.base_data, dtype=col.dtype), + mask=col.base_mask, + size=col.size, + offset=col.offset, ) return col @@ -434,11 +438,13 @@ def __getitem__(self, arg): if is_categorical_dtype(self): codes = self.codes[arg] - return build_column( - data=None, - dtype=self.dtype, - mask=codes.mask, - children=(codes,), + return build_categorical_column( + categories=self.categories, + codes=as_column(codes.base_data, dtype=codes.dtype), + mask=codes.base_mask, + ordered=self.ordered, + size=codes.size, + offset=codes.offset, ) start, stop, stride = arg.indices(len(self)) @@ -547,7 +553,7 @@ def __setitem__(self, key, value): if is_categorical_dtype(value.dtype): out = build_categorical_column( categories=value.categories, - codes=out, + codes=as_column(out.base_data, dtype=out.dtype), mask=out.base_mask, size=out.size, offset=out.offset, @@ -565,7 +571,7 @@ def __setitem__(self, key, value): if is_categorical_dtype(self.dtype): out = build_categorical_column( categories=self.categories, - codes=out, + codes=as_column(out.base_data, dtype=out.dtype), mask=out.base_mask, size=out.size, offset=out.offset, @@ -893,7 +899,11 @@ def column_empty_like(column, dtype=None, masked=False, newsize=None): ): codes = column_empty_like(column.codes, masked=masked, newsize=newsize) return build_column( - data=None, dtype=dtype, mask=codes.mask, children=(codes,) + data=None, + dtype=dtype, + mask=codes.base_mask, + children=(as_column(codes.base_data, dtype=codes.dtype),), + size=codes.size, ) return column_empty(row_count, dtype, masked) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index ed7fc278271..51684b96d37 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2470,7 +2470,12 @@ def _set_categories(col, cats): else: codes = df[name]._column df[name] = column.build_categorical_column( - categories=cats, codes=codes, ordered=False + categories=cats, + codes=as_column(codes.base_data, dtype=codes.dtype), + mask=codes.base_mask, + size=codes.size, + ordered=False, + offset=codes.offset, ) if sort and len(df): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 7a2cdc26fc2..696613083e8 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -142,7 +142,9 @@ def reassign_categories(categories, cols, col_idxs): if idx in categories: cols[name] = build_categorical_column( categories=categories[idx], - codes=cols[name], + codes=as_column( + cols[name].base_data, dtype=cols[name].dtype + ), mask=cols[name].base_mask, offset=cols[name].offset, size=cols[name].size, @@ -518,7 +520,7 @@ def where(self, boolean_mask, other): if is_categorical_dtype(self[in_col_name].dtype): result = build_categorical_column( categories=self[in_col_name]._column.categories, - codes=result, + codes=as_column(result.base_data, dtype=result.dtype), mask=result.base_mask, size=result.size, offset=result.offset, @@ -548,7 +550,7 @@ def where(self, boolean_mask, other): if is_categorical_dtype(self.dtype): result = build_categorical_column( categories=self._column.categories, - codes=result, + codes=as_column(result.base_data, dtype=result.dtype), mask=result.base_mask, size=result.size, offset=result.offset, @@ -846,7 +848,7 @@ def _copy_categories(self, other, include_index=True): ): self._data[name] = build_categorical_column( categories=other_col.categories, - codes=col, + codes=as_column(col.base_data, dtype=col.dtype), mask=col.base_mask, ordered=other_col.ordered, size=col.size, @@ -1252,7 +1254,9 @@ def _merge( to_frame_data[name] = column.build_categorical_column( categories=dtype.categories, codes=cat_codes.get(name + "_codes", col), - mask=col.mask, + mask=col.base_mask, + size=col.size, + offset=col.offset, ordered=dtype.ordered, ) else: diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 3188bd3a4da..635b33ba3ae 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -5,7 +5,7 @@ import cudf from cudf.core import DataFrame, Index, RangeIndex, Series -from cudf.core.column import build_categorical_column +from cudf.core.column import as_column, build_categorical_column from cudf.core.index import as_index from cudf.utils import cudautils from cudf.utils.dtypes import is_categorical_dtype, is_list_like @@ -246,7 +246,12 @@ def _tile(A, reps): mdata[var_name] = Series( build_categorical_column( - categories=value_vars, codes=temp._column, ordered=False + categories=value_vars, + codes=as_column(temp._column.base_data, dtype=temp._column.dtype), + mask=temp._column.base_mask, + size=temp._column.size, + offset=temp._column.offset, + ordered=False, ) ) diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 54a35f58130..e16ae89f489 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -456,3 +456,13 @@ def test_categorical_remove_categories(pd_str_cat, inplace): cd_sr_1 = cd_sr.cat.remove_categories(["a", "d"], inplace=inplace) raises.match("removals must all be in old categories") + + +def test_categorical_dataframe_slice_copy(): + pdf = pd.DataFrame({"g": pd.Series(["a", "b", "z"], dtype="category")}) + gdf = DataFrame.from_pandas(pdf) + + exp = pdf[1:].copy() + gdf = gdf[1:].copy() + + assert_eq(exp, gdf) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 21b46be984f..6000fa09684 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -7,7 +7,7 @@ from dask.dataframe.io.parquet.arrow import ArrowEngine import cudf -from cudf.core.column import build_categorical_column +from cudf.core.column import as_column, build_categorical_column from cudf.io import write_to_dataset @@ -86,7 +86,13 @@ def read_partition( ] sr = cudf.Series(index2).astype(type(index2)).repeat(len(df)) df[name] = build_categorical_column( - categories=categories, codes=sr._column, ordered=False + categories=categories, + codes=as_column( + sr._column.base_data, dtype=sr._column.dtype + ), + size=sr._column.size, + offset=sr._column.offset, + ordered=False, ) return df