Skip to content

Commit

Permalink
Merge pull request #4683 from rgsl888prabhu/4678_dataframe_slice_copy…
Browse files Browse the repository at this point in the history
…_issur

[REVIEW] Fix Slicing issue with categorical column in DataFrame
  • Loading branch information
raydouglass authored Mar 26, 2020
2 parents f90b91e + 4ee660b commit b8c4924
Show file tree
Hide file tree
Showing 10 changed files with 121 additions and 49 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,7 @@
- PR #4630 Remove dangling reference to RMM exec policy in drop duplicates tests.
- PR #4625 Fix hash-based repartition bug in dask_cudf
- PR #4662 Fix to handle `keep_index` in `partition_by_hash`
- PR #4683 Fix Slicing issue with categorical column in DataFrame
- PR #4676 Fix bug in `_shuffle_group` for repartition
- PR #4681 Fix `test_repr` tests that were generating a `RangeIndex` for column names

Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/_libxx/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,7 @@ cdef class Column:

cdef mutable_column_view mutable_view(self) except *:
if is_categorical_dtype(self.dtype):
col = self.codes
col = self.base_children[0]
else:
col = self
data_dtype = col.dtype
Expand Down Expand Up @@ -347,7 +347,7 @@ cdef class Column:

cdef column_view _view(self, libcudf_types.size_type null_count) except *:
if is_categorical_dtype(self.dtype):
col = self.codes
col = self.base_children[0]
else:
col = self
data_dtype = col.dtype
Expand Down Expand Up @@ -423,7 +423,7 @@ cdef class Column:

column_owner = isinstance(owner, Column)
if column_owner and is_categorical_dtype(owner.dtype):
owner = owner.codes
owner = owner.base_children[0]

size = cv.size()
offset = cv.offset()
Expand Down
6 changes: 4 additions & 2 deletions python/cudf/cudf/_libxx/transpose.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,10 @@ def transpose(Table source):
if cats is not None:
result = Table(index=result._index, data=[
(name, cudf.core.column.column.build_categorical_column(
codes=col,
mask=col.mask,
codes=cudf.core.column.column.as_column(
col.base_data, dtype=col.dtype),
mask=col.base_mask,
size=col.size,
categories=cats
))
for name, col in result._data.items()
Expand Down
79 changes: 54 additions & 25 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,11 +100,13 @@ def set_categories(self, new_categories, **kwargs):
"new_categories must have the same "
"number of items as old categories"
)

out_col = column.build_categorical_column(
new_categories,
self._column.children[0],
self._column.mask,
self._column.size,
categories=new_categories,
codes=self._column.base_children[0],
mask=self._column.base_mask,
size=self._column.size,
offset=self._column.offset,
ordered=ordered,
)
else:
Expand Down Expand Up @@ -182,10 +184,13 @@ def _set_categories(self, new_categories, **kwargs):
ordered = kwargs.get("ordered", self.ordered)
new_codes = df["new_codes"]._column

# codes can't have masks, so take mask out before moving in
return column.build_categorical_column(
categories=new_cats,
codes=new_codes,
mask=new_codes.mask,
codes=column.as_column(new_codes.base_data, dtype=new_codes.dtype),
mask=new_codes.base_mask,
size=new_codes.size,
offset=new_codes.offset,
ordered=ordered,
)

Expand Down Expand Up @@ -228,6 +233,9 @@ def __init__(self, dtype, mask=None, size=None, offset=0, children=()):
respectively
"""
if size is None:
for child in children:
assert child.offset == 0
assert child.base_mask is None
size = children[0].size
size = size - offset
if isinstance(dtype, pd.api.types.CategoricalDtype):
Expand Down Expand Up @@ -294,7 +302,10 @@ def deserialize(cls, header, frames):
header["mask"], [frames[n_dtype_frames + n_data_frames]]
)
return column.build_column(
data=None, dtype=dtype, mask=mask, children=(data,)
data=None,
dtype=dtype,
mask=mask,
children=(column.as_column(data.base_data, dtype=data.dtype),),
)

def set_base_data(self, value):
Expand All @@ -318,12 +329,13 @@ def set_base_children(self, value):
def children(self):
if self._children is None:
codes_column = self.base_children[0]

buf = Buffer(codes_column.base_data)
buf.ptr = buf.ptr + (self.offset * codes_column.dtype.itemsize)
buf.size = self.size * codes_column.dtype.itemsize

codes_column = column.build_column(
data=codes_column.base_data,
dtype=codes_column.dtype,
mask=codes_column.base_mask,
size=self.size,
offset=self.offset + codes_column.offset,
data=buf, dtype=codes_column.dtype, size=self.size,
)
self._children = (codes_column,)
return self._children
Expand Down Expand Up @@ -392,7 +404,7 @@ def normalize_binop_value(self, other):
col = column.build_categorical_column(
categories=self.dtype.categories,
codes=column.as_column(ary),
mask=self.mask,
mask=self.base_mask,
ordered=self.dtype.ordered,
)
return col
Expand All @@ -401,8 +413,9 @@ def sort_by_values(self, ascending=True, na_position="last"):
codes, inds = self.as_numerical.sort_by_values(ascending, na_position)
col = column.build_categorical_column(
categories=self.dtype.categories,
codes=codes,
mask=self.mask,
codes=column.as_column(codes.base_data, dtype=codes.dtype),
mask=codes.base_mask,
size=codes.size,
ordered=self.dtype.ordered,
)
return col, inds
Expand Down Expand Up @@ -439,8 +452,10 @@ def unique(self):
codes = self.as_numerical.unique()
return column.build_categorical_column(
categories=self.categories,
codes=codes,
mask=codes.mask,
codes=column.as_column(codes.base_data, dtype=codes.dtype),
mask=codes.base_mask,
offset=codes.offset,
size=codes.size,
ordered=self.ordered,
)

Expand Down Expand Up @@ -479,8 +494,10 @@ def find_and_replace(self, to_replace, replacement, all_nan):

return column.build_categorical_column(
categories=self.dtype.categories,
codes=output,
mask=self.mask,
codes=column.as_column(output.base_data, dtype=output.dtype),
mask=output.base_mask,
offset=output.offset,
size=output.size,
ordered=self.dtype.ordered,
)

Expand Down Expand Up @@ -517,7 +534,9 @@ def fillna(self, fill_value):

result = column.build_categorical_column(
categories=self.dtype.categories,
codes=result,
codes=column.as_column(result.base_data, dtype=result.dtype),
offset=result.offset,
size=result.size,
mask=None,
ordered=self.dtype.ordered,
)
Expand Down Expand Up @@ -582,18 +601,27 @@ def _get_decategorized_column(self):
def copy(self, deep=True):
if deep:
copied_col = libcudfxx.copying.copy_column(self)

return column.build_categorical_column(
categories=self.dtype.categories,
codes=copied_col,
mask=copied_col.mask,
codes=column.as_column(
copied_col.base_data, dtype=copied_col.dtype
),
offset=copied_col.offset,
size=copied_col.size,
mask=copied_col.base_mask,
ordered=self.dtype.ordered,
)
else:
return column.build_categorical_column(
categories=self.dtype.categories,
codes=self.codes,
mask=self.mask,
codes=column.as_column(
self.codes.base_data, dtype=self.codes.dtype
),
mask=self.base_mask,
ordered=self.dtype.ordered,
offset=self.offset,
size=self.size,
)

def __sizeof__(self):
Expand Down Expand Up @@ -635,7 +663,8 @@ def pandas_categorical_as_column(categorical, codes=None):

return column.build_categorical_column(
categories=categorical.categories,
codes=codes,
codes=column.as_column(codes.base_data, dtype=codes.dtype),
size=codes.size,
mask=mask,
ordered=categorical.ordered,
)
28 changes: 19 additions & 9 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,11 @@ def _concat(cls, objs, dtype=None):

if is_categorical:
col = build_categorical_column(
categories=cats, codes=col, mask=col.mask
categories=cats,
codes=as_column(col.base_data, dtype=col.dtype),
mask=col.base_mask,
size=col.size,
offset=col.offset,
)

return col
Expand Down Expand Up @@ -434,11 +438,13 @@ def __getitem__(self, arg):

if is_categorical_dtype(self):
codes = self.codes[arg]
return build_column(
data=None,
dtype=self.dtype,
mask=codes.mask,
children=(codes,),
return build_categorical_column(
categories=self.categories,
codes=as_column(codes.base_data, dtype=codes.dtype),
mask=codes.base_mask,
ordered=self.ordered,
size=codes.size,
offset=codes.offset,
)

start, stop, stride = arg.indices(len(self))
Expand Down Expand Up @@ -547,7 +553,7 @@ def __setitem__(self, key, value):
if is_categorical_dtype(value.dtype):
out = build_categorical_column(
categories=value.categories,
codes=out,
codes=as_column(out.base_data, dtype=out.dtype),
mask=out.base_mask,
size=out.size,
offset=out.offset,
Expand All @@ -565,7 +571,7 @@ def __setitem__(self, key, value):
if is_categorical_dtype(self.dtype):
out = build_categorical_column(
categories=self.categories,
codes=out,
codes=as_column(out.base_data, dtype=out.dtype),
mask=out.base_mask,
size=out.size,
offset=out.offset,
Expand Down Expand Up @@ -893,7 +899,11 @@ def column_empty_like(column, dtype=None, masked=False, newsize=None):
):
codes = column_empty_like(column.codes, masked=masked, newsize=newsize)
return build_column(
data=None, dtype=dtype, mask=codes.mask, children=(codes,)
data=None,
dtype=dtype,
mask=codes.base_mask,
children=(as_column(codes.base_data, dtype=codes.dtype),),
size=codes.size,
)

return column_empty(row_count, dtype, masked)
Expand Down
7 changes: 6 additions & 1 deletion python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2470,7 +2470,12 @@ def _set_categories(col, cats):
else:
codes = df[name]._column
df[name] = column.build_categorical_column(
categories=cats, codes=codes, ordered=False
categories=cats,
codes=as_column(codes.base_data, dtype=codes.dtype),
mask=codes.base_mask,
size=codes.size,
ordered=False,
offset=codes.offset,
)

if sort and len(df):
Expand Down
14 changes: 9 additions & 5 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,9 @@ def reassign_categories(categories, cols, col_idxs):
if idx in categories:
cols[name] = build_categorical_column(
categories=categories[idx],
codes=cols[name],
codes=as_column(
cols[name].base_data, dtype=cols[name].dtype
),
mask=cols[name].base_mask,
offset=cols[name].offset,
size=cols[name].size,
Expand Down Expand Up @@ -518,7 +520,7 @@ def where(self, boolean_mask, other):
if is_categorical_dtype(self[in_col_name].dtype):
result = build_categorical_column(
categories=self[in_col_name]._column.categories,
codes=result,
codes=as_column(result.base_data, dtype=result.dtype),
mask=result.base_mask,
size=result.size,
offset=result.offset,
Expand Down Expand Up @@ -548,7 +550,7 @@ def where(self, boolean_mask, other):
if is_categorical_dtype(self.dtype):
result = build_categorical_column(
categories=self._column.categories,
codes=result,
codes=as_column(result.base_data, dtype=result.dtype),
mask=result.base_mask,
size=result.size,
offset=result.offset,
Expand Down Expand Up @@ -905,7 +907,7 @@ def _copy_categories(self, other, include_index=True):
):
self._data[name] = build_categorical_column(
categories=other_col.categories,
codes=col,
codes=as_column(col.base_data, dtype=col.dtype),
mask=col.base_mask,
ordered=other_col.ordered,
size=col.size,
Expand Down Expand Up @@ -1311,7 +1313,9 @@ def _merge(
to_frame_data[name] = column.build_categorical_column(
categories=dtype.categories,
codes=cat_codes.get(name + "_codes", col),
mask=col.mask,
mask=col.base_mask,
size=col.size,
offset=col.offset,
ordered=dtype.ordered,
)
else:
Expand Down
9 changes: 7 additions & 2 deletions python/cudf/cudf/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import cudf
from cudf.core import DataFrame, Index, RangeIndex, Series
from cudf.core.column import build_categorical_column
from cudf.core.column import as_column, build_categorical_column
from cudf.core.index import as_index
from cudf.utils import cudautils
from cudf.utils.dtypes import is_categorical_dtype, is_list_like
Expand Down Expand Up @@ -246,7 +246,12 @@ def _tile(A, reps):

mdata[var_name] = Series(
build_categorical_column(
categories=value_vars, codes=temp._column, ordered=False
categories=value_vars,
codes=as_column(temp._column.base_data, dtype=temp._column.dtype),
mask=temp._column.base_mask,
size=temp._column.size,
offset=temp._column.offset,
ordered=False,
)
)

Expand Down
10 changes: 10 additions & 0 deletions python/cudf/cudf/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,3 +456,13 @@ def test_categorical_remove_categories(pd_str_cat, inplace):
cd_sr_1 = cd_sr.cat.remove_categories(["a", "d"], inplace=inplace)

raises.match("removals must all be in old categories")


def test_categorical_dataframe_slice_copy():
pdf = pd.DataFrame({"g": pd.Series(["a", "b", "z"], dtype="category")})
gdf = DataFrame.from_pandas(pdf)

exp = pdf[1:].copy()
gdf = gdf[1:].copy()

assert_eq(exp, gdf)
Loading

0 comments on commit b8c4924

Please sign in to comment.