Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Preserve order if necessary when deduping categoricals internally #11597

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -945,8 +945,8 @@ def clip(self, lo: ScalarLike, hi: ScalarLike) -> "column.ColumnBase":
def data_array_view(self) -> cuda.devicearray.DeviceNDArray:
return self.codes.data_array_view

def unique(self) -> CategoricalColumn:
codes = self.as_numerical.unique()
def unique(self, preserve_order=False) -> CategoricalColumn:
codes = self.as_numerical.unique(preserve_order=preserve_order)
return column.build_categorical_column(
categories=self.categories,
codes=column.build_column(codes.base_data, dtype=codes.dtype),
Expand Down Expand Up @@ -1316,7 +1316,9 @@ def _concat(
head = next((obj for obj in objs if obj.valid_count), objs[0])

# Combine and de-dupe the categories
cats = column.concat_columns([o.categories for o in objs]).unique()
cats = column.concat_columns([o.categories for o in objs]).unique(
preserve_order=True
)
objs = [o._set_categories(cats, is_unique=True) for o in objs]
codes = [o.codes for o in objs]

Expand Down Expand Up @@ -1456,10 +1458,8 @@ def _set_categories(
# Ensure new_categories is unique first
if not (is_unique or new_cats.is_unique):
# De-duplicate while preserving first-occurrence order (a plain unique() does not guarantee order)
new_cats = (
cudf.Series(new_cats)
.drop_duplicates(ignore_index=True)
._column
new_cats = cudf.Series(new_cats)._column.unique(
preserve_order=True
)

cur_codes = self.codes
Expand Down
11 changes: 10 additions & 1 deletion python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -1028,7 +1028,7 @@ def searchsorted(
values, side, ascending=ascending, na_position=na_position
)

def unique(self) -> ColumnBase:
def unique(self, preserve_order=False) -> ColumnBase:
"""
Get unique values in the data
"""
Expand All @@ -1037,6 +1037,15 @@ def unique(self) -> ColumnBase:
# Before we can apply this optimization, the following issue
# needs to be resolved:
# https://github.com/rapidsai/cudf/issues/5286
if preserve_order:
ind = as_column(cupy.arange(0, len(self)))

# dedup based on the column of data only
ind, col = drop_duplicates([ind, self], keys=[1])

# sort col based on ind
map = ind.argsort()
return col.take(map)

return drop_duplicates([self], keep="first")[0]

Expand Down
8 changes: 3 additions & 5 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -7244,11 +7244,9 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes):
isinstance(col, cudf.core.column.CategoricalColumn) for col in cols
):
# Combine and de-dupe the categories
categories[idx] = (
cudf.Series(concat_columns([col.categories for col in cols]))
.drop_duplicates(ignore_index=True)
._column
)
categories[idx] = cudf.Series(
concat_columns([col.categories for col in cols])
)._column.unique(preserve_order=True)
# Set the column dtype to the codes' dtype. The categories
# will be re-assigned at the end
dtypes[idx] = min_scalar_type(len(categories[idx]))
Expand Down
16 changes: 16 additions & 0 deletions python/cudf/cudf/tests/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -1761,3 +1761,19 @@ def test_concat_decimal_non_numeric(s1, s2, expected):
def test_concat_struct_column(s1, s2, expected):
s = gd.concat([s1, s2])
assert_eq(s, expected, check_index_type=True)


def test_concat_categorical_ordering():
    """Concatenating categorical columns must match pandas, including the
    order of the resulting categories.

    Regression test for https://github.com/rapidsai/cudf/issues/11486
    """
    letters = ["a", "b", "c", "d", "e"]
    # Use a category order that differs from the sorted order so that any
    # reordering during de-duplication becomes visible.
    categorical = pd.Series(letters * 2, dtype="category").cat.set_categories(
        ["d", "a", "b", "c", "e"]
    )

    pdf = pd.DataFrame({"a": categorical})
    gdf = gd.from_pandas(pdf)

    expect = pd.concat([pdf] * 3)
    got = gd.concat([gdf] * 3)

    assert_eq(expect, got)
9 changes: 9 additions & 0 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1928,3 +1928,12 @@ def test_default_integer_bitwidth_construction(default_integer_bitwidth, data):
def test_default_float_bitwidth_construction(default_float_bitwidth, data):
s = cudf.Series(data)
assert s.dtype == np.dtype(f"f{default_float_bitwidth//8}")


def test_series_ordered_dedup():
    """Check that ``unique(preserve_order=True)`` returns values in
    first-occurrence order, matching ``pandas.Series.unique``.

    Part of https://github.com/rapidsai/cudf/issues/11486
    """
    # Seed a local generator so the input (and any failure) is reproducible
    # without touching NumPy's global random state.
    rng = np.random.RandomState(0)
    sr = cudf.Series(rng.randint(0, 100, 1000))
    # pandas unique() preserves order of first appearance
    expect = pd.Series(sr.to_pandas().unique())
    got = cudf.Series(sr._column.unique(preserve_order=True))
    assert_eq(expect.values, got.values)