Skip to content

Commit

Permalink
Move _label_encoding from Series to Column (#12040)
Browse files Browse the repository at this point in the history
Maybe closes #9474.

The `Series._label_encoding` function is used to integer-encode columns. It makes much more sense for this to be a method of `Column` rather than `Series` (the index is irrelevant). Thus, I've moved the implementation to `Column`.

Authors:
  - Ashwin Srinath (https://github.com/shwina)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #12040
  • Loading branch information
shwina authored Nov 30, 2022
1 parent e321bf1 commit cb8d9e1
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 57 deletions.
6 changes: 3 additions & 3 deletions python/cudf/cudf/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,9 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None):

cats = values._column.dropna().unique().astype(values.dtype)

name = values.name # label_encoding mutates self.name
labels = values._label_encoding(cats=cats, na_sentinel=na_sentinel).values
values.name = name
labels = values._column._label_encoding(
cats=cats, na_sentinel=na_sentinel
).values

return labels, cats.values if return_cupy_array else Index(cats)

Expand Down
87 changes: 78 additions & 9 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@
_maybe_convert_to_default_type,
cudf_dtype_from_pa_type,
get_time_unit,
is_mixed_with_object_dtype,
min_scalar_type,
min_unsigned_type,
np_to_pa_dtype,
pandas_dtypes_alias_to_cudf_alias,
Expand Down Expand Up @@ -897,8 +899,6 @@ def as_categorical_column(self, dtype, **kwargs) -> ColumnBase:
else:
ordered = False

sr = cudf.Series(self)

# Re-label self w.r.t. the provided categories
if (
isinstance(dtype, cudf.CategoricalDtype)
Expand All @@ -907,7 +907,7 @@ def as_categorical_column(self, dtype, **kwargs) -> ColumnBase:
isinstance(dtype, pd.CategoricalDtype)
and dtype.categories is not None
):
labels = sr._label_encoding(cats=dtype.categories)
labels = self._label_encoding(cats=as_column(dtype.categories))
if "ordered" in kwargs:
warnings.warn(
"Ignoring the `ordered` parameter passed in `**kwargs`, "
Expand All @@ -916,28 +916,28 @@ def as_categorical_column(self, dtype, **kwargs) -> ColumnBase:

return build_categorical_column(
categories=as_column(dtype.categories),
codes=labels._column,
codes=labels,
mask=self.mask,
ordered=dtype.ordered,
)

cats = sr.unique().astype(sr.dtype)
cats = self.unique().astype(self.dtype)
label_dtype = min_unsigned_type(len(cats))
labels = sr._label_encoding(
labels = self._label_encoding(
cats=cats, dtype=label_dtype, na_sentinel=1
)

# columns include null index in factorization; remove:
if self.has_nulls():
cats = cats._column.dropna(drop_nan=False)
cats = cats.dropna(drop_nan=False)
min_type = min_unsigned_type(len(cats), 8)
labels = labels - 1
if cudf.dtype(min_type).itemsize < labels.dtype.itemsize:
labels = labels.astype(min_type)

return build_categorical_column(
categories=cats,
codes=labels._column,
codes=labels,
mask=self.mask,
ordered=ordered,
)
Expand Down Expand Up @@ -998,7 +998,7 @@ def apply_boolean_mask(self, mask) -> ColumnBase:

def argsort(
self, ascending: bool = True, na_position: str = "last"
) -> ColumnBase:
) -> "cudf.core.column.NumericalColumn":

return self.as_frame()._get_sorted_inds(
ascending=ascending, na_position=na_position
Expand Down Expand Up @@ -1215,6 +1215,75 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:
"""
return self

def _label_encoding(
    self, cats: ColumnBase, dtype: Dtype = None, na_sentinel=-1
):
    """
    Integer-encode the values of `self` against the categories `cats`.

    Every value in `self` is replaced by the code of the matching entry
    in `cats` (codes are ``arange(len(cats))``). Values without a match
    are assigned `na_sentinel`.

    Parameters
    ----------
    cats : ColumnBase
        Column providing the mapping between codes and values.
    dtype : Dtype, optional
        dtype of the generated codes. When omitted it is computed as
        ``min_scalar_type(max(len(cats), na_sentinel), 8)``.
    na_sentinel : int, default -1
        Code used for values of `self` that have no counterpart in
        `cats`.

    Returns
    -------
    Column of codes, reordered so that its values correspond to the
    values of `self`. A column filled with `na_sentinel` is returned
    when `self` and `cats` are mixed with object dtype, or when `cats`
    cannot be cast to ``self.dtype``.

    Examples
    --------
    >>> from cudf.core.column import as_column
    >>> col = as_column(['foo', 'bar', 'foo', 'baz'])
    >>> cats = as_column(['foo', 'bar', 'baz'])
    >>> col._label_encoding(cats)
    <cudf.core.column.numerical.NumericalColumn object at 0x7f99bf3155c0>
    [
      0,
      1,
      0,
      2
    ]
    dtype: int8
    >>> cats = as_column(['foo', 'bar'])
    >>> col._label_encoding(cats)
    <cudf.core.column.numerical.NumericalColumn object at 0x7f99bfde0e40>
    [
      0,
      1,
      0,
      -1
    ]
    dtype: int8
    """
    from cudf._lib.join import join as cpp_join

    if dtype is None:
        dtype = min_scalar_type(max(len(cats), na_sentinel), 8)

    def _sentinel_filled():
        # Result used whenever nothing in `cats` can match `self`.
        return cudf.core.column.full(
            size=len(self), fill_value=na_sentinel, dtype=dtype
        )

    if is_mixed_with_object_dtype(self, cats):
        return _sentinel_filled()

    try:
        # A failed cast means no value of `cats` can correspond to a
        # value of `self`, so every label is the sentinel.
        typed_cats = cats.astype(self.dtype)
    except ValueError:
        return _sentinel_filled()

    left_map, right_map = cpp_join([self], [typed_cats], how="left")

    # Gather the code of each matched category; rows without a match
    # are nullified by the gather and then filled with the sentinel.
    matched = arange(len(typed_cats), dtype=dtype).take(
        right_map, nullify=True, check_bounds=False
    )
    matched = matched.fillna(na_sentinel)

    # The join output is not guaranteed to follow the row order of
    # `self`; sort the codes back by the originating row position.
    row_positions = arange(len(self)).take(left_map, check_bounds=False)
    return matched.take(row_positions.argsort())


def column_empty_like(
column: ColumnBase,
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ def __init__(self, categories=None, ordered: bool = False) -> None:
self._ordered = ordered

@property
def categories(self) -> "cudf.core.index.BaseIndex":
def categories(self) -> "cudf.core.index.GenericIndex":
"""
An ``Index`` containing the unique categories allowed.
Expand Down
44 changes: 0 additions & 44 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,6 @@
can_convert_to_column,
find_common_type,
is_mixed_with_object_dtype,
min_scalar_type,
to_cudf_compatible_scalar,
)
from cudf.utils.utils import _cudf_nvtx_annotate
Expand Down Expand Up @@ -2280,49 +2279,6 @@ def update(self, other):

self.mask(mask, other, inplace=True)

@_cudf_nvtx_annotate
def _label_encoding(self, cats, dtype=None, na_sentinel=-1):
    """
    Private implementation of the deprecated public label_encoding method.

    Encodes each value of this Series as the integer code of the
    matching entry in `cats`; values without a match get `na_sentinel`.

    Parameters
    ----------
    cats
        Categories to encode against; converted with
        ``column.as_column``.
    dtype : optional
        dtype of the codes. When omitted it is computed as
        ``min_scalar_type(max(len(cats), na_sentinel), 8)``.
    na_sentinel : int, default -1
        Code used for values that are not found in `cats`.

    Returns
    -------
    Unnamed result carrying this Series' index and holding the codes.
    A sentinel-filled Series is returned when `self` and `cats` are
    mixed with object dtype, or when `cats` cannot be cast to
    ``self.dtype``.
    """
    if dtype is None:
        dtype = min_scalar_type(max(len(cats), na_sentinel), 8)

    def _all_sentinel():
        # Result used whenever nothing in `cats` can match `self`.
        filler = cudf.core.column.full(
            size=len(self), fill_value=na_sentinel, dtype=dtype
        )
        return Series(filler, index=self.index, name=None)

    cats = column.as_column(cats)
    if is_mixed_with_object_dtype(self, cats):
        return _all_sentinel()

    try:
        # A failed cast means no value of `cats` can correspond to a
        # value of `self`, so every label is the sentinel.
        cats = cats.astype(self.dtype)
    except ValueError:
        return _all_sentinel()

    # Lookup table: category value -> integer code.
    lookup = cudf.DataFrame(
        {"value": cats, "code": column.arange(len(cats), dtype=dtype)}
    )
    # Values to encode, tagged with their original row position so the
    # original order can be restored after the merge.
    positions = cudf.DataFrame(
        {
            "value": self._data.columns[0].copy(deep=False),
            "order": column.arange(len(self)),
        }
    )

    merged = positions.merge(lookup, on="value", how="left")
    encoded = merged.sort_values("order")["code"].fillna(na_sentinel)

    encoded.name = None
    encoded.index = self._index
    return encoded

# UDF related
@_cudf_nvtx_annotate
def apply(self, func, convert_dtype=True, args=(), **kwargs):
Expand Down

0 comments on commit cb8d9e1

Please sign in to comment.