Move _label_encoding from Series to Column (#12040)

Maybe closes #9474. The `Series._label_encoding` function is used to integer-encode columns. It makes much more sense for this to be a method of `Column` rather than `Series` (the index is irrelevant). Thus, I've moved the implementation to `Column`.

Authors:
- Ashwin Srinath (https://github.com/shwina)

Approvers:
- Vyas Ramasubramani (https://github.com/vyasr)

URL: #12040
rapidsai · Nov 30, 2022 · cb8d9e1 · cb8d9e1
1 parent e321bf1
commit cb8d9e1
Show file tree

Hide file tree

Showing 4 changed files with 82 additions and 57 deletions.
diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py
@@ -58,9 +58,9 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None):
 
     cats = values._column.dropna().unique().astype(values.dtype)
 
-    name = values.name  # label_encoding mutates self.name
-    labels = values._label_encoding(cats=cats, na_sentinel=na_sentinel).values
-    values.name = name
+    labels = values._column._label_encoding(
+        cats=cats, na_sentinel=na_sentinel
+    ).values
 
     return labels, cats.values if return_cupy_array else Index(cats)
 

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -77,6 +77,8 @@
     _maybe_convert_to_default_type,
     cudf_dtype_from_pa_type,
     get_time_unit,
+    is_mixed_with_object_dtype,
+    min_scalar_type,
     min_unsigned_type,
     np_to_pa_dtype,
     pandas_dtypes_alias_to_cudf_alias,
@@ -897,8 +899,6 @@ def as_categorical_column(self, dtype, **kwargs) -> ColumnBase:
         else:
             ordered = False
 
-        sr = cudf.Series(self)
-
         # Re-label self w.r.t. the provided categories
         if (
             isinstance(dtype, cudf.CategoricalDtype)
@@ -907,7 +907,7 @@ def as_categorical_column(self, dtype, **kwargs) -> ColumnBase:
             isinstance(dtype, pd.CategoricalDtype)
             and dtype.categories is not None
         ):
-            labels = sr._label_encoding(cats=dtype.categories)
+            labels = self._label_encoding(cats=as_column(dtype.categories))
             if "ordered" in kwargs:
                 warnings.warn(
                     "Ignoring the `ordered` parameter passed in `**kwargs`, "
@@ -916,28 +916,28 @@ def as_categorical_column(self, dtype, **kwargs) -> ColumnBase:
 
             return build_categorical_column(
                 categories=as_column(dtype.categories),
-                codes=labels._column,
+                codes=labels,
                 mask=self.mask,
                 ordered=dtype.ordered,
             )
 
-        cats = sr.unique().astype(sr.dtype)
+        cats = self.unique().astype(self.dtype)
         label_dtype = min_unsigned_type(len(cats))
-        labels = sr._label_encoding(
+        labels = self._label_encoding(
             cats=cats, dtype=label_dtype, na_sentinel=1
         )
 
         # columns include null index in factorization; remove:
         if self.has_nulls():
-            cats = cats._column.dropna(drop_nan=False)
+            cats = cats.dropna(drop_nan=False)
             min_type = min_unsigned_type(len(cats), 8)
             labels = labels - 1
             if cudf.dtype(min_type).itemsize < labels.dtype.itemsize:
                 labels = labels.astype(min_type)
 
         return build_categorical_column(
             categories=cats,
-            codes=labels._column,
+            codes=labels,
             mask=self.mask,
             ordered=ordered,
         )
@@ -998,7 +998,7 @@ def apply_boolean_mask(self, mask) -> ColumnBase:
 
     def argsort(
         self, ascending: bool = True, na_position: str = "last"
-    ) -> ColumnBase:
+    ) -> "cudf.core.column.NumericalColumn":
 
         return self.as_frame()._get_sorted_inds(
             ascending=ascending, na_position=na_position
@@ -1215,6 +1215,75 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:
         """
         return self
 
+    def _label_encoding(
+        self, cats: ColumnBase, dtype: Dtype = None, na_sentinel=-1
+    ):
+        """
+        Convert each value in `self` into an integer code, with `cats`
+        providing the mapping between codes and values.
+
+        Examples
+        --------
+        >>> from cudf.core.column import as_column
+        >>> col = as_column(['foo', 'bar', 'foo', 'baz'])
+        >>> cats = as_column(['foo', 'bar', 'baz'])
+        >>> col._label_encoding(cats)
+        <cudf.core.column.numerical.NumericalColumn object at 0x7f99bf3155c0>
+        [
+          0,
+          1,
+          0,
+          2
+        ]
+        dtype: int8
+        >>> cats = as_column(['foo', 'bar'])
+        >>> col._label_encoding(cats)
+        <cudf.core.column.numerical.NumericalColumn object at 0x7f99bfde0e40>
+        [
+          0,
+          1,
+          0,
+          -1
+        ]
+        dtype: int8
+        """
+        from cudf._lib.join import join as cpp_join
+
+        def _return_sentinel_column():
+            return cudf.core.column.full(
+                size=len(self), fill_value=na_sentinel, dtype=dtype
+            )
+
+        if dtype is None:
+            dtype = min_scalar_type(max(len(cats), na_sentinel), 8)
+
+        if is_mixed_with_object_dtype(self, cats):
+            return _return_sentinel_column()
+
+        try:
+            # Where there is a type-cast failure, we have
+            # to catch the exception and return encoded labels
+            # with na_sentinel values as there would be no corresponding
+            # encoded values of cats in self.
+            cats = cats.astype(self.dtype)
+        except ValueError:
+            return _return_sentinel_column()
+
+        codes = arange(len(cats), dtype=dtype)
+        left_gather_map, right_gather_map = cpp_join(
+            [self], [cats], how="left"
+        )
+        codes = codes.take(
+            right_gather_map, nullify=True, check_bounds=False
+        ).fillna(na_sentinel)
+
+        # reorder `codes` so that its values correspond to the
+        # values of `self`:
+        order = arange(len(self))
+        order = order.take(left_gather_map, check_bounds=False).argsort()
+        codes = codes.take(order)
+        return codes
+
 
 def column_empty_like(
     column: ColumnBase,

diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
@@ -160,7 +160,7 @@ def __init__(self, categories=None, ordered: bool = False) -> None:
         self._ordered = ordered
 
     @property
-    def categories(self) -> "cudf.core.index.BaseIndex":
+    def categories(self) -> "cudf.core.index.GenericIndex":
         """
         An ``Index`` containing the unique categories allowed.
 

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
@@ -75,7 +75,6 @@
     can_convert_to_column,
     find_common_type,
     is_mixed_with_object_dtype,
-    min_scalar_type,
     to_cudf_compatible_scalar,
 )
 from cudf.utils.utils import _cudf_nvtx_annotate
@@ -2280,49 +2279,6 @@ def update(self, other):
 
         self.mask(mask, other, inplace=True)
 
-    @_cudf_nvtx_annotate
-    def _label_encoding(self, cats, dtype=None, na_sentinel=-1):
-        # Private implementation of deprecated public label_encoding method
-        def _return_sentinel_series():
-            return Series(
-                cudf.core.column.full(
-                    size=len(self), fill_value=na_sentinel, dtype=dtype
-                ),
-                index=self.index,
-                name=None,
-            )
-
-        if dtype is None:
-            dtype = min_scalar_type(max(len(cats), na_sentinel), 8)
-
-        cats = column.as_column(cats)
-        if is_mixed_with_object_dtype(self, cats):
-            return _return_sentinel_series()
-
-        try:
-            # Where there is a type-cast failure, we have
-            # to catch the exception and return encoded labels
-            # with na_sentinel values as there would be no corresponding
-            # encoded values of cats in self.
-            cats = cats.astype(self.dtype)
-        except ValueError:
-            return _return_sentinel_series()
-
-        order = column.arange(len(self))
-        codes = column.arange(len(cats), dtype=dtype)
-
-        value = cudf.DataFrame({"value": cats, "code": codes})
-        codes = cudf.DataFrame(
-            {"value": self._data.columns[0].copy(deep=False), "order": order}
-        )
-
-        codes = codes.merge(value, on="value", how="left")
-        codes = codes.sort_values("order")["code"].fillna(na_sentinel)
-
-        codes.name = None
-        codes.index = self._index
-        return codes
-
     # UDF related
     @_cudf_nvtx_annotate
     def apply(self, func, convert_dtype=True, args=(), **kwargs):