rapidsai · rapids-bot · Mar 7, 2023 · Feb 21, 2023 · Feb 21, 2023 · Feb 21, 2023
diff --git a/docs/cudf/source/api_docs/general_functions.rst b/docs/cudf/source/api_docs/general_functions.rst
@@ -10,12 +10,14 @@ Data manipulations
    :toctree: api/
 
    cudf.concat
+   cudf.crosstab
    cudf.cut
+   cudf.factorize
    cudf.get_dummies
    cudf.melt
+   cudf.merge
    cudf.pivot
    cudf.pivot_table
-   cudf.crosstab
    cudf.unstack
 
 Top-level conversions

diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst
@@ -102,7 +102,6 @@ Function application, GroupBy & window
    :toctree: api/
 
    Series.apply
-   Series.applymap
    Series.map
    Series.groupby
    Series.rolling

@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 import warnings
 
 import cupy as cp
@@ -7,47 +7,109 @@
 from cudf.core.column import as_column
 from cudf.core.index import Index, RangeIndex
 from cudf.core.indexed_frame import IndexedFrame
+from cudf.core.scalar import Scalar
 from cudf.core.series import Series
 
 
-def factorize(values, sort=False, na_sentinel=-1, size_hint=None):
+def factorize(
+    values, sort=False, na_sentinel=None, use_na_sentinel=None, size_hint=None
+):
     """Encode the input values as integer labels
 
     Parameters
     ----------
     values: Series, Index, or CuPy array
         The data to be factorized.
+    sort : bool, default False
+        Sort uniques and shuffle codes to maintain the relationship.
     na_sentinel : number, default -1
         Value to indicate missing category.
 
+        .. deprecated:: 23.04
+
+           The na_sentinel argument is deprecated and will be removed in
+           a future version of cudf. Specify use_na_sentinel as
+           either True or False.
+    use_na_sentinel : bool, default True
+        If True, the sentinel -1 will be used for NA values.
+        If False, NA values will be encoded as non-negative
+        integers, and NA will be kept in the uniques
+        of the values.
+
     Returns
     -------
     (labels, cats) : (cupy.ndarray, cupy.ndarray or Index)
         - *labels* contains the encoded values
         - *cats* contains the categories in order that the N-th
             item corresponds to the (N-1) code.
 
+    See Also
+    --------
+    cudf.Series.factorize : Encode the input values of Series.
+
     Examples
     --------
     >>> import cudf
+    >>> import numpy as np
     >>> data = cudf.Series(['a', 'c', 'c'])
     >>> codes, uniques = cudf.factorize(data)
     >>> codes
     array([0, 1, 1], dtype=int8)
     >>> uniques
     StringIndex(['a' 'c'], dtype='object')
 
-    See Also
-    --------
-    cudf.Series.factorize : Encode the input values of Series.
+    When ``use_na_sentinel=True`` (the default), missing values are indicated
+    in the `codes` with the sentinel value ``-1`` and missing values are not
+    included in `uniques`.
+
+    >>> codes, uniques = cudf.factorize(['b', None, 'a', 'c', 'b'])
+    >>> codes
+    array([ 1, -1,  0,  2,  1], dtype=int8)
+    >>> uniques
+    StringIndex(['a' 'b' 'c'], dtype='object')
 
+    If NA is in the values, and we want to include NA in the uniques of the
+    values, it can be achieved by setting ``use_na_sentinel=False``.
+
+    >>> values = np.array([1, 2, 1, np.nan])
+    >>> codes, uniques = cudf.factorize(values)
+    >>> codes
+    array([ 0,  1,  0, -1], dtype=int8)
+    >>> uniques
+    Float64Index([1.0, 2.0], dtype='float64')
+    >>> codes, uniques = cudf.factorize(values, use_na_sentinel=False)
+    >>> codes
+    array([1, 2, 1, 0], dtype=int8)
+    >>> uniques
+    Float64Index([<NA>, 1.0, 2.0], dtype='float64')
     """
-    if sort:
-        raise NotImplementedError(
-            "Sorting not yet supported during factorization."
+    # TODO: Drop `na_sentinel` in the next release immediately after
+    # pandas 2.0 upgrade.
+    if na_sentinel is not None:
+        warnings.warn(
+            "Specifying the specific value to use for `na_sentinel` is "
+            "deprecated and will be removed in a future version of cudf. "
+            "Specify `use_na_sentinel=True` to use the sentinel value -1, "
+            "and `use_na_sentinel=False` to encode NA values.",
+            FutureWarning,
+        )
+
+    if use_na_sentinel is not None and na_sentinel is not None:
+        raise ValueError(
+            "Cannot specify both `na_sentinel` and `use_na_sentile`; "
+            f"got `na_sentinel={na_sentinel}` and "
+            f"`use_na_sentinel={use_na_sentinel}`"
+        )
+    elif use_na_sentinel is None and na_sentinel is None:
+        use_na_sentinel = True
+        na_sentinel = -1
+    elif use_na_sentinel is None:
+        use_na_sentinel = True
+    else:
+        # use_na_sentinel is either True or False, na_sentinel is None
+        na_sentinel = (
+            -1 if use_na_sentinel else Scalar(None, dtype=values.dtype)
         )
-    if na_sentinel is None:
-        raise NotImplementedError("na_sentinel can not be None.")
 
     if size_hint:
         warnings.warn("size_hint is not applicable for cudf.factorize")
@@ -56,10 +118,18 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None):
 
     values = Series(values)
 
-    cats = values._column.dropna().unique().astype(values.dtype)
+    if use_na_sentinel:
+        cats = values._column.dropna()
+    else:
+        cats = values._column
+
+    cats = cats.unique().astype(values.dtype)
+
+    if sort:
+        cats, _ = cats.sort_by_values()
 
     labels = values._column._label_encoding(
-        cats=cats, na_sentinel=na_sentinel
+        cats=cats, na_sentinel=Scalar(na_sentinel)
     ).values
 
     return labels, cats.values if return_cupy_array else Index(cats)

@@ -1014,7 +1014,7 @@ def as_categorical_column(self, dtype, **kwargs) -> ColumnBase:
         cats = self.unique().astype(self.dtype)
         label_dtype = min_unsigned_type(len(cats))
         labels = self._label_encoding(
-            cats=cats, dtype=label_dtype, na_sentinel=1
+            cats=cats, dtype=label_dtype, na_sentinel=cudf.Scalar(1)
         )
 
         # columns include null index in factorization; remove:
@@ -1304,7 +1304,10 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:
         return self
 
     def _label_encoding(
-        self, cats: ColumnBase, dtype: Dtype = None, na_sentinel=-1
+        self,
+        cats: ColumnBase,
+        dtype: Dtype = None,
+        na_sentinel: ScalarLike = None,
     ):
         """
         Convert each value in `self` into an integer code, with `cats`
@@ -1337,13 +1340,22 @@ def _label_encoding(
         """
         from cudf._lib.join import join as cpp_join
 
+        if na_sentinel is None:
+            na_sentinel = cudf.Scalar(-1)
+
         def _return_sentinel_column():
             return cudf.core.column.full(
                 size=len(self), fill_value=na_sentinel, dtype=dtype
             )
 
         if dtype is None:
-            dtype = min_scalar_type(max(len(cats), na_sentinel), 8)
+            dtype = min_scalar_type(
+                max(
+                    len(cats),
+                    -1 if na_sentinel.value is cudf.NA else na_sentinel,
+                ),
+                8,
+            )
 
         if is_mixed_with_object_dtype(self, cats):
             return _return_sentinel_column()
@@ -1363,7 +1375,7 @@ def _return_sentinel_column():
         )
         codes = codes.take(
             right_gather_map, nullify=True, check_bounds=False
-        ).fillna(na_sentinel)
+        ).fillna(na_sentinel.value)
 
         # reorder `codes` so that its values correspond to the
         # values of `self`:

@@ -714,7 +714,13 @@ def _compute_levels_and_codes(self):
 
         codes = {}
         for name, col in self._data.items():
-            code, cats = cudf.Series._from_data({None: col}).factorize()
+            with warnings.catch_warnings():
+                # TODO: Remove this filter when
+                # `na_sentinel` is removed from `factorize`.
+                # This is a filter to not let the warnings from
+                # `factorize` show up in other parts of public APIs.
+                warnings.simplefilter("ignore")
+                code, cats = cudf.Series._from_data({None: col}).factorize()
             codes[name] = code.astype(np.int64)
             levels.append(cudf.Series(cats, name=None))
 

@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
 """Base class for Frame types that only have a single column."""
 
 from __future__ import annotations
@@ -270,14 +270,27 @@ def __cuda_array_interface__(self):
         return self._column.__cuda_array_interface__
 
     @_cudf_nvtx_annotate
-    def factorize(self, na_sentinel=-1):
+    def factorize(self, sort=False, na_sentinel=None, use_na_sentinel=None):
         """Encode the input values as integer labels.
 
         Parameters
         ----------
-        na_sentinel : number
+        sort : bool, default False
+            Sort uniques and shuffle codes to maintain the relationship.
+        na_sentinel : number, default -1
             Value to indicate missing category.
 
+            .. deprecated:: 23.04
+
+               The na_sentinel argument is deprecated and will be removed in
+               a future version of cudf. Specify use_na_sentinel as
+               either True or False.
+        use_na_sentinel : bool, default True
+            If True, the sentinel -1 will be used for NA values.
+            If False, NA values will be encoded as non-negative
+            integers, and NA will be kept in the uniques
+            of the values.
+
         Returns
         -------
         (labels, cats) : (cupy.ndarray, cupy.ndarray or Index)
@@ -295,7 +308,12 @@ def factorize(self, na_sentinel=-1):
         >>> uniques
         StringIndex(['a' 'c'], dtype='object')
         """
-        return cudf.core.algorithms.factorize(self, na_sentinel=na_sentinel)
+        return cudf.core.algorithms.factorize(
+            self,
+            sort=sort,
+            na_sentinel=na_sentinel,
+            use_na_sentinel=use_na_sentinel,
+        )
 
     @_cudf_nvtx_annotate
     def _make_operands_for_binop(

@@ -486,12 +486,57 @@ def test_series_factorize(data, na_sentinel):
 
     with pytest.warns(FutureWarning):
         expected_labels, expected_cats = psr.factorize(na_sentinel=na_sentinel)
-    actual_labels, actual_cats = gsr.factorize(na_sentinel=na_sentinel)
+    with pytest.warns(FutureWarning):
+        actual_labels, actual_cats = gsr.factorize(na_sentinel=na_sentinel)
 
     assert_eq(expected_labels, actual_labels.get())
     assert_eq(expected_cats.values, actual_cats.to_pandas().values)
 
 
+@pytest.mark.parametrize(
+    "data",
+    [
+        [1, 2, 3, 2, 1],
+        [1, 2, None, 3, 1, 1],
+        [],
+        ["a", "b", "c", None, "z", "a"],
+    ],
+)
+@pytest.mark.parametrize("use_na_sentinel", [True, False])
+def test_series_factorize_use_na_sentinel(data, use_na_sentinel):
+    gsr = cudf.Series(data)
+    psr = gsr.to_pandas(nullable=True)
+
+    expected_labels, expected_cats = psr.factorize(
+        use_na_sentinel=use_na_sentinel, sort=True
+    )
+    actual_labels, actual_cats = gsr.factorize(
+        use_na_sentinel=use_na_sentinel, sort=True
+    )
+    assert_eq(expected_labels, actual_labels.get())
+    assert_eq(expected_cats, actual_cats.to_pandas(nullable=True))
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [1, 2, 3, 2, 1],
+        [1, 2, None, 3, 1, 1],
+        [],
+        ["a", "b", "c", None, "z", "a"],
+    ],
+)
+@pytest.mark.parametrize("sort", [True, False])
+def test_series_factorize_sort(data, sort):
+    gsr = cudf.Series(data)
+    psr = gsr.to_pandas(nullable=True)
+
+    expected_labels, expected_cats = psr.factorize(sort=sort)
+    actual_labels, actual_cats = gsr.factorize(sort=sort)
+    assert_eq(expected_labels, actual_labels.get())
+    assert_eq(expected_cats, actual_cats.to_pandas(nullable=True))
+
+
 @pytest.mark.parametrize(
     "data",
     [