From 0eaa4f2425c32eb457473b9ea0dfab8b40f31ee8 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 21 Feb 2023 12:10:10 -0800
Subject: [PATCH 1/8] Deprecate na_sentinel

---
 .../source/api_docs/general_functions.rst     |  4 +-
 docs/cudf/source/api_docs/series.rst          |  1 -
 python/cudf/cudf/core/algorithms.py           | 94 ++++++++++++++++---
 python/cudf/cudf/core/column/column.py        | 13 ++-
 python/cudf/cudf/core/multiindex.py           | 12 ++-
 python/cudf/cudf/core/single_column_frame.py  | 26 ++++-
 python/cudf/cudf/tests/test_series.py         | 29 +++++-
 7 files changed, 155 insertions(+), 24 deletions(-)

diff --git a/docs/cudf/source/api_docs/general_functions.rst b/docs/cudf/source/api_docs/general_functions.rst
index 40e1b766dc9..112df2fdf9f 100644
--- a/docs/cudf/source/api_docs/general_functions.rst
+++ b/docs/cudf/source/api_docs/general_functions.rst
@@ -10,12 +10,14 @@ Data manipulations
    :toctree: api/
 
    cudf.concat
+   cudf.crosstab
    cudf.cut
+   cudf.factorize
    cudf.get_dummies
    cudf.melt
+   cudf.merge
    cudf.pivot
    cudf.pivot_table
-   cudf.crosstab
    cudf.unstack
 
 Top-level conversions
diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst
index 386da4055d8..9cd0770431c 100644
--- a/docs/cudf/source/api_docs/series.rst
+++ b/docs/cudf/source/api_docs/series.rst
@@ -102,7 +102,6 @@ Function application, GroupBy & window
    :toctree: api/
 
    Series.apply
-   Series.applymap
    Series.map
    Series.groupby
    Series.rolling
diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py
index 73fc1130073..6d25a376385 100644
--- a/python/cudf/cudf/core/algorithms.py
+++ b/python/cudf/cudf/core/algorithms.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 import warnings
 
 import cupy as cp
@@ -7,19 +7,35 @@
 from cudf.core.column import as_column
 from cudf.core.index import Index, RangeIndex
 from cudf.core.indexed_frame import IndexedFrame
+from cudf.core.scalar import Scalar
 from cudf.core.series import Series
 
 
-def factorize(values, sort=False, na_sentinel=-1, size_hint=None):
+def factorize(
+    values, sort=False, na_sentinel=None, use_na_sentinel=None, size_hint=None
+):
     """Encode the input values as integer labels
 
     Parameters
     ----------
     values: Series, Index, or CuPy array
         The data to be factorized.
+    sort : bool, default False
+        Sort uniques and shuffle codes to maintain the relationship.
     na_sentinel : number, default -1
         Value to indicate missing category.
 
+        .. deprecated:: 23.04
+
+           The na_sentinel argument is deprecated and will be removed in
+           a future version of cudf. Specify use_na_sentinel as
+           either True or False.
+    use_na_sentinel : bool, default True
+        If True, the sentinel -1 will be used for NA values.
+        If False, NA values will be encoded as non-negative
+        integers and will not drop the NA from the uniques
+        of the values.
+
     Returns
     -------
     (labels, cats) : (cupy.ndarray, cupy.ndarray or Index)
@@ -27,9 +43,14 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None):
         - *cats* contains the categories in order that the N-th
             item corresponds to the (N-1) code.
 
+    See Also
+    --------
+    cudf.Series.factorize : Encode the input values of Series.
+
     Examples
     --------
     >>> import cudf
+    >>> import numpy as np
     >>> data = cudf.Series(['a', 'c', 'c'])
     >>> codes, uniques = cudf.factorize(data)
     >>> codes
@@ -37,17 +58,60 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None):
     >>> uniques
     StringIndex(['a' 'c'], dtype='object')
 
-    See Also
-    --------
-    cudf.Series.factorize : Encode the input values of Series.
+    When ``use_na_sentinel=True`` (the default), missing values are indicated
+    in the `codes` with the sentinel value ``-1`` and missing values are not
+    included in `uniques`.
+
+    >>> codes, uniques = cudf.factorize(['b', None, 'a', 'c', 'b'])
+    >>> codes
+    array([ 1, -1,  0,  2,  1], dtype=int8)
+    >>> uniques
+    StringIndex(['a' 'b' 'c'], dtype='object')
 
+    If NA is in the values, and we want to include NA in the uniques of the
+    values, it can be achieved by setting ``use_na_sentinel=False``.
+
+    >>> values = np.array([1, 2, 1, np.nan])
+    >>> codes, uniques = cudf.factorize(values)
+    >>> codes
+    array([ 0,  1,  0, -1], dtype=int8)
+    >>> uniques
+    Float64Index([1.0, 2.0], dtype='float64')
+    >>> codes, uniques = cudf.factorize(values, use_na_sentinel=False)
+    >>> codes
+    array([1, 2, 1, 0], dtype=int8)
+    >>> uniques
+    Float64Index([<NA>, 1.0, 2.0], dtype='float64')
     """
-    if sort:
-        raise NotImplementedError(
-            "Sorting not yet supported during factorization."
+    # TODO: Drop `na_sentinel` in the next release immediately after
+    # pandas 2.0 upgrade.
+    if na_sentinel is not None:
+        warnings.warn(
+            "Specifying the specific value to use for `na_sentinel` is "
+            "deprecated and will be removed in a future version of cudf. "
+            "Specify `use_na_sentinel=True` to use the sentinel value -1, "
+            "and `use_na_sentinel=False` to encode NA values.",
+            FutureWarning,
+        )
+
+    if use_na_sentinel is not None and na_sentinel is not None:
+        raise ValueError(
+            "Cannot specify both `na_sentinel` and `use_na_sentile`; "
+            f"got `na_sentinel={na_sentinel}` and "
+            f"`use_na_sentinel={use_na_sentinel}`"
+        )
+    elif use_na_sentinel is None and na_sentinel is None:
+        use_na_sentinel = True
+        na_sentinel = -1
+
+    if use_na_sentinel is None:
+        use_na_sentinel = True
+    elif na_sentinel is None:
+        na_sentinel = -1
+    else:
+        na_sentinel = (
+            -1 if use_na_sentinel else Scalar(None, dtype=values.dtype)
         )
-    if na_sentinel is None:
-        raise NotImplementedError("na_sentinel can not be None.")
 
     if size_hint:
         warnings.warn("size_hint is not applicable for cudf.factorize")
@@ -56,7 +120,15 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None):
 
     values = Series(values)
 
-    cats = values._column.dropna().unique().astype(values.dtype)
+    if use_na_sentinel:
+        cats = values._column.dropna()
+    else:
+        cats = values._column
+
+    cats = cats.unique().astype(values.dtype)
+
+    if sort:
+        cats, _ = cats.sort_by_values()
 
     labels = values._column._label_encoding(
         cats=cats, na_sentinel=na_sentinel
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index fb1bcf6d673..66a286bd84d 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -22,13 +22,13 @@
 
 import cupy
 import numpy as np
-import pandas as pd
 import pyarrow as pa
 from numba import cuda
 
 import rmm
 
 import cudf
+import pandas as pd
 from cudf import _lib as libcudf
 from cudf._lib.column import Column
 from cudf._lib.null_mask import (
@@ -1343,7 +1343,16 @@ def _return_sentinel_column():
             )
 
         if dtype is None:
-            dtype = min_scalar_type(max(len(cats), na_sentinel), 8)
+            dtype = min_scalar_type(
+                max(
+                    len(cats),
+                    -1
+                    if isinstance(na_sentinel, cudf.Scalar)
+                    and na_sentinel.value is cudf.NA
+                    else na_sentinel,
+                ),
+                8,
+            )
 
         if is_mixed_with_object_dtype(self, cats):
             return _return_sentinel_column()
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 783c3996400..f5aa5214f74 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -13,10 +13,9 @@
 
 import cupy as cp
 import numpy as np
-import pandas as pd
-from pandas._config import get_option
 
 import cudf
+import pandas as pd
 from cudf import _lib as libcudf
 from cudf._typing import DataFrameOrSeries
 from cudf.api.types import is_integer, is_list_like, is_object_dtype
@@ -31,6 +30,7 @@
 )
 from cudf.utils.docutils import doc_apply
 from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate
+from pandas._config import get_option
 
 
 def _maybe_indices_to_slice(indices: cp.ndarray) -> Union[slice, cp.ndarray]:
@@ -714,7 +714,13 @@ def _compute_levels_and_codes(self):
 
         codes = {}
         for name, col in self._data.items():
-            code, cats = cudf.Series._from_data({None: col}).factorize()
+            with warnings.catch_warnings():
+                # TODO: Remove this filter when
+                # `na_sentinel` is removed from `factorize`.
+                # This is a filter to not let the warnings from
+                # `factorize` show up in other parts of public APIs.
+                warnings.simplefilter("ignore")
+                code, cats = cudf.Series._from_data({None: col}).factorize()
             codes[name] = code.astype(np.int64)
             levels.append(cudf.Series(cats, name=None))
 
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index afd06ea3629..c4128621148 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
 """Base class for Frame types that only have a single column."""
 
 from __future__ import annotations
@@ -270,14 +270,27 @@ def __cuda_array_interface__(self):
         return self._column.__cuda_array_interface__
 
     @_cudf_nvtx_annotate
-    def factorize(self, na_sentinel=-1):
+    def factorize(self, sort=False, na_sentinel=None, use_na_sentinel=None):
         """Encode the input values as integer labels.
 
         Parameters
         ----------
-        na_sentinel : number
+        sort : bool, default False
+            Sort uniques and shuffle codes to maintain the relationship.
+        na_sentinel : number, default -1
             Value to indicate missing category.
 
+            .. deprecated:: 23.04
+
+               The na_sentinel argument is deprecated and will be removed in
+               a future version of cudf. Specify use_na_sentinel as
+               either True or False.
+        use_na_sentinel : bool, default True
+            If True, the sentinel -1 will be used for NA values.
+            If False, NA values will be encoded as non-negative
+            integers and will not drop the NA from the uniques
+            of the values.
+
         Returns
         -------
         (labels, cats) : (cupy.ndarray, cupy.ndarray or Index)
@@ -295,7 +308,12 @@ def factorize(self, na_sentinel=-1):
         >>> uniques
         StringIndex(['a' 'c'], dtype='object')
         """
-        return cudf.core.algorithms.factorize(self, na_sentinel=na_sentinel)
+        return cudf.core.algorithms.factorize(
+            self,
+            sort=sort,
+            na_sentinel=na_sentinel,
+            use_na_sentinel=use_na_sentinel,
+        )
 
     @_cudf_nvtx_annotate
     def _make_operands_for_binop(
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index b3c7c9ac9bb..bc54d1e51f7 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -8,11 +8,11 @@
 
 import cupy as cp
 import numpy as np
-import pandas as pd
 import pyarrow as pa
 import pytest
 
 import cudf
+import pandas as pd
 from cudf.core._compat import PANDAS_GE_120, PANDAS_LT_140
 from cudf.testing._utils import (
     NUMERIC_TYPES,
@@ -486,12 +486,37 @@ def test_series_factorize(data, na_sentinel):
 
     with pytest.warns(FutureWarning):
         expected_labels, expected_cats = psr.factorize(na_sentinel=na_sentinel)
-    actual_labels, actual_cats = gsr.factorize(na_sentinel=na_sentinel)
+    with pytest.warns(FutureWarning):
+        actual_labels, actual_cats = gsr.factorize(na_sentinel=na_sentinel)
 
     assert_eq(expected_labels, actual_labels.get())
     assert_eq(expected_cats.values, actual_cats.to_pandas().values)
 
 
+@pytest.mark.parametrize(
+    "data",
+    [
+        [1, 2, 3, 2, 1],
+        [1, 2, None, 3, 1, 1],
+        [],
+        ["a", "b", "c", None, "z", "a"],
+    ],
+)
+@pytest.mark.parametrize("use_na_sentinel", [True, False])
+def test_series_factorize_use_na_sentinel(data, use_na_sentinel):
+    gsr = cudf.Series(data)
+    psr = gsr.to_pandas(nullable=True)
+
+    expected_labels, expected_cats = psr.factorize(
+        use_na_sentinel=use_na_sentinel, sort=True
+    )
+    actual_labels, actual_cats = gsr.factorize(
+        use_na_sentinel=use_na_sentinel, sort=True
+    )
+    assert_eq(expected_labels, actual_labels.get())
+    assert_eq(expected_cats, actual_cats.to_pandas(nullable=True))
+
+
 @pytest.mark.parametrize(
     "data",
     [

From 12a910273c4d6cf7ac442d7171964ce476bc041b Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 21 Feb 2023 12:50:06 -0800
Subject: [PATCH 2/8] undo isort

---
 python/cudf/cudf/core/column/column.py | 2 +-
 python/cudf/cudf/core/multiindex.py    | 4 ++--
 python/cudf/cudf/tests/test_series.py  | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 66a286bd84d..414d9cd5f35 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -22,13 +22,13 @@
 
 import cupy
 import numpy as np
+import pandas as pd
 import pyarrow as pa
 from numba import cuda
 
 import rmm
 
 import cudf
-import pandas as pd
 from cudf import _lib as libcudf
 from cudf._lib.column import Column
 from cudf._lib.null_mask import (
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index f5aa5214f74..17b3e611625 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -13,9 +13,10 @@
 
 import cupy as cp
 import numpy as np
+import pandas as pd
+from pandas._config import get_option
 
 import cudf
-import pandas as pd
 from cudf import _lib as libcudf
 from cudf._typing import DataFrameOrSeries
 from cudf.api.types import is_integer, is_list_like, is_object_dtype
@@ -30,7 +31,6 @@
 )
 from cudf.utils.docutils import doc_apply
 from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate
-from pandas._config import get_option
 
 
 def _maybe_indices_to_slice(indices: cp.ndarray) -> Union[slice, cp.ndarray]:
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index bc54d1e51f7..b2a0369dfd3 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -8,11 +8,11 @@
 
 import cupy as cp
 import numpy as np
+import pandas as pd
 import pyarrow as pa
 import pytest
 
 import cudf
-import pandas as pd
 from cudf.core._compat import PANDAS_GE_120, PANDAS_LT_140
 from cudf.testing._utils import (
     NUMERIC_TYPES,

From 7296bf7775654b2a05720886f854dbce55465fc2 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 22 Feb 2023 14:33:49 -0800
Subject: [PATCH 3/8] simplify if/else

---
 python/cudf/cudf/core/algorithms.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py
index 6d25a376385..c04aaa00196 100644
--- a/python/cudf/cudf/core/algorithms.py
+++ b/python/cudf/cudf/core/algorithms.py
@@ -103,12 +103,10 @@ def factorize(
     elif use_na_sentinel is None and na_sentinel is None:
         use_na_sentinel = True
         na_sentinel = -1
-
-    if use_na_sentinel is None:
+    elif use_na_sentinel is None:
         use_na_sentinel = True
-    elif na_sentinel is None:
-        na_sentinel = -1
     else:
+        # use_sentinel is either True or False, na_sentinel is None
         na_sentinel = (
             -1 if use_na_sentinel else Scalar(None, dtype=values.dtype)
         )

From 71f98c441bd39c84a9420c75de9159f508e5415f Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 22 Feb 2023 15:39:42 -0800
Subject: [PATCH 4/8] accept only scalars in _label_encoding for na_sentinel

---
 python/cudf/cudf/core/algorithms.py    |  2 +-
 python/cudf/cudf/core/column/column.py | 17 ++++++++++-------
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py
index c04aaa00196..4e165e5f396 100644
--- a/python/cudf/cudf/core/algorithms.py
+++ b/python/cudf/cudf/core/algorithms.py
@@ -129,7 +129,7 @@ def factorize(
         cats, _ = cats.sort_by_values()
 
     labels = values._column._label_encoding(
-        cats=cats, na_sentinel=na_sentinel
+        cats=cats, na_sentinel=Scalar(na_sentinel)
     ).values
 
     return labels, cats.values if return_cupy_array else Index(cats)
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 414d9cd5f35..8e6b8fe9f70 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1014,7 +1014,7 @@ def as_categorical_column(self, dtype, **kwargs) -> ColumnBase:
         cats = self.unique().astype(self.dtype)
         label_dtype = min_unsigned_type(len(cats))
         labels = self._label_encoding(
-            cats=cats, dtype=label_dtype, na_sentinel=1
+            cats=cats, dtype=label_dtype, na_sentinel=cudf.Scalar(1)
         )
 
         # columns include null index in factorization; remove:
@@ -1304,7 +1304,10 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:
         return self
 
     def _label_encoding(
-        self, cats: ColumnBase, dtype: Dtype = None, na_sentinel=-1
+        self,
+        cats: ColumnBase,
+        dtype: Dtype = None,
+        na_sentinel: ScalarLike = None,
     ):
         """
         Convert each value in `self` into an integer code, with `cats`
@@ -1337,6 +1340,9 @@ def _label_encoding(
         """
         from cudf._lib.join import join as cpp_join
 
+        if na_sentinel is None:
+            na_sentinel = cudf.Scalar(-1)
+
         def _return_sentinel_column():
             return cudf.core.column.full(
                 size=len(self), fill_value=na_sentinel, dtype=dtype
@@ -1346,10 +1352,7 @@ def _return_sentinel_column():
             dtype = min_scalar_type(
                 max(
                     len(cats),
-                    -1
-                    if isinstance(na_sentinel, cudf.Scalar)
-                    and na_sentinel.value is cudf.NA
-                    else na_sentinel,
+                    -1 if na_sentinel.value is cudf.NA else na_sentinel,
                 ),
                 8,
             )
@@ -1372,7 +1375,7 @@ def _return_sentinel_column():
         )
         codes = codes.take(
             right_gather_map, nullify=True, check_bounds=False
-        ).fillna(na_sentinel)
+        ).fillna(na_sentinel.value)
 
         # reorder `codes` so that its values correspond to the
         # values of `self`:

From be44b5894b849e3f3510c0d49699c5831838146f Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 22 Feb 2023 15:42:14 -0800
Subject: [PATCH 5/8] add dedicated sort tests

---
 python/cudf/cudf/tests/test_series.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index b2a0369dfd3..4d8848b701c 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -517,6 +517,26 @@ def test_series_factorize_use_na_sentinel(data, use_na_sentinel):
     assert_eq(expected_cats, actual_cats.to_pandas(nullable=True))
 
 
+@pytest.mark.parametrize(
+    "data",
+    [
+        [1, 2, 3, 2, 1],
+        [1, 2, None, 3, 1, 1],
+        [],
+        ["a", "b", "c", None, "z", "a"],
+    ],
+)
+@pytest.mark.parametrize("sort", [True, False])
+def test_series_factorize_sort(data, sort):
+    gsr = cudf.Series(data)
+    psr = gsr.to_pandas(nullable=True)
+
+    expected_labels, expected_cats = psr.factorize(sort=sort)
+    actual_labels, actual_cats = gsr.factorize(sort=sort)
+    assert_eq(expected_labels, actual_labels.get())
+    assert_eq(expected_cats, actual_cats.to_pandas(nullable=True))
+
+
 @pytest.mark.parametrize(
     "data",
     [

From e47a85c5e0135736f1e34347a0e1d3c184659404 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Thu, 23 Feb 2023 16:07:28 -0600
Subject: [PATCH 6/8] Update python/cudf/cudf/core/algorithms.py

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
---
 python/cudf/cudf/core/algorithms.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py
index 4e165e5f396..a8a07b2c3bb 100644
--- a/python/cudf/cudf/core/algorithms.py
+++ b/python/cudf/cudf/core/algorithms.py
@@ -96,7 +96,7 @@ def factorize(
 
     if use_na_sentinel is not None and na_sentinel is not None:
         raise ValueError(
-            "Cannot specify both `na_sentinel` and `use_na_sentile`; "
+            "Cannot specify both `na_sentinel` and `use_na_sentinel`; "
             f"got `na_sentinel={na_sentinel}` and "
             f"`use_na_sentinel={use_na_sentinel}`"
         )

From 4ce1479beec084203b1d6cf102021370febba65c Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Mon, 27 Feb 2023 11:07:54 -0800
Subject: [PATCH 7/8] update warnings

---
 python/cudf/cudf/core/algorithms.py | 55 ++++++++++++++++-------------
 1 file changed, 31 insertions(+), 24 deletions(-)

diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py
index a8a07b2c3bb..7012496434a 100644
--- a/python/cudf/cudf/core/algorithms.py
+++ b/python/cudf/cudf/core/algorithms.py
@@ -85,40 +85,47 @@ def factorize(
     """
     # TODO: Drop `na_sentinel` in the next release immediately after
     # pandas 2.0 upgrade.
-    if na_sentinel is not None:
-        warnings.warn(
-            "Specifying the specific value to use for `na_sentinel` is "
-            "deprecated and will be removed in a future version of cudf. "
-            "Specify `use_na_sentinel=True` to use the sentinel value -1, "
-            "and `use_na_sentinel=False` to encode NA values.",
-            FutureWarning,
-        )
-
-    if use_na_sentinel is not None and na_sentinel is not None:
+    if na_sentinel is not None and use_na_sentinel is not None:
         raise ValueError(
-            "Cannot specify both `na_sentinel` and `use_na_sentinel`; "
+            "Cannot specify both `na_sentinel` and `use_na_sentinel`; "
             f"got `na_sentinel={na_sentinel}` and "
             f"`use_na_sentinel={use_na_sentinel}`"
         )
-    elif use_na_sentinel is None and na_sentinel is None:
-        use_na_sentinel = True
-        na_sentinel = -1
-    elif use_na_sentinel is None:
-        use_na_sentinel = True
-    else:
-        # use_sentinel is either True or False, na_sentinel is None
+
+    return_cupy_array = isinstance(values, cp.ndarray)
+
+    values = Series(values)
+
+    if na_sentinel is None:
         na_sentinel = (
-            -1 if use_na_sentinel else Scalar(None, dtype=values.dtype)
+            -1
+            if use_na_sentinel is None or use_na_sentinel
+            else Scalar(None, dtype=values.dtype)
         )
+    else:
+        if na_sentinel is None:
+            msg = (
+                "Specifying `na_sentinel=None` is deprecated, specify "
+                "`use_na_sentinel=False` instead."
+            )
+        elif na_sentinel == -1:
+            msg = (
+                "Specifying `na_sentinel=-1` is deprecated, specify "
+                "`use_na_sentinel=True` instead."
+            )
+        else:
+            msg = (
+                "Specifying the specific value to use for `na_sentinel` is "
+                "deprecated and will be removed in a future version of cudf. "
+                "Specify `use_na_sentinel=True` to use the sentinel value -1, "
+                "and `use_na_sentinel=False` to encode NA values."
+            )
+        warnings.warn(msg, FutureWarning)
 
     if size_hint:
         warnings.warn("size_hint is not applicable for cudf.factorize")
 
-    return_cupy_array = isinstance(values, cp.ndarray)
-
-    values = Series(values)
-
-    if use_na_sentinel:
+    if use_na_sentinel is None or use_na_sentinel:
         cats = values._column.dropna()
     else:
         cats = values._column

From 54b9be7b92f0de20801e7725724853bb7d120fdb Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Mon, 6 Mar 2023 15:40:25 -0800
Subject: [PATCH 8/8] simplify

---
 python/cudf/cudf/core/column/column.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index bb6071512a3..40921b71db5 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1340,7 +1340,7 @@ def _label_encoding(
         """
         from cudf._lib.join import join as cpp_join
 
-        if na_sentinel is None:
+        if na_sentinel is None or na_sentinel.value is cudf.NA:
             na_sentinel = cudf.Scalar(-1)
 
         def _return_sentinel_column():
@@ -1349,13 +1349,7 @@ def _return_sentinel_column():
             )
 
         if dtype is None:
-            dtype = min_scalar_type(
-                max(
-                    len(cats),
-                    -1 if na_sentinel.value is cudf.NA else na_sentinel,
-                ),
-                8,
-            )
+            dtype = min_scalar_type(max(len(cats), na_sentinel), 8)
 
         if is_mixed_with_object_dtype(self, cats):
             return _return_sentinel_column()