diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 7012496434a..50ec4b774ee 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -11,9 +11,7 @@ from cudf.core.series import Series -def factorize( - values, sort=False, na_sentinel=None, use_na_sentinel=None, size_hint=None -): +def factorize(values, sort=False, use_na_sentinel=True, size_hint=None): """Encode the input values as integer labels Parameters @@ -22,14 +20,6 @@ def factorize( The data to be factorized. sort : bool, default True Sort uniques and shuffle codes to maintain the relationship. - na_sentinel : number, default -1 - Value to indicate missing category. - - .. deprecated:: 23.04 - - The na_sentinel argument is deprecated and will be removed in - a future version of cudf. Specify use_na_sentinel as - either True or False. use_na_sentinel : bool, default True If True, the sentinel -1 will be used for NA values. If False, NA values will be encoded as non-negative @@ -83,51 +73,19 @@ def factorize( >>> uniques Float64Index([, 1.0, 2.0], dtype='float64') """ - # TODO: Drop `na_sentinel` in the next release immediately after - # pandas 2.0 upgrade. - if na_sentinel is not None and use_na_sentinel is not None: - raise ValueError( - "Cannot specify both `na_sentinel` and `use_na_sentile`; " - f"got `na_sentinel={na_sentinel}` and " - f"`use_na_sentinel={use_na_sentinel}`" - ) return_cupy_array = isinstance(values, cp.ndarray) values = Series(values) - if na_sentinel is None: - na_sentinel = ( - -1 - if use_na_sentinel is None or use_na_sentinel - else Scalar(None, dtype=values.dtype) - ) - else: - if na_sentinel is None: - msg = ( - "Specifying `na_sentinel=None` is deprecated, specify " - "`use_na_sentinel=False` instead." - ) - elif na_sentinel == -1: - msg = ( - "Specifying `na_sentinel=-1` is deprecated, specify " - "`use_na_sentinel=True` instead." - ) - else: - msg = ( - "Specifying the specific value to use for `na_sentinel` is " - "deprecated and will be removed in a future version of cudf. " - "Specify `use_na_sentinel=True` to use the sentinel value -1, " - "and `use_na_sentinel=False` to encode NA values.", - ) - warnings.warn(msg, FutureWarning) - if size_hint: warnings.warn("size_hint is not applicable for cudf.factorize") - if use_na_sentinel is None or use_na_sentinel: + if use_na_sentinel: + na_sentinel = Scalar(-1) cats = values._column.dropna() else: + na_sentinel = Scalar(None, dtype=values.dtype) cats = values._column cats = cats.unique().astype(values.dtype) @@ -136,7 +94,7 @@ def factorize( cats, _ = cats.sort_by_values() labels = values._column._label_encoding( - cats=cats, na_sentinel=Scalar(na_sentinel) + cats=cats, na_sentinel=na_sentinel ).values return labels, cats.values if return_cupy_array else Index(cats) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index d9d1aecb9d6..4cd3f0b3837 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -671,13 +671,7 @@ def _compute_levels_and_codes(self): codes = {} for name, col in self._data.items(): - with warnings.catch_warnings(): - # TODO: Remove this filter when - # `na_sentinel` is removed from `factorize`. - # This is a filter to not let the warnings from - # `factorize` show up in other parts of public APIs. - warnings.simplefilter("ignore") - code, cats = cudf.Series._from_data({None: col}).factorize() + code, cats = cudf.Series._from_data({None: col}).factorize() codes[name] = code.astype(np.int64) levels.append(cudf.Series(cats, name=None)) diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 9e380e63ae0..1ffb48fe19e 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -249,21 +249,13 @@ def __cuda_array_interface__(self): return self._column.__cuda_array_interface__ @_cudf_nvtx_annotate - def factorize(self, sort=False, na_sentinel=None, use_na_sentinel=None): + def factorize(self, sort=False, use_na_sentinel=True): """Encode the input values as integer labels. Parameters ---------- sort : bool, default True Sort uniques and shuffle codes to maintain the relationship. - na_sentinel : number, default -1 - Value to indicate missing category. - - .. deprecated:: 23.04 - - The na_sentinel argument is deprecated and will be removed in - a future version of cudf. Specify use_na_sentinel as - either True or False. use_na_sentinel : bool, default True If True, the sentinel -1 will be used for NA values. If False, NA values will be encoded as non-negative @@ -290,7 +282,6 @@ def factorize(self, sort=False, na_sentinel=None, use_na_sentinel=None): return cudf.core.algorithms.factorize( self, sort=sort, - na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel, ) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 682fccda8dc..c4acea6cc93 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -470,29 +470,6 @@ def test_series_describe_other_types(ps): assert_eq(expected.astype("str"), actual) -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 2, 1], - [1, 2, None, 3, 1, 1], - [], - ["a", "b", "c", None, "z", "a"], - ], -) -@pytest.mark.parametrize("na_sentinel", [99999, 11, -1, 0]) -def test_series_factorize(data, na_sentinel): - gsr = cudf.Series(data) - psr = gsr.to_pandas() - - with pytest.warns(FutureWarning): - expected_labels, expected_cats = psr.factorize(na_sentinel=na_sentinel) - with pytest.warns(FutureWarning): - actual_labels, actual_cats = gsr.factorize(na_sentinel=na_sentinel) - - assert_eq(expected_labels, actual_labels.get()) - assert_eq(expected_cats.values, actual_cats.to_pandas().values) - - @pytest.mark.parametrize( "data", [