diff --git a/docs/cudf/source/api_docs/general_functions.rst b/docs/cudf/source/api_docs/general_functions.rst index 40e1b766dc9..112df2fdf9f 100644 --- a/docs/cudf/source/api_docs/general_functions.rst +++ b/docs/cudf/source/api_docs/general_functions.rst @@ -10,12 +10,14 @@ Data manipulations :toctree: api/ cudf.concat + cudf.crosstab cudf.cut + cudf.factorize cudf.get_dummies cudf.melt + cudf.merge cudf.pivot cudf.pivot_table - cudf.crosstab cudf.unstack Top-level conversions diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst index 386da4055d8..9cd0770431c 100644 --- a/docs/cudf/source/api_docs/series.rst +++ b/docs/cudf/source/api_docs/series.rst @@ -102,7 +102,6 @@ Function application, GroupBy & window :toctree: api/ Series.apply - Series.applymap Series.map Series.groupby Series.rolling diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 73fc1130073..7012496434a 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. import warnings import cupy as cp @@ -7,19 +7,35 @@ from cudf.core.column import as_column from cudf.core.index import Index, RangeIndex from cudf.core.indexed_frame import IndexedFrame +from cudf.core.scalar import Scalar from cudf.core.series import Series -def factorize(values, sort=False, na_sentinel=-1, size_hint=None): +def factorize( + values, sort=False, na_sentinel=None, use_na_sentinel=None, size_hint=None +): """Encode the input values as integer labels Parameters ---------- values: Series, Index, or CuPy array The data to be factorized. + sort : bool, default False + Sort uniques and shuffle codes to maintain the relationship. na_sentinel : number, default -1 Value to indicate missing category. + .. deprecated:: 23.04 + + The na_sentinel argument is deprecated and will be removed in + a future version of cudf. 
Specify use_na_sentinel as + either True or False. + use_na_sentinel : bool, default True + If True, the sentinel -1 will be used for NA values. + If False, NA values will be encoded as non-negative + integers and will not drop the NA from the uniques + of the values. + Returns ------- (labels, cats) : (cupy.ndarray, cupy.ndarray or Index) @@ -27,9 +43,14 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None): - *cats* contains the categories in order that the N-th item corresponds to the (N-1) code. + See Also + -------- + cudf.Series.factorize : Encode the input values of Series. + Examples -------- >>> import cudf + >>> import numpy as np >>> data = cudf.Series(['a', 'c', 'c']) >>> codes, uniques = cudf.factorize(data) >>> codes @@ -37,29 +58,85 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None): >>> uniques StringIndex(['a' 'c'], dtype='object') - See Also - -------- - cudf.Series.factorize : Encode the input values of Series. + When ``use_na_sentinel=True`` (the default), missing values are indicated + in the `codes` with the sentinel value ``-1`` and missing values are not + included in `uniques`. + >>> codes, uniques = cudf.factorize(['b', None, 'a', 'c', 'b']) + >>> codes + array([ 1, -1, 0, 2, 1], dtype=int8) + >>> uniques + StringIndex(['a' 'b' 'c'], dtype='object') + + If NA is in the values, and we want to include NA in the uniques of the + values, it can be achieved by setting ``use_na_sentinel=False``. + + >>> values = np.array([1, 2, 1, np.nan]) + >>> codes, uniques = cudf.factorize(values) + >>> codes + array([ 0, 1, 0, -1], dtype=int8) + >>> uniques + Float64Index([1.0, 2.0], dtype='float64') + >>> codes, uniques = cudf.factorize(values, use_na_sentinel=False) + >>> codes + array([1, 2, 1, 0], dtype=int8) + >>> uniques + Float64Index([<NA>, 1.0, 2.0], dtype='float64') """ - if sort: - raise NotImplementedError( - "Sorting not yet supported during factorization." 
+ # TODO: Drop `na_sentinel` in the next release immediately after + # pandas 2.0 upgrade. + if na_sentinel is not None and use_na_sentinel is not None: + raise ValueError( + "Cannot specify both `na_sentinel` and `use_na_sentinel`; " + f"got `na_sentinel={na_sentinel}` and " + f"`use_na_sentinel={use_na_sentinel}`" ) + + return_cupy_array = isinstance(values, cp.ndarray) + + values = Series(values) + if na_sentinel is None: - raise NotImplementedError("na_sentinel can not be None.") + na_sentinel = ( + -1 + if use_na_sentinel is None or use_na_sentinel + else Scalar(None, dtype=values.dtype) + ) + else: + if na_sentinel is None: + msg = ( + "Specifying `na_sentinel=None` is deprecated, specify " + "`use_na_sentinel=False` instead." + ) + elif na_sentinel == -1: + msg = ( + "Specifying `na_sentinel=-1` is deprecated, specify " + "`use_na_sentinel=True` instead." + ) + else: + msg = ( + "Specifying the specific value to use for `na_sentinel` is " + "deprecated and will be removed in a future version of cudf. 
" + "Specify `use_na_sentinel=True` to use the sentinel value -1, " + "and `use_na_sentinel=False` to encode NA values." + ) + warnings.warn(msg, FutureWarning) if size_hint: warnings.warn("size_hint is not applicable for cudf.factorize") - return_cupy_array = isinstance(values, cp.ndarray) + if use_na_sentinel is None or use_na_sentinel: + cats = values._column.dropna() + else: + cats = values._column - values = Series(values) + cats = cats.unique().astype(values.dtype) - cats = values._column.dropna().unique().astype(values.dtype) + if sort: + cats, _ = cats.sort_by_values() labels = values._column._label_encoding( - cats=cats, na_sentinel=na_sentinel + cats=cats, na_sentinel=Scalar(na_sentinel) ).values return labels, cats.values if return_cupy_array else Index(cats) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index b5f36aa3594..40921b71db5 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1014,7 +1014,7 @@ def as_categorical_column(self, dtype, **kwargs) -> ColumnBase: cats = self.unique().astype(self.dtype) label_dtype = min_unsigned_type(len(cats)) labels = self._label_encoding( - cats=cats, dtype=label_dtype, na_sentinel=1 + cats=cats, dtype=label_dtype, na_sentinel=cudf.Scalar(1) ) # columns include null index in factorization; remove: @@ -1304,7 +1304,10 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: return self def _label_encoding( - self, cats: ColumnBase, dtype: Dtype = None, na_sentinel=-1 + self, + cats: ColumnBase, + dtype: Dtype = None, + na_sentinel: ScalarLike = None, ): """ Convert each value in `self` into an integer code, with `cats` @@ -1337,6 +1340,9 @@ def _label_encoding( """ from cudf._lib.join import join as cpp_join + if na_sentinel is None or na_sentinel.value is cudf.NA: + na_sentinel = cudf.Scalar(-1) + def _return_sentinel_column(): return cudf.core.column.full( size=len(self), fill_value=na_sentinel, 
dtype=dtype @@ -1363,7 +1369,7 @@ def _return_sentinel_column(): ) codes = codes.take( right_gather_map, nullify=True, check_bounds=False - ).fillna(na_sentinel) + ).fillna(na_sentinel.value) # reorder `codes` so that its values correspond to the # values of `self`: diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 573a3f7f1d7..1f26371f797 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -739,7 +739,13 @@ def _compute_levels_and_codes(self): codes = {} for name, col in self._data.items(): - code, cats = cudf.Series._from_data({None: col}).factorize() + with warnings.catch_warnings(): + # TODO: Remove this filter when + # `na_sentinel` is removed from `factorize`. + # This is a filter to not let the warnings from + # `factorize` show up in other parts of public APIs. + warnings.simplefilter("ignore") + code, cats = cudf.Series._from_data({None: col}).factorize() codes[name] = code.astype(np.int64) levels.append(cudf.Series(cats, name=None)) diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index afd06ea3629..c4128621148 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. """Base class for Frame types that only have a single column.""" from __future__ import annotations @@ -270,14 +270,27 @@ def __cuda_array_interface__(self): return self._column.__cuda_array_interface__ @_cudf_nvtx_annotate - def factorize(self, na_sentinel=-1): + def factorize(self, sort=False, na_sentinel=None, use_na_sentinel=None): """Encode the input values as integer labels. Parameters ---------- - na_sentinel : number + sort : bool, default False + Sort uniques and shuffle codes to maintain the relationship. + na_sentinel : number, default -1 Value to indicate missing category. + .. 
deprecated:: 23.04 + + The na_sentinel argument is deprecated and will be removed in + a future version of cudf. Specify use_na_sentinel as + either True or False. + use_na_sentinel : bool, default True + If True, the sentinel -1 will be used for NA values. + If False, NA values will be encoded as non-negative + integers and will not drop the NA from the uniques + of the values. + Returns ------- (labels, cats) : (cupy.ndarray, cupy.ndarray or Index) @@ -295,7 +308,12 @@ def factorize(self, na_sentinel=-1): >>> uniques StringIndex(['a' 'c'], dtype='object') """ - return cudf.core.algorithms.factorize(self, na_sentinel=na_sentinel) + return cudf.core.algorithms.factorize( + self, + sort=sort, + na_sentinel=na_sentinel, + use_na_sentinel=use_na_sentinel, + ) @_cudf_nvtx_annotate def _make_operands_for_binop( diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index eb05468d923..ce519a445ba 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -490,12 +490,57 @@ def test_series_factorize(data, na_sentinel): with pytest.warns(FutureWarning): expected_labels, expected_cats = psr.factorize(na_sentinel=na_sentinel) - actual_labels, actual_cats = gsr.factorize(na_sentinel=na_sentinel) + with pytest.warns(FutureWarning): + actual_labels, actual_cats = gsr.factorize(na_sentinel=na_sentinel) assert_eq(expected_labels, actual_labels.get()) assert_eq(expected_cats.values, actual_cats.to_pandas().values) +@pytest.mark.parametrize( + "data", + [ + [1, 2, 3, 2, 1], + [1, 2, None, 3, 1, 1], + [], + ["a", "b", "c", None, "z", "a"], + ], +) +@pytest.mark.parametrize("use_na_sentinel", [True, False]) +def test_series_factorize_use_na_sentinel(data, use_na_sentinel): + gsr = cudf.Series(data) + psr = gsr.to_pandas(nullable=True) + + expected_labels, expected_cats = psr.factorize( + use_na_sentinel=use_na_sentinel, sort=True + ) + actual_labels, actual_cats = gsr.factorize( + 
use_na_sentinel=use_na_sentinel, sort=True + ) + assert_eq(expected_labels, actual_labels.get()) + assert_eq(expected_cats, actual_cats.to_pandas(nullable=True)) + + +@pytest.mark.parametrize( + "data", + [ + [1, 2, 3, 2, 1], + [1, 2, None, 3, 1, 1], + [], + ["a", "b", "c", None, "z", "a"], + ], +) +@pytest.mark.parametrize("sort", [True, False]) +def test_series_factorize_sort(data, sort): + gsr = cudf.Series(data) + psr = gsr.to_pandas(nullable=True) + + expected_labels, expected_cats = psr.factorize(sort=sort) + actual_labels, actual_cats = gsr.factorize(sort=sort) + assert_eq(expected_labels, actual_labels.get()) + assert_eq(expected_cats, actual_cats.to_pandas(nullable=True)) + + @pytest.mark.parametrize( "data", [