Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Deprecate na_sentinel in factorize #12817

Merged
merged 18 commits into from
Mar 7, 2023
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion docs/cudf/source/api_docs/general_functions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,14 @@ Data manipulations
:toctree: api/

cudf.concat
cudf.crosstab
cudf.cut
cudf.factorize
cudf.get_dummies
cudf.melt
cudf.merge
cudf.pivot
cudf.pivot_table
cudf.crosstab
cudf.unstack

Top-level conversions
Expand Down
1 change: 0 additions & 1 deletion docs/cudf/source/api_docs/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,6 @@ Function application, GroupBy & window
:toctree: api/

Series.apply
Series.applymap
Series.map
Series.groupby
Series.rolling
Expand Down
94 changes: 82 additions & 12 deletions python/cudf/cudf/core/algorithms.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
import warnings

import cupy as cp
Expand All @@ -7,47 +7,109 @@
from cudf.core.column import as_column
from cudf.core.index import Index, RangeIndex
from cudf.core.indexed_frame import IndexedFrame
from cudf.core.scalar import Scalar
from cudf.core.series import Series


def factorize(values, sort=False, na_sentinel=-1, size_hint=None):
def factorize(
values, sort=False, na_sentinel=None, use_na_sentinel=None, size_hint=None
):
"""Encode the input values as integer labels

Parameters
----------
values: Series, Index, or CuPy array
The data to be factorized.
sort : bool, default False
Sort uniques and shuffle codes to maintain the relationship.
na_sentinel : number, default -1
Value to indicate missing category.

.. deprecated:: 23.04

The na_sentinel argument is deprecated and will be removed in
a future version of cudf. Specify use_na_sentinel as
either True or False.
use_na_sentinel : bool, default True
If True, the sentinel -1 will be used for NA values.
If False, NA values will be encoded as non-negative
integers and will not drop the NA from the uniques
of the values.

Returns
-------
(labels, cats) : (cupy.ndarray, cupy.ndarray or Index)
- *labels* contains the encoded values
- *cats* contains the categories in order that the N-th
item corresponds to the (N-1) code.

See Also
--------
cudf.Series.factorize : Encode the input values of Series.

Examples
--------
>>> import cudf
>>> import numpy as np
>>> data = cudf.Series(['a', 'c', 'c'])
>>> codes, uniques = cudf.factorize(data)
>>> codes
array([0, 1, 1], dtype=int8)
>>> uniques
StringIndex(['a' 'c'], dtype='object')

See Also
--------
cudf.Series.factorize : Encode the input values of Series.
When ``use_na_sentinel=True`` (the default), missing values are indicated
in the `codes` with the sentinel value ``-1`` and missing values are not
included in `uniques`.

>>> codes, uniques = cudf.factorize(['b', None, 'a', 'c', 'b'])
>>> codes
array([ 1, -1, 0, 2, 1], dtype=int8)
>>> uniques
StringIndex(['a' 'b' 'c'], dtype='object')

If NA is in the values, and we want to include NA in the uniques of the
values, it can be achieved by setting ``use_na_sentinel=False``.

>>> values = np.array([1, 2, 1, np.nan])
>>> codes, uniques = cudf.factorize(values)
>>> codes
array([ 0, 1, 0, -1], dtype=int8)
>>> uniques
Float64Index([1.0, 2.0], dtype='float64')
>>> codes, uniques = cudf.factorize(values, use_na_sentinel=False)
>>> codes
array([1, 2, 1, 0], dtype=int8)
>>> uniques
Float64Index([<NA>, 1.0, 2.0], dtype='float64')
"""
if sort:
raise NotImplementedError(
"Sorting not yet supported during factorization."
# TODO: Drop `na_sentinel` in the next release immediately after
# pandas 2.0 upgrade.
if na_sentinel is not None:
warnings.warn(
"Specifying the specific value to use for `na_sentinel` is "
"deprecated and will be removed in a future version of cudf. "
"Specify `use_na_sentinel=True` to use the sentinel value -1, "
"and `use_na_sentinel=False` to encode NA values.",
FutureWarning,
)

if use_na_sentinel is not None and na_sentinel is not None:
raise ValueError(
"Cannot specify both `na_sentinel` and `use_na_sentinel`; "
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
f"got `na_sentinel={na_sentinel}` and "
f"`use_na_sentinel={use_na_sentinel}`"
)
elif use_na_sentinel is None and na_sentinel is None:
use_na_sentinel = True
na_sentinel = -1
elif use_na_sentinel is None:
use_na_sentinel = True
else:
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
# use_na_sentinel is either True or False, na_sentinel is None
na_sentinel = (
-1 if use_na_sentinel else Scalar(None, dtype=values.dtype)
)
if na_sentinel is None:
raise NotImplementedError("na_sentinel can not be None.")

if size_hint:
warnings.warn("size_hint is not applicable for cudf.factorize")
Expand All @@ -56,10 +118,18 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None):

values = Series(values)

cats = values._column.dropna().unique().astype(values.dtype)
if use_na_sentinel:
cats = values._column.dropna()
else:
cats = values._column

cats = cats.unique().astype(values.dtype)

if sort:
cats, _ = cats.sort_by_values()

labels = values._column._label_encoding(
cats=cats, na_sentinel=na_sentinel
cats=cats, na_sentinel=Scalar(na_sentinel)
).values

return labels, cats.values if return_cupy_array else Index(cats)
Expand Down
20 changes: 16 additions & 4 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -1014,7 +1014,7 @@ def as_categorical_column(self, dtype, **kwargs) -> ColumnBase:
cats = self.unique().astype(self.dtype)
label_dtype = min_unsigned_type(len(cats))
labels = self._label_encoding(
cats=cats, dtype=label_dtype, na_sentinel=1
cats=cats, dtype=label_dtype, na_sentinel=cudf.Scalar(1)
)

# columns include null index in factorization; remove:
Expand Down Expand Up @@ -1304,7 +1304,10 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:
return self

def _label_encoding(
self, cats: ColumnBase, dtype: Dtype = None, na_sentinel=-1
self,
cats: ColumnBase,
dtype: Dtype = None,
na_sentinel: ScalarLike = None,
):
"""
Convert each value in `self` into an integer code, with `cats`
Expand Down Expand Up @@ -1337,13 +1340,22 @@ def _label_encoding(
"""
from cudf._lib.join import join as cpp_join

if na_sentinel is None:
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
na_sentinel = cudf.Scalar(-1)

def _return_sentinel_column():
return cudf.core.column.full(
size=len(self), fill_value=na_sentinel, dtype=dtype
)

if dtype is None:
dtype = min_scalar_type(max(len(cats), na_sentinel), 8)
dtype = min_scalar_type(
max(
len(cats),
-1 if na_sentinel.value is cudf.NA else na_sentinel,
),
8,
)

if is_mixed_with_object_dtype(self, cats):
return _return_sentinel_column()
Expand All @@ -1363,7 +1375,7 @@ def _return_sentinel_column():
)
codes = codes.take(
right_gather_map, nullify=True, check_bounds=False
).fillna(na_sentinel)
).fillna(na_sentinel.value)

# reorder `codes` so that its values correspond to the
# values of `self`:
Expand Down
8 changes: 7 additions & 1 deletion python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -714,7 +714,13 @@ def _compute_levels_and_codes(self):

codes = {}
for name, col in self._data.items():
code, cats = cudf.Series._from_data({None: col}).factorize()
with warnings.catch_warnings():
# TODO: Remove this filter when
# `na_sentinel` is removed from `factorize`.
# This is a filter to not let the warnings from
# `factorize` show up in other parts of public APIs.
warnings.simplefilter("ignore")
code, cats = cudf.Series._from_data({None: col}).factorize()
codes[name] = code.astype(np.int64)
levels.append(cudf.Series(cats, name=None))

Expand Down
26 changes: 22 additions & 4 deletions python/cudf/cudf/core/single_column_frame.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION.
# Copyright (c) 2021-2023, NVIDIA CORPORATION.
"""Base class for Frame types that only have a single column."""

from __future__ import annotations
Expand Down Expand Up @@ -270,14 +270,27 @@ def __cuda_array_interface__(self):
return self._column.__cuda_array_interface__

@_cudf_nvtx_annotate
def factorize(self, na_sentinel=-1):
def factorize(self, sort=False, na_sentinel=None, use_na_sentinel=None):
"""Encode the input values as integer labels.

Parameters
----------
na_sentinel : number
sort : bool, default False
Sort uniques and shuffle codes to maintain the relationship.
na_sentinel : number, default -1
Value to indicate missing category.

.. deprecated:: 23.04

The na_sentinel argument is deprecated and will be removed in
a future version of cudf. Specify use_na_sentinel as
either True or False.
use_na_sentinel : bool, default True
If True, the sentinel -1 will be used for NA values.
If False, NA values will be encoded as non-negative
integers and will not drop the NA from the uniques
of the values.

Returns
-------
(labels, cats) : (cupy.ndarray, cupy.ndarray or Index)
Expand All @@ -295,7 +308,12 @@ def factorize(self, na_sentinel=-1):
>>> uniques
StringIndex(['a' 'c'], dtype='object')
"""
return cudf.core.algorithms.factorize(self, na_sentinel=na_sentinel)
return cudf.core.algorithms.factorize(
self,
sort=sort,
na_sentinel=na_sentinel,
use_na_sentinel=use_na_sentinel,
)

@_cudf_nvtx_annotate
def _make_operands_for_binop(
Expand Down
47 changes: 46 additions & 1 deletion python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,12 +486,57 @@ def test_series_factorize(data, na_sentinel):

with pytest.warns(FutureWarning):
expected_labels, expected_cats = psr.factorize(na_sentinel=na_sentinel)
actual_labels, actual_cats = gsr.factorize(na_sentinel=na_sentinel)
with pytest.warns(FutureWarning):
actual_labels, actual_cats = gsr.factorize(na_sentinel=na_sentinel)

assert_eq(expected_labels, actual_labels.get())
assert_eq(expected_cats.values, actual_cats.to_pandas().values)


@pytest.mark.parametrize(
    "data",
    [
        [1, 2, 3, 2, 1],
        [1, 2, None, 3, 1, 1],
        [],
        ["a", "b", "c", None, "z", "a"],
    ],
)
@pytest.mark.parametrize("use_na_sentinel", [True, False])
def test_series_factorize_use_na_sentinel(data, use_na_sentinel):
    """Compare sorted ``factorize`` output between cudf and pandas.

    Runs once per ``use_na_sentinel`` value, on a nullable pandas Series
    derived from the cudf Series, and checks both codes and uniques.
    """
    gpu_series = cudf.Series(data)
    host_series = gpu_series.to_pandas(nullable=True)
    # Both libraries receive exactly the same keyword arguments.
    kwargs = {"use_na_sentinel": use_na_sentinel, "sort": True}

    want_codes, want_uniques = host_series.factorize(**kwargs)
    got_codes, got_uniques = gpu_series.factorize(**kwargs)

    assert_eq(want_codes, got_codes.get())
    assert_eq(want_uniques, got_uniques.to_pandas(nullable=True))


@pytest.mark.parametrize(
    "data",
    [
        [1, 2, 3, 2, 1],
        [1, 2, None, 3, 1, 1],
        [],
        ["a", "b", "c", None, "z", "a"],
    ],
)
@pytest.mark.parametrize("sort", [True, False])
def test_series_factorize_sort(data, sort):
    """Compare ``factorize(sort=...)`` output between cudf and pandas.

    Runs with ``sort`` both enabled and disabled, on a nullable pandas
    Series derived from the cudf Series, and checks codes and uniques.
    """
    gpu_series = cudf.Series(data)
    host_series = gpu_series.to_pandas(nullable=True)

    want_codes, want_uniques = host_series.factorize(sort=sort)
    got_codes, got_uniques = gpu_series.factorize(sort=sort)

    assert_eq(want_codes, got_codes.get())
    assert_eq(want_uniques, got_uniques.to_pandas(nullable=True))


@pytest.mark.parametrize(
"data",
[
Expand Down