From 0eaa4f2425c32eb457473b9ea0dfab8b40f31ee8 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 21 Feb 2023 12:10:10 -0800 Subject: [PATCH 1/8] Deprecate na_sentinel --- .../source/api_docs/general_functions.rst | 4 +- docs/cudf/source/api_docs/series.rst | 1 - python/cudf/cudf/core/algorithms.py | 94 ++++++++++++++++--- python/cudf/cudf/core/column/column.py | 13 ++- python/cudf/cudf/core/multiindex.py | 12 ++- python/cudf/cudf/core/single_column_frame.py | 26 ++++- python/cudf/cudf/tests/test_series.py | 29 +++++- 7 files changed, 155 insertions(+), 24 deletions(-) diff --git a/docs/cudf/source/api_docs/general_functions.rst b/docs/cudf/source/api_docs/general_functions.rst index 40e1b766dc9..112df2fdf9f 100644 --- a/docs/cudf/source/api_docs/general_functions.rst +++ b/docs/cudf/source/api_docs/general_functions.rst @@ -10,12 +10,14 @@ Data manipulations :toctree: api/ cudf.concat + cudf.crosstab cudf.cut + cudf.factorize cudf.get_dummies cudf.melt + cudf.merge cudf.pivot cudf.pivot_table - cudf.crosstab cudf.unstack Top-level conversions diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst index 386da4055d8..9cd0770431c 100644 --- a/docs/cudf/source/api_docs/series.rst +++ b/docs/cudf/source/api_docs/series.rst @@ -102,7 +102,6 @@ Function application, GroupBy & window :toctree: api/ Series.apply - Series.applymap Series.map Series.groupby Series.rolling diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 73fc1130073..6d25a376385 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
import warnings import cupy as cp @@ -7,19 +7,35 @@ from cudf.core.column import as_column from cudf.core.index import Index, RangeIndex from cudf.core.indexed_frame import IndexedFrame +from cudf.core.scalar import Scalar from cudf.core.series import Series -def factorize(values, sort=False, na_sentinel=-1, size_hint=None): +def factorize( + values, sort=False, na_sentinel=None, use_na_sentinel=None, size_hint=None +): """Encode the input values as integer labels Parameters ---------- values: Series, Index, or CuPy array The data to be factorized. + sort : bool, default False + Sort uniques and shuffle codes to maintain the relationship. na_sentinel : number, default -1 Value to indicate missing category. + .. deprecated:: 23.04 + + The na_sentinel argument is deprecated and will be removed in + a future version of cudf. Specify use_na_sentinel as + either True or False. + use_na_sentinel : bool, default True + If True, the sentinel -1 will be used for NA values. + If False, NA values will be encoded as non-negative + integers and will not drop the NA from the uniques + of the values. + Returns ------- (labels, cats) : (cupy.ndarray, cupy.ndarray or Index) @@ -27,9 +43,14 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None): - *cats* contains the categories in order that the N-th item corresponds to the (N-1) code. + See Also + -------- + cudf.Series.factorize : Encode the input values of Series. + Examples -------- >>> import cudf + >>> import numpy as np >>> data = cudf.Series(['a', 'c', 'c']) >>> codes, uniques = cudf.factorize(data) >>> codes @@ -37,17 +58,60 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None): >>> uniques StringIndex(['a' 'c'], dtype='object') - See Also - -------- - cudf.Series.factorize : Encode the input values of Series. + When ``use_na_sentinel=True`` (the default), missing values are indicated + in the `codes` with the sentinel value ``-1`` and missing values are not + included in `uniques`. 
+ + >>> codes, uniques = cudf.factorize(['b', None, 'a', 'c', 'b']) + >>> codes + array([ 1, -1, 0, 2, 1], dtype=int8) + >>> uniques + StringIndex(['a' 'b' 'c'], dtype='object') + If NA is in the values, and we want to include NA in the uniques of the + values, it can be achieved by setting ``use_na_sentinel=False``. + + >>> values = np.array([1, 2, 1, np.nan]) + >>> codes, uniques = cudf.factorize(values) + >>> codes + array([ 0, 1, 0, -1], dtype=int8) + >>> uniques + Float64Index([1.0, 2.0], dtype='float64') + >>> codes, uniques = cudf.factorize(values, use_na_sentinel=False) + >>> codes + array([1, 2, 1, 0], dtype=int8) + >>> uniques + Float64Index([<NA>, 1.0, 2.0], dtype='float64') """ - if sort: - raise NotImplementedError( - "Sorting not yet supported during factorization." + # TODO: Drop `na_sentinel` in the next release immediately after + # pandas 2.0 upgrade. + if na_sentinel is not None: + warnings.warn( + "Specifying the specific value to use for `na_sentinel` is " + "deprecated and will be removed in a future version of cudf. 
" + "Specify `use_na_sentinel=True` to use the sentinel value -1, " + "and `use_na_sentinel=False` to encode NA values.", + FutureWarning, + ) + + if use_na_sentinel is not None and na_sentinel is not None: + raise ValueError( + "Cannot specify both `na_sentinel` and `use_na_sentile`; " + f"got `na_sentinel={na_sentinel}` and " + f"`use_na_sentinel={use_na_sentinel}`" + ) + elif use_na_sentinel is None and na_sentinel is None: + use_na_sentinel = True + na_sentinel = -1 + + if use_na_sentinel is None: + use_na_sentinel = True + elif na_sentinel is None: + na_sentinel = -1 + else: + na_sentinel = ( + -1 if use_na_sentinel else Scalar(None, dtype=values.dtype) ) - if na_sentinel is None: - raise NotImplementedError("na_sentinel can not be None.") if size_hint: warnings.warn("size_hint is not applicable for cudf.factorize") @@ -56,7 +120,15 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None): values = Series(values) - cats = values._column.dropna().unique().astype(values.dtype) + if use_na_sentinel: + cats = values._column.dropna() + else: + cats = values._column + + cats = cats.unique().astype(values.dtype) + + if sort: + cats, _ = cats.sort_by_values() labels = values._column._label_encoding( cats=cats, na_sentinel=na_sentinel diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index fb1bcf6d673..66a286bd84d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -22,13 +22,13 @@ import cupy import numpy as np -import pandas as pd import pyarrow as pa from numba import cuda import rmm import cudf +import pandas as pd from cudf import _lib as libcudf from cudf._lib.column import Column from cudf._lib.null_mask import ( @@ -1343,7 +1343,16 @@ def _return_sentinel_column(): ) if dtype is None: - dtype = min_scalar_type(max(len(cats), na_sentinel), 8) + dtype = min_scalar_type( + max( + len(cats), + -1 + if isinstance(na_sentinel, cudf.Scalar) + and na_sentinel.value is 
cudf.NA + else na_sentinel, + ), + 8, + ) if is_mixed_with_object_dtype(self, cats): return _return_sentinel_column() diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 783c3996400..f5aa5214f74 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -13,10 +13,9 @@ import cupy as cp import numpy as np -import pandas as pd -from pandas._config import get_option import cudf +import pandas as pd from cudf import _lib as libcudf from cudf._typing import DataFrameOrSeries from cudf.api.types import is_integer, is_list_like, is_object_dtype @@ -31,6 +30,7 @@ ) from cudf.utils.docutils import doc_apply from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate +from pandas._config import get_option def _maybe_indices_to_slice(indices: cp.ndarray) -> Union[slice, cp.ndarray]: @@ -714,7 +714,13 @@ def _compute_levels_and_codes(self): codes = {} for name, col in self._data.items(): - code, cats = cudf.Series._from_data({None: col}).factorize() + with warnings.catch_warnings(): + # TODO: Remove this filter when + # `na_sentinel` is removed from `factorize`. + # This is a filter to not let the warnings from + # `factorize` show up in other parts of public APIs. + warnings.simplefilter("ignore") + code, cats = cudf.Series._from_data({None: col}).factorize() codes[name] = code.astype(np.int64) levels.append(cudf.Series(cats, name=None)) diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index afd06ea3629..c4128621148 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. 
"""Base class for Frame types that only have a single column.""" from __future__ import annotations @@ -270,14 +270,27 @@ def __cuda_array_interface__(self): return self._column.__cuda_array_interface__ @_cudf_nvtx_annotate - def factorize(self, na_sentinel=-1): + def factorize(self, sort=False, na_sentinel=None, use_na_sentinel=None): """Encode the input values as integer labels. Parameters ---------- - na_sentinel : number + sort : bool, default False + Sort uniques and shuffle codes to maintain the relationship. + na_sentinel : number, default -1 Value to indicate missing category. + .. deprecated:: 23.04 + + The na_sentinel argument is deprecated and will be removed in + a future version of cudf. Specify use_na_sentinel as + either True or False. + use_na_sentinel : bool, default True + If True, the sentinel -1 will be used for NA values. + If False, NA values will be encoded as non-negative + integers and will not drop the NA from the uniques + of the values. + Returns ------- (labels, cats) : (cupy.ndarray, cupy.ndarray or Index) @@ -295,7 +308,12 @@ def factorize(self, na_sentinel=-1): >>> uniques StringIndex(['a' 'c'], dtype='object') """ - return cudf.core.algorithms.factorize(self, na_sentinel=na_sentinel) + return cudf.core.algorithms.factorize( + self, + sort=sort, + na_sentinel=na_sentinel, + use_na_sentinel=use_na_sentinel, + ) @_cudf_nvtx_annotate def _make_operands_for_binop( diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index b3c7c9ac9bb..bc54d1e51f7 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -8,11 +8,11 @@ import cupy as cp import numpy as np -import pandas as pd import pyarrow as pa import pytest import cudf +import pandas as pd from cudf.core._compat import PANDAS_GE_120, PANDAS_LT_140 from cudf.testing._utils import ( NUMERIC_TYPES, @@ -486,12 +486,37 @@ def test_series_factorize(data, na_sentinel): with pytest.warns(FutureWarning): 
expected_labels, expected_cats = psr.factorize(na_sentinel=na_sentinel) - actual_labels, actual_cats = gsr.factorize(na_sentinel=na_sentinel) + with pytest.warns(FutureWarning): + actual_labels, actual_cats = gsr.factorize(na_sentinel=na_sentinel) assert_eq(expected_labels, actual_labels.get()) assert_eq(expected_cats.values, actual_cats.to_pandas().values) +@pytest.mark.parametrize( + "data", + [ + [1, 2, 3, 2, 1], + [1, 2, None, 3, 1, 1], + [], + ["a", "b", "c", None, "z", "a"], + ], +) +@pytest.mark.parametrize("use_na_sentinel", [True, False]) +def test_series_factorize_use_na_sentinel(data, use_na_sentinel): + gsr = cudf.Series(data) + psr = gsr.to_pandas(nullable=True) + + expected_labels, expected_cats = psr.factorize( + use_na_sentinel=use_na_sentinel, sort=True + ) + actual_labels, actual_cats = gsr.factorize( + use_na_sentinel=use_na_sentinel, sort=True + ) + assert_eq(expected_labels, actual_labels.get()) + assert_eq(expected_cats, actual_cats.to_pandas(nullable=True)) + + @pytest.mark.parametrize( "data", [ From 12a910273c4d6cf7ac442d7171964ce476bc041b Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 21 Feb 2023 12:50:06 -0800 Subject: [PATCH 2/8] undo isort --- python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/core/multiindex.py | 4 ++-- python/cudf/cudf/tests/test_series.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 66a286bd84d..414d9cd5f35 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -22,13 +22,13 @@ import cupy import numpy as np +import pandas as pd import pyarrow as pa from numba import cuda import rmm import cudf -import pandas as pd from cudf import _lib as libcudf from cudf._lib.column import Column from cudf._lib.null_mask import ( diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index f5aa5214f74..17b3e611625 100644 --- 
a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -13,9 +13,10 @@ import cupy as cp import numpy as np +import pandas as pd +from pandas._config import get_option import cudf -import pandas as pd from cudf import _lib as libcudf from cudf._typing import DataFrameOrSeries from cudf.api.types import is_integer, is_list_like, is_object_dtype @@ -30,7 +31,6 @@ ) from cudf.utils.docutils import doc_apply from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate -from pandas._config import get_option def _maybe_indices_to_slice(indices: cp.ndarray) -> Union[slice, cp.ndarray]: diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index bc54d1e51f7..b2a0369dfd3 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -8,11 +8,11 @@ import cupy as cp import numpy as np +import pandas as pd import pyarrow as pa import pytest import cudf -import pandas as pd from cudf.core._compat import PANDAS_GE_120, PANDAS_LT_140 from cudf.testing._utils import ( NUMERIC_TYPES, From 7296bf7775654b2a05720886f854dbce55465fc2 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 22 Feb 2023 14:33:49 -0800 Subject: [PATCH 3/8] simplify if/else --- python/cudf/cudf/core/algorithms.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 6d25a376385..c04aaa00196 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -103,12 +103,10 @@ def factorize( elif use_na_sentinel is None and na_sentinel is None: use_na_sentinel = True na_sentinel = -1 - - if use_na_sentinel is None: + elif use_na_sentinel is None: use_na_sentinel = True - elif na_sentinel is None: - na_sentinel = -1 else: + # use_sentinel is either True or False, na_sentinel is None na_sentinel = ( -1 if use_na_sentinel else Scalar(None, dtype=values.dtype) ) From 
71f98c441bd39c84a9420c75de9159f508e5415f Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 22 Feb 2023 15:39:42 -0800 Subject: [PATCH 4/8] accept only scalars in _label_encoding for na_sentinel --- python/cudf/cudf/core/algorithms.py | 2 +- python/cudf/cudf/core/column/column.py | 17 ++++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index c04aaa00196..4e165e5f396 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -129,7 +129,7 @@ def factorize( cats, _ = cats.sort_by_values() labels = values._column._label_encoding( - cats=cats, na_sentinel=na_sentinel + cats=cats, na_sentinel=Scalar(na_sentinel) ).values return labels, cats.values if return_cupy_array else Index(cats) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 414d9cd5f35..8e6b8fe9f70 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1014,7 +1014,7 @@ def as_categorical_column(self, dtype, **kwargs) -> ColumnBase: cats = self.unique().astype(self.dtype) label_dtype = min_unsigned_type(len(cats)) labels = self._label_encoding( - cats=cats, dtype=label_dtype, na_sentinel=1 + cats=cats, dtype=label_dtype, na_sentinel=cudf.Scalar(1) ) # columns include null index in factorization; remove: @@ -1304,7 +1304,10 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: return self def _label_encoding( - self, cats: ColumnBase, dtype: Dtype = None, na_sentinel=-1 + self, + cats: ColumnBase, + dtype: Dtype = None, + na_sentinel: ScalarLike = None, ): """ Convert each value in `self` into an integer code, with `cats` @@ -1337,6 +1340,9 @@ def _label_encoding( """ from cudf._lib.join import join as cpp_join + if na_sentinel is None: + na_sentinel = cudf.Scalar(-1) + def _return_sentinel_column(): return cudf.core.column.full( size=len(self), 
fill_value=na_sentinel, dtype=dtype @@ -1346,10 +1352,7 @@ def _return_sentinel_column(): dtype = min_scalar_type( max( len(cats), - -1 - if isinstance(na_sentinel, cudf.Scalar) - and na_sentinel.value is cudf.NA - else na_sentinel, + -1 if na_sentinel.value is cudf.NA else na_sentinel, ), 8, ) @@ -1372,7 +1375,7 @@ def _return_sentinel_column(): ) codes = codes.take( right_gather_map, nullify=True, check_bounds=False - ).fillna(na_sentinel) + ).fillna(na_sentinel.value) # reorder `codes` so that its values correspond to the # values of `self`: From be44b5894b849e3f3510c0d49699c5831838146f Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 22 Feb 2023 15:42:14 -0800 Subject: [PATCH 5/8] add dedicated sort tests --- python/cudf/cudf/tests/test_series.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index b2a0369dfd3..4d8848b701c 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -517,6 +517,26 @@ def test_series_factorize_use_na_sentinel(data, use_na_sentinel): assert_eq(expected_cats, actual_cats.to_pandas(nullable=True)) +@pytest.mark.parametrize( + "data", + [ + [1, 2, 3, 2, 1], + [1, 2, None, 3, 1, 1], + [], + ["a", "b", "c", None, "z", "a"], + ], +) +@pytest.mark.parametrize("sort", [True, False]) +def test_series_factorize_sort(data, sort): + gsr = cudf.Series(data) + psr = gsr.to_pandas(nullable=True) + + expected_labels, expected_cats = psr.factorize(sort=sort) + actual_labels, actual_cats = gsr.factorize(sort=sort) + assert_eq(expected_labels, actual_labels.get()) + assert_eq(expected_cats, actual_cats.to_pandas(nullable=True)) + + @pytest.mark.parametrize( "data", [ From e47a85c5e0135736f1e34347a0e1d3c184659404 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 23 Feb 2023 16:07:28 -0600 Subject: [PATCH 6/8] Update python/cudf/cudf/core/algorithms.py Co-authored-by: Matthew Roeschke 
<10647082+mroeschke@users.noreply.github.com> --- python/cudf/cudf/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 4e165e5f396..a8a07b2c3bb 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -96,7 +96,7 @@ def factorize( if use_na_sentinel is not None and na_sentinel is not None: raise ValueError( - "Cannot specify both `na_sentinel` and `use_na_sentile`; " + "Cannot specify both `na_sentinel` and `use_na_sentinel`; " f"got `na_sentinel={na_sentinel}` and " f"`use_na_sentinel={use_na_sentinel}`" ) From 4ce1479beec084203b1d6cf102021370febba65c Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 27 Feb 2023 11:07:54 -0800 Subject: [PATCH 7/8] update warnings --- python/cudf/cudf/core/algorithms.py | 55 ++++++++++++++++------------- 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index a8a07b2c3bb..7012496434a 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -85,40 +85,47 @@ def factorize( """ # TODO: Drop `na_sentinel` in the next release immediately after # pandas 2.0 upgrade. - if na_sentinel is not None: - warnings.warn( - "Specifying the specific value to use for `na_sentinel` is " - "deprecated and will be removed in a future version of cudf. 
" - "Specify `use_na_sentinel=True` to use the sentinel value -1, " - "and `use_na_sentinel=False` to encode NA values.", - FutureWarning, - ) - - if use_na_sentinel is not None and na_sentinel is not None: + if na_sentinel is not None and use_na_sentinel is not None: raise ValueError( - "Cannot specify both `na_sentinel` and `use_na_sentinel`; " + "Cannot specify both `na_sentinel` and `use_na_sentile`; " f"got `na_sentinel={na_sentinel}` and " f"`use_na_sentinel={use_na_sentinel}`" ) - elif use_na_sentinel is None and na_sentinel is None: - use_na_sentinel = True - na_sentinel = -1 - elif use_na_sentinel is None: - use_na_sentinel = True - else: - # use_sentinel is either True or False, na_sentinel is None + + return_cupy_array = isinstance(values, cp.ndarray) + + values = Series(values) + + if na_sentinel is None: na_sentinel = ( - -1 if use_na_sentinel else Scalar(None, dtype=values.dtype) + -1 + if use_na_sentinel is None or use_na_sentinel + else Scalar(None, dtype=values.dtype) ) + else: + if na_sentinel is None: + msg = ( + "Specifying `na_sentinel=None` is deprecated, specify " + "`use_na_sentinel=False` instead." + ) + elif na_sentinel == -1: + msg = ( + "Specifying `na_sentinel=-1` is deprecated, specify " + "`use_na_sentinel=True` instead." + ) + else: + msg = ( + "Specifying the specific value to use for `na_sentinel` is " + "deprecated and will be removed in a future version of cudf. 
" + "Specify `use_na_sentinel=True` to use the sentinel value -1, " + "and `use_na_sentinel=False` to encode NA values.", + ) + warnings.warn(msg, FutureWarning) if size_hint: warnings.warn("size_hint is not applicable for cudf.factorize") - return_cupy_array = isinstance(values, cp.ndarray) - - values = Series(values) - - if use_na_sentinel: + if use_na_sentinel is None or use_na_sentinel: cats = values._column.dropna() else: cats = values._column From 54b9be7b92f0de20801e7725724853bb7d120fdb Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 6 Mar 2023 15:40:25 -0800 Subject: [PATCH 8/8] simplify --- python/cudf/cudf/core/column/column.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index bb6071512a3..40921b71db5 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1340,7 +1340,7 @@ def _label_encoding( """ from cudf._lib.join import join as cpp_join - if na_sentinel is None: + if na_sentinel is None or na_sentinel.value is cudf.NA: na_sentinel = cudf.Scalar(-1) def _return_sentinel_column(): @@ -1349,13 +1349,7 @@ def _return_sentinel_column(): ) if dtype is None: - dtype = min_scalar_type( - max( - len(cats), - -1 if na_sentinel.value is cudf.NA else na_sentinel, - ), - 8, - ) + dtype = min_scalar_type(max(len(cats), na_sentinel), 8) if is_mixed_with_object_dtype(self, cats): return _return_sentinel_column()