From fc2a32a1576d97a48b0c1c983ef4b31285267e96 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 14 Dec 2021 16:59:24 -0600 Subject: [PATCH] Introduce `nan_as_null` parameter for `cudf.Index` (#9893) Fixes: #9822 This PR introduces a `nan_as_null` parameter to the `cudf.Index` constructor, similar to the one present in the `cudf.Series` constructor. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/9893 --- python/cudf/cudf/core/_base_index.py | 2 +- python/cudf/cudf/core/column/categorical.py | 4 +++- python/cudf/cudf/core/index.py | 25 ++++++++++++++++----- python/cudf/cudf/tests/test_index.py | 19 ++++++++++++++++ 4 files changed, 43 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index ac5e152d011..ed1cc74db71 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -829,7 +829,7 @@ def is_floating(self): >>> idx = cudf.Index([1.0, 2.0, np.nan, 4.0]) >>> idx.is_floating() True - >>> idx = cudf.Index([1, 2, 3, 4, np.nan]) + >>> idx = cudf.Index([1, 2, 3, 4, np.nan], nan_as_null=False) >>> idx.is_floating() True >>> idx = cudf.Index([1, 2, 3, 4]) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index a2c1f04b2f2..4be7a422de0 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -809,7 +809,9 @@ def __setitem__(self, key, value): to_add_categories = 0 else: to_add_categories = len( - cudf.Index(value).difference(self.categories) + cudf.Index(value, nan_as_null=False).difference( + self.categories + ) ) if to_add_categories > 0: diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 29e0d17bc39..362c96ebbeb 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2527,7 +2527,7 @@ 
def is_object(self): return True -def as_index(arbitrary, **kwargs) -> BaseIndex: +def as_index(arbitrary, nan_as_null=None, **kwargs) -> BaseIndex: """Create an Index from an arbitrary object Currently supported inputs are: @@ -2560,7 +2560,7 @@ def as_index(arbitrary, **kwargs) -> BaseIndex: elif isinstance(arbitrary, ColumnBase): return _index_from_data({kwargs.get("name", None): arbitrary}) elif isinstance(arbitrary, cudf.Series): - return as_index(arbitrary._column, **kwargs) + return as_index(arbitrary._column, nan_as_null=nan_as_null, **kwargs) elif isinstance(arbitrary, (pd.RangeIndex, range)): return RangeIndex( start=arbitrary.start, @@ -2569,11 +2569,14 @@ def as_index(arbitrary, **kwargs) -> BaseIndex: **kwargs, ) elif isinstance(arbitrary, pd.MultiIndex): - return cudf.MultiIndex.from_pandas(arbitrary) + return cudf.MultiIndex.from_pandas(arbitrary, nan_as_null=nan_as_null) elif isinstance(arbitrary, cudf.DataFrame): return cudf.MultiIndex.from_frame(arbitrary) return as_index( - column.as_column(arbitrary, dtype=kwargs.get("dtype", None)), **kwargs + column.as_column( + arbitrary, dtype=kwargs.get("dtype", None), nan_as_null=nan_as_null + ), + **kwargs, ) @@ -2623,6 +2626,10 @@ class Index(BaseIndex, metaclass=IndexMeta): tupleize_cols : bool (default: True) When True, attempt to create a MultiIndex if possible. tupleize_cols == False is not yet supported. + nan_as_null : bool, Default True + If ``None``/``True``, converts ``np.nan`` values to + ``null`` values. + If ``False``, leaves ``np.nan`` values as is. 
Returns ------- @@ -2655,6 +2662,7 @@ def __new__( copy=False, name=None, tupleize_cols=True, + nan_as_null=True, **kwargs, ): assert ( @@ -2665,7 +2673,14 @@ def __new__( "tupleize_cols != True is not yet supported" ) - return as_index(data, copy=copy, dtype=dtype, name=name, **kwargs) + return as_index( + data, + copy=copy, + dtype=dtype, + name=name, + nan_as_null=nan_as_null, + **kwargs, + ) @classmethod def from_arrow(cls, obj): diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index c7fca2075f5..6679725ae9a 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2509,3 +2509,22 @@ def test_index_datetime_round(resolution): cuidx_floor = cuidx.round(resolution) assert_eq(pidx_floor, cuidx_floor) + + +@pytest.mark.parametrize( + "data,nan_idx,NA_idx", + [([1, 2, 3, None], None, 3), ([2, 3, np.nan, None], 2, 3)], +) +@pytest.mark.parametrize("nan_as_null", [True, False]) +def test_index_nan_as_null(data, nan_idx, NA_idx, nan_as_null): + idx = cudf.Index(data, nan_as_null=nan_as_null) + + if nan_as_null: + if nan_idx is not None: + assert idx[nan_idx] is cudf.NA + else: + if nan_idx is not None: + assert np.isnan(idx[nan_idx]) + + if NA_idx is not None: + assert idx[NA_idx] is cudf.NA