Skip to content

Commit

Permalink
Introduce nan_as_null parameter for cudf.Index (#9893)
Browse files Browse the repository at this point in the history
Fixes: #9822 

This PR introduces a `nan_as_null` parameter to the `cudf.Index` constructor, similar to the one already present in the `cudf.Series` constructor.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)

URL: #9893
  • Loading branch information
galipremsagar authored Dec 14, 2021
1 parent 41f9956 commit fc2a32a
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 7 deletions.
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -829,7 +829,7 @@ def is_floating(self):
>>> idx = cudf.Index([1.0, 2.0, np.nan, 4.0])
>>> idx.is_floating()
True
>>> idx = cudf.Index([1, 2, 3, 4, np.nan])
>>> idx = cudf.Index([1, 2, 3, 4, np.nan], nan_as_null=False)
>>> idx.is_floating()
True
>>> idx = cudf.Index([1, 2, 3, 4])
Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -809,7 +809,9 @@ def __setitem__(self, key, value):
to_add_categories = 0
else:
to_add_categories = len(
cudf.Index(value).difference(self.categories)
cudf.Index(value, nan_as_null=False).difference(
self.categories
)
)

if to_add_categories > 0:
Expand Down
25 changes: 20 additions & 5 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2527,7 +2527,7 @@ def is_object(self):
return True


def as_index(arbitrary, **kwargs) -> BaseIndex:
def as_index(arbitrary, nan_as_null=None, **kwargs) -> BaseIndex:
"""Create an Index from an arbitrary object
Currently supported inputs are:
Expand Down Expand Up @@ -2560,7 +2560,7 @@ def as_index(arbitrary, **kwargs) -> BaseIndex:
elif isinstance(arbitrary, ColumnBase):
return _index_from_data({kwargs.get("name", None): arbitrary})
elif isinstance(arbitrary, cudf.Series):
return as_index(arbitrary._column, **kwargs)
return as_index(arbitrary._column, nan_as_null=nan_as_null, **kwargs)
elif isinstance(arbitrary, (pd.RangeIndex, range)):
return RangeIndex(
start=arbitrary.start,
Expand All @@ -2569,11 +2569,14 @@ def as_index(arbitrary, **kwargs) -> BaseIndex:
**kwargs,
)
elif isinstance(arbitrary, pd.MultiIndex):
return cudf.MultiIndex.from_pandas(arbitrary)
return cudf.MultiIndex.from_pandas(arbitrary, nan_as_null=nan_as_null)
elif isinstance(arbitrary, cudf.DataFrame):
return cudf.MultiIndex.from_frame(arbitrary)
return as_index(
column.as_column(arbitrary, dtype=kwargs.get("dtype", None)), **kwargs
column.as_column(
arbitrary, dtype=kwargs.get("dtype", None), nan_as_null=nan_as_null
),
**kwargs,
)


Expand Down Expand Up @@ -2623,6 +2626,10 @@ class Index(BaseIndex, metaclass=IndexMeta):
tupleize_cols : bool (default: True)
When True, attempt to create a MultiIndex if possible.
tupleize_cols == False is not yet supported.
nan_as_null : bool, Default True
If ``None``/``True``, converts ``np.nan`` values to
``null`` values.
If ``False``, leaves ``np.nan`` values as is.
Returns
-------
Expand Down Expand Up @@ -2655,6 +2662,7 @@ def __new__(
copy=False,
name=None,
tupleize_cols=True,
nan_as_null=True,
**kwargs,
):
assert (
Expand All @@ -2665,7 +2673,14 @@ def __new__(
"tupleize_cols != True is not yet supported"
)

return as_index(data, copy=copy, dtype=dtype, name=name, **kwargs)
return as_index(
data,
copy=copy,
dtype=dtype,
name=name,
nan_as_null=nan_as_null,
**kwargs,
)

@classmethod
def from_arrow(cls, obj):
Expand Down
19 changes: 19 additions & 0 deletions python/cudf/cudf/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2509,3 +2509,22 @@ def test_index_datetime_round(resolution):
cuidx_floor = cuidx.round(resolution)

assert_eq(pidx_floor, cuidx_floor)


@pytest.mark.parametrize(
    "data,nan_idx,NA_idx",
    [([1, 2, 3, None], None, 3), ([2, 3, np.nan, None], 2, 3)],
)
@pytest.mark.parametrize("nan_as_null", [True, False])
def test_index_nan_as_null(data, nan_idx, NA_idx, nan_as_null):
    """Check how ``cudf.Index`` treats NaN and None under ``nan_as_null``.

    Parameters
    ----------
    data : list
        Values used to build the index.
    nan_idx : int or None
        Position of the ``np.nan`` entry in ``data``; ``None`` when the
        data contains no NaN.
    NA_idx : int or None
        Position of the ``None`` entry in ``data``.
    nan_as_null : bool
        Flag forwarded to the ``cudf.Index`` constructor.
    """
    index = cudf.Index(data, nan_as_null=nan_as_null)

    # The NaN slot (when the data has one) is expected to be null only if
    # the caller asked for NaN-to-null conversion; otherwise it stays NaN.
    if nan_idx is not None:
        nan_slot = index[nan_idx]
        if nan_as_null:
            assert nan_slot is cudf.NA
        else:
            assert np.isnan(nan_slot)

    # A ``None`` entry is expected to be null regardless of the flag.
    if NA_idx is not None:
        assert index[NA_idx] is cudf.NA

0 comments on commit fc2a32a

Please sign in to comment.