Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Introduce nan_as_null parameter for cudf.Index #9893

Merged
merged 8 commits into from
Dec 14, 2021
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -829,7 +829,7 @@ def is_floating(self):
>>> idx = cudf.Index([1.0, 2.0, np.nan, 4.0])
>>> idx.is_floating()
True
>>> idx = cudf.Index([1, 2, 3, 4, np.nan])
>>> idx = cudf.Index([1, 2, 3, 4, np.nan], nan_as_null=False)
>>> idx.is_floating()
True
>>> idx = cudf.Index([1, 2, 3, 4])
Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -809,7 +809,9 @@ def __setitem__(self, key, value):
to_add_categories = 0
else:
to_add_categories = len(
cudf.Index(value).difference(self.categories)
cudf.Index(value, nan_as_null=False).difference(
self.categories
)
)

if to_add_categories > 0:
Expand Down
25 changes: 20 additions & 5 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2527,7 +2527,7 @@ def is_object(self):
return True


def as_index(arbitrary, **kwargs) -> BaseIndex:
def as_index(arbitrary, nan_as_null=None, **kwargs) -> BaseIndex:
"""Create an Index from an arbitrary object

Currently supported inputs are:
Expand Down Expand Up @@ -2560,7 +2560,7 @@ def as_index(arbitrary, **kwargs) -> BaseIndex:
elif isinstance(arbitrary, ColumnBase):
return _index_from_data({kwargs.get("name", None): arbitrary})
elif isinstance(arbitrary, cudf.Series):
return as_index(arbitrary._column, **kwargs)
return as_index(arbitrary._column, nan_as_null=nan_as_null, **kwargs)
elif isinstance(arbitrary, (pd.RangeIndex, range)):
return RangeIndex(
start=arbitrary.start,
Expand All @@ -2569,11 +2569,14 @@ def as_index(arbitrary, **kwargs) -> BaseIndex:
**kwargs,
)
elif isinstance(arbitrary, pd.MultiIndex):
return cudf.MultiIndex.from_pandas(arbitrary)
return cudf.MultiIndex.from_pandas(arbitrary, nan_as_null=nan_as_null)
elif isinstance(arbitrary, cudf.DataFrame):
return cudf.MultiIndex.from_frame(arbitrary)
return as_index(
column.as_column(arbitrary, dtype=kwargs.get("dtype", None)), **kwargs
column.as_column(
arbitrary, dtype=kwargs.get("dtype", None), nan_as_null=nan_as_null
),
**kwargs,
)


Expand Down Expand Up @@ -2623,6 +2626,10 @@ class Index(BaseIndex, metaclass=IndexMeta):
tupleize_cols : bool (default: True)
When True, attempt to create a MultiIndex if possible.
tupleize_cols == False is not yet supported.
nan_as_null : bool, Default True
If ``None``/``True``, converts ``np.nan`` values to
``null`` values.
If ``False``, leaves ``np.nan`` values as is.

Returns
-------
Expand Down Expand Up @@ -2655,6 +2662,7 @@ def __new__(
copy=False,
name=None,
tupleize_cols=True,
nan_as_null=True,
**kwargs,
):
assert (
Expand All @@ -2665,7 +2673,14 @@ def __new__(
"tupleize_cols != True is not yet supported"
)

return as_index(data, copy=copy, dtype=dtype, name=name, **kwargs)
return as_index(
data,
copy=copy,
dtype=dtype,
name=name,
nan_as_null=nan_as_null,
**kwargs,
)

@classmethod
def from_arrow(cls, obj):
Expand Down
19 changes: 19 additions & 0 deletions python/cudf/cudf/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2509,3 +2509,22 @@ def test_index_datetime_round(resolution):
cuidx_floor = cuidx.round(resolution)

assert_eq(pidx_floor, cuidx_floor)


@pytest.mark.parametrize(
    "data,nan_idx,NA_idx",
    [([1, 2, 3, None], None, 3), ([2, 3, np.nan, None], 2, 3)],
)
@pytest.mark.parametrize("nan_as_null", [True, False])
def test_index_nan_as_null(data, nan_idx, NA_idx, nan_as_null):
    """Check how ``cudf.Index`` treats NaN and ``None`` under ``nan_as_null``.

    Parameters
    ----------
    data : list
        Values used to construct the index.
    nan_idx : int or None
        Position of the ``np.nan`` entry in ``data``; ``None`` when
        ``data`` contains no NaN.
    NA_idx : int or None
        Position of the ``None`` entry in ``data``; ``None`` when
        ``data`` contains no ``None``.
    nan_as_null : bool
        Forwarded to the ``cudf.Index`` constructor.
    """
    idx = cudf.Index(data, nan_as_null=nan_as_null)

    # The NaN slot is only inspected when the input actually holds a NaN.
    if nan_idx is not None:
        nan_value = idx[nan_idx]
        if nan_as_null:
            # NaN is expected to have been converted to a null.
            assert nan_value is cudf.NA
        else:
            # NaN is expected to survive as a floating-point NaN.
            assert np.isnan(nan_value)

    # A ``None`` entry is expected to be null regardless of ``nan_as_null``.
    if NA_idx is not None:
        assert idx[NA_idx] is cudf.NA