Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Don't materialize column during RangeIndex methods #15582

Merged
merged 5 commits into from
Apr 24, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,7 +517,7 @@ def where(self, cond, other=None, inplace=False):
"""
raise NotImplementedError

def factorize(self, sort=False, na_sentinel=None, use_na_sentinel=None):
def factorize(self, sort: bool = False, use_na_sentinel: bool = True):
raise NotImplementedError

def union(self, other, sort=None):
Expand Down Expand Up @@ -2061,7 +2061,8 @@ def dropna(self, how="any"):
one null value. "all" drops only rows containing
*all* null values.
"""

if not self.hasnans:
return self.copy()
# This is to be consistent with IndexedFrame.dropna to handle nans
# as nulls by default
data_columns = [
Expand Down
111 changes: 71 additions & 40 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import cupy
import numpy as np
import pandas as pd
import pyarrow as pa
from typing_extensions import Self

import cudf
Expand Down Expand Up @@ -255,6 +256,15 @@ def searchsorted(
), "Invalid ascending flag"
return search_range(value, self.as_range, side=side)

def factorize(self, sort: bool = False, use_na_sentinel: bool = True):
    """Encode the index as integer codes plus its unique values.

    Every value of a range is distinct, so the codes are simply positions
    and the uniques are the index itself (or its reverse); no column is
    materialized.

    Parameters
    ----------
    sort : bool, default False
        If True and ``step`` is negative, the uniques are returned in
        reversed order and the codes count down so that
        ``uniques[codes]`` still reproduces this index.
    use_na_sentinel : bool, default True
        Not read by this implementation.

    Returns
    -------
    tuple
        ``(codes, uniques)`` where ``codes`` is a cupy array of dtype
        ``np.intp`` and ``uniques`` is ``self`` or ``self[::-1]``.
    """
    if sort and self.step < 0:
        # Use the same dtype as the other branch so the dtype of the codes
        # does not depend on the direction of the range.
        codes = cupy.arange(len(self) - 1, -1, -1, dtype=np.intp)
        uniques = self[::-1]
    else:
        codes = cupy.arange(len(self), dtype=np.intp)
        uniques = self
    return codes, uniques

@property # type: ignore
@_cudf_nvtx_annotate
def name(self):
Expand All @@ -267,31 +277,31 @@ def name(self, value):

@property # type: ignore
@_cudf_nvtx_annotate
def start(self):
def start(self) -> int:
"""
The value of the `start` parameter (0 if this was not supplied).
"""
return self._start

@property # type: ignore
@_cudf_nvtx_annotate
def stop(self):
def stop(self) -> int:
"""
The value of the stop parameter.
"""
return self._stop

@property # type: ignore
@_cudf_nvtx_annotate
def step(self):
def step(self) -> int:
"""
The value of the step parameter.
"""
return self._step

@property # type: ignore
@_cudf_nvtx_annotate
def _num_rows(self):
def _num_rows(self) -> int:
return len(self)

@cached_property # type: ignore
Expand All @@ -302,33 +312,33 @@ def _values(self):
else:
return column.column_empty(0, masked=False, dtype=self.dtype)

def _clean_nulls_from_index(self):
def _clean_nulls_from_index(self) -> Self:
return self

def _is_numeric(self):
def _is_numeric(self) -> bool:
return True

def _is_boolean(self):
def _is_boolean(self) -> bool:
return False

def _is_integer(self):
def _is_integer(self) -> bool:
return True

def _is_floating(self):
def _is_floating(self) -> bool:
return False

def _is_object(self):
def _is_object(self) -> bool:
return False

def _is_categorical(self):
def _is_categorical(self) -> bool:
return False

def _is_interval(self):
def _is_interval(self) -> bool:
return False

@property # type: ignore
@_cudf_nvtx_annotate
def hasnans(self):
def hasnans(self) -> bool:
return False

@property # type: ignore
Expand Down Expand Up @@ -378,12 +388,15 @@ def astype(self, dtype, copy: bool = True):
return self
return self._as_int_index().astype(dtype, copy=copy)

def fillna(self, value, downcast=None):
return self.copy()

@_cudf_nvtx_annotate
def drop_duplicates(self, keep="first"):
return self

@_cudf_nvtx_annotate
def duplicated(self, keep="first"):
def duplicated(self, keep="first") -> cupy.ndarray:
return cupy.zeros(len(self), dtype=bool)

@_cudf_nvtx_annotate
Expand All @@ -399,8 +412,13 @@ def __repr__(self):
+ ")"
)

@property
@_cudf_nvtx_annotate
def size(self) -> int:
return len(self)

@_cudf_nvtx_annotate
def __len__(self):
def __len__(self) -> int:
return len(range(self._start, self._stop, self._step))

@_cudf_nvtx_annotate
Expand Down Expand Up @@ -492,7 +510,7 @@ def to_pandas(
)

@property
def is_unique(self):
def is_unique(self) -> bool:
return True

@cached_property
Expand All @@ -501,24 +519,24 @@ def as_range(self):

@cached_property # type: ignore
@_cudf_nvtx_annotate
def is_monotonic_increasing(self):
def is_monotonic_increasing(self) -> bool:
return self._step > 0 or len(self) <= 1

@cached_property # type: ignore
@_cudf_nvtx_annotate
def is_monotonic_decreasing(self):
def is_monotonic_decreasing(self) -> bool:
return self._step < 0 or len(self) <= 1

@_cudf_nvtx_annotate
def memory_usage(self, deep=False):
def memory_usage(self, deep: bool = False) -> int:
if deep:
warnings.warn(
"The deep parameter is ignored and is only included "
"for pandas compatibility."
)
return 0

def unique(self):
def unique(self) -> Self:
# RangeIndex always has unique values
return self

Expand Down Expand Up @@ -841,36 +859,40 @@ def _columns(self):

@property # type: ignore
@_cudf_nvtx_annotate
def values_host(self):
return self.to_pandas().values
def values_host(self) -> np.ndarray:
return np.arange(start=self.start, stop=self.stop, step=self.step)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice catch 👌


@_cudf_nvtx_annotate
def argsort(
    self,
    ascending=True,
    na_position="last",
) -> cupy.ndarray:
    """Return the positions that would sort this index.

    The order of a range is fully determined by the sign of its step, so
    the result is computed directly from the length.

    Parameters
    ----------
    ascending : bool, default True
        Requested sort direction.
    na_position : {"first", "last"}, default "last"
        Only validated; it is not otherwise read here.

    Returns
    -------
    cupy.ndarray
        ``0 .. len(self) - 1`` when the range already runs in the
        requested direction, otherwise ``len(self) - 1 .. 0``.

    Raises
    ------
    ValueError
        If ``na_position`` is neither ``"first"`` nor ``"last"``.
    """
    if na_position not in {"first", "last"}:
        raise ValueError(f"invalid na_position: {na_position}")

    if (ascending and self._step < 0) or (
        not ascending and self._step > 0
    ):
        # Reversed positions run from the last valid position down to 0.
        # Starting at len(self) would emit one extra, out-of-bounds entry.
        return cupy.arange(len(self) - 1, -1, -1)
    else:
        return cupy.arange(len(self))

@_cudf_nvtx_annotate
def where(self, cond, other=None, inplace=False):
return self._as_int_index().where(cond, other, inplace)

@_cudf_nvtx_annotate
def to_numpy(self):
def to_numpy(self) -> np.ndarray:
return self.values_host

@_cudf_nvtx_annotate
def to_arrow(self):
return self._as_int_index().to_arrow()
def to_cupy(self) -> cupy.ndarray:
return self.values

@_cudf_nvtx_annotate
def to_arrow(self) -> pa.Array:
return pa.array(self._range)

def __array__(self, dtype=None):
raise TypeError(
Expand All @@ -881,17 +903,17 @@ def __array__(self, dtype=None):
)

@_cudf_nvtx_annotate
def nunique(self):
def nunique(self) -> int:
return len(self)

@_cudf_nvtx_annotate
def isna(self):
def isna(self) -> cupy.ndarray:
return cupy.zeros(len(self), dtype=bool)

isnull = isna

@_cudf_nvtx_annotate
def notna(self) -> cupy.ndarray:
    """Return a boolean cupy array of ``len(self)`` entries, all True."""
    return cupy.ones(len(self), dtype=bool)

# Alias of notna. It was previously bound to isna, which made notnull()
# return the inverse (all-False) mask.
notnull = notna
Expand All @@ -915,12 +937,15 @@ def max(self):
return self._minmax("max")

@property
def values(self):
def values(self) -> cupy.ndarray:
return cupy.arange(self.start, self.stop, self.step)

def any(self) -> bool:
    """Return True if at least one value of the underlying range is non-zero."""
    # Walk the range lazily and stop at the first truthy value.
    for value in self._range:
        if value:
            return True
    return False

def all(self) -> bool:
    """Return True if every value of the underlying range is non-zero.

    Zero is the only falsy integer, so this is a membership test on the
    range. An empty range yields True.
    """
    contains_zero = 0 in self._range
    return not contains_zero

def append(self, other):
result = self._as_int_index().append(other)
return self._try_reconstruct_range_index(result)
Expand All @@ -946,14 +971,20 @@ def isin(self, values):

return self._values.isin(values).values

def __pos__(self) -> Self:
    """Unary plus leaves every value unchanged, so return a copy."""
    return self.copy()

def __neg__(self) -> Self:
    """Unary minus, built from the negated start, stop and step."""
    negated = range(-self.start, -self.stop, -self.step)
    return type(self)(negated, name=self.name)

def __abs__(self) -> Self | Index:
    """Absolute value of the index.

    The result stays a range when all values share a sign (or the index
    is empty). A range that crosses zero falls back to the integer index
    produced by ``_as_int_index()``.
    """
    # Empty or entirely non-negative: the values are already absolute.
    if len(self) == 0 or self.min() >= 0:
        return self.copy()
    # Entirely non-positive: negating the range gives the absolute values.
    if self.max() <= 0:
        return -self
    # Mixed signs: delegate to the integer index.
    return abs(self._as_int_index())

@_warn_no_dask_cudf
def __dask_tokenize__(self):
Expand Down
23 changes: 23 additions & 0 deletions python/cudf/cudf/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -3176,3 +3176,26 @@ def test_index_to_pandas_arrow_type(scalar):
result = idx.to_pandas(arrow_type=True)
expected = pd.Index(pd.arrays.ArrowExtensionArray(pa_array))
pd.testing.assert_index_equal(result, expected)


@pytest.mark.parametrize("data", [range(-3, 3), range(1, 3), range(0)])
def test_rangeindex_all(data):
    """RangeIndex.all() must agree with an Index built from the same values.

    The cases cover a range containing zero, one without zero, and an
    empty range.
    """
    range_index = cudf.RangeIndex(data)
    got = range_index.all()
    materialized_index = cudf.Index(list(data))
    want = materialized_index.all()
    assert got == want


@pytest.mark.parametrize("sort", [True, False])
@pytest.mark.parametrize("data", [range(2), range(2, -1, -1)])
def test_rangeindex_factorize(sort, data):
    """RangeIndex.factorize() must match factorizing the equivalent Index.

    Both an increasing and a decreasing range are checked, with and
    without sorting.
    """
    got = cudf.RangeIndex(data).factorize(sort=sort)
    want = cudf.Index(list(data)).factorize(sort=sort)
    got_codes, got_uniques = got
    want_codes, want_uniques = want
    assert_eq(got_codes, want_codes)
    assert_eq(got_uniques, want_uniques)


def test_rangeindex_dropna():
    """dropna() on a RangeIndex must compare equal to a copy of it."""
    index = cudf.RangeIndex(range(2))
    dropped = index.dropna()
    assert_eq(dropped, index.copy())
Loading