Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace RangeIndex._start/_stop/_step with _range #15576

Merged
merged 9 commits into from
Apr 24, 2024
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5535,8 +5535,8 @@ def to_arrow(self, preserve_index=None):
descr = {
"kind": "range",
"name": index.name,
"start": index._start,
"stop": index._stop,
"start": index.start,
"stop": index.stop,
"step": 1,
}
else:
Expand Down
123 changes: 49 additions & 74 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
is_integer,
is_list_like,
is_scalar,
is_signed_integer_dtype,
)
from cudf.core._base_index import BaseIndex
from cudf.core._compat import PANDAS_LT_300
Expand Down Expand Up @@ -156,6 +155,15 @@ def _index_from_columns(
return _index_from_data(dict(zip(range(len(columns)), columns)), name=name)


def validate_range_arg(arg, arg_name: Literal["start", "stop", "step"]) -> int:
"""Validate start/stop/step argument in RangeIndex.__init__"""
if not is_integer(arg):
raise TypeError(
f"{arg_name} must be an integer, not {type(arg).__name__}"
)
return int(arg)


class RangeIndex(BaseIndex, BinaryOperand):
"""
Immutable Index implementing a monotonic integer range.
Expand Down Expand Up @@ -204,44 +212,24 @@ class RangeIndex(BaseIndex, BinaryOperand):
def __init__(
self, start, stop=None, step=1, dtype=None, copy=False, name=None
):
if step == 0:
raise ValueError("Step must not be zero.")
vyasr marked this conversation as resolved.
Show resolved Hide resolved
if not cudf.api.types.is_hashable(name):
raise ValueError("Name must be a hashable value.")
if dtype is not None and not is_signed_integer_dtype(dtype):
self._name = name
if dtype is not None and cudf.dtype(dtype).kind != "i":
raise ValueError(f"{dtype=} must be a signed integer type")

if isinstance(start, range):
therange = start
start = therange.start
stop = therange.stop
step = therange.step
if stop is None:
start, stop = 0, start
if not is_integer(start):
raise TypeError(
f"start must be an integer, not {type(start).__name__}"
)
self._start = int(start)
if not is_integer(stop):
raise TypeError(
f"stop must be an integer, not {type(stop).__name__}"
)
self._stop = int(stop)
if step is not None:
if not is_integer(step):
raise TypeError(
f"step must be an integer, not {type(step).__name__}"
)
self._step = int(step)
self._range = start
else:
self._step = 1
self._index = None
self._name = name
self._range = range(self._start, self._stop, self._step)
# _end is the actual last element of RangeIndex,
# whereas _stop is an upper bound.
self._end = self._start + self._step * (len(self._range) - 1)
if stop is None:
start, stop = 0, start
start = validate_range_arg(start, "start")
stop = validate_range_arg(stop, "stop")
if step is not None:
step = validate_range_arg(step, "step")
else:
step = 1
self._range = range(start, stop, step)

def _copy_type_metadata(
self, other: RangeIndex, *, override_dtypes=None
Expand All @@ -258,9 +246,9 @@ def searchsorted(
na_position: Literal["first", "last"] = "last",
):
assert (len(self) <= 1) or (
ascending == (self._step > 0)
ascending == (self.step > 0)
), "Invalid ascending flag"
return search_range(value, self.as_range, side=side)
return search_range(value, self._range, side=side)

@property # type: ignore
@_cudf_nvtx_annotate
Expand All @@ -278,23 +266,23 @@ def start(self):
"""
The value of the `start` parameter (0 if this was not supplied).
"""
return self._start
return self._range.start

@property # type: ignore
@_cudf_nvtx_annotate
def stop(self):
"""
The value of the stop parameter.
"""
return self._stop
return self._range.stop

@property # type: ignore
@_cudf_nvtx_annotate
def step(self):
"""
The value of the step parameter.
"""
return self._step
return self._range.step

@property # type: ignore
@_cudf_nvtx_annotate
Expand Down Expand Up @@ -375,9 +363,7 @@ def copy(self, name=None, deep=False):

name = self.name if name is None else name

return RangeIndex(
start=self._start, stop=self._stop, step=self._step, name=name
)
return RangeIndex(self._range, name=name)

@_cudf_nvtx_annotate
def astype(self, dtype, copy: bool = True):
Expand All @@ -396,8 +382,8 @@ def duplicated(self, keep="first"):
@_cudf_nvtx_annotate
def __repr__(self):
return (
f"{self.__class__.__name__}(start={self._start}, stop={self._stop}"
f", step={self._step}"
f"{self.__class__.__name__}(start={self.start}, stop={self.stop}"
f", step={self.step}"
+ (
f", name={pd.io.formats.printing.default_pprint(self.name)}"
if self.name is not None
Expand All @@ -408,16 +394,16 @@ def __repr__(self):

@_cudf_nvtx_annotate
def __len__(self):
return len(range(self._start, self._stop, self._step))
return len(self._range)

@_cudf_nvtx_annotate
def __getitem__(self, index):
if isinstance(index, slice):
sl_start, sl_stop, sl_step = index.indices(len(self))

lo = self._start + sl_start * self._step
hi = self._start + sl_stop * self._step
st = self._step * sl_step
lo = self.start + sl_start * self.step
hi = self.start + sl_stop * self.step
st = self.step * sl_step
return RangeIndex(start=lo, stop=hi, step=st, name=self._name)

elif isinstance(index, Number):
Expand All @@ -426,18 +412,13 @@ def __getitem__(self, index):
index += len_self
if not (0 <= index < len_self):
raise IndexError("Index out of bounds")
return self._start + index * self._step
return self.start + index * self.step
return self._as_int_index()[index]

@_cudf_nvtx_annotate
def equals(self, other):
if isinstance(other, RangeIndex):
if (self._start, self._stop, self._step) == (
other._start,
other._stop,
other._step,
):
return True
return self._range == other._range
return self._as_int_index().equals(other)

@_cudf_nvtx_annotate
Expand All @@ -449,9 +430,9 @@ def serialize(self):
# We don't need to store the GPU buffer for RangeIndexes
# cuDF only needs to store start/stop and rehydrate
# during de-serialization
header["index_column"]["start"] = self._start
header["index_column"]["stop"] = self._stop
header["index_column"]["step"] = self._step
header["index_column"]["start"] = self.start
header["index_column"]["stop"] = self.stop
header["index_column"]["step"] = self.step
frames = []

header["name"] = pickle.dumps(self.name)
Expand Down Expand Up @@ -491,9 +472,9 @@ def to_pandas(
elif arrow_type:
raise NotImplementedError(f"{arrow_type=} is not implemented.")
return pd.RangeIndex(
start=self._start,
stop=self._stop,
step=self._step,
start=self.start,
stop=self.stop,
step=self.step,
dtype=self.dtype,
name=self.name,
)
Expand All @@ -502,19 +483,15 @@ def to_pandas(
def is_unique(self):
return True

@cached_property
def as_range(self):
return range(self._start, self._stop, self._step)

@cached_property # type: ignore
@_cudf_nvtx_annotate
def is_monotonic_increasing(self):
return self._step > 0 or len(self) <= 1
return self.step > 0 or len(self) <= 1

@cached_property # type: ignore
@_cudf_nvtx_annotate
def is_monotonic_decreasing(self):
return self._step < 0 or len(self) <= 1
return self.step < 0 or len(self) <= 1

@_cudf_nvtx_annotate
def memory_usage(self, deep=False):
Expand Down Expand Up @@ -597,12 +574,12 @@ def get_indexer(self, target, limit=None, method=None, tolerance=None):
def get_loc(self, key):
if not is_scalar(key):
raise TypeError("Should be a scalar-like")
idx = (key - self._start) / self._step
idx_int_upper_bound = (self._stop - self._start) // self._step
idx = (key - self.start) / self.step
idx_int_upper_bound = (self.stop - self.start) // self.step
if idx > idx_int_upper_bound or idx < 0:
raise KeyError(key)

idx_int = (key - self._start) // self._step
idx_int = (key - self.start) // self.step
if idx_int != idx:
raise KeyError(key)
return idx_int
Expand All @@ -614,9 +591,9 @@ def _union(self, other, sort=None):
# following notation: *_o -> other, *_s -> self,
# and *_r -> result
start_s, step_s = self.start, self.step
end_s = self._end
end_s = self.start + self.step * (len(self) - 1)
start_o, step_o = other.start, other.step
end_o = other._end
end_o = other.start + other.step * (len(other) - 1)
if self.step < 0:
start_s, step_s, end_s = end_s, -step_s, start_s
if other.step < 0:
Expand Down Expand Up @@ -861,9 +838,7 @@ def argsort(
raise ValueError(f"invalid na_position: {na_position}")

indices = cupy.arange(0, len(self))
if (ascending and self._step < 0) or (
not ascending and self._step > 0
):
if (ascending and self.step < 0) or (not ascending and self.step > 0):
indices = indices[::-1]
return indices

Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1606,7 +1606,7 @@ def test_rangeindex_name_not_hashable():
def test_index_rangeindex_search_range():
# step > 0
ridx = RangeIndex(-13, 17, 4)
ri = ridx.as_range
ri = ridx._range
for i in range(len(ridx)):
assert i == search_range(ridx[i], ri, side="left")
assert i + 1 == search_range(ridx[i], ri, side="right")
Expand Down
Loading