Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor Index search to simplify code and increase correctness #13625

Merged
merged 15 commits into from
Jun 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 113 additions & 2 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1432,7 +1432,94 @@ def rename(self, name, inplace=False):
out.name = name
return out

def get_slice_bound(self, label, side, kind=None):
def _indices_of(self, value) -> cudf.core.column.NumericalColumn:
"""
Return indices corresponding to value

Parameters
----------
value
Value to look for in index

Returns
-------
Column of indices
"""
raise NotImplementedError

def find_label_range(self, loc: slice) -> slice:
    """
    Convert a slice of labels into a slice of integer positions.

    Parameters
    ----------
    loc
        Label-based slice to look up.

    Returns
    -------
    Slice over integer positions of the index (right-open).

    Notes
    -----
    Label-based slices are right-closed, so the stop label itself is
    included in the result. Bounds are resolved through
    ``self.get_slice_bound``.
    """
    step = loc.step if loc.step is not None else 1
    backwards = step < 0
    # Walking backwards swaps which side of each label bounds the range.
    start_side = "right" if backwards else "left"
    stop_side = "left" if backwards else "right"

    istart = loc.start
    if istart is not None:
        istart = self.get_slice_bound(istart, side=start_side)
    istop = loc.stop
    if istop is not None:
        istop = self.get_slice_bound(istop, side=stop_side)

    if backwards:
        # Fencepost: shift both bounds down by one. The start is clamped
        # at 0, and a stop that would fall below 0 becomes None.
        if istart is not None:
            istart = max(istart - 1, 0)
        if istop is not None:
            istop = None if istop == 0 else istop - 1
    return slice(istart, istop, step)

def searchsorted(
    self,
    value,
    side: str = "left",
    ascending: bool = True,
    na_position: str = "last",
):
    """Find the position at which ``value`` would be inserted to keep order

    Parameters
    ----------
    value :
        Value to be hypothetically inserted into Self
    side : str {'left', 'right'} optional, default 'left'
        'left' gives the first suitable insertion location,
        'right' gives the last one
    ascending : bool optional, default True
        Whether the index is sorted ascending (otherwise descending)
    na_position : str {'last', 'first'} optional, default 'last'
        Where null values sit in the sorted order

    Returns
    -------
    Insertion point.

    Raises
    ------
    NotImplementedError
        Always, in this implementation.

    Notes
    -----
    The index must already be sorted in the order indicated by the
    `ascending` flag.
    """
    raise NotImplementedError

def get_slice_bound(self, label, side: str, kind=None) -> int:
    """
    Calculate slice bound that corresponds to given label.

    Returns leftmost (one-past-the-rightmost if ``side=='right'``)
    position of given label.

    Parameters
    ----------
    label
        Label to locate in the index.
    side : {'left', 'right'}
        Which bound of the label to return.
    kind
        Deprecated. Any value other than None emits a FutureWarning and
        is otherwise ignored.

    Returns
    -------
    int
        Index of label.

    Raises
    ------
    ValueError
        If ``side`` is neither 'left' nor 'right'.
    KeyError
        If the index is not monotonic and ``label`` is either absent or
        found at more than one position.
    """
    if kind is not None:
        warnings.warn(
            "'kind' argument in get_slice_bound is deprecated and will be "
            "removed in a future version.",
            FutureWarning,
        )
    if side not in {"left", "right"}:
        raise ValueError(f"Invalid side argument {side}")

    ascending = self.is_monotonic_increasing
    if ascending or self.is_monotonic_decreasing:
        # Sorted index: delegate to searchsorted in the matching direction.
        return self.searchsorted(label, side=side, ascending=ascending)

    # Unsorted index: a bound is only well defined for a label that
    # occurs exactly once.
    try:
        left, right = self._values._find_first_and_last(label)
    except ValueError as err:
        raise KeyError(f"{label=} not in index") from err
    if left != right:
        raise KeyError(
            f"Cannot get slice bound for non-unique label {label=}"
        )
    return left if side == "left" else right + 1

def __array_function__(self, func, types, args, kwargs):
# check if the function is implemented for the current type
Expand Down
17 changes: 4 additions & 13 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1286,19 +1286,10 @@ def fillna(

return result

def find_first_value(
    self, value: ScalarLike, closest: bool = False
) -> int:
    """
    Return the offset of the first entry matching ``value``.

    ``value`` is run through ``self._encode`` and the lookup is delegated
    to ``self.as_numerical``. ``closest`` is accepted but not used here.
    """
    lookup = self.as_numerical.find_first_value
    encoded = self._encode(value)
    return lookup(encoded)

def find_last_value(self, value: ScalarLike, closest: bool = False) -> int:
    """
    Return the offset of the last entry matching ``value``.

    ``value`` is run through ``self._encode`` and the lookup is delegated
    to ``self.as_numerical``. ``closest`` is accepted but not used here.
    """
    lookup = self.as_numerical.find_last_value
    encoded = self._encode(value)
    return lookup(encoded)
def indices_of(
    self, value: ScalarLike
) -> cudf.core.column.NumericalColumn:
    """
    Return the indices at which ``value`` occurs.

    ``value`` is run through ``self._encode`` and the search is delegated
    to ``self.as_numerical.indices_of``.
    """
    search = self.as_numerical.indices_of
    encoded = self._encode(value)
    return search(encoded)

@property
def is_monotonic_increasing(self) -> bool:
Expand Down
122 changes: 70 additions & 52 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
drop_nulls,
)
from cudf._lib.transform import bools_to_mask
from cudf._lib.types import size_type_dtype
from cudf._typing import ColumnLike, Dtype, ScalarLike
from cudf.api.types import (
_is_non_decimal_numeric_dtype,
Expand Down Expand Up @@ -734,29 +735,79 @@ def notnull(self) -> ColumnBase:

return result

def indices_of(
    self, value: ScalarLike | Self
) -> cudf.core.column.NumericalColumn:
    """
    Find locations of value in the column

    Parameters
    ----------
    value
        Scalar to look for (cast to dtype of column), or a length-1 column

    Returns
    -------
    Column of indices that match value
    """
    if not isinstance(value, ColumnBase):
        # Wrap a scalar in a one-element column of this column's dtype.
        value = as_column([value], dtype=self.dtype)
    else:
        assert len(value) == 1
    # NOTE(review): presumably a boolean mask with one entry per row of
    # ``self``, true where the row matches ``value`` - confirm against
    # libcudf.search.contains.
    mask = libcudf.search.contains(value, self)
    # Keep only the row numbers selected by the mask.
    return apply_boolean_mask(
        [arange(0, len(self), dtype=size_type_dtype)], mask
    )[0]

def _find_first_and_last(self, value: ScalarLike) -> Tuple[int, int]:
indices = self.indices_of(value)
if n := len(indices):
return (
indices.element_indexing(0),
indices.element_indexing(n - 1),
)
else:
raise ValueError(f"Value {value} not found in column")

def find_first_value(self, value: ScalarLike) -> int:
    """
    Return index of first value that matches

    Parameters
    ----------
    value
        Value to search for (cast to dtype of column)

    Returns
    -------
    Index of value

    Raises
    ------
    ValueError if value is not found
    """
    # Only the first of the (first, last) pair is needed here.
    first, _ = self._find_first_and_last(value)
    return first

def find_last_value(self, value: ScalarLike) -> int:
    """
    Return index of last value that matches

    Parameters
    ----------
    value
        Value to search for (cast to dtype of column)

    Returns
    -------
    Index of value

    Raises
    ------
    ValueError if value is not found
    """
    # Only the last of the (first, last) pair is needed here.
    _, last = self._find_first_and_last(value)
    return last

def append(self, other: ColumnBase) -> ColumnBase:
return concat_columns([self, as_column(other)])
Expand Down Expand Up @@ -893,39 +944,6 @@ def is_monotonic_decreasing(self) -> bool:
ascending=[False], null_position=None
)

def get_slice_bound(self, label: ScalarLike, side: str, kind: str) -> int:
    """
    Calculate slice bound that corresponds to given label.

    Returns leftmost (one-past-the-rightmost if ``side=='right'``) position
    of given label.

    Parameters
    ----------
    label : Scalar
    side : {'left', 'right'}
    kind : {'ix', 'loc', 'getitem'}
        Validated but not otherwise used. ``None`` is also accepted.

    Returns
    -------
    int
        ``self.find_first_value(label)`` for ``side='left'``, or
        ``self.find_last_value(label) + 1`` for ``side='right'``.

    Raises
    ------
    ValueError
        If ``kind`` or ``side`` is not one of the accepted values.
    """
    if kind not in {"ix", "loc", "getitem", None}:
        raise ValueError(
            f"Invalid value for ``kind`` parameter,"
            f" must be either one of the following: "
            f"{'ix', 'loc', 'getitem', None}, but found: {kind}"
        )
    if side not in {"left", "right"}:
        raise ValueError(
            "Invalid value for side kwarg,"
            " must be either 'left' or 'right': %s" % (side,)
        )

    # TODO: Handle errors/missing keys correctly
    # Not currently using `kind` argument.
    # ``side`` was validated above, so exactly two cases remain.
    if side == "left":
        return self.find_first_value(label, closest=True)
    return self.find_last_value(label, closest=True) + 1

def sort_values(
self: ColumnBase,
ascending: bool = True,
Expand Down
26 changes: 6 additions & 20 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,27 +470,13 @@ def fillna(

return super().fillna(fill_value, method)

def find_first_value(
    self, value: ScalarLike, closest: bool = False
) -> int:
    """
    Return the offset of the first entry matching ``value``.

    ``value`` is parsed with ``pd.to_datetime``, turned into a column of
    this column's dtype, and its numerical element is looked up through
    ``self.as_numerical.find_first_value``; ``closest`` is forwarded to it.
    """
    timestamp = pd.to_datetime(value)
    typed = column.as_column(timestamp, dtype=self.dtype)
    numeric_value = typed.as_numerical.element_indexing(0)
    return self.as_numerical.find_first_value(numeric_value, closest=closest)

def find_last_value(self, value: ScalarLike, closest: bool = False) -> int:
    """
    Returns offset of last value that matches

    ``value`` is parsed with ``pd.to_datetime``, turned into a column of
    this column's dtype, and its numerical element is looked up through
    ``self.as_numerical.find_last_value``; ``closest`` is forwarded to it.
    """
    value = pd.to_datetime(value)
    value = column.as_column(
        value, dtype=self.dtype
    ).as_numerical.element_indexing(0)
    return self.as_numerical.find_last_value(value, closest=closest)

def indices_of(
    self, value: ScalarLike
) -> cudf.core.column.NumericalColumn:
    """
    Return the indices at which ``value`` occurs.

    ``value`` is parsed with ``pd.to_datetime`` and turned into a column
    of this column's dtype. The search then runs on the numerical views
    of both that column and ``self``.
    """
    value = column.as_column(
        pd.to_datetime(value), dtype=self.dtype
    ).as_numerical
    return self.as_numerical.indices_of(value)

@property
def is_unique(self) -> bool:
Expand Down
Loading