Skip to content

Commit

Permalink
part1: Simplify BaseIndex to an abstract class (#10389)
Browse files Browse the repository at this point in the history
This PR is in response to @vyasr  comment, as partial fix for PR #9593 :
 
`BaseIndex `should be reduced as closely as possible to an abstract class. While there are a subset of APIs that truly make sense for all types of index objects, in almost all cases the optimal implementation for `RangeIndex `(and `MultiIndex`, for that matter) is very different from the implementation for `GenericIndex`. In addition, this change reduces cognitive load for developers by simplifying the inheritance hierarchy

Authors:
  - Sheilah Kirui (https://github.com/skirui-source)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #10389
  • Loading branch information
skirui-source authored Oct 6, 2022
1 parent 1ef722d commit e20eb94
Show file tree
Hide file tree
Showing 5 changed files with 332 additions and 161 deletions.
5 changes: 5 additions & 0 deletions python/cudf/benchmarks/API/bench_rangeindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,8 @@ def bench_min(benchmark, rangeindex):
def bench_where(benchmark, rangeindex):
cond = rangeindex % 2 == 0
benchmark(rangeindex.where, cond, 0)


def bench_isin(benchmark, rangeindex):
values = [10, 100]
benchmark(rangeindex.isin, values)
240 changes: 108 additions & 132 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,7 @@
from cudf.core.column import ColumnBase, column
from cudf.core.column_accessor import ColumnAccessor
from cudf.utils import ioutils
from cudf.utils.dtypes import (
is_mixed_with_object_dtype,
numeric_normalize_types,
)
from cudf.utils.dtypes import is_mixed_with_object_dtype

_index_astype_docstring = """\
Create an Index with values cast to dtypes.
Expand Down Expand Up @@ -90,7 +87,7 @@ def size(self):

@property
def values(self):
return self._values.values
raise NotImplementedError

def get_loc(self, key, method=None, tolerance=None):
raise NotImplementedError
Expand Down Expand Up @@ -188,12 +185,7 @@ def _clean_nulls_from_index(self):
methods using this method to replace or handle representation
of the actual types correctly.
"""
if self._values.has_nulls():
return cudf.Index(
self._values.astype("str").fillna(cudf._NA_REP), name=self.name
)
else:
return self
raise NotImplementedError

@property
def is_monotonic(self):
Expand Down Expand Up @@ -549,13 +541,11 @@ def to_frame(self, index=True, name=None):
Set the index of the returned DataFrame as the original Index
name : str, default None
Name to be used for the column
Returns
-------
DataFrame
cudf DataFrame
"""

if name is not None:
col_name = name
elif self.name is None:
Expand All @@ -570,7 +560,40 @@ def any(self):
"""
Return whether any elements is True in Index.
"""
return self._values.any()
raise NotImplementedError

def isna(self):
"""
Detect missing values.
Return a boolean same-sized object indicating if the values are NA.
NA values, such as ``None``, :attr:`numpy.NaN` or :attr:`cudf.NaN`, get
mapped to ``True`` values.
Everything else get mapped to ``False`` values.
Returns
-------
numpy.ndarray[bool]
A boolean array to indicate which entries are NA.
"""
raise NotImplementedError

def notna(self):
"""
Detect existing (non-missing) values.
Return a boolean same-sized object indicating if the values are not NA.
Non-missing values get mapped to ``True``.
NA values, such as None or :attr:`numpy.NaN`, get mapped to ``False``
values.
Returns
-------
numpy.ndarray[bool]
A boolean array to indicate which entries are not NA.
"""
raise NotImplementedError

def to_pandas(self):
"""
Expand All @@ -589,7 +612,75 @@ def to_pandas(self):
>>> type(idx)
<class 'cudf.core.index.Int64Index'>
"""
return pd.Index(self._values.to_pandas(), name=self.name)
raise NotImplementedError

def isin(self, values):
"""Return a boolean array where the index values are in values.
Compute boolean array of whether each index value is found in
the passed set of values. The length of the returned boolean
array matches the length of the index.
Parameters
----------
values : set, list-like, Index
Sought values.
Returns
-------
is_contained : cupy array
CuPy array of boolean values.
Examples
--------
>>> idx = cudf.Index([1,2,3])
>>> idx
Int64Index([1, 2, 3], dtype='int64')
Check whether each index value in a list of values.
>>> idx.isin([1, 4])
array([ True, False, False])
"""
# To match pandas behavior, even though only list-like objects are
# supposed to be passed, only scalars throw errors. Other types (like
# dicts) just transparently return False (see the implementation of
# ColumnBase.isin).
raise NotImplementedError

def unique(self):
"""
Return unique values in the index.
Returns
-------
Index without duplicates
"""
raise NotImplementedError

def to_series(self, index=None, name=None):
"""
Create a Series with both index and values equal to the index keys.
Useful with map for returning an indexer based on an index.
Parameters
----------
index : Index, optional
Index of resulting Series. If None, defaults to original index.
name : str, optional
Name of resulting Series. If None, defaults to name of original
index.
Returns
-------
Series
The dtype will be based on the type of the Index values.
"""
return cudf.Series._from_data(
self._data,
index=self.copy(deep=False) if index is None else index,
name=self.name if name is None else name,
)

@ioutils.doc_to_dlpack()
def to_dlpack(self):
Expand All @@ -599,7 +690,7 @@ def to_dlpack(self):

def append(self, other):
"""
Append a collection of Index options together.
Append a collection of Index objects together.
Parameters
----------
Expand All @@ -626,45 +717,7 @@ def append(self, other):
>>> idx.append([other, other])
Int64Index([1, 2, 10, 100, 200, 400, 50, 200, 400, 50], dtype='int64')
"""

if is_list_like(other):
to_concat = [self]
to_concat.extend(other)
else:
this = self
if len(other) == 0:
# short-circuit and return a copy
to_concat = [self]

other = cudf.Index(other)

if len(self) == 0:
to_concat = [other]

if len(self) and len(other):
if is_mixed_with_object_dtype(this, other):
got_dtype = (
other.dtype
if this.dtype == cudf.dtype("object")
else this.dtype
)
raise TypeError(
f"cudf does not support appending an Index of "
f"dtype `{cudf.dtype('object')}` with an Index "
f"of dtype `{got_dtype}`, please type-cast "
f"either one of them to same dtypes."
)

if isinstance(self._values, cudf.core.column.NumericalColumn):
if self.dtype != other.dtype:
this, other = numeric_normalize_types(self, other)
to_concat = [this, other]

for obj in to_concat:
if not isinstance(obj, BaseIndex):
raise TypeError("all inputs must be Index")

return self._concat(to_concat)
raise NotImplementedError

def difference(self, other, sort=None):
"""
Expand Down Expand Up @@ -1119,18 +1172,6 @@ def sort_values(
else:
return index_sorted

def unique(self):
"""
Return unique values in the index.
Returns
-------
Index without duplicates
"""
return cudf.core.index._index_from_data(
{self.name: self._values.unique()}, name=self.name
)

def join(
self, other, how="left", level=None, return_indexers=False, sort=False
):
Expand Down Expand Up @@ -1263,30 +1304,6 @@ def rename(self, name, inplace=False):
out.name = name
return out

def to_series(self, index=None, name=None):
"""
Create a Series with both index and values equal to the index keys.
Useful with map for returning an indexer based on an index.
Parameters
----------
index : Index, optional
Index of resulting Series. If None, defaults to original index.
name : str, optional
Dame of resulting Series. If None, defaults to name of original
index.
Returns
-------
Series
The dtype will be based on the type of the Index values.
"""
return cudf.Series(
self._values,
index=self.copy(deep=False) if index is None else index,
name=self.name if name is None else name,
)

def get_slice_bound(self, label, side, kind=None):
"""
Calculate slice bound that corresponds to given label.
Expand Down Expand Up @@ -1339,47 +1356,6 @@ def __array_function__(self, func, types, args, kwargs):
else:
return NotImplemented

def isin(self, values):
"""Return a boolean array where the index values are in values.
Compute boolean array of whether each index value is found in
the passed set of values. The length of the returned boolean
array matches the length of the index.
Parameters
----------
values : set, list-like, Index
Sought values.
Returns
-------
is_contained : cupy array
CuPy array of boolean values.
Examples
--------
>>> idx = cudf.Index([1,2,3])
>>> idx
Int64Index([1, 2, 3], dtype='int64')
Check whether each index value in a list of values.
>>> idx.isin([1, 4])
array([ True, False, False])
"""

# To match pandas behavior, even though only list-like objects are
# supposed to be passed, only scalars throw errors. Other types (like
# dicts) just transparently return False (see the implementation of
# ColumnBase.isin).
if is_scalar(values):
raise TypeError(
"only list-like objects are allowed to be passed "
f"to isin(), you passed a {type(values).__name__}"
)

return self._values.isin(values).values

@classmethod
def from_pandas(cls, index, nan_as_null=None):
"""
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def __init__(self, parent: SeriesOrSingleColumnIndex):
super().__init__(parent=parent)

@property
def categories(self) -> "cudf.core.index.BaseIndex":
def categories(self) -> "cudf.core.index.GenericIndex":
"""
The categories of this categorical.
"""
Expand Down
Loading

0 comments on commit e20eb94

Please sign in to comment.