From e20eb94aedf5c8cc5c3f5ce3405e0dab1ace6f63 Mon Sep 17 00:00:00 2001 From: Sheilah Kirui <71867292+skirui-source@users.noreply.github.com> Date: Thu, 6 Oct 2022 15:15:17 -0700 Subject: [PATCH] part1: Simplify BaseIndex to an abstract class (#10389) This PR is in response to @vyasr comment, as partial fix for PR https://github.com/rapidsai/cudf/issues/9593 : `BaseIndex `should be reduced as closely as possible to an abstract class. While there are a subset of APIs that truly make sense for all types of index objects, in almost all cases the optimal implementation for `RangeIndex `(and `MultiIndex`, for that matter) is very different from the implementation for `GenericIndex`. In addition, this change reduces cognitive load for developers by simplifying the inheritance hierarchy Authors: - Sheilah Kirui (https://github.com/skirui-source) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/10389 --- .../cudf/benchmarks/API/bench_rangeindex.py | 5 + python/cudf/cudf/core/_base_index.py | 240 ++++++++---------- python/cudf/cudf/core/column/categorical.py | 2 +- python/cudf/cudf/core/index.py | 110 +++++++- python/cudf/cudf/tests/test_index.py | 136 ++++++++-- 5 files changed, 332 insertions(+), 161 deletions(-) diff --git a/python/cudf/benchmarks/API/bench_rangeindex.py b/python/cudf/benchmarks/API/bench_rangeindex.py index 7b2baef9081..42de5a86b65 100644 --- a/python/cudf/benchmarks/API/bench_rangeindex.py +++ b/python/cudf/benchmarks/API/bench_rangeindex.py @@ -40,3 +40,8 @@ def bench_min(benchmark, rangeindex): def bench_where(benchmark, rangeindex): cond = rangeindex % 2 == 0 benchmark(rangeindex.where, cond, 0) + + +def bench_isin(benchmark, rangeindex): + values = [10, 100] + benchmark(rangeindex.isin, values) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 6898ae4941c..b73536558f1 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -27,10 +27,7 @@ from cudf.core.column import ColumnBase, column from cudf.core.column_accessor import ColumnAccessor from cudf.utils import ioutils -from cudf.utils.dtypes import ( - is_mixed_with_object_dtype, - numeric_normalize_types, -) +from cudf.utils.dtypes import is_mixed_with_object_dtype _index_astype_docstring = """\ Create an Index with values cast to dtypes. @@ -90,7 +87,7 @@ def size(self): @property def values(self): - return self._values.values + raise NotImplementedError def get_loc(self, key, method=None, tolerance=None): raise NotImplementedError @@ -188,12 +185,7 @@ def _clean_nulls_from_index(self): methods using this method to replace or handle representation of the actual types correctly. """ - if self._values.has_nulls(): - return cudf.Index( - self._values.astype("str").fillna(cudf._NA_REP), name=self.name - ) - else: - return self + raise NotImplementedError @property def is_monotonic(self): @@ -549,13 +541,11 @@ def to_frame(self, index=True, name=None): Set the index of the returned DataFrame as the original Index name : str, default None Name to be used for the column - Returns ------- DataFrame cudf DataFrame """ - if name is not None: col_name = name elif self.name is None: @@ -570,7 +560,40 @@ def any(self): """ Return whether any elements is True in Index. """ - return self._values.any() + raise NotImplementedError + + def isna(self): + """ + Detect missing values. + + Return a boolean same-sized object indicating if the values are NA. + NA values, such as ``None``, :attr:`numpy.NaN` or :attr:`cudf.NaN`, get + mapped to ``True`` values. + Everything else get mapped to ``False`` values. + + Returns + ------- + numpy.ndarray[bool] + A boolean array to indicate which entries are NA. + + """ + raise NotImplementedError + + def notna(self): + """ + Detect existing (non-missing) values. + + Return a boolean same-sized object indicating if the values are not NA. + Non-missing values get mapped to ``True``. + NA values, such as None or :attr:`numpy.NaN`, get mapped to ``False`` + values. + + Returns + ------- + numpy.ndarray[bool] + A boolean array to indicate which entries are not NA. + """ + raise NotImplementedError def to_pandas(self): """ @@ -589,7 +612,75 @@ def to_pandas(self): >>> type(idx) """ - return pd.Index(self._values.to_pandas(), name=self.name) + raise NotImplementedError + + def isin(self, values): + """Return a boolean array where the index values are in values. + + Compute boolean array of whether each index value is found in + the passed set of values. The length of the returned boolean + array matches the length of the index. + + Parameters + ---------- + values : set, list-like, Index + Sought values. + + Returns + ------- + is_contained : cupy array + CuPy array of boolean values. + + Examples + -------- + >>> idx = cudf.Index([1,2,3]) + >>> idx + Int64Index([1, 2, 3], dtype='int64') + + Check whether each index value in a list of values. + + >>> idx.isin([1, 4]) + array([ True, False, False]) + """ + # To match pandas behavior, even though only list-like objects are + # supposed to be passed, only scalars throw errors. Other types (like + # dicts) just transparently return False (see the implementation of + # ColumnBase.isin). + raise NotImplementedError + + def unique(self): + """ + Return unique values in the index. + + Returns + ------- + Index without duplicates + """ + raise NotImplementedError + + def to_series(self, index=None, name=None): + """ + Create a Series with both index and values equal to the index keys. + Useful with map for returning an indexer based on an index. + + Parameters + ---------- + index : Index, optional + Index of resulting Series. If None, defaults to original index. + name : str, optional + Name of resulting Series. If None, defaults to name of original + index. + + Returns + ------- + Series + The dtype will be based on the type of the Index values. + """ + return cudf.Series._from_data( + self._data, + index=self.copy(deep=False) if index is None else index, + name=self.name if name is None else name, + ) @ioutils.doc_to_dlpack() def to_dlpack(self): @@ -599,7 +690,7 @@ def to_dlpack(self): def append(self, other): """ - Append a collection of Index options together. + Append a collection of Index objects together. Parameters ---------- @@ -626,45 +717,7 @@ def append(self, other): >>> idx.append([other, other]) Int64Index([1, 2, 10, 100, 200, 400, 50, 200, 400, 50], dtype='int64') """ - - if is_list_like(other): - to_concat = [self] - to_concat.extend(other) - else: - this = self - if len(other) == 0: - # short-circuit and return a copy - to_concat = [self] - - other = cudf.Index(other) - - if len(self) == 0: - to_concat = [other] - - if len(self) and len(other): - if is_mixed_with_object_dtype(this, other): - got_dtype = ( - other.dtype - if this.dtype == cudf.dtype("object") - else this.dtype - ) - raise TypeError( - f"cudf does not support appending an Index of " - f"dtype `{cudf.dtype('object')}` with an Index " - f"of dtype `{got_dtype}`, please type-cast " - f"either one of them to same dtypes." - ) - - if isinstance(self._values, cudf.core.column.NumericalColumn): - if self.dtype != other.dtype: - this, other = numeric_normalize_types(self, other) - to_concat = [this, other] - - for obj in to_concat: - if not isinstance(obj, BaseIndex): - raise TypeError("all inputs must be Index") - - return self._concat(to_concat) + raise NotImplementedError def difference(self, other, sort=None): """ @@ -1119,18 +1172,6 @@ def sort_values( else: return index_sorted - def unique(self): - """ - Return unique values in the index. - - Returns - ------- - Index without duplicates - """ - return cudf.core.index._index_from_data( - {self.name: self._values.unique()}, name=self.name - ) - def join( self, other, how="left", level=None, return_indexers=False, sort=False ): @@ -1263,30 +1304,6 @@ def rename(self, name, inplace=False): out.name = name return out - def to_series(self, index=None, name=None): - """ - Create a Series with both index and values equal to the index keys. - Useful with map for returning an indexer based on an index. - - Parameters - ---------- - index : Index, optional - Index of resulting Series. If None, defaults to original index. - name : str, optional - Dame of resulting Series. If None, defaults to name of original - index. - - Returns - ------- - Series - The dtype will be based on the type of the Index values. - """ - return cudf.Series( - self._values, - index=self.copy(deep=False) if index is None else index, - name=self.name if name is None else name, - ) - def get_slice_bound(self, label, side, kind=None): """ Calculate slice bound that corresponds to given label. @@ -1339,47 +1356,6 @@ def __array_function__(self, func, types, args, kwargs): else: return NotImplemented - def isin(self, values): - """Return a boolean array where the index values are in values. - - Compute boolean array of whether each index value is found in - the passed set of values. The length of the returned boolean - array matches the length of the index. - - Parameters - ---------- - values : set, list-like, Index - Sought values. - - Returns - ------- - is_contained : cupy array - CuPy array of boolean values. - - Examples - -------- - >>> idx = cudf.Index([1,2,3]) - >>> idx - Int64Index([1, 2, 3], dtype='int64') - - Check whether each index value in a list of values. - - >>> idx.isin([1, 4]) - array([ True, False, False]) - """ - - # To match pandas behavior, even though only list-like objects are - # supposed to be passed, only scalars throw errors. Other types (like - # dicts) just transparently return False (see the implementation of - # ColumnBase.isin). - if is_scalar(values): - raise TypeError( - "only list-like objects are allowed to be passed " - f"to isin(), you passed a {type(values).__name__}" - ) - - return self._values.isin(values).values - @classmethod def from_pandas(cls, index, nan_as_null=None): """ diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 601ad707ba6..af5d140a20a 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -104,7 +104,7 @@ def __init__(self, parent: SeriesOrSingleColumnIndex): super().__init__(parent=parent) @property - def categories(self) -> "cudf.core.index.BaseIndex": + def categories(self) -> "cudf.core.index.GenericIndex": """ The categories of this categorical. """ diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 3734893627f..3d77ed15027 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -33,6 +33,8 @@ is_categorical_dtype, is_dtype_equal, is_interval_dtype, + is_list_like, + is_scalar, is_string_dtype, ) from cudf.core._base_index import BaseIndex, _index_astype_docstring @@ -55,7 +57,12 @@ from cudf.core.mixins import BinaryOperand from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils.docutils import copy_docstring, doc_apply -from cudf.utils.dtypes import _maybe_convert_to_default_type, find_common_type +from cudf.utils.dtypes import ( + _maybe_convert_to_default_type, + find_common_type, + is_mixed_with_object_dtype, + numeric_normalize_types, +) from cudf.utils.utils import _cudf_nvtx_annotate, search_range T = TypeVar("T", bound="Frame") @@ -243,6 +250,9 @@ def _values(self): else: return column.column_empty(0, masked=False, dtype=self.dtype) + def _clean_nulls_from_index(self): + return self + def is_numeric(self): return True @@ -867,6 +877,25 @@ def min(self): def max(self): return self._minmax("max") + @property + def values(self): + return cupy.arange(self.start, self.stop, self.step) + + def any(self): + return any(self._range) + + def append(self, other): + return self._as_int_index().append(other) + + def isin(self, values): + if is_scalar(values): + raise TypeError( + "only list-like objects are allowed to be passed " + f"to isin(), you passed a {type(values).__name__}" + ) + + return self._values.isin(values).values + def __neg__(self): return -self._as_int_index() @@ -1409,6 +1438,81 @@ def where(self, cond, other=None, inplace=False): inplace=inplace, ) + @property + def values(self): + return self._column.values + + def __contains__(self, item): + return item in self._values + + def _clean_nulls_from_index(self): + if self._values.has_nulls(): + return cudf.Index( + self._values.astype("str").fillna(cudf._NA_REP), name=self.name + ) + + return self + + def any(self): + return self._values.any() + + def to_pandas(self): + return pd.Index(self._values.to_pandas(), name=self.name) + + def append(self, other): + if is_list_like(other): + to_concat = [self] + to_concat.extend(other) + else: + this = self + if len(other) == 0: + # short-circuit and return a copy + to_concat = [self] + + other = cudf.Index(other) + + if len(self) == 0: + to_concat = [other] + + if len(self) and len(other): + if is_mixed_with_object_dtype(this, other): + got_dtype = ( + other.dtype + if this.dtype == cudf.dtype("object") + else this.dtype + ) + raise TypeError( + f"cudf does not support appending an Index of " + f"dtype `{cudf.dtype('object')}` with an Index " + f"of dtype `{got_dtype}`, please type-cast " + f"either one of them to same dtypes." + ) + + if isinstance(self._values, cudf.core.column.NumericalColumn): + if self.dtype != other.dtype: + this, other = numeric_normalize_types(self, other) + to_concat = [this, other] + + for obj in to_concat: + if not isinstance(obj, BaseIndex): + raise TypeError("all inputs must be Index") + + return self._concat(to_concat) + + def unique(self): + return cudf.core.index._index_from_data( + {self.name: self._values.unique()}, name=self.name + ) + + def isin(self, values): + if is_scalar(values): + raise TypeError( + "only list-like objects are allowed to be passed " + f"to isin(), you passed a {type(values).__name__}" + ) + + return self._values.isin(values).values + class NumericIndex(GenericIndex): """Immutable, ordered and sliceable sequence of labels. @@ -2796,10 +2900,6 @@ def str(self): return StringMethods(parent=self) def _clean_nulls_from_index(self): - """ - Convert all na values(if any) in Index object - to `` as a preprocessing step to `__repr__` methods. - """ if self._values.has_nulls(): return self.fillna(cudf._NA_REP) else: diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index e8c568979a3..358d5e2170e 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2537,32 +2537,20 @@ def rangeindex(request): return RangeIndex(request.param) -def test_rangeindex_nunique(rangeindex): - gidx = rangeindex - pidx = gidx.to_pandas() - - actual = gidx.nunique() - expected = pidx.nunique() - - assert_eq(expected, actual) - - -def test_rangeindex_min(rangeindex): - gidx = rangeindex - pidx = gidx.to_pandas() - - actual = gidx.min() - expected = pidx.min() - - assert_eq(expected, actual) - - -def test_rangeindex_max(rangeindex): +@pytest.mark.parametrize( + "func", + ["nunique", "min", "max", "any", "values"], +) +def test_rangeindex_methods(rangeindex, func): gidx = rangeindex pidx = gidx.to_pandas() - actual = gidx.max() - expected = pidx.max() + if func == "values": + expected = pidx.values + actual = gidx.values + else: + expected = getattr(pidx, func)() + actual = getattr(gidx, func)() assert_eq(expected, actual) @@ -2693,3 +2681,105 @@ def test_rangeindex_where_user_option(default_integer_bitwidth): dtype=f"int{default_integer_bitwidth}", ) assert_eq(expected, actual) + + +index_data = [ + range(np.random.randint(0, 100)), + range(0, 10, -2), + range(0, -10, 2), + range(0, -10, -2), + range(0, 1), + [1, 2, 3, 1, None, None], + [None, None, 3.2, 1, None, None], + [None, "a", "3.2", "z", None, None], + pd.Series(["a", "b", None], dtype="category"), + np.array([1, 2, 3, None], dtype="datetime64[s]"), +] + + +@pytest.fixture(params=index_data) +def index(request): + """Create a cudf Index of different dtypes""" + return cudf.Index(request.param) + + +@pytest.mark.parametrize( + "func", + [ + "to_series", + "isna", + "notna", + "append", + ], +) +def test_index_methods(index, func): + gidx = index + pidx = gidx.to_pandas() + + if func == "append": + expected = pidx.append(other=pidx) + actual = gidx.append(other=gidx) + else: + expected = getattr(pidx, func)() + actual = getattr(gidx, func)() + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "idx, values", + [ + (range(100, 1000, 10), [200, 600, 800]), + ([None, "a", "3.2", "z", None, None], ["a", "z"]), + (pd.Series(["a", "b", None], dtype="category"), [10, None]), + ], +) +def test_index_isin_values(idx, values): + gidx = cudf.Index(idx) + pidx = gidx.to_pandas() + + actual = gidx.isin(values) + expected = pidx.isin(values) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "idx, scalar", + [ + (range(0, -10, -2), -4), + ([None, "a", "3.2", "z", None, None], "x"), + (pd.Series(["a", "b", None], dtype="category"), 10), + ], +) +def test_index_isin_scalar_values(idx, scalar): + gidx = cudf.Index(idx) + + with pytest.raises( + TypeError, + match=re.escape( + f"only list-like objects are allowed to be passed " + f"to isin(), you passed a {type(scalar).__name__}" + ), + ): + gidx.isin(scalar) + + +def test_index_any(): + gidx = cudf.Index([1, 2, 3]) + pidx = gidx.to_pandas() + + assert_eq(pidx.any(), gidx.any()) + + +def test_index_values(): + gidx = cudf.Index([1, 2, 3]) + pidx = gidx.to_pandas() + + assert_eq(pidx.values, gidx.values) + + +def test_index_null_values(): + gidx = cudf.Index([1.0, None, 3, 0, None]) + with pytest.raises(ValueError): + gidx.values