From e20eb94aedf5c8cc5c3f5ce3405e0dab1ace6f63 Mon Sep 17 00:00:00 2001
From: Sheilah Kirui <71867292+skirui-source@users.noreply.github.com>
Date: Thu, 6 Oct 2022 15:15:17 -0700
Subject: [PATCH] part1: Simplify BaseIndex to an abstract class (#10389)

This PR is in response to @vyasr  comment, as partial fix for PR https://github.com/rapidsai/cudf/issues/9593 :

`BaseIndex `should be reduced as closely as possible to an abstract class. While there are a subset of APIs that truly make sense for all types of index objects, in almost all cases the optimal implementation for `RangeIndex `(and `MultiIndex`, for that matter) is very different from the implementation for `GenericIndex`. In addition, this change reduces cognitive load for developers by simplifying the inheritance hierarchy

Authors:
  - Sheilah Kirui (https://github.com/skirui-source)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/10389
---
 .../cudf/benchmarks/API/bench_rangeindex.py   |   5 +
 python/cudf/cudf/core/_base_index.py          | 240 ++++++++----------
 python/cudf/cudf/core/column/categorical.py   |   2 +-
 python/cudf/cudf/core/index.py                | 110 +++++++-
 python/cudf/cudf/tests/test_index.py          | 136 ++++++++--
 5 files changed, 332 insertions(+), 161 deletions(-)

diff --git a/python/cudf/benchmarks/API/bench_rangeindex.py b/python/cudf/benchmarks/API/bench_rangeindex.py
index 7b2baef9081..42de5a86b65 100644
--- a/python/cudf/benchmarks/API/bench_rangeindex.py
+++ b/python/cudf/benchmarks/API/bench_rangeindex.py
@@ -40,3 +40,8 @@ def bench_min(benchmark, rangeindex):
 def bench_where(benchmark, rangeindex):
     cond = rangeindex % 2 == 0
     benchmark(rangeindex.where, cond, 0)
+
+
+def bench_isin(benchmark, rangeindex):
+    values = [10, 100]
+    benchmark(rangeindex.isin, values)
diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index 6898ae4941c..b73536558f1 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -27,10 +27,7 @@
 from cudf.core.column import ColumnBase, column
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.utils import ioutils
-from cudf.utils.dtypes import (
-    is_mixed_with_object_dtype,
-    numeric_normalize_types,
-)
+from cudf.utils.dtypes import is_mixed_with_object_dtype
 
 _index_astype_docstring = """\
 Create an Index with values cast to dtypes.
@@ -90,7 +87,7 @@ def size(self):
 
     @property
     def values(self):
-        return self._values.values
+        raise NotImplementedError
 
     def get_loc(self, key, method=None, tolerance=None):
         raise NotImplementedError
@@ -188,12 +185,7 @@ def _clean_nulls_from_index(self):
         methods using this method to replace or handle representation
         of the actual types correctly.
         """
-        if self._values.has_nulls():
-            return cudf.Index(
-                self._values.astype("str").fillna(cudf._NA_REP), name=self.name
-            )
-        else:
-            return self
+        raise NotImplementedError
 
     @property
     def is_monotonic(self):
@@ -549,13 +541,11 @@ def to_frame(self, index=True, name=None):
             Set the index of the returned DataFrame as the original Index
         name : str, default None
             Name to be used for the column
-
         Returns
         -------
         DataFrame
             cudf DataFrame
         """
-
         if name is not None:
             col_name = name
         elif self.name is None:
@@ -570,7 +560,40 @@ def any(self):
         """
         Return whether any elements is True in Index.
         """
-        return self._values.any()
+        raise NotImplementedError
+
+    def isna(self):
+        """
+        Detect missing values.
+
+        Return a boolean same-sized object indicating if the values are NA.
+        NA values, such as ``None``, :attr:`numpy.NaN` or :attr:`cudf.NaN`, get
+        mapped to ``True`` values.
+        Everything else get mapped to ``False`` values.
+
+        Returns
+        -------
+        numpy.ndarray[bool]
+            A boolean array to indicate which entries are NA.
+
+        """
+        raise NotImplementedError
+
+    def notna(self):
+        """
+        Detect existing (non-missing) values.
+
+        Return a boolean same-sized object indicating if the values are not NA.
+        Non-missing values get mapped to ``True``.
+        NA values, such as None or :attr:`numpy.NaN`, get mapped to ``False``
+        values.
+
+        Returns
+        -------
+        numpy.ndarray[bool]
+            A boolean array to indicate which entries are not NA.
+        """
+        raise NotImplementedError
 
     def to_pandas(self):
         """
@@ -589,7 +612,75 @@ def to_pandas(self):
         >>> type(idx)
         <class 'cudf.core.index.Int64Index'>
         """
-        return pd.Index(self._values.to_pandas(), name=self.name)
+        raise NotImplementedError
+
+    def isin(self, values):
+        """Return a boolean array where the index values are in values.
+
+        Compute boolean array of whether each index value is found in
+        the passed set of values. The length of the returned boolean
+        array matches the length of the index.
+
+        Parameters
+        ----------
+        values : set, list-like, Index
+            Sought values.
+
+        Returns
+        -------
+        is_contained : cupy array
+            CuPy array of boolean values.
+
+        Examples
+        --------
+        >>> idx = cudf.Index([1,2,3])
+        >>> idx
+        Int64Index([1, 2, 3], dtype='int64')
+
+        Check whether each index value in a list of values.
+
+        >>> idx.isin([1, 4])
+        array([ True, False, False])
+        """
+        # To match pandas behavior, even though only list-like objects are
+        # supposed to be passed, only scalars throw errors. Other types (like
+        # dicts) just transparently return False (see the implementation of
+        # ColumnBase.isin).
+        raise NotImplementedError
+
+    def unique(self):
+        """
+        Return unique values in the index.
+
+        Returns
+        -------
+        Index without duplicates
+        """
+        raise NotImplementedError
+
+    def to_series(self, index=None, name=None):
+        """
+        Create a Series with both index and values equal to the index keys.
+        Useful with map for returning an indexer based on an index.
+
+        Parameters
+        ----------
+        index : Index, optional
+            Index of resulting Series. If None, defaults to original index.
+        name : str, optional
+            Name of resulting Series. If None, defaults to name of original
+            index.
+
+        Returns
+        -------
+        Series
+            The dtype will be based on the type of the Index values.
+        """
+        return cudf.Series._from_data(
+            self._data,
+            index=self.copy(deep=False) if index is None else index,
+            name=self.name if name is None else name,
+        )
 
     @ioutils.doc_to_dlpack()
     def to_dlpack(self):
@@ -599,7 +690,7 @@ def to_dlpack(self):
 
     def append(self, other):
         """
-        Append a collection of Index options together.
+        Append a collection of Index objects together.
 
         Parameters
         ----------
@@ -626,45 +717,7 @@ def append(self, other):
         >>> idx.append([other, other])
         Int64Index([1, 2, 10, 100, 200, 400, 50, 200, 400, 50], dtype='int64')
         """
-
-        if is_list_like(other):
-            to_concat = [self]
-            to_concat.extend(other)
-        else:
-            this = self
-            if len(other) == 0:
-                # short-circuit and return a copy
-                to_concat = [self]
-
-            other = cudf.Index(other)
-
-            if len(self) == 0:
-                to_concat = [other]
-
-            if len(self) and len(other):
-                if is_mixed_with_object_dtype(this, other):
-                    got_dtype = (
-                        other.dtype
-                        if this.dtype == cudf.dtype("object")
-                        else this.dtype
-                    )
-                    raise TypeError(
-                        f"cudf does not support appending an Index of "
-                        f"dtype `{cudf.dtype('object')}` with an Index "
-                        f"of dtype `{got_dtype}`, please type-cast "
-                        f"either one of them to same dtypes."
-                    )
-
-                if isinstance(self._values, cudf.core.column.NumericalColumn):
-                    if self.dtype != other.dtype:
-                        this, other = numeric_normalize_types(self, other)
-                to_concat = [this, other]
-
-        for obj in to_concat:
-            if not isinstance(obj, BaseIndex):
-                raise TypeError("all inputs must be Index")
-
-        return self._concat(to_concat)
+        raise NotImplementedError
 
     def difference(self, other, sort=None):
         """
@@ -1119,18 +1172,6 @@ def sort_values(
         else:
             return index_sorted
 
-    def unique(self):
-        """
-        Return unique values in the index.
-
-        Returns
-        -------
-        Index without duplicates
-        """
-        return cudf.core.index._index_from_data(
-            {self.name: self._values.unique()}, name=self.name
-        )
-
     def join(
         self, other, how="left", level=None, return_indexers=False, sort=False
     ):
@@ -1263,30 +1304,6 @@ def rename(self, name, inplace=False):
             out.name = name
             return out
 
-    def to_series(self, index=None, name=None):
-        """
-        Create a Series with both index and values equal to the index keys.
-        Useful with map for returning an indexer based on an index.
-
-        Parameters
-        ----------
-        index : Index, optional
-            Index of resulting Series. If None, defaults to original index.
-        name : str, optional
-            Dame of resulting Series. If None, defaults to name of original
-            index.
-
-        Returns
-        -------
-        Series
-            The dtype will be based on the type of the Index values.
-        """
-        return cudf.Series(
-            self._values,
-            index=self.copy(deep=False) if index is None else index,
-            name=self.name if name is None else name,
-        )
-
     def get_slice_bound(self, label, side, kind=None):
         """
         Calculate slice bound that corresponds to given label.
@@ -1339,47 +1356,6 @@ def __array_function__(self, func, types, args, kwargs):
         else:
             return NotImplemented
 
-    def isin(self, values):
-        """Return a boolean array where the index values are in values.
-
-        Compute boolean array of whether each index value is found in
-        the passed set of values. The length of the returned boolean
-        array matches the length of the index.
-
-        Parameters
-        ----------
-        values : set, list-like, Index
-            Sought values.
-
-        Returns
-        -------
-        is_contained : cupy array
-            CuPy array of boolean values.
-
-        Examples
-        --------
-        >>> idx = cudf.Index([1,2,3])
-        >>> idx
-        Int64Index([1, 2, 3], dtype='int64')
-
-        Check whether each index value in a list of values.
-
-        >>> idx.isin([1, 4])
-        array([ True, False, False])
-        """
-
-        # To match pandas behavior, even though only list-like objects are
-        # supposed to be passed, only scalars throw errors. Other types (like
-        # dicts) just transparently return False (see the implementation of
-        # ColumnBase.isin).
-        if is_scalar(values):
-            raise TypeError(
-                "only list-like objects are allowed to be passed "
-                f"to isin(), you passed a {type(values).__name__}"
-            )
-
-        return self._values.isin(values).values
-
     @classmethod
     def from_pandas(cls, index, nan_as_null=None):
         """
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 601ad707ba6..af5d140a20a 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -104,7 +104,7 @@ def __init__(self, parent: SeriesOrSingleColumnIndex):
         super().__init__(parent=parent)
 
     @property
-    def categories(self) -> "cudf.core.index.BaseIndex":
+    def categories(self) -> "cudf.core.index.GenericIndex":
         """
         The categories of this categorical.
         """
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 3734893627f..3d77ed15027 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -33,6 +33,8 @@
     is_categorical_dtype,
     is_dtype_equal,
     is_interval_dtype,
+    is_list_like,
+    is_scalar,
     is_string_dtype,
 )
 from cudf.core._base_index import BaseIndex, _index_astype_docstring
@@ -55,7 +57,12 @@
 from cudf.core.mixins import BinaryOperand
 from cudf.core.single_column_frame import SingleColumnFrame
 from cudf.utils.docutils import copy_docstring, doc_apply
-from cudf.utils.dtypes import _maybe_convert_to_default_type, find_common_type
+from cudf.utils.dtypes import (
+    _maybe_convert_to_default_type,
+    find_common_type,
+    is_mixed_with_object_dtype,
+    numeric_normalize_types,
+)
 from cudf.utils.utils import _cudf_nvtx_annotate, search_range
 
 T = TypeVar("T", bound="Frame")
@@ -243,6 +250,9 @@ def _values(self):
         else:
             return column.column_empty(0, masked=False, dtype=self.dtype)
 
+    def _clean_nulls_from_index(self):
+        return self
+
     def is_numeric(self):
         return True
 
@@ -867,6 +877,25 @@ def min(self):
     def max(self):
         return self._minmax("max")
 
+    @property
+    def values(self):
+        return cupy.arange(self.start, self.stop, self.step)
+
+    def any(self):
+        return any(self._range)
+
+    def append(self, other):
+        return self._as_int_index().append(other)
+
+    def isin(self, values):
+        if is_scalar(values):
+            raise TypeError(
+                "only list-like objects are allowed to be passed "
+                f"to isin(), you passed a {type(values).__name__}"
+            )
+
+        return self._values.isin(values).values
+
     def __neg__(self):
         return -self._as_int_index()
 
@@ -1409,6 +1438,81 @@ def where(self, cond, other=None, inplace=False):
             inplace=inplace,
         )
 
+    @property
+    def values(self):
+        return self._column.values
+
+    def __contains__(self, item):
+        return item in self._values
+
+    def _clean_nulls_from_index(self):
+        if self._values.has_nulls():
+            return cudf.Index(
+                self._values.astype("str").fillna(cudf._NA_REP), name=self.name
+            )
+
+        return self
+
+    def any(self):
+        return self._values.any()
+
+    def to_pandas(self):
+        return pd.Index(self._values.to_pandas(), name=self.name)
+
+    def append(self, other):
+        if is_list_like(other):
+            to_concat = [self]
+            to_concat.extend(other)
+        else:
+            this = self
+            if len(other) == 0:
+                # short-circuit and return a copy
+                to_concat = [self]
+
+            other = cudf.Index(other)
+
+            if len(self) == 0:
+                to_concat = [other]
+
+            if len(self) and len(other):
+                if is_mixed_with_object_dtype(this, other):
+                    got_dtype = (
+                        other.dtype
+                        if this.dtype == cudf.dtype("object")
+                        else this.dtype
+                    )
+                    raise TypeError(
+                        f"cudf does not support appending an Index of "
+                        f"dtype `{cudf.dtype('object')}` with an Index "
+                        f"of dtype `{got_dtype}`, please type-cast "
+                        f"either one of them to same dtypes."
+                    )
+
+                if isinstance(self._values, cudf.core.column.NumericalColumn):
+                    if self.dtype != other.dtype:
+                        this, other = numeric_normalize_types(self, other)
+                to_concat = [this, other]
+
+        for obj in to_concat:
+            if not isinstance(obj, BaseIndex):
+                raise TypeError("all inputs must be Index")
+
+        return self._concat(to_concat)
+
+    def unique(self):
+        return cudf.core.index._index_from_data(
+            {self.name: self._values.unique()}, name=self.name
+        )
+
+    def isin(self, values):
+        if is_scalar(values):
+            raise TypeError(
+                "only list-like objects are allowed to be passed "
+                f"to isin(), you passed a {type(values).__name__}"
+            )
+
+        return self._values.isin(values).values
+
 
 class NumericIndex(GenericIndex):
     """Immutable, ordered and sliceable sequence of labels.
@@ -2796,10 +2900,6 @@ def str(self):
         return StringMethods(parent=self)
 
     def _clean_nulls_from_index(self):
-        """
-        Convert all na values(if any) in Index object
-        to `<NA>` as a preprocessing step to `__repr__` methods.
-        """
         if self._values.has_nulls():
             return self.fillna(cudf._NA_REP)
         else:
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index e8c568979a3..358d5e2170e 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -2537,32 +2537,20 @@ def rangeindex(request):
     return RangeIndex(request.param)
 
 
-def test_rangeindex_nunique(rangeindex):
-    gidx = rangeindex
-    pidx = gidx.to_pandas()
-
-    actual = gidx.nunique()
-    expected = pidx.nunique()
-
-    assert_eq(expected, actual)
-
-
-def test_rangeindex_min(rangeindex):
-    gidx = rangeindex
-    pidx = gidx.to_pandas()
-
-    actual = gidx.min()
-    expected = pidx.min()
-
-    assert_eq(expected, actual)
-
-
-def test_rangeindex_max(rangeindex):
+@pytest.mark.parametrize(
+    "func",
+    ["nunique", "min", "max", "any", "values"],
+)
+def test_rangeindex_methods(rangeindex, func):
     gidx = rangeindex
     pidx = gidx.to_pandas()
 
-    actual = gidx.max()
-    expected = pidx.max()
+    if func == "values":
+        expected = pidx.values
+        actual = gidx.values
+    else:
+        expected = getattr(pidx, func)()
+        actual = getattr(gidx, func)()
 
     assert_eq(expected, actual)
 
@@ -2693,3 +2681,105 @@ def test_rangeindex_where_user_option(default_integer_bitwidth):
         dtype=f"int{default_integer_bitwidth}",
     )
     assert_eq(expected, actual)
+
+
+index_data = [
+    range(np.random.randint(0, 100)),
+    range(0, 10, -2),
+    range(0, -10, 2),
+    range(0, -10, -2),
+    range(0, 1),
+    [1, 2, 3, 1, None, None],
+    [None, None, 3.2, 1, None, None],
+    [None, "a", "3.2", "z", None, None],
+    pd.Series(["a", "b", None], dtype="category"),
+    np.array([1, 2, 3, None], dtype="datetime64[s]"),
+]
+
+
+@pytest.fixture(params=index_data)
+def index(request):
+    """Create a cudf Index of different dtypes"""
+    return cudf.Index(request.param)
+
+
+@pytest.mark.parametrize(
+    "func",
+    [
+        "to_series",
+        "isna",
+        "notna",
+        "append",
+    ],
+)
+def test_index_methods(index, func):
+    gidx = index
+    pidx = gidx.to_pandas()
+
+    if func == "append":
+        expected = pidx.append(other=pidx)
+        actual = gidx.append(other=gidx)
+    else:
+        expected = getattr(pidx, func)()
+        actual = getattr(gidx, func)()
+
+    assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "idx, values",
+    [
+        (range(100, 1000, 10), [200, 600, 800]),
+        ([None, "a", "3.2", "z", None, None], ["a", "z"]),
+        (pd.Series(["a", "b", None], dtype="category"), [10, None]),
+    ],
+)
+def test_index_isin_values(idx, values):
+    gidx = cudf.Index(idx)
+    pidx = gidx.to_pandas()
+
+    actual = gidx.isin(values)
+    expected = pidx.isin(values)
+
+    assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "idx, scalar",
+    [
+        (range(0, -10, -2), -4),
+        ([None, "a", "3.2", "z", None, None], "x"),
+        (pd.Series(["a", "b", None], dtype="category"), 10),
+    ],
+)
+def test_index_isin_scalar_values(idx, scalar):
+    gidx = cudf.Index(idx)
+
+    with pytest.raises(
+        TypeError,
+        match=re.escape(
+            f"only list-like objects are allowed to be passed "
+            f"to isin(), you passed a {type(scalar).__name__}"
+        ),
+    ):
+        gidx.isin(scalar)
+
+
+def test_index_any():
+    gidx = cudf.Index([1, 2, 3])
+    pidx = gidx.to_pandas()
+
+    assert_eq(pidx.any(), gidx.any())
+
+
+def test_index_values():
+    gidx = cudf.Index([1, 2, 3])
+    pidx = gidx.to_pandas()
+
+    assert_eq(pidx.values, gidx.values)
+
+
+def test_index_null_values():
+    gidx = cudf.Index([1.0, None, 3, 0, None])
+    with pytest.raises(ValueError):
+        gidx.values