Remove `__getattr__` method in RangeIndex class (NVIDIA#10538)

This PR helps reduce implicit conversions by minimizing unnecessary `Int64Index` column materialization by the RangeIndex class (PR rapidsai/cudf#9593). Replaces rapidsai/cudf#10388.

The following methods have been explicitly implemented for RangeIndex in this PR: `_column`, `_columns`, `where`, `isna`, `argsort`, `max`, `min`, `nunique`, `values_host`, `to_numpy`, `to_arrow`, `__array__`.

### As demonstrated by the results posted in this [comment](rapidsai/cudf#10538 (comment)), **on average:**

- There's an evident performance gain with the new implementations of `values_host`, `to_numpy`, `nunique`, `min` and `max`.
- `isna` and `argsort` show inconsistent performance measurements, perhaps due to memory allocation discrepancies.
- `to_arrow` and `where` still materialize an `Int64Index`.

Authors:
- Sheilah Kirui (https://github.com/skirui-source)
- Vyas Ramasubramani (https://github.com/vyasr)
- GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
- Vyas Ramasubramani (https://github.com/vyasr)
- GALI PREM SAGAR (https://github.com/galipremsagar)

URL: rapidsai/cudf#10538
tgravescs · Jul 15, 2022 · 4528d8e · 4528d8e
1 parent c1d4a5e
commit 4528d8e
Show file tree

Hide file tree

Showing 4 changed files with 174 additions and 13 deletions.
diff --git a/python/cudf/benchmarks/API/bench_rangeindex.py b/python/cudf/benchmarks/API/bench_rangeindex.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.
+
+import pytest
+
+
+@pytest.mark.pandas_incompatible
+def bench_values_host(benchmark, rangeindex):
+    benchmark(lambda: rangeindex.values_host)
+
+
+def bench_to_numpy(benchmark, rangeindex):
+    benchmark(rangeindex.to_numpy)
+
+
+@pytest.mark.pandas_incompatible
+def bench_to_arrow(benchmark, rangeindex):
+    benchmark(rangeindex.to_arrow)
+
+
+def bench_argsort(benchmark, rangeindex):
+    benchmark(rangeindex.argsort)
+
+
+def bench_nunique(benchmark, rangeindex):
+    benchmark(rangeindex.nunique)
+
+
+def bench_isna(benchmark, rangeindex):
+    benchmark(rangeindex.isna)
+
+
+def bench_max(benchmark, rangeindex):
+    benchmark(rangeindex.max)
+
+
+def bench_min(benchmark, rangeindex):
+    benchmark(rangeindex.min)
+
+
+def bench_where(benchmark, rangeindex):
+    cond = rangeindex % 2 == 0
+    benchmark(rangeindex.where, cond, 0)
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -67,7 +67,13 @@
 )
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.groupby.groupby import DataFrameGroupBy
-from cudf.core.index import BaseIndex, Index, RangeIndex, as_index
+from cudf.core.index import (
+    BaseIndex,
+    Index,
+    RangeIndex,
+    _index_from_data,
+    as_index,
+)
 from cudf.core.indexed_frame import (
     IndexedFrame,
     _FrameIndexer,
@@ -3168,7 +3174,7 @@ def rename(
                 except OverflowError:
                     index_data = self.index._data.copy(deep=True)
 
-                out = DataFrame(index=self.index._from_data(index_data))
+                out = DataFrame(index=_index_from_data(index_data))
         else:
             out = DataFrame(index=self.index)
 

diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
@@ -554,17 +554,6 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
             ufunc, method, *inputs, **kwargs
         )
 
-    @_cudf_nvtx_annotate
-    def __getattr__(self, key):
-        # For methods that are not defined for RangeIndex we attempt to operate
-        # on the corresponding integer index if possible.
-        try:
-            return getattr(self._as_int64(), key)
-        except AttributeError:
-            raise AttributeError(
-                f"'{type(self)}' object has no attribute {key}"
-            )
-
     @_cudf_nvtx_annotate
     def get_loc(self, key, method=None, tolerance=None):
         # Given an actual integer,
@@ -782,6 +771,83 @@ def join(
         # join. We need to implement that for the supported special cases.
         return self._as_int64().join(other, how, level, return_indexers, sort)
 
+    @property  # type: ignore
+    @_cudf_nvtx_annotate
+    def _column(self):
+        return self._as_int64()._column
+
+    @property  # type: ignore
+    @_cudf_nvtx_annotate
+    def _columns(self):
+        return self._as_int64()._columns
+
+    @property  # type: ignore
+    @_cudf_nvtx_annotate
+    def values_host(self):
+        return self.to_pandas().values
+
+    @_cudf_nvtx_annotate
+    def argsort(
+        self,
+        ascending=True,
+        na_position="last",
+    ):
+        if na_position not in {"first", "last"}:
+            raise ValueError(f"invalid na_position: {na_position}")
+
+        indices = cupy.arange(0, len(self))
+        if (ascending and self._step < 0) or (
+            not ascending and self._step > 0
+        ):
+            indices = indices[::-1]
+        return indices
+
+    @_cudf_nvtx_annotate
+    def where(self, cond, other=None, inplace=False):
+        return self._as_int64().where(cond, other, inplace)
+
+    @_cudf_nvtx_annotate
+    def to_numpy(self):
+        return self.values_host
+
+    @_cudf_nvtx_annotate
+    def to_arrow(self):
+        return self._as_int64().to_arrow()
+
+    def __array__(self, dtype=None):
+        raise TypeError(
+            "Implicit conversion to a host NumPy array via __array__ is not "
+            "allowed, To explicitly construct a GPU matrix, consider using "
+            ".to_cupy()\nTo explicitly construct a host matrix, consider "
+            "using .to_numpy()."
+        )
+
+    @_cudf_nvtx_annotate
+    def nunique(self):
+        return len(self)
+
+    @_cudf_nvtx_annotate
+    def isna(self):
+        return cupy.zeros(len(self), dtype=bool)
+
+    @_cudf_nvtx_annotate
+    def _minmax(self, meth: str):
+        no_steps = len(self) - 1
+        if no_steps == -1:
+            return np.nan
+        elif (meth == "min" and self.step > 0) or (
+            meth == "max" and self.step < 0
+        ):
+            return self.start
+
+        return self.start + self.step * no_steps
+
+    def min(self):
+        return self._minmax("min")
+
+    def max(self):
+        return self._minmax("max")
+
 
 # Patch in all binops and unary ops, which bypass __getattr__ on the instance
 # and prevent the above overload from working.

diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
@@ -2518,3 +2518,50 @@ def test_isin_multiindex(data, values, level, err):
                 "squences  when `level=None`."
             ),
         )
+
+
+range_data = [
+    range(np.random.randint(0, 100)),
+    range(9, 12, 2),
+    range(20, 30),
+    range(100, 1000, 10),
+    range(0, 10, -2),
+    range(0, -10, 2),
+    range(0, -10, -2),
+]
+
+
+@pytest.fixture(params=range_data)
+def rangeindex(request):
+    """Create a cudf RangeIndex of different `nrows`"""
+    return RangeIndex(request.param)
+
+
+def test_rangeindex_nunique(rangeindex):
+    gidx = rangeindex
+    pidx = gidx.to_pandas()
+
+    actual = gidx.nunique()
+    expected = pidx.nunique()
+
+    assert_eq(expected, actual)
+
+
+def test_rangeindex_min(rangeindex):
+    gidx = rangeindex
+    pidx = gidx.to_pandas()
+
+    actual = gidx.min()
+    expected = pidx.min()
+
+    assert_eq(expected, actual)
+
+
+def test_rangeindex_max(rangeindex):
+    gidx = rangeindex
+    pidx = gidx.to_pandas()
+
+    actual = gidx.max()
+    expected = pidx.max()
+
+    assert_eq(expected, actual)