Skip to content

Commit

Permalink
Remove `__getattr__` method in RangeIndex class (NVIDIA#10538)
Browse files Browse the repository at this point in the history
This PR helps reduce implicit conversions by minimizing unnecessary `Int64Index` column materialization by the `RangeIndex` class (PR rapidsai/cudf#9593). Replaces rapidsai/cudf#10388.

The following methods have been explicitly implemented for `RangeIndex` in this PR: `_column`, `_columns`, `where`, `isna`, `argsort`, `max`, `min`, `nunique`, `values_host`, `to_numpy`, `to_arrow`, `__array__`.

### 


As demonstrated by the results posted in this [comment](rapidsai/cudf#10538 (comment)), **on average:**

- There's an evident performance gain with the new implementations of `values_host`, `to_numpy`, `nunique`, `min` and `max`.
- `isna` and `argsort` show inconsistent performance measurements, perhaps due to memory allocation discrepancies.

- `to_arrow` and `where`, on the other hand, still materialize an `Int64Index`.

Authors:
  - Sheilah Kirui (https://github.com/skirui-source)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: rapidsai/cudf#10538
  • Loading branch information
skirui-source authored Jul 15, 2022
1 parent c1d4a5e commit 4528d8e
Show file tree
Hide file tree
Showing 4 changed files with 174 additions and 13 deletions.
42 changes: 42 additions & 0 deletions python/cudf/benchmarks/API/bench_rangeindex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Copyright (c) 2022, NVIDIA CORPORATION.

import pytest


@pytest.mark.pandas_incompatible
def bench_values_host(benchmark, rangeindex):
    """Benchmark reading the ``values_host`` property of ``rangeindex``."""

    def _read_values_host():
        # ``values_host`` is a property, so the access itself is the work
        # being timed; wrap it in a callable for ``benchmark``.
        return rangeindex.values_host

    benchmark(_read_values_host)


def bench_to_numpy(benchmark, rangeindex):
    """Benchmark ``rangeindex.to_numpy()``."""
    target = rangeindex.to_numpy
    benchmark(target)


@pytest.mark.pandas_incompatible
def bench_to_arrow(benchmark, rangeindex):
    """Benchmark ``rangeindex.to_arrow()``."""
    target = rangeindex.to_arrow
    benchmark(target)


def bench_argsort(benchmark, rangeindex):
    """Benchmark ``rangeindex.argsort()`` with its default arguments."""
    target = rangeindex.argsort
    benchmark(target)


def bench_nunique(benchmark, rangeindex):
    """Benchmark ``rangeindex.nunique()``."""
    target = rangeindex.nunique
    benchmark(target)


def bench_isna(benchmark, rangeindex):
    """Benchmark ``rangeindex.isna()``."""
    target = rangeindex.isna
    benchmark(target)


def bench_max(benchmark, rangeindex):
    """Benchmark ``rangeindex.max()``."""
    target = rangeindex.max
    benchmark(target)


def bench_min(benchmark, rangeindex):
    """Benchmark ``rangeindex.min()``."""
    target = rangeindex.min
    benchmark(target)


def bench_where(benchmark, rangeindex):
    """Benchmark ``rangeindex.where`` with an even-value mask and fill 0.

    The mask is built once, outside the timed call, so only ``where``
    itself is measured.
    """
    even_mask = (rangeindex % 2) == 0
    fill_value = 0
    benchmark(rangeindex.where, even_mask, fill_value)
10 changes: 8 additions & 2 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,13 @@
)
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.groupby.groupby import DataFrameGroupBy
from cudf.core.index import BaseIndex, Index, RangeIndex, as_index
from cudf.core.index import (
BaseIndex,
Index,
RangeIndex,
_index_from_data,
as_index,
)
from cudf.core.indexed_frame import (
IndexedFrame,
_FrameIndexer,
Expand Down Expand Up @@ -3168,7 +3174,7 @@ def rename(
except OverflowError:
index_data = self.index._data.copy(deep=True)

out = DataFrame(index=self.index._from_data(index_data))
out = DataFrame(index=_index_from_data(index_data))
else:
out = DataFrame(index=self.index)

Expand Down
88 changes: 77 additions & 11 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,17 +554,6 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
ufunc, method, *inputs, **kwargs
)

@_cudf_nvtx_annotate
def __getattr__(self, key):
    """Fall back to the index returned by ``_as_int64()`` for unknown names.

    Attributes that normal lookup cannot find on this object are looked up
    on ``self._as_int64()`` instead. If that lookup also fails, an
    ``AttributeError`` naming this object's type is raised.
    """
    try:
        fallback_index = self._as_int64()
        return getattr(fallback_index, key)
    except AttributeError:
        message = f"'{type(self)}' object has no attribute {key}"
        raise AttributeError(message)

@_cudf_nvtx_annotate
def get_loc(self, key, method=None, tolerance=None):
# Given an actual integer,
Expand Down Expand Up @@ -782,6 +771,83 @@ def join(
# join. We need to implement that for the supported special cases.
return self._as_int64().join(other, how, level, return_indexers, sort)

@property  # type: ignore
@_cudf_nvtx_annotate
def _column(self):
    """Return ``_column`` of the index produced by ``self._as_int64()``."""
    int64_index = self._as_int64()
    return int64_index._column

@property  # type: ignore
@_cudf_nvtx_annotate
def _columns(self):
    """Return ``_columns`` of the index produced by ``self._as_int64()``."""
    int64_index = self._as_int64()
    return int64_index._columns

@property  # type: ignore
@_cudf_nvtx_annotate
def values_host(self):
    """Return the ``.values`` of this index after ``to_pandas()``."""
    pandas_index = self.to_pandas()
    return pandas_index.values

@_cudf_nvtx_annotate
def argsort(self, ascending=True, na_position="last"):
    """Return the positions that would sort this index, as a CuPy array.

    The result is derived from the sign of ``self._step`` alone:
    ``cupy.arange(0, len(self))``, reversed when the stored order runs
    opposite to the requested direction.

    Parameters
    ----------
    ascending : bool, default True
        Requested sort direction.
    na_position : {"first", "last"}, default "last"
        Only validated; it does not otherwise affect the result.

    Raises
    ------
    ValueError
        If ``na_position`` is neither ``"first"`` nor ``"last"``.
    """
    if na_position not in {"first", "last"}:
        raise ValueError(f"invalid na_position: {na_position}")

    # Reverse only when the stored direction disagrees with the request.
    if ascending:
        needs_reverse = self._step < 0
    else:
        needs_reverse = self._step > 0

    positions = cupy.arange(0, len(self))
    return positions[::-1] if needs_reverse else positions

@_cudf_nvtx_annotate
def where(self, cond, other=None, inplace=False):
    """Delegate ``where`` to the index returned by ``self._as_int64()``.

    ``cond``, ``other`` and ``inplace`` are forwarded positionally and
    unchanged.
    """
    int64_index = self._as_int64()
    return int64_index.where(cond, other, inplace)

@_cudf_nvtx_annotate
def to_numpy(self):
    """Return the same object as the ``values_host`` property."""
    host_values = self.values_host
    return host_values

@_cudf_nvtx_annotate
def to_arrow(self):
    """Delegate ``to_arrow`` to the index returned by ``self._as_int64()``."""
    int64_index = self._as_int64()
    return int64_index.to_arrow()

def __array__(self, dtype=None):
    """Reject implicit conversion to a host NumPy array.

    Always raises ``TypeError``; the message points callers at
    ``.to_cupy()`` and ``.to_numpy()`` as the explicit alternatives.
    ``dtype`` is accepted but never used.
    """
    message = (
        "Implicit conversion to a host NumPy array via __array__ is not "
        "allowed, To explicitly construct a GPU matrix, consider using "
        ".to_cupy()\nTo explicitly construct a host matrix, consider "
        "using .to_numpy()."
    )
    raise TypeError(message)

@_cudf_nvtx_annotate
def nunique(self):
    """Return the number of unique values, computed as ``len(self)``."""
    # The element count is used directly as the unique count; no column
    # is materialized to compute it.
    element_count = len(self)
    return element_count

@_cudf_nvtx_annotate
def isna(self):
    """Return an all-``False`` boolean CuPy array of length ``len(self)``."""
    element_count = len(self)
    return cupy.zeros(element_count, dtype=bool)

@_cudf_nvtx_annotate
def _minmax(self, meth: str):
    """Compute the minimum or maximum from ``start``, ``step`` and length.

    Parameters
    ----------
    meth : str
        ``"min"`` or ``"max"``.

    Returns
    -------
    ``np.nan`` when the index is empty; otherwise ``self.start`` or the
    last element, ``self.start + self.step * (len(self) - 1)``.
    """
    size = len(self)
    if size == 0:
        return np.nan

    # The first element is the answer when looking for the minimum of an
    # increasing range or the maximum of a decreasing one.
    first_is_answer = (meth == "min" and self.step > 0) or (
        meth == "max" and self.step < 0
    )
    if first_is_answer:
        return self.start

    steps_to_last = size - 1
    return self.start + self.step * steps_to_last

def min(self):
    """Return the result of ``self._minmax("min")``."""
    smallest = self._minmax("min")
    return smallest

def max(self):
    """Return the result of ``self._minmax("max")``."""
    largest = self._minmax("max")
    return largest


# Patch in all binops and unary ops, which bypass __getattr__ on the instance
# and prevent the above overload from working.
Expand Down
47 changes: 47 additions & 0 deletions python/cudf/cudf/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2518,3 +2518,50 @@ def test_isin_multiindex(data, values, level, err):
"squences when `level=None`."
),
)


# Ranges used to parametrize the ``rangeindex`` fixture.
#
# Every entry is fixed. A length drawn from ``np.random`` at import time
# would change the parametrized inputs from run to run, so a failure could
# not be reproduced. The empty and single-element cases are listed
# explicitly rather than being hit only by chance.
range_data = [
    range(0),  # empty
    range(1),  # single element
    range(57),  # default start/step, arbitrary fixed length
    range(9, 12, 2),
    range(20, 30),
    range(100, 1000, 10),
    range(0, 10, -2),  # empty: negative step with start < stop
    range(0, -10, 2),  # empty: positive step with start > stop
    range(0, -10, -2),  # descending
]


@pytest.fixture(params=range_data)
def rangeindex(request):
    """Build a cudf ``RangeIndex`` from each entry of ``range_data``."""
    source_range = request.param
    return RangeIndex(source_range)


def test_rangeindex_nunique(rangeindex):
    """``RangeIndex.nunique`` must agree with its ``to_pandas()`` equivalent."""
    pandas_index = rangeindex.to_pandas()

    expected = pandas_index.nunique()
    actual = rangeindex.nunique()

    assert_eq(expected, actual)


def test_rangeindex_min(rangeindex):
    """``RangeIndex.min`` must agree with its ``to_pandas()`` equivalent."""
    pandas_index = rangeindex.to_pandas()

    expected = pandas_index.min()
    actual = rangeindex.min()

    assert_eq(expected, actual)


def test_rangeindex_max(rangeindex):
    """``RangeIndex.max`` must agree with its ``to_pandas()`` equivalent."""
    pandas_index = rangeindex.to_pandas()

    expected = pandas_index.max()
    actual = rangeindex.max()

    assert_eq(expected, actual)

0 comments on commit 4528d8e

Please sign in to comment.