From 007b7785f5791678061814eb4f188d6b045989ef Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 22 Apr 2024 17:34:18 -0700 Subject: [PATCH 1/2] Don't materialize column during RangeIndex methods --- python/cudf/cudf/core/_base_index.py | 5 +- python/cudf/cudf/core/index.py | 111 +++++++++++++++++---------- python/cudf/cudf/tests/test_index.py | 23 ++++++ 3 files changed, 97 insertions(+), 42 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index de44f392eef..9d14390d3b8 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -517,7 +517,7 @@ def where(self, cond, other=None, inplace=False): """ raise NotImplementedError - def factorize(self, sort=False, na_sentinel=None, use_na_sentinel=None): + def factorize(self, sort: bool = False, use_na_sentinel: bool = True): raise NotImplementedError def union(self, other, sort=None): @@ -2061,7 +2061,8 @@ def dropna(self, how="any"): one null value. "all" drops only rows containing *all* null values. """ - + if not self.hasnans: + return self.copy() # This is to be consistent with IndexedFrame.dropna to handle nans # as nulls by default data_columns = [ diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 6f08b1d83b3..b9dff38539c 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -21,6 +21,7 @@ import cupy import numpy as np import pandas as pd +import pyarrow as pa from typing_extensions import Self import cudf @@ -255,6 +256,15 @@ def searchsorted( ), "Invalid ascending flag" return search_range(value, self.as_range, side=side) + def factorize(self, sort: bool = False, use_na_sentinel: bool = True): + if sort and self.step < 0: + codes = cupy.arange(len(self) - 1, -1, -1) + uniques = self[::-1] + else: + codes = cupy.arange(len(self), dtype=np.intp) + uniques = self + return codes, uniques + @property # type: ignore @_cudf_nvtx_annotate def name(self): @@ -267,7 +277,7 @@ def name(self, value): @property # type: ignore @_cudf_nvtx_annotate - def start(self): + def start(self) -> int: """ The value of the `start` parameter (0 if this was not supplied). """ @@ -275,7 +285,7 @@ def start(self): @property # type: ignore @_cudf_nvtx_annotate - def stop(self): + def stop(self) -> int: """ The value of the stop parameter. """ @@ -283,7 +293,7 @@ def stop(self): @property # type: ignore @_cudf_nvtx_annotate - def step(self): + def step(self) -> int: """ The value of the step parameter. """ @@ -291,7 +301,7 @@ def step(self): @property # type: ignore @_cudf_nvtx_annotate - def _num_rows(self): + def _num_rows(self) -> int: return len(self) @cached_property # type: ignore @@ -302,33 +312,33 @@ def _values(self): else: return column.column_empty(0, masked=False, dtype=self.dtype) - def _clean_nulls_from_index(self): + def _clean_nulls_from_index(self) -> Self: return self - def _is_numeric(self): + def _is_numeric(self) -> bool: return True - def _is_boolean(self): + def _is_boolean(self) -> bool: return False - def _is_integer(self): + def _is_integer(self) -> bool: return True - def _is_floating(self): + def _is_floating(self) -> bool: return False - def _is_object(self): + def _is_object(self) -> bool: return False - def _is_categorical(self): + def _is_categorical(self) -> bool: return False - def _is_interval(self): + def _is_interval(self) -> bool: return False @property # type: ignore @_cudf_nvtx_annotate - def hasnans(self): + def hasnans(self) -> bool: return False @property # type: ignore @@ -378,12 +388,15 @@ def astype(self, dtype, copy: bool = True): return self return self._as_int_index().astype(dtype, copy=copy) + def fillna(self, value, downcast=None): + return self.copy() + @_cudf_nvtx_annotate def drop_duplicates(self, keep="first"): return self @_cudf_nvtx_annotate - def duplicated(self, keep="first"): + def duplicated(self, keep="first") -> cupy.ndarray: return cupy.zeros(len(self), dtype=bool) @_cudf_nvtx_annotate @@ -399,8 +412,13 @@ def __repr__(self): + ")" ) + @property + @_cudf_nvtx_annotate + def size(self) -> int: + return len(self) + @_cudf_nvtx_annotate - def __len__(self): + def __len__(self) -> int: return len(range(self._start, self._stop, self._step)) @_cudf_nvtx_annotate @@ -492,7 +510,7 @@ def to_pandas( ) @property - def is_unique(self): + def is_unique(self) -> bool: return True @cached_property @@ -501,16 +519,16 @@ def as_range(self): @cached_property # type: ignore @_cudf_nvtx_annotate - def is_monotonic_increasing(self): + def is_monotonic_increasing(self) -> bool: return self._step > 0 or len(self) <= 1 @cached_property # type: ignore @_cudf_nvtx_annotate - def is_monotonic_decreasing(self): + def is_monotonic_decreasing(self) -> bool: return self._step < 0 or len(self) <= 1 @_cudf_nvtx_annotate - def memory_usage(self, deep=False): + def memory_usage(self, deep: bool = False) -> int: if deep: warnings.warn( "The deep parameter is ignored and is only included " @@ -518,7 +536,7 @@ def memory_usage(self, deep=False): ) return 0 - def unique(self): + def unique(self) -> Self: # RangeIndex always has unique values return self @@ -841,36 +859,40 @@ def _columns(self): @property # type: ignore @_cudf_nvtx_annotate - def values_host(self): - return self.to_pandas().values + def values_host(self) -> np.ndarray: + return np.arange(start=self.start, stop=self.stop, step=self.step) @_cudf_nvtx_annotate def argsort( self, ascending=True, na_position="last", - ): + ) -> cupy.ndarray: if na_position not in {"first", "last"}: raise ValueError(f"invalid na_position: {na_position}") - indices = cupy.arange(0, len(self)) if (ascending and self._step < 0) or ( not ascending and self._step > 0 ): - indices = indices[::-1] - return indices + return cupy.arange(len(self), -1, -1) + else: + return cupy.arange(len(self)) @_cudf_nvtx_annotate def where(self, cond, other=None, inplace=False): return self._as_int_index().where(cond, other, inplace) @_cudf_nvtx_annotate - def to_numpy(self): + def to_numpy(self) -> np.ndarray: return self.values_host @_cudf_nvtx_annotate - def to_arrow(self): - return self._as_int_index().to_arrow() + def to_cupy(self) -> cupy.ndarray: + return self.values + + @_cudf_nvtx_annotate + def to_arrow(self) -> pa.Array: + return pa.array(self._range) def __array__(self, dtype=None): raise TypeError( @@ -881,17 +903,17 @@ def __array__(self, dtype=None): ) @_cudf_nvtx_annotate - def nunique(self): + def nunique(self) -> int: return len(self) @_cudf_nvtx_annotate - def isna(self): + def isna(self) -> cupy.ndarray: return cupy.zeros(len(self), dtype=bool) isnull = isna @_cudf_nvtx_annotate - def notna(self): + def notna(self) -> cupy.ndarray: return cupy.ones(len(self), dtype=bool) notnull = isna @@ -915,12 +937,15 @@ def max(self): return self._minmax("max") @property - def values(self): + def values(self) -> cupy.ndarray: return cupy.arange(self.start, self.stop, self.step) - def any(self): + def any(self) -> bool: return any(self._range) + def all(self) -> bool: + return 0 not in self._range + def append(self, other): result = self._as_int_index().append(other) return self._try_reconstruct_range_index(result) @@ -946,14 +971,20 @@ def isin(self, values): return self._values.isin(values).values - def __neg__(self): - return -self._as_int_index() + def __pos__(self) -> Self: + return self.copy() - def __pos__(self): - return +self._as_int_index() + def __neg__(self) -> Self: + rng = range(-self.start, -self.stop, -self.step) + return type(self)(rng, name=self.name) - def __abs__(self): - return abs(self._as_int_index()) + def __abs__(self) -> Self | Index: + if len(self) == 0 or self.min() >= 0: + return self.copy() + elif self.max() <= 0: + return -self + else: + return abs(self._as_int_index()) @_warn_no_dask_cudf def __dask_tokenize__(self): diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index ebbca57bd40..17bc14743db 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -3176,3 +3176,26 @@ def test_index_to_pandas_arrow_type(scalar): result = idx.to_pandas(arrow_type=True) expected = pd.Index(pd.arrays.ArrowExtensionArray(pa_array)) pd.testing.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("data", [range(-3, 3), range(1, 3), range(0)]) +def test_rangeindex_all(data): + result = cudf.RangeIndex(data).all() + expected = cudf.Index(list(data)).all() + assert result == expected + + +@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("data", [range(2), range(2, -1, -1)]) +def test_rangeindex_factorize(sort, data): + res_codes, res_uniques = cudf.RangeIndex(data).factorize(sort=sort) + exp_codes, exp_uniques = cudf.Index(list(data)).factorize(sort=sort) + assert_eq(res_codes, exp_codes) + assert_eq(res_uniques, exp_uniques) + + +def test_rangeindex_dropna(): + ri = cudf.RangeIndex(range(2)) + result = ri.dropna() + expected = ri.copy() + assert_eq(result, expected) From 9608d2a97be776ef14174a84ead03692fe91d3c5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 23 Apr 2024 13:25:47 -0700 Subject: [PATCH 2/2] Add pa.array type, fix off by 1 --- python/cudf/cudf/core/_base_index.py | 9 +++++++-- python/cudf/cudf/core/index.py | 5 ++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 9d14390d3b8..b5630ff9a54 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -2061,8 +2061,13 @@ def dropna(self, how="any"): one null value. "all" drops only rows containing *all* null values. """ - if not self.hasnans: - return self.copy() + if how not in {"any", "all"}: + raise ValueError(f"{how=} must be 'any' or 'all'") + try: + if not self.hasnans: + return self.copy() + except NotImplementedError: + pass # This is to be consistent with IndexedFrame.dropna to handle nans # as nulls by default data_columns = [ diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index b9dff38539c..987aacb1413 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -870,11 +870,10 @@ def argsort( ) -> cupy.ndarray: if na_position not in {"first", "last"}: raise ValueError(f"invalid na_position: {na_position}") - if (ascending and self._step < 0) or ( not ascending and self._step > 0 ): - return cupy.arange(len(self), -1, -1) + return cupy.arange(len(self) - 1, -1, -1) else: return cupy.arange(len(self)) @@ -892,7 +891,7 @@ def to_cupy(self) -> cupy.ndarray: @_cudf_nvtx_annotate def to_arrow(self) -> pa.Array: - return pa.array(self._range) + return pa.array(self._range, type=pa.from_numpy_dtype(self.dtype)) def __array__(self, dtype=None): raise TypeError(