From 85151e0c76b482a1d952888486f5643b311b5ede Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 14 Oct 2021 16:32:04 -0700 Subject: [PATCH 01/20] Unify argsort implementations. --- python/cudf/cudf/core/_base_index.py | 51 -------------------- python/cudf/cudf/core/dataframe.py | 51 -------------------- python/cudf/cudf/core/frame.py | 72 +++++++++++++++++++++++----- python/cudf/cudf/core/index.py | 12 +++++ python/cudf/cudf/core/multiindex.py | 13 ++++- python/cudf/cudf/core/series.py | 64 +++++++------------------ 6 files changed, 101 insertions(+), 162 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 590bff3b19d..a014f3c0265 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -508,57 +508,6 @@ def take(self, indices): """ return self[indices] - def argsort(self, ascending=True, **kwargs): - """ - Return the integer indices that would sort the index. - - Parameters - ---------- - ascending : bool, default True - If True, returns the indices for ascending order. - If False, returns the indices for descending order. - - Returns - ------- - array : A cupy array containing Integer indices that - would sort the index if used as an indexer. - - Examples - -------- - >>> import cudf - >>> index = cudf.Index([10, 100, 1, 1000]) - >>> index - Int64Index([10, 100, 1, 1000], dtype='int64') - >>> index.argsort() - array([2, 0, 1, 3], dtype=int32) - - The order of argsort can be reversed using - ``ascending`` parameter, by setting it to ``False``. - >>> index.argsort(ascending=False) - array([3, 1, 0, 2], dtype=int32) - - ``argsort`` on a MultiIndex: - - >>> index = cudf.MultiIndex( - ... levels=[[1, 3, 4, -10], [1, 11, 5]], - ... codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - ... names=["x", "y"], - ... ) - >>> index - MultiIndex([( 1, 1), - ( 1, 5), - ( 3, 11), - ( 4, 11), - (-10, 1)], - names=['x', 'y']) - >>> index.argsort() - array([4, 0, 1, 2, 3], dtype=int32) - >>> index.argsort(ascending=False) - array([3, 2, 1, 0, 4], dtype=int32) - """ - indices = self._values.argsort(ascending=ascending, **kwargs) - return cupy.asarray(indices) - def to_frame(self, index=True, name=None): """Create a DataFrame with a column containing this Index diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 5239cf9d648..e14fc397d90 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3248,57 +3248,6 @@ def label_encoding( return outdf - @annotate("ARGSORT", color="yellow", domain="cudf_python") - def argsort(self, ascending=True, na_position="last"): - """ - Sort by the values. - - Parameters - ---------- - ascending : bool or list of bool, default True - If True, sort values in ascending order, otherwise descending. - na_position : {‘first’ or ‘last’}, default ‘last’ - Argument ‘first’ puts NaNs at the beginning, ‘last’ puts NaNs - at the end. - - Returns - ------- - out_column_inds : cuDF Column of indices sorted based on input - - Notes - ----- - Difference from pandas: - - - Support axis='index' only. - - Not supporting: inplace, kind - - Ascending can be a list of bools to control per column - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a':[10, 0, 2], 'b':[-10, 10, 1]}) - >>> df - a b - 0 10 -10 - 1 0 10 - 2 2 1 - >>> inds = df.argsort() - >>> inds - 0 1 - 1 2 - 2 0 - dtype: int32 - >>> df.take(inds) - a b - 1 0 10 - 2 2 1 - 0 10 -10 - """ - inds_col = self._get_sorted_inds( - ascending=ascending, na_position=na_position - ) - return cudf.Series(inds_col) - def sort_values( self, by, diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 0b895460410..972e96978ca 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2899,29 +2899,77 @@ def searchsorted( else: return result - def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): - """ - Sort by the values. + def argsort( + self, + axis=0, + kind="quicksort", + order=None, + ascending=True, + na_position="last", + ): + """Return the integer indices that would sort the Series values. Parameters ---------- - by: list, optional - Labels specifying columns to sort by. By default, - sort by all columns of `self` + axis : {0 or "index"} + Has no effect but is accepted for compatibility with numpy. + kind : {'mergesort', 'quicksort', 'heapsort', 'stable'}, default 'quicksort' + Choice of sorting algorithm. See :func:`numpy.sort` for more + information. 'mergesort' and 'stable' are the only stable + algorithms. Only quicksort is supported in cuDF. + order : None + Has no effect but is accepted for compatibility with numpy. ascending : bool or list of bool, default True If True, sort values in ascending order, otherwise descending. na_position : {‘first’ or ‘last’}, default ‘last’ Argument ‘first’ puts NaNs at the beginning, ‘last’ puts NaNs at the end. + Returns ------- - out_column_inds : cuDF Column of indices sorted based on input + Series of indices sorted based on input. - Difference from pandas: - * Support axis='index' only. - * Not supporting: inplace, kind - * Ascending can be a list of bools to control per column - """ + Examples + -------- + >>> import cudf + >>> s = cudf.Series([3, 1, 2]) + >>> s + 0 3 + 1 1 + 2 2 + dtype: int64 + >>> s.argsort() + 0 1 + 1 2 + 2 0 + dtype: int32 + >>> s[s.argsort()] + 1 1 + 2 2 + 0 3 + dtype: int64 + """ # noqa: E501 + if kind != "quicksort": + if kind not in {"mergesort", "heapsort", "stable"}: + raise AttributeError( + f"{kind} is not a valid sorting algorithm for " + f"'DataFrame' object" + ) + warnings.warn( + f"GPU-accelerated {kind} is currently not supported, " + "defaulting to quicksort." + ) + return cudf.Series._from_data( + { + None: self._get_sorted_inds( + ascending=ascending, na_position=na_position + ) + } + ) + + def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): + # Get an int64 column consisting of the indices required to sort self + # according to the columns specified in by. to_sort = ( self diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index c003454fb59..aa215c418b5 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -748,6 +748,18 @@ def __init__(self, data, **kwargs): name = kwargs.get("name") super().__init__({name: data}) + def argsort( + self, + axis=0, + kind="quicksort", + order=None, + ascending=True, + na_position="last", + ): + return ( + super().argsort(axis, kind, order, ascending, na_position).values + ) + @classmethod def deserialize(cls, header, frames): if "index_column" in header: diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 27edd41ed92..f891d26ba8c 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1364,8 +1364,17 @@ def is_monotonic_decreasing(self): ascending=[False] * len(self.levels), null_position=None ) - def argsort(self, ascending=True, **kwargs): - return self._get_sorted_inds(ascending=ascending, **kwargs).values + def argsort( + self, + axis=0, + kind="quicksort", + order=None, + ascending=True, + na_position="last", + ): + return ( + super().argsort(axis, kind, order, ascending, na_position).values + ) def sort_values(self, return_indexer=False, ascending=True, key=None): if key is not None: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index aada0534f42..89d86f8a01b 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2269,37 +2269,6 @@ def astype(self, dtype, copy=False, errors="raise"): pass return self - def argsort(self, ascending=True, na_position="last"): - """Returns a Series of int64 index that will sort the series. - - Uses Thrust sort. - - Returns - ------- - result: Series - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([3, 1, 2]) - >>> s - 0 3 - 1 1 - 2 2 - dtype: int64 - >>> s.argsort() - 0 1 - 1 2 - 2 0 - dtype: int32 - >>> s[s.argsort()] - 1 1 - 2 2 - 0 3 - dtype: int64 - """ - return self._sort(ascending=ascending, na_position=na_position)[1] - def sort_index(self, axis=0, *args, **kwargs): if axis not in (0, "index"): raise ValueError("Only axis=0 is valid for Series.") @@ -2359,7 +2328,13 @@ def sort_values( if len(self) == 0: return self - vals, inds = self._sort(ascending=ascending, na_position=na_position) + + col_keys, col_inds = self._column.sort_by_values( + ascending=ascending, na_position=na_position + ) + vals = self._from_data({self.name: col_keys}, self._index) + inds = self._from_data({self.name: col_inds}, self._index) + if not ignore_index: index = self.index.take(inds) else: @@ -2519,20 +2494,17 @@ def nsmallest(self, n=5, keep="first"): """ return self._n_largest_or_smallest(n=n, keep=keep, largest=False) - def _sort(self, ascending=True, na_position="last"): - """ - Sort by values - - Returns - ------- - 2-tuple of key and index - """ - col_keys, col_inds = self._column.sort_by_values( - ascending=ascending, na_position=na_position - ) - sr_keys = self._from_data({self.name: col_keys}, self._index) - sr_inds = self._from_data({self.name: col_inds}, self._index) - return sr_keys, sr_inds + def argsort( + self, + axis=0, + kind="quicksort", + order=None, + ascending=True, + na_position="last", + ): + obj = super().argsort(axis, kind, order, ascending, na_position) + obj.name = self.name + return obj def replace(self, to_replace=None, value=None, *args, **kwargs): if is_dict_like(to_replace) and value is not None: From 785c7ed72078f4f282a32763909312934f272b15 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 15 Oct 2021 12:54:45 -0700 Subject: [PATCH 02/20] Standardize implementation of take. --- python/cudf/cudf/core/dataframe.py | 39 ++----------------- python/cudf/cudf/core/frame.py | 39 +++++++++++++++++++ python/cudf/cudf/core/groupby/groupby.py | 24 ------------ python/cudf/cudf/core/index.py | 6 +-- python/cudf/cudf/core/series.py | 48 +----------------------- 5 files changed, 47 insertions(+), 109 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e14fc397d90..768f6214526 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2575,40 +2575,7 @@ class max_speed return result def take(self, positions, keep_index=True): - """ - Return a new DataFrame containing the rows specified by *positions* - - Parameters - ---------- - positions : array-like - Integer or boolean array-like specifying the rows of the output. - If integer, each element represents the integer index of a row. - If boolean, *positions* must be of the same length as *self*, - and represents a boolean mask. - - Returns - ------- - out : DataFrame - New DataFrame - - Examples - -------- - >>> a = cudf.DataFrame({'a': [1.0, 2.0, 3.0], - ... 'b': cudf.Series(['a', 'b', 'c'])}) - >>> a.take([0, 2, 2]) - a b - 0 1.0 a - 2 3.0 c - 2 3.0 c - >>> a.take([True, False, True]) - a b - 0 1.0 a - 2 3.0 c - """ - positions = as_column(positions) - if is_bool_dtype(positions): - return self._apply_boolean_mask(positions) - out = self._gather(positions, keep_index=keep_index) + out = super().take(positions, keep_index) out.columns = self.columns return out @@ -3314,7 +3281,9 @@ def sort_values( # argsort the `by` column return self.take( - self[by].argsort(ascending=ascending, na_position=na_position), + self[by]._get_sorted_inds( + ascending=ascending, na_position=na_position + ), keep_index=not ignore_index, ) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 972e96978ca..d18ca38792e 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -30,6 +30,7 @@ from cudf._typing import ColumnLike, DataFrameOrSeries, Dtype from cudf.api.types import ( _is_non_decimal_numeric_dtype, + is_bool_dtype, is_decimal_dtype, is_dict_like, is_integer_dtype, @@ -529,6 +530,7 @@ def _gather(self, gather_map, keep_index=True, nullify=False): ) result._copy_type_metadata(self, include_index=keep_index) + result._data.names = self._data.names if keep_index and self._index is not None: result._index.names = self._index.names return result @@ -2983,6 +2985,43 @@ def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): return libcudf.sort.order_by(to_sort, ascending, na_position) + def take(self, positions, keep_index=True): + """Return a new object containing the rows specified by *positions* + + Parameters + ---------- + positions : array-like + Integer or boolean array-like specifying the rows of the output. + If integer, each element represents the integer index of a row. + If boolean, *positions* must be of the same length as *self*, + and represents a boolean mask. + keep_index : bool, default True + Whether to retain the index in result or not. + + Returns + ------- + out : DataFrame + New DataFrame + + Examples + -------- + >>> a = cudf.DataFrame({'a': [1.0, 2.0, 3.0], + ... 'b': cudf.Series(['a', 'b', 'c'])}) + >>> a.take([0, 2, 2]) + a b + 0 1.0 a + 2 3.0 c + 2 3.0 c + >>> a.take([True, False, True]) + a b + 0 1.0 a + 2 3.0 c + """ + positions = as_column(positions) + if is_bool_dtype(positions): + return self._apply_boolean_mask(positions) + return self._gather(positions, keep_index=keep_index) + def sin(self): """ Get Trigonometric sine, element-wise. diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6ffba8da069..a62684b6dd7 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1181,18 +1181,6 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): _PROTECTED_KEYS = frozenset(("obj",)) - def __init__( - self, obj, by=None, level=None, sort=False, as_index=True, dropna=True - ): - super().__init__( - obj=obj, - by=by, - level=level, - sort=sort, - as_index=as_index, - dropna=dropna, - ) - def __getitem__(self, key): return self.obj[key].groupby( self.grouping, dropna=self._dropna, sort=self._sort @@ -1265,18 +1253,6 @@ class SeriesGroupBy(GroupBy): Name: Max Speed, dtype: float64 """ - def __init__( - self, obj, by=None, level=None, sort=False, as_index=True, dropna=True - ): - super().__init__( - obj=obj, - by=by, - level=level, - sort=sort, - as_index=as_index, - dropna=dropna, - ) - def agg(self, func): result = super().agg(func) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index aa215c418b5..ea45f9086c0 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -748,6 +748,9 @@ def __init__(self, data, **kwargs): name = kwargs.get("name") super().__init__({name: data}) + def take(self, indices): + return self[indices] + def argsort( self, axis=0, @@ -2383,9 +2386,6 @@ def to_pandas(self): self.to_numpy(na_value=None), name=self.name, dtype="object" ) - def take(self, indices): - return self._values[indices] - def __repr__(self): return ( f"{self.__class__.__name__}({self._values.to_array()}," diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 89d86f8a01b..f7f3c5239ad 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1169,52 +1169,6 @@ def __setitem__(self, key, value): else: self.loc[key] = value - def take(self, indices, keep_index=True): - """ - Return Series by taking values from the corresponding *indices*. - - Parameters - ---------- - indices : array-like or scalar - An array/scalar like integers indicating which positions to take. - keep_index : bool, default True - Whethere to retain the index in result Series or not. - - Returns - ------- - Series - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([10, 11, 12, 13, 14]) - >>> series - 0 10 - 1 11 - 2 12 - 3 13 - 4 14 - dtype: int64 - >>> series.take([0, 4]) - 0 10 - 4 14 - dtype: int64 - - If you want to drop the index, pass `keep_index=False` - - >>> series.take([0, 4], keep_index=False) - 0 10 - 1 14 - dtype: int64 - """ - if keep_index is True or is_scalar(indices): - return self.iloc[indices] - else: - col_inds = as_column(indices) - return self._from_data( - {self.name: self._column.take(col_inds, keep_index=False)} - ) - def __repr__(self): _, height = get_terminal_size() max_rows = ( @@ -2353,7 +2307,7 @@ def _n_largest_or_smallest(self, largest, n, keep): data = data[-n:-n] else: data = data.tail(n) - return data.reverse() + return data[::-1] else: raise ValueError('keep must be either "first", "last"') From c98bc0281f81346f3160e052ed4e46773a7297c7 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 15 Oct 2021 14:25:40 -0700 Subject: [PATCH 03/20] Deprecate parameters and remove unnecessary impls. --- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/frame.py | 23 ++++++++++++++++++----- python/cudf/cudf/core/index.py | 3 --- python/cudf/cudf/core/series.py | 3 +-- 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 768f6214526..cb47cfbe8c3 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2574,7 +2574,7 @@ class max_speed if not inplace: return result - def take(self, positions, keep_index=True): + def take(self, positions, keep_index=None): out = super().take(positions, keep_index) out.columns = self.columns return out diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index d18ca38792e..fe704ab014b 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2985,16 +2985,13 @@ def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): return libcudf.sort.order_by(to_sort, ascending, na_position) - def take(self, positions, keep_index=True): + def take(self, positions, keep_index=None): """Return a new object containing the rows specified by *positions* Parameters ---------- positions : array-like - Integer or boolean array-like specifying the rows of the output. - If integer, each element represents the integer index of a row. - If boolean, *positions* must be of the same length as *self*, - and represents a boolean mask. + Array of ints indicating which positions to take. keep_index : bool, default True Whether to retain the index in result or not. @@ -3017,8 +3014,24 @@ def take(self, positions, keep_index=True): 0 1.0 a 2 3.0 c """ + # TODO: When we remove keep_index we should introduce the axis + # parameter. We could also introduce is_copy, but that's already + # deprecated in pandas so it's probably unnecessary. + if keep_index is not None: + warnings.warn( + "keep_index is deprecated and will be removed in the future.", + FutureWarning, + ) + else: + keep_index = True + positions = as_column(positions) if is_bool_dtype(positions): + warnings.warn( + "Calling take with a boolean array is deprecated and will be " + "removed in the future.", + FutureWarning, + ) return self._apply_boolean_mask(positions) return self._gather(positions, keep_index=keep_index) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index ea45f9086c0..2331f6efdd1 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -748,9 +748,6 @@ def __init__(self, data, **kwargs): name = kwargs.get("name") super().__init__({name: data}) - def take(self, indices): - return self[indices] - def argsort( self, axis=0, diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index f7f3c5239ad..21f4515b9ac 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -111,8 +111,7 @@ def __getitem__(self, arg): ): return data return self._frame._from_data( - {self._frame.name: data}, - index=cudf.Index(self._frame.index.take(arg)), + {self._frame.name: data}, index=cudf.Index(self._frame.index[arg]), ) def __setitem__(self, key, value): From f8ad3b83a19b1d0f11d8b908344999e1cd9d7165 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 15 Oct 2021 14:29:11 -0700 Subject: [PATCH 04/20] Remove MultiIndex take implementation. --- python/cudf/cudf/core/multiindex.py | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index f891d26ba8c..ab174901974 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -7,7 +7,6 @@ import pickle import warnings from collections.abc import Sequence -from numbers import Integral from typing import Any, List, MutableMapping, Optional, Tuple, Union import cupy @@ -835,22 +834,9 @@ def size(self): return self._num_rows def take(self, indices): - if isinstance(indices, (Integral, Sequence)): - indices = np.array(indices) - elif isinstance(indices, cudf.Series) and indices.has_nulls: - raise ValueError("Column must have no nulls.") - elif isinstance(indices, slice): - start, stop, step = indices.indices(len(self)) - indices = column.arange(start, stop, step) - result = MultiIndex.from_frame( - self.to_frame(index=False).take(indices) - ) - if self._codes is not None: - result._codes = self._codes.take(indices) - if self._levels is not None: - result._levels = self._levels - result.names = self.names - return result + obj = super().take(indices) + obj.names = self.names + return obj def serialize(self): header, frames = super().serialize() From 13e9911dcb1b10074edcd8499f65f4039f8932c0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 15 Oct 2021 14:59:36 -0700 Subject: [PATCH 05/20] Share sort_values between Series and DataFrame. --- python/cudf/cudf/core/dataframe.py | 72 ------------------------- python/cudf/cudf/core/frame.py | 3 +- python/cudf/cudf/core/indexed_frame.py | 74 ++++++++++++++++++++++++++ python/cudf/cudf/core/series.py | 47 ++++++---------- 4 files changed, 93 insertions(+), 103 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index cb47cfbe8c3..e13c3922d60 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3215,78 +3215,6 @@ def label_encoding( return outdf - def sort_values( - self, - by, - axis=0, - ascending=True, - inplace=False, - kind="quicksort", - na_position="last", - ignore_index=False, - ): - """ - Sort by the values row-wise. - - Parameters - ---------- - by : str or list of str - Name or list of names to sort by. - ascending : bool or list of bool, default True - Sort ascending vs. descending. Specify list for multiple sort - orders. If this is a list of bools, must match the length of the - by. - na_position : {‘first’, ‘last’}, default ‘last’ - 'first' puts nulls at the beginning, 'last' puts nulls at the end - ignore_index : bool, default False - If True, index will not be sorted. - - Returns - ------- - sorted_obj : cuDF DataFrame - - Notes - ----- - Difference from pandas: - * Support axis='index' only. - * Not supporting: inplace, kind - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame() - >>> df['a'] = [0, 1, 2] - >>> df['b'] = [-3, 2, 0] - >>> df.sort_values('b') - a b - 0 0 -3 - 2 2 0 - 1 1 2 - """ - if inplace: - raise NotImplementedError("`inplace` not currently implemented.") - if kind not in {"quicksort", "mergesort", "heapsort", "stable"}: - raise AttributeError( - f"{kind} is not a valid sorting algorithm for " - f"'DataFrame' object" - ) - elif kind != "quicksort": - msg = ( - f"GPU-accelerated {kind} is currently not supported, " - f"now defaulting to GPU-accelerated quicksort." - ) - warnings.warn(msg) - if axis != 0: - raise NotImplementedError("`axis` not currently implemented.") - - # argsort the `by` column - return self.take( - self[by]._get_sorted_inds( - ascending=ascending, na_position=na_position - ), - keep_index=not ignore_index, - ) - def agg(self, aggs, axis=None): """ Aggregate using one or more operations over the specified axis. diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index fe704ab014b..5301cb1a5f1 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3016,7 +3016,8 @@ def take(self, positions, keep_index=None): """ # TODO: When we remove keep_index we should introduce the axis # parameter. We could also introduce is_copy, but that's already - # deprecated in pandas so it's probably unnecessary. + # deprecated in pandas so it's probably unnecessary. We also need to + # introduce Index.take's allow_fill and fill_value parameters. if keep_index is not None: warnings.warn( "keep_index is deprecated and will be removed in the future.", diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index aba24171f06..25d6357a8af 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3,6 +3,7 @@ from __future__ import annotations +import warnings from typing import Type, TypeVar import cupy as cp @@ -388,3 +389,76 @@ def sort_index( if ignore_index is True: out = out.reset_index(drop=True) return self._mimic_inplace(out, inplace=inplace) + + def sort_values( + self, + by, + axis=0, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + ignore_index=False, + ): + """Sort by the values along either axis. + + Parameters + ---------- + by : str or list of str + Name or list of names to sort by. + ascending : bool or list of bool, default True + Sort ascending vs. descending. Specify list for multiple sort + orders. If this is a list of bools, must match the length of the + by. + na_position : {‘first’, ‘last’}, default ‘last’ + 'first' puts nulls at the beginning, 'last' puts nulls at the end + ignore_index : bool, default False + If True, index will not be sorted. + + Returns + ------- + Frame : Frame with sorted values. + + Notes + ----- + Difference from pandas: + * Support axis='index' only. + * Not supporting: inplace, kind + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame() + >>> df['a'] = [0, 1, 2] + >>> df['b'] = [-3, 2, 0] + >>> df.sort_values('b') + a b + 0 0 -3 + 2 2 0 + 1 1 2 + """ + if inplace: + raise NotImplementedError("`inplace` not currently implemented.") + if kind != "quicksort": + if kind not in {"mergesort", "heapsort", "stable"}: + raise AttributeError( + f"{kind} is not a valid sorting algorithm for " + f"'DataFrame' object" + ) + warnings.warn( + f"GPU-accelerated {kind} is currently not supported, " + f"defaulting to quicksort." + ) + if axis != 0: + raise NotImplementedError("`axis` not currently implemented.") + + if len(self) == 0: + return self + + # argsort the `by` column + return self.take( + self._get_columns_by_label(by)._get_sorted_inds( + ascending=ascending, na_position=na_position + ), + keep_index=not ignore_index, + ) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 21f4515b9ac..d6a64e3911b 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2236,28 +2236,28 @@ def sort_values( na_position="last", ignore_index=False, ): - """ - Sort by the values. - - Sort a Series in ascending or descending order by some criterion. + """Sort by the values along either axis. Parameters ---------- - ascending : bool, default True - If True, sort values in ascending order, otherwise descending. + ascending : bool or list of bool, default True + Sort ascending vs. descending. Specify list for multiple sort + orders. If this is a list of bools, must match the length of the + by. na_position : {‘first’, ‘last’}, default ‘last’ - 'first' puts nulls at the beginning, 'last' puts nulls at the end. + 'first' puts nulls at the beginning, 'last' puts nulls at the end ignore_index : bool, default False If True, index will not be sorted. Returns ------- - sorted_obj : cuDF Series + Series : Series with sorted values. Notes ----- Difference from pandas: - * Not supporting: `inplace`, `kind` + * Support axis='index' only. + * Not supporting: inplace, kind Examples -------- @@ -2271,28 +2271,15 @@ def sort_values( 1 5 dtype: int64 """ - - if inplace: - raise NotImplementedError("`inplace` not currently implemented.") - if kind != "quicksort": - raise NotImplementedError("`kind` not currently implemented.") - if axis != 0: - raise NotImplementedError("`axis` not currently implemented.") - - if len(self) == 0: - return self - - col_keys, col_inds = self._column.sort_by_values( - ascending=ascending, na_position=na_position + return super().sort_values( + by=self.name, + axis=axis, + ascending=ascending, + inplace=inplace, + kind=kind, + na_position=na_position, + ignore_index=ignore_index, ) - vals = self._from_data({self.name: col_keys}, self._index) - inds = self._from_data({self.name: col_inds}, self._index) - - if not ignore_index: - index = self.index.take(inds) - else: - index = self.index - return vals.set_index(index) def _n_largest_or_smallest(self, largest, n, keep): direction = largest From c1c7ef9ba5f36da758b0b9733ffcb578f5b4c2b9 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 15 Oct 2021 15:21:21 -0700 Subject: [PATCH 06/20] Standardize argsort to return cupy arrays except for Series. --- python/cudf/cudf/core/frame.py | 12 ++++-------- python/cudf/cudf/core/index.py | 12 ------------ python/cudf/cudf/core/multiindex.py | 12 ------------ python/cudf/cudf/core/series.py | 4 +++- python/cudf/cudf/tests/test_dataframe.py | 4 ++-- 5 files changed, 9 insertions(+), 35 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 5301cb1a5f1..b2366dd394c 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2929,7 +2929,7 @@ def argsort( Returns ------- - Series of indices sorted based on input. + cupy.ndarray: The indices sorted based on input. Examples -------- @@ -2961,13 +2961,9 @@ def argsort( f"GPU-accelerated {kind} is currently not supported, " "defaulting to quicksort." ) - return cudf.Series._from_data( - { - None: self._get_sorted_inds( - ascending=ascending, na_position=na_position - ) - } - ) + return self._get_sorted_inds( + ascending=ascending, na_position=na_position + ).values def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): # Get an int64 column consisting of the indices required to sort self diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 2331f6efdd1..8ca604a45de 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -748,18 +748,6 @@ def __init__(self, data, **kwargs): name = kwargs.get("name") super().__init__({name: data}) - def argsort( - self, - axis=0, - kind="quicksort", - order=None, - ascending=True, - na_position="last", - ): - return ( - super().argsort(axis, kind, order, ascending, na_position).values - ) - @classmethod def deserialize(cls, header, frames): if "index_column" in header: diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index ab174901974..bc0c4ef00af 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1350,18 +1350,6 @@ def is_monotonic_decreasing(self): ascending=[False] * len(self.levels), null_position=None ) - def argsort( - self, - axis=0, - kind="quicksort", - order=None, - ascending=True, - na_position="last", - ): - return ( - super().argsort(axis, kind, order, ascending, na_position).values - ) - def sort_values(self, return_indexer=False, ascending=True, key=None): if key is not None: raise NotImplementedError("key parameter is not yet implemented.") diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index d6a64e3911b..926fe97c59b 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2442,7 +2442,9 @@ def argsort( ascending=True, na_position="last", ): - obj = super().argsort(axis, kind, order, ascending, na_position) + obj = type(self)._from_data( + {None: super().argsort(axis, kind, order, ascending, na_position)} + ) obj.name = self.name return obj diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index c1eade0fcdc..ae331a1d5ce 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8732,12 +8732,12 @@ def test_explode(data, labels, ignore_index, p_index, label_to_explode): ( cudf.DataFrame({"a": [10, 0, 2], "b": [-10, 10, 1]}), True, - cudf.Series([1, 2, 0], dtype="int32"), + cupy.array([1, 2, 0], dtype="int32"), ), ( cudf.DataFrame({"a": [10, 0, 2], "b": [-10, 10, 1]}), False, - cudf.Series([0, 2, 1], dtype="int32"), + cupy.array([0, 2, 1], dtype="int32"), ), ], ) From 3d14373b27f7b376cfb09c221dab1516eb6e9386 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 15 Oct 2021 15:23:54 -0700 Subject: [PATCH 07/20] Fix MultiIndex.__getitem__ for slices. --- python/cudf/cudf/core/multiindex.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index bc0c4ef00af..d06d9705bdb 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -7,6 +7,7 @@ import pickle import warnings from collections.abc import Sequence +from numbers import Integral from typing import Any, List, MutableMapping, Optional, Tuple, Union import cupy @@ -873,11 +874,26 @@ def deserialize(cls, header, frames): return obj._set_names(column_names) def __getitem__(self, index): - if isinstance(index, int): - # we are indexing into a single row of the MultiIndex, - # return that row as a tuple: - return self.take(index).to_pandas()[0] - return self.take(index) + flatten = isinstance(index, int) + + if isinstance(index, (Integral, Sequence)): + index = np.array(index) + elif isinstance(index, slice): + start, stop, step = index.indices(len(self)) + index = column.arange(start, stop, step) + result = MultiIndex.from_frame(self.to_frame(index=False).take(index)) + + # we are indexing into a single row of the MultiIndex, + # return that row as a tuple: + if flatten: + return result.to_pandas()[0] + + if self._codes is not None: + result._codes = self._codes.take(index) + if self._levels is not None: + result._levels = self._levels + result.names = self.names + return result def to_frame(self, index=True, name=None): # TODO: Currently this function makes a shallow copy, which is From 6bbcaa4a7ee125c07714ab1575883ced7263e3a6 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 15 Oct 2021 15:48:52 -0700 Subject: [PATCH 08/20] Unify index sort_values implementations. --- python/cudf/cudf/core/_base_index.py | 18 +++++++++++++----- python/cudf/cudf/core/multiindex.py | 14 -------------- 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index a014f3c0265..14eea97a487 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -5,7 +5,6 @@ import pickle from typing import Any, Set -import cupy import pandas as pd import cudf @@ -974,7 +973,13 @@ def _intersection(self, other, sort=None): return intersection_result.sort_values() return intersection_result - def sort_values(self, return_indexer=False, ascending=True, key=None): + def sort_values( + self, + return_indexer=False, + ascending=True, + na_position="last", + key=None, + ): """ Return a sorted copy of the index, and optionally return the indices that sorted the index itself. @@ -985,6 +990,9 @@ def sort_values(self, return_indexer=False, ascending=True, key=None): Should the indices that would sort the index be returned. ascending : bool, default True Should the index values be sorted in an ascending order. + na_position : {'first' or 'last'}, default 'last' + Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at + the end. key : None, optional This parameter is NON-FUNCTIONAL. @@ -1051,11 +1059,11 @@ def sort_values(self, return_indexer=False, ascending=True, key=None): if key is not None: raise NotImplementedError("key parameter is not yet implemented.") - indices = self._values.argsort(ascending=ascending) - index_sorted = cudf.Index(self.take(indices), name=self.name) + indices = self.argsort(ascending=ascending, na_position=na_position) + index_sorted = self.take(indices) if return_indexer: - return index_sorted, cupy.asarray(indices) + return index_sorted, indices else: return index_sorted diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index d06d9705bdb..b77133a7431 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1366,20 +1366,6 @@ def is_monotonic_decreasing(self): ascending=[False] * len(self.levels), null_position=None ) - def sort_values(self, return_indexer=False, ascending=True, key=None): - if key is not None: - raise NotImplementedError("key parameter is not yet implemented.") - - indices = cudf.Series._from_data( - {None: self._get_sorted_inds(ascending=ascending)} - ) - index_sorted = as_index(self.take(indices), name=self.names) - - if return_indexer: - return index_sorted, cupy.asarray(indices) - else: - return index_sorted - def fillna(self, value): """ Fill null values with the specified value. From 79cb909c0df0ea31c2aaf64f4fffb62d42ded9e2 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 15 Oct 2021 16:00:00 -0700 Subject: [PATCH 09/20] Remove BaseIndex.gpu_values. --- python/cudf/cudf/core/_base_index.py | 7 ------- python/cudf/cudf/core/dataframe.py | 2 +- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 14eea97a487..d4c958594dc 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -564,13 +564,6 @@ def to_dlpack(self): return cudf.io.dlpack.to_dlpack(self) - @property - def gpu_values(self): - """ - View the data as a numba device array object - """ - return self._values.data_array_view - def append(self, other): """ Append a collection of Index options together. diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e13c3922d60..ce6dbdea675 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3488,7 +3488,7 @@ def _n_largest_or_smallest(self, method, n, columns, keep): # Operate sorted_series = getattr(col, method)(n=n, keep=keep) df = DataFrame() - new_positions = sorted_series.index.gpu_values + new_positions = sorted_series.index for k in self._data.names: if k == column: df[k] = sorted_series From 8fe0038296afd242f93e29146cb2de66063733a1 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 15 Oct 2021 16:42:13 -0700 Subject: [PATCH 10/20] Unify implementations of nsmallest and nlargest. --- python/cudf/cudf/core/dataframe.py | 23 ++---------------- python/cudf/cudf/core/indexed_frame.py | 33 ++++++++++++++++++++++++++ python/cudf/cudf/core/series.py | 20 ++-------------- 3 files changed, 37 insertions(+), 39 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index ce6dbdea675..8517d7a2a62 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3407,7 +3407,7 @@ def nlargest(self, n, columns, keep="first"): Italy 59000000 1937894 IT Brunei 434000 12128 BN """ - return self._n_largest_or_smallest("nlargest", n, columns, keep) + return self._n_largest_or_smallest(True, n, columns, keep) def nsmallest(self, n, columns, keep="first"): """Get the rows of the DataFrame sorted by the n smallest value of *columns* @@ -3475,26 +3475,7 @@ def nsmallest(self, n, columns, keep="first"): Tuvalu 11300 38 TV Nauru 337000 182 NR """ - return self._n_largest_or_smallest("nsmallest", n, columns, keep) - - def _n_largest_or_smallest(self, method, n, columns, keep): - # Get column to operate on - if not isinstance(columns, str): - [column] = columns - else: - column = columns - - col = self[column].reset_index(drop=True) - # Operate - sorted_series = getattr(col, method)(n=n, keep=keep) - df = DataFrame() - new_positions = sorted_series.index - for k in self._data.names: - if k == column: - df[k] = sorted_series - else: - df[k] = self[k].reset_index(drop=True).take(new_positions) - return df.set_index(self.index.take(new_positions)) + return self._n_largest_or_smallest(False, n, columns, keep) def transpose(self): """Transpose index and columns. diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 25d6357a8af..7a509002ef8 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -462,3 +462,36 @@ def sort_values( ), keep_index=not ignore_index, ) + + def _n_largest_or_smallest(self, largest, n, columns, keep): + # Get column to operate on + if isinstance(columns, str): + columns = [columns] + + if len(self) == 0: + return self + + if keep == "first": + if n < 0: + n = 0 + + # argsort the `by` column + return self.take( + self._get_columns_by_label(columns)._get_sorted_inds( + ascending=not largest + )[:n], + keep_index=True, + ) + elif keep == "last": + indices = self._get_columns_by_label(columns)._get_sorted_inds( + ascending=largest + ) + + if n <= 0: + # Empty slice. + indices = indices[0:0] + else: + indices = indices[: -n - 1 : -1] + return self.take(indices, keep_index=True) + else: + raise ValueError('keep must be either "first", "last"') diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 926fe97c59b..b70a5776649 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2281,22 +2281,6 @@ def sort_values( ignore_index=ignore_index, ) - def _n_largest_or_smallest(self, largest, n, keep): - direction = largest - if keep == "first": - if n < 0: - n = 0 - return self.sort_values(ascending=not direction).head(n) - elif keep == "last": - data = self.sort_values(ascending=direction) - if n <= 0: - data = data[-n:-n] - else: - data = data.tail(n) - return data[::-1] - else: - raise ValueError('keep must be either "first", "last"') - def nlargest(self, n=5, keep="first"): """Returns a new Series of the *n* largest element. @@ -2357,7 +2341,7 @@ def nlargest(self, n=5, keep="first"): Brunei 434000 dtype: int64 """ - return self._n_largest_or_smallest(n=n, keep=keep, largest=True) + return self._n_largest_or_smallest(True, n, [self.name], keep) def nsmallest(self, n=5, keep="first"): """ @@ -2432,7 +2416,7 @@ def nsmallest(self, n=5, keep="first"): Tuvalu 11300 dtype: int64 """ - return self._n_largest_or_smallest(n=n, keep=keep, largest=False) + return self._n_largest_or_smallest(False, n, [self.name], keep) def argsort( self, From 4b405a0b9ea48d8d339e5761ce9b103c298c739d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Sun, 17 Oct 2021 09:13:59 -0700 Subject: [PATCH 11/20] Make all argsort signatures consistent with pandas. --- python/cudf/cudf/core/frame.py | 7 ++++++ python/cudf/cudf/core/index.py | 38 +++++++++++++++++++++++++++++++++ python/cudf/cudf/core/series.py | 10 ++++++++- 3 files changed, 54 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index b2366dd394c..b286a19bb13 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2903,6 +2903,7 @@ def searchsorted( def argsort( self, + by=None, axis=0, kind="quicksort", order=None, @@ -2913,6 +2914,8 @@ def argsort( Parameters ---------- + by : str or list of str, default None + Name or list of names to sort by. If None, sort by all columns. axis : {0 or "index"} Has no effect but is accepted for compatibility with numpy. kind : {'mergesort', 'quicksort', 'heapsort', 'stable'}, default 'quicksort' @@ -2951,6 +2954,10 @@ def argsort( 0 3 dtype: int64 """ # noqa: E501 + if by is None: + by = list(self._data.names) + elif isinstance(by, str): + by = [by] if kind != "quicksort": if kind not in {"mergesort", "heapsort", "stable"}: raise AttributeError( diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 8ca604a45de..de463269743 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1162,6 +1162,44 @@ def is_categorical(self): def is_interval(self): return False + def argsort( + self, + axis=0, + kind="quicksort", + order=None, + ascending=True, + na_position="last", + ): + """Return the integer indices that would sort the Series values. + + Parameters + ---------- + axis : {0 or "index"} + Has no effect but is accepted for compatibility with numpy. + kind : {'mergesort', 'quicksort', 'heapsort', 'stable'}, default 'quicksort' + Choice of sorting algorithm. See :func:`numpy.sort` for more + information. 'mergesort' and 'stable' are the only stable + algorithms. Only quicksort is supported in cuDF. + order : None + Has no effect but is accepted for compatibility with numpy. + ascending : bool or list of bool, default True + If True, sort values in ascending order, otherwise descending. + na_position : {‘first’ or ‘last’}, default ‘last’ + Argument ‘first’ puts NaNs at the beginning, ‘last’ puts NaNs + at the end. + + Returns + ------- + cupy.ndarray: The indices sorted based on input. + """ # noqa: E501 + return super().argsort( + axis=axis, + kind=kind, + order=order, + ascending=ascending, + na_position=na_position, + ) + class NumericIndex(GenericIndex): """Immutable, ordered and sliceable sequence of labels. diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index b70a5776649..0ec029c3397 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2427,7 +2427,15 @@ def argsort( na_position="last", ): obj = type(self)._from_data( - {None: super().argsort(axis, kind, order, ascending, na_position)} + { + None: super().argsort( + axis=axis, + kind=kind, + order=order, + ascending=ascending, + na_position=na_position, + ) + } ) obj.name = self.name return obj From 128a00c17e0b4e3c454682318b5bd6a37255b723 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Sun, 17 Oct 2021 09:36:22 -0700 Subject: [PATCH 12/20] Put back groupby overrides for this PR. --- python/cudf/cudf/core/groupby/groupby.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index a62684b6dd7..6ffba8da069 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1181,6 +1181,18 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): _PROTECTED_KEYS = frozenset(("obj",)) + def __init__( + self, obj, by=None, level=None, sort=False, as_index=True, dropna=True + ): + super().__init__( + obj=obj, + by=by, + level=level, + sort=sort, + as_index=as_index, + dropna=dropna, + ) + def __getitem__(self, key): return self.obj[key].groupby( self.grouping, dropna=self._dropna, sort=self._sort @@ -1253,6 +1265,18 @@ class SeriesGroupBy(GroupBy): Name: Max Speed, dtype: float64 """ + def __init__( + self, obj, by=None, level=None, sort=False, as_index=True, dropna=True + ): + super().__init__( + obj=obj, + by=by, + level=level, + sort=sort, + as_index=as_index, + dropna=dropna, + ) + def agg(self, func): result = super().agg(func) From d89fe29079f2563e077293307a4129da2728290e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Sun, 17 Oct 2021 09:59:09 -0700 Subject: [PATCH 13/20] Don't construct list of columns unnecessarily. --- python/cudf/cudf/core/frame.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index b286a19bb13..c27b4f4c255 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2954,9 +2954,7 @@ def argsort( 0 3 dtype: int64 """ # noqa: E501 - if by is None: - by = list(self._data.names) - elif isinstance(by, str): + if isinstance(by, str): by = [by] if kind != "quicksort": if kind not in {"mergesort", "heapsort", "stable"}: @@ -2969,7 +2967,7 @@ def argsort( "defaulting to quicksort." ) return self._get_sorted_inds( - ascending=ascending, na_position=na_position + by=by, ascending=ascending, na_position=na_position ).values def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): @@ -2979,7 +2977,7 @@ def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): to_sort = ( self if by is None - else self._get_columns_by_label(by, downcast=False) + else self._get_columns_by_label(list(by), downcast=False) ) # If given a scalar need to construct a sequence of length # of columns From e2d57ba38782db1ee11cf269eb0f9eb535a89af7 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 18 Oct 2021 11:58:10 -0700 Subject: [PATCH 14/20] Add DataFrame tests of multiple column sorts. --- python/cudf/cudf/tests/test_sorting.py | 33 +++++++------------------- 1 file changed, 8 insertions(+), 25 deletions(-) diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index 53676a47046..6533aa74fb7 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -154,33 +154,16 @@ def test_series_nsmallest(data, n): @pytest.mark.parametrize("nelem,n", [(1, 1), (100, 100), (10, 5), (100, 10)]) -def test_dataframe_nlargest(nelem, n): +@pytest.mark.parametrize("op", ["nsmallest", "nlargest"]) +def test_dataframe_nlargest_nsmallest(nelem, n, op): np.random.seed(0) - df = DataFrame() - df["a"] = aa = np.random.random(nelem) - df["b"] = bb = np.random.random(nelem) - res = df.nlargest(n, "a") - - # Check - inds = np.argsort(aa) - assert_eq(res["a"].to_numpy(), aa[inds][-n:][::-1]) - assert_eq(res["b"].to_numpy(), bb[inds][-n:][::-1]) - assert_eq(res.index.values, inds[-n:][::-1]) - + aa = np.random.random(nelem) + bb = np.random.random(nelem) -@pytest.mark.parametrize("nelem,n", [(10, 5), (100, 10)]) -def test_dataframe_nsmallest(nelem, n): - np.random.seed(0) - df = DataFrame() - df["a"] = aa = np.random.random(nelem) - df["b"] = bb = np.random.random(nelem) - res = df.nsmallest(n, "a") - - # Check - inds = np.argsort(-aa) - assert_eq(res["a"].to_numpy(), aa[inds][-n:][::-1]) - assert_eq(res["b"].to_numpy(), bb[inds][-n:][::-1]) - assert_eq(res.index.values, inds[-n:][::-1]) + df = DataFrame({"a": aa, "b": bb}) + pdf = df.to_pandas() + assert_eq(getattr(df, op)(n, "a"), getattr(pdf, op)(n, "a")) + assert_eq(getattr(df, op)(n, ["b", "a"]), getattr(pdf, op)(n, ["b", "a"])) @pytest.mark.parametrize( From a71c408ec1084e9789002a25117a8e0d2e548033 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 18 Oct 2021 16:05:03 -0700 Subject: [PATCH 15/20] Remove now superfluous take implementation. --- python/cudf/cudf/core/_base_index.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index d4c958594dc..35e5cbd0f1e 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -498,15 +498,6 @@ def fillna(self, value, downcast=None): return super().fillna(value=value) - def take(self, indices): - """Gather only the specific subset of indices - - Parameters - ---------- - indices: An array-like that maps to values contained in this Index. - """ - return self[indices] - def to_frame(self, index=True, name=None): """Create a DataFrame with a column containing this Index From 1c2549e4d70e29080ec3885ac7cc8c31f418e396 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 18 Oct 2021 16:08:04 -0700 Subject: [PATCH 16/20] Rename positions to indices for pandas consistency. --- python/cudf/cudf/core/dataframe.py | 4 ++-- python/cudf/cudf/core/frame.py | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 8517d7a2a62..ed38c9da3c6 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2574,8 +2574,8 @@ class max_speed if not inplace: return result - def take(self, positions, keep_index=None): - out = super().take(positions, keep_index) + def take(self, indices, keep_index=None): + out = super().take(indices, keep_index) out.columns = self.columns return out diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index c27b4f4c255..55bce9a8f34 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2986,12 +2986,12 @@ def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): return libcudf.sort.order_by(to_sort, ascending, na_position) - def take(self, positions, keep_index=None): + def take(self, indices, keep_index=None): """Return a new object containing the rows specified by *positions* Parameters ---------- - positions : array-like + indices : array-like Array of ints indicating which positions to take. keep_index : bool, default True Whether to retain the index in result or not. @@ -3027,15 +3027,15 @@ def take(self, positions, keep_index=None): else: keep_index = True - positions = as_column(positions) - if is_bool_dtype(positions): + indices = as_column(indices) + if is_bool_dtype(indices): warnings.warn( "Calling take with a boolean array is deprecated and will be " "removed in the future.", FutureWarning, ) - return self._apply_boolean_mask(positions) - return self._gather(positions, keep_index=keep_index) + return self._apply_boolean_mask(indices) + return self._gather(indices, keep_index=keep_index) def sin(self): """ From 3fac33b91194eb4b4c28facd519956ba61b83a24 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 25 Oct 2021 14:03:28 -0700 Subject: [PATCH 17/20] Allow deserialization to return RangeIndex. --- python/cudf/cudf/tests/test_pickling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py index 0f8b46cee35..28e63ec41f1 100644 --- a/python/cudf/cudf/tests/test_pickling.py +++ b/python/cudf/cudf/tests/test_pickling.py @@ -6,7 +6,7 @@ import pandas as pd import pytest -from cudf import DataFrame, GenericIndex, Series +from cudf import DataFrame, GenericIndex, RangeIndex, Series from cudf.core.buffer import Buffer from cudf.testing._utils import assert_eq @@ -28,7 +28,7 @@ def check_serialization(df): assert_frame_picklable(df[2:-2]) # sorted sortvaldf = df.sort_values("vals") - assert isinstance(sortvaldf.index, GenericIndex) + assert isinstance(sortvaldf.index, (GenericIndex, RangeIndex)) assert_frame_picklable(sortvaldf) # out-of-band if pickle.HIGHEST_PROTOCOL >= 5: From ea0d075f9377cfd51a43ad5351d43f84c0cbd5c0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 25 Oct 2021 14:16:22 -0700 Subject: [PATCH 18/20] Parameterize the columns. --- python/cudf/cudf/tests/test_sorting.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index 6533aa74fb7..00cd31e7539 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -155,15 +155,15 @@ def test_series_nsmallest(data, n): @pytest.mark.parametrize("nelem,n", [(1, 1), (100, 100), (10, 5), (100, 10)]) @pytest.mark.parametrize("op", ["nsmallest", "nlargest"]) -def test_dataframe_nlargest_nsmallest(nelem, n, op): +@pytest.mark.parametrize("columns", ["a", ["b", "a"]]) +def test_dataframe_nlargest_nsmallest(nelem, n, op, columns): np.random.seed(0) aa = np.random.random(nelem) bb = np.random.random(nelem) df = DataFrame({"a": aa, "b": bb}) pdf = df.to_pandas() - assert_eq(getattr(df, op)(n, "a"), getattr(pdf, op)(n, "a")) - assert_eq(getattr(df, op)(n, ["b", "a"]), getattr(pdf, op)(n, ["b", "a"])) + assert_eq(getattr(df, op)(n, columns), getattr(pdf, op)(n, columns)) @pytest.mark.parametrize( From 3e412bcf5421633dfb449f9ac0b35beee296488d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 27 Oct 2021 15:09:04 -0700 Subject: [PATCH 19/20] Update python/cudf/cudf/core/series.py Co-authored-by: GALI PREM SAGAR --- python/cudf/cudf/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 0ec029c3397..b9636580a91 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2426,7 +2426,7 @@ def argsort( ascending=True, na_position="last", ): - obj = type(self)._from_data( + obj = self.__class__._from_data( { None: super().argsort( axis=axis, From b06a43e86d65974f043105ba659664160f023c00 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 1 Nov 2021 17:15:08 -0700 Subject: [PATCH 20/20] Address PR reviews. --- python/cudf/cudf/core/_base_index.py | 14 ++++++++ python/cudf/cudf/core/frame.py | 46 +++++++++++++++++++++++--- python/cudf/cudf/core/indexed_frame.py | 5 +++ python/cudf/cudf/core/multiindex.py | 2 ++ 4 files changed, 63 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 35e5cbd0f1e..eea8e3c418f 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -3,6 +3,7 @@ from __future__ import annotations, division, print_function import pickle +import warnings from typing import Any, Set import pandas as pd @@ -555,6 +556,17 @@ def to_dlpack(self): return cudf.io.dlpack.to_dlpack(self) + @property + def gpu_values(self): + """ + View the data as a numba device array object + """ + warnings.warn( + "The gpu_values property is deprecated and will be removed.", + FutureWarning, + ) + return self._values.data_array_view + def append(self, other): """ Append a collection of Index options together. @@ -1042,6 +1054,8 @@ def sort_values( """ if key is not None: raise NotImplementedError("key parameter is not yet implemented.") + if na_position not in {"first", "last"}: + raise ValueError(f"invalid na_position: {na_position}") indices = self.argsort(ascending=ascending, na_position=na_position) index_sorted = self.take(indices) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 55bce9a8f34..666144d6819 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2880,6 +2880,9 @@ def searchsorted( """ # Call libcudf++ search_sorted primitive + if na_position not in {"first", "last"}: + raise ValueError(f"invalid na_position: {na_position}") + scalar_flag = None if is_scalar(values): scalar_flag = True @@ -2901,6 +2904,7 @@ def searchsorted( else: return result + @annotate("ARGSORT", color="yellow", domain="cudf_python") def argsort( self, by=None, @@ -2936,6 +2940,8 @@ def argsort( Examples -------- + **Series** + >>> import cudf >>> s = cudf.Series([3, 1, 2]) >>> s @@ -2953,9 +2959,21 @@ def argsort( 2 2 0 3 dtype: int64 + + **DataFrame** + >>> import cudf + >>> df = cudf.DataFrame({'foo': [3, 1, 2]}) + >>> df.argsort() + array([1, 2, 0], dtype=int32) + + **Index** + >>> import cudf + >>> idx = cudf.Index([3, 1, 2]) + >>> idx.argsort() + array([1, 2, 0], dtype=int32) """ # noqa: E501 - if isinstance(by, str): - by = [by] + if na_position not in {"first", "last"}: + raise ValueError(f"invalid na_position: {na_position}") if kind != "quicksort": if kind not in {"mergesort", "heapsort", "stable"}: raise AttributeError( @@ -2966,6 +2984,9 @@ def argsort( f"GPU-accelerated {kind} is currently not supported, " "defaulting to quicksort." ) + + if isinstance(by, str): + by = [by] return self._get_sorted_inds( by=by, ascending=ascending, na_position=na_position ).values @@ -2998,11 +3019,22 @@ def take(self, indices, keep_index=None): Returns ------- - out : DataFrame - New DataFrame + out : Series or DataFrame or Index + New object with desired subset of rows. Examples -------- + **Series** + >>> s = cudf.Series(['a', 'b', 'c', 'd', 'e']) + >>> s.take([2, 0, 4, 3]) + 2 c + 0 a + 4 e + 3 d + dtype: object + + **DataFrame** + >>> a = cudf.DataFrame({'a': [1.0, 2.0, 3.0], ... 'b': cudf.Series(['a', 'b', 'c'])}) >>> a.take([0, 2, 2]) @@ -3014,6 +3046,12 @@ def take(self, indices, keep_index=None): a b 0 1.0 a 2 3.0 c + + **Index** + + >>> idx = cudf.Index(['a', 'b', 'c', 'd', 'e']) + >>> idx.take([2, 0, 4, 3]) + StringIndex(['c' 'a' 'e' 'd'], dtype='object') """ # TODO: When we remove keep_index we should introduce the axis # parameter. We could also introduce is_copy, but that's already diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 7a509002ef8..552b590e0d8 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -352,6 +352,9 @@ def sort_index( if key is not None: raise NotImplementedError("key is not yet supported.") + if na_position not in {"first", "last"}: + raise ValueError(f"invalid na_position: {na_position}") + if axis in (0, "index"): idx = self.index if isinstance(idx, MultiIndex): @@ -437,6 +440,8 @@ def sort_values( 2 2 0 1 1 2 """ + if na_position not in {"first", "last"}: + raise ValueError(f"invalid na_position: {na_position}") if inplace: raise NotImplementedError("`inplace` not currently implemented.") if kind != "quicksort": diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index b77133a7431..7c132e3fb71 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -835,6 +835,8 @@ def size(self): return self._num_rows def take(self, indices): + if isinstance(indices, cudf.Series) and indices.has_nulls: + raise ValueError("Column must have no nulls.") obj = super().take(indices) obj.names = self.names return obj