diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 590bff3b19d..eea8e3c418f 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -3,9 +3,9 @@ from __future__ import annotations, division, print_function import pickle +import warnings from typing import Any, Set -import cupy import pandas as pd import cudf @@ -499,66 +499,6 @@ def fillna(self, value, downcast=None): return super().fillna(value=value) - def take(self, indices): - """Gather only the specific subset of indices - - Parameters - ---------- - indices: An array-like that maps to values contained in this Index. - """ - return self[indices] - - def argsort(self, ascending=True, **kwargs): - """ - Return the integer indices that would sort the index. - - Parameters - ---------- - ascending : bool, default True - If True, returns the indices for ascending order. - If False, returns the indices for descending order. - - Returns - ------- - array : A cupy array containing Integer indices that - would sort the index if used as an indexer. - - Examples - -------- - >>> import cudf - >>> index = cudf.Index([10, 100, 1, 1000]) - >>> index - Int64Index([10, 100, 1, 1000], dtype='int64') - >>> index.argsort() - array([2, 0, 1, 3], dtype=int32) - - The order of argsort can be reversed using - ``ascending`` parameter, by setting it to ``False``. - >>> index.argsort(ascending=False) - array([3, 1, 0, 2], dtype=int32) - - ``argsort`` on a MultiIndex: - - >>> index = cudf.MultiIndex( - ... levels=[[1, 3, 4, -10], [1, 11, 5]], - ... codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - ... names=["x", "y"], - ... ) - >>> index - MultiIndex([( 1, 1), - ( 1, 5), - ( 3, 11), - ( 4, 11), - (-10, 1)], - names=['x', 'y']) - >>> index.argsort() - array([4, 0, 1, 2, 3], dtype=int32) - >>> index.argsort(ascending=False) - array([3, 2, 1, 0, 4], dtype=int32) - """ - indices = self._values.argsort(ascending=ascending, **kwargs) - return cupy.asarray(indices) - def to_frame(self, index=True, name=None): """Create a DataFrame with a column containing this Index @@ -621,6 +561,10 @@ def gpu_values(self): """ View the data as a numba device array object """ + warnings.warn( + "The gpu_values property is deprecated and will be removed.", + FutureWarning, + ) return self._values.data_array_view def append(self, other): @@ -1025,7 +969,13 @@ def _intersection(self, other, sort=None): return intersection_result.sort_values() return intersection_result - def sort_values(self, return_indexer=False, ascending=True, key=None): + def sort_values( + self, + return_indexer=False, + ascending=True, + na_position="last", + key=None, + ): """ Return a sorted copy of the index, and optionally return the indices that sorted the index itself. @@ -1036,6 +986,9 @@ def sort_values(self, return_indexer=False, ascending=True, key=None): Should the indices that would sort the index be returned. ascending : bool, default True Should the index values be sorted in an ascending order. + na_position : {'first' or 'last'}, default 'last' + Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at + the end. key : None, optional This parameter is NON-FUNCTIONAL. @@ -1101,12 +1054,14 @@ def sort_values(self, return_indexer=False, ascending=True, key=None): """ if key is not None: raise NotImplementedError("key parameter is not yet implemented.") + if na_position not in {"first", "last"}: + raise ValueError(f"invalid na_position: {na_position}") - indices = self._values.argsort(ascending=ascending) - index_sorted = cudf.Index(self.take(indices), name=self.name) + indices = self.argsort(ascending=ascending, na_position=na_position) + index_sorted = self.take(indices) if return_indexer: - return index_sorted, cupy.asarray(indices) + return index_sorted, indices else: return index_sorted diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index db81a86e670..45c8550c165 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2553,43 +2553,11 @@ class max_speed if not inplace: return result - def take(self, positions, axis=0, keep_index=True): - """ - Return a new DataFrame containing the rows specified by *positions* - - Parameters - ---------- - positions : array-like - Integer or boolean array-like specifying the rows of the output. - If integer, each element represents the integer index of a row. - If boolean, *positions* must be of the same length as *self*, - and represents a boolean mask. - - Returns - ------- - out : DataFrame - New DataFrame - - Examples - -------- - >>> a = cudf.DataFrame({'a': [1.0, 2.0, 3.0], - ... 'b': cudf.Series(['a', 'b', 'c'])}) - >>> a.take([0, 2, 2]) - a b - 0 1.0 a - 2 3.0 c - 2 3.0 c - >>> a.take([True, False, True]) - a b - 0 1.0 a - 2 3.0 c - """ + def take(self, indices, axis=0, keep_index=None): + axis = self._get_axis_from_axis_arg(axis) if axis != 0: raise NotImplementedError("Only axis=0 is supported.") - positions = as_column(positions) - if is_bool_dtype(positions): - return self._apply_boolean_mask(positions) - out = self._gather(positions, keep_index=keep_index) + out = super().take(indices, keep_index) out.columns = self.columns return out @@ -3242,127 +3210,6 @@ def _label_encoding( outdf.insert(len(outdf._data), newname, newcol) return outdf - @annotate("ARGSORT", color="yellow", domain="cudf_python") - def argsort(self, ascending=True, na_position="last"): - """ - Sort by the values. - - Parameters - ---------- - ascending : bool or list of bool, default True - If True, sort values in ascending order, otherwise descending. - na_position : {‘first’ or ‘last’}, default ‘last’ - Argument ‘first’ puts NaNs at the beginning, ‘last’ puts NaNs - at the end. - - Returns - ------- - out_column_inds : cuDF Column of indices sorted based on input - - Notes - ----- - Difference from pandas: - - - Support axis='index' only. - - Not supporting: inplace, kind - - Ascending can be a list of bools to control per column - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a':[10, 0, 2], 'b':[-10, 10, 1]}) - >>> df - a b - 0 10 -10 - 1 0 10 - 2 2 1 - >>> inds = df.argsort() - >>> inds - 0 1 - 1 2 - 2 0 - dtype: int32 - >>> df.take(inds) - a b - 1 0 10 - 2 2 1 - 0 10 -10 - """ - inds_col = self._get_sorted_inds( - ascending=ascending, na_position=na_position - ) - return cudf.Series(inds_col) - - def sort_values( - self, - by, - axis=0, - ascending=True, - inplace=False, - kind="quicksort", - na_position="last", - ignore_index=False, - ): - """ - Sort by the values row-wise. - - Parameters - ---------- - by : str or list of str - Name or list of names to sort by. - ascending : bool or list of bool, default True - Sort ascending vs. descending. Specify list for multiple sort - orders. If this is a list of bools, must match the length of the - by. - na_position : {‘first’, ‘last’}, default ‘last’ - 'first' puts nulls at the beginning, 'last' puts nulls at the end - ignore_index : bool, default False - If True, index will not be sorted. - - Returns - ------- - sorted_obj : cuDF DataFrame - - Notes - ----- - Difference from pandas: - * Support axis='index' only. - * Not supporting: inplace, kind - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame() - >>> df['a'] = [0, 1, 2] - >>> df['b'] = [-3, 2, 0] - >>> df.sort_values('b') - a b - 0 0 -3 - 2 2 0 - 1 1 2 - """ - if inplace: - raise NotImplementedError("`inplace` not currently implemented.") - if kind not in {"quicksort", "mergesort", "heapsort", "stable"}: - raise AttributeError( - f"{kind} is not a valid sorting algorithm for " - f"'DataFrame' object" - ) - elif kind != "quicksort": - msg = ( - f"GPU-accelerated {kind} is currently not supported, " - f"now defaulting to GPU-accelerated quicksort." - ) - warnings.warn(msg) - if axis != 0: - raise NotImplementedError("`axis` not currently implemented.") - - # argsort the `by` column - return self.take( - self[by].argsort(ascending=ascending, na_position=na_position), - keep_index=not ignore_index, - ) - def agg(self, aggs, axis=None): """ Aggregate using one or more operations over the specified axis. @@ -3555,7 +3402,7 @@ def nlargest(self, n, columns, keep="first"): Italy 59000000 1937894 IT Brunei 434000 12128 BN """ - return self._n_largest_or_smallest("nlargest", n, columns, keep) + return self._n_largest_or_smallest(True, n, columns, keep) def nsmallest(self, n, columns, keep="first"): """Get the rows of the DataFrame sorted by the n smallest value of *columns* @@ -3623,26 +3470,7 @@ def nsmallest(self, n, columns, keep="first"): Tuvalu 11300 38 TV Nauru 337000 182 NR """ - return self._n_largest_or_smallest("nsmallest", n, columns, keep) - - def _n_largest_or_smallest(self, method, n, columns, keep): - # Get column to operate on - if not isinstance(columns, str): - [column] = columns - else: - column = columns - - col = self[column].reset_index(drop=True) - # Operate - sorted_series = getattr(col, method)(n=n, keep=keep) - df = DataFrame() - new_positions = sorted_series.index.gpu_values - for k in self._data.names: - if k == column: - df[k] = sorted_series - else: - df[k] = self[k].reset_index(drop=True).take(new_positions) - return df.set_index(self.index.take(new_positions)) + return self._n_largest_or_smallest(False, n, columns, keep) def transpose(self): """Transpose index and columns. diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 0b895460410..666144d6819 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -30,6 +30,7 @@ from cudf._typing import ColumnLike, DataFrameOrSeries, Dtype from cudf.api.types import ( _is_non_decimal_numeric_dtype, + is_bool_dtype, is_decimal_dtype, is_dict_like, is_integer_dtype, @@ -529,6 +530,7 @@ def _gather(self, gather_map, keep_index=True, nullify=False): ) result._copy_type_metadata(self, include_index=keep_index) + result._data.names = self._data.names if keep_index and self._index is not None: result._index.names = self._index.names return result @@ -2878,6 +2880,9 @@ def searchsorted( """ # Call libcudf++ search_sorted primitive + if na_position not in {"first", "last"}: + raise ValueError(f"invalid na_position: {na_position}") + scalar_flag = None if is_scalar(values): scalar_flag = True @@ -2899,34 +2904,101 @@ def searchsorted( else: return result - def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): - """ - Sort by the values. + @annotate("ARGSORT", color="yellow", domain="cudf_python") + def argsort( + self, + by=None, + axis=0, + kind="quicksort", + order=None, + ascending=True, + na_position="last", + ): + """Return the integer indices that would sort the Series values. Parameters ---------- - by: list, optional - Labels specifying columns to sort by. By default, - sort by all columns of `self` + by : str or list of str, default None + Name or list of names to sort by. If None, sort by all columns. + axis : {0 or "index"} + Has no effect but is accepted for compatibility with numpy. + kind : {'mergesort', 'quicksort', 'heapsort', 'stable'}, default 'quicksort' + Choice of sorting algorithm. See :func:`numpy.sort` for more + information. 'mergesort' and 'stable' are the only stable + algorithms. Only quicksort is supported in cuDF. + order : None + Has no effect but is accepted for compatibility with numpy. ascending : bool or list of bool, default True If True, sort values in ascending order, otherwise descending. na_position : {‘first’ or ‘last’}, default ‘last’ Argument ‘first’ puts NaNs at the beginning, ‘last’ puts NaNs at the end. + Returns ------- - out_column_inds : cuDF Column of indices sorted based on input + cupy.ndarray: The indices sorted based on input. - Difference from pandas: - * Support axis='index' only. - * Not supporting: inplace, kind - * Ascending can be a list of bools to control per column - """ + Examples + -------- + **Series** + + >>> import cudf + >>> s = cudf.Series([3, 1, 2]) + >>> s + 0 3 + 1 1 + 2 2 + dtype: int64 + >>> s.argsort() + 0 1 + 1 2 + 2 0 + dtype: int32 + >>> s[s.argsort()] + 1 1 + 2 2 + 0 3 + dtype: int64 + + **DataFrame** + >>> import cudf + >>> df = cudf.DataFrame({'foo': [3, 1, 2]}) + >>> df.argsort() + array([1, 2, 0], dtype=int32) + + **Index** + >>> import cudf + >>> idx = cudf.Index([3, 1, 2]) + >>> idx.argsort() + array([1, 2, 0], dtype=int32) + """ # noqa: E501 + if na_position not in {"first", "last"}: + raise ValueError(f"invalid na_position: {na_position}") + if kind != "quicksort": + if kind not in {"mergesort", "heapsort", "stable"}: + raise AttributeError( + f"{kind} is not a valid sorting algorithm for " + f"'DataFrame' object" + ) + warnings.warn( + f"GPU-accelerated {kind} is currently not supported, " + "defaulting to quicksort." + ) + + if isinstance(by, str): + by = [by] + return self._get_sorted_inds( + by=by, ascending=ascending, na_position=na_position + ).values + + def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): + # Get an int64 column consisting of the indices required to sort self + # according to the columns specified in by. to_sort = ( self if by is None - else self._get_columns_by_label(by, downcast=False) + else self._get_columns_by_label(list(by), downcast=False) ) # If given a scalar need to construct a sequence of length # of columns @@ -2935,6 +3007,74 @@ def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): return libcudf.sort.order_by(to_sort, ascending, na_position) + def take(self, indices, keep_index=None): + """Return a new object containing the rows specified by *positions* + + Parameters + ---------- + indices : array-like + Array of ints indicating which positions to take. + keep_index : bool, default True + Whether to retain the index in result or not. + + Returns + ------- + out : Series or DataFrame or Index + New object with desired subset of rows. + + Examples + -------- + **Series** + >>> s = cudf.Series(['a', 'b', 'c', 'd', 'e']) + >>> s.take([2, 0, 4, 3]) + 2 c + 0 a + 4 e + 3 d + dtype: object + + **DataFrame** + + >>> a = cudf.DataFrame({'a': [1.0, 2.0, 3.0], + ... 'b': cudf.Series(['a', 'b', 'c'])}) + >>> a.take([0, 2, 2]) + a b + 0 1.0 a + 2 3.0 c + 2 3.0 c + >>> a.take([True, False, True]) + a b + 0 1.0 a + 2 3.0 c + + **Index** + + >>> idx = cudf.Index(['a', 'b', 'c', 'd', 'e']) + >>> idx.take([2, 0, 4, 3]) + StringIndex(['c' 'a' 'e' 'd'], dtype='object') + """ + # TODO: When we remove keep_index we should introduce the axis + # parameter. We could also introduce is_copy, but that's already + # deprecated in pandas so it's probably unnecessary. We also need to + # introduce Index.take's allow_fill and fill_value parameters. + if keep_index is not None: + warnings.warn( + "keep_index is deprecated and will be removed in the future.", + FutureWarning, + ) + else: + keep_index = True + + indices = as_column(indices) + if is_bool_dtype(indices): + warnings.warn( + "Calling take with a boolean array is deprecated and will be " + "removed in the future.", + FutureWarning, + ) + return self._apply_boolean_mask(indices) + return self._gather(indices, keep_index=keep_index) + def sin(self): """ Get Trigonometric sine, element-wise. diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index c003454fb59..de463269743 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1162,6 +1162,44 @@ def is_categorical(self): def is_interval(self): return False + def argsort( + self, + axis=0, + kind="quicksort", + order=None, + ascending=True, + na_position="last", + ): + """Return the integer indices that would sort the Series values. + + Parameters + ---------- + axis : {0 or "index"} + Has no effect but is accepted for compatibility with numpy. + kind : {'mergesort', 'quicksort', 'heapsort', 'stable'}, default 'quicksort' + Choice of sorting algorithm. See :func:`numpy.sort` for more + information. 'mergesort' and 'stable' are the only stable + algorithms. Only quicksort is supported in cuDF. + order : None + Has no effect but is accepted for compatibility with numpy. + ascending : bool or list of bool, default True + If True, sort values in ascending order, otherwise descending. + na_position : {‘first’ or ‘last’}, default ‘last’ + Argument ‘first’ puts NaNs at the beginning, ‘last’ puts NaNs + at the end. + + Returns + ------- + cupy.ndarray: The indices sorted based on input. + """ # noqa: E501 + return super().argsort( + axis=axis, + kind=kind, + order=order, + ascending=ascending, + na_position=na_position, + ) + class NumericIndex(GenericIndex): """Immutable, ordered and sliceable sequence of labels. @@ -2371,9 +2409,6 @@ def to_pandas(self): self.to_numpy(na_value=None), name=self.name, dtype="object" ) - def take(self, indices): - return self._values[indices] - def __repr__(self): return ( f"{self.__class__.__name__}({self._values.to_array()}," diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index e7066e336ab..68088cb275e 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3,6 +3,7 @@ from __future__ import annotations +import warnings from typing import Type, TypeVar import cupy as cp @@ -377,6 +378,9 @@ def sort_index( if key is not None: raise NotImplementedError("key is not yet supported.") + if na_position not in {"first", "last"}: + raise ValueError(f"invalid na_position: {na_position}") + if axis in (0, "index"): idx = self.index if isinstance(idx, MultiIndex): @@ -414,3 +418,111 @@ def sort_index( if ignore_index is True: out = out.reset_index(drop=True) return self._mimic_inplace(out, inplace=inplace) + + def sort_values( + self, + by, + axis=0, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + ignore_index=False, + ): + """Sort by the values along either axis. + + Parameters + ---------- + by : str or list of str + Name or list of names to sort by. + ascending : bool or list of bool, default True + Sort ascending vs. descending. Specify list for multiple sort + orders. If this is a list of bools, must match the length of the + by. + na_position : {‘first’, ‘last’}, default ‘last’ + 'first' puts nulls at the beginning, 'last' puts nulls at the end + ignore_index : bool, default False + If True, index will not be sorted. + + Returns + ------- + Frame : Frame with sorted values. + + Notes + ----- + Difference from pandas: + * Support axis='index' only. + * Not supporting: inplace, kind + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame() + >>> df['a'] = [0, 1, 2] + >>> df['b'] = [-3, 2, 0] + >>> df.sort_values('b') + a b + 0 0 -3 + 2 2 0 + 1 1 2 + """ + if na_position not in {"first", "last"}: + raise ValueError(f"invalid na_position: {na_position}") + if inplace: + raise NotImplementedError("`inplace` not currently implemented.") + if kind != "quicksort": + if kind not in {"mergesort", "heapsort", "stable"}: + raise AttributeError( + f"{kind} is not a valid sorting algorithm for " + f"'DataFrame' object" + ) + warnings.warn( + f"GPU-accelerated {kind} is currently not supported, " + f"defaulting to quicksort." + ) + if axis != 0: + raise NotImplementedError("`axis` not currently implemented.") + + if len(self) == 0: + return self + + # argsort the `by` column + return self.take( + self._get_columns_by_label(by)._get_sorted_inds( + ascending=ascending, na_position=na_position + ), + keep_index=not ignore_index, + ) + + def _n_largest_or_smallest(self, largest, n, columns, keep): + # Get column to operate on + if isinstance(columns, str): + columns = [columns] + + if len(self) == 0: + return self + + if keep == "first": + if n < 0: + n = 0 + + # argsort the `by` column + return self.take( + self._get_columns_by_label(columns)._get_sorted_inds( + ascending=not largest + )[:n], + keep_index=True, + ) + elif keep == "last": + indices = self._get_columns_by_label(columns)._get_sorted_inds( + ascending=largest + ) + + if n <= 0: + # Empty slice. + indices = indices[0:0] + else: + indices = indices[: -n - 1 : -1] + return self.take(indices, keep_index=True) + else: + raise ValueError('keep must be either "first", "last"') diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 27edd41ed92..7c132e3fb71 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -835,22 +835,11 @@ def size(self): return self._num_rows def take(self, indices): - if isinstance(indices, (Integral, Sequence)): - indices = np.array(indices) - elif isinstance(indices, cudf.Series) and indices.has_nulls: + if isinstance(indices, cudf.Series) and indices.has_nulls: raise ValueError("Column must have no nulls.") - elif isinstance(indices, slice): - start, stop, step = indices.indices(len(self)) - indices = column.arange(start, stop, step) - result = MultiIndex.from_frame( - self.to_frame(index=False).take(indices) - ) - if self._codes is not None: - result._codes = self._codes.take(indices) - if self._levels is not None: - result._levels = self._levels - result.names = self.names - return result + obj = super().take(indices) + obj.names = self.names + return obj def serialize(self): header, frames = super().serialize() @@ -887,11 +876,26 @@ def deserialize(cls, header, frames): return obj._set_names(column_names) def __getitem__(self, index): - if isinstance(index, int): - # we are indexing into a single row of the MultiIndex, - # return that row as a tuple: - return self.take(index).to_pandas()[0] - return self.take(index) + flatten = isinstance(index, int) + + if isinstance(index, (Integral, Sequence)): + index = np.array(index) + elif isinstance(index, slice): + start, stop, step = index.indices(len(self)) + index = column.arange(start, stop, step) + result = MultiIndex.from_frame(self.to_frame(index=False).take(index)) + + # we are indexing into a single row of the MultiIndex, + # return that row as a tuple: + if flatten: + return result.to_pandas()[0] + + if self._codes is not None: + result._codes = self._codes.take(index) + if self._levels is not None: + result._levels = self._levels + result.names = self.names + return result def to_frame(self, index=True, name=None): # TODO: Currently this function makes a shallow copy, which is @@ -1364,23 +1368,6 @@ def is_monotonic_decreasing(self): ascending=[False] * len(self.levels), null_position=None ) - def argsort(self, ascending=True, **kwargs): - return self._get_sorted_inds(ascending=ascending, **kwargs).values - - def sort_values(self, return_indexer=False, ascending=True, key=None): - if key is not None: - raise NotImplementedError("key parameter is not yet implemented.") - - indices = cudf.Series._from_data( - {None: self._get_sorted_inds(ascending=ascending)} - ) - index_sorted = as_index(self.take(indices), name=self.names) - - if return_indexer: - return index_sorted, cupy.asarray(indices) - else: - return index_sorted - def fillna(self, value): """ Fill null values with the specified value. diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index f6775dac820..739f4c2e79b 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -111,8 +111,7 @@ def __getitem__(self, arg): ): return data return self._frame._from_data( - {self._frame.name: data}, - index=cudf.Index(self._frame.index.take(arg)), + {self._frame.name: data}, index=cudf.Index(self._frame.index[arg]), ) def __setitem__(self, key, value): @@ -1171,51 +1170,9 @@ def __setitem__(self, key, value): self.loc[key] = value def take(self, indices, axis=0, keep_index=True): - """ - Return Series by taking values from the corresponding *indices*. - - Parameters - ---------- - indices : array-like or scalar - An array/scalar like integers indicating which positions to take. - keep_index : bool, default True - Whethere to retain the index in result Series or not. - - Returns - ------- - Series - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([10, 11, 12, 13, 14]) - >>> series - 0 10 - 1 11 - 2 12 - 3 13 - 4 14 - dtype: int64 - >>> series.take([0, 4]) - 0 10 - 4 14 - dtype: int64 - - If you want to drop the index, pass `keep_index=False` - - >>> series.take([0, 4], keep_index=False) - 0 10 - 1 14 - dtype: int64 - """ - axis = self._get_axis_from_axis_arg(axis) - if keep_index is True or is_scalar(indices): - return self.iloc[indices] - else: - col_inds = as_column(indices) - return self._from_data( - {self.name: self._column.take(col_inds, keep_index=False)} - ) + # Validate but don't use the axis. + _ = self._get_axis_from_axis_arg(axis) + return super().take(indices, keep_index) def __repr__(self): _, height = get_terminal_size() @@ -2262,37 +2219,6 @@ def astype(self, dtype, copy=False, errors="raise"): pass return self - def argsort(self, ascending=True, na_position="last"): - """Returns a Series of int64 index that will sort the series. - - Uses Thrust sort. - - Returns - ------- - result: Series - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([3, 1, 2]) - >>> s - 0 3 - 1 1 - 2 2 - dtype: int64 - >>> s.argsort() - 0 1 - 1 2 - 2 0 - dtype: int32 - >>> s[s.argsort()] - 1 1 - 2 2 - 0 3 - dtype: int64 - """ - return self._sort(ascending=ascending, na_position=na_position)[1] - def sort_index(self, axis=0, *args, **kwargs): if axis not in (0, "index"): raise ValueError("Only axis=0 is valid for Series.") @@ -2307,28 +2233,28 @@ def sort_values( na_position="last", ignore_index=False, ): - """ - Sort by the values. - - Sort a Series in ascending or descending order by some criterion. + """Sort by the values along either axis. Parameters ---------- - ascending : bool, default True - If True, sort values in ascending order, otherwise descending. + ascending : bool or list of bool, default True + Sort ascending vs. descending. Specify list for multiple sort + orders. If this is a list of bools, must match the length of the + by. na_position : {‘first’, ‘last’}, default ‘last’ - 'first' puts nulls at the beginning, 'last' puts nulls at the end. + 'first' puts nulls at the beginning, 'last' puts nulls at the end ignore_index : bool, default False If True, index will not be sorted. Returns ------- - sorted_obj : cuDF Series + Series : Series with sorted values. Notes ----- Difference from pandas: - * Not supporting: `inplace`, `kind` + * Support axis='index' only. + * Not supporting: inplace, kind Examples -------- @@ -2342,38 +2268,15 @@ def sort_values( 1 5 dtype: int64 """ - - if inplace: - raise NotImplementedError("`inplace` not currently implemented.") - if kind != "quicksort": - raise NotImplementedError("`kind` not currently implemented.") - if axis != 0: - raise NotImplementedError("`axis` not currently implemented.") - - if len(self) == 0: - return self - vals, inds = self._sort(ascending=ascending, na_position=na_position) - if not ignore_index: - index = self.index.take(inds) - else: - index = self.index - return vals.set_index(index) - - def _n_largest_or_smallest(self, largest, n, keep): - direction = largest - if keep == "first": - if n < 0: - n = 0 - return self.sort_values(ascending=not direction).head(n) - elif keep == "last": - data = self.sort_values(ascending=direction) - if n <= 0: - data = data[-n:-n] - else: - data = data.tail(n) - return data.reverse() - else: - raise ValueError('keep must be either "first", "last"') + return super().sort_values( + by=self.name, + axis=axis, + ascending=ascending, + inplace=inplace, + kind=kind, + na_position=na_position, + ignore_index=ignore_index, + ) def nlargest(self, n=5, keep="first"): """Returns a new Series of the *n* largest element. @@ -2435,7 +2338,7 @@ def nlargest(self, n=5, keep="first"): Brunei 434000 dtype: int64 """ - return self._n_largest_or_smallest(n=n, keep=keep, largest=True) + return self._n_largest_or_smallest(True, n, [self.name], keep) def nsmallest(self, n=5, keep="first"): """ @@ -2510,22 +2413,29 @@ def nsmallest(self, n=5, keep="first"): Tuvalu 11300 dtype: int64 """ - return self._n_largest_or_smallest(n=n, keep=keep, largest=False) + return self._n_largest_or_smallest(False, n, [self.name], keep) - def _sort(self, ascending=True, na_position="last"): - """ - Sort by values - - Returns - ------- - 2-tuple of key and index - """ - col_keys, col_inds = self._column.sort_by_values( - ascending=ascending, na_position=na_position + def argsort( + self, + axis=0, + kind="quicksort", + order=None, + ascending=True, + na_position="last", + ): + obj = self.__class__._from_data( + { + None: super().argsort( + axis=axis, + kind=kind, + order=order, + ascending=ascending, + na_position=na_position, + ) + } ) - sr_keys = self._from_data({self.name: col_keys}, self._index) - sr_inds = self._from_data({self.name: col_inds}, self._index) - return sr_keys, sr_inds + obj.name = self.name + return obj def replace(self, to_replace=None, value=None, *args, **kwargs): if is_dict_like(to_replace) and value is not None: diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index c1eade0fcdc..ae331a1d5ce 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8732,12 +8732,12 @@ def test_explode(data, labels, ignore_index, p_index, label_to_explode): ( cudf.DataFrame({"a": [10, 0, 2], "b": [-10, 10, 1]}), True, - cudf.Series([1, 2, 0], dtype="int32"), + cupy.array([1, 2, 0], dtype="int32"), ), ( cudf.DataFrame({"a": [10, 0, 2], "b": [-10, 10, 1]}), False, - cudf.Series([0, 2, 1], dtype="int32"), + cupy.array([0, 2, 1], dtype="int32"), ), ], ) diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py index 0f8b46cee35..28e63ec41f1 100644 --- a/python/cudf/cudf/tests/test_pickling.py +++ b/python/cudf/cudf/tests/test_pickling.py @@ -6,7 +6,7 @@ import pandas as pd import pytest -from cudf import DataFrame, GenericIndex, Series +from cudf import DataFrame, GenericIndex, RangeIndex, Series from cudf.core.buffer import Buffer from cudf.testing._utils import assert_eq @@ -28,7 +28,7 @@ def check_serialization(df): assert_frame_picklable(df[2:-2]) # sorted sortvaldf = df.sort_values("vals") - assert isinstance(sortvaldf.index, GenericIndex) + assert isinstance(sortvaldf.index, (GenericIndex, RangeIndex)) assert_frame_picklable(sortvaldf) # out-of-band if pickle.HIGHEST_PROTOCOL >= 5: diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index 53676a47046..00cd31e7539 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -154,33 +154,16 @@ def test_series_nsmallest(data, n): @pytest.mark.parametrize("nelem,n", [(1, 1), (100, 100), (10, 5), (100, 10)]) -def test_dataframe_nlargest(nelem, n): +@pytest.mark.parametrize("op", ["nsmallest", "nlargest"]) +@pytest.mark.parametrize("columns", ["a", ["b", "a"]]) +def test_dataframe_nlargest_nsmallest(nelem, n, op, columns): np.random.seed(0) - df = DataFrame() - df["a"] = aa = np.random.random(nelem) - df["b"] = bb = np.random.random(nelem) - res = df.nlargest(n, "a") - - # Check - inds = np.argsort(aa) - assert_eq(res["a"].to_numpy(), aa[inds][-n:][::-1]) - assert_eq(res["b"].to_numpy(), bb[inds][-n:][::-1]) - assert_eq(res.index.values, inds[-n:][::-1]) - + aa = np.random.random(nelem) + bb = np.random.random(nelem) -@pytest.mark.parametrize("nelem,n", [(10, 5), (100, 10)]) -def test_dataframe_nsmallest(nelem, n): - np.random.seed(0) - df = DataFrame() - df["a"] = aa = np.random.random(nelem) - df["b"] = bb = np.random.random(nelem) - res = df.nsmallest(n, "a") - - # Check - inds = np.argsort(-aa) - assert_eq(res["a"].to_numpy(), aa[inds][-n:][::-1]) - assert_eq(res["b"].to_numpy(), bb[inds][-n:][::-1]) - assert_eq(res.index.values, inds[-n:][::-1]) + df = DataFrame({"a": aa, "b": bb}) + pdf = df.to_pandas() + assert_eq(getattr(df, op)(n, columns), getattr(pdf, op)(n, columns)) @pytest.mark.parametrize(