Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor sorting APIs #9464

Merged
merged 21 commits into from
Nov 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 20 additions & 65 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
from __future__ import annotations, division, print_function

import pickle
import warnings
from typing import Any, Set

import cupy
import pandas as pd

import cudf
Expand Down Expand Up @@ -499,66 +499,6 @@ def fillna(self, value, downcast=None):

return super().fillna(value=value)

def take(self, indices):
    """Gather the entries of this Index selected by ``indices``.

    Parameters
    ----------
    indices : array-like
        Indexer that maps to values contained in this Index.

    Returns
    -------
    The result of indexing this Index with ``indices``.
    """
    # Gathering is delegated entirely to ``__getitem__``.
    gathered = self[indices]
    return gathered

def argsort(self, ascending=True, **kwargs):
    """
    Compute the integer positions that would sort this index.

    Parameters
    ----------
    ascending : bool, default True
        If True, returns the indices for ascending order.
        If False, returns the indices for descending order.
    **kwargs
        Forwarded unchanged to the ``argsort`` method of the
        underlying values.

    Returns
    -------
    array : A cupy array containing Integer indices that
        would sort the index if used as an indexer.

    Examples
    --------
    >>> import cudf
    >>> index = cudf.Index([10, 100, 1, 1000])
    >>> index
    Int64Index([10, 100, 1, 1000], dtype='int64')
    >>> index.argsort()
    array([2, 0, 1, 3], dtype=int32)

    The order of argsort can be reversed using
    ``ascending`` parameter, by setting it to ``False``.

    >>> index.argsort(ascending=False)
    array([3, 1, 0, 2], dtype=int32)

    ``argsort`` on a MultiIndex:

    >>> index = cudf.MultiIndex(
    ...     levels=[[1, 3, 4, -10], [1, 11, 5]],
    ...     codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
    ...     names=["x", "y"],
    ... )
    >>> index
    MultiIndex([(  1,  1),
                (  1,  5),
                (  3, 11),
                (  4, 11),
                (-10,  1)],
               names=['x', 'y'])
    >>> index.argsort()
    array([4, 0, 1, 2, 3], dtype=int32)
    >>> index.argsort(ascending=False)
    array([3, 2, 1, 0, 4], dtype=int32)
    """
    # The underlying values compute the ordering; the result is then
    # converted with ``cupy.asarray`` before being handed back.
    return cupy.asarray(
        self._values.argsort(ascending=ascending, **kwargs)
    )

def to_frame(self, index=True, name=None):
"""Create a DataFrame with a column containing this Index

Expand Down Expand Up @@ -621,6 +561,10 @@ def gpu_values(self):
"""
View the data as a numba device array object
"""
warnings.warn(
"The gpu_values property is deprecated and will be removed.",
FutureWarning,
)
return self._values.data_array_view

def append(self, other):
Expand Down Expand Up @@ -1025,7 +969,13 @@ def _intersection(self, other, sort=None):
return intersection_result.sort_values()
return intersection_result

def sort_values(self, return_indexer=False, ascending=True, key=None):
def sort_values(
self,
return_indexer=False,
ascending=True,
na_position="last",
key=None,
):
"""
Return a sorted copy of the index, and optionally return the indices
that sorted the index itself.
Expand All @@ -1036,6 +986,9 @@ def sort_values(self, return_indexer=False, ascending=True, key=None):
Should the indices that would sort the index be returned.
ascending : bool, default True
Should the index values be sorted in an ascending order.
na_position : {'first', 'last'}, default 'last'
vyasr marked this conversation as resolved.
Show resolved Hide resolved
Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at
the end.
key : None, optional
This parameter is NON-FUNCTIONAL.

Expand Down Expand Up @@ -1101,12 +1054,14 @@ def sort_values(self, return_indexer=False, ascending=True, key=None):
"""
if key is not None:
raise NotImplementedError("key parameter is not yet implemented.")
if na_position not in {"first", "last"}:
raise ValueError(f"invalid na_position: {na_position}")

indices = self._values.argsort(ascending=ascending)
index_sorted = cudf.Index(self.take(indices), name=self.name)
indices = self.argsort(ascending=ascending, na_position=na_position)
index_sorted = self.take(indices)

if return_indexer:
return index_sorted, cupy.asarray(indices)
return index_sorted, indices
else:
return index_sorted

Expand Down
182 changes: 5 additions & 177 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2553,43 +2553,11 @@ class max_speed
if not inplace:
return result

def take(self, positions, axis=0, keep_index=True):
"""
Return a new DataFrame containing the rows specified by *positions*

Parameters
----------
positions : array-like
Integer or boolean array-like specifying the rows of the output.
If integer, each element represents the integer index of a row.
If boolean, *positions* must be of the same length as *self*,
and represents a boolean mask.

Returns
-------
out : DataFrame
New DataFrame

Examples
--------
>>> a = cudf.DataFrame({'a': [1.0, 2.0, 3.0],
... 'b': cudf.Series(['a', 'b', 'c'])})
>>> a.take([0, 2, 2])
a b
0 1.0 a
2 3.0 c
2 3.0 c
>>> a.take([True, False, True])
a b
0 1.0 a
2 3.0 c
"""
def take(self, indices, axis=0, keep_index=None):
axis = self._get_axis_from_axis_arg(axis)
if axis != 0:
raise NotImplementedError("Only axis=0 is supported.")
positions = as_column(positions)
if is_bool_dtype(positions):
return self._apply_boolean_mask(positions)
out = self._gather(positions, keep_index=keep_index)
out = super().take(indices, keep_index)
out.columns = self.columns
return out

Expand Down Expand Up @@ -3242,127 +3210,6 @@ def _label_encoding(
outdf.insert(len(outdf._data), newname, newcol)
return outdf

@annotate("ARGSORT", color="yellow", domain="cudf_python")
vyasr marked this conversation as resolved.
Show resolved Hide resolved
def argsort(self, ascending=True, na_position="last"):
    """
    Return the row positions that would sort this DataFrame.

    Parameters
    ----------
    ascending : bool or list of bool, default True
        If True, sort values in ascending order, otherwise descending.
    na_position : {'first', 'last'}, default 'last'
        Argument 'first' puts NaNs at the beginning, 'last' puts NaNs
        at the end.

    Returns
    -------
    out : cudf.Series
        Indices sorted based on input, wrapped in a Series.

    Notes
    -----
    Difference from pandas:

    - Support axis='index' only.
    - Not supporting: inplace, kind
    - Ascending can be a list of bools to control per column

    Examples
    --------
    >>> import cudf
    >>> df = cudf.DataFrame({'a':[10, 0, 2], 'b':[-10, 10, 1]})
    >>> df
        a   b
    0  10 -10
    1   0  10
    2   2   1
    >>> inds = df.argsort()
    >>> inds
    0    1
    1    2
    2    0
    dtype: int32
    >>> df.take(inds)
        a   b
    1   0  10
    2   2   1
    0  10 -10
    """
    # ``_get_sorted_inds`` produces the ordering; it is wrapped in a
    # Series before being returned.
    return cudf.Series(
        self._get_sorted_inds(ascending=ascending, na_position=na_position)
    )

def sort_values(
    self,
    by,
    axis=0,
    ascending=True,
    inplace=False,
    kind="quicksort",
    na_position="last",
    ignore_index=False,
):
    """
    Return the rows of this DataFrame reordered by the values in ``by``.

    Parameters
    ----------
    by : str or list of str
        Name or list of names to sort by.
    axis : int, default 0
        Only ``0`` is accepted; any other value raises
        ``NotImplementedError``.
    ascending : bool or list of bool, default True
        Sort ascending vs. descending. Specify list for multiple sort
        orders. If this is a list of bools, must match the length of the
        by.
    inplace : bool, default False
        Must be falsy; a truthy value raises ``NotImplementedError``.
    kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}
        Accepted for API compatibility. Any recognised value other than
        ``'quicksort'`` only triggers a warning before sorting proceeds.
    na_position : {'first', 'last'}, default 'last'
        'first' puts nulls at the beginning, 'last' puts nulls at the end
    ignore_index : bool, default False
        If True, the rows are gathered with ``keep_index=False``.

    Returns
    -------
    sorted_obj : cuDF DataFrame

    Raises
    ------
    NotImplementedError
        If ``inplace`` is truthy or ``axis`` is not ``0``.
    AttributeError
        If ``kind`` is not one of the recognised algorithm names.

    Notes
    -----
    Difference from pandas:
      * Support axis='index' only.
      * Not supporting: inplace, kind

    Examples
    --------
    >>> import cudf
    >>> df = cudf.DataFrame()
    >>> df['a'] = [0, 1, 2]
    >>> df['b'] = [-3, 2, 0]
    >>> df.sort_values('b')
       a  b
    0  0 -3
    2  2  0
    1  1  2
    """
    # Argument validation happens in a fixed order: inplace, kind, axis.
    if inplace:
        raise NotImplementedError("`inplace` not currently implemented.")

    known_kinds = {"quicksort", "mergesort", "heapsort", "stable"}
    if kind not in known_kinds:
        raise AttributeError(
            f"{kind} is not a valid sorting algorithm for "
            f"'DataFrame' object"
        )
    if kind != "quicksort":
        # Recognised but unsupported algorithm: warn and carry on.
        warnings.warn(
            f"GPU-accelerated {kind} is currently not supported, "
            f"now defaulting to GPU-accelerated quicksort."
        )

    if axis != 0:
        raise NotImplementedError("`axis` not currently implemented.")

    # Compute the ordering from the `by` column(s), then gather the rows.
    row_order = self[by].argsort(
        ascending=ascending, na_position=na_position
    )
    return self.take(row_order, keep_index=not ignore_index)

def agg(self, aggs, axis=None):
"""
Aggregate using one or more operations over the specified axis.
Expand Down Expand Up @@ -3555,7 +3402,7 @@ def nlargest(self, n, columns, keep="first"):
Italy 59000000 1937894 IT
Brunei 434000 12128 BN
"""
return self._n_largest_or_smallest("nlargest", n, columns, keep)
return self._n_largest_or_smallest(True, n, columns, keep)

def nsmallest(self, n, columns, keep="first"):
"""Get the rows of the DataFrame sorted by the n smallest value of *columns*
Expand Down Expand Up @@ -3623,26 +3470,7 @@ def nsmallest(self, n, columns, keep="first"):
Tuvalu 11300 38 TV
Nauru 337000 182 NR
"""
return self._n_largest_or_smallest("nsmallest", n, columns, keep)

def _n_largest_or_smallest(self, method, n, columns, keep):
    """Shared implementation behind ``nlargest`` and ``nsmallest``.

    Parameters
    ----------
    method : str
        Name of the Series method to invoke on the selected column
        (the visible callers pass ``"nlargest"`` or ``"nsmallest"``).
    n : int
        Forwarded as ``n`` to that method.
    columns : str or single-element iterable
        The column to rank by. A non-string value is unpacked as exactly
        one name, so anything but a single element raises on unpacking.
    keep : str
        Forwarded as ``keep`` to that method.

    Returns
    -------
    DataFrame
        A new frame with the same column names, whose index is this
        frame's index gathered at the selected positions.
    """
    # Resolve the single column to operate on.
    if isinstance(columns, str):
        target = columns
    else:
        (target,) = columns

    # Rank on a positionally-indexed copy so the result's index holds
    # row positions rather than the original labels.
    positional = self[target].reset_index(drop=True)
    ranked = getattr(positional, method)(n=n, keep=keep)

    result = DataFrame()
    positions = ranked.index.gpu_values
    for name in self._data.names:
        if name == target:
            result[name] = ranked
        else:
            # Every other column is gathered at the same positions.
            result[name] = (
                self[name].reset_index(drop=True).take(positions)
            )
    return result.set_index(self.index.take(positions))
return self._n_largest_or_smallest(False, n, columns, keep)

def transpose(self):
"""Transpose index and columns.
Expand Down
Loading