Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add lists.sort_values API #7657

Merged
merged 11 commits into from
Mar 24, 2021
9 changes: 8 additions & 1 deletion docs/cudf/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,13 @@ Series
:inherited-members:
:exclude-members: serialize, deserialize, logical_not, logical_or, logical_and, remainder, sum_of_squares, fill, merge, iteritems, items, device_deserialize, device_serialize, host_deserialize, host_serialize, to_dict, tolist, to_list

Lists
-----
.. currentmodule:: cudf.core.column.lists

.. autoclass:: ListMethods
:members:

Strings
-------
.. currentmodule:: cudf.core.column.string
Expand Down Expand Up @@ -253,4 +260,4 @@ GpuArrowReader
.. currentmodule:: cudf.comm.gpuarrow
.. autoclass:: GpuArrowReader
:members:
:exclude-members: count, index
:exclude-members: count, index
15 changes: 15 additions & 0 deletions python/cudf/cudf/_lib/cpp/lists/sorting.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr

from cudf._lib.cpp.types cimport order, null_order
from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view


cdef extern from "cudf/lists/sorting.hpp" namespace "cudf::lists" nogil:
cdef unique_ptr[column] sort_lists(
const lists_column_view source_column,
order column_order,
null_order null_precedence
) except +
30 changes: 27 additions & 3 deletions python/cudf/cudf/_lib/lists.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -10,23 +10,28 @@ from cudf._lib.cpp.lists.count_elements cimport (
from cudf._lib.cpp.lists.explode cimport (
explode_outer as cpp_explode_outer
)
from cudf._lib.cpp.lists.sorting cimport (
sort_lists as cpp_sort_lists
)
from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.column.column cimport column

from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.types cimport size_type
from cudf._lib.cpp.types cimport size_type, order, null_order

from cudf._lib.column cimport Column
from cudf._lib.table cimport Table

from cudf._lib.types cimport (
underlying_type_t_null_order, underlying_type_t_order
)
from cudf._lib.types import Order, NullOrder
from cudf.core.dtypes import ListDtype


def count_elements(Column col):
if not isinstance(col.dtype, ListDtype):
raise TypeError("col is not a list column.")

# shared_ptr required because lists_column_view has no default
# ctor
Expand Down Expand Up @@ -58,3 +63,22 @@ def explode_outer(Table tbl, int explode_column_idx, bool ignore_index=False):
column_names=tbl._column_names,
index_names=None if ignore_index else tbl._index_names
)


def sort_lists(Column col, object order_enum, object null_order_enum):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a way to more tightly type the enums here? I thought we had a pattern fo handling enum values like this elsewhere but I could be mistaken.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately no. I think typically we just pass a Python object to Cython, where we then plumb it through to the appropriate C++ type. See for example https://github.com/rapidsai/cudf/blob/branch-0.19/python/cudf/cudf/_lib/sort.pyx#L26.

I'd suggest we do something similar here, where the Cython API shouldn't expect a Python enum, but rather something like a string or a bool. That encapsulates things a little better, making the use of an enum an implementation detail.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Apologies, @isVoid I got this backwards and you had it right from the beginning

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this needs to be addressed before merging

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it. Sorry misunderstood on this one!

cdef shared_ptr[lists_column_view] list_view = (
make_shared[lists_column_view](col.view())
)
cdef order c_sort_order = <order><underlying_type_t_order>order_enum.value
cdef null_order c_null_prec = (
<null_order><underlying_type_t_null_order>null_order_enum.value
)

cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_sort_lists(list_view.get()[0], c_sort_order, c_null_prec)
)

return Column.from_unique_ptr(move(c_result))
62 changes: 61 additions & 1 deletion python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@

import cudf
from cudf._lib.copying import segmented_gather
from cudf._lib.lists import count_elements
from cudf._lib.lists import count_elements, sort_lists
from cudf._lib.types import NullOrder, Order
from cudf.core.buffer import Buffer
from cudf.core.column import ColumnBase, as_column, column
from cudf.core.column.methods import ColumnMethodsMixin
Expand Down Expand Up @@ -285,3 +286,62 @@ def take(self, lists_indices):
raise
else:
return res

def sort_values(
self,
ascending=True,
inplace=False,
kind="quicksort",
na_position="last",
ignore_index=False,
):
"""
Sort each list by the values.

Sort the lists in ascending or descending order by some criterion.

Parameters
----------
ascending : bool, default True
If True, sort values in ascending order, otherwise descending.
na_position : {'first', 'last'}, default 'last'
'first' puts nulls at the beginning, 'last' puts nulls at the end.
ignore_index : bool, default False
If True, the resulting axis will be labeled 0, 1, ..., n - 1.

Returns
-------
ListColumn with each list sorted

Notes
-----
Difference from pandas:
* Not supporting: `inplace`, `kind`
isVoid marked this conversation as resolved.
Show resolved Hide resolved

Examples
--------
>>> s = cudf.Series([[4, 2, None, 9], [8, 8, 2], [2, 1]])
isVoid marked this conversation as resolved.
Show resolved Hide resolved
>>> s.list.sort_values(ascending=True, na_position="last")
0 [2.0, 4.0, 9.0, nan]
1 [2.0, 8.0, 8.0]
2 [1.0, 2.0]
dtype: list
"""
if inplace:
raise NotImplementedError("`inplace` not currently implemented.")
if kind != "quicksort":
raise NotImplementedError("`kind` not currently implemented.")
if na_position not in {"first", "last"}:
raise ValueError(f"Unknown `na_position` value {na_position}")
if is_list_dtype(self._column.children[1].dtype):
raise NotImplementedError("Nested lists sort is not supported.")

sort_order = Order.ASCENDING if ascending else Order.DESCENDING
null_order = (
NullOrder.BEFORE if na_position == "first" else NullOrder.AFTER
)

return self._return_or_inplace(
sort_lists(self._column, sort_order, null_order),
retain_index=not ignore_index,
)
1 change: 1 addition & 0 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3571,6 +3571,7 @@ def sort_values(
4 3
3 4
1 5
dtype: int64
"""

if inplace:
Expand Down
52 changes: 52 additions & 0 deletions python/cudf/cudf/tests/test_list.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
import functools

import pandas as pd
import pyarrow as pa
Expand Down Expand Up @@ -159,3 +160,54 @@ def test_take_invalid(invalid, exception):
gs = cudf.Series([[0, 1], [2, 3]])
with exception:
gs.list.take(invalid)


def key_func_builder(x, na_position):
if x is None:
if na_position == "first":
return -1e8
else:
return 1e8
else:
return x


@pytest.mark.parametrize(
"data",
[
[[4, 2, None, 9], [8, 8, 2], [2, 1]],
[[4, 2, None, 9], [8, 8, 2], None],
[[4, 2, None, 9], [], None],
],
)
@pytest.mark.parametrize(
"index",
[
None,
pd.Index(["a", "b", "c"]),
pd.MultiIndex.from_tuples(
[(0, "a"), (0, "b"), (1, "a")], names=["l0", "l1"]
),
],
)
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("na_position", ["first", "last"])
@pytest.mark.parametrize("ignore_index", [True, False])
def test_sort_values(data, index, ascending, na_position, ignore_index):
key_func = functools.partial(key_func_builder, na_position=na_position)

ps = pd.Series(data, index=index)
gs = cudf.from_pandas(ps)

expected = ps.apply(
lambda x: sorted(x, key=key_func, reverse=not ascending)
if x is not None
else None
)
if ignore_index:
expected.reset_index(drop=True, inplace=True)
got = gs.list.sort_values(
ascending=ascending, na_position=na_position, ignore_index=ignore_index
)

assert_eq(expected, got)