Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add lists.sort_values API #7657

Merged
merged 11 commits into from
Mar 24, 2021
9 changes: 8 additions & 1 deletion docs/cudf/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,13 @@ Series
:inherited-members:
:exclude-members: serialize, deserialize, logical_not, logical_or, logical_and, remainder, sum_of_squares, fill, merge, iteritems, items, device_deserialize, device_serialize, host_deserialize, host_serialize, to_dict, tolist, to_list

Lists
-----
.. currentmodule:: cudf.core.column.lists

.. autoclass:: ListMethods
:members:

Strings
-------
.. currentmodule:: cudf.core.column.string
Expand Down Expand Up @@ -253,4 +260,4 @@ GpuArrowReader
.. currentmodule:: cudf.comm.gpuarrow
.. autoclass:: GpuArrowReader
:members:
:exclude-members: count, index
:exclude-members: count, index
15 changes: 15 additions & 0 deletions python/cudf/cudf/_lib/cpp/lists/sorting.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr

from cudf._lib.cpp.types cimport order, null_order
from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view


cdef extern from "cudf/lists/sorting.hpp" namespace "cudf::lists" nogil:
cdef unique_ptr[column] sort_lists(
const lists_column_view source_column,
order column_order,
null_order null_precedence
) except +
32 changes: 29 additions & 3 deletions python/cudf/cudf/_lib/lists.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -10,24 +10,29 @@ from cudf._lib.cpp.lists.count_elements cimport (
from cudf._lib.cpp.lists.explode cimport (
explode_outer as cpp_explode_outer
)
from cudf._lib.cpp.lists.sorting cimport (
sort_lists as cpp_sort_lists
)
from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.column.column cimport column

from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.types cimport size_type
from cudf._lib.cpp.types cimport size_type, order, null_order

from cudf._lib.column cimport Column
from cudf._lib.table cimport Table

from cudf._lib.types cimport (
underlying_type_t_null_order, underlying_type_t_order
)
from cudf.core.dtypes import ListDtype

from cudf._lib.cpp.lists.extract cimport extract_list_element


def count_elements(Column col):
if not isinstance(col.dtype, ListDtype):
raise TypeError("col is not a list column.")

# shared_ptr required because lists_column_view has no default
# ctor
Expand Down Expand Up @@ -61,6 +66,27 @@ def explode_outer(Table tbl, int explode_column_idx, bool ignore_index=False):
)


def sort_lists(Column col, bool ascending, str na_position):
cdef shared_ptr[lists_column_view] list_view = (
make_shared[lists_column_view](col.view())
)
cdef order c_sort_order = (
order.ASCENDING if ascending else order.DESCENDING
)
cdef null_order c_null_prec = (
null_order.BEFORE if na_position == "first" else null_order.AFTER
)

cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_sort_lists(list_view.get()[0], c_sort_order, c_null_prec)
)

return Column.from_unique_ptr(move(c_result))


def extract_element(Column col, size_type index):
# shared_ptr required because lists_column_view has no default
# ctor
Expand Down
56 changes: 55 additions & 1 deletion python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import cudf
from cudf._lib.copying import segmented_gather
from cudf._lib.lists import count_elements, extract_element
from cudf._lib.lists import count_elements, extract_element, sort_lists
from cudf.core.buffer import Buffer
from cudf.core.column import ColumnBase, as_column, column
from cudf.core.column.methods import ColumnMethodsMixin
Expand Down Expand Up @@ -317,3 +317,57 @@ def take(self, lists_indices):
raise
else:
return res

def sort_values(
self,
ascending=True,
inplace=False,
kind="quicksort",
na_position="last",
ignore_index=False,
):
"""
Sort each list by the values.

Sort the lists in ascending or descending order by some criterion.

Parameters
----------
ascending : bool, default True
If True, sort values in ascending order, otherwise descending.
na_position : {'first', 'last'}, default 'last'
'first' puts nulls at the beginning, 'last' puts nulls at the end.
ignore_index : bool, default False
If True, the resulting axis will be labeled 0, 1, ..., n - 1.

Returns
-------
ListColumn with each list sorted

Notes
-----
Difference from pandas:
* Not supporting: `inplace`, `kind`
isVoid marked this conversation as resolved.
Show resolved Hide resolved

Examples
--------
>>> s = cudf.Series([[4, 2, None, 9], [8, 8, 2], [2, 1]])
isVoid marked this conversation as resolved.
Show resolved Hide resolved
>>> s.list.sort_values(ascending=True, na_position="last")
0 [2.0, 4.0, 9.0, nan]
1 [2.0, 8.0, 8.0]
2 [1.0, 2.0]
dtype: list
"""
if inplace:
raise NotImplementedError("`inplace` not currently implemented.")
if kind != "quicksort":
raise NotImplementedError("`kind` not currently implemented.")
if na_position not in {"first", "last"}:
raise ValueError(f"Unknown `na_position` value {na_position}")
if is_list_dtype(self._column.children[1].dtype):
raise NotImplementedError("Nested lists sort is not supported.")

return self._return_or_inplace(
sort_lists(self._column, ascending, na_position),
retain_index=not ignore_index,
)
1 change: 1 addition & 0 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3571,6 +3571,7 @@ def sort_values(
4 3
3 4
1 5
dtype: int64
"""

if inplace:
Expand Down
52 changes: 52 additions & 0 deletions python/cudf/cudf/tests/test_list.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
import functools

import pandas as pd
import pyarrow as pa
Expand Down Expand Up @@ -161,6 +162,57 @@ def test_take_invalid(invalid, exception):
gs.list.take(invalid)


def key_func_builder(x, na_position):
if x is None:
if na_position == "first":
return -1e8
else:
return 1e8
else:
return x


@pytest.mark.parametrize(
"data",
[
[[4, 2, None, 9], [8, 8, 2], [2, 1]],
[[4, 2, None, 9], [8, 8, 2], None],
[[4, 2, None, 9], [], None],
],
)
@pytest.mark.parametrize(
"index",
[
None,
pd.Index(["a", "b", "c"]),
pd.MultiIndex.from_tuples(
[(0, "a"), (0, "b"), (1, "a")], names=["l0", "l1"]
),
],
)
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("na_position", ["first", "last"])
@pytest.mark.parametrize("ignore_index", [True, False])
def test_sort_values(data, index, ascending, na_position, ignore_index):
key_func = functools.partial(key_func_builder, na_position=na_position)

ps = pd.Series(data, index=index)
gs = cudf.from_pandas(ps)

expected = ps.apply(
lambda x: sorted(x, key=key_func, reverse=not ascending)
if x is not None
else None
)
if ignore_index:
expected.reset_index(drop=True, inplace=True)
got = gs.list.sort_values(
ascending=ascending, na_position=na_position, ignore_index=ignore_index
)

assert_eq(expected, got)


@pytest.mark.parametrize(
"data, index, expect",
[
Expand Down