Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add lists.sort_values API #7657

Merged
merged 11 commits into from
Mar 24, 2021
15 changes: 15 additions & 0 deletions python/cudf/cudf/_lib/cpp/lists/sorting.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
isVoid marked this conversation as resolved.
Show resolved Hide resolved

from libcpp.memory cimport unique_ptr

from cudf._lib.cpp.types cimport order, null_order
from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view


cdef extern from "cudf/lists/sorting.hpp" namespace "cudf::lists" nogil:
cdef unique_ptr[column] sort_lists(
const lists_column_view source_column,
order column_order,
null_order null_precedence
)
isVoid marked this conversation as resolved.
Show resolved Hide resolved
29 changes: 28 additions & 1 deletion python/cudf/cudf/_lib/lists.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,16 @@ from cudf._lib.cpp.lists.count_elements cimport (
from cudf._lib.cpp.lists.explode cimport (
explode_outer as cpp_explode_outer
)
from cudf._lib.cpp.lists.sorting cimport (
sort_lists as cpp_sort_lists
)
from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.column.column cimport column

from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.types cimport size_type
from cudf._lib.cpp.types cimport size_type, order, null_order

from cudf._lib.column cimport Column
from cudf._lib.table cimport Table
Expand Down Expand Up @@ -58,3 +61,27 @@ def explode_outer(Table tbl, int explode_column_idx, bool ignore_index=False):
column_names=tbl._column_names,
index_names=None if ignore_index else tbl._index_names
)


def sort_lists(Column col, bool ascending, object na_position):
isVoid marked this conversation as resolved.
Show resolved Hide resolved
if not isinstance(col.dtype, ListDtype):
raise TypeError("col is not a list column.")
isVoid marked this conversation as resolved.
Show resolved Hide resolved

cdef shared_ptr[lists_column_view] list_view = (
make_shared[lists_column_view](col.view())
)
cdef order c_sort_order = (
order.ASCENDING if ascending else order.DESCENDING
)
cdef null_order c_null_prec = (
null_order.BEFORE if na_position == "first" else null_order.AFTER
)

cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_sort_lists(list_view.get()[0], c_sort_order, c_null_prec)
)

return Column.from_unique_ptr(move(c_result))
55 changes: 54 additions & 1 deletion python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import cudf
from cudf._lib.copying import segmented_gather
from cudf._lib.lists import count_elements
from cudf._lib.lists import count_elements, sort_lists
from cudf.core.buffer import Buffer
from cudf.core.column import ColumnBase, as_column, column
from cudf.core.column.methods import ColumnMethodsMixin
Expand Down Expand Up @@ -285,3 +285,56 @@ def take(self, lists_indices):
raise
else:
return res

def sort_values(
self,
ascending=True,
inplace=False,
kind="quicksort",
na_position="last",
ignore_index=False,
):
"""
Sort each list by the values.

Sort the lists in ascending or descending order by some criterion.

Parameters
----------
ascending : bool, default True
If True, sort values in ascending order, otherwise descending.
na_position : {'first', 'last'}, default 'last'
'first' puts nulls at the beginning, 'last' puts nulls at the end.
ignore_index : bool, default False
If True, the resulting axis will be labeled 0, 1, ..., n - 1.

Returns
-------
ListColumn with each list sorted

Notes
-----
Difference from pandas:
* Not supporting: `inplace`, `kind`
isVoid marked this conversation as resolved.
Show resolved Hide resolved

Examples
--------
>>> import cudf
>>> s = cudf.Series([[4, 2, None, 9], [8, 8, 2], [2, 1]])
isVoid marked this conversation as resolved.
Show resolved Hide resolved
>>> s.sort_values(ascending=True, na_position="last")
[2, 4, 9, None],
[2, 8, 8],
[1, 2]
type: list
"""
if inplace:
raise NotImplementedError("`inplace` not currently implemented.")
if kind != "quicksort":
raise NotImplementedError("`kind` not currently implemented.")
if na_position not in {"first", "last"}:
raise ValueError(f"Unknown `na_position` value {na_position}")

return self._return_or_inplace(
sort_lists(self._column, ascending, na_position),
retain_index=not ignore_index,
)
52 changes: 52 additions & 0 deletions python/cudf/cudf/tests/test_list.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
import functools

import pandas as pd
import pyarrow as pa
Expand Down Expand Up @@ -159,3 +160,54 @@ def test_take_invalid(invalid, exception):
gs = cudf.Series([[0, 1], [2, 3]])
with exception:
gs.list.take(invalid)


def key_func_builder(x, na_position):
if x is None:
if na_position == "first":
return -1e8
else:
return 1e8
else:
return x


@pytest.mark.parametrize(
"data",
[
[[4, 2, None, 9], [8, 8, 2], [2, 1]],
[[4, 2, None, 9], [8, 8, 2], None],
[[4, 2, None, 9], [], None],
],
)
@pytest.mark.parametrize(
"index",
[
None,
pd.Index(["a", "b", "c"]),
pd.MultiIndex.from_tuples(
[(0, "a"), (0, "b"), (1, "a")], names=["l0", "l1"]
),
],
)
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("na_position", ["first", "last"])
@pytest.mark.parametrize("ignore_index", [True, False])
def test_sort_values(data, index, ascending, na_position, ignore_index):
key_func = functools.partial(key_func_builder, na_position=na_position)

ps = pd.Series(data, index=index)
gs = cudf.from_pandas(ps)

expected = ps.apply(
lambda x: sorted(x, key=key_func, reverse=not ascending)
if x is not None
else None
)
if ignore_index:
expected.reset_index(drop=True, inplace=True)
got = gs.list.sort_values(
ascending=ascending, na_position=na_position, ignore_index=ignore_index
)

assert_eq(expected, got)