Skip to content

Commit

Permalink
Add lists.sort_values API (rapidsai#7657)
Browse files Browse the repository at this point in the history
Closes rapidsai#7467 

Introduces the list method `list.sort_values`, which sorts each list of a LIST column according to the given criteria. The method's signature is aligned with `Series.sort_values`. Example:

```python
>>> s = cudf.Series([[4, 2, None, 9], [8, 8, 2], [2, 1]])
>>> s.list.sort_values(ascending=False, na_position="last")
0    [nan, 9.0, 4.0, 2.0]
1         [8.0, 8.0, 2.0]
2              [2.0, 1.0]
dtype: list
```

This PR also includes exposing `ListMethods` to docs and a small docstring fix to `cudf.Series`.

Authors:
  - Michael Wang (@isVoid)

Approvers:
  - GALI PREM SAGAR (@galipremsagar)
  - Keith Kraus (@kkraus14)

URL: rapidsai#7657
  • Loading branch information
isVoid authored Mar 24, 2021
1 parent 2aa9f5b commit 1e9f8f8
Show file tree
Hide file tree
Showing 6 changed files with 160 additions and 5 deletions.
9 changes: 8 additions & 1 deletion docs/cudf/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,13 @@ Series
:inherited-members:
:exclude-members: serialize, deserialize, logical_not, logical_or, logical_and, remainder, sum_of_squares, fill, merge, iteritems, items, device_deserialize, device_serialize, host_deserialize, host_serialize, to_dict, tolist, to_list

Lists
-----
.. currentmodule:: cudf.core.column.lists

.. autoclass:: ListMethods
:members:

Strings
-------
.. currentmodule:: cudf.core.column.string
Expand Down Expand Up @@ -253,4 +260,4 @@ GpuArrowReader
.. currentmodule:: cudf.comm.gpuarrow
.. autoclass:: GpuArrowReader
:members:
:exclude-members: count, index
:exclude-members: count, index
15 changes: 15 additions & 0 deletions python/cudf/cudf/_lib/cpp/lists/sorting.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr

from cudf._lib.cpp.types cimport order, null_order
from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view


# Cython declaration of the C++ function ``cudf::lists::sort_lists`` from
# the header ``cudf/lists/sorting.hpp``.
#
# Arguments, as declared below:
#   source_column   - the lists column view to operate on
#   column_order    - an ``order`` value (sort direction)
#   null_precedence - a ``null_order`` value (where nulls are placed)
#
# The result is a newly owned ``column`` handed back through ``unique_ptr``.
# The block is ``nogil``, so the function may be called without holding the
# GIL, and ``except +`` makes Cython translate C++ exceptions into Python
# exceptions.
# NOTE(review): presumably this sorts the elements inside each list row
# independently -- confirm against the libcudf documentation.
cdef extern from "cudf/lists/sorting.hpp" namespace "cudf::lists" nogil:
cdef unique_ptr[column] sort_lists(
const lists_column_view source_column,
order column_order,
null_order null_precedence
) except +
32 changes: 29 additions & 3 deletions python/cudf/cudf/_lib/lists.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -10,24 +10,29 @@ from cudf._lib.cpp.lists.count_elements cimport (
from cudf._lib.cpp.lists.explode cimport (
explode_outer as cpp_explode_outer
)
from cudf._lib.cpp.lists.sorting cimport (
sort_lists as cpp_sort_lists
)
from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.column.column cimport column

from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.types cimport size_type
from cudf._lib.cpp.types cimport size_type, order, null_order

from cudf._lib.column cimport Column
from cudf._lib.table cimport Table

from cudf._lib.types cimport (
underlying_type_t_null_order, underlying_type_t_order
)
from cudf.core.dtypes import ListDtype

from cudf._lib.cpp.lists.extract cimport extract_list_element


def count_elements(Column col):
if not isinstance(col.dtype, ListDtype):
raise TypeError("col is not a list column.")

# shared_ptr required because lists_column_view has no default
# ctor
Expand Down Expand Up @@ -61,6 +66,27 @@ def explode_outer(Table tbl, int explode_column_idx, bool ignore_index=False):
)


def sort_lists(Column col, bool ascending, str na_position):
    """
    Call libcudf's ``sort_lists`` on a column and wrap the result.

    Parameters
    ----------
    col : Column
        Column whose view is wrapped in a ``lists_column_view``.
    ascending : bool
        ``True`` selects ``order.ASCENDING``, ``False`` selects
        ``order.DESCENDING``.
    na_position : str
        ``"first"`` selects ``null_order.BEFORE``; any other value selects
        ``null_order.AFTER`` (no validation is done at this level).

    Returns
    -------
    Column
        A new column built from the ``unique_ptr`` returned by libcudf.
    """
    # Held through a shared_ptr, as in the other wrappers of this module
    # (lists_column_view has no default constructor).
    cdef shared_ptr[lists_column_view] lists_view = (
        make_shared[lists_column_view](col.view())
    )
    cdef order sort_direction
    cdef null_order null_placement
    cdef unique_ptr[column] sorted_col

    if ascending:
        sort_direction = order.ASCENDING
    else:
        sort_direction = order.DESCENDING

    if na_position == "first":
        null_placement = null_order.BEFORE
    else:
        null_placement = null_order.AFTER

    # The libcudf call needs no Python objects, so release the GIL for it.
    with nogil:
        sorted_col = move(
            cpp_sort_lists(
                lists_view.get()[0], sort_direction, null_placement
            )
        )

    return Column.from_unique_ptr(move(sorted_col))


def extract_element(Column col, size_type index):
# shared_ptr required because lists_column_view has no default
# ctor
Expand Down
56 changes: 55 additions & 1 deletion python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import cudf
from cudf._lib.copying import segmented_gather
from cudf._lib.lists import count_elements, extract_element
from cudf._lib.lists import count_elements, extract_element, sort_lists
from cudf.core.buffer import Buffer
from cudf.core.column import ColumnBase, as_column, column
from cudf.core.column.methods import ColumnMethodsMixin
Expand Down Expand Up @@ -317,3 +317,57 @@ def take(self, lists_indices):
raise
else:
return res

def sort_values(
    self,
    ascending=True,
    inplace=False,
    kind="quicksort",
    na_position="last",
    ignore_index=False,
):
    """
    Sort the elements inside each list.

    Every list of the column is sorted on its own, in ascending or
    descending order.

    Parameters
    ----------
    ascending : bool, default True
        If True, sort values in ascending order, otherwise descending.
    inplace : bool, default False
        Only ``False`` is supported; a truthy value raises
        ``NotImplementedError``.
    kind : str, default 'quicksort'
        Only ``'quicksort'`` is accepted; any other value raises
        ``NotImplementedError``.
    na_position : {'first', 'last'}, default 'last'
        'first' puts nulls at the beginning, 'last' puts nulls at the end.
    ignore_index : bool, default False
        If True, the resulting axis will be labeled 0, 1, ..., n - 1.

    Returns
    -------
    The sorted lists, as produced by ``_return_or_inplace``; the index is
    retained unless ``ignore_index`` is True.

    Raises
    ------
    NotImplementedError
        If ``inplace`` is truthy, ``kind`` is not ``'quicksort'``, or the
        lists themselves contain lists (nested lists).
    ValueError
        If ``na_position`` is neither ``'first'`` nor ``'last'``.

    Notes
    -----
    Difference from pandas:
      * Not supporting: `inplace`, `kind`
      * With ``ascending=False`` the null placement is mirrored as well,
        e.g. ``[4, 2, None, 9]`` sorted with ``ascending=False,
        na_position="last"`` gives ``[nan, 9.0, 4.0, 2.0]``.

    Examples
    --------
    >>> s = cudf.Series([[4, 2, None, 9], [8, 8, 2], [2, 1]])
    >>> s.list.sort_values(ascending=True, na_position="last")
    0    [2.0, 4.0, 9.0, nan]
    1         [2.0, 8.0, 8.0]
    2              [1.0, 2.0]
    dtype: list
    """
    # Reject the pandas options that have no implementation here.
    if inplace:
        raise NotImplementedError("`inplace` not currently implemented.")
    if kind != "quicksort":
        raise NotImplementedError("`kind` not currently implemented.")

    if na_position not in {"first", "last"}:
        raise ValueError(f"Unknown `na_position` value {na_position}")

    # children[1] is the child column whose dtype tells whether the lists
    # hold lists themselves.
    element_dtype = self._column.children[1].dtype
    if is_list_dtype(element_dtype):
        raise NotImplementedError("Nested lists sort is not supported.")

    sorted_column = sort_lists(self._column, ascending, na_position)
    keep_index = not ignore_index
    return self._return_or_inplace(sorted_column, retain_index=keep_index)
)
1 change: 1 addition & 0 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3571,6 +3571,7 @@ def sort_values(
4 3
3 4
1 5
dtype: int64
"""

if inplace:
Expand Down
52 changes: 52 additions & 0 deletions python/cudf/cudf/tests/test_list.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
import functools

import pandas as pd
import pyarrow as pa
Expand Down Expand Up @@ -161,6 +162,57 @@ def test_take_invalid(invalid, exception):
gs.list.take(invalid)


def key_func_builder(x, na_position):
    """
    Return a sort key for ``x`` that gives ``None`` a defined position.

    Intended for use with ``functools.partial`` as the ``key`` argument of
    ``sorted`` when a list may contain ``None``.

    Parameters
    ----------
    x : number or None
        The element being sorted.
    na_position : str
        ``"first"`` maps ``None`` to negative infinity; any other value
        maps it to positive infinity.

    Returns
    -------
    ``x`` itself when it is not ``None``, otherwise ``-inf`` or ``+inf``.
    """
    if x is not None:
        return x
    # Infinities (instead of a finite magic number such as 1e8) keep the
    # null on the requested side no matter how large the real values are.
    return float("-inf") if na_position == "first" else float("inf")


@pytest.mark.parametrize(
    "data",
    [
        [[4, 2, None, 9], [8, 8, 2], [2, 1]],
        [[4, 2, None, 9], [8, 8, 2], None],
        [[4, 2, None, 9], [], None],
    ],
)
@pytest.mark.parametrize(
    "index",
    [
        None,
        pd.Index(["a", "b", "c"]),
        pd.MultiIndex.from_tuples(
            [(0, "a"), (0, "b"), (1, "a")], names=["l0", "l1"]
        ),
    ],
)
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("na_position", ["first", "last"])
@pytest.mark.parametrize("ignore_index", [True, False])
def test_sort_values(data, index, ascending, na_position, ignore_index):
    """
    Compare ``list.sort_values`` against a per-row Python ``sorted``.

    The reference result sorts every non-null row with ``sorted`` (using
    ``key_func_builder`` to position ``None`` elements and ``reverse`` for
    descending order); null rows stay ``None``.
    """
    null_aware_key = functools.partial(
        key_func_builder, na_position=na_position
    )

    def sort_row(row):
        # A null row has nothing to sort and stays null.
        if row is None:
            return None
        return sorted(row, key=null_aware_key, reverse=not ascending)

    pandas_series = pd.Series(data, index=index)
    cudf_series = cudf.from_pandas(pandas_series)

    expected = pandas_series.apply(sort_row)
    if ignore_index:
        expected = expected.reset_index(drop=True)

    got = cudf_series.list.sort_values(
        ascending=ascending,
        na_position=na_position,
        ignore_index=ignore_index,
    )

    assert_eq(expected, got)


@pytest.mark.parametrize(
"data, index, expect",
[
Expand Down

0 comments on commit 1e9f8f8

Please sign in to comment.