diff --git a/docs/cudf/source/api.rst b/docs/cudf/source/api.rst index df7e92c125d..b4ca0321073 100644 --- a/docs/cudf/source/api.rst +++ b/docs/cudf/source/api.rst @@ -20,6 +20,13 @@ Series :inherited-members: :exclude-members: serialize, deserialize, logical_not, logical_or, logical_and, remainder, sum_of_squares, fill, merge, iteritems, items, device_deserialize, device_serialize, host_deserialize, host_serialize, to_dict, tolist, to_list +Lists +----- +.. currentmodule:: cudf.core.column.lists + +.. autoclass:: ListMethods + :members: + Strings ------- .. currentmodule:: cudf.core.column.string @@ -253,4 +260,4 @@ GpuArrowReader .. currentmodule:: cudf.comm.gpuarrow .. autoclass:: GpuArrowReader :members: - :exclude-members: count, index \ No newline at end of file + :exclude-members: count, index diff --git a/python/cudf/cudf/_lib/cpp/lists/sorting.pxd b/python/cudf/cudf/_lib/cpp/lists/sorting.pxd new file mode 100644 index 00000000000..55e8e09427c --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/lists/sorting.pxd @@ -0,0 +1,15 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp.types cimport order, null_order +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view + + +cdef extern from "cudf/lists/sorting.hpp" namespace "cudf::lists" nogil: + cdef unique_ptr[column] sort_lists( + const lists_column_view source_column, + order column_order, + null_order null_precedence + ) except + diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index fc9dd681242..2971aad8313 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -10,24 +10,29 @@ from cudf._lib.cpp.lists.count_elements cimport ( from cudf._lib.cpp.lists.explode cimport ( explode_outer as cpp_explode_outer ) +from cudf._lib.cpp.lists.sorting cimport ( + sort_lists as cpp_sort_lists +) from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.cpp.types cimport size_type +from cudf._lib.cpp.types cimport size_type, order, null_order + from cudf._lib.column cimport Column from cudf._lib.table cimport Table +from cudf._lib.types cimport ( + underlying_type_t_null_order, underlying_type_t_order +) from cudf.core.dtypes import ListDtype from cudf._lib.cpp.lists.extract cimport extract_list_element def count_elements(Column col): - if not isinstance(col.dtype, ListDtype): - raise TypeError("col is not a list column.") # shared_ptr required because lists_column_view has no default # ctor @@ -61,6 +66,27 @@ def explode_outer(Table tbl, int explode_column_idx, bool ignore_index=False): ) +def sort_lists(Column col, bool ascending, str na_position): + cdef shared_ptr[lists_column_view] list_view = ( + make_shared[lists_column_view](col.view()) + ) + cdef order c_sort_order = ( + 
def sort_values(
    self,
    ascending=True,
    inplace=False,
    kind="quicksort",
    na_position="last",
    ignore_index=False,
):
    """
    Sort the elements inside each list.

    Every list of the column is ordered on its own, either ascending or
    descending; the order of the lists themselves is left untouched.

    Parameters
    ----------
    ascending : bool, default True
        If True, sort values in ascending order, otherwise descending.
    inplace : bool, default False
        Not supported; a truthy value raises ``NotImplementedError``.
    kind : str, default 'quicksort'
        Not supported; any value other than ``'quicksort'`` raises
        ``NotImplementedError``.
    na_position : {'first', 'last'}, default 'last'
        'first' puts nulls at the beginning, 'last' puts nulls at the end.
    ignore_index : bool, default False
        If True, the resulting axis will be labeled 0, 1, ..., n - 1.

    Returns
    -------
    ListColumn with each list sorted

    Raises
    ------
    NotImplementedError
        If ``inplace`` is truthy, if ``kind`` is not ``'quicksort'``, or
        if the lists themselves contain lists.
    ValueError
        If ``na_position`` is neither ``'first'`` nor ``'last'``.

    Notes
    -----
    Difference from pandas:
      * Not supporting: `inplace`, `kind`

    Examples
    --------
    >>> s = cudf.Series([[4, 2, None, 9], [8, 8, 2], [2, 1]])
    >>> s.list.sort_values(ascending=True, na_position="last")
    0    [2.0, 4.0, 9.0, nan]
    1         [2.0, 8.0, 8.0]
    2              [1.0, 2.0]
    dtype: list
    """
    # Reject the pandas options this implementation does not offer before
    # looking at the column at all.
    if inplace:
        raise NotImplementedError("`inplace` not currently implemented.")
    if kind != "quicksort":
        raise NotImplementedError("`kind` not currently implemented.")

    allowed_positions = {"first", "last"}
    if na_position not in allowed_positions:
        raise ValueError(f"Unknown `na_position` value {na_position}")

    # children[1] is presumably the child column holding the list elements
    # (TODO confirm against ListColumn); a list dtype there means the
    # column is a list of lists.
    element_dtype = self._column.children[1].dtype
    if is_list_dtype(element_dtype):
        raise NotImplementedError("Nested lists sort is not supported.")

    sorted_lists = sort_lists(self._column, ascending, na_position)
    return self._return_or_inplace(
        sorted_lists, retain_index=not ignore_index
    )
def key_func_builder(x, na_position):
    """Return the sort key for one list element, placing nulls explicitly.

    Intended to be used as a ``key=`` function (with ``na_position`` bound,
    e.g. through ``functools.partial``) when building the expected result
    of a per-list sort in pure Python.

    Parameters
    ----------
    x : object or None
        The list element. ``None`` stands for a null entry.
    na_position : str
        ``"first"`` maps nulls to negative infinity so they compare below
        every value; any other value maps them to positive infinity.

    Returns
    -------
    object
        ``x`` itself when it is not ``None``, otherwise ``-inf`` or
        ``+inf`` as described above.

    Notes
    -----
    Infinite sentinels are used instead of a large finite constant so that
    elements of any magnitude are still ordered correctly relative to
    nulls. Because the placement is expressed through the key, sorting
    with ``reverse=True`` mirrors it (a ``-inf`` key ends up last).
    """
    if x is not None:
        return x
    return float("-inf") if na_position == "first" else float("inf")