From b5cdd554310434878e6caffb97a3990663869f03 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 2 Jul 2024 12:36:49 -0700 Subject: [PATCH] Initial commit --- python/cudf/cudf/_lib/lists.pyx | 29 +++------- .../_lib/pylibcudf/libcudf/lists/sorting.pxd | 5 ++ python/cudf/cudf/_lib/pylibcudf/lists.pxd | 2 + python/cudf/cudf/_lib/pylibcudf/lists.pyx | 55 ++++++++++++++++++- .../cudf/cudf/pylibcudf_tests/test_lists.py | 41 ++++++++++++++ 5 files changed, 109 insertions(+), 23 deletions(-) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 0ad09dba717..4f91433ccef 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -16,17 +16,12 @@ from cudf._lib.pylibcudf.libcudf.lists.extract cimport extract_list_element from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( lists_column_view, ) -from cudf._lib.pylibcudf.libcudf.lists.sorting cimport ( - sort_lists as cpp_sort_lists, -) from cudf._lib.pylibcudf.libcudf.lists.stream_compaction cimport ( distinct as cpp_distinct, ) from cudf._lib.pylibcudf.libcudf.types cimport ( nan_equality, null_equality, - null_order, - order, size_type, ) from cudf._lib.utils cimport columns_from_pylibcudf_table @@ -94,24 +89,14 @@ def distinct(Column col, bool nulls_equal, bool nans_all_equal): @acquire_spill_lock() def sort_lists(Column col, bool ascending, str na_position): - cdef shared_ptr[lists_column_view] list_view = ( - make_shared[lists_column_view](col.view()) - ) - cdef order c_sort_order = ( - order.ASCENDING if ascending else order.DESCENDING - ) - cdef null_order c_null_prec = ( - null_order.BEFORE if na_position == "first" else null_order.AFTER - ) - - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_sort_lists(list_view.get()[0], c_sort_order, c_null_prec) + return Column.from_pylibcudf( + pylibcudf.lists.sort_lists( + col.to_pylibcudf(mode="read"), + ascending, + na_position, + False, ) - - return 
Column.from_unique_ptr(move(c_result)) + ) @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/sorting.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/sorting.pxd index 145ab41302f..56bd937752c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/sorting.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/sorting.pxd @@ -15,3 +15,8 @@ cdef extern from "cudf/lists/sorting.hpp" namespace "cudf::lists" nogil: order column_order, null_order null_precedence ) except + + cdef unique_ptr[column] stable_sort_lists( + const lists_column_view source_column, + order column_order, + null_order null_precedence + ) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd index 2ccf0139e90..22024829948 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd @@ -23,3 +23,5 @@ cpdef Column contains(Column, ColumnOrScalar) cpdef Column contains_nulls(Column) cpdef Column index_of(Column, ColumnOrScalar, bool) + +cpdef Column sort_lists(Column, bool, str, bool) diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx index a94d940accd..1bfffc3a83e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx @@ -15,8 +15,12 @@ from cudf._lib.pylibcudf.libcudf.lists.combine cimport ( concatenate_null_policy, concatenate_rows as cpp_concatenate_rows, ) +from cudf._lib.pylibcudf.libcudf.lists.sorting cimport ( + sort_lists as cpp_sort_lists, + stable_sort_lists as cpp_stable_sort_lists, +) from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from cudf._lib.pylibcudf.libcudf.types cimport null_order, order, size_type from cudf._lib.pylibcudf.lists cimport ColumnOrScalar from .column cimport Column, ListColumnView @@ -206,3 +210,52 @@ cpdef Column index_of(Column input, ColumnOrScalar search_key, bool 
find_first_o find_option, )) return Column.from_libcudf(move(c_result)) + + +cpdef Column sort_lists(Column input, bool ascending, str na_position, bool stable): + """Sort the elements within a list in each row of a list column. + + For details, see :cpp:func:`sort_lists`. + + Parameters + ---------- + input : Column + The input column. + ascending : bool + If true, the sort order is ascending. Otherwise, the sort order is descending. + na_position : str + If na_position equals "first", then the null values in the output + column are placed first. Otherwise, they are placed after. + stable : bool + If true, :cpp:func:`stable_sort_lists` is used. Otherwise, + :cpp:func:`sort_lists` is used. + + Returns + ------- + Column + A new Column with elements in each list sorted. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView list_view = input.list_view() + + cdef order c_sort_order = ( + order.ASCENDING if ascending else order.DESCENDING + ) + cdef null_order c_null_prec = ( + null_order.BEFORE if na_position == "first" else null_order.AFTER + ) + + with nogil: + if stable: + c_result = move(cpp_stable_sort_lists( + list_view.view(), + c_sort_order, + c_null_prec, + )) + else: + c_result = move(cpp_sort_lists( + list_view.view(), + c_sort_order, + c_null_prec, + )) + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py index c781126e388..05084a30fb9 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_lists.py +++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py @@ -22,6 +22,11 @@ def column(): return pa.array([3, 2, 5, 6]), pa.array([-1, 0, 0, 0], type=pa.int32()) +@pytest.fixture +def lists_column(): + return [[4, 2, 3, 1], [1, 2, None, 4], [-10, 10, 0]] + + def test_concatenate_rows(test_data): arrow_tbl = pa.Table.from_arrays(test_data[0], names=["a", "b"]) plc_tbl = plc.interop.from_arrow(arrow_tbl) @@ -134,3 +139,39 @@ def 
test_index_of_list_column(test_data, column): expect = pa.array(column[1], type=pa.int32()) assert_column_eq(expect, res) + + +@pytest.mark.parametrize( + "ascending,na_position,expected", + [ + ( + True, + "first", + [[1, 2, 3, 4], [None, 1, 2, 4], [-10, 0, 10]], + ), + ( + True, + "second", + [[1, 2, 3, 4], [1, 2, 4, None], [-10, 0, 10]], + ), + ( + False, + "first", + [[4, 3, 2, 1], [4, 2, 1, None], [10, 0, -10]], + ), + ( + False, + "second", + [[4, 3, 2, 1], [None, 4, 2, 1], [10, 0, -10]], + ), + ], +) +def test_sort_lists(lists_column, ascending, na_position, expected): + plc_column = plc.interop.from_arrow(pa.array(lists_column)) + res = plc.lists.sort_lists(plc_column, ascending, na_position, False) + res_stable = plc.lists.sort_lists(plc_column, ascending, na_position, True) + + expect = pa.array(expected) + + assert_column_eq(expect, res) + assert_column_eq(expect, res_stable)