From 6030947e4368182ebcfb42ae068a647584621152 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 19 Mar 2021 11:05:53 -0700 Subject: [PATCH 1/9] Initial --- python/cudf/cudf/_lib/cpp/lists/sorting.pxd | 15 ++++++ python/cudf/cudf/_lib/lists.pyx | 29 ++++++++++- python/cudf/cudf/core/column/lists.py | 54 ++++++++++++++++++++- python/cudf/cudf/tests/test_list.py | 52 ++++++++++++++++++++ 4 files changed, 148 insertions(+), 2 deletions(-) create mode 100644 python/cudf/cudf/_lib/cpp/lists/sorting.pxd diff --git a/python/cudf/cudf/_lib/cpp/lists/sorting.pxd b/python/cudf/cudf/_lib/cpp/lists/sorting.pxd new file mode 100644 index 00000000000..cf8aee219bb --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/lists/sorting.pxd @@ -0,0 +1,15 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp.types cimport order, null_order +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view + + +cdef extern from "cudf/lists/sorting.hpp" namespace "cudf::lists" nogil: + cdef unique_ptr[column] sort_lists( + const lists_column_view source_column, + order column_order, + null_order null_precedence + ) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 0f0ee35556a..178fe97ba83 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -10,13 +10,16 @@ from cudf._lib.cpp.lists.count_elements cimport ( from cudf._lib.cpp.lists.explode cimport ( explode_outer as cpp_explode_outer ) +from cudf._lib.cpp.lists.sorting cimport ( + sort_lists as cpp_sort_lists +) from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.cpp.types cimport size_type +from cudf._lib.cpp.types cimport size_type, order, null_order from cudf._lib.column cimport Column from cudf._lib.table cimport Table @@ -58,3 +61,27 @@ def explode_outer(Table tbl, int explode_column_idx, bool ignore_index=False): column_names=tbl._column_names, index_names=None if ignore_index else tbl._index_names ) + + +def sort_lists(Column col, bool ascending, object na_position): + if not isinstance(col.dtype, ListDtype): + raise TypeError("col is not a list column.") + + cdef shared_ptr[lists_column_view] list_view = ( + make_shared[lists_column_view](col.view()) + ) + cdef order c_sort_order = ( + order.ASCENDING if ascending else order.DESCENDING + ) + cdef null_order c_null_prec = ( + null_order.BEFORE if na_position == "first" else null_order.AFTER + ) + + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_sort_lists(list_view.get()[0], c_sort_order, c_null_prec) + ) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 1d3f73822a9..052b17b7488 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -7,7 +7,7 @@ import cudf from cudf._lib.copying import segmented_gather -from cudf._lib.lists import count_elements +from cudf._lib.lists import count_elements, sort_lists from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, as_column, column from cudf.core.column.methods import ColumnMethodsMixin @@ -285,3 +285,55 @@ def take(self, lists_indices): raise else: return res + + def sort_values( + self, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + ignore_index=False, + ): + """ + Sort each list by the values. + + Sort the lists in ascending or descending order by some criterion. + + Parameters + ---------- + ascending : bool, default True + If True, sort values in ascending order, otherwise descending. + na_position : {'first', 'last'}, default 'last' + 'first' puts nulls at the beginning, 'last' puts nulls at the end. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, ..., n - 1. + + Returns + ------- + ListColumn with each list sorted + + Notes + ----- + Difference from pandas: + * Not supporting: `inplace`, `kind` + + Examples + -------- + >>> import cudf + >>> s = cudf.Series([[4, 2, None, 9], [8, 8, 2], [2, 1]]) + >>> s.sort_values(ascending=True, na_position="last") + [2, 4, 9, None], + [2, 8, 8], + [1, 2] + type: list + """ + if inplace: + raise NotImplementedError("`inplace` not currently implemented.") + if kind != "quicksort": + raise NotImplementedError("`kind` not currently implemented.") + + return self._return_or_inplace( + sort_lists(self._column, ascending, na_position), + inplace=inplace, + retain_index=not ignore_index, + ) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 33812cfa7a7..685f10c1d25 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -1,4 +1,5 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. +import functools import pandas as pd import pyarrow as pa @@ -159,3 +160,54 @@ def test_take_invalid(invalid, exception): gs = cudf.Series([[0, 1], [2, 3]]) with exception: gs.list.take(invalid) + + +def key_func_builder(x, na_position): + if x is None: + if na_position == "first": + return -1e8 + else: + return 1e8 + else: + return x + + +@pytest.mark.parametrize( + "data", + [ + [[4, 2, None, 9], [8, 8, 2], [2, 1]], + [[4, 2, None, 9], [8, 8, 2], None], + [[4, 2, None, 9], [], None], + ], +) +@pytest.mark.parametrize( + "index", + [ + None, + pd.Index(["a", "b", "c"]), + pd.MultiIndex.from_tuples( + [(0, "a"), (0, "b"), (1, "a")], names=["l0", "l1"] + ), + ], +) +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("na_position", ["first", "last"]) +@pytest.mark.parametrize("ignore_index", [True, False]) +def test_sort_values(data, index, ascending, na_position, ignore_index): + key_func = functools.partial(key_func_builder, na_position=na_position) + + ps = pd.Series(data, index=index) + gs = cudf.from_pandas(ps) + + expected = ps.apply( + lambda x: sorted(x, key=key_func, reverse=not ascending) + if x is not None + else None + ) + if ignore_index: + expected.reset_index(drop=True, inplace=True) + got = gs.list.sort_values( + ascending=ascending, na_position=na_position, ignore_index=ignore_index + ) + + assert_eq(expected, got) From 6fe171a934495f1952acbf1c9cd54664c3b36b30 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 19 Mar 2021 12:39:15 -0700 Subject: [PATCH 2/9] remove indirection, na_pos check --- python/cudf/cudf/core/column/lists.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 052b17b7488..3c145055700 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -331,9 +331,10 @@ def sort_values( raise NotImplementedError("`inplace` not currently implemented.") if kind != "quicksort": raise NotImplementedError("`kind` not currently implemented.") + if na_position not in {"first", "last"}: + raise ValueError(f"Unknown `na_position` value {na_position}") return self._return_or_inplace( sort_lists(self._column, ascending, na_position), - inplace=inplace, retain_index=not ignore_index, ) From b8b884de1dacce0b65fa3598299dfecc104805fc Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 19 Mar 2021 13:33:23 -0700 Subject: [PATCH 3/9] License year Co-authored-by: GALI PREM SAGAR --- python/cudf/cudf/_lib/cpp/lists/sorting.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/cpp/lists/sorting.pxd b/python/cudf/cudf/_lib/cpp/lists/sorting.pxd index cf8aee219bb..5371c898d90 100644 --- a/python/cudf/cudf/_lib/cpp/lists/sorting.pxd +++ b/python/cudf/cudf/_lib/cpp/lists/sorting.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2021, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr From 56e06c8e77c06448d14ac253bf28bd73f9388acf Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 19 Mar 2021 15:03:28 -0700 Subject: [PATCH 4/9] Use internal enum classes for order and null_order --- python/cudf/cudf/_lib/lists.pyx | 17 +++++++---------- python/cudf/cudf/core/column/lists.py | 10 ++++++++-- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 178fe97ba83..fdce1d7740f 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -24,12 +24,14 @@ from cudf._lib.cpp.types cimport size_type, order, null_order from cudf._lib.column cimport Column from cudf._lib.table cimport Table +from cudf._lib.types cimport ( + underlying_type_t_null_order, underlying_type_t_order +) +from cudf._lib.types import Order, NullOrder from cudf.core.dtypes import ListDtype def count_elements(Column col): - if not isinstance(col.dtype, ListDtype): - raise TypeError("col is not a list column.") # shared_ptr required because lists_column_view has no default # ctor @@ -63,18 +65,13 @@ def explode_outer(Table tbl, int explode_column_idx, bool ignore_index=False): ) -def sort_lists(Column col, bool ascending, object na_position): - if not isinstance(col.dtype, ListDtype): - raise TypeError("col is not a list column.") - +def sort_lists(Column col, object order_enum, object null_order_enum): cdef shared_ptr[lists_column_view] list_view = ( make_shared[lists_column_view](col.view()) ) - cdef order c_sort_order = ( - order.ASCENDING if ascending else order.DESCENDING - ) + cdef order c_sort_order = order_enum.value cdef null_order c_null_prec = ( - null_order.BEFORE if na_position == "first" else null_order.AFTER + null_order_enum.value ) cdef unique_ptr[column] c_result diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 3c145055700..daad5e8058f 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -8,6 +8,7 @@ import cudf from cudf._lib.copying import segmented_gather from cudf._lib.lists import count_elements, sort_lists +from cudf._lib.types import NullOrder, Order from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, as_column, column from cudf.core.column.methods import ColumnMethodsMixin @@ -321,7 +322,7 @@ def sort_values( -------- >>> import cudf >>> s = cudf.Series([[4, 2, None, 9], [8, 8, 2], [2, 1]]) - >>> s.sort_values(ascending=True, na_position="last") + >>> s.list.sort_values(ascending=True, na_position="last") [2, 4, 9, None], [2, 8, 8], [1, 2] @@ -334,7 +335,12 @@ def sort_values( if na_position not in {"first", "last"}: raise ValueError(f"Unknown `na_position` value {na_position}") + sort_order = Order.ASCENDING if ascending else Order.DESCENDING + null_order = ( + NullOrder.BEFORE if na_position == "first" else NullOrder.AFTER + ) + return self._return_or_inplace( - sort_lists(self._column, ascending, na_position), + sort_lists(self._column, sort_order, null_order), retain_index=not ignore_index, ) From 8c1b763d5004cf7966a20f5e1af992a18fa59d91 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 19 Mar 2021 15:10:15 -0700 Subject: [PATCH 5/9] capture cpp exception --- python/cudf/cudf/_lib/cpp/lists/sorting.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/cpp/lists/sorting.pxd b/python/cudf/cudf/_lib/cpp/lists/sorting.pxd index cf8aee219bb..14bfd5240d5 100644 --- a/python/cudf/cudf/_lib/cpp/lists/sorting.pxd +++ b/python/cudf/cudf/_lib/cpp/lists/sorting.pxd @@ -12,4 +12,4 @@ cdef extern from "cudf/lists/sorting.hpp" namespace "cudf::lists" nogil: const lists_column_view source_column, order column_order, null_order null_precedence - ) + ) except + From 574984aa4fa1fb5a0f865a916bb026de3f73fed0 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 19 Mar 2021 15:33:57 -0700 Subject: [PATCH 6/9] Several docstring fixes, exposing lists API docs --- docs/cudf/source/api.rst | 9 ++++++++- python/cudf/cudf/core/column/lists.py | 9 ++++----- python/cudf/cudf/core/series.py | 1 + 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/docs/cudf/source/api.rst b/docs/cudf/source/api.rst index df7e92c125d..b4ca0321073 100644 --- a/docs/cudf/source/api.rst +++ b/docs/cudf/source/api.rst @@ -20,6 +20,13 @@ Series :inherited-members: :exclude-members: serialize, deserialize, logical_not, logical_or, logical_and, remainder, sum_of_squares, fill, merge, iteritems, items, device_deserialize, device_serialize, host_deserialize, host_serialize, to_dict, tolist, to_list +Lists +----- +.. currentmodule:: cudf.core.column.lists + +.. autoclass:: ListMethods + :members: + Strings ------- .. currentmodule:: cudf.core.column.string @@ -253,4 +260,4 @@ GpuArrowReader .. currentmodule:: cudf.comm.gpuarrow .. autoclass:: GpuArrowReader :members: - :exclude-members: count, index \ No newline at end of file + :exclude-members: count, index diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index daad5e8058f..1be9168a483 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -320,13 +320,12 @@ def sort_values( Examples -------- - >>> import cudf >>> s = cudf.Series([[4, 2, None, 9], [8, 8, 2], [2, 1]]) >>> s.list.sort_values(ascending=True, na_position="last") - [2, 4, 9, None], - [2, 8, 8], - [1, 2] - type: list + 0 [2.0, 4.0, 9.0, nan] + 1 [2.0, 8.0, 8.0] + 2 [1.0, 2.0] + dtype: list """ if inplace: raise NotImplementedError("`inplace` not currently implemented.") diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 7ed2157277c..9d4643da637 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3571,6 +3571,7 @@ def sort_values( 4 3 3 4 1 5 + dtype: int64 """ if inplace: From 7b36e630b0e18d4e02dd6669d154ca4e48e7b9a0 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 19 Mar 2021 16:20:45 -0700 Subject: [PATCH 7/9] Add nested list error msg --- python/cudf/cudf/core/column/lists.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 1be9168a483..dae3d34e625 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -333,6 +333,8 @@ def sort_values( raise NotImplementedError("`kind` not currently implemented.") if na_position not in {"first", "last"}: raise ValueError(f"Unknown `na_position` value {na_position}") + if is_list_dtype(self._column.children[1].dtype): + raise NotImplementedError("Nested lists sort is not supported.") sort_order = Order.ASCENDING if ascending else Order.DESCENDING null_order = ( From e5ba902c4cf753009c59ef278be5bd534b4ca6ac Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 23 Mar 2021 09:58:13 -0700 Subject: [PATCH 8/9] style --- python/cudf/cudf/core/column/lists.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 93c61f0b2d2..1b8ddc717e5 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -7,7 +7,7 @@ import cudf from cudf._lib.copying import segmented_gather -from cudf._lib.lists import count_elements, sort_lists, extract_element +from cudf._lib.lists import count_elements, extract_element, sort_lists from cudf._lib.types import NullOrder, Order from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, as_column, column From 8acf8f0fefb496e55781fbb305e8e491058f2fb9 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 23 Mar 2021 15:53:56 -0700 Subject: [PATCH 9/9] Passing bool/str arg to cython --- python/cudf/cudf/_lib/lists.pyx | 9 +++++---- python/cudf/cudf/core/column/lists.py | 8 +------- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 7920980493d..2971aad8313 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -27,7 +27,6 @@ from cudf._lib.table cimport Table from cudf._lib.types cimport ( underlying_type_t_null_order, underlying_type_t_order ) -from cudf._lib.types import Order, NullOrder from cudf.core.dtypes import ListDtype from cudf._lib.cpp.lists.extract cimport extract_list_element @@ -67,13 +66,15 @@ def explode_outer(Table tbl, int explode_column_idx, bool ignore_index=False): ) -def sort_lists(Column col, object order_enum, object null_order_enum): +def sort_lists(Column col, bool ascending, str na_position): cdef shared_ptr[lists_column_view] list_view = ( make_shared[lists_column_view](col.view()) ) - cdef order c_sort_order = order_enum.value + cdef order c_sort_order = ( + order.ASCENDING if ascending else order.DESCENDING + ) cdef null_order c_null_prec = ( - null_order_enum.value + null_order.BEFORE if na_position == "first" else null_order.AFTER ) cdef unique_ptr[column] c_result diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 1b8ddc717e5..2204fbdea1f 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -8,7 +8,6 @@ import cudf from cudf._lib.copying import segmented_gather from cudf._lib.lists import count_elements, extract_element, sort_lists -from cudf._lib.types import NullOrder, Order from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, as_column, column from cudf.core.column.methods import ColumnMethodsMixin @@ -368,12 +367,7 @@ def sort_values( if is_list_dtype(self._column.children[1].dtype): raise NotImplementedError("Nested lists sort is not supported.") - sort_order = Order.ASCENDING if ascending else Order.DESCENDING - null_order = ( - NullOrder.BEFORE if na_position == "first" else NullOrder.AFTER - ) - return self._return_or_inplace( - sort_lists(self._column, sort_order, null_order), + sort_lists(self._column, ascending, na_position), retain_index=not ignore_index, )