diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 656d92c1a4b..5d406f5c85f 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -9,11 +9,6 @@ from libcpp.utility cimport move from cudf._lib.column cimport Column from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.lists.combine cimport ( - concatenate_list_elements as cpp_concatenate_list_elements, - concatenate_null_policy, - concatenate_rows as cpp_concatenate_rows, -) from cudf._lib.pylibcudf.libcudf.lists.contains cimport ( contains, index_of as cpp_index_of, @@ -32,7 +27,6 @@ from cudf._lib.pylibcudf.libcudf.lists.stream_compaction cimport ( distinct as cpp_distinct, ) from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view from cudf._lib.pylibcudf.libcudf.types cimport ( nan_equality, null_equality, @@ -41,10 +35,7 @@ from cudf._lib.pylibcudf.libcudf.types cimport ( size_type, ) from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport ( - columns_from_pylibcudf_table, - table_view_from_columns, -) +from cudf._lib.utils cimport columns_from_pylibcudf_table from cudf._lib import pylibcudf @@ -223,31 +214,20 @@ def index_of_column(Column col, Column search_keys): @acquire_spill_lock() def concatenate_rows(list source_columns): - cdef unique_ptr[column] c_result - - cdef table_view c_table_view = table_view_from_columns(source_columns) - - with nogil: - c_result = move(cpp_concatenate_rows( - c_table_view, - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + pylibcudf.lists.concatenate_rows( + pylibcudf.Table([ + c.to_pylibcudf(mode="read") for c in source_columns + ]) + ) + ) @acquire_spill_lock() def concatenate_list_elements(Column input_column, dropna=False): - cdef concatenate_null_policy policy = ( - 
concatenate_null_policy.IGNORE if dropna - else concatenate_null_policy.NULLIFY_OUTPUT_ROW + return Column.from_pylibcudf( + pylibcudf.lists.concatenate_list_elements( + input_column.to_pylibcudf(mode="read"), + dropna, + ) ) - cdef column_view c_input = input_column.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_concatenate_list_elements( - c_input, - policy - )) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd index b780d299977..2d2a5b2a9ea 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd @@ -1,8 +1,15 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp cimport bool + from cudf._lib.pylibcudf.libcudf.types cimport size_type +from .column cimport Column from .table cimport Table cpdef Table explode_outer(Table, size_type explode_column_idx) + +cpdef Column concatenate_rows(Table) + +cpdef Column concatenate_list_elements(Column, bool dropna) diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx index 654f39742b6..069c9da31c2 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx @@ -1,12 +1,20 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
+from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move +from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.lists cimport explode as cpp_explode +from cudf._lib.pylibcudf.libcudf.lists.combine cimport ( + concatenate_list_elements as cpp_concatenate_list_elements, + concatenate_null_policy, + concatenate_rows as cpp_concatenate_rows, +) from cudf._lib.pylibcudf.libcudf.table.table cimport table from cudf._lib.pylibcudf.libcudf.types cimport size_type +from .column cimport Column from .table cimport Table @@ -33,3 +41,56 @@ cpdef Table explode_outer(Table input, size_type explode_column_idx): c_result = move(cpp_explode.explode_outer(input.view(), explode_column_idx)) return Table.from_libcudf(move(c_result)) + + +cpdef Column concatenate_rows(Table input): + """Concatenate multiple lists columns into a single lists column row-wise. + + Parameters + ---------- + input : Table + The input table + + Returns + ------- + Column + A new Column of concatenated rows + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_concatenate_rows(input.view())) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column concatenate_list_elements(Column input, bool dropna): + """Concatenate multiple lists on the same row into a single list. + + Parameters + ---------- + input : Column + The input column + dropna : bool + If true, null list elements will be ignored + from concatenation. Otherwise any input null values will result in + the corresponding output row being set to null. + + Returns + ------- + Column + A new Column of concatenated list elements 
+ """ + cdef concatenate_null_policy null_policy = ( + concatenate_null_policy.IGNORE if dropna + else concatenate_null_policy.NULLIFY_OUTPUT_ROW + ) + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_concatenate_list_elements( + input.view(), + null_policy, + )) + + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py new file mode 100644 index 00000000000..b21af8ea11c --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py @@ -0,0 +1,46 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +from cudf._lib import pylibcudf as plc + + +def test_concatenate_rows(): + test_data = [[[0, 1], [2], [5], [6, 7]], [[8], [9], [], [13, 14, 15]]] + + arrow_tbl = pa.Table.from_arrays(test_data, names=["a", "b"]) + plc_tbl = plc.interop.from_arrow(arrow_tbl) + + res = plc.lists.concatenate_rows(plc_tbl) + + expect = pa.array([pair[0] + pair[1] for pair in zip(*test_data)]) + + assert_column_eq(expect, res) + + +@pytest.mark.parametrize( + "test_data, dropna, expected", + [ + ( + [[[1, 2], [3, 4], [5]], [[6], None, [7, 8, 9]]], + False, + [[1, 2, 3, 4, 5], None], + ), + ( + [[[1, 2], [3, 4], [5, None]], [[6], [None], [7, 8, 9]]], + True, + [[1, 2, 3, 4, 5, None], [6, None, 7, 8, 9]], + ), + ], +) +def test_concatenate_list_elements(test_data, dropna, expected): + arr = pa.array(test_data) + plc_column = plc.interop.from_arrow(arr) + + res = plc.lists.concatenate_list_elements(plc_column, dropna) + + expect = pa.array(expected) + + assert_column_eq(expect, res)