Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate lists/filtering to pylibcudf #16184

Merged
merged 8 commits into from
Jul 25, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 7 additions & 34 deletions python/cudf/cudf/_lib/lists.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,7 @@ from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
from cudf._lib.pylibcudf.libcudf.lists.sorting cimport (
sort_lists as cpp_sort_lists,
)
from cudf._lib.pylibcudf.libcudf.lists.stream_compaction cimport (
distinct as cpp_distinct,
)
from cudf._lib.pylibcudf.libcudf.types cimport (
nan_equality,
null_equality,
null_order,
order,
size_type,
)
from cudf._lib.pylibcudf.libcudf.types cimport null_order, order, size_type
from cudf._lib.utils cimport columns_from_pylibcudf_table

from cudf._lib import pylibcudf
Expand Down Expand Up @@ -65,31 +56,13 @@ def explode_outer(list source_columns, int explode_column_idx):

@acquire_spill_lock()
def distinct(Column col, bool nulls_equal, bool nans_all_equal):
    """
    Return a list column with duplicate elements removed from each list.

    The work is delegated to ``pylibcudf.lists.distinct``: ``col`` is
    converted with ``to_pylibcudf(mode="read")`` and the pylibcudf result is
    wrapped back into a ``Column`` with ``Column.from_pylibcudf``.

    nulls_equal == True indicates that libcudf should treat any two nulls as
    equal, and as unequal otherwise.
    nans_all_equal == True indicates that libcudf should treat any two
    elements from {-nan, +nan} as equal, and as unequal otherwise.
    """
    # The bool -> null_equality / nan_equality translation is done inside
    # pylibcudf.lists.distinct, so the flags are forwarded unchanged.
    return Column.from_pylibcudf(
        pylibcudf.lists.distinct(
            col.to_pylibcudf(mode="read"),
            nulls_equal,
            nans_all_equal,
        )
    )


@acquire_spill_lock()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,12 @@ from cudf._lib.pylibcudf.libcudf.types cimport nan_equality, null_equality

# Cython declarations for the libcudf list stream-compaction functions.
# ``except +`` lets Cython translate C++ exceptions into Python exceptions.
cdef extern from "cudf/lists/stream_compaction.hpp" \
        namespace "cudf::lists" nogil:
    # Filter the elements of each list row using a lists column of booleans.
    cdef unique_ptr[column] apply_boolean_mask(
        const lists_column_view& lists_column,
        const lists_column_view& boolean_mask,
    ) except +

    # Remove duplicate elements within each list row. The view is taken by
    # const reference, consistent with apply_boolean_mask above.
    cdef unique_ptr[column] distinct(
        const lists_column_view& lists_column,
        null_equality nulls_equal,
        nan_equality nans_equal
    ) except +
4 changes: 4 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/lists.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,7 @@ cpdef Column contains(Column, ColumnOrScalar)
cpdef Column contains_nulls(Column)

cpdef Column index_of(Column, ColumnOrScalar, bool)

cpdef Column apply_boolean_mask(Column, Column)

cpdef Column distinct(Column, bool, bool)
77 changes: 76 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/lists.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,16 @@ from cudf._lib.pylibcudf.libcudf.lists.combine cimport (
concatenate_null_policy,
concatenate_rows as cpp_concatenate_rows,
)
from cudf._lib.pylibcudf.libcudf.lists.stream_compaction cimport (
apply_boolean_mask as cpp_apply_boolean_mask,
distinct as cpp_distinct,
)
from cudf._lib.pylibcudf.libcudf.table.table cimport table
from cudf._lib.pylibcudf.libcudf.types cimport size_type
from cudf._lib.pylibcudf.libcudf.types cimport (
nan_equality,
null_equality,
size_type,
)
from cudf._lib.pylibcudf.lists cimport ColumnOrScalar

from .column cimport Column, ListColumnView
Expand Down Expand Up @@ -206,3 +214,70 @@ cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_o
find_option,
))
return Column.from_libcudf(move(c_result))


cpdef Column apply_boolean_mask(Column input, Column boolean_mask):
    """Filters elements in each row of the input lists column using a boolean mask

    For details, see :cpp:func:`apply_boolean_mask`.

    Parameters
    ----------
    input : Column
        The input lists column.
    boolean_mask : Column
        The boolean mask, itself a lists column (it is viewed through
        ``list_view()`` just like ``input``).

    Returns
    -------
    Column
        A Column of filtered elements based upon the boolean mask.
    """
    cdef unique_ptr[column] c_result
    # Both arguments are passed to libcudf as lists_column_view objects; the
    # ListColumnView wrappers are created before the GIL is released.
    cdef ListColumnView lists_view = input.list_view()
    cdef ListColumnView mask_view = boolean_mask.list_view()

    with nogil:
        c_result = move(cpp_apply_boolean_mask(
            lists_view.view(),
            mask_view.view(),
        ))
    return Column.from_libcudf(move(c_result))


cpdef Column distinct(Column input, bool nulls_equal, bool nans_equal):
    """Create a new list column without duplicate elements in each list.

    For details, see :cpp:func:`distinct`.

    Parameters
    ----------
    input : Column
        The input column.
    nulls_equal : bool
        If true, null elements are considered equal. Otherwise, unequal.
    nans_equal : bool
        If true, libcudf will treat nan elements from {-nan, +nan}
        as equal. Otherwise, unequal.

    Returns
    -------
    Column
        A new list column without duplicate elements in each list.
    """
    cdef unique_ptr[column] c_result
    cdef ListColumnView list_view = input.list_view()

    # Translate the Python-level flags into the libcudf enums up front so
    # that only C-level values are used inside the nogil section.
    cdef null_equality c_nulls_equal = (
        null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL
    )
    cdef nan_equality c_nans_equal = (
        nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL
    )

    with nogil:
        c_result = move(cpp_distinct(
            list_view.view(),
            c_nulls_equal,
            c_nans_equal,
        ))
    return Column.from_libcudf(move(c_result))
30 changes: 30 additions & 0 deletions python/cudf/cudf/pylibcudf_tests/test_lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ def column():
return pa.array([3, 2, 5, 6]), pa.array([-1, 0, 0, 0], type=pa.int32())


@pytest.fixture
def bool_column():
    """Provide a four-row pyarrow array of boolean lists used as a mask."""
    mask_rows = [[False, True], [True], [True], [True, True]]
    return pa.array(mask_rows)


def test_concatenate_rows(test_data):
arrow_tbl = pa.Table.from_arrays(test_data[0], names=["a", "b"])
plc_tbl = plc.interop.from_arrow(arrow_tbl)
Expand Down Expand Up @@ -134,3 +139,28 @@ def test_index_of_list_column(test_data, column):
expect = pa.array(column[1], type=pa.int32())

assert_column_eq(expect, res)


def test_apply_boolean_mask(test_data, bool_column):
    """Check that apply_boolean_mask keeps only the elements the mask selects."""
    # The first list column of the shared fixture is the column to filter.
    list_column = test_data[0][0]
    arr = pa.array(list_column)
    plc_column = plc.interop.from_arrow(arr)
    plc_bool_column = plc.interop.from_arrow(bool_column)

    res = plc.lists.apply_boolean_mask(plc_column, plc_bool_column)

    expect = pa.array([[1], [2], [5], [6, 7]])

    assert_column_eq(expect, res)


def test_distinct():
    """Check that distinct drops duplicates per list, with nulls and NaNs equal."""
    list_column = [[0, 1, 2, 3, 2], [3, 1, 2], None, [4, None, None, 5]]
    arr = pa.array(list_column)
    plc_column = plc.interop.from_arrow(arr)

    # Both flags are True: nulls compare equal to each other, as do NaNs.
    res = plc.lists.distinct(plc_column, True, True)

    # The null row is preserved, and the repeated null inside the last list
    # collapses to a single null.
    expect = pa.array([[0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]])

    assert_column_eq(expect, res)
Loading