Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate lists/filtering to pylibcudf #16184

Merged
merged 8 commits into from
Jul 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 7 additions & 39 deletions python/cudf/cudf/_lib/lists.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,9 @@
from cudf.core.buffer import acquire_spill_lock

from libcpp cimport bool
from libcpp.memory cimport make_shared, shared_ptr, unique_ptr
from libcpp.utility cimport move

from cudf._lib.column cimport Column
from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
lists_column_view,
)
from cudf._lib.pylibcudf.libcudf.lists.stream_compaction cimport (
distinct as cpp_distinct,
)
from cudf._lib.pylibcudf.libcudf.types cimport (
nan_equality,
null_equality,
null_order,
size_type,
)
from cudf._lib.pylibcudf.libcudf.types cimport null_order, size_type
from cudf._lib.utils cimport columns_from_pylibcudf_table

from cudf._lib import pylibcudf
Expand Down Expand Up @@ -47,31 +33,13 @@ def explode_outer(list source_columns, int explode_column_idx):

@acquire_spill_lock()
def distinct(Column col, bool nulls_equal, bool nans_all_equal):
"""
nulls_equal == True indicates that libcudf should treat any two nulls as
equal, and as unequal otherwise.
nans_all_equal == True indicates that libcudf should treat any two
elements from {-nan, +nan} as equal, and as unequal otherwise.
"""
cdef shared_ptr[lists_column_view] list_view = (
make_shared[lists_column_view](col.view())
)
cdef null_equality c_nulls_equal = (
null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL
)
cdef nan_equality c_nans_equal = (
nan_equality.ALL_EQUAL if nans_all_equal else nan_equality.UNEQUAL
)

cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_distinct(list_view.get()[0],
c_nulls_equal,
c_nans_equal)
return Column.from_pylibcudf(
pylibcudf.lists.distinct(
col.to_pylibcudf(mode="read"),
nulls_equal,
nans_all_equal,
)
return Column.from_unique_ptr(move(c_result))
)


@acquire_spill_lock()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,13 @@ from cudf._lib.pylibcudf.libcudf.types cimport nan_equality, null_equality

cdef extern from "cudf/lists/stream_compaction.hpp" \
namespace "cudf::lists" nogil:
cdef unique_ptr[column] apply_boolean_mask(
const lists_column_view& lists_column,
const lists_column_view& boolean_mask,
) except +
Matt711 marked this conversation as resolved.
Show resolved Hide resolved

cdef unique_ptr[column] distinct(
const lists_column_view lists_column,
const lists_column_view& lists_column,
null_equality nulls_equal,
nan_equality nans_equal
) except +
4 changes: 4 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/lists.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,7 @@ cpdef Column have_overlap(Column, Column, bool nulls_equal=*, bool nans_equal=*)
cpdef Column intersect_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*)

cpdef Column union_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*)

cpdef Column apply_boolean_mask(Column, Column)

cpdef Column distinct(Column, bool, bool)
71 changes: 71 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/lists.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ from cudf._lib.pylibcudf.libcudf.lists.sorting cimport (
sort_lists as cpp_sort_lists,
stable_sort_lists as cpp_stable_sort_lists,
)
from cudf._lib.pylibcudf.libcudf.lists.stream_compaction cimport (
apply_boolean_mask as cpp_apply_boolean_mask,
distinct as cpp_distinct,
)
from cudf._lib.pylibcudf.libcudf.table.table cimport table
from cudf._lib.pylibcudf.libcudf.types cimport (
nan_equality,
Expand Down Expand Up @@ -614,3 +618,70 @@ cpdef Column union_distinct(
c_nans_equal,
))
return Column.from_libcudf(move(c_result))


cpdef Column apply_boolean_mask(Column input, Column boolean_mask):
    """Filter the elements of each list row using a boolean mask.

    Both arguments are viewed as lists columns and handed to libcudf.

    For details, see :cpp:func:`apply_boolean_mask`.

    Parameters
    ----------
    input : Column
        The input column.
    boolean_mask : Column
        The boolean mask.

    Returns
    -------
    Column
        A Column of filtered elements based upon the boolean mask.
    """
    cdef ListColumnView input_lists = input.list_view()
    cdef ListColumnView mask_lists = boolean_mask.list_view()
    cdef unique_ptr[column] filtered

    # The libcudf call is made with the GIL released.
    with nogil:
        filtered = move(
            cpp_apply_boolean_mask(input_lists.view(), mask_lists.view())
        )

    return Column.from_libcudf(move(filtered))


cpdef Column distinct(Column input, bool nulls_equal, bool nans_equal):
    """Create a new list column without duplicate elements in each list.

    The two boolean flags are translated to the libcudf ``null_equality``
    and ``nan_equality`` enums before the call.

    For details, see :cpp:func:`distinct`.

    Parameters
    ----------
    input : Column
        The input column.
    nulls_equal : bool
        If true, null elements are considered equal. Otherwise, unequal.
    nans_equal : bool
        If true, libcudf will treat nan elements from {-nan, +nan}
        as equal. Otherwise, unequal.

    Returns
    -------
    Column
        A new list column without duplicate elements in each list.
    """
    cdef ListColumnView input_lists = input.list_view()
    cdef null_equality null_policy
    cdef nan_equality nan_policy
    cdef unique_ptr[column] deduplicated

    if nulls_equal:
        null_policy = null_equality.EQUAL
    else:
        null_policy = null_equality.UNEQUAL

    if nans_equal:
        nan_policy = nan_equality.ALL_EQUAL
    else:
        nan_policy = nan_equality.UNEQUAL

    # The libcudf call is made with the GIL released.
    with nogil:
        deduplicated = move(
            cpp_distinct(input_lists.view(), null_policy, nan_policy)
        )

    return Column.from_libcudf(move(deduplicated))
94 changes: 70 additions & 24 deletions python/cudf/cudf/pylibcudf_tests/test_lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,26 @@ def test_data():
return [[[[0, 1], [2], [5], [6, 7]], [[8], [9], [], [13, 14, 15]]]]


@pytest.fixture
def list_column():
    """Provide four rows of integer lists as plain Python data."""
    rows = [[0, 1], [2], [5], [6, 7]]
    return rows


@pytest.fixture
def scalar():
    """Provide a pyarrow scalar holding the integer 1."""
    one = pa.scalar(1)
    return one


@pytest.fixture
def column():
def search_key_column():
return pa.array([3, 2, 5, 6]), pa.array([-1, 0, 0, 0], type=pa.int32())


@pytest.fixture
def bool_column():
    """Provide a pyarrow array of boolean lists with row lengths 2, 1, 1, 2."""
    mask_rows = [[False, True], [True], [True], [True, True]]
    return pa.array(mask_rows)


@pytest.fixture
def set_lists_column():
lhs = [[np.nan, np.nan, 2, 1, 2], [1, 2, 3], None, [4, None, 5]]
Expand Down Expand Up @@ -72,8 +82,7 @@ def test_concatenate_list_elements(test_data, dropna, expected):
assert_column_eq(expect, res)


def test_contains_scalar(test_data, scalar):
list_column = test_data[0][0]
def test_contains_scalar(list_column, scalar):
arr = pa.array(list_column)

plc_column = plc.interop.from_arrow(arr)
Expand All @@ -85,17 +94,17 @@ def test_contains_scalar(test_data, scalar):
assert_column_eq(expect, res)


def test_contains_list_column(test_data):
list_column1 = test_data[0][0]
list_column2 = [1, 3, 5, 1]
def test_contains_list_column(list_column, search_key_column):
list_column1 = list_column
list_column2, _ = search_key_column
arr1 = pa.array(list_column1)
arr2 = pa.array(list_column2)

plc_column1 = plc.interop.from_arrow(arr1)
plc_column2 = plc.interop.from_arrow(arr2)
res = plc.lists.contains(plc_column1, plc_column2)

expect = pa.array([True, False, True, False])
expect = pa.array([False, True, True, True])

assert_column_eq(expect, res)

Expand Down Expand Up @@ -123,8 +132,7 @@ def test_contains_nulls(list_column, expected):
assert_column_eq(expect, res)


def test_index_of_scalar(test_data, scalar):
list_column = test_data[0][0]
def test_index_of_scalar(list_column, scalar):
arr = pa.array(list_column)

plc_column = plc.interop.from_arrow(arr)
Expand All @@ -136,21 +144,19 @@ def test_index_of_scalar(test_data, scalar):
assert_column_eq(expect, res)


def test_index_of_list_column(test_data, column):
list_column = test_data[0][0]
def test_index_of_list_column(list_column, search_key_column):
arr1 = pa.array(list_column)
arr2, expect = column
arr2, expect = search_key_column
plc_column1 = plc.interop.from_arrow(arr1)
plc_column2 = plc.interop.from_arrow(arr2)
res = plc.lists.index_of(plc_column1, plc_column2, True)

expect = pa.array(column[1], type=pa.int32())
expect = pa.array(search_key_column[1], type=pa.int32())

assert_column_eq(expect, res)


def test_reverse(test_data):
list_column = test_data[0][0]
def test_reverse(list_column):
arr = pa.array(list_column)
plc_column = plc.interop.from_arrow(arr)

Expand All @@ -162,8 +168,7 @@ def test_reverse(test_data):


def test_segmented_gather(test_data):
list_column1 = test_data[0][0]
list_column2 = test_data[0][1]
list_column1, list_column2 = test_data[0]

plc_column1 = plc.interop.from_arrow(pa.array(list_column1))
plc_column2 = plc.interop.from_arrow(pa.array(list_column2))
Expand All @@ -175,19 +180,17 @@ def test_segmented_gather(test_data):
assert_column_eq(expect, res)


def test_extract_list_element_scalar(test_data):
arr = pa.array(test_data[0][0])
plc_column = plc.interop.from_arrow(arr)
def test_extract_list_element_scalar(list_column):
plc_column = plc.interop.from_arrow(pa.array(list_column))

res = plc.lists.extract_list_element(plc_column, 0)
expect = pa.compute.list_element(test_data[0][0], 0)
expect = pa.compute.list_element(list_column, 0)

assert_column_eq(expect, res)


def test_extract_list_element_column(test_data):
arr = pa.array(test_data[0][0])
plc_column = plc.interop.from_arrow(arr)
def test_extract_list_element_column(list_column):
plc_column = plc.interop.from_arrow(pa.array(list_column))
indices = plc.interop.from_arrow(pa.array([0, 1, -4, -1]))

res = plc.lists.extract_list_element(plc_column, indices)
Expand Down Expand Up @@ -343,3 +346,46 @@ def test_set_operations(
else:
expect = pa.array(expected)
assert_column_eq(expect, res)


# NOTE: the flag names follow the positional order of
# ``plc.lists.distinct(input, nulls_equal, nans_equal)``. The first flag
# controls whether duplicate nulls collapse (last row), the second whether
# duplicate NaNs collapse (first row).
@pytest.mark.parametrize(
    "nulls_equal,nans_equal,expected",
    [
        (True, True, [[np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]]),
        (
            False,
            True,
            [[np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, None, 5]],
        ),
        (
            True,
            False,
            [[np.nan, np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]],
        ),
        (
            False,
            False,
            [
                [np.nan, np.nan, 0, 1, 2, 3],
                [3, 1, 2],
                None,
                [4, None, None, 5],
            ],
        ),
    ],
)
def test_distinct(nulls_equal, nans_equal, expected):
    """Check ``plc.lists.distinct`` for every null/NaN equality combination.

    The input has duplicate NaNs and a duplicate ``2`` in the first row, a
    null row, and duplicate nulls in the last row, so each flag's effect is
    visible in ``expected``.
    """
    # Built locally: the test needs NaN/null duplicates, which the shared
    # ``list_column`` fixture does not contain.
    lists_with_duplicates = [
        [np.nan, np.nan, 0, 1, 2, 3, 2],
        [3, 1, 2],
        None,
        [4, None, None, 5],
    ]
    plc_column = plc.interop.from_arrow(pa.array(lists_with_duplicates))

    res = plc.lists.distinct(plc_column, nulls_equal, nans_equal)

    expect = pa.array(expected)

    assert_column_eq(expect, res)
Loading