From 5989bdd27831dc4e3bced9573c1c4cf977432c4a Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 2 Jul 2024 14:22:39 -0700 Subject: [PATCH 1/4] Initial commit --- python/cudf/cudf/_lib/lists.pyx | 41 ++-------- .../libcudf/lists/stream_compaction.pxd | 6 +- python/cudf/cudf/_lib/pylibcudf/lists.pxd | 4 + python/cudf/cudf/_lib/pylibcudf/lists.pyx | 77 ++++++++++++++++++- .../cudf/cudf/pylibcudf_tests/test_lists.py | 30 ++++++++ 5 files changed, 122 insertions(+), 36 deletions(-) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 0ad09dba717..9e1f2e619de 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -19,16 +19,7 @@ from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( from cudf._lib.pylibcudf.libcudf.lists.sorting cimport ( sort_lists as cpp_sort_lists, ) -from cudf._lib.pylibcudf.libcudf.lists.stream_compaction cimport ( - distinct as cpp_distinct, -) -from cudf._lib.pylibcudf.libcudf.types cimport ( - nan_equality, - null_equality, - null_order, - order, - size_type, -) +from cudf._lib.pylibcudf.libcudf.types cimport null_order, order, size_type from cudf._lib.utils cimport columns_from_pylibcudf_table from cudf._lib import pylibcudf @@ -65,31 +56,13 @@ def explode_outer(list source_columns, int explode_column_idx): @acquire_spill_lock() def distinct(Column col, bool nulls_equal, bool nans_all_equal): - """ - nulls_equal == True indicates that libcudf should treat any two nulls as - equal, and as unequal otherwise. - nans_all_equal == True indicates that libcudf should treat any two - elements from {-nan, +nan} as equal, and as unequal otherwise. 
- """ - cdef shared_ptr[lists_column_view] list_view = ( - make_shared[lists_column_view](col.view()) - ) - cdef null_equality c_nulls_equal = ( - null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL - ) - cdef nan_equality c_nans_equal = ( - nan_equality.ALL_EQUAL if nans_all_equal else nan_equality.UNEQUAL - ) - - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_distinct(list_view.get()[0], - c_nulls_equal, - c_nans_equal) + return Column.from_pylibcudf( + pylibcudf.lists.distinct( + col.to_pylibcudf(mode="read"), + nulls_equal, + nans_all_equal, ) - return Column.from_unique_ptr(move(c_result)) + ) @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/stream_compaction.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/stream_compaction.pxd index 22b91df7192..e9437026c7d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/stream_compaction.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/stream_compaction.pxd @@ -11,8 +11,12 @@ from cudf._lib.pylibcudf.libcudf.types cimport nan_equality, null_equality cdef extern from "cudf/lists/stream_compaction.hpp" \ namespace "cudf::lists" nogil: + cdef unique_ptr[column] apply_boolean_mask( + const lists_column_view& lists_column, + const lists_column_view& boolean_mask, + ) except + cdef unique_ptr[column] distinct( - const lists_column_view lists_column, + const lists_column_view& lists_column, null_equality nulls_equal, nan_equality nans_equal ) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd index 2ccf0139e90..eb5ac484e99 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd @@ -23,3 +23,7 @@ cpdef Column contains(Column, ColumnOrScalar) cpdef Column contains_nulls(Column) cpdef Column index_of(Column, ColumnOrScalar, bool) + +cpdef Column apply_boolean_mask(Column, Column) + +cpdef Column distinct(Column, bool, bool) diff --git 
a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx index a94d940accd..7add4ec0c82 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx @@ -15,8 +15,16 @@ from cudf._lib.pylibcudf.libcudf.lists.combine cimport ( concatenate_null_policy, concatenate_rows as cpp_concatenate_rows, ) +from cudf._lib.pylibcudf.libcudf.lists.stream_compaction cimport ( + apply_boolean_mask as cpp_apply_boolean_mask, + distinct as cpp_distinct, +) from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.types cimport size_type +from cudf._lib.pylibcudf.libcudf.types cimport ( + nan_equality, + null_equality, + size_type, +) from cudf._lib.pylibcudf.lists cimport ColumnOrScalar from .column cimport Column, ListColumnView @@ -206,3 +214,70 @@ cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_o find_option, )) return Column.from_libcudf(move(c_result)) + + +cpdef Column apply_boolean_mask(Column input, Column boolean_mask): + """Filters elements in each row of the input lists column using a boolean mask + + For details, see :cpp:func:`apply_boolean_mask`. + + Parameters + ---------- + input : Column + The input column. + boolean_mask : Column + The boolean mask. + + Returns + ------- + Column + A Column of filtered elements based upon the boolean mask. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView list_view1 = input.list_view() + cdef ListColumnView list_view2 = boolean_mask.list_view() + with nogil: + c_result = move(cpp_apply_boolean_mask( + list_view1.view(), + list_view2.view(), + )) + return Column.from_libcudf(move(c_result)) + + +cpdef Column distinct(Column input, bool nulls_equal, bool nans_equal): + """Create a new list column without duplicate elements in each list. + + For details, see :cpp:func:`distinct`. + + Parameters + ---------- + input : Column + The input column. 
+ nulls_equal : bool + If true, null elements are considered equal. Otherwise, unequal. + nans_equal : bool + If true, libcudf will treat nan elements from {-nan, +nan} + as equal. Otherwise, unequal. + + Returns + ------- + Column + A new list column without duplicate elements in each list. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView list_view = input.list_view() + + cdef null_equality c_nulls_equal = ( + null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL + ) + cdef nan_equality c_nans_equal = ( + nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL + ) + + with nogil: + c_result = move(cpp_distinct( + list_view.view(), + c_nulls_equal, + c_nans_equal, + )) + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py index c781126e388..246e9eac670 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_lists.py +++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py @@ -22,6 +22,11 @@ def column(): return pa.array([3, 2, 5, 6]), pa.array([-1, 0, 0, 0], type=pa.int32()) +@pytest.fixture +def bool_column(): + return pa.array([[False, True], [True], [True], [True, True]]) + + def test_concatenate_rows(test_data): arrow_tbl = pa.Table.from_arrays(test_data[0], names=["a", "b"]) plc_tbl = plc.interop.from_arrow(arrow_tbl) @@ -134,3 +139,28 @@ def test_index_of_list_column(test_data, column): expect = pa.array(column[1], type=pa.int32()) assert_column_eq(expect, res) + + +def test_apply_boolean_mask(test_data, bool_column): + list_column = test_data[0][0] + arr = pa.array(list_column) + plc_column = plc.interop.from_arrow(arr) + plc_bool_column = plc.interop.from_arrow(bool_column) + + res = plc.lists.apply_boolean_mask(plc_column, plc_bool_column) + + expect = pa.array([[1], [2], [5], [6, 7]]) + + assert_column_eq(expect, res) + + +def test_distinct(): + list_column = [[0, 1, 2, 3, 2], [3, 1, 2], None, [4, None, None, 5]] + 
arr = pa.array(list_column) + plc_column = plc.interop.from_arrow(arr) + + res = plc.lists.distinct(plc_column, True, True) + + expect = pa.array([[0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]]) + + assert_column_eq(expect, res) From df7f143dbbacce0bbf1df5b4b4b6e6ed80718c7a Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 15 Jul 2024 08:17:04 -0700 Subject: [PATCH 2/4] Change names to list_view and mask_view --- python/cudf/cudf/_lib/pylibcudf/lists.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx index 8030ce9ad2f..ca28a546752 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx @@ -320,12 +320,12 @@ cpdef Column apply_boolean_mask(Column input, Column boolean_mask): A Column of filtered elements based upon the boolean mask. """ cdef unique_ptr[column] c_result - cdef ListColumnView list_view1 = input.list_view() - cdef ListColumnView list_view2 = boolean_mask.list_view() + cdef ListColumnView list_view = input.list_view() + cdef ListColumnView mask_view = boolean_mask.list_view() with nogil: c_result = move(cpp_apply_boolean_mask( - list_view1.view(), - list_view2.view(), + list_view.view(), + mask_view.view(), )) return Column.from_libcudf(move(c_result)) From e87e2278fafbe73235861061e5aa13962d2eda67 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 23 Jul 2024 10:49:19 -0700 Subject: [PATCH 3/4] Address review --- .../cudf/cudf/pylibcudf_tests/test_lists.py | 29 +++++++++++++------ 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py index 4ebb10382d7..27b7b6390d2 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_lists.py +++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py @@ -23,7 +23,7 @@ def scalar(): @pytest.fixture -def column(): +def search_key_column(): return pa.array([3, 
2, 5, 6]), pa.array([-1, 0, 0, 0], type=pa.int32()) @@ -86,9 +86,9 @@ def test_contains_scalar(list_column, scalar): assert_column_eq(expect, res) -def test_contains_list_column(list_column, column): +def test_contains_list_column(list_column, search_key_column): list_column1 = list_column - list_column2, _ = column + list_column2, _ = search_key_column arr1 = pa.array(list_column1) arr2 = pa.array(list_column2) @@ -136,14 +136,14 @@ def test_index_of_scalar(list_column, scalar): assert_column_eq(expect, res) -def test_index_of_list_column(list_column, column): +def test_index_of_list_column(list_column, search_key_column): arr1 = pa.array(list_column) - arr2, expect = column + arr2, expect = search_key_column plc_column1 = plc.interop.from_arrow(arr1) plc_column2 = plc.interop.from_arrow(arr2) res = plc.lists.index_of(plc_column1, plc_column2, True) - expect = pa.array(column[1], type=pa.int32()) + expect = pa.array(search_key_column[1], type=pa.int32()) assert_column_eq(expect, res) @@ -213,14 +213,25 @@ def test_apply_boolean_mask(list_column, bool_column): assert_column_eq(expect, res) -def test_distinct(): +@pytest.mark.parametrize( + "nans_equal,nulls_equal,expected", + [ + (True, True, [[0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]]), + ( + False, + True, + [[0, 1, 2, 3], [3, 1, 2], None, [4, None, None, 5]], + ), + ], +) +def test_distinct(nans_equal, nulls_equal, expected): list_column = [[0, 1, 2, 3, 2], [3, 1, 2], None, [4, None, None, 5]] arr = pa.array(list_column) plc_column = plc.interop.from_arrow(arr) - res = plc.lists.distinct(plc_column, True, True) + res = plc.lists.distinct(plc_column, nans_equal, nulls_equal) - expect = pa.array([[0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]]) + expect = pa.array(expected) assert_column_eq(expect, res) From 70b6b2f23787fb963bf99db80e3cbf2d2cb66508 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 23 Jul 2024 12:36:28 -0700 Subject: [PATCH 4/4] parameterize nans_equal --- 
.../cudf/cudf/pylibcudf_tests/test_lists.py | 29 ++++++++++++++++--- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py index 27b7b6390d2..a48d60776c3 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_lists.py +++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +import numpy as np import pyarrow as pa import pytest from utils import assert_column_eq @@ -216,16 +217,36 @@ def test_apply_boolean_mask(list_column, bool_column): @pytest.mark.parametrize( "nans_equal,nulls_equal,expected", [ - (True, True, [[0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]]), + (True, True, [[np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]]), ( False, True, - [[0, 1, 2, 3], [3, 1, 2], None, [4, None, None, 5]], + [[np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, None, 5]], + ), + ( + True, + False, + [[np.nan, np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]], + ), + ( + False, + False, + [ + [np.nan, np.nan, 0, 1, 2, 3], + [3, 1, 2], + None, + [4, None, None, 5], + ], ), ], ) -def test_distinct(nans_equal, nulls_equal, expected): - list_column = [[0, 1, 2, 3, 2], [3, 1, 2], None, [4, None, None, 5]] +def test_distinct(list_column, nans_equal, nulls_equal, expected): + list_column = [ + [np.nan, np.nan, 0, 1, 2, 3, 2], + [3, 1, 2], + None, + [4, None, None, 5], + ] arr = pa.array(list_column) plc_column = plc.interop.from_arrow(arr)