From 047cc85ad3dffa3732f21c0d1ec2086e9390f501 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 14 Feb 2024 20:18:06 -0800 Subject: [PATCH 1/3] Implement search in pylibcudf --- .../user_guide/api_docs/pylibcudf/index.rst | 1 + .../user_guide/api_docs/pylibcudf/search.rst | 6 + .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 1 + python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 2 + python/cudf/cudf/_lib/pylibcudf/__init__.py | 2 + python/cudf/cudf/_lib/pylibcudf/search.pxd | 26 ++++ python/cudf/cudf/_lib/pylibcudf/search.pyx | 117 ++++++++++++++++++ 7 files changed, 155 insertions(+) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/search.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/search.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/search.pyx diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 73f63ae1343..2e5b3916c65 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -21,6 +21,7 @@ This page provides API documentation for pylibcudf. reduce rolling scalar + search stream_compaction sorting replace diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/search.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/search.rst new file mode 100644 index 00000000000..aa57bcd9d92 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/search.rst @@ -0,0 +1,6 @@ +====== +search +====== + +.. 
automodule:: cudf._lib.pylibcudf.search + :members: diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 68e6765cc49..fd749a5edc1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -28,6 +28,7 @@ set(cython_sources replace.pyx rolling.pyx scalar.pyx + search.pyx stream_compaction.pyx sorting.pyx table.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 5ef10fb2ffc..96aa42cc257 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -14,6 +14,7 @@ from . cimport ( reduce, replace, rolling, + search, sorting, stream_compaction, types, @@ -45,6 +46,7 @@ __all__ = [ "reduce", "replace", "rolling", + "search", "stream_compaction", "sorting", "types", diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 4689c49fdb1..19cc782dd92 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -13,6 +13,7 @@ reduce, replace, rolling, + search, sorting, stream_compaction, types, @@ -43,6 +44,7 @@ "reduce", "replace", "rolling", + "search", "stream_compaction", "sorting", "types", diff --git a/python/cudf/cudf/_lib/pylibcudf/search.pxd b/python/cudf/cudf/_lib/pylibcudf/search.pxd new file mode 100644 index 00000000000..2f5063825f7 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/search.pxd @@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +# from libcpp cimport bool +# +# from cudf._lib.cpp.aggregation cimport rank_method +# from cudf._lib.cpp.types cimport null_order, null_policy, order, size_type + +from .column cimport Column +from .table cimport Table + + +cpdef Column lower_bound( + Table haystack, + Table needles, + list column_order, + list null_precedence, +) + +cpdef Column upper_bound( + Table haystack, + Table needles, + list column_order, + list null_precedence, +) + +cpdef Column contains(Column haystack, Column needles) diff --git a/python/cudf/cudf/_lib/pylibcudf/search.pyx b/python/cudf/cudf/_lib/pylibcudf/search.pyx new file mode 100644 index 00000000000..080200f596b --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/search.pyx @@ -0,0 +1,117 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from libcpp.vector cimport vector + +from cudf._lib.cpp cimport search as cpp_search +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.table.table cimport table +from cudf._lib.cpp.types cimport null_order, order + +from .column cimport Column +from .table cimport Table + + +cpdef Column lower_bound( + Table haystack, + Table needles, + list column_order, + list null_precedence, +): + """Find smallest indices in haystack where needles may be inserted to retain order. + + Parameters + ---------- + haystack : Table + The search space. + needles : Table + The values for which to find insertion points. + column_order : List[Order] + Whether each column should be sorted in ascending or descending order. + null_precedence : List[NullOrder] + Whether nulls should come before or after non-nulls. 
+ + Returns + ------- + Column + The insertion points + """ + cdef unique_ptr[column] c_result + cdef vector[order] c_orders = column_order + cdef vector[null_order] c_null_precedence = null_precedence + with nogil: + c_result = move( + cpp_search.lower_bound( + haystack.view(), + needles.view(), + c_orders, + c_null_precedence, + ) + ) + return Column.from_libcudf(move(c_result)) + + +cpdef Column upper_bound( + Table haystack, + Table needles, + list column_order, + list null_precedence, +): + """Find largest indices in haystack where needles may be inserted to retain order. + + Parameters + ---------- + haystack : Table + The search space. + needles : Table + The values for which to find insertion points. + column_order : List[Order] + Whether each column should be sorted in ascending or descending order. + null_precedence : List[NullOrder] + Whether nulls should come before or after non-nulls. + + Returns + ------- + Column + The insertion points + """ + cdef unique_ptr[column] c_result + cdef vector[order] c_orders = column_order + cdef vector[null_order] c_null_precedence = null_precedence + with nogil: + c_result = move( + cpp_search.upper_bound( + haystack.view(), + needles.view(), + c_orders, + c_null_precedence, + ) + ) + return Column.from_libcudf(move(c_result)) + + +cpdef Column contains(Column haystack, Column needles): + """Check whether needles are present in haystack. + + Parameters + ---------- + haystack : Column + The search space. + needles : Column + The values for which to search. + + Returns + ------- + Column + Boolean indicator for each needle. 
+ """ + cdef unique_ptr[column] c_result + with nogil: + c_result = move( + cpp_search.contains( + haystack.view(), + needles.view(), + ) + ) + return Column.from_libcudf(move(c_result)) From 2912baf312bb57f6e3e26e4e75c93d406a12d1be Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 14 Feb 2024 20:37:49 -0800 Subject: [PATCH 2/3] Use pylibcudf for search --- python/cudf/cudf/_lib/pylibcudf/search.pyx | 1 - python/cudf/cudf/_lib/search.pyx | 91 +++++++--------------- 2 files changed, 29 insertions(+), 63 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/search.pyx b/python/cudf/cudf/_lib/pylibcudf/search.pyx index 080200f596b..a186167af13 100644 --- a/python/cudf/cudf/_lib/pylibcudf/search.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/search.pyx @@ -6,7 +6,6 @@ from libcpp.vector cimport vector from cudf._lib.cpp cimport search as cpp_search from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.types cimport null_order, order from .column cimport Column diff --git a/python/cudf/cudf/_lib/search.pyx b/python/cudf/cudf/_lib/search.pyx index fef3a08c6d7..1ee73949fd3 100644 --- a/python/cudf/cudf/_lib/search.pyx +++ b/python/cudf/cudf/_lib/search.pyx @@ -1,18 +1,10 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move -from libcpp.vector cimport vector - -cimport cudf._lib.cpp.search as cpp_search -cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.column cimport Column -from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.utils cimport table_view_from_columns + +from cudf._lib import pylibcudf @acquire_spill_lock() @@ -31,50 +23,31 @@ def search_sorted( If 'left', the index of the first suitable location is given. If 'right', return the last such index """ - cdef unique_ptr[column] c_result - cdef vector[libcudf_types.order] c_column_order - cdef vector[libcudf_types.null_order] c_null_precedence - cdef libcudf_types.order c_order - cdef libcudf_types.null_order c_null_order - cdef table_view c_table_data = table_view_from_columns(source) - cdef table_view c_values_data = table_view_from_columns(values) - # Note: We are ignoring index columns here - c_order = (libcudf_types.order.ASCENDING - if ascending - else libcudf_types.order.DESCENDING) - c_null_order = ( - libcudf_types.null_order.AFTER - if na_position=="last" - else libcudf_types.null_order.BEFORE + column_order = [ + pylibcudf.types.Order.ASCENDING + if ascending + else pylibcudf.types.Order.DESCENDING + ] * len(source) + null_precedence = [ + pylibcudf.types.NullOrder.AFTER + if na_position == "last" + else pylibcudf.types.NullOrder.BEFORE + ] * len(source) + + func = getattr( + pylibcudf.search, + "lower_bound" if side == "left" else "upper_bound", ) - c_column_order = vector[libcudf_types.order](len(source), c_order) - c_null_precedence = vector[libcudf_types.null_order]( - len(source), c_null_order + return Column.from_pylibcudf( + func( + pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source]), + pylibcudf.Table([c.to_pylibcudf(mode="read") for c in 
values]), + column_order, + null_precedence, + ) ) - if side == 'left': - with nogil: - c_result = move( - cpp_search.lower_bound( - c_table_data, - c_values_data, - c_column_order, - c_null_precedence, - ) - ) - elif side == 'right': - with nogil: - c_result = move( - cpp_search.upper_bound( - c_table_data, - c_values_data, - c_column_order, - c_null_precedence, - ) - ) - return Column.from_unique_ptr(move(c_result)) - @acquire_spill_lock() def contains(Column haystack, Column needles): @@ -87,15 +60,9 @@ def contains(Column haystack, Column needles): needles : A column of values to search for """ - cdef unique_ptr[column] c_result - cdef column_view c_haystack = haystack.view() - cdef column_view c_needles = needles.view() - - with nogil: - c_result = move( - cpp_search.contains( - c_haystack, - c_needles, - ) + return Column.from_pylibcudf( + pylibcudf.search.contains( + haystack.to_pylibcudf(mode="read"), + needles.to_pylibcudf(mode="read"), ) - return Column.from_unique_ptr(move(c_result)) + ) From 080a9c5916dbb50c5aae52f5f869832988f3766e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 28 Feb 2024 01:49:51 +0000 Subject: [PATCH 3/3] Some cleanup --- python/cudf/cudf/_lib/pylibcudf/search.pxd | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/search.pxd b/python/cudf/cudf/_lib/pylibcudf/search.pxd index 2f5063825f7..0faf18b108f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/search.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/search.pxd @@ -1,10 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -# from libcpp cimport bool -# -# from cudf._lib.cpp.aggregation cimport rank_method -# from cudf._lib.cpp.types cimport null_order, null_policy, order, size_type - from .column cimport Column from .table cimport Table