Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement search using pylibcudf #15166

Merged
merged 4 commits into from
Mar 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ This page provides API documentation for pylibcudf.
reduce
rolling
scalar
search
stream_compaction
sorting
replace
Expand Down
6 changes: 6 additions & 0 deletions docs/cudf/source/user_guide/api_docs/pylibcudf/search.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
======
search
======

.. automodule:: cudf._lib.pylibcudf.search
    :members:
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ set(cython_sources
replace.pyx
rolling.pyx
scalar.pyx
search.pyx
stream_compaction.pyx
sorting.pyx
table.pyx
Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ from . cimport (
reduce,
replace,
rolling,
search,
sorting,
stream_compaction,
types,
Expand Down Expand Up @@ -45,6 +46,7 @@ __all__ = [
"reduce",
"replace",
"rolling",
"search",
"stream_compaction",
"sorting",
"types",
Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
reduce,
replace,
rolling,
search,
sorting,
stream_compaction,
types,
Expand Down Expand Up @@ -43,6 +44,7 @@
"reduce",
"replace",
"rolling",
"search",
"stream_compaction",
"sorting",
"types",
Expand Down
21 changes: 21 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/search.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from .column cimport Column
from .table cimport Table


# Declarations for the pylibcudf search functions so that other Cython
# modules can cimport them. The signatures here must stay in sync with the
# definitions in search.pyx.

# Sorted search over tables: takes the table to search, the table of values
# to look up, and two Python lists describing per-column ordering and null
# placement. Returns a Column.
cpdef Column lower_bound(
Table haystack,
Table needles,
list column_order,
list null_precedence,
)

# Same parameters and return type as lower_bound.
cpdef Column upper_bound(
Table haystack,
Table needles,
list column_order,
list null_precedence,
)

# Column-based (not Table-based) membership check; returns a Column.
cpdef Column contains(Column haystack, Column needles)
116 changes: 116 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/search.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from libcpp.vector cimport vector

from cudf._lib.cpp cimport search as cpp_search
from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.types cimport null_order, order

from .column cimport Column
from .table cimport Table


cpdef Column lower_bound(
    Table haystack,
    Table needles,
    list column_order,
    list null_precedence,
):
    """Find the smallest haystack indices at which needles keep the order.

    Thin wrapper around libcudf's ``lower_bound``.

    Parameters
    ----------
    haystack : Table
        The table that is searched.
    needles : Table
        The values whose insertion points are wanted.
    column_order : list
        Sort direction (ascending or descending) of each column; converted
        to a vector of libcudf ``order`` values.
    null_precedence : list
        Whether nulls are placed before or after non-nulls; converted to a
        vector of libcudf ``null_order`` values.

    Returns
    -------
    Column
        The insertion points.
    """
    # Converting the Python lists into C++ vectors needs the GIL, so it is
    # done up front, before the GIL is released for the libcudf call.
    cdef vector[order] c_column_order = column_order
    cdef vector[null_order] c_null_order = null_precedence
    cdef unique_ptr[column] c_bounds

    with nogil:
        c_bounds = move(
            cpp_search.lower_bound(
                haystack.view(), needles.view(), c_column_order, c_null_order
            )
        )

    # Hand ownership of the libcudf column over to a pylibcudf Column.
    return Column.from_libcudf(move(c_bounds))


cpdef Column upper_bound(
    Table haystack,
    Table needles,
    list column_order,
    list null_precedence,
):
    """Find the largest haystack indices at which needles keep the order.

    Thin wrapper around libcudf's ``upper_bound``.

    Parameters
    ----------
    haystack : Table
        The table that is searched.
    needles : Table
        The values whose insertion points are wanted.
    column_order : list
        Sort direction (ascending or descending) of each column; converted
        to a vector of libcudf ``order`` values.
    null_precedence : list
        Whether nulls are placed before or after non-nulls; converted to a
        vector of libcudf ``null_order`` values.

    Returns
    -------
    Column
        The insertion points.
    """
    # Converting the Python lists into C++ vectors needs the GIL, so it is
    # done up front, before the GIL is released for the libcudf call.
    cdef vector[order] c_column_order = column_order
    cdef vector[null_order] c_null_order = null_precedence
    cdef unique_ptr[column] c_bounds

    with nogil:
        c_bounds = move(
            cpp_search.upper_bound(
                haystack.view(), needles.view(), c_column_order, c_null_order
            )
        )

    # Hand ownership of the libcudf column over to a pylibcudf Column.
    return Column.from_libcudf(move(c_bounds))


cpdef Column contains(Column haystack, Column needles):
    """Check whether needles are present in haystack.

    Thin wrapper around libcudf's ``contains``.

    Parameters
    ----------
    haystack : Column
        The search space.
    needles : Column
        The values for which to search.

    Returns
    -------
    Column
        Boolean indicator for each needle.
    """
    cdef unique_ptr[column] c_result
    # The libcudf call only touches C++ objects, so it runs without the GIL.
    with nogil:
        c_result = move(
            cpp_search.contains(
                haystack.view(),
                needles.view(),
            )
        )
    # Hand ownership of the libcudf column over to a pylibcudf Column.
    return Column.from_libcudf(move(c_result))
91 changes: 29 additions & 62 deletions python/cudf/cudf/_lib/search.pyx
Original file line number Diff line number Diff line change
@@ -1,18 +1,10 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from cudf.core.buffer import acquire_spill_lock

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from libcpp.vector cimport vector

cimport cudf._lib.cpp.search as cpp_search
cimport cudf._lib.cpp.types as libcudf_types
from cudf._lib.column cimport Column
from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.utils cimport table_view_from_columns

from cudf._lib import pylibcudf


@acquire_spill_lock()
Expand All @@ -31,50 +23,31 @@ def search_sorted(
If 'left', the index of the first suitable location is given.
If 'right', return the last such index
"""
cdef unique_ptr[column] c_result
cdef vector[libcudf_types.order] c_column_order
cdef vector[libcudf_types.null_order] c_null_precedence
cdef libcudf_types.order c_order
cdef libcudf_types.null_order c_null_order
cdef table_view c_table_data = table_view_from_columns(source)
cdef table_view c_values_data = table_view_from_columns(values)

# Note: We are ignoring index columns here
c_order = (libcudf_types.order.ASCENDING
if ascending
else libcudf_types.order.DESCENDING)
c_null_order = (
libcudf_types.null_order.AFTER
if na_position=="last"
else libcudf_types.null_order.BEFORE
column_order = [
pylibcudf.types.Order.ASCENDING
if ascending
else pylibcudf.types.Order.DESCENDING
] * len(source)
null_precedence = [
pylibcudf.types.NullOrder.AFTER
if na_position == "last"
else pylibcudf.types.NullOrder.BEFORE
] * len(source)

func = getattr(
pylibcudf.search,
"lower_bound" if side == "left" else "upper_bound",
)
c_column_order = vector[libcudf_types.order](len(source), c_order)
c_null_precedence = vector[libcudf_types.null_order](
len(source), c_null_order
return Column.from_pylibcudf(
func(
pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source]),
pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]),
column_order,
null_precedence,
)
)

if side == 'left':
with nogil:
c_result = move(
cpp_search.lower_bound(
c_table_data,
c_values_data,
c_column_order,
c_null_precedence,
)
)
elif side == 'right':
with nogil:
c_result = move(
cpp_search.upper_bound(
c_table_data,
c_values_data,
c_column_order,
c_null_precedence,
)
)
return Column.from_unique_ptr(move(c_result))


@acquire_spill_lock()
def contains(Column haystack, Column needles):
Expand All @@ -87,15 +60,9 @@ def contains(Column haystack, Column needles):
needles :
A column of values to search for
"""
cdef unique_ptr[column] c_result
cdef column_view c_haystack = haystack.view()
cdef column_view c_needles = needles.view()

with nogil:
c_result = move(
cpp_search.contains(
c_haystack,
c_needles,
)
return Column.from_pylibcudf(
pylibcudf.search.contains(
haystack.to_pylibcudf(mode="read"),
needles.to_pylibcudf(mode="read"),
)
return Column.from_unique_ptr(move(c_result))
)
Loading