Move cudf._lib.search to cudf.core._internals (#17411)

Contributes to #17317

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: #17411
rapidsai · Nov 23, 2024 · d1d4420 · d1d4420
1 parent 8b7127f
commit d1d4420
Show file tree

Hide file tree

Showing 10 changed files with 87 additions and 88 deletions.
diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -36,7 +36,6 @@ set(cython_sources
     rolling.pyx
     round.pyx
     scalar.pyx
-    search.pyx
     sort.pyx
     stream_compaction.pyx
     string_casting.pyx

diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
@@ -22,7 +22,6 @@
     reshape,
     rolling,
     round,
-    search,
     sort,
     stream_compaction,
     string_casting,

diff --git a/python/cudf/cudf/_lib/search.pyx b/python/cudf/cudf/_lib/search.pyx
diff --git a/python/cudf/cudf/core/_internals/search.py b/python/cudf/cudf/core/_internals/search.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Literal
+
+import pylibcudf as plc
+
+from cudf._lib.column import Column
+from cudf.core.buffer import acquire_spill_lock
+
+if TYPE_CHECKING:
+    from cudf.core.column import ColumnBase
+
+
+@acquire_spill_lock()
+def search_sorted(
+    source: list[ColumnBase],
+    values: list[ColumnBase],
+    side: Literal["left", "right"],
+    ascending: bool = True,
+    na_position: Literal["first", "last"] = "last",
+) -> ColumnBase:
+    """Find indices where elements should be inserted to maintain order.
+
+    Parameters
+    ----------
+    source, values : list of columns
+        Columns to search in, and the value columns to search for.
+    side : {'left', 'right'}
+        'left' gives the first suitable index, 'right' the last.
+    ascending : bool, default True
+    na_position : {'first', 'last'}, default 'last'
+        Sort direction and null placement applied to every source column.
+    """
+    # Note: We are ignoring index columns here
+    column_order = [
+        plc.types.Order.ASCENDING if ascending else plc.types.Order.DESCENDING
+    ] * len(source)  # same order repeated for every source column
+    null_precedence = [
+        plc.types.NullOrder.AFTER
+        if na_position == "last"
+        else plc.types.NullOrder.BEFORE
+    ] * len(source)  # same null ordering repeated for every source column
+
+    func = getattr(
+        plc.search,
+        "lower_bound" if side == "left" else "upper_bound",
+    )
+    return Column.from_pylibcudf(
+        func(
+            plc.Table([col.to_pylibcudf(mode="read") for col in source]),
+            plc.Table([col.to_pylibcudf(mode="read") for col in values]),
+            column_order,
+            null_precedence,
+        )
+    )
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -757,7 +757,7 @@ def indices_of(
             raise ValueError("value must be a scalar")
         else:
             value = as_column(value, dtype=self.dtype, length=1)
-        mask = libcudf.search.contains(value, self)
+        mask = value.contains(self)
         return apply_boolean_mask(
             [as_column(range(0, len(self)), dtype=size_type_dtype)], mask
         )[0]
@@ -914,7 +914,7 @@ def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase:
         # self.isin(other) asks "which values of self are in other"
         # contains(haystack, needles) asks "which needles are in haystack"
         # hence this argument ordering.
-        result = libcudf.search.contains(rhs, self)
+        result = rhs.contains(self)
         if self.null_count > 0:
             # If one of the needles is null, then the result contains
             # nulls, these nulls should be replaced by whether or not the
@@ -956,6 +956,23 @@ def is_monotonic_decreasing(self) -> bool:
             [self], [False], None
         )
 
+    def contains(self, other: ColumnBase) -> ColumnBase:
+        """
+        Return, for each value in ``other``, whether it occurs in this column.
+
+        Parameters
+        ----------
+        other : ColumnBase
+            A column of values to search for
+        """
+        with acquire_spill_lock():
+            return Column.from_pylibcudf(
+                plc.search.contains(
+                    self.to_pylibcudf(mode="read"),  # haystack
+                    other.to_pylibcudf(mode="read"),  # needles
+                )
+            )
+
     def sort_values(
         self: Self,
         ascending: bool = True,
@@ -1190,7 +1207,7 @@ def searchsorted(
             raise ValueError(
                 "Column searchsorted expects values to be column of same dtype"
             )
-        return libcudf.search.search_sorted(
+        return cudf.core._internals.search.search_sorted(  # type: ignore[return-value]
             [self],
             [value],
             side=side,

diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
@@ -18,9 +18,9 @@
 
 import cudf
 from cudf import _lib as libcudf
-from cudf._lib.search import search_sorted
 from cudf.core._compat import PANDAS_GE_220
 from cudf.core._internals import unary
+from cudf.core._internals.search import search_sorted
 from cudf.core._internals.timezones import (
     check_ambiguous_and_nonexistent,
     get_compatible_timezone,

diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
@@ -111,8 +111,8 @@ def __contains__(self, item: ScalarLike) -> bool:
         except (TypeError, ValueError):
             return False
         # TODO: Use `scalar`-based `contains` wrapper
-        return libcudf.search.contains(
-            self, column.as_column([search_item], dtype=self.dtype)
+        return self.contains(
+            column.as_column([search_item], dtype=self.dtype)
         ).any()
 
     def indices_of(self, value: ScalarLike) -> NumericalColumn:

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
@@ -5857,14 +5857,8 @@ def sum(
             return result_col
 
     def __contains__(self, item: ScalarLike) -> bool:
-        if is_scalar(item):
-            return True in libcudf.search.contains(
-                self, column.as_column([item], dtype=self.dtype)
-            )
-        else:
-            return True in libcudf.search.contains(
-                self, column.as_column(item, dtype=self.dtype)
-            )
+        other = [item] if is_scalar(item) else item
+        return self.contains(column.as_column(other, dtype=self.dtype)).any()
 
     def as_numerical_column(
         self, dtype: Dtype

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
@@ -8,8 +8,6 @@
 from collections import abc
 from typing import TYPE_CHECKING, Any, Literal
 
-# TODO: The `numpy` import is needed for typing purposes during doc builds
-# only, need to figure out why the `np` alias is insufficient then remove.
 import cupy
 import numpy
 import numpy as np
@@ -19,9 +17,13 @@
 import pylibcudf as plc
 
 import cudf
+
+# TODO: The plain `import numpy` above is needed for typing purposes during
+# doc builds only; figure out why the `np` alias is insufficient, then remove.
 from cudf import _lib as libcudf
 from cudf.api.types import is_dtype_equal, is_scalar
 from cudf.core._compat import PANDAS_LT_300
+from cudf.core._internals.search import search_sorted
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column import (
     ColumnBase,
@@ -1302,7 +1304,7 @@ def searchsorted(
             for val, common_dtype in zip(values, common_dtype_list)
         ]
 
-        outcol = libcudf.search.search_sorted(
+        outcol = search_sorted(
             sources,
             values,
             side,

diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
@@ -19,7 +19,6 @@
 import cudf
 from cudf import _lib as libcudf
 from cudf._lib.filling import sequence
-from cudf._lib.search import search_sorted
 from cudf._lib.types import size_type_dtype
 from cudf.api.extensions import no_default
 from cudf.api.types import (
@@ -32,6 +31,7 @@
 )
 from cudf.core._base_index import BaseIndex, _return_get_indexer_result
 from cudf.core._compat import PANDAS_LT_300
+from cudf.core._internals.search import search_sorted
 from cudf.core.column import (
     CategoricalColumn,
     ColumnBase,