Merge remote-tracking branch 'upstream/branch-25.02' into cleanup-murmurhash-32
rapidsai · Nov 24, 2024 · 14c8326 · 14c8326
2 parents 0cdaeaa + 44b2e79
commit 14c8326
Show file tree

Hide file tree

Showing 11 changed files with 87 additions and 263 deletions.
diff --git a/cpp/include/cudf/detail/utilities/int_fastdiv.h b/cpp/include/cudf/detail/utilities/int_fastdiv.h
diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -36,7 +36,6 @@ set(cython_sources
     rolling.pyx
     round.pyx
     scalar.pyx
-    search.pyx
     sort.pyx
     stream_compaction.pyx
     string_casting.pyx

diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
@@ -22,7 +22,6 @@
     reshape,
     rolling,
     round,
-    search,
     sort,
     stream_compaction,
     string_casting,

diff --git a/python/cudf/cudf/_lib/search.pyx b/python/cudf/cudf/_lib/search.pyx
diff --git a/python/cudf/cudf/core/_internals/search.py b/python/cudf/cudf/core/_internals/search.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Literal
+
+import pylibcudf as plc
+
+from cudf._lib.column import Column
+from cudf.core.buffer import acquire_spill_lock
+
+if TYPE_CHECKING:
+    from cudf.core.column import ColumnBase
+
+
+@acquire_spill_lock()
+def search_sorted(
+    source: list[ColumnBase],
+    values: list[ColumnBase],
+    side: Literal["left", "right"],
+    ascending: bool = True,
+    na_position: Literal["first", "last"] = "last",
+) -> ColumnBase:
+    """Find indices where elements should be inserted to maintain order
+
+    Parameters
+    ----------
+    source : list of columns
+        List of columns to search in
+    values : list of columns
+        List of value columns to search for
+    side : {'left', 'right'}
+        If 'left', the index of the first suitable location is given.
+        If 'right', return the last such index
+    """
+    # Note: We are ignoring index columns here
+    column_order = [
+        plc.types.Order.ASCENDING if ascending else plc.types.Order.DESCENDING
+    ] * len(source)
+    null_precedence = [
+        plc.types.NullOrder.AFTER
+        if na_position == "last"
+        else plc.types.NullOrder.BEFORE
+    ] * len(source)
+
+    func = getattr(
+        plc.search,
+        "lower_bound" if side == "left" else "upper_bound",
+    )
+    return Column.from_pylibcudf(
+        func(
+            plc.Table([col.to_pylibcudf(mode="read") for col in source]),
+            plc.Table([col.to_pylibcudf(mode="read") for col in values]),
+            column_order,
+            null_precedence,
+        )
+    )
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -757,7 +757,7 @@ def indices_of(
             raise ValueError("value must be a scalar")
         else:
             value = as_column(value, dtype=self.dtype, length=1)
-        mask = libcudf.search.contains(value, self)
+        mask = value.contains(self)
         return apply_boolean_mask(
             [as_column(range(0, len(self)), dtype=size_type_dtype)], mask
         )[0]
@@ -914,7 +914,7 @@ def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase:
         # self.isin(other) asks "which values of self are in other"
         # contains(haystack, needles) asks "which needles are in haystack"
         # hence this argument ordering.
-        result = libcudf.search.contains(rhs, self)
+        result = rhs.contains(self)
         if self.null_count > 0:
             # If one of the needles is null, then the result contains
             # nulls, these nulls should be replaced by whether or not the
@@ -956,6 +956,23 @@ def is_monotonic_decreasing(self) -> bool:
             [self], [False], None
         )
 
+    def contains(self, other: ColumnBase) -> ColumnBase:
+        """
+        Check whether column contains multiple values.
+
+        Parameters
+        ----------
+        other : Column
+            A column of values to search for
+        """
+        with acquire_spill_lock():
+            return Column.from_pylibcudf(
+                plc.search.contains(
+                    self.to_pylibcudf(mode="read"),
+                    other.to_pylibcudf(mode="read"),
+                )
+            )
+
     def sort_values(
         self: Self,
         ascending: bool = True,
@@ -1190,7 +1207,7 @@ def searchsorted(
             raise ValueError(
                 "Column searchsorted expects values to be column of same dtype"
             )
-        return libcudf.search.search_sorted(
+        return cudf.core._internals.search.search_sorted(  # type: ignore[return-value]
             [self],
             [value],
             side=side,

diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
@@ -18,9 +18,9 @@
 
 import cudf
 from cudf import _lib as libcudf
-from cudf._lib.search import search_sorted
 from cudf.core._compat import PANDAS_GE_220
 from cudf.core._internals import unary
+from cudf.core._internals.search import search_sorted
 from cudf.core._internals.timezones import (
     check_ambiguous_and_nonexistent,
     get_compatible_timezone,

diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
@@ -111,8 +111,8 @@ def __contains__(self, item: ScalarLike) -> bool:
         except (TypeError, ValueError):
             return False
         # TODO: Use `scalar`-based `contains` wrapper
-        return libcudf.search.contains(
-            self, column.as_column([search_item], dtype=self.dtype)
+        return self.contains(
+            column.as_column([search_item], dtype=self.dtype)
         ).any()
 
     def indices_of(self, value: ScalarLike) -> NumericalColumn: