From 7a0e19f66d894058a806cb1a70fc5a1e9fbca861 Mon Sep 17 00:00:00 2001
From: Matthew Murray <matthewmurray711@gmail.com>
Date: Sun, 6 Oct 2024 13:49:36 -0700
Subject: [PATCH 1/9] Migrate nvtext generate_ngrams APIs to pylibcudf

---
 .../pylibcudf/nvtext/generate_ngrams.rst      |   6 +
 .../api_docs/pylibcudf/nvtext/index.rst       |   1 +
 .../cudf/cudf/_lib/nvtext/generate_ngrams.pyx |  76 +++---------
 .../pylibcudf/pylibcudf/nvtext/CMakeLists.txt |   2 +-
 .../pylibcudf/pylibcudf/nvtext/__init__.pxd   |   3 +-
 python/pylibcudf/pylibcudf/nvtext/__init__.py |   3 +-
 .../pylibcudf/nvtext/generate_ngrams.pxd      |  12 ++
 .../pylibcudf/nvtext/generate_ngrams.pyx      | 111 ++++++++++++++++++
 .../tests/test_nvtext_generate_ngrams.py      |  55 +++++++++
 9 files changed, 209 insertions(+), 60 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx
 create mode 100644 python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst
new file mode 100644
index 00000000000..d68199271bd
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst
@@ -0,0 +1,6 @@
+===============
+generate_ngrams
+===============
+
+.. automodule:: pylibcudf.nvtext.generate_ngrams
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
index b5cd5ee42c3..2e03b589c8b 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
@@ -5,3 +5,4 @@ nvtext
     :maxdepth: 1
 
     edit_distance
+    generate_ngrams
diff --git a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx
index 6591b527eec..b4dabf4b33e 100644
--- a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx
+++ b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx
@@ -2,75 +2,37 @@
 
 from cudf.core.buffer import acquire_spill_lock
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.nvtext.generate_ngrams cimport (
-    generate_character_ngrams as cpp_generate_character_ngrams,
-    generate_ngrams as cpp_generate_ngrams,
-    hash_character_ngrams as cpp_hash_character_ngrams,
-)
-from pylibcudf.libcudf.scalar.scalar cimport string_scalar
 from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar as plc_Scalar
 
 from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar
+
+from pylibcudf import nvtext
 
 
 @acquire_spill_lock()
 def generate_ngrams(Column strings, int ngrams, object py_separator):
-
-    cdef DeviceScalar separator = py_separator.device_value
-
-    cdef column_view c_strings = strings.view()
-    cdef size_type c_ngrams = ngrams
-    cdef const string_scalar* c_separator = <const string_scalar*>separator\
-        .get_raw_ptr()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_generate_ngrams(
-                c_strings,
-                c_ngrams,
-                c_separator[0]
-            )
-        )
-
-    return Column.from_unique_ptr(move(c_result))
+    result = nvtext.generate_ngrams.generate_ngrams(
+        strings.to_pylibcudf(mode="read"),
+        <size_type> ngrams,
+        <plc_Scalar> py_separator.device_value.c_value
+    )
+    return Column.from_pylibcudf(result)
 
 
 @acquire_spill_lock()
 def generate_character_ngrams(Column strings, int ngrams):
-    cdef column_view c_strings = strings.view()
-    cdef size_type c_ngrams = ngrams
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_generate_character_ngrams(
-                c_strings,
-                c_ngrams
-            )
-        )
-
-    return Column.from_unique_ptr(move(c_result))
+    result = nvtext.generate_ngrams.generate_character_ngrams(
+        strings.to_pylibcudf(mode="read"),
+        <size_type> ngrams
+    )
+    return Column.from_pylibcudf(result)
 
 
 @acquire_spill_lock()
 def hash_character_ngrams(Column strings, int ngrams):
-    cdef column_view c_strings = strings.view()
-    cdef size_type c_ngrams = ngrams
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_hash_character_ngrams(
-                c_strings,
-                c_ngrams
-            )
-        )
-
-    return Column.from_unique_ptr(move(c_result))
+    result = nvtext.generate_ngrams.hash_character_ngrams(
+        strings.to_pylibcudf(mode="read"),
+        <size_type> ngrams
+    )
+    return Column.from_pylibcudf(result)
diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
index ebe1fda1f12..eb5617a1da6 100644
--- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 
-set(cython_sources edit_distance.pyx)
+set(cython_sources edit_distance.pyx generate_ngrams.pyx)
 
 set(linked_libraries cudf::cudf)
 rapids_cython_create_modules(
diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
index 82f7c425b1d..7f5fa2b9925 100644
--- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
@@ -1,7 +1,8 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . cimport edit_distance
+from . cimport edit_distance, generate_ngrams
 
 __all__ = [
     "edit_distance",
+    "generate_ngrams",
 ]
diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py
index 986652a241f..a66ce984745 100644
--- a/python/pylibcudf/pylibcudf/nvtext/__init__.py
+++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py
@@ -1,7 +1,8 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . import edit_distance
+from . import edit_distance, generate_ngrams
 
 __all__ = [
     "edit_distance",
+    "generate_ngrams",
 ]
diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd
new file mode 100644
index 00000000000..f15eb1f25e9
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd
@@ -0,0 +1,12 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+
+
+cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator)
+
+cpdef Column generate_character_ngrams(Column input, size_type ngrams=*)
+
+cpdef Column hash_character_ngrams(Column input, size_type ngrams=*)
diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx
new file mode 100644
index 00000000000..8c7a8edc01d
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx
@@ -0,0 +1,111 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.nvtext.generate_ngrams cimport (
+    generate_character_ngrams as cpp_generate_character_ngrams,
+    generate_ngrams as cpp_generate_ngrams,
+    hash_character_ngrams as cpp_hash_character_ngrams,
+)
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+
+
+cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator):
+    """
+    Returns a single column of strings by generating ngrams from a strings column.
+
+    For details, see :cpp:func:`generate_ngrams`
+
+    Parameters
+    ----------
+    input : Column
+        Input strings
+    ngrams : size_type
+        The ngram number to generate
+    separator : Scalar
+        The string to use for separating ngram tokens
+
+    Returns
+    -------
+    Column
+        New strings column of tokens
+    """
+    cdef column_view c_strings = input.view()
+    cdef const string_scalar* c_separator = <const string_scalar*>separator.c_obj.get()
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_generate_ngrams(
+                c_strings,
+                ngrams,
+                c_separator[0]
+            )
+        )
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column generate_character_ngrams(Column input, size_type ngrams = 2):
+    """
+    Returns a lists column of ngrams of characters within each string.
+
+    For details, see :cpp:func:`generate_character_ngrams`
+
+    Parameters
+    ----------
+    input : Column
+        Input strings
+    ngrams : size_type
+        The ngram number to generate
+
+    Returns
+    -------
+    Column
+        Lists column of strings
+    """
+    cdef column_view c_strings = input.view()
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_generate_character_ngrams(
+                c_strings,
+                ngrams,
+            )
+        )
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column hash_character_ngrams(Column input, size_type ngrams = 2):
+    """
+    Returns a lists column of hash values of the characters in each string
+
+    For details, see :cpp:func:`hash_character_ngrams`
+
+    Parameters
+    ----------
+    input : Column
+        Input strings
+    ngrams : size_type
+        The ngram number to generate
+
+    Returns
+    -------
+    Column
+        Lists column of hash values
+    """
+    cdef column_view c_strings = input.view()
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_hash_character_ngrams(
+                c_strings,
+                ngrams,
+            )
+        )
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py
new file mode 100644
index 00000000000..08daba64d38
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pylibcudf as plc
+import pytest
+from utils import assert_column_eq
+
+
+@pytest.fixture(scope="module")
+def input_col():
+    arr = ["ab", "cde", "fgh"]
+    return pa.array(arr)
+
+
+@pytest.mark.parametrize("ngram", [2, 3])
+@pytest.mark.parametrize("sep", ["_", "**", ","])
+def test_generate_ngrams(input_col, ngram, sep):
+    result = plc.nvtext.generate_ngrams.generate_ngrams(
+        plc.interop.from_arrow(input_col),
+        ngram,
+        plc.interop.from_arrow(pa.scalar(sep)),
+    )
+    expected = pa.array([f"ab{sep}cde", f"cde{sep}fgh"])
+    if ngram == 3:
+        expected = pa.array([f"ab{sep}cde{sep}fgh"])
+    assert_column_eq(result, expected)
+
+
+@pytest.mark.parametrize("ngram", [2, 3])
+def test_generate_character_ngrams(input_col, ngram):
+    result = plc.nvtext.generate_ngrams.generate_character_ngrams(
+        plc.interop.from_arrow(input_col),
+        ngram,
+    )
+    expected = pa.array([["ab"], ["cd", "de"], ["fg", "gh"]])
+    if ngram == 3:
+        expected = pa.array([[], ["cde"], ["fgh"]])
+    assert_column_eq(result, expected)
+
+
+@pytest.mark.parametrize("ngram", [2, 3])
+def test_hash_character_ngrams(input_col, ngram):
+    result = plc.nvtext.generate_ngrams.hash_character_ngrams(
+        plc.interop.from_arrow(input_col),
+        ngram,
+    )
+    pa_result = plc.interop.to_arrow(result)
+    if ngram == 2:
+        assert len(pa_result[0]) == 1
+        assert len(pa_result[1]) == 2
+        assert len(pa_result[2]) == 2
+    else:
+        assert len(pa_result[0]) == 0
+        assert len(pa_result[1]) == 1
+        assert len(pa_result[2]) == 1

From 82aaabbcaf4f8d96947bbb40eec12e66e04babe0 Mon Sep 17 00:00:00 2001
From: Matthew Murray <matthewmurray711@gmail.com>
Date: Mon, 7 Oct 2024 05:17:46 -0700
Subject: [PATCH 2/9] Migrate nvtext jaccard API to pylibcudf

---
 cpp/tests/text/jaccard_tests.cpp              | 16 ++++---
 .../api_docs/pylibcudf/nvtext/index.rst       |  1 +
 .../api_docs/pylibcudf/nvtext/jaccard.rst     |  6 +++
 python/cudf/cudf/_lib/nvtext/jaccard.pyx      | 31 ++++--------
 python/cudf/cudf/_lib/string_casting.pyx      |  1 -
 .../pylibcudf/pylibcudf/nvtext/CMakeLists.txt |  2 +-
 .../pylibcudf/pylibcudf/nvtext/__init__.pxd   |  3 +-
 python/pylibcudf/pylibcudf/nvtext/__init__.py |  3 +-
 python/pylibcudf/pylibcudf/nvtext/jaccard.pxd |  7 +++
 python/pylibcudf/pylibcudf/nvtext/jaccard.pyx | 47 +++++++++++++++++++
 10 files changed, 83 insertions(+), 34 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/jaccard.pxd
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/jaccard.pyx

diff --git a/cpp/tests/text/jaccard_tests.cpp b/cpp/tests/text/jaccard_tests.cpp
index 91ebb644f83..57e3683eaad 100644
--- a/cpp/tests/text/jaccard_tests.cpp
+++ b/cpp/tests/text/jaccard_tests.cpp
@@ -26,24 +26,26 @@ struct JaccardTest : public cudf::test::BaseFixture {};
 
 TEST_F(JaccardTest, Basic)
 {
-  auto input1 =
-    cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."});
-  auto input2 =
-    cudf::test::strings_column_wrapper({"the slowest brown cat", "crawled under the jumping fox"});
+  // input1 = ["the fuzzy dog", "little piggy", "funny bunny", "chatty parrot"]
+  // input2 = ["the fuzzy cat", "bitty piggy", "funny bunny", "silent partner"]
+  auto input1 = cudf::test::strings_column_wrapper(
+    {"the fuzzy dog", "little piggy", "funny bunny", "chatty parrot"});
+  auto input2 = cudf::test::strings_column_wrapper(
+    {"the fuzzy cat", "bitty piggy", "funny bunny", "silent partner"});
 
   auto view1 = cudf::strings_column_view(input1);
   auto view2 = cudf::strings_column_view(input2);
 
   auto results = nvtext::jaccard_index(view1, view2, 5);
 
-  auto expected = cudf::test::fixed_width_column_wrapper<float>({0.103448279f, 0.0697674453f});
+  auto expected = cudf::test::fixed_width_column_wrapper<float>({1.0f, 1.0f, 1.0f, 1.0f});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
 
   expected = cudf::test::fixed_width_column_wrapper<float>({1.0f, 1.0f});
   results  = nvtext::jaccard_index(view1, view1, 5);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
-  results = nvtext::jaccard_index(view2, view2, 10);
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
+  // results = nvtext::jaccard_index(view2, view2, 10);
+  // CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
 }
 
 TEST_F(JaccardTest, WithNulls)
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
index 2e03b589c8b..6300f77d686 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
@@ -6,3 +6,4 @@ nvtext
 
     edit_distance
     generate_ngrams
+    jaccard
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst
new file mode 100644
index 00000000000..ea59657c25e
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst
@@ -0,0 +1,6 @@
+=======
+jaccard
+=======
+
+.. automodule:: pylibcudf.nvtext.jaccard
+   :members:
diff --git a/python/cudf/cudf/_lib/nvtext/jaccard.pyx b/python/cudf/cudf/_lib/nvtext/jaccard.pyx
index 0ebf7c281e3..798601ce364 100644
--- a/python/cudf/cudf/_lib/nvtext/jaccard.pyx
+++ b/python/cudf/cudf/_lib/nvtext/jaccard.pyx
@@ -2,33 +2,18 @@
 
 from cudf.core.buffer import acquire_spill_lock
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.nvtext.jaccard cimport (
-    jaccard_index as cpp_jaccard_index,
-)
 from pylibcudf.libcudf.types cimport size_type
 
 from cudf._lib.column cimport Column
 
+from pylibcudf import nvtext
+
 
 @acquire_spill_lock()
 def jaccard_index(Column input1, Column input2, int width):
-    cdef column_view c_input1 = input1.view()
-    cdef column_view c_input2 = input2.view()
-    cdef size_type c_width = width
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_jaccard_index(
-                c_input1,
-                c_input2,
-                c_width
-            )
-        )
-
-    return Column.from_unique_ptr(move(c_result))
+    result = nvtext.jaccard.jaccard_index(
+        input1.to_pylibcudf(mode="read"),
+        input2.to_pylibcudf(mode="read"),
+        <size_type> width,
+    )
+    return Column.from_pylibcudf(result)
diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx
index 76c862a8657..d9595f4ab0a 100644
--- a/python/cudf/cudf/_lib/string_casting.pyx
+++ b/python/cudf/cudf/_lib/string_casting.pyx
@@ -6,7 +6,6 @@ from cudf._lib.scalar import as_device_scalar
 from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES
 
 from libcpp.memory cimport unique_ptr
-from libcpp.string cimport string
 from libcpp.utility cimport move
 
 from pylibcudf.libcudf.column.column cimport column
diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
index eb5617a1da6..9913e1fbadb 100644
--- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 
-set(cython_sources edit_distance.pyx generate_ngrams.pyx)
+set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx)
 
 set(linked_libraries cudf::cudf)
 rapids_cython_create_modules(
diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
index 7f5fa2b9925..5f1762b1e3d 100644
--- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
@@ -1,8 +1,9 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . cimport edit_distance, generate_ngrams
+from . cimport edit_distance, generate_ngrams, jaccard
 
 __all__ = [
     "edit_distance",
     "generate_ngrams",
+    "jaccard",
 ]
diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py
index a66ce984745..1c0ddb1e5a4 100644
--- a/python/pylibcudf/pylibcudf/nvtext/__init__.py
+++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py
@@ -1,8 +1,9 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . import edit_distance, generate_ngrams
+from . import edit_distance, generate_ngrams, jaccard
 
 __all__ = [
     "edit_distance",
     "generate_ngrams",
+    "jaccard",
 ]
diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pxd b/python/pylibcudf/pylibcudf/nvtext/jaccard.pxd
new file mode 100644
index 00000000000..a4d4a15335b
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pxd
@@ -0,0 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.types cimport size_type
+
+
+cpdef Column jaccard_index(Column input1, Column input2, size_type width)
diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx
new file mode 100644
index 00000000000..9334d7ce751
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx
@@ -0,0 +1,47 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.nvtext.jaccard cimport (
+    jaccard_index as cpp_jaccard_index,
+)
+from pylibcudf.libcudf.types cimport size_type
+
+
+cpdef Column jaccard_index(Column input1, Column input2, size_type width):
+    """
+    Returns the Jaccard similarity between individual rows in two strings columns.
+
+    For details, see :cpp:func:`jaccard_index`
+
+    Parameters
+    ----------
+    input1 : Column
+        Input strings column
+    input2 : Column
+        Input strings column
+    width : size_type
+        The character width of the ngrams (substrings) to compare
+
+    Returns
+    -------
+    Column
+        Index calculation values
+    """
+    cdef column_view c_input1 = input1.view()
+    cdef column_view c_input2 = input2.view()
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_jaccard_index(
+                c_input1,
+                c_input2,
+                width
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))

From 1db10e6221b8cdfc7cc885531ad9b3703a53ff8a Mon Sep 17 00:00:00 2001
From: Matthew Murray <matthewmurray711@gmail.com>
Date: Mon, 7 Oct 2024 09:09:50 -0700
Subject: [PATCH 3/9] comment out test

---
 cpp/tests/text/jaccard_tests.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/tests/text/jaccard_tests.cpp b/cpp/tests/text/jaccard_tests.cpp
index 57e3683eaad..413c9ad0b3b 100644
--- a/cpp/tests/text/jaccard_tests.cpp
+++ b/cpp/tests/text/jaccard_tests.cpp
@@ -41,9 +41,9 @@ TEST_F(JaccardTest, Basic)
   auto expected = cudf::test::fixed_width_column_wrapper<float>({1.0f, 1.0f, 1.0f, 1.0f});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
 
-  expected = cudf::test::fixed_width_column_wrapper<float>({1.0f, 1.0f});
-  results  = nvtext::jaccard_index(view1, view1, 5);
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
+  // expected = cudf::test::fixed_width_column_wrapper<float>({1.0f, 1.0f});
+  // results  = nvtext::jaccard_index(view1, view1, 5);
+  // CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
   // results = nvtext::jaccard_index(view2, view2, 10);
   // CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
 }

From af71a5bde672efa5cd579d473d27383fc0a79c12 Mon Sep 17 00:00:00 2001
From: Matthew Murray <matthewmurray711@gmail.com>
Date: Mon, 7 Oct 2024 17:31:56 -0700
Subject: [PATCH 4/9] add a test

---
 cpp/tests/text/jaccard_tests.cpp              | 22 +++++------
 .../pylibcudf/tests/test_nvtext_jaccard.py    | 37 +++++++++++++++++++
 2 files changed, 47 insertions(+), 12 deletions(-)
 create mode 100644 python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py

diff --git a/cpp/tests/text/jaccard_tests.cpp b/cpp/tests/text/jaccard_tests.cpp
index 413c9ad0b3b..91ebb644f83 100644
--- a/cpp/tests/text/jaccard_tests.cpp
+++ b/cpp/tests/text/jaccard_tests.cpp
@@ -26,26 +26,24 @@ struct JaccardTest : public cudf::test::BaseFixture {};
 
 TEST_F(JaccardTest, Basic)
 {
-  // input1 = ["the fuzzy dog", "little piggy", "funny bunny", "chatty parrot"]
-  // input2 = ["the fuzzy cat", "bitty piggy", "funny bunny", "silent partner"]
-  auto input1 = cudf::test::strings_column_wrapper(
-    {"the fuzzy dog", "little piggy", "funny bunny", "chatty parrot"});
-  auto input2 = cudf::test::strings_column_wrapper(
-    {"the fuzzy cat", "bitty piggy", "funny bunny", "silent partner"});
+  auto input1 =
+    cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."});
+  auto input2 =
+    cudf::test::strings_column_wrapper({"the slowest brown cat", "crawled under the jumping fox"});
 
   auto view1 = cudf::strings_column_view(input1);
   auto view2 = cudf::strings_column_view(input2);
 
   auto results = nvtext::jaccard_index(view1, view2, 5);
 
-  auto expected = cudf::test::fixed_width_column_wrapper<float>({1.0f, 1.0f, 1.0f, 1.0f});
+  auto expected = cudf::test::fixed_width_column_wrapper<float>({0.103448279f, 0.0697674453f});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
 
-  // expected = cudf::test::fixed_width_column_wrapper<float>({1.0f, 1.0f});
-  // results  = nvtext::jaccard_index(view1, view1, 5);
-  // CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
-  // results = nvtext::jaccard_index(view2, view2, 10);
-  // CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
+  expected = cudf::test::fixed_width_column_wrapper<float>({1.0f, 1.0f});
+  results  = nvtext::jaccard_index(view1, view1, 5);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
+  results = nvtext::jaccard_index(view2, view2, 10);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
 }
 
 TEST_F(JaccardTest, WithNulls)
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py
new file mode 100644
index 00000000000..8ef140f3090
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pylibcudf as plc
+import pytest
+from utils import assert_column_eq
+
+
+@pytest.fixture(scope="module")
+def input_data():
+    input1 = ["the fuzzy dog", "little piggy", "funny bunny", "chatty parrot"]
+    input2 = ["the fuzzy cat", "bitty piggy", "funny bunny", "silent partner"]
+    return pa.array(input1), pa.array(input2)
+
+
+@pytest.mark.parametrize("width", [2, 3])
+def test_jaccard_index(input_data, width):
+    def get_tokens(s, width):
+        return [s[i : i + width] for i in range(len(s) - width + 1)]
+
+    def jaccard_index(s1, s2, width):
+        x = set(get_tokens(s1, width))
+        y = set(get_tokens(s2, width))
+        return len(x & y) / len(x | y)
+
+    input1, input2 = input_data
+    result = plc.nvtext.jaccard.jaccard_index(
+        plc.interop.from_arrow(input1), plc.interop.from_arrow(input2), width
+    )
+    expected = pa.array(
+        [
+            jaccard_index(s1.as_py(), s2.as_py(), width)
+            for s1, s2 in zip(input1, input2)
+        ],
+        type=pa.float32(),
+    )
+    assert_column_eq(result, expected)

From 860835de397e2931d8562fa7f938ed343720644e Mon Sep 17 00:00:00 2001
From: Matthew Murray <matthewmurray711@gmail.com>
Date: Mon, 7 Oct 2024 18:59:26 -0700
Subject: [PATCH 5/9] clean up

---
 python/cudf/cudf/_lib/nvtext/jaccard.pyx | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/python/cudf/cudf/_lib/nvtext/jaccard.pyx b/python/cudf/cudf/_lib/nvtext/jaccard.pyx
index 798601ce364..c964d0206b7 100644
--- a/python/cudf/cudf/_lib/nvtext/jaccard.pyx
+++ b/python/cudf/cudf/_lib/nvtext/jaccard.pyx
@@ -2,8 +2,6 @@
 
 from cudf.core.buffer import acquire_spill_lock
 
-from pylibcudf.libcudf.types cimport size_type
-
 from cudf._lib.column cimport Column
 
 from pylibcudf import nvtext
@@ -14,6 +12,6 @@ def jaccard_index(Column input1, Column input2, int width):
     result = nvtext.jaccard.jaccard_index(
         input1.to_pylibcudf(mode="read"),
         input2.to_pylibcudf(mode="read"),
-        <size_type> width,
+        width,
     )
     return Column.from_pylibcudf(result)

From 5e5cd0330d2d32af80191d6e0313df4273bb1bba Mon Sep 17 00:00:00 2001
From: Matthew Murray <matthewmurray711@gmail.com>
Date: Tue, 8 Oct 2024 14:36:24 -0700
Subject: [PATCH 6/9] Migrate Min Hashing APIs to pylibcudf

---
 python/cudf/cudf/_lib/nvtext/minhash.pyx      |  99 ++++-----------
 .../pylibcudf/libcudf/concatenate.pxd         |   2 +-
 .../pylibcudf/pylibcudf/libcudf/groupby.pxd   |   1 -
 .../pylibcudf/libcudf/nvtext/minhash.pxd      |  21 ++--
 .../utilities/{host_span.pxd => span.pxd}     |   5 +-
 .../pylibcudf/pylibcudf/nvtext/CMakeLists.txt |   2 +-
 .../pylibcudf/pylibcudf/nvtext/__init__.pxd   |   3 +-
 python/pylibcudf/pylibcudf/nvtext/__init__.py |   3 +-
 python/pylibcudf/pylibcudf/nvtext/minhash.pxd |  14 +++
 python/pylibcudf/pylibcudf/nvtext/minhash.pyx | 115 ++++++++++++++++++
 .../pylibcudf/tests/test_nvtext_minhash.py    |  25 ++++
 11 files changed, 203 insertions(+), 87 deletions(-)
 rename python/pylibcudf/pylibcudf/libcudf/utilities/{host_span.pxd => span.pxd} (57%)
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/minhash.pxd
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/minhash.pyx
 create mode 100644 python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py

diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx
index 59cb8d51440..c2886709402 100644
--- a/python/cudf/cudf/_lib/nvtext/minhash.pyx
+++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx
@@ -2,93 +2,46 @@
 
 from cudf.core.buffer import acquire_spill_lock
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.nvtext.minhash cimport (
-    minhash as cpp_minhash,
-    minhash64 as cpp_minhash64,
-    word_minhash as cpp_word_minhash,
-    word_minhash64 as cpp_word_minhash64,
-)
-from pylibcudf.libcudf.types cimport size_type
-
 from cudf._lib.column cimport Column
 
+from pylibcudf import nvtext
+
 
 @acquire_spill_lock()
 def minhash(Column strings, Column seeds, int width):
-
-    cdef column_view c_strings = strings.view()
-    cdef size_type c_width = width
-    cdef column_view c_seeds = seeds.view()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_minhash(
-                c_strings,
-                c_seeds,
-                c_width
-            )
-        )
-
-    return Column.from_unique_ptr(move(c_result))
+    result = nvtext.minhash.minhash(
+        strings.to_pylibcudf(mode="read"),
+        seeds.to_pylibcudf(mode="read"),
+        width,
+    )
+    return Column.from_pylibcudf(result)
 
 
 @acquire_spill_lock()
 def minhash64(Column strings, Column seeds, int width):
-
-    cdef column_view c_strings = strings.view()
-    cdef size_type c_width = width
-    cdef column_view c_seeds = seeds.view()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_minhash64(
-                c_strings,
-                c_seeds,
-                c_width
-            )
-        )
-
-    return Column.from_unique_ptr(move(c_result))
+    result = nvtext.minhash.minhash64(
+        strings.to_pylibcudf(mode="read"),
+        seeds.to_pylibcudf(mode="read"),
+        width,
+    )
+    return Column.from_pylibcudf(result)
 
 
 @acquire_spill_lock()
 def word_minhash(Column input, Column seeds):
-
-    cdef column_view c_input = input.view()
-    cdef column_view c_seeds = seeds.view()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_word_minhash(
-                c_input,
-                c_seeds
-            )
-        )
-
-    return Column.from_unique_ptr(move(c_result))
+    result = nvtext.minhash.minhash(
+        input.to_pylibcudf(mode="read"),
+        seeds.to_pylibcudf(mode="read"),
+        4,
+    )
+    return Column.from_pylibcudf(result)
 
 
 @acquire_spill_lock()
 def word_minhash64(Column input, Column seeds):
-
-    cdef column_view c_input = input.view()
-    cdef column_view c_seeds = seeds.view()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_word_minhash64(
-                c_input,
-                c_seeds
-            )
-        )
-
-    return Column.from_unique_ptr(move(c_result))
+    result = nvtext.minhash.minhash64(
+        input.to_pylibcudf(mode="read"),
+        seeds.to_pylibcudf(mode="read"),
+        4,
+    )
+    return Column.from_pylibcudf(result)
diff --git a/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd
index 92f5a185a54..81c889e3b3d 100644
--- a/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd
@@ -4,7 +4,7 @@ from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
 from pylibcudf.libcudf.column.column cimport column, column_view
 from pylibcudf.libcudf.table.table cimport table, table_view
-from pylibcudf.libcudf.utilities.host_span cimport host_span
+from pylibcudf.libcudf.utilities.span cimport host_span
 
 from rmm._lib.device_buffer cimport device_buffer
 
diff --git a/python/pylibcudf/pylibcudf/libcudf/groupby.pxd b/python/pylibcudf/pylibcudf/libcudf/groupby.pxd
index 848462131fe..17ea33a2066 100644
--- a/python/pylibcudf/pylibcudf/libcudf/groupby.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/groupby.pxd
@@ -22,7 +22,6 @@ from pylibcudf.libcudf.types cimport (
     size_type,
     sorted,
 )
-from pylibcudf.libcudf.utilities.host_span cimport host_span
 
 # workaround for https://github.com/cython/cython/issues/3885
 ctypedef const scalar constscalar
diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
index f2dd22f43aa..b4d4733e962 100644
--- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
@@ -1,31 +1,36 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
+from libc.stdint cimport uint32_t, uint64_t
 from libcpp.memory cimport unique_ptr
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
 from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.utilities.span cimport device_span
 
 
 cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil:
 
     cdef unique_ptr[column] minhash(
         const column_view &strings,
-        const column_view &seeds,
+        const numeric_scalar[uint32_t] seed,
         const size_type width,
     ) except +
 
-    cdef unique_ptr[column] minhash64(
+    cdef unique_ptr[column] minhash(
         const column_view &strings,
         const column_view &seeds,
         const size_type width,
     ) except +
 
-    cdef unique_ptr[column] word_minhash(
-        const column_view &input,
-        const column_view &seeds
+    cdef unique_ptr[column] minhash64(
+        const column_view &strings,
+        const column_view &seeds,
+        const size_type width,
     ) except +
 
-    cdef unique_ptr[column] word_minhash64(
-        const column_view &input,
-        const column_view &seeds
+    cdef unique_ptr[column] minhash64(
+        const column_view &strings,
+        const numeric_scalar[uint64_t] seed,
+        const size_type width,
     ) except +
diff --git a/python/pylibcudf/pylibcudf/libcudf/utilities/host_span.pxd b/python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd
similarity index 57%
rename from python/pylibcudf/pylibcudf/libcudf/utilities/host_span.pxd
rename to python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd
index 7e591e96373..36876972a92 100644
--- a/python/pylibcudf/pylibcudf/libcudf/utilities/host_span.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.vector cimport vector
 
@@ -7,3 +7,6 @@ cdef extern from "cudf/utilities/span.hpp" namespace "cudf" nogil:
     cdef cppclass host_span[T]:
         host_span() except +
         host_span(vector[T]) except +
+    cdef cppclass device_span[T]:
+        device_span()
+        device_span(device_span other) except +
diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
index 9913e1fbadb..7fd65beeeb0 100644
--- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 
-set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx)
+set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx)
 
 set(linked_libraries cudf::cudf)
 rapids_cython_create_modules(
diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
index 5f1762b1e3d..9eed1da1ab5 100644
--- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . cimport edit_distance, generate_ngrams, jaccard
+from . cimport edit_distance, generate_ngrams, jaccard, minhash
 
 __all__ = [
     "edit_distance",
     "generate_ngrams",
     "jaccard",
+    "minhash"
 ]
diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py
index 1c0ddb1e5a4..a3a2363f7ef 100644
--- a/python/pylibcudf/pylibcudf/nvtext/__init__.py
+++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . import edit_distance, generate_ngrams, jaccard
+from . import edit_distance, generate_ngrams, jaccard, minhash
 
 __all__ = [
     "edit_distance",
     "generate_ngrams",
     "jaccard",
+    "minhash",
 ]
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd
new file mode 100644
index 00000000000..be0912d2d47
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd
@@ -0,0 +1,14 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libc.stdint cimport uint32_t, uint64_t
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+
+ctypedef fused ColumnOrScalar:
+    Column
+    Scalar
+
+cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=*)
+
+cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=*)
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
new file mode 100644
index 00000000000..3434ef3d7da
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
@@ -0,0 +1,115 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libc.stdint cimport uint32_t, uint64_t
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.nvtext.minhash cimport (
+    minhash as cpp_minhash,
+    minhash64 as cpp_minhash64,
+)
+from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+
+from cython.operator import dereference
+
+
+cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4):
+    """
+    Returns the minhash values for each string per seed.
+    This function uses MurmurHash3_x86_32 for the hash algorithm.
+
+    For details, see :cpp:func:`cudf::nvtext::minhash`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to compute minhash
+    seeds : Column or Scalar
+        Seed value(s) used for the hash algorithm.
+    width : size_type
+        Character width of the substrings to hash;
+        Default is 4 characters.
+
+    Returns
+    -------
+    Column
+        List column of minhash values for each string per seed
+    """
+    cdef unique_ptr[column] c_result
+    cdef numeric_scalar[uint32_t]* cpp_seed
+
+    if ColumnOrScalar is Column:
+        with nogil:
+            c_result = move(
+                cpp_minhash(
+                    input.view(),
+                    seeds.view(),
+                    width
+                )
+            )
+    elif ColumnOrScalar is Scalar:
+        cpp_seed = <numeric_scalar[uint32_t]*>seeds.c_obj.get()
+        with nogil:
+            c_result = move(
+                cpp_minhash(
+                    input.view(),
+                    dereference(cpp_seed),
+                    width
+                )
+            )
+    else:
+        raise ValueError("seeds must be a Column or Scalar")
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4):
+    """
+    Returns the minhash values for each string per seed.
+    This function uses MurmurHash3_x64_128 for the hash algorithm.
+
+    For details, see :cpp:func:`cudf::nvtext::minhash64`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to compute minhash
+    seeds : Column or Scalar
+        Seed value(s) used for the hash algorithm.
+    width : size_type
+        Character width of the substrings to hash;
+        Default is 4 characters.
+
+    Returns
+    -------
+    Column
+        List column of minhash values for each string per seed
+    """
+    cdef unique_ptr[column] c_result
+    cdef numeric_scalar[uint64_t]* cpp_seed
+
+    if ColumnOrScalar is Column:
+        with nogil:
+            c_result = move(
+                cpp_minhash64(
+                    input.view(),
+                    seeds.view(),
+                    width
+                )
+            )
+    elif ColumnOrScalar is Scalar:
+        cpp_seed = <numeric_scalar[uint64_t]*>seeds.c_obj.get()
+        with nogil:
+            c_result = move(
+                cpp_minhash64(
+                    input.view(),
+                    dereference(cpp_seed),
+                    width
+                )
+            )
+    else:
+        raise ValueError("seeds must be a Column or Scalar")
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
new file mode 100644
index 00000000000..f1bfe70ac05
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
@@ -0,0 +1,25 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pylibcudf as plc
+import pytest
+
+
+@pytest.fixture(scope="module")
+def input_data():
+    input_arr = pa.array(["foo", "bar", "foo foo", "bar bar"])
+    seeds = pa.array([2, 3, 4, 5], pa.uint32())
+    return input_arr, seeds
+
+
+@pytest.mark.parametrize("width", [5, 12])
+def test_minhash(input_data, width):
+    input_arr, seeds = input_data
+    result = plc.nvtext.minhash.minhash(
+        plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds), width
+    )
+    pa_result = plc.interop.to_arrow(result)
+    assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr))
+    assert pa_result.type == pa.list_(
+        pa.field("element", pa.uint32(), nullable=False)
+    )

From bfa583e60d008596b621777c30ab7946ede80a82 Mon Sep 17 00:00:00 2001
From: Matthew Murray <matthewmurray711@gmail.com>
Date: Tue, 8 Oct 2024 17:43:31 -0700
Subject: [PATCH 7/9] clean up, add missing tests

---
 .../api_docs/pylibcudf/nvtext/index.rst       |   1 +
 .../api_docs/pylibcudf/nvtext/minhash.rst     |   6 +
 python/cudf/cudf/_lib/nvtext/minhash.pyx      |  14 +-
 .../pylibcudf/libcudf/nvtext/minhash.pxd      |  10 ++
 python/pylibcudf/pylibcudf/nvtext/minhash.pxd |   4 +
 python/pylibcudf/pylibcudf/nvtext/minhash.pyx | 135 ++++++++++++------
 .../pylibcudf/tests/test_nvtext_minhash.py    |  45 ++++--
 7 files changed, 154 insertions(+), 61 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/minhash.rst

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
index 6300f77d686..f6caabe324d 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
@@ -7,3 +7,4 @@ nvtext
     edit_distance
     generate_ngrams
     jaccard
+    minhash
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/minhash.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/minhash.rst
new file mode 100644
index 00000000000..b8ec02fca35
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/minhash.rst
@@ -0,0 +1,6 @@
+=======
+minhash
+=======
+
+.. automodule:: pylibcudf.nvtext.minhash
+   :members:
diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx
index c2886709402..5e39cafa47b 100644
--- a/python/cudf/cudf/_lib/nvtext/minhash.pyx
+++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx
@@ -8,9 +8,9 @@ from pylibcudf import nvtext
 
 
 @acquire_spill_lock()
-def minhash(Column strings, Column seeds, int width):
+def minhash(Column input, Column seeds, int width=4):
     result = nvtext.minhash.minhash(
-        strings.to_pylibcudf(mode="read"),
+        input.to_pylibcudf(mode="read"),
         seeds.to_pylibcudf(mode="read"),
         width,
     )
@@ -18,9 +18,9 @@ def minhash(Column strings, Column seeds, int width):
 
 
 @acquire_spill_lock()
-def minhash64(Column strings, Column seeds, int width):
+def minhash64(Column input, Column seeds, int width=4):
     result = nvtext.minhash.minhash64(
-        strings.to_pylibcudf(mode="read"),
+        input.to_pylibcudf(mode="read"),
         seeds.to_pylibcudf(mode="read"),
         width,
     )
@@ -29,19 +29,17 @@ def minhash64(Column strings, Column seeds, int width):
 
 @acquire_spill_lock()
 def word_minhash(Column input, Column seeds):
-    result = nvtext.minhash.minhash(
+    result = nvtext.minhash.word_minhash(
         input.to_pylibcudf(mode="read"),
         seeds.to_pylibcudf(mode="read"),
-        4,
     )
     return Column.from_pylibcudf(result)
 
 
 @acquire_spill_lock()
 def word_minhash64(Column input, Column seeds):
-    result = nvtext.minhash.minhash64(
+    result = nvtext.minhash.word_minhash64(
         input.to_pylibcudf(mode="read"),
         seeds.to_pylibcudf(mode="read"),
-        4,
     )
     return Column.from_pylibcudf(result)
diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
index b4d4733e962..c26d92551eb 100644
--- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
@@ -34,3 +34,13 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil:
         const numeric_scalar[uint64_t] seed,
         const size_type width,
     ) except +
+
+    cdef unique_ptr[column] word_minhash(
+        const column_view &input,
+        const column_view &seeds
+    ) except +
+
+    cdef unique_ptr[column] word_minhash64(
+        const column_view &input,
+        const column_view &seeds
+    ) except +
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd
index be0912d2d47..97e8c9dc83c 100644
--- a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd
+++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd
@@ -12,3 +12,7 @@ ctypedef fused ColumnOrScalar:
 cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=*)
 
 cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=*)
+
+cpdef Column word_minhash(Column input, Column seeds)
+
+cpdef Column word_minhash64(Column input, Column seeds)
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
index 3434ef3d7da..5fabf6a3f89 100644
--- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
+++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
@@ -8,6 +8,8 @@ from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.nvtext.minhash cimport (
     minhash as cpp_minhash,
     minhash64 as cpp_minhash64,
+    word_minhash as cpp_word_minhash,
+    word_minhash64 as cpp_word_minhash64,
 )
 from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
 from pylibcudf.libcudf.types cimport size_type
@@ -21,7 +23,7 @@ cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4):
     Returns the minhash values for each string per seed.
     This function uses MurmurHash3_x86_32 for the hash algorithm.
 
-    For details, see :cpp:func:`cudf::nvtext::minhash`.
+    For details, see :cpp:func:`minhash`.
 
     Parameters
     ----------
@@ -39,29 +41,19 @@ cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4):
         List column of minhash values for each string per seed
     """
     cdef unique_ptr[column] c_result
-    cdef numeric_scalar[uint32_t]* cpp_seed
-
-    if ColumnOrScalar is Column:
-        with nogil:
-            c_result = move(
-                cpp_minhash(
-                    input.view(),
-                    seeds.view(),
-                    width
-                )
-            )
-    elif ColumnOrScalar is Scalar:
-        cpp_seed = <numeric_scalar[uint32_t]*>seeds.c_obj.get()
-        with nogil:
-            c_result = move(
-                cpp_minhash(
-                    input.view(),
-                    dereference(cpp_seed),
-                    width
-                )
+
+    if not isinstance(seeds, (Column, Scalar)):
+        raise TypeError("Must pass a Column or Scalar")
+
+    with nogil:
+        c_result = move(
+            cpp_minhash(
+                input.view(),
+                seeds.view() if ColumnOrScalar is Column else
+                dereference(<numeric_scalar[uint32_t]*>seeds.c_obj.get()),
+                width
             )
-    else:
-        raise ValueError("seeds must be a Column or Scalar")
+        )
 
     return Column.from_libcudf(move(c_result))
 
@@ -70,7 +62,7 @@ cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4):
     Returns the minhash values for each string per seed.
     This function uses MurmurHash3_x64_128 for the hash algorithm.
 
-    For details, see :cpp:func:`cudf::nvtext::minhash64`.
+    For details, see :cpp:func:`minhash64`.
 
     Parameters
     ----------
@@ -88,28 +80,81 @@ cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4):
         List column of minhash values for each string per seed
     """
     cdef unique_ptr[column] c_result
-    cdef numeric_scalar[uint64_t]* cpp_seed
-
-    if ColumnOrScalar is Column:
-        with nogil:
-            c_result = move(
-                cpp_minhash64(
-                    input.view(),
-                    seeds.view(),
-                    width
-                )
+
+    if not isinstance(seeds, (Column, Scalar)):
+        raise TypeError("Must pass a Column or Scalar")
+
+    with nogil:
+        c_result = move(
+            cpp_minhash64(
+                input.view(),
+                seeds.view() if ColumnOrScalar is Column else
+                dereference(<numeric_scalar[uint64_t]*>seeds.c_obj.get()),
+                width
             )
-    elif ColumnOrScalar is Scalar:
-        cpp_seed = <numeric_scalar[uint64_t]*>seeds.c_obj.get()
-        with nogil:
-            c_result = move(
-                cpp_minhash64(
-                    input.view(),
-                    dereference(cpp_seed),
-                    width
-                )
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column word_minhash(Column input, Column seeds):
+    """
+    Returns the minhash values for each row of strings per seed.
+    This function uses MurmurHash3_x86_32 for the hash algorithm.
+
+    For details, see :cpp:func:`word_minhash`.
+
+    Parameters
+    ----------
+    input : Column
+        Lists column of strings to compute minhash
+    seeds : Column
+        Seed values used for the hash algorithm.
+
+    Returns
+    -------
+    Column
+        List column of minhash values for each string per seed
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_word_minhash(
+                input.view(),
+                seeds.view()
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column word_minhash64(Column input, Column seeds):
+    """
+    Returns the minhash values for each row of strings per seed.
+    This function uses MurmurHash3_x64_128 for the hash algorithm though
+    only the first 64-bits of the hash are used in computing the output.
+
+    For details, see :cpp:func:`word_minhash64`.
+
+    Parameters
+    ----------
+    input : Column
+        Lists column of strings to compute minhash
+    seeds : Column
+        Seed values used for the hash algorithm.
+
+    Returns
+    -------
+    Column
+        List column of minhash values for each string per seed
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_word_minhash64(
+                input.view(),
+                seeds.view()
             )
-    else:
-        raise ValueError("seeds must be a Column or Scalar")
+        )
 
     return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
index f1bfe70ac05..4e389a63f90 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
@@ -5,21 +5,50 @@
 import pytest
 
 
-@pytest.fixture(scope="module")
-def input_data():
+@pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()])
+def minhash_input_data(request):
     input_arr = pa.array(["foo", "bar", "foo foo", "bar bar"])
-    seeds = pa.array([2, 3, 4, 5], pa.uint32())
-    return input_arr, seeds
+    seeds = pa.array([2, 3, 4, 5], request.param)
+    return input_arr, seeds, request.param
+
+
+@pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()])
+def word_minhash_input_data(request):
+    input_arr = pa.array([["foo", "bar"], ["foo foo", "bar bar"]])
+    seeds = pa.array([2, 3, 4, 5], request.param)
+    return input_arr, seeds, request.param
 
 
 @pytest.mark.parametrize("width", [5, 12])
-def test_minhash(input_data, width):
-    input_arr, seeds = input_data
-    result = plc.nvtext.minhash.minhash(
+def test_minhash(minhash_input_data, width):
+    input_arr, seeds, seed_type = minhash_input_data
+    minhash_func = (
+        plc.nvtext.minhash.minhash
+        if seed_type == pa.uint32()
+        else plc.nvtext.minhash.minhash64
+    )
+    result = minhash_func(
         plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds), width
     )
     pa_result = plc.interop.to_arrow(result)
     assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr))
     assert pa_result.type == pa.list_(
-        pa.field("element", pa.uint32(), nullable=False)
+        pa.field("element", seed_type, nullable=False)
+    )
+
+
+def test_word_minhash(word_minhash_input_data):
+    input_arr, seeds, seed_type = word_minhash_input_data
+    word_minhash_func = (
+        plc.nvtext.minhash.word_minhash
+        if seed_type == pa.uint32()
+        else plc.nvtext.minhash.word_minhash64
+    )
+    result = word_minhash_func(
+        plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds)
+    )
+    pa_result = plc.interop.to_arrow(result)
+    assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr))
+    assert pa_result.type == pa.list_(
+        pa.field("element", seed_type, nullable=False)
     )

From cd710f4c9a37face4a877dcde1cfb9aa9c30c12e Mon Sep 17 00:00:00 2001
From: Matthew Murray <matthewmurray711@gmail.com>
Date: Fri, 11 Oct 2024 05:25:29 -0700
Subject: [PATCH 8/9] remove device_span

---
 python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd b/python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd
index 36876972a92..7e591e96373 100644
--- a/python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+# Copyright (c) 2021, NVIDIA CORPORATION.
 
 from libcpp.vector cimport vector
 
@@ -7,6 +7,3 @@ cdef extern from "cudf/utilities/span.hpp" namespace "cudf" nogil:
     cdef cppclass host_span[T]:
         host_span() except +
         host_span(vector[T]) except +
-    cdef cppclass device_span[T]:
-        device_span()
-        device_span(device_span other) except +

From a419ab5ac5626ea043b109dd037117feb462b327 Mon Sep 17 00:00:00 2001
From: Matthew Murray <matthewmurray711@gmail.com>
Date: Fri, 11 Oct 2024 09:12:34 -0700
Subject: [PATCH 9/9] remove import

---
 python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
index c26d92551eb..41250037dcf 100644
--- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
@@ -6,7 +6,6 @@ from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.column.column_view cimport column_view
 from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
 from pylibcudf.libcudf.types cimport size_type
-from pylibcudf.libcudf.utilities.span cimport device_span
 
 
 cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: