diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst index 6300f77d686..f6caabe324d 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst @@ -7,3 +7,4 @@ nvtext edit_distance generate_ngrams jaccard + minhash diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/minhash.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/minhash.rst new file mode 100644 index 00000000000..b8ec02fca35 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/minhash.rst @@ -0,0 +1,6 @@ +======= +minhash +======= + +.. automodule:: pylibcudf.nvtext.minhash + :members: diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index 59cb8d51440..5e39cafa47b 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -2,93 +2,44 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.minhash cimport ( - minhash as cpp_minhash, - minhash64 as cpp_minhash64, - word_minhash as cpp_word_minhash, - word_minhash64 as cpp_word_minhash64, -) -from pylibcudf.libcudf.types cimport size_type - from cudf._lib.column cimport Column - -@acquire_spill_lock() -def minhash(Column strings, Column seeds, int width): - - cdef column_view c_strings = strings.view() - cdef size_type c_width = width - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_minhash( - c_strings, - c_seeds, - c_width - ) - ) - - return Column.from_unique_ptr(move(c_result)) +from pylibcudf import nvtext @acquire_spill_lock() -def minhash64(Column strings, 
Column seeds, int width): - - cdef column_view c_strings = strings.view() - cdef size_type c_width = width - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result +def minhash(Column input, Column seeds, int width=4): + result = nvtext.minhash.minhash( + input.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + width, + ) + return Column.from_pylibcudf(result) - with nogil: - c_result = move( - cpp_minhash64( - c_strings, - c_seeds, - c_width - ) - ) - return Column.from_unique_ptr(move(c_result)) +@acquire_spill_lock() +def minhash64(Column input, Column seeds, int width=4): + result = nvtext.minhash.minhash64( + input.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + width, + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def word_minhash(Column input, Column seeds): - - cdef column_view c_input = input.view() - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_word_minhash( - c_input, - c_seeds - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.minhash.word_minhash( + input.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def word_minhash64(Column input, Column seeds): - - cdef column_view c_input = input.view() - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_word_minhash64( - c_input, - c_seeds - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.minhash.word_minhash64( + input.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(result) diff --git a/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd index a09b6c01392..def292148c5 100644 --- a/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd +++ 
b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd @@ -4,7 +4,7 @@ from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector from pylibcudf.libcudf.column.column cimport column, column_view from pylibcudf.libcudf.table.table cimport table, table_view -from pylibcudf.libcudf.utilities.host_span cimport host_span +from pylibcudf.libcudf.utilities.span cimport host_span from rmm.librmm.device_buffer cimport device_buffer diff --git a/python/pylibcudf/pylibcudf/libcudf/groupby.pxd b/python/pylibcudf/pylibcudf/libcudf/groupby.pxd index 848462131fe..17ea33a2066 100644 --- a/python/pylibcudf/pylibcudf/libcudf/groupby.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/groupby.pxd @@ -22,7 +22,6 @@ from pylibcudf.libcudf.types cimport ( size_type, sorted, ) -from pylibcudf.libcudf.utilities.host_span cimport host_span # workaround for https://github.com/cython/cython/issues/3885 ctypedef const scalar constscalar diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index f2dd22f43aa..41250037dcf 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -1,13 +1,21 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. 
+from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar from pylibcudf.libcudf.types cimport size_type cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: + cdef unique_ptr[column] minhash( + const column_view &strings, + const numeric_scalar[uint32_t] seed, + const size_type width, + ) except + + cdef unique_ptr[column] minhash( const column_view &strings, const column_view &seeds, @@ -20,6 +28,12 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const size_type width, ) except + + cdef unique_ptr[column] minhash64( + const column_view &strings, + const numeric_scalar[uint64_t] seed, + const size_type width, + ) except + + cdef unique_ptr[column] word_minhash( const column_view &input, const column_view &seeds diff --git a/python/pylibcudf/pylibcudf/libcudf/utilities/host_span.pxd b/python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd similarity index 100% rename from python/pylibcudf/pylibcudf/libcudf/utilities/host_span.pxd rename to python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt index 9913e1fbadb..7fd65beeeb0 100644 --- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. 
# ============================================================================= -set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx) +set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd index 5f1762b1e3d..9eed1da1ab5 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport edit_distance, generate_ngrams, jaccard +from . cimport edit_distance, generate_ngrams, jaccard, minhash __all__ = [ "edit_distance", "generate_ngrams", "jaccard", + "minhash" ] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py index 1c0ddb1e5a4..a3a2363f7ef 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.py +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import edit_distance, generate_ngrams, jaccard +from . import edit_distance, generate_ngrams, jaccard, minhash __all__ = [ "edit_distance", "generate_ngrams", "jaccard", + "minhash", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd new file mode 100644 index 00000000000..97e8c9dc83c --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+
+from libc.stdint cimport uint32_t, uint64_t
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+
+ctypedef fused ColumnOrScalar:
+    Column
+    Scalar
+
+cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=*)
+
+cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=*)
+
+cpdef Column word_minhash(Column input, Column seeds)
+
+cpdef Column word_minhash64(Column input, Column seeds)
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
new file mode 100644
index 00000000000..5fabf6a3f89
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
@@ -0,0 +1,160 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libc.stdint cimport uint32_t, uint64_t
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.nvtext.minhash cimport (
+    minhash as cpp_minhash,
+    minhash64 as cpp_minhash64,
+    word_minhash as cpp_word_minhash,
+    word_minhash64 as cpp_word_minhash64,
+)
+from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+
+from cython.operator import dereference
+
+
+cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4):
+    """
+    Returns the minhash values for each string per seed.
+    This function uses MurmurHash3_x86_32 for the hash algorithm.
+
+    For details, see :cpp:func:`minhash`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to compute minhash
+    seeds : Column or Scalar
+        Seed value(s) for the hash algorithm; a Scalar must be uint32.
+    width : size_type
+        Character width to use for the substrings;
+        Default is 4 characters.
+
+    Returns
+    -------
+    Column
+        List column of minhash values for each string per seed
+    """
+    cdef unique_ptr[column] c_result
+
+    if not isinstance(seeds, (Column, Scalar)):
+        raise TypeError("Must pass a Column or Scalar")
+
+    with nogil:
+        c_result = move(
+            cpp_minhash(
+                input.view(),
+                seeds.view() if ColumnOrScalar is Column else
+                dereference(<numeric_scalar[uint32_t]*>seeds.c_obj.get()),
+                width
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4):
+    """
+    Returns the minhash values for each string per seed.
+    This function uses MurmurHash3_x64_128 for the hash algorithm.
+
+    For details, see :cpp:func:`minhash64`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to compute minhash
+    seeds : Column or Scalar
+        Seed value(s) for the hash algorithm; a Scalar must be uint64.
+    width : size_type
+        Character width to use for the substrings;
+        Default is 4 characters.
+
+    Returns
+    -------
+    Column
+        List column of minhash values for each string per seed
+    """
+    cdef unique_ptr[column] c_result
+
+    if not isinstance(seeds, (Column, Scalar)):
+        raise TypeError("Must pass a Column or Scalar")
+
+    with nogil:
+        c_result = move(
+            cpp_minhash64(
+                input.view(),
+                seeds.view() if ColumnOrScalar is Column else
+                dereference(<numeric_scalar[uint64_t]*>seeds.c_obj.get()),
+                width
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column word_minhash(Column input, Column seeds):
+    """
+    Returns the minhash values for each row of strings per seed.
+    This function uses MurmurHash3_x86_32 for the hash algorithm.
+
+    For details, see :cpp:func:`word_minhash`.
+
+    Parameters
+    ----------
+    input : Column
+        Lists column of strings to compute minhash
+    seeds : Column
+        Seed values used for the hash algorithm.
+
+    Returns
+    -------
+    Column
+        List column of minhash values for each string per seed
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_word_minhash(
+                input.view(),
+                seeds.view()
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column word_minhash64(Column input, Column seeds):
+    """
+    Returns the minhash values for each row of strings per seed.
+    This function uses MurmurHash3_x64_128 for the hash algorithm though
+    only the first 64-bits of the hash are used in computing the output.
+
+    For details, see :cpp:func:`word_minhash64`.
+
+    Parameters
+    ----------
+    input : Column
+        Lists column of strings to compute minhash
+    seeds : Column
+        Seed values used for the hash algorithm.
+
+    Returns
+    -------
+    Column
+        List column of minhash values for each string per seed
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_word_minhash64(
+                input.view(),
+                seeds.view()
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
new file mode 100644
index 00000000000..4e389a63f90
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
@@ -0,0 +1,54 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pylibcudf as plc
+import pytest
+
+
+@pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()])
+def minhash_input_data(request):
+    input_arr = pa.array(["foo", "bar", "foo foo", "bar bar"])
+    seeds = pa.array([2, 3, 4, 5], request.param)
+    return input_arr, seeds, request.param
+
+
+@pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()])
+def word_minhash_input_data(request):
+    input_arr = pa.array([["foo", "bar"], ["foo foo", "bar bar"]])
+    seeds = pa.array([2, 3, 4, 5], request.param)
+    return input_arr, seeds, request.param
+
+
+@pytest.mark.parametrize("width", [5, 12])
+def test_minhash(minhash_input_data, width):
+    input_arr, seeds, seed_type = minhash_input_data
+    minhash_func = (
+        plc.nvtext.minhash.minhash
+        if seed_type == pa.uint32()
+        else plc.nvtext.minhash.minhash64
+    )
+    result = minhash_func(
+        plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds), width
+    )
+    pa_result = plc.interop.to_arrow(result)
+    assert [len(got) for got in pa_result] == [len(seeds)] * len(input_arr)
+    assert pa_result.type == pa.list_(
+        pa.field("element", seed_type, nullable=False)
+    )
+
+
+def test_word_minhash(word_minhash_input_data):
+    input_arr, seeds, seed_type = word_minhash_input_data
+    word_minhash_func = (
+        plc.nvtext.minhash.word_minhash
+        if seed_type == pa.uint32()
+        else plc.nvtext.minhash.word_minhash64
+    )
+    result = word_minhash_func(
+        plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds)
+    )
+    pa_result = plc.interop.to_arrow(result)
+    assert [len(got) for got in pa_result] == [len(seeds)] * len(input_arr)
+    assert pa_result.type == pa.list_(
+        pa.field("element", seed_type, nullable=False)
+    )