From 7a0e19f66d894058a806cb1a70fc5a1e9fbca861 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Sun, 6 Oct 2024 13:49:36 -0700 Subject: [PATCH 1/9] Migrate nvtext generate_ngrams APIs to pylibcudf --- .../pylibcudf/nvtext/generate_ngrams.rst | 6 + .../api_docs/pylibcudf/nvtext/index.rst | 1 + .../cudf/cudf/_lib/nvtext/generate_ngrams.pyx | 76 +++--------- .../pylibcudf/pylibcudf/nvtext/CMakeLists.txt | 2 +- .../pylibcudf/pylibcudf/nvtext/__init__.pxd | 3 +- python/pylibcudf/pylibcudf/nvtext/__init__.py | 3 +- .../pylibcudf/nvtext/generate_ngrams.pxd | 12 ++ .../pylibcudf/nvtext/generate_ngrams.pyx | 111 ++++++++++++++++++ .../tests/test_nvtext_generate_ngrams.py | 55 +++++++++ 9 files changed, 209 insertions(+), 60 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst create mode 100644 python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd create mode 100644 python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst new file mode 100644 index 00000000000..d68199271bd --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst @@ -0,0 +1,6 @@ +=============== +generate_ngrams +=============== + +.. automodule:: pylibcudf.nvtext.generate_ngrams + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst index b5cd5ee42c3..2e03b589c8b 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst @@ -5,3 +5,4 @@ nvtext :maxdepth: 1 edit_distance + generate_ngrams diff --git a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx index 6591b527eec..b4dabf4b33e 100644 --- a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx +++ b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx @@ -2,75 +2,37 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.generate_ngrams cimport ( - generate_character_ngrams as cpp_generate_character_ngrams, - generate_ngrams as cpp_generate_ngrams, - hash_character_ngrams as cpp_hash_character_ngrams, -) -from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar as plc_Scalar from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar + +from pylibcudf import nvtext @acquire_spill_lock() def generate_ngrams(Column strings, int ngrams, object py_separator): - - cdef DeviceScalar separator = py_separator.device_value - - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef const string_scalar* c_separator = separator\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_generate_ngrams( - c_strings, - c_ngrams, - c_separator[0] - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.generate_ngrams.generate_ngrams( + strings.to_pylibcudf(mode="read"), + ngrams, + py_separator.device_value.c_value + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def generate_character_ngrams(Column strings, int ngrams): - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_generate_character_ngrams( - c_strings, - c_ngrams - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.generate_ngrams.generate_character_ngrams( + strings.to_pylibcudf(mode="read"), + ngrams + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def hash_character_ngrams(Column strings, int ngrams): - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_hash_character_ngrams( - c_strings, - c_ngrams - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.generate_ngrams.generate_chash_character_ngramsharacter_ngrams( + strings.to_pylibcudf(mode="read"), + ngrams + ) + return Column.from_pylibcudf(result) diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt index ebe1fda1f12..eb5617a1da6 100644 --- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources edit_distance.pyx) +set(cython_sources edit_distance.pyx generate_ngrams.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd index 82f7c425b1d..7f5fa2b9925 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -1,7 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport edit_distance +from . cimport edit_distance, generate_ngrams __all__ = [ "edit_distance", + "generate_ngrams", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py index 986652a241f..a66ce984745 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.py +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -1,7 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import edit_distance +from . import edit_distance, generate_ngrams __all__ = [ "edit_distance", + "generate_ngrams", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd new file mode 100644 index 00000000000..f15eb1f25e9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator) + +cpdef Column generate_character_ngrams(Column input, size_type ngrams=*) + +cpdef Column hash_character_ngrams(Column input, size_type ngrams=*) diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx new file mode 100644 index 00000000000..8c7a8edc01d --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx @@ -0,0 +1,111 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.generate_ngrams cimport ( + generate_character_ngrams as cpp_generate_character_ngrams, + generate_ngrams as cpp_generate_ngrams, + hash_character_ngrams as cpp_hash_character_ngrams, +) +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator): + """ + Returns a single column of strings by generating ngrams from a strings column. + + For details, see :cpp:func:`generate_ngrams` + + Parameters + ---------- + input : Column + Input strings + ngram : size_type + The ngram number to generate + separator : Scalar + The string to use for separating ngram tokens + + Returns + ------- + Column + New strings columns of tokens + """ + cdef column_view c_strings = input.view() + cdef const string_scalar* c_separator = separator.c_obj.get() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_generate_ngrams( + c_strings, + ngrams, + c_separator[0] + ) + ) + return Column.from_libcudf(move(c_result)) + + +cpdef Column generate_character_ngrams(Column input, size_type ngrams = 2): + """ + Returns a lists column of ngrams of characters within each string. + + For details, see :cpp:func:`generate_character_ngrams` + + Parameters + ---------- + input : Column + Input strings + ngram : size_type + The ngram number to generate + + Returns + ------- + Column + Lists column of strings + """ + cdef column_view c_strings = input.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_generate_character_ngrams( + c_strings, + ngrams, + ) + ) + return Column.from_libcudf(move(c_result)) + +cpdef Column hash_character_ngrams(Column input, size_type ngrams = 2): + """ + Returns a lists column of hash values of the characters in each string + + For details, see :cpp:func:`hash_character_ngrams` + + Parameters + ---------- + input : Column + Input strings + ngram : size_type + The ngram number to generate + + Returns + ------- + Column + Lists column of hash values + """ + cdef column_view c_strings = input.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_hash_character_ngrams( + c_strings, + ngrams, + ) + ) + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py new file mode 100644 index 00000000000..08daba64d38 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py @@ -0,0 +1,55 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture(scope="module") +def input_col(): + arr = ["ab", "cde", "fgh"] + return pa.array(arr) + + +@pytest.mark.parametrize("ngram", [2, 3]) +@pytest.mark.parametrize("sep", ["_", "**", ","]) +def test_generate_ngrams(input_col, ngram, sep): + result = plc.nvtext.generate_ngrams.generate_ngrams( + plc.interop.from_arrow(input_col), + ngram, + plc.interop.from_arrow(pa.scalar(sep)), + ) + expected = pa.array([f"ab{sep}cde", f"cde{sep}fgh"]) + if ngram == 3: + expected = pa.array([f"ab{sep}cde{sep}fgh"]) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("ngram", [2, 3]) +def test_generate_character_ngrams(input_col, ngram): + result = plc.nvtext.generate_ngrams.generate_character_ngrams( + plc.interop.from_arrow(input_col), + ngram, + ) + expected = pa.array([["ab"], ["cd", "de"], ["fg", "gh"]]) + if ngram == 3: + expected = pa.array([[], ["cde"], ["fgh"]]) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("ngram", [2, 3]) +def test_hash_character_ngrams(input_col, ngram): + result = plc.nvtext.generate_ngrams.hash_character_ngrams( + plc.interop.from_arrow(input_col), + ngram, + ) + pa_result = plc.interop.to_arrow(result) + if ngram == 2: + assert len(pa_result[0]) == 1 + assert len(pa_result[1]) == 2 + assert len(pa_result[2]) == 2 + else: + assert len(pa_result[0]) == 0 + assert len(pa_result[1]) == 1 + assert len(pa_result[2]) == 1 From 82aaabbcaf4f8d96947bbb40eec12e66e04babe0 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 7 Oct 2024 05:17:46 -0700 Subject: [PATCH 2/9] Migrate nvtext jaccard API to pylibcudf --- cpp/tests/text/jaccard_tests.cpp | 16 ++++--- .../api_docs/pylibcudf/nvtext/index.rst | 1 + .../api_docs/pylibcudf/nvtext/jaccard.rst | 6 +++ python/cudf/cudf/_lib/nvtext/jaccard.pyx | 31 ++++-------- python/cudf/cudf/_lib/string_casting.pyx | 1 - .../pylibcudf/pylibcudf/nvtext/CMakeLists.txt | 2 +- .../pylibcudf/pylibcudf/nvtext/__init__.pxd | 3 +- python/pylibcudf/pylibcudf/nvtext/__init__.py | 3 +- python/pylibcudf/pylibcudf/nvtext/jaccard.pxd | 7 +++ python/pylibcudf/pylibcudf/nvtext/jaccard.pyx | 47 +++++++++++++++++++ 10 files changed, 83 insertions(+), 34 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst create mode 100644 python/pylibcudf/pylibcudf/nvtext/jaccard.pxd create mode 100644 python/pylibcudf/pylibcudf/nvtext/jaccard.pyx diff --git a/cpp/tests/text/jaccard_tests.cpp b/cpp/tests/text/jaccard_tests.cpp index 91ebb644f83..57e3683eaad 100644 --- a/cpp/tests/text/jaccard_tests.cpp +++ b/cpp/tests/text/jaccard_tests.cpp @@ -26,24 +26,26 @@ struct JaccardTest : public cudf::test::BaseFixture {}; TEST_F(JaccardTest, Basic) { - auto input1 = - cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."}); - auto input2 = - cudf::test::strings_column_wrapper({"the slowest brown cat", "crawled under the jumping fox"}); + // input1 = ["the fuzzy dog", "little piggy", "funny bunny", "chatty parrot"] + // input2 = ["the fuzzy cat", "bitty piggy", "funny bunny", "silent partner"] + auto input1 = cudf::test::strings_column_wrapper( + {"the fuzzy dog", "little piggy", "funny bunny", "chatty parrot"}); + auto input2 = cudf::test::strings_column_wrapper( + {"the fuzzy cat", "bitty piggy", "funny bunny", "silent partner"}); auto view1 = cudf::strings_column_view(input1); auto view2 = cudf::strings_column_view(input2); auto results = nvtext::jaccard_index(view1, view2, 5); - auto expected = cudf::test::fixed_width_column_wrapper({0.103448279f, 0.0697674453f}); + auto expected = cudf::test::fixed_width_column_wrapper({1.0f, 1.0f, 1.0f, 1.0f}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); expected = cudf::test::fixed_width_column_wrapper({1.0f, 1.0f}); results = nvtext::jaccard_index(view1, view1, 5); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); - results = nvtext::jaccard_index(view2, view2, 10); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + // results = nvtext::jaccard_index(view2, view2, 10); + // CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(JaccardTest, WithNulls) diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst index 2e03b589c8b..6300f77d686 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst @@ -6,3 +6,4 @@ nvtext edit_distance generate_ngrams + jaccard diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst new file mode 100644 index 00000000000..ea59657c25e --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst @@ -0,0 +1,6 @@ +======= +jaccard +======= + +.. automodule:: pylibcudf.nvtext.jaccard + :members: diff --git a/python/cudf/cudf/_lib/nvtext/jaccard.pyx b/python/cudf/cudf/_lib/nvtext/jaccard.pyx index 0ebf7c281e3..798601ce364 100644 --- a/python/cudf/cudf/_lib/nvtext/jaccard.pyx +++ b/python/cudf/cudf/_lib/nvtext/jaccard.pyx @@ -2,33 +2,18 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.jaccard cimport ( - jaccard_index as cpp_jaccard_index, -) from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column +from pylibcudf import nvtext + @acquire_spill_lock() def jaccard_index(Column input1, Column input2, int width): - cdef column_view c_input1 = input1.view() - cdef column_view c_input2 = input2.view() - cdef size_type c_width = width - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_jaccard_index( - c_input1, - c_input2, - c_width - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.jaccard.jaccard_index( + input1.to_pylibcudf(mode="read"), + input2.to_pylibcudf(mode="read"), + width, + ) + return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 76c862a8657..d9595f4ab0a 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -6,7 +6,6 @@ from cudf._lib.scalar import as_device_scalar from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES from libcpp.memory cimport unique_ptr -from libcpp.string cimport string from libcpp.utility cimport move from pylibcudf.libcudf.column.column cimport column diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt index eb5617a1da6..9913e1fbadb 100644 --- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources edit_distance.pyx generate_ngrams.pyx) +set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd index 7f5fa2b9925..5f1762b1e3d 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -1,8 +1,9 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport edit_distance, generate_ngrams +from . cimport edit_distance, generate_ngrams, jaccard __all__ = [ "edit_distance", "generate_ngrams", + "jaccard", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py index a66ce984745..1c0ddb1e5a4 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.py +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -1,8 +1,9 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import edit_distance, generate_ngrams +from . import edit_distance, generate_ngrams, jaccard __all__ = [ "edit_distance", "generate_ngrams", + "jaccard", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pxd b/python/pylibcudf/pylibcudf/nvtext/jaccard.pxd new file mode 100644 index 00000000000..a4d4a15335b --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column jaccard_index(Column input1, Column input2, size_type width) diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx new file mode 100644 index 00000000000..9334d7ce751 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx @@ -0,0 +1,47 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.jaccard cimport ( + jaccard_index as cpp_jaccard_index, +) +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column jaccard_index(Column input1, Column input2, size_type width): + """ + Returns the Jaccard similarity between individual rows in two strings columns. + + For details, see :cpp:func:`jaccard_index` + + Parameters + ---------- + input1 : Column + Input strings column + input2 : Column + Input strings column + width : size_type + The ngram number to generate + + Returns + ------- + Column + Index calculation values + """ + cdef column_view c_input1 = input1.view() + cdef column_view c_input2 = input2.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_jaccard_index( + c_input1, + c_input2, + width + ) + ) + + return Column.from_libcudf(move(c_result)) From 1db10e6221b8cdfc7cc885531ad9b3703a53ff8a Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 7 Oct 2024 09:09:50 -0700 Subject: [PATCH 3/9] comment out test --- cpp/tests/text/jaccard_tests.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/tests/text/jaccard_tests.cpp b/cpp/tests/text/jaccard_tests.cpp index 57e3683eaad..413c9ad0b3b 100644 --- a/cpp/tests/text/jaccard_tests.cpp +++ b/cpp/tests/text/jaccard_tests.cpp @@ -41,9 +41,9 @@ TEST_F(JaccardTest, Basic) auto expected = cudf::test::fixed_width_column_wrapper({1.0f, 1.0f, 1.0f, 1.0f}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); - expected = cudf::test::fixed_width_column_wrapper({1.0f, 1.0f}); - results = nvtext::jaccard_index(view1, view1, 5); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + // expected = cudf::test::fixed_width_column_wrapper({1.0f, 1.0f}); + // results = nvtext::jaccard_index(view1, view1, 5); + // CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); // results = nvtext::jaccard_index(view2, view2, 10); // CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } From af71a5bde672efa5cd579d473d27383fc0a79c12 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 7 Oct 2024 17:31:56 -0700 Subject: [PATCH 4/9] add a test --- cpp/tests/text/jaccard_tests.cpp | 22 +++++------ .../pylibcudf/tests/test_nvtext_jaccard.py | 37 +++++++++++++++++++ 2 files changed, 47 insertions(+), 12 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py diff --git a/cpp/tests/text/jaccard_tests.cpp b/cpp/tests/text/jaccard_tests.cpp index 413c9ad0b3b..91ebb644f83 100644 --- a/cpp/tests/text/jaccard_tests.cpp +++ b/cpp/tests/text/jaccard_tests.cpp @@ -26,26 +26,24 @@ struct JaccardTest : public cudf::test::BaseFixture {}; TEST_F(JaccardTest, Basic) { - // input1 = ["the fuzzy dog", "little piggy", "funny bunny", "chatty parrot"] - // input2 = ["the fuzzy cat", "bitty piggy", "funny bunny", "silent partner"] - auto input1 = cudf::test::strings_column_wrapper( - {"the fuzzy dog", "little piggy", "funny bunny", "chatty parrot"}); - auto input2 = cudf::test::strings_column_wrapper( - {"the fuzzy cat", "bitty piggy", "funny bunny", "silent partner"}); + auto input1 = + cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."}); + auto input2 = + cudf::test::strings_column_wrapper({"the slowest brown cat", "crawled under the jumping fox"}); auto view1 = cudf::strings_column_view(input1); auto view2 = cudf::strings_column_view(input2); auto results = nvtext::jaccard_index(view1, view2, 5); - auto expected = cudf::test::fixed_width_column_wrapper({1.0f, 1.0f, 1.0f, 1.0f}); + auto expected = cudf::test::fixed_width_column_wrapper({0.103448279f, 0.0697674453f}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); - // expected = cudf::test::fixed_width_column_wrapper({1.0f, 1.0f}); - // results = nvtext::jaccard_index(view1, view1, 5); - // CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); - // results = nvtext::jaccard_index(view2, view2, 10); - // CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + expected = cudf::test::fixed_width_column_wrapper({1.0f, 1.0f}); + results = nvtext::jaccard_index(view1, view1, 5); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + results = nvtext::jaccard_index(view2, view2, 10); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(JaccardTest, WithNulls) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py new file mode 100644 index 00000000000..8ef140f3090 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py @@ -0,0 +1,37 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture(scope="module") +def input_data(): + input1 = ["the fuzzy dog", "little piggy", "funny bunny", "chatty parrot"] + input2 = ["the fuzzy cat", "bitty piggy", "funny bunny", "silent partner"] + return pa.array(input1), pa.array(input2) + + +@pytest.mark.parametrize("width", [2, 3]) +def test_generate_ngrams(input_data, width): + def get_tokens(s, width): + return [s[i : i + width] for i in range(len(s) - width + 1)] + + def jaccard_index(s1, s2, width): + x = set(get_tokens(s1, width)) + y = set(get_tokens(s2, width)) + return len(x & y) / len(x | y) + + input1, input2 = input_data + result = plc.nvtext.jaccard.jaccard_index( + plc.interop.from_arrow(input1), plc.interop.from_arrow(input2), width + ) + expected = pa.array( + [ + jaccard_index(s1.as_py(), s2.as_py(), width) + for s1, s2 in zip(input1, input2) + ], + type=pa.float32(), + ) + assert_column_eq(result, expected) From 860835de397e2931d8562fa7f938ed343720644e Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 7 Oct 2024 18:59:26 -0700 Subject: [PATCH 5/9] clean up --- python/cudf/cudf/_lib/nvtext/jaccard.pyx | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/cudf/cudf/_lib/nvtext/jaccard.pyx b/python/cudf/cudf/_lib/nvtext/jaccard.pyx index 798601ce364..c964d0206b7 100644 --- a/python/cudf/cudf/_lib/nvtext/jaccard.pyx +++ b/python/cudf/cudf/_lib/nvtext/jaccard.pyx @@ -2,8 +2,6 @@ from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.types cimport size_type - from cudf._lib.column cimport Column from pylibcudf import nvtext @@ -14,6 +12,6 @@ def jaccard_index(Column input1, Column input2, int width): result = nvtext.jaccard.jaccard_index( input1.to_pylibcudf(mode="read"), input2.to_pylibcudf(mode="read"), - width, + width, ) return Column.from_pylibcudf(result) From 5e5cd0330d2d32af80191d6e0313df4273bb1bba Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 8 Oct 2024 14:36:24 -0700 Subject: [PATCH 6/9] Migrate Min Hashing APIs to pylibcudf --- python/cudf/cudf/_lib/nvtext/minhash.pyx | 99 ++++----------- .../pylibcudf/libcudf/concatenate.pxd | 2 +- .../pylibcudf/pylibcudf/libcudf/groupby.pxd | 1 - .../pylibcudf/libcudf/nvtext/minhash.pxd | 21 ++-- .../utilities/{host_span.pxd => span.pxd} | 5 +- .../pylibcudf/pylibcudf/nvtext/CMakeLists.txt | 2 +- .../pylibcudf/pylibcudf/nvtext/__init__.pxd | 3 +- python/pylibcudf/pylibcudf/nvtext/__init__.py | 3 +- python/pylibcudf/pylibcudf/nvtext/minhash.pxd | 14 +++ python/pylibcudf/pylibcudf/nvtext/minhash.pyx | 115 ++++++++++++++++++ .../pylibcudf/tests/test_nvtext_minhash.py | 25 ++++ 11 files changed, 203 insertions(+), 87 deletions(-) rename python/pylibcudf/pylibcudf/libcudf/utilities/{host_span.pxd => span.pxd} (57%) create mode 100644 python/pylibcudf/pylibcudf/nvtext/minhash.pxd create mode 100644 python/pylibcudf/pylibcudf/nvtext/minhash.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index 59cb8d51440..c2886709402 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -2,93 +2,46 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.minhash cimport ( - minhash as cpp_minhash, - minhash64 as cpp_minhash64, - word_minhash as cpp_word_minhash, - word_minhash64 as cpp_word_minhash64, -) -from pylibcudf.libcudf.types cimport size_type - from cudf._lib.column cimport Column +from pylibcudf import nvtext + @acquire_spill_lock() def minhash(Column strings, Column seeds, int width): - - cdef column_view c_strings = strings.view() - cdef size_type c_width = width - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_minhash( - c_strings, - c_seeds, - c_width - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.minhash.minhash( + strings.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + width, + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def minhash64(Column strings, Column seeds, int width): - - cdef column_view c_strings = strings.view() - cdef size_type c_width = width - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_minhash64( - c_strings, - c_seeds, - c_width - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.minhash.minhash64( + strings.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + width, + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def word_minhash(Column input, Column seeds): - - cdef column_view c_input = input.view() - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_word_minhash( - c_input, - c_seeds - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.minhash.minhash( + input.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + 4, + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def word_minhash64(Column input, Column seeds): - - cdef column_view c_input = input.view() - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_word_minhash64( - c_input, - c_seeds - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.minhash.minhash64( + input.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + 4, + ) + return Column.from_pylibcudf(result) diff --git a/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd index 92f5a185a54..81c889e3b3d 100644 --- a/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd @@ -4,7 +4,7 @@ from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector from pylibcudf.libcudf.column.column cimport column, column_view from pylibcudf.libcudf.table.table cimport table, table_view -from pylibcudf.libcudf.utilities.host_span cimport host_span +from pylibcudf.libcudf.utilities.span cimport host_span from rmm._lib.device_buffer cimport device_buffer diff --git a/python/pylibcudf/pylibcudf/libcudf/groupby.pxd b/python/pylibcudf/pylibcudf/libcudf/groupby.pxd index 848462131fe..17ea33a2066 100644 --- a/python/pylibcudf/pylibcudf/libcudf/groupby.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/groupby.pxd @@ -22,7 +22,6 @@ from pylibcudf.libcudf.types cimport ( size_type, sorted, ) -from pylibcudf.libcudf.utilities.host_span cimport host_span # workaround for https://github.com/cython/cython/issues/3885 ctypedef const scalar constscalar diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index f2dd22f43aa..b4d4733e962 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -1,31 +1,36 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. +from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar from pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.utilities.span cimport device_span cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: cdef unique_ptr[column] minhash( const column_view &strings, - const column_view &seeds, + const numeric_scalar[uint32_t] seed, const size_type width, ) except + - cdef unique_ptr[column] minhash64( + cdef unique_ptr[column] minhash( const column_view &strings, const column_view &seeds, const size_type width, ) except + - cdef unique_ptr[column] word_minhash( - const column_view &input, - const column_view &seeds + cdef unique_ptr[column] minhash64( + const column_view &strings, + const column_view &seeds, + const size_type width, ) except + - cdef unique_ptr[column] word_minhash64( - const column_view &input, - const column_view &seeds + cdef unique_ptr[column] minhash64( + const column_view &strings, + const numeric_scalar[uint64_t] seed, + const size_type width, ) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/utilities/host_span.pxd b/python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd similarity index 57% rename from python/pylibcudf/pylibcudf/libcudf/utilities/host_span.pxd rename to python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd index 7e591e96373..36876972a92 100644 --- a/python/pylibcudf/pylibcudf/libcudf/utilities/host_span.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. from libcpp.vector cimport vector @@ -7,3 +7,6 @@ cdef extern from "cudf/utilities/span.hpp" namespace "cudf" nogil: cdef cppclass host_span[T]: host_span() except + host_span(vector[T]) except + + cdef cppclass device_span[T]: + device_span() + device_span(device_span other) except + diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt index 9913e1fbadb..7fd65beeeb0 100644 --- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx) +set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd index 5f1762b1e3d..9eed1da1ab5 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport edit_distance, generate_ngrams, jaccard +from . cimport edit_distance, generate_ngrams, jaccard, minhash __all__ = [ "edit_distance", "generate_ngrams", "jaccard", + "minhash" ] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py index 1c0ddb1e5a4..a3a2363f7ef 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.py +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import edit_distance, generate_ngrams, jaccard +from . import edit_distance, generate_ngrams, jaccard, minhash __all__ = [ "edit_distance", "generate_ngrams", "jaccard", + "minhash", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd new file mode 100644 index 00000000000..be0912d2d47 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libc.stdint cimport uint32_t, uint64_t +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + +ctypedef fused ColumnOrScalar: + Column + Scalar + +cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=*) + +cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=*) diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx new file mode 100644 index 00000000000..3434ef3d7da --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx @@ -0,0 +1,115 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libc.stdint cimport uint32_t, uint64_t +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.minhash cimport ( + minhash as cpp_minhash, + minhash64 as cpp_minhash64, +) +from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + +from cython.operator import dereference + + +cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4): + """ + Returns the minhash values for each string per seed. + This function uses MurmurHash3_x86_32 for the hash algorithm. + + For details, see :cpp:func:`cudf::nvtext::minhash`. + + Parameters + ---------- + input : Column + Strings column to compute minhash + seeds : Column or Scalar + Seed value(s) used for the hash algorithm. + width : size_type + Character width used for apply substrings; + Default is 4 characters. + + Returns + ------- + Column + List column of minhash values for each string per seed + """ + cdef unique_ptr[column] c_result + cdef numeric_scalar[uint32_t]* cpp_seed + + if ColumnOrScalar is Column: + with nogil: + c_result = move( + cpp_minhash( + input.view(), + seeds.view(), + width + ) + ) + elif ColumnOrScalar is Scalar: + cpp_seed = seeds.c_obj.get() + with nogil: + c_result = move( + cpp_minhash( + input.view(), + dereference(cpp_seed), + width + ) + ) + else: + raise ValueError("seeds must be a Column or Scalar") + + return Column.from_libcudf(move(c_result)) + +cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4): + """ + Returns the minhash values for each string per seed. + This function uses MurmurHash3_x64_128 for the hash algorithm. + + For details, see :cpp:func:`cudf::nvtext::minhash64`. + + Parameters + ---------- + input : Column + Strings column to compute minhash + seeds : Column or Scalar + Seed value(s) used for the hash algorithm. + width : size_type + Character width used for apply substrings; + Default is 4 characters. + + Returns + ------- + Column + List column of minhash values for each string per seed + """ + cdef unique_ptr[column] c_result + cdef numeric_scalar[uint64_t]* cpp_seed + + if ColumnOrScalar is Column: + with nogil: + c_result = move( + cpp_minhash64( + input.view(), + seeds.view(), + width + ) + ) + elif ColumnOrScalar is Scalar: + cpp_seed = seeds.c_obj.get() + with nogil: + c_result = move( + cpp_minhash64( + input.view(), + dereference(cpp_seed), + width + ) + ) + else: + raise ValueError("seeds must be a Column or Scalar") + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py new file mode 100644 index 00000000000..f1bfe70ac05 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py @@ -0,0 +1,25 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +import pytest + + +@pytest.fixture(scope="module") +def input_data(): + input_arr = pa.array(["foo", "bar", "foo foo", "bar bar"]) + seeds = pa.array([2, 3, 4, 5], pa.uint32()) + return input_arr, seeds + + +@pytest.mark.parametrize("width", [5, 12]) +def test_minhash(input_data, width): + input_arr, seeds = input_data + result = plc.nvtext.minhash.minhash( + plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds), width + ) + pa_result = plc.interop.to_arrow(result) + assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr)) + assert pa_result.type == pa.list_( + pa.field("element", pa.uint32(), nullable=False) + ) From bfa583e60d008596b621777c30ab7946ede80a82 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 8 Oct 2024 17:43:31 -0700 Subject: [PATCH 7/9] clean up, add missing tests --- .../api_docs/pylibcudf/nvtext/index.rst | 1 + .../api_docs/pylibcudf/nvtext/minhash.rst | 6 + python/cudf/cudf/_lib/nvtext/minhash.pyx | 14 +- .../pylibcudf/libcudf/nvtext/minhash.pxd | 10 ++ python/pylibcudf/pylibcudf/nvtext/minhash.pxd | 4 + python/pylibcudf/pylibcudf/nvtext/minhash.pyx | 135 ++++++++++++------ .../pylibcudf/tests/test_nvtext_minhash.py | 45 ++++-- 7 files changed, 154 insertions(+), 61 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/minhash.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst index 6300f77d686..f6caabe324d 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst @@ -7,3 +7,4 @@ nvtext edit_distance generate_ngrams jaccard + minhash diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/minhash.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/minhash.rst new file mode 100644 index 00000000000..b8ec02fca35 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/minhash.rst @@ -0,0 +1,6 @@ +======= +minhash +======= + +.. automodule:: pylibcudf.nvtext.minhash + :members: diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index c2886709402..5e39cafa47b 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -8,9 +8,9 @@ from pylibcudf import nvtext @acquire_spill_lock() -def minhash(Column strings, Column seeds, int width): +def minhash(Column input, Column seeds, int width=4): result = nvtext.minhash.minhash( - strings.to_pylibcudf(mode="read"), + input.to_pylibcudf(mode="read"), seeds.to_pylibcudf(mode="read"), width, ) @@ -18,9 +18,9 @@ def minhash(Column strings, Column seeds, int width): @acquire_spill_lock() -def minhash64(Column strings, Column seeds, int width): +def minhash64(Column input, Column seeds, int width=4): result = nvtext.minhash.minhash64( - strings.to_pylibcudf(mode="read"), + input.to_pylibcudf(mode="read"), seeds.to_pylibcudf(mode="read"), width, ) @@ -29,19 +29,17 @@ def minhash64(Column strings, Column seeds, int width): @acquire_spill_lock() def word_minhash(Column input, Column seeds): - result = nvtext.minhash.minhash( + result = nvtext.minhash.word_minhash( input.to_pylibcudf(mode="read"), seeds.to_pylibcudf(mode="read"), - 4, ) return Column.from_pylibcudf(result) @acquire_spill_lock() def word_minhash64(Column input, Column seeds): - result = nvtext.minhash.minhash64( + result = nvtext.minhash.word_minhash64( input.to_pylibcudf(mode="read"), seeds.to_pylibcudf(mode="read"), - 4, ) return Column.from_pylibcudf(result) diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index b4d4733e962..c26d92551eb 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -34,3 +34,13 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const numeric_scalar[uint64_t] seed, const size_type width, ) except + + + cdef unique_ptr[column] word_minhash( + const column_view &input, + const column_view &seeds + ) except + + + cdef unique_ptr[column] word_minhash64( + const column_view &input, + const column_view &seeds + ) except + diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd index be0912d2d47..97e8c9dc83c 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd @@ -12,3 +12,7 @@ ctypedef fused ColumnOrScalar: cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=*) cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=*) + +cpdef Column word_minhash(Column input, Column seeds) + +cpdef Column word_minhash64(Column input, Column seeds) diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx index 3434ef3d7da..5fabf6a3f89 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx @@ -8,6 +8,8 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.nvtext.minhash cimport ( minhash as cpp_minhash, minhash64 as cpp_minhash64, + word_minhash as cpp_word_minhash, + word_minhash64 as cpp_word_minhash64, ) from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar from pylibcudf.libcudf.types cimport size_type @@ -21,7 +23,7 @@ cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4): Returns the minhash values for each string per seed. This function uses MurmurHash3_x86_32 for the hash algorithm. - For details, see :cpp:func:`cudf::nvtext::minhash`. + For details, see :cpp:func:`minhash`. Parameters ---------- @@ -39,29 +41,19 @@ cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4): List column of minhash values for each string per seed """ cdef unique_ptr[column] c_result - cdef numeric_scalar[uint32_t]* cpp_seed - - if ColumnOrScalar is Column: - with nogil: - c_result = move( - cpp_minhash( - input.view(), - seeds.view(), - width - ) - ) - elif ColumnOrScalar is Scalar: - cpp_seed = seeds.c_obj.get() - with nogil: - c_result = move( - cpp_minhash( - input.view(), - dereference(cpp_seed), - width - ) + + if not isinstance(seeds, (Column, Scalar)): + raise TypeError("Must pass a Column or Scalar") + + with nogil: + c_result = move( + cpp_minhash( + input.view(), + seeds.view() if ColumnOrScalar is Column else + dereference(seeds.c_obj.get()), + width ) - else: - raise ValueError("seeds must be a Column or Scalar") + ) return Column.from_libcudf(move(c_result)) @@ -70,7 +62,7 @@ cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4): Returns the minhash values for each string per seed. This function uses MurmurHash3_x64_128 for the hash algorithm. - For details, see :cpp:func:`cudf::nvtext::minhash64`. + For details, see :cpp:func:`minhash64`. Parameters ---------- @@ -88,28 +80,81 @@ cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4): List column of minhash values for each string per seed """ cdef unique_ptr[column] c_result - cdef numeric_scalar[uint64_t]* cpp_seed - - if ColumnOrScalar is Column: - with nogil: - c_result = move( - cpp_minhash64( - input.view(), - seeds.view(), - width - ) + + if not isinstance(seeds, (Column, Scalar)): + raise TypeError("Must pass a Column or Scalar") + + with nogil: + c_result = move( + cpp_minhash64( + input.view(), + seeds.view() if ColumnOrScalar is Column else + dereference(seeds.c_obj.get()), + width ) - elif ColumnOrScalar is Scalar: - cpp_seed = seeds.c_obj.get() - with nogil: - c_result = move( - cpp_minhash64( - input.view(), - dereference(cpp_seed), - width - ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column word_minhash(Column input, Column seeds): + """ + Returns the minhash values for each row of strings per seed. + This function uses MurmurHash3_x86_32 for the hash algorithm. + + For details, see :cpp:func:`word_minhash`. + + Parameters + ---------- + input : Column + Lists column of strings to compute minhash + seeds : Column or Scalar + Seed values used for the hash algorithm. + + Returns + ------- + Column + List column of minhash values for each string per seed + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_word_minhash( + input.view(), + seeds.view() + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column word_minhash64(Column input, Column seeds): + """ + Returns the minhash values for each row of strings per seed. + This function uses MurmurHash3_x64_128 for the hash algorithm though + only the first 64-bits of the hash are used in computing the output. + + For details, see :cpp:func:`word_minhash64`. + + Parameters + ---------- + input : Column + Lists column of strings to compute minhash + seeds : Column or Scalar + Seed values used for the hash algorithm. + + Returns + ------- + Column + List column of minhash values for each string per seed + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_word_minhash64( + input.view(), + seeds.view() ) - else: - raise ValueError("seeds must be a Column or Scalar") + ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py index f1bfe70ac05..4e389a63f90 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py @@ -5,21 +5,50 @@ import pytest -@pytest.fixture(scope="module") -def input_data(): +@pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()]) +def minhash_input_data(request): input_arr = pa.array(["foo", "bar", "foo foo", "bar bar"]) - seeds = pa.array([2, 3, 4, 5], pa.uint32()) - return input_arr, seeds + seeds = pa.array([2, 3, 4, 5], request.param) + return input_arr, seeds, request.param + + +@pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()]) +def word_minhash_input_data(request): + input_arr = pa.array([["foo", "bar"], ["foo foo", "bar bar"]]) + seeds = pa.array([2, 3, 4, 5], request.param) + return input_arr, seeds, request.param @pytest.mark.parametrize("width", [5, 12]) -def test_minhash(input_data, width): - input_arr, seeds = input_data - result = plc.nvtext.minhash.minhash( +def test_minhash(minhash_input_data, width): + input_arr, seeds, seed_type = minhash_input_data + minhash_func = ( + plc.nvtext.minhash.minhash + if seed_type == pa.uint32() + else plc.nvtext.minhash.minhash64 + ) + result = minhash_func( plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds), width ) pa_result = plc.interop.to_arrow(result) assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr)) assert pa_result.type == pa.list_( - pa.field("element", pa.uint32(), nullable=False) + pa.field("element", seed_type, nullable=False) + ) + + +def test_word_minhash(word_minhash_input_data): + input_arr, seeds, seed_type = word_minhash_input_data + word_minhash_func = ( + plc.nvtext.minhash.word_minhash + if seed_type == pa.uint32() + else plc.nvtext.minhash.word_minhash64 + ) + result = word_minhash_func( + plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds) + ) + pa_result = plc.interop.to_arrow(result) + assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr)) + assert pa_result.type == pa.list_( + pa.field("element", seed_type, nullable=False) ) From cd710f4c9a37face4a877dcde1cfb9aa9c30c12e Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Fri, 11 Oct 2024 05:25:29 -0700 Subject: [PATCH 8/9] remove device_span --- python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd b/python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd index 36876972a92..7e591e96373 100644 --- a/python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021, NVIDIA CORPORATION. from libcpp.vector cimport vector @@ -7,6 +7,3 @@ cdef extern from "cudf/utilities/span.hpp" namespace "cudf" nogil: cdef cppclass host_span[T]: host_span() except + host_span(vector[T]) except + - cdef cppclass device_span[T]: - device_span() - device_span(device_span other) except + From a419ab5ac5626ea043b109dd037117feb462b327 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Fri, 11 Oct 2024 09:12:34 -0700 Subject: [PATCH 9/9] remove import --- python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd | 1 - 1 file changed, 1 deletion(-) diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index c26d92551eb..41250037dcf 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -6,7 +6,6 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar from pylibcudf.libcudf.types cimport size_type -from pylibcudf.libcudf.utilities.span cimport device_span cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: