From 7a0e19f66d894058a806cb1a70fc5a1e9fbca861 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Sun, 6 Oct 2024 13:49:36 -0700 Subject: [PATCH 1/5] Migrate nvtext generate_ngrams APIs to pylibcudf --- .../pylibcudf/nvtext/generate_ngrams.rst | 6 + .../api_docs/pylibcudf/nvtext/index.rst | 1 + .../cudf/cudf/_lib/nvtext/generate_ngrams.pyx | 76 +++--------- .../pylibcudf/pylibcudf/nvtext/CMakeLists.txt | 2 +- .../pylibcudf/pylibcudf/nvtext/__init__.pxd | 3 +- python/pylibcudf/pylibcudf/nvtext/__init__.py | 3 +- .../pylibcudf/nvtext/generate_ngrams.pxd | 12 ++ .../pylibcudf/nvtext/generate_ngrams.pyx | 111 ++++++++++++++++++ .../tests/test_nvtext_generate_ngrams.py | 55 +++++++++ 9 files changed, 209 insertions(+), 60 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst create mode 100644 python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd create mode 100644 python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst new file mode 100644 index 00000000000..d68199271bd --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst @@ -0,0 +1,6 @@ +=============== +generate_ngrams +=============== + +.. 
automodule:: pylibcudf.nvtext.generate_ngrams + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst index b5cd5ee42c3..2e03b589c8b 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst @@ -5,3 +5,4 @@ nvtext :maxdepth: 1 edit_distance + generate_ngrams diff --git a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx index 6591b527eec..b4dabf4b33e 100644 --- a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx +++ b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx @@ -2,75 +2,37 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.generate_ngrams cimport ( - generate_character_ngrams as cpp_generate_character_ngrams, - generate_ngrams as cpp_generate_ngrams, - hash_character_ngrams as cpp_hash_character_ngrams, -) -from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar as plc_Scalar from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar + +from pylibcudf import nvtext @acquire_spill_lock() def generate_ngrams(Column strings, int ngrams, object py_separator): - - cdef DeviceScalar separator = py_separator.device_value - - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef const string_scalar* c_separator = separator\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_generate_ngrams( - c_strings, - c_ngrams, - c_separator[0] - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = 
nvtext.generate_ngrams.generate_ngrams( + strings.to_pylibcudf(mode="read"), + ngrams, + py_separator.device_value.c_value + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def generate_character_ngrams(Column strings, int ngrams): - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_generate_character_ngrams( - c_strings, - c_ngrams - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.generate_ngrams.generate_character_ngrams( + strings.to_pylibcudf(mode="read"), + ngrams + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def hash_character_ngrams(Column strings, int ngrams): - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_hash_character_ngrams( - c_strings, - c_ngrams - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.generate_ngrams.generate_chash_character_ngramsharacter_ngrams( + strings.to_pylibcudf(mode="read"), + ngrams + ) + return Column.from_pylibcudf(result) diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt index ebe1fda1f12..eb5617a1da6 100644 --- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. 
# ============================================================================= -set(cython_sources edit_distance.pyx) +set(cython_sources edit_distance.pyx generate_ngrams.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd index 82f7c425b1d..7f5fa2b9925 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -1,7 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport edit_distance +from . cimport edit_distance, generate_ngrams __all__ = [ "edit_distance", + "generate_ngrams", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py index 986652a241f..a66ce984745 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.py +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -1,7 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import edit_distance +from . import edit_distance, generate_ngrams __all__ = [ "edit_distance", + "generate_ngrams", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd new file mode 100644 index 00000000000..f15eb1f25e9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator) + +cpdef Column generate_character_ngrams(Column input, size_type ngrams=*) + +cpdef Column hash_character_ngrams(Column input, size_type ngrams=*) diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx new file mode 100644 index 00000000000..8c7a8edc01d --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx @@ -0,0 +1,111 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.generate_ngrams cimport ( + generate_character_ngrams as cpp_generate_character_ngrams, + generate_ngrams as cpp_generate_ngrams, + hash_character_ngrams as cpp_hash_character_ngrams, +) +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator): + """ + Returns a single column of strings by generating ngrams from a strings column. 
+ + For details, see :cpp:func:`generate_ngrams` + + Parameters + ---------- + input : Column + Input strings + ngrams : size_type + The ngram number to generate + separator : Scalar + The string to use for separating ngram tokens + + Returns + ------- + Column + New strings column of tokens + """ + cdef column_view c_strings = input.view() + cdef const string_scalar* c_separator = separator.c_obj.get() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_generate_ngrams( + c_strings, + ngrams, + c_separator[0] + ) + ) + return Column.from_libcudf(move(c_result)) + + +cpdef Column generate_character_ngrams(Column input, size_type ngrams = 2): + """ + Returns a lists column of ngrams of characters within each string. + + For details, see :cpp:func:`generate_character_ngrams` + + Parameters + ---------- + input : Column + Input strings + ngrams : size_type + The ngram number to generate + + Returns + ------- + Column + Lists column of strings + """ + cdef column_view c_strings = input.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_generate_character_ngrams( + c_strings, + ngrams, + ) + ) + return Column.from_libcudf(move(c_result)) + +cpdef Column hash_character_ngrams(Column input, size_type ngrams = 2): + """ + Returns a lists column of hash values of the characters in each string. + + For details, see :cpp:func:`hash_character_ngrams` + + Parameters + ---------- + input : Column + Input strings + ngrams : size_type + The ngram number to generate + + Returns + ------- + Column + Lists column of hash values + """ + cdef column_view c_strings = input.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_hash_character_ngrams( + c_strings, + ngrams, + ) + ) + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py new file mode 100644 index 
00000000000..08daba64d38 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py @@ -0,0 +1,55 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture(scope="module") +def input_col(): + arr = ["ab", "cde", "fgh"] + return pa.array(arr) + + +@pytest.mark.parametrize("ngram", [2, 3]) +@pytest.mark.parametrize("sep", ["_", "**", ","]) +def test_generate_ngrams(input_col, ngram, sep): + result = plc.nvtext.generate_ngrams.generate_ngrams( + plc.interop.from_arrow(input_col), + ngram, + plc.interop.from_arrow(pa.scalar(sep)), + ) + expected = pa.array([f"ab{sep}cde", f"cde{sep}fgh"]) + if ngram == 3: + expected = pa.array([f"ab{sep}cde{sep}fgh"]) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("ngram", [2, 3]) +def test_generate_character_ngrams(input_col, ngram): + result = plc.nvtext.generate_ngrams.generate_character_ngrams( + plc.interop.from_arrow(input_col), + ngram, + ) + expected = pa.array([["ab"], ["cd", "de"], ["fg", "gh"]]) + if ngram == 3: + expected = pa.array([[], ["cde"], ["fgh"]]) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("ngram", [2, 3]) +def test_hash_character_ngrams(input_col, ngram): + result = plc.nvtext.generate_ngrams.hash_character_ngrams( + plc.interop.from_arrow(input_col), + ngram, + ) + pa_result = plc.interop.to_arrow(result) + if ngram == 2: + assert len(pa_result[0]) == 1 + assert len(pa_result[1]) == 2 + assert len(pa_result[2]) == 2 + else: + assert len(pa_result[0]) == 0 + assert len(pa_result[1]) == 1 + assert len(pa_result[2]) == 1 From 1e0feafccc19be6272cee53d16d28dabbfcc35c7 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 7 Oct 2024 10:46:11 -0700 Subject: [PATCH 2/5] Fix typo --- python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx 
b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx index b4dabf4b33e..cc9f139cdf9 100644 --- a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx +++ b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx @@ -31,7 +31,7 @@ def generate_character_ngrams(Column strings, int ngrams): @acquire_spill_lock() def hash_character_ngrams(Column strings, int ngrams): - result = nvtext.generate_ngrams.generate_chash_character_ngramsharacter_ngrams( + result = nvtext.generate_ngrams.hash_character_ngrams( strings.to_pylibcudf(mode="read"), ngrams ) From ff9fb427a90a7f5b9b42b75ec832a900e7915e89 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 7 Oct 2024 14:29:57 -0700 Subject: [PATCH 3/5] remove explicit casts --- python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx index cc9f139cdf9..7fdf9258b7f 100644 --- a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx +++ b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx @@ -2,9 +2,6 @@ from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.types cimport size_type -from pylibcudf.scalar cimport Scalar as plc_Scalar - from cudf._lib.column cimport Column from pylibcudf import nvtext @@ -14,8 +11,8 @@ from pylibcudf import nvtext def generate_ngrams(Column strings, int ngrams, object py_separator): result = nvtext.generate_ngrams.generate_ngrams( strings.to_pylibcudf(mode="read"), - ngrams, - py_separator.device_value.c_value + ngrams, + py_separator.device_value.c_value ) return Column.from_pylibcudf(result) @@ -24,7 +21,7 @@ def generate_ngrams(Column strings, int ngrams, object py_separator): def generate_character_ngrams(Column strings, int ngrams): result = nvtext.generate_ngrams.generate_character_ngrams( strings.to_pylibcudf(mode="read"), - ngrams + ngrams ) return Column.from_pylibcudf(result) @@ -33,6 +30,6 @@ def 
generate_character_ngrams(Column strings, int ngrams): def hash_character_ngrams(Column strings, int ngrams): result = nvtext.generate_ngrams.hash_character_ngrams( strings.to_pylibcudf(mode="read"), - ngrams + ngrams ) return Column.from_pylibcudf(result) From ecd9fc25ec155eeffd4ea2a75b917dd6f62dac36 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 7 Oct 2024 15:20:39 -0700 Subject: [PATCH 4/5] address review --- .../pylibcudf/tests/test_nvtext_generate_ngrams.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py index 08daba64d38..aedb4efddd0 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py @@ -45,11 +45,7 @@ def test_hash_character_ngrams(input_col, ngram): ngram, ) pa_result = plc.interop.to_arrow(result) - if ngram == 2: - assert len(pa_result[0]) == 1 - assert len(pa_result[1]) == 2 - assert len(pa_result[2]) == 2 - else: - assert len(pa_result[0]) == 0 - assert len(pa_result[1]) == 1 - assert len(pa_result[2]) == 1 + assert all( + len(got) == max(0, len(s.as_py()) - ngram + 1) + for got, s in zip(pa_result, input_col) + ) From 985bbff22d2a385ee91764525af8d475e7fc5b87 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 7 Oct 2024 16:00:27 -0700 Subject: [PATCH 5/5] address review --- .../pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py index aedb4efddd0..5cf9874d595 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py @@ -49,3 +49,6 @@ def test_hash_character_ngrams(input_col, ngram): len(got) == max(0, 
len(s.as_py()) - ngram + 1) for got, s in zip(pa_result, input_col) ) + assert pa_result.type == pa.list_( + pa.field("element", pa.uint32(), nullable=False) + )