Migrate NVText Tokenizing APIs to pylibcudf #17100

Merged 1 commit on Oct 31, 2024
2 changes: 1 addition & 1 deletion cpp/include/nvtext/tokenize.hpp
@@ -292,7 +292,7 @@ std::unique_ptr<tokenize_vocabulary> load_vocabulary(
  * @throw cudf::logic_error if `delimiter` is invalid
  *
  * @param input Strings column to tokenize
- * @param vocabulary Used to lookup tokens within
+ * @param vocabulary Used to lookup tokens within `input`
  * @param delimiter Used to identify tokens within `input`
  * @param default_id The token id to be used for tokens not found in the `vocabulary`;
  *                   Default is -1
@@ -12,3 +12,4 @@ nvtext
 normalize
 replace
 stemmer
+tokenize
@@ -0,0 +1,6 @@
========
tokenize
========

.. automodule:: pylibcudf.nvtext.tokenize
    :members:
161 changes: 42 additions & 119 deletions python/cudf/cudf/_lib/nvtext/tokenize.pyx
@@ -2,162 +2,85 @@

 from cudf.core.buffer import acquire_spill_lock

-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.nvtext.tokenize cimport (
-    character_tokenize as cpp_character_tokenize,
-    count_tokens as cpp_count_tokens,
-    detokenize as cpp_detokenize,
-    load_vocabulary as cpp_load_vocabulary,
-    tokenize as cpp_tokenize,
-    tokenize_vocabulary as cpp_tokenize_vocabulary,
-    tokenize_with_vocabulary as cpp_tokenize_with_vocabulary,
-)
-from pylibcudf.libcudf.scalar.scalar cimport string_scalar
 from pylibcudf.libcudf.types cimport size_type

+from pylibcudf.nvtext.tokenize import TokenizeVocabulary  # no-cython-lint
+
 from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar
+
+from pylibcudf import nvtext


 @acquire_spill_lock()
 def _tokenize_scalar(Column strings, object py_delimiter):
-
-    cdef DeviceScalar delimiter = py_delimiter.device_value
-
-    cdef column_view c_strings = strings.view()
-    cdef const string_scalar* c_delimiter = <const string_scalar*>delimiter\
-        .get_raw_ptr()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_tokenize(
-                c_strings,
-                c_delimiter[0],
-            )
-        )
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+        nvtext.tokenize.tokenize_scalar(
+            strings.to_pylibcudf(mode="read"),
+            py_delimiter.device_value.c_value
+        )
+    )


 @acquire_spill_lock()
 def _tokenize_column(Column strings, Column delimiters):
-    cdef column_view c_strings = strings.view()
-    cdef column_view c_delimiters = delimiters.view()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_tokenize(
-                c_strings,
-                c_delimiters
-            )
-        )
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+        nvtext.tokenize.tokenize_column(
+            strings.to_pylibcudf(mode="read"),
+            delimiters.to_pylibcudf(mode="read"),
+        )
+    )


 @acquire_spill_lock()
 def _count_tokens_scalar(Column strings, object py_delimiter):
-
-    cdef DeviceScalar delimiter = py_delimiter.device_value
-
-    cdef column_view c_strings = strings.view()
-    cdef const string_scalar* c_delimiter = <const string_scalar*>delimiter\
-        .get_raw_ptr()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_count_tokens(
-                c_strings,
-                c_delimiter[0]
-            )
-        )
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+        nvtext.tokenize.count_tokens_scalar(
+            strings.to_pylibcudf(mode="read"),
+            py_delimiter.device_value.c_value
+        )
+    )


 @acquire_spill_lock()
 def _count_tokens_column(Column strings, Column delimiters):
-    cdef column_view c_strings = strings.view()
-    cdef column_view c_delimiters = delimiters.view()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_count_tokens(
-                c_strings,
-                c_delimiters
-            )
-        )
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+        nvtext.tokenize.count_tokens_column(
+            strings.to_pylibcudf(mode="read"),
+            delimiters.to_pylibcudf(mode="read")
+        )
+    )


 @acquire_spill_lock()
 def character_tokenize(Column strings):
-    cdef column_view c_strings = strings.view()
-    cdef unique_ptr[column] c_result
-    with nogil:
-        c_result = move(
-            cpp_character_tokenize(c_strings)
-        )
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+        nvtext.tokenize.character_tokenize(
+            strings.to_pylibcudf(mode="read")
+        )
+    )


 @acquire_spill_lock()
 def detokenize(Column strings, Column indices, object py_separator):
-
-    cdef DeviceScalar separator = py_separator.device_value
-
-    cdef column_view c_strings = strings.view()
-    cdef column_view c_indices = indices.view()
-    cdef const string_scalar* c_separator = <const string_scalar*>separator\
-        .get_raw_ptr()
-    cdef unique_ptr[column] c_result
-    with nogil:
-        c_result = move(
-            cpp_detokenize(c_strings, c_indices, c_separator[0])
-        )
-
-    return Column.from_unique_ptr(move(c_result))
-
-
-cdef class TokenizeVocabulary:
-    cdef unique_ptr[cpp_tokenize_vocabulary] c_obj
-
-    def __cinit__(self, Column vocab):
-        cdef column_view c_vocab = vocab.view()
-        with nogil:
-            self.c_obj = move(cpp_load_vocabulary(c_vocab))
+    return Column.from_pylibcudf(
+        nvtext.tokenize.detokenize(
+            strings.to_pylibcudf(mode="read"),
+            indices.to_pylibcudf(mode="read"),
+            py_separator.device_value.c_value
+        )
+    )


 @acquire_spill_lock()
 def tokenize_with_vocabulary(Column strings,
-                             TokenizeVocabulary vocabulary,
+                             object vocabulary,
                              object py_delimiter,
                              size_type default_id):
-
-    cdef DeviceScalar delimiter = py_delimiter.device_value
-    cdef column_view c_strings = strings.view()
-    cdef const string_scalar* c_delimiter = <const string_scalar*>delimiter\
-        .get_raw_ptr()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_tokenize_with_vocabulary(
-                c_strings,
-                vocabulary.c_obj.get()[0],
-                c_delimiter[0],
-                default_id
-            )
-        )
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+        nvtext.tokenize.tokenize_with_vocabulary(
+            strings.to_pylibcudf(mode="read"),
+            vocabulary,
+            py_delimiter.device_value.c_value,
+            default_id
+        )
+    )
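For orientation, the migration replaces the hand-written libcudf bindings with thin wrappers over pylibcudf. A minimal sketch of the user-facing path; cuDF's public `Series.str.tokenize` accessor is not part of this diff, but it ends up routing through the `_tokenize_scalar` wrapper above:

```python
# Hedged sketch: exercising the migrated wrapper via cuDF's public API.
import cudf

s = cudf.Series(["the quick fox", "jumped over"])
tokens = s.str.tokenize(delimiter=" ")
# Each row of `tokens` holds a single token: "the", "quick", "fox", ...
```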
4 changes: 3 additions & 1 deletion python/cudf/cudf/core/tokenize_vocabulary.py
@@ -20,7 +20,9 @@ class TokenizeVocabulary:
     """

     def __init__(self, vocabulary: "cudf.Series"):
-        self.vocabulary = cpp_tokenize_vocabulary(vocabulary._column)
+        self.vocabulary = cpp_tokenize_vocabulary(
+            vocabulary._column.to_pylibcudf(mode="read")
+        )

     def tokenize(
         self, text, delimiter: str = "", default_id: int = -1
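A short usage sketch of the wrapper above; the vocabulary and text values are illustrative:

```python
import cudf
from cudf.core.tokenize_vocabulary import TokenizeVocabulary

vocab = TokenizeVocabulary(cudf.Series(["hello", "world"]))
text = cudf.Series(["hello there world"])
# Tokens missing from the vocabulary map to default_id (-1 by default)
ids = vocab.tokenize(text, delimiter=" ", default_id=-1)
```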
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
@@ -13,7 +13,7 @@
 # =============================================================================

 set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx
-    ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx
+    ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx tokenize.pyx
 )

 set(linked_libraries cudf::cudf)
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/__init__.pxd
@@ -9,6 +9,7 @@ from . cimport (
     normalize,
     replace,
     stemmer,
+    tokenize,
 )

 __all__ = [
@@ -20,4 +21,5 @@ __all__ = [
     "normalize",
     "replace",
     "stemmer",
+    "tokenize",
 ]
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/__init__.py
@@ -9,6 +9,7 @@
     normalize,
     replace,
     stemmer,
+    tokenize,
 )

 __all__ = [
@@ -20,4 +21,5 @@
     "normalize",
     "replace",
     "stemmer",
+    "tokenize",
 ]
31 changes: 31 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/tokenize.pxd
@@ -0,0 +1,31 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from pylibcudf.column cimport Column
from pylibcudf.libcudf.nvtext.tokenize cimport tokenize_vocabulary
from pylibcudf.libcudf.types cimport size_type
from pylibcudf.scalar cimport Scalar

cdef class TokenizeVocabulary:
    cdef unique_ptr[tokenize_vocabulary] c_obj

cpdef Column tokenize_scalar(Column input, Scalar delimiter=*)

cpdef Column tokenize_column(Column input, Column delimiters)

cpdef Column count_tokens_scalar(Column input, Scalar delimiter=*)

cpdef Column count_tokens_column(Column input, Column delimiters)

cpdef Column character_tokenize(Column input)

cpdef Column detokenize(Column input, Column row_indices, Scalar separator=*)

cpdef TokenizeVocabulary load_vocabulary(Column input)

cpdef Column tokenize_with_vocabulary(
    Column input,
    TokenizeVocabulary vocabulary,
    Scalar delimiter,
    size_type default_id=*
)
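Taken together, these declarations describe the new pylibcudf-level flow. A hedged sketch of driving it directly, assuming pylibcudf's pyarrow interop (`pylibcudf.interop.from_arrow`):

```python
import pyarrow as pa
import pylibcudf as plc

# Load a strings column into a TokenizeVocabulary object
vocab = plc.nvtext.tokenize.load_vocabulary(
    plc.interop.from_arrow(pa.array(["hello", "world"]))
)

text = plc.interop.from_arrow(pa.array(["hello there world"]))
delimiter = plc.interop.from_arrow(pa.scalar(" "))

# Tokens not found in the vocabulary are assigned default_id
ids = plc.nvtext.tokenize.tokenize_with_vocabulary(
    text, vocab, delimiter, default_id=-1
)
```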