Skip to content

Commit

Permalink
Migrate NVText Tokenizing APIs to pylibcudf (#17100)
Browse files Browse the repository at this point in the history
A part of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #17100
  • Loading branch information
Matt711 authored Oct 31, 2024
1 parent 0e294b1 commit 893d0fd
Show file tree
Hide file tree
Showing 11 changed files with 476 additions and 122 deletions.
2 changes: 1 addition & 1 deletion cpp/include/nvtext/tokenize.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@ std::unique_ptr<tokenize_vocabulary> load_vocabulary(
* @throw cudf::logic_error if `delimiter` is invalid
*
* @param input Strings column to tokenize
* @param vocabulary Used to lookup tokens within
* @param vocabulary Used to lookup tokens within `input`
* @param delimiter Used to identify tokens within `input`
* @param default_id The token id to be used for tokens not found in the `vocabulary`;
* Default is -1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ nvtext
normalize
replace
stemmer
tokenize
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
========
tokenize
========

.. automodule:: pylibcudf.nvtext.tokenize
:members:
161 changes: 42 additions & 119 deletions python/cudf/cudf/_lib/nvtext/tokenize.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,162 +2,85 @@

from cudf.core.buffer import acquire_spill_lock

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.nvtext.tokenize cimport (
character_tokenize as cpp_character_tokenize,
count_tokens as cpp_count_tokens,
detokenize as cpp_detokenize,
load_vocabulary as cpp_load_vocabulary,
tokenize as cpp_tokenize,
tokenize_vocabulary as cpp_tokenize_vocabulary,
tokenize_with_vocabulary as cpp_tokenize_with_vocabulary,
)
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.types cimport size_type

from pylibcudf.nvtext.tokenize import TokenizeVocabulary # no-cython-lint

from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar

from pylibcudf import nvtext


@acquire_spill_lock()
def _tokenize_scalar(Column strings, object py_delimiter):

cdef DeviceScalar delimiter = py_delimiter.device_value

cdef column_view c_strings = strings.view()
cdef const string_scalar* c_delimiter = <const string_scalar*>delimiter\
.get_raw_ptr()
cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_tokenize(
c_strings,
c_delimiter[0],
)
return Column.from_pylibcudf(
nvtext.tokenize.tokenize_scalar(
strings.to_pylibcudf(mode="read"),
py_delimiter.device_value.c_value
)

return Column.from_unique_ptr(move(c_result))
)


@acquire_spill_lock()
def _tokenize_column(Column strings, Column delimiters):
cdef column_view c_strings = strings.view()
cdef column_view c_delimiters = delimiters.view()
cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_tokenize(
c_strings,
c_delimiters
)
return Column.from_pylibcudf(
nvtext.tokenize.tokenize_column(
strings.to_pylibcudf(mode="read"),
delimiters.to_pylibcudf(mode="read"),
)

return Column.from_unique_ptr(move(c_result))
)


@acquire_spill_lock()
def _count_tokens_scalar(Column strings, object py_delimiter):

cdef DeviceScalar delimiter = py_delimiter.device_value

cdef column_view c_strings = strings.view()
cdef const string_scalar* c_delimiter = <const string_scalar*>delimiter\
.get_raw_ptr()
cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_count_tokens(
c_strings,
c_delimiter[0]
)
return Column.from_pylibcudf(
nvtext.tokenize.count_tokens_scalar(
strings.to_pylibcudf(mode="read"),
py_delimiter.device_value.c_value
)

return Column.from_unique_ptr(move(c_result))
)


@acquire_spill_lock()
def _count_tokens_column(Column strings, Column delimiters):
cdef column_view c_strings = strings.view()
cdef column_view c_delimiters = delimiters.view()
cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_count_tokens(
c_strings,
c_delimiters
)
return Column.from_pylibcudf(
nvtext.tokenize.count_tokens_column(
strings.to_pylibcudf(mode="read"),
delimiters.to_pylibcudf(mode="read")
)

return Column.from_unique_ptr(move(c_result))
)


@acquire_spill_lock()
def character_tokenize(Column strings):
cdef column_view c_strings = strings.view()
cdef unique_ptr[column] c_result
with nogil:
c_result = move(
cpp_character_tokenize(c_strings)
return Column.from_pylibcudf(
nvtext.tokenize.character_tokenize(
strings.to_pylibcudf(mode="read")
)

return Column.from_unique_ptr(move(c_result))
)


@acquire_spill_lock()
def detokenize(Column strings, Column indices, object py_separator):

cdef DeviceScalar separator = py_separator.device_value

cdef column_view c_strings = strings.view()
cdef column_view c_indices = indices.view()
cdef const string_scalar* c_separator = <const string_scalar*>separator\
.get_raw_ptr()
cdef unique_ptr[column] c_result
with nogil:
c_result = move(
cpp_detokenize(c_strings, c_indices, c_separator[0])
return Column.from_pylibcudf(
nvtext.tokenize.detokenize(
strings.to_pylibcudf(mode="read"),
indices.to_pylibcudf(mode="read"),
py_separator.device_value.c_value
)

return Column.from_unique_ptr(move(c_result))


cdef class TokenizeVocabulary:
cdef unique_ptr[cpp_tokenize_vocabulary] c_obj

def __cinit__(self, Column vocab):
cdef column_view c_vocab = vocab.view()
with nogil:
self.c_obj = move(cpp_load_vocabulary(c_vocab))
)


@acquire_spill_lock()
def tokenize_with_vocabulary(Column strings,
TokenizeVocabulary vocabulary,
object vocabulary,
object py_delimiter,
size_type default_id):

cdef DeviceScalar delimiter = py_delimiter.device_value
cdef column_view c_strings = strings.view()
cdef const string_scalar* c_delimiter = <const string_scalar*>delimiter\
.get_raw_ptr()
cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_tokenize_with_vocabulary(
c_strings,
vocabulary.c_obj.get()[0],
c_delimiter[0],
default_id
)
return Column.from_pylibcudf(
nvtext.tokenize.tokenize_with_vocabulary(
strings.to_pylibcudf(mode="read"),
vocabulary,
py_delimiter.device_value.c_value,
default_id
)

return Column.from_unique_ptr(move(c_result))
)
4 changes: 3 additions & 1 deletion python/cudf/cudf/core/tokenize_vocabulary.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ class TokenizeVocabulary:
"""

def __init__(self, vocabulary: "cudf.Series"):
self.vocabulary = cpp_tokenize_vocabulary(vocabulary._column)
self.vocabulary = cpp_tokenize_vocabulary(
vocabulary._column.to_pylibcudf(mode="read")
)

def tokenize(
self, text, delimiter: str = "", default_id: int = -1
Expand Down
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# =============================================================================

set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx
ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx
ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx tokenize.pyx
)

set(linked_libraries cudf::cudf)
Expand Down
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ from . cimport (
normalize,
replace,
stemmer,
tokenize,
)

__all__ = [
Expand All @@ -20,4 +21,5 @@ __all__ = [
"normalize",
"replace",
"stemmer",
"tokenize",
]
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
normalize,
replace,
stemmer,
tokenize,
)

__all__ = [
Expand All @@ -20,4 +21,5 @@
"normalize",
"replace",
"stemmer",
"tokenize",
]
31 changes: 31 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/tokenize.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from pylibcudf.column cimport Column
from pylibcudf.libcudf.nvtext.tokenize cimport tokenize_vocabulary
from pylibcudf.libcudf.types cimport size_type
from pylibcudf.scalar cimport Scalar

# Extension type wrapping a libcudf `tokenize_vocabulary` object; instances are
# returned by `load_vocabulary` and consumed by `tokenize_with_vocabulary`.
cdef class TokenizeVocabulary:
# Owning smart pointer to the underlying C++ `tokenize_vocabulary`.
cdef unique_ptr[tokenize_vocabulary] c_obj

# Tokenize the strings column `input` using a single scalar `delimiter` and
# return the result as a new Column.
# `delimiter=*` marks the argument as optional; its default value is defined in
# the implementing .pyx, which is not visible here.
cpdef Column tokenize_scalar(Column input, Scalar delimiter=*)

# Variant of `tokenize_scalar` that takes a Column of `delimiters` instead of a
# single scalar delimiter. Both arguments are required.
cpdef Column tokenize_column(Column input, Column delimiters)

# Count tokens in `input` using a single scalar `delimiter`; returns a Column.
# NOTE(review): presumably one count per input row -- confirm against the
# libcudf `nvtext::count_tokens` documentation.
# `delimiter=*` marks the argument as optional (default defined in the .pyx).
cpdef Column count_tokens_scalar(Column input, Scalar delimiter=*)

# Variant of `count_tokens_scalar` that takes a Column of `delimiters` instead
# of a single scalar delimiter. Both arguments are required.
cpdef Column count_tokens_column(Column input, Column delimiters)

# Tokenize `input` without any delimiter argument; returns a Column.
# NOTE(review): the name suggests each character becomes its own token --
# confirm against the libcudf `nvtext::character_tokenize` documentation.
cpdef Column character_tokenize(Column input)

# Build a strings Column from the tokens in `input`, using `row_indices` and
# `separator`.
# NOTE(review): presumably `row_indices` assigns each token to an output row and
# `separator` is placed between joined tokens -- confirm against the libcudf
# `nvtext::detokenize` documentation.
# `separator=*` marks the argument as optional (default defined in the .pyx).
cpdef Column detokenize(Column input, Column row_indices, Scalar separator=*)

# Create a `TokenizeVocabulary` from the Column `input`.
# NOTE(review): expected to wrap libcudf `load_vocabulary`, with `input` being a
# strings column of vocabulary entries -- confirm in the .pyx implementation.
cpdef TokenizeVocabulary load_vocabulary(Column input)

# Tokenize `input` and map each token to an id via `vocabulary`.
#   input      -- strings column to tokenize
#   vocabulary -- used to look up tokens found within `input`
#   delimiter  -- used to identify tokens within `input`; required here (no `=*`)
#   default_id -- token id used for tokens not found in `vocabulary`; optional
#                 (`=*`), default defined in the .pyx (the libcudf C++ API
#                 documents -1 -- confirm the pylibcudf default matches)
cpdef Column tokenize_with_vocabulary(
Column input,
TokenizeVocabulary vocabulary,
Scalar delimiter,
size_type default_id=*
)
Loading

0 comments on commit 893d0fd

Please sign in to comment.