-
Notifications
You must be signed in to change notification settings - Fork 917
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Migrate nvtext jaccard API to pylibcudf (#17007)
Part of #15162 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: #17007
- Loading branch information
Showing
9 changed files
with
111 additions
and
28 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,3 +6,4 @@ nvtext | |
|
||
edit_distance | ||
generate_ngrams | ||
jaccard |
6 changes: 6 additions & 0 deletions
6
docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
======= | ||
jaccard | ||
======= | ||
|
||
.. automodule:: pylibcudf.nvtext.jaccard | ||
:members: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,9 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
from . cimport edit_distance, generate_ngrams | ||
from . cimport edit_distance, generate_ngrams, jaccard | ||
|
||
__all__ = [ | ||
"edit_distance", | ||
"generate_ngrams", | ||
"jaccard", | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,9 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
from . import edit_distance, generate_ngrams | ||
from . import edit_distance, generate_ngrams, jaccard | ||
|
||
__all__ = [ | ||
"edit_distance", | ||
"generate_ngrams", | ||
"jaccard", | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
from pylibcudf.column cimport Column | ||
from pylibcudf.libcudf.types cimport size_type | ||
|
||
|
||
cpdef Column jaccard_index(Column input1, Column input2, size_type width) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
# Copyright (c) 2023-2024, NVIDIA CORPORATION. | ||
|
||
from libcpp.memory cimport unique_ptr | ||
from libcpp.utility cimport move | ||
from pylibcudf.column cimport Column | ||
from pylibcudf.libcudf.column.column cimport column | ||
from pylibcudf.libcudf.column.column_view cimport column_view | ||
from pylibcudf.libcudf.nvtext.jaccard cimport ( | ||
jaccard_index as cpp_jaccard_index, | ||
) | ||
from pylibcudf.libcudf.types cimport size_type | ||
|
||
|
||
cpdef Column jaccard_index(Column input1, Column input2, size_type width):
    """
    Returns the Jaccard similarity between individual rows in two
    strings columns.

    For details, see :cpp:func:`jaccard_index`

    Parameters
    ----------
    input1 : Column
        Input strings column
    input2 : Column
        Input strings column
    width : size_type
        The ngram number to generate

    Returns
    -------
    Column
        Index calculation values
    """
    cdef column_view c_input1 = input1.view()
    cdef column_view c_input2 = input2.view()
    cdef unique_ptr[column] c_result

    # Release the GIL for the duration of the libcudf call.
    with nogil:
        # The call already yields an rvalue unique_ptr, so wrapping it in
        # move() is unnecessary for the assignment.
        c_result = cpp_jaccard_index(
            c_input1,
            c_input2,
            width
        )

    # Ownership of the libcudf column is handed over to the new Column.
    return Column.from_libcudf(move(c_result))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
import pyarrow as pa | ||
import pylibcudf as plc | ||
import pytest | ||
from utils import assert_column_eq | ||
|
||
|
||
@pytest.fixture(scope="module")
def input_data():
    """Provide two equal-length pyarrow string arrays to compare row by row."""
    left = pa.array(
        ["the fuzzy dog", "little piggy", "funny bunny", "chatty parrot"]
    )
    right = pa.array(
        ["the fuzzy cat", "bitty piggy", "funny bunny", "silent partner"]
    )
    return left, right
|
||
|
||
@pytest.mark.parametrize("width", [2, 3])
def test_jaccard_index(input_data, width):
    """Compare ``jaccard_index`` with a pure-Python reference.

    The reference splits each string into its distinct substrings of
    ``width`` characters and divides the size of the intersection of the
    two sets by the size of their union.
    """

    def ngram_set(text):
        # Every contiguous slice of `width` characters, de-duplicated.
        last_start = len(text) - width + 1
        return {text[pos : pos + width] for pos in range(last_start)}

    def reference_score(left, right):
        left_grams = ngram_set(left)
        right_grams = ngram_set(right)
        return len(left_grams & right_grams) / len(left_grams | right_grams)

    input1, input2 = input_data

    plc_input1 = plc.interop.from_arrow(input1)
    plc_input2 = plc.interop.from_arrow(input2)
    result = plc.nvtext.jaccard.jaccard_index(plc_input1, plc_input2, width)

    scores = []
    for s1, s2 in zip(input1, input2):
        scores.append(reference_score(s1.as_py(), s2.as_py()))
    expected = pa.array(scores, type=pa.float32())

    assert_column_eq(result, expected)